From c74608b67d88d809ebec51c0e84add37a0b98711 Mon Sep 17 00:00:00 2001 From: Robert Varga Date: Mon, 15 May 2017 16:56:14 +0200 Subject: [PATCH] BUG-8452: make NoShardLeaderException retriable We can recover from this exception by retrying the connection to the backend. Wrap it in a TimeoutException, which will cause a new connection attempt. Change-Id: I1d5c771fdb89cbdd7723e0425542154a1ed85853 Signed-off-by: Robert Varga --- .../client/AbstractClientConnection.java | 18 +++++++++---- .../access/client/BackendInfoResolver.java | 2 +- .../access/client/ClientActorBehavior.java | 4 +-- .../client/ConnectedClientConnection.java | 5 ++-- .../client/ConnectingClientConnection.java | 5 ++-- .../client/ReconnectingClientConnection.java | 23 +++++++++++++--- .../client/ConnectedClientConnectionTest.java | 3 ++- .../ReconnectingClientConnectionTest.java | 5 ++-- .../dds/AbstractShardBackendResolver.java | 10 +++++-- .../DistributedDataStoreIntegrationTest.java | 27 +++++++++++++++---- ...butedDataStoreRemotingIntegrationTest.java | 11 +++++++- 11 files changed, 87 insertions(+), 26 deletions(-) diff --git a/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/AbstractClientConnection.java b/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/AbstractClientConnection.java index 47c0676979..21e5f67a74 100644 --- a/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/AbstractClientConnection.java +++ b/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/AbstractClientConnection.java @@ -16,6 +16,7 @@ import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; import java.util.Iterator; import java.util.Optional; import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; import java.util.function.Consumer; @@ -26,6 +27,7 @@ import org.opendaylight.controller.cluster.access.concepts.Request; import org.opendaylight.controller.cluster.access.concepts.RequestException; import org.opendaylight.controller.cluster.access.concepts.Response; import org.opendaylight.controller.cluster.access.concepts.ResponseEnvelope; +import org.opendaylight.controller.cluster.access.concepts.RuntimeRequestException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import scala.concurrent.duration.FiniteDuration; @@ -181,7 +183,8 @@ public abstract class AbstractClientConnection { } @GuardedBy("lock") - abstract ClientActorBehavior lockedReconnect(ClientActorBehavior current); + abstract ClientActorBehavior lockedReconnect(ClientActorBehavior current, + RequestException runtimeRequestException); final long enqueueEntry(final ConnectionEntry entry, final long now) { lock.lock(); @@ -201,10 +204,10 @@ public abstract class AbstractClientConnection { } } - final ClientActorBehavior reconnect(final ClientActorBehavior current) { + final ClientActorBehavior reconnect(final ClientActorBehavior current, final RequestException cause) { lock.lock(); try { - return lockedReconnect(current); + return lockedReconnect(current, cause); } finally { lock.unlock(); } @@ -269,7 +272,8 @@ public abstract class AbstractClientConnection { delay = lockedCheckTimeout(now); if (delay == null) { // We have timed out. There is no point in scheduling a timer - return lockedReconnect(current); + return lockedReconnect(current, new RuntimeRequestException("Backend connection timed out", + new TimeoutException())); } if (delay.isPresent()) { @@ -347,10 +351,14 @@ public abstract class AbstractClientConnection { @GuardedBy("lock") private void lockedPoison(final RequestException cause) { - poisoned = cause; + poisoned = enrichPoison(cause); queue.poison(cause); } + RequestException enrichPoison(final RequestException ex) { + return ex; + } + @VisibleForTesting final RequestException poisoned() { return poisoned; diff --git a/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/BackendInfoResolver.java b/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/BackendInfoResolver.java index 2e8ab4e103..4ece691d89 100644 --- a/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/BackendInfoResolver.java +++ b/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/BackendInfoResolver.java @@ -21,7 +21,7 @@ import javax.annotation.Nonnull; * If the completion stage returned by this interface's methods fails with a * {@link org.opendaylight.controller.cluster.access.concepts.RequestException}, it will be forwarded to all * outstanding requests towards the leader. If it fails with a {@link java.util.concurrent.TimeoutException}, - * resolution process will be retries. If it fails with any other cause, it will we wrapped as a + * resolution process will be retried. If it fails with any other cause, it will we wrapped as a * {@link org.opendaylight.controller.cluster.access.concepts.RuntimeRequestException} wrapping that cause. * * @author Robert Varga diff --git a/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/ClientActorBehavior.java b/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/ClientActorBehavior.java index ca78d0cb66..9a459eee78 100644 --- a/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/ClientActorBehavior.java +++ b/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/ClientActorBehavior.java @@ -173,7 +173,7 @@ public abstract class ClientActorBehavior extends return this; } else if (conn != null) { LOG.info("{}: connection {} indicated no leadership, reconnecting it", persistenceId(), conn, cause); - return conn.reconnect(this); + return conn.reconnect(this, cause); } } if (cause instanceof OutOfSequenceEnvelopeException) { @@ -184,7 +184,7 @@ public abstract class ClientActorBehavior extends } else if (conn != null) { LOG.info("{}: connection {} indicated no sequencing mismatch on {} sequence {}, reconnecting it", persistenceId(), conn, failure.getTarget(), failure.getSequence(), cause); - return conn.reconnect(this); + return conn.reconnect(this, cause); } } diff --git a/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/ConnectedClientConnection.java b/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/ConnectedClientConnection.java index cade27f77d..0afc7acf49 100644 --- a/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/ConnectedClientConnection.java +++ b/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/ConnectedClientConnection.java @@ -9,6 +9,7 @@ package org.opendaylight.controller.cluster.access.client; import com.google.common.annotations.Beta; import javax.annotation.concurrent.NotThreadSafe; +import org.opendaylight.controller.cluster.access.concepts.RequestException; @Beta @NotThreadSafe @@ -18,8 +19,8 @@ public final class ConnectedClientConnection extends Abst } @Override - ClientActorBehavior lockedReconnect(final ClientActorBehavior current) { - final ReconnectingClientConnection next = new ReconnectingClientConnection<>(this); + ClientActorBehavior lockedReconnect(final ClientActorBehavior current, final RequestException cause) { + final ReconnectingClientConnection next = new ReconnectingClientConnection<>(this, cause); setForwarder(new SimpleReconnectForwarder(next)); current.reconnectConnection(this, next); return current; diff --git a/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/ConnectingClientConnection.java b/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/ConnectingClientConnection.java index 12c520bb17..a36267c44f 100644 --- a/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/ConnectingClientConnection.java +++ b/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/ConnectingClientConnection.java @@ -9,6 +9,7 @@ package org.opendaylight.controller.cluster.access.client; import com.google.common.annotations.Beta; import java.util.Optional; +import org.opendaylight.controller.cluster.access.concepts.RequestException; @Beta public final class ConnectingClientConnection extends AbstractClientConnection { @@ -30,7 +31,7 @@ public final class ConnectingClientConnection extends Abs } @Override - ClientActorBehavior lockedReconnect(final ClientActorBehavior current) { - throw new UnsupportedOperationException("Attempted to reconnect a connecting connection"); + ClientActorBehavior lockedReconnect(final ClientActorBehavior current, final RequestException cause) { + throw new UnsupportedOperationException("Attempted to reconnect a connecting connection", cause); } } diff --git a/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/ReconnectingClientConnection.java b/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/ReconnectingClientConnection.java index 0aac7f4663..b59c9e324a 100644 --- a/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/ReconnectingClientConnection.java +++ b/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/ReconnectingClientConnection.java @@ -7,6 +7,8 @@ */ package org.opendaylight.controller.cluster.access.client; +import com.google.common.base.Preconditions; +import org.opendaylight.controller.cluster.access.concepts.RequestException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -20,14 +22,29 @@ import org.slf4j.LoggerFactory; public final class ReconnectingClientConnection extends AbstractReceivingClientConnection { private static final Logger LOG = LoggerFactory.getLogger(ReconnectingClientConnection.class); - ReconnectingClientConnection(final ConnectedClientConnection oldConnection) { + private RequestException cause; + + ReconnectingClientConnection(final ConnectedClientConnection oldConnection, final RequestException cause) { super(oldConnection); + this.cause = Preconditions.checkNotNull(cause); } @Override - ClientActorBehavior lockedReconnect(final ClientActorBehavior current) { - // Intentional no-op + ClientActorBehavior lockedReconnect(final ClientActorBehavior current, final RequestException cause) { + this.cause = Preconditions.checkNotNull(cause); LOG.debug("Skipping reconnect of already-reconnecting connection {}", this); return current; } + + @Override + RequestException enrichPoison(final RequestException ex) { + if (ex.getCause() != null) { + ex.addSuppressed(cause); + } else { + ex.initCause(cause); + } + + return ex; + } + } diff --git a/opendaylight/md-sal/cds-access-client/src/test/java/org/opendaylight/controller/cluster/access/client/ConnectedClientConnectionTest.java b/opendaylight/md-sal/cds-access-client/src/test/java/org/opendaylight/controller/cluster/access/client/ConnectedClientConnectionTest.java index eb101520e3..e66512d041 100644 --- a/opendaylight/md-sal/cds-access-client/src/test/java/org/opendaylight/controller/cluster/access/client/ConnectedClientConnectionTest.java +++ b/opendaylight/md-sal/cds-access-client/src/test/java/org/opendaylight/controller/cluster/access/client/ConnectedClientConnectionTest.java @@ -14,6 +14,7 @@ import static org.mockito.Mockito.verify; import org.junit.Test; import org.opendaylight.controller.cluster.access.ABIVersion; +import org.opendaylight.controller.cluster.access.concepts.RequestException; public class ConnectedClientConnectionTest extends AbstractClientConnectionTest, BackendInfo> { @@ -28,7 +29,7 @@ public class ConnectedClientConnectionTest @Test public void testReconnectConnection() throws Exception { final ClientActorBehavior behavior = mock(ClientActorBehavior.class); - connection.lockedReconnect(behavior); + connection.lockedReconnect(behavior, mock(RequestException.class)); verify(behavior).reconnectConnection(same(connection), any(ReconnectingClientConnection.class)); } diff --git a/opendaylight/md-sal/cds-access-client/src/test/java/org/opendaylight/controller/cluster/access/client/ReconnectingClientConnectionTest.java b/opendaylight/md-sal/cds-access-client/src/test/java/org/opendaylight/controller/cluster/access/client/ReconnectingClientConnectionTest.java index 142e948820..b36dd7ccec 100644 --- a/opendaylight/md-sal/cds-access-client/src/test/java/org/opendaylight/controller/cluster/access/client/ReconnectingClientConnectionTest.java +++ b/opendaylight/md-sal/cds-access-client/src/test/java/org/opendaylight/controller/cluster/access/client/ReconnectingClientConnectionTest.java @@ -19,6 +19,7 @@ import org.opendaylight.controller.cluster.access.ABIVersion; import org.opendaylight.controller.cluster.access.commands.TransactionAbortSuccess; import org.opendaylight.controller.cluster.access.concepts.LocalHistoryIdentifier; import org.opendaylight.controller.cluster.access.concepts.Request; +import org.opendaylight.controller.cluster.access.concepts.RequestException; import org.opendaylight.controller.cluster.access.concepts.RequestSuccess; import org.opendaylight.controller.cluster.access.concepts.Response; import org.opendaylight.controller.cluster.access.concepts.ResponseEnvelope; @@ -34,14 +35,14 @@ public class ReconnectingClientConnectionTest final ConnectedClientConnection oldConnection = new ConnectedClientConnection<>(context, 0L, backend); - return new ReconnectingClientConnection<>(oldConnection); + return new ReconnectingClientConnection<>(oldConnection, mock(RequestException.class)); } @Override @Test public void testReconnectConnection() throws Exception { final ClientActorBehavior behavior = mock(ClientActorBehavior.class); - Assert.assertSame(behavior, connection.lockedReconnect(behavior)); + Assert.assertSame(behavior, connection.lockedReconnect(behavior, mock(RequestException.class))); } @Override diff --git a/opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/cluster/databroker/actors/dds/AbstractShardBackendResolver.java b/opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/cluster/databroker/actors/dds/AbstractShardBackendResolver.java index 93cf7931e5..6b221da766 100644 --- a/opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/cluster/databroker/actors/dds/AbstractShardBackendResolver.java +++ b/opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/cluster/databroker/actors/dds/AbstractShardBackendResolver.java @@ -14,6 +14,7 @@ import com.google.common.primitives.UnsignedLong; import java.util.concurrent.CompletableFuture; import java.util.concurrent.CompletionStage; import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicLong; import javax.annotation.Nonnull; import javax.annotation.Nullable; @@ -107,8 +108,7 @@ abstract class AbstractShardBackendResolver extends BackendInfoResolver future) { LOG.debug("Shard {} resolved to {}, attempting to connect", shardName, info); diff --git a/opendaylight/md-sal/sal-distributed-datastore/src/test/java/org/opendaylight/controller/cluster/datastore/DistributedDataStoreIntegrationTest.java b/opendaylight/md-sal/sal-distributed-datastore/src/test/java/org/opendaylight/controller/cluster/datastore/DistributedDataStoreIntegrationTest.java index 2cd2b29c8d..7019073961 100644 --- a/opendaylight/md-sal/sal-distributed-datastore/src/test/java/org/opendaylight/controller/cluster/datastore/DistributedDataStoreIntegrationTest.java +++ b/opendaylight/md-sal/sal-distributed-datastore/src/test/java/org/opendaylight/controller/cluster/datastore/DistributedDataStoreIntegrationTest.java @@ -43,6 +43,7 @@ import java.util.List; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicReference; import org.junit.After; import org.junit.Before; @@ -668,8 +669,11 @@ public class DistributedDataStoreIntegrationTest { assertTrue("Expected LocalShardFound. Actual: " + result, result instanceof LocalShardFound); // Create the write Tx. - try (DOMStoreWriteTransaction writeTx = writeOnly ? dataStore.newWriteOnlyTransaction() - : dataStore.newReadWriteTransaction()) { + DOMStoreWriteTransaction writeTxToClose = null; + try { + writeTxToClose = writeOnly ? dataStore.newWriteOnlyTransaction() + : dataStore.newReadWriteTransaction(); + final DOMStoreWriteTransaction writeTx = writeTxToClose; assertNotNull("newReadWriteTransaction returned null", writeTx); // Do some modifications and ready the Tx on a separate @@ -708,7 +712,20 @@ public class DistributedDataStoreIntegrationTest { txCohort.get().canCommit().get(5, TimeUnit.SECONDS); fail("Expected NoShardLeaderException"); } catch (final ExecutionException e) { - Throwables.propagate(Throwables.getRootCause(e)); + assertTrue(Throwables.getRootCause(e) instanceof NoShardLeaderException); + assertEquals(DistributedDataStore.class, testParameter); + } catch (TimeoutException e) { + // ClientBackedDataStore doesn't set cause to ExecutionException, future just time outs + assertEquals(ClientBackedDataStore.class, testParameter); + } + } finally { + try { + if (writeTxToClose != null) { + writeTxToClose.close(); + } + } catch (Exception e) { + // FIXME TransactionProxy.close throws IllegalStateException: + // Transaction is ready, it cannot be closed } } } @@ -716,13 +733,13 @@ public class DistributedDataStoreIntegrationTest { }; } - @Test(expected = NoShardLeaderException.class) + @Test public void testWriteOnlyTransactionCommitFailureWithNoShardLeader() throws Exception { datastoreContextBuilder.writeOnlyTransactionOptimizationsEnabled(true); testTransactionCommitFailureWithNoShardLeader(true, "testWriteOnlyTransactionCommitFailureWithNoShardLeader"); } - @Test(expected = NoShardLeaderException.class) + @Test public void testReadWriteTransactionCommitFailureWithNoShardLeader() throws Exception { testTransactionCommitFailureWithNoShardLeader(false, "testReadWriteTransactionCommitFailureWithNoShardLeader"); } diff --git a/opendaylight/md-sal/sal-distributed-datastore/src/test/java/org/opendaylight/controller/cluster/datastore/DistributedDataStoreRemotingIntegrationTest.java b/opendaylight/md-sal/sal-distributed-datastore/src/test/java/org/opendaylight/controller/cluster/datastore/DistributedDataStoreRemotingIntegrationTest.java index dc5c2d422f..6a8b1bffe7 100644 --- a/opendaylight/md-sal/sal-distributed-datastore/src/test/java/org/opendaylight/controller/cluster/datastore/DistributedDataStoreRemotingIntegrationTest.java +++ b/opendaylight/md-sal/sal-distributed-datastore/src/test/java/org/opendaylight/controller/cluster/datastore/DistributedDataStoreRemotingIntegrationTest.java @@ -42,6 +42,7 @@ import java.util.LinkedList; import java.util.List; import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicLong; import org.junit.After; import org.junit.Assume; @@ -320,7 +321,7 @@ public class DistributedDataStoreRemotingIntegrationTest extends AbstractTest { final ActorSystem newSystem = newActorSystem("reinstated-member2", "Member2"); - try (final AbstractDataStore member2Datastore = new IntegrationTestKit(newSystem, leaderDatastoreContextBuilder) + try (AbstractDataStore member2Datastore = new IntegrationTestKit(newSystem, leaderDatastoreContextBuilder) .setupAbstractDataStore(testParameter, testName, "module-shards-member2", true, CARS)) { verifyCars(member2Datastore.newReadOnlyTransaction(), car2); } @@ -1023,6 +1024,10 @@ public class DistributedDataStoreRemotingIntegrationTest extends AbstractTest { final String msg = "Unexpected exception: " + Throwables.getStackTraceAsString(e.getCause()); assertTrue(msg, Throwables.getRootCause(e) instanceof NoShardLeaderException || e.getCause() instanceof ShardLeaderNotRespondingException); + assertEquals(DistributedDataStore.class, testParameter); + } catch (final TimeoutException e) { + // ClientBackedDataStore doesn't set cause to ExecutionException, future just time outs + assertEquals(ClientBackedDataStore.class, testParameter); } } @@ -1057,6 +1062,10 @@ public class DistributedDataStoreRemotingIntegrationTest extends AbstractTest { final String msg = "Expected instance of NoShardLeaderException, actual: \n" + Throwables.getStackTraceAsString(e.getCause()); assertTrue(msg, Throwables.getRootCause(e) instanceof NoShardLeaderException); + assertEquals(DistributedDataStore.class, testParameter); + } catch (TimeoutException e) { + // ClientBackedDataStore doesn't set cause to ExecutionException, future just time outs + assertEquals(ClientBackedDataStore.class, testParameter); } } -- 2.36.6