BUG-8452: make NoShardLeaderException retriable
[controller.git] / opendaylight / md-sal / cds-access-client / src / main / java / org / opendaylight / controller / cluster / access / client / ClientActorBehavior.java
index 896d85b713fcecfb611cc0d23987a973125718d1..9ba118ed6b9c95a1d5c914a4b681413af4cce95d 100644 (file)
@@ -10,6 +10,7 @@ package org.opendaylight.controller.cluster.access.client;
 import com.google.common.annotations.Beta;
 import com.google.common.base.Preconditions;
 import com.google.common.base.Verify;
+import java.util.Collection;
 import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.TimeUnit;
@@ -17,6 +18,8 @@ import java.util.concurrent.TimeoutException;
 import javax.annotation.Nonnull;
 import javax.annotation.Nullable;
 import javax.annotation.concurrent.GuardedBy;
+import org.opendaylight.controller.cluster.access.commands.NotLeaderException;
+import org.opendaylight.controller.cluster.access.commands.OutOfSequenceEnvelopeException;
 import org.opendaylight.controller.cluster.access.concepts.ClientIdentifier;
 import org.opendaylight.controller.cluster.access.concepts.FailureEnvelope;
 import org.opendaylight.controller.cluster.access.concepts.LocalHistoryIdentifier;
@@ -52,7 +55,7 @@ public abstract class ClientActorBehavior<T extends BackendInfo> extends
          * @param enqueuedEntries Previously-enqueued entries
          * @return A {@link ReconnectForwarder} to handle any straggler messages which arrive after this method returns.
          */
-        @Nonnull ReconnectForwarder finishReconnect(@Nonnull Iterable<ConnectionEntry> enqueuedEntries);
+        @Nonnull ReconnectForwarder finishReconnect(@Nonnull Collection<ConnectionEntry> enqueuedEntries);
     }
 
     private static final Logger LOG = LoggerFactory.getLogger(ClientActorBehavior.class);
@@ -105,6 +108,11 @@ public abstract class ClientActorBehavior<T extends BackendInfo> extends
         }
     }
 
+    private AbstractClientConnection<T> getConnection(final ResponseEnvelope<?> response) {
+        // Always called from actor context: no locking required
+        return connections.get(extractCookie(response.getMessage().getTarget()));
+    }
+
     @SuppressWarnings("unchecked")
     @Override
     final ClientActorBehavior<T> onReceiveCommand(final Object command) {
@@ -132,8 +140,7 @@ public abstract class ClientActorBehavior<T extends BackendInfo> extends
     }
 
     private void onResponse(final ResponseEnvelope<?> response) {
-        final long cookie = extractCookie(response.getMessage().getTarget());
-        final AbstractClientConnection<T> connection = connections.get(cookie);
+        final AbstractClientConnection<T> connection = getConnection(response);
         if (connection != null) {
             connection.receiveResponse(response);
         } else {
@@ -160,6 +167,27 @@ public abstract class ClientActorBehavior<T extends BackendInfo> extends
             poison(cause);
             return null;
         }
+        if (cause instanceof NotLeaderException) {
+            final AbstractClientConnection<T> conn = getConnection(command);
+            if (conn instanceof ReconnectingClientConnection) {
+                // Already reconnecting, do not churn the logs
+                return this;
+            } else if (conn != null) {
+                LOG.info("{}: connection {} indicated no leadership, reconnecting it", persistenceId(), conn, cause);
+                return conn.reconnect(this, cause);
+            }
+        }
+        if (cause instanceof OutOfSequenceEnvelopeException) {
+            final AbstractClientConnection<T> conn = getConnection(command);
+            if (conn instanceof ReconnectingClientConnection) {
+                // Already reconnecting, do not churn the logs
+                return this;
+            } else if (conn != null) {
+                LOG.info("{}: connection {} indicated no sequencing mismatch on {} sequence {}, reconnecting it",
+                    persistenceId(), conn, failure.getTarget(), failure.getSequence(), cause);
+                return conn.reconnect(this, cause);
+            }
+        }
 
         return onRequestFailure(command);
     }
@@ -247,19 +275,19 @@ public abstract class ClientActorBehavior<T extends BackendInfo> extends
             return;
         }
 
-        LOG.debug("{}: resolved shard {} to {}", persistenceId(), shard, backend);
+        LOG.info("{}: resolved shard {} to {}", persistenceId(), shard, backend);
         final long stamp = connectionsLock.writeLock();
         try {
             // Create a new connected connection
             final ConnectedClientConnection<T> newConn = new ConnectedClientConnection<>(conn.context(),
                     conn.cookie(), backend);
-            LOG.debug("{}: resolving connection {} to {}", persistenceId(), conn, newConn);
+            LOG.info("{}: resolving connection {} to {}", persistenceId(), conn, newConn);
 
             // Start reconnecting without the old connection lock held
             final ConnectionConnectCohort cohort = Verify.verifyNotNull(connectionUp(newConn));
 
             // Lock the old connection and get a reference to its entries
-            final Iterable<ConnectionEntry> replayIterable = conn.startReplay();
+            final Collection<ConnectionEntry> replayIterable = conn.startReplay();
 
             // Finish the connection attempt
             final ReconnectForwarder forwarder = Verify.verifyNotNull(cohort.finishReconnect(replayIterable));
@@ -268,26 +296,61 @@ public abstract class ClientActorBehavior<T extends BackendInfo> extends
             conn.finishReplay(forwarder);
 
             // Make sure new lookups pick up the new connection
-            connections.replace(shard, conn, newConn);
-            LOG.debug("{}: replaced connection {} with {}", persistenceId(), conn, newConn);
+            if (!connections.replace(shard, conn, newConn)) {
+                final AbstractClientConnection<T> existing = connections.get(conn.cookie());
+                LOG.warn("{}: old connection {} does not match existing {}, new connection {} in limbo",
+                    persistenceId(), conn, existing, newConn);
+            } else {
+                LOG.info("{}: replaced connection {} with {}", persistenceId(), conn, newConn);
+            }
         } finally {
             connectionsLock.unlockWrite(stamp);
         }
     }
 
     void removeConnection(final AbstractClientConnection<?> conn) {
-        connections.remove(conn.cookie(), conn);
-        LOG.debug("{}: removed connection {}", persistenceId(), conn);
+        final long stamp = connectionsLock.writeLock();
+        try {
+            if (!connections.remove(conn.cookie(), conn)) {
+                final AbstractClientConnection<T> existing = connections.get(conn.cookie());
+                if (existing != null) {
+                    LOG.warn("{}: failed to remove connection {}, as it was superseded by {}", persistenceId(), conn,
+                        existing);
+                } else {
+                    LOG.warn("{}: failed to remove connection {}, as it was not tracked", persistenceId(), conn);
+                }
+            } else {
+                LOG.info("{}: removed connection {}", persistenceId(), conn);
+            }
+        } finally {
+            connectionsLock.unlockWrite(stamp);
+        }
     }
 
     @SuppressWarnings("unchecked")
     void reconnectConnection(final ConnectedClientConnection<?> oldConn,
             final ReconnectingClientConnection<?> newConn) {
         final ReconnectingClientConnection<T> conn = (ReconnectingClientConnection<T>)newConn;
-        connections.replace(oldConn.cookie(), (AbstractClientConnection<T>)oldConn, conn);
-        LOG.debug("{}: connection {} reconnecting as {}", persistenceId(), oldConn, newConn);
+        LOG.info("{}: connection {} reconnecting as {}", persistenceId(), oldConn, newConn);
+
+        final long stamp = connectionsLock.writeLock();
+        try {
+            final boolean replaced = connections.replace(oldConn.cookie(), (AbstractClientConnection<T>)oldConn, conn);
+            if (!replaced) {
+                final AbstractClientConnection<T> existing = connections.get(oldConn.cookie());
+                if (existing != null) {
+                    LOG.warn("{}: failed to replace connection {}, as it was superseded by {}", persistenceId(), conn,
+                        existing);
+                } else {
+                    LOG.warn("{}: failed to replace connection {}, as it was not tracked", persistenceId(), conn);
+                }
+            }
+        } finally {
+            connectionsLock.unlockWrite(stamp);
+        }
 
         final Long shard = oldConn.cookie();
+        LOG.info("{}: refreshing backend for shard {}", persistenceId(), shard);
         resolver().refreshBackendInfo(shard, conn.getBackendInfo().get()).whenComplete(
             (backend, failure) -> context().executeInActor(behavior -> {
                 backendConnectFinished(shard, conn, backend, failure);