BUG-8445: check sessionId before propagating failures
[controller.git] opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/ClientActorBehavior.java
/*
 * Copyright (c) 2016 Cisco Systems, Inc. and others.  All rights reserved.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Public License v1.0 which accompanies this distribution,
 * and is available at http://www.eclipse.org/legal/epl-v10.html
 */
package org.opendaylight.controller.cluster.access.client;

import com.google.common.annotations.Beta;
import com.google.common.base.Preconditions;
import com.google.common.base.Stopwatch;
import com.google.common.base.Verify;
import java.util.Collection;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import javax.annotation.concurrent.GuardedBy;
import org.opendaylight.controller.cluster.access.commands.NotLeaderException;
import org.opendaylight.controller.cluster.access.commands.OutOfSequenceEnvelopeException;
import org.opendaylight.controller.cluster.access.concepts.ClientIdentifier;
import org.opendaylight.controller.cluster.access.concepts.FailureEnvelope;
import org.opendaylight.controller.cluster.access.concepts.LocalHistoryIdentifier;
import org.opendaylight.controller.cluster.access.concepts.RequestException;
import org.opendaylight.controller.cluster.access.concepts.RequestFailure;
import org.opendaylight.controller.cluster.access.concepts.ResponseEnvelope;
import org.opendaylight.controller.cluster.access.concepts.RetiredGenerationException;
import org.opendaylight.controller.cluster.access.concepts.RuntimeRequestException;
import org.opendaylight.controller.cluster.access.concepts.SuccessEnvelope;
import org.opendaylight.controller.cluster.access.concepts.TransactionIdentifier;
import org.opendaylight.yangtools.concepts.Identifiable;
import org.opendaylight.yangtools.concepts.WritableIdentifier;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.concurrent.duration.FiniteDuration;

/**
 * A behavior which handles messages sent to an {@link AbstractClientActor}.
 *
 * @author Robert Varga
 */
@Beta
public abstract class ClientActorBehavior<T extends BackendInfo> extends
        RecoveredClientActorBehavior<ClientActorContext> implements Identifiable<ClientIdentifier> {
    /**
     * Connection reconnect cohort, driven by this class.
     */
    @FunctionalInterface
    protected interface ConnectionConnectCohort {
        /**
         * Finish the connection by replaying previous messages onto the new connection.
         *
         * @param enqueuedEntries Previously-enqueued entries
         * @return A {@link ReconnectForwarder} to handle any straggler messages which arrive after this method returns.
         */
        @Nonnull ReconnectForwarder finishReconnect(@Nonnull Collection<ConnectionEntry> enqueuedEntries);
    }
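
    /*
     * Illustrative sketch only, not part of the original class: a cohort is normally supplied by
     * connectionUp(ConnectedClientConnection) as a lambda which replays the enqueued entries onto the new
     * connection and returns the forwarder used for stragglers. Assuming a forwarder implementation such as
     * SimpleReconnectForwarder exists in this package and takes the successor connection, it could look like:
     *
     *     final ConnectionConnectCohort cohort = enqueuedEntries -> {
     *         // replay enqueuedEntries onto newConn here (implementation-specific)
     *         return new SimpleReconnectForwarder(newConn);
     *     };
     */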

    private static final Logger LOG = LoggerFactory.getLogger(ClientActorBehavior.class);
    private static final FiniteDuration RESOLVE_RETRY_DURATION = FiniteDuration.apply(5, TimeUnit.SECONDS);

    /**
     * Map of connections to the backend. This map is concurrent to allow lookups, but given the complex operations
     * involved in connection transitions it is protected by an {@link InversibleLock}. Write-side of the lock is
     * taken during connection transitions. Optimistic read-side of the lock is taken when new connections are
     * introduced into the map.
     *
     * <p>
     * The lock detects potential AB/BA deadlock scenarios and will force the reader side out by throwing
     * an {@link InversibleLockException} -- which must be propagated up, releasing locks as it propagates. The
     * initial entry point causing the conflicting lookup must then call
     * {@link InversibleLockException#awaitResolution()} before retrying the operation.
     */
    // TODO: it should be possible to move these two into ClientActorContext
    private final Map<Long, AbstractClientConnection<T>> connections = new ConcurrentHashMap<>();
    private final InversibleLock connectionsLock = new InversibleLock();
    private final BackendInfoResolver<T> resolver;

    protected ClientActorBehavior(@Nonnull final ClientActorContext context,
            @Nonnull final BackendInfoResolver<T> resolver) {
        super(context);
        this.resolver = Preconditions.checkNotNull(resolver);
    }

    @Override
    @Nonnull
    public final ClientIdentifier getIdentifier() {
        return context().getIdentifier();
    }

    /**
     * Get a connection to a shard.
     *
     * @param shard Shard cookie
     * @return Connection to a shard
     * @throws InversibleLockException if the shard is being reconnected
     */
    public final AbstractClientConnection<T> getConnection(final Long shard) {
        while (true) {
            final long stamp = connectionsLock.optimisticRead();
            final AbstractClientConnection<T> conn = connections.computeIfAbsent(shard, this::createConnection);
            if (connectionsLock.validate(stamp)) {
                // No write-lock in-between, return success
                return conn;
            }
        }
    }
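
    /*
     * Illustrative caller-side sketch only (not part of the original file): as documented above, getConnection()
     * may throw InversibleLockException while the shard is being reconnected. Per the javadoc on the connections
     * map, the caller is expected to propagate the exception up, releasing its own locks, then await resolution
     * and retry, roughly:
     *
     *     AbstractClientConnection<T> conn;
     *     while (true) {
     *         try {
     *             conn = getConnection(shard);
     *             break;
     *         } catch (InversibleLockException e) {
     *             e.awaitResolution();
     *         }
     *     }
     */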

    private AbstractClientConnection<T> getConnection(final ResponseEnvelope<?> response) {
        // Always called from actor context: no locking required
        return connections.get(extractCookie(response.getMessage().getTarget()));
    }

    @SuppressWarnings("unchecked")
    @Override
    final ClientActorBehavior<T> onReceiveCommand(final Object command) {
        if (command instanceof InternalCommand) {
            return ((InternalCommand<T>) command).execute(this);
        }
        if (command instanceof SuccessEnvelope) {
            return onRequestSuccess((SuccessEnvelope) command);
        }
        if (command instanceof FailureEnvelope) {
            return internalOnRequestFailure((FailureEnvelope) command);
        }

        return onCommand(command);
    }

    private static long extractCookie(final WritableIdentifier id) {
        if (id instanceof TransactionIdentifier) {
            return ((TransactionIdentifier) id).getHistoryId().getCookie();
        } else if (id instanceof LocalHistoryIdentifier) {
            return ((LocalHistoryIdentifier) id).getCookie();
        } else {
            throw new IllegalArgumentException("Unhandled identifier " + id);
        }
    }

    private void onResponse(final ResponseEnvelope<?> response) {
        final AbstractClientConnection<T> connection = getConnection(response);
        if (connection != null) {
            connection.receiveResponse(response);
        } else {
            LOG.info("{}: Ignoring unknown response {}", persistenceId(), response);
        }
    }

    private ClientActorBehavior<T> onRequestSuccess(final SuccessEnvelope success) {
        onResponse(success);
        return this;
    }

    private ClientActorBehavior<T> onRequestFailure(final FailureEnvelope failure) {
        onResponse(failure);
        return this;
    }

    private ClientActorBehavior<T> internalOnRequestFailure(final FailureEnvelope command) {
        final AbstractClientConnection<T> conn = getConnection(command);
        if (conn != null) {
            /*
             * We are talking to multiple actors, which may be lagging behind our state significantly. This has
             * the effect that we may be receiving responses from a previous connection after we have created a new
             * one to a different actor.
             *
             * Since we are already replaying requests to the new actor, we want to ignore errors reported on the old
             * connection -- for example NotLeaderException, which must not cause a new reconnect. Check the envelope's
             * sessionId and if it does not match our current connection just ignore it.
             */
            final Optional<T> optBackend = conn.getBackendInfo();
            if (optBackend.isPresent() && optBackend.get().getSessionId() != command.getSessionId()) {
                LOG.debug("{}: Mismatched current connection {} and envelope {}, ignoring response", persistenceId(),
                    conn, command);
                return this;
            }
        }

        final RequestFailure<?, ?> failure = command.getMessage();
        final RequestException cause = failure.getCause();
        if (cause instanceof RetiredGenerationException) {
            LOG.error("{}: current generation {} has been superseded", persistenceId(), getIdentifier(), cause);
            haltClient(cause);
            poison(cause);
            return null;
        }
        if (cause instanceof NotLeaderException) {
            if (conn instanceof ReconnectingClientConnection) {
                // Already reconnecting, do not churn the logs
                return this;
            } else if (conn != null) {
                LOG.info("{}: connection {} indicated no leadership, reconnecting it", persistenceId(), conn, cause);
                return conn.reconnect(this, cause);
            }
        }
        if (cause instanceof OutOfSequenceEnvelopeException) {
            if (conn instanceof ReconnectingClientConnection) {
                // Already reconnecting, do not churn the logs
                return this;
            } else if (conn != null) {
                LOG.info("{}: connection {} indicated sequencing mismatch on {} sequence {}, reconnecting it",
                    persistenceId(), conn, failure.getTarget(), failure.getSequence(), cause);
                return conn.reconnect(this, cause);
            }
        }

        return onRequestFailure(command);
    }

    private void poison(final RequestException cause) {
        final long stamp = connectionsLock.writeLock();
        try {
            for (AbstractClientConnection<T> q : connections.values()) {
                q.poison(cause);
            }

            connections.clear();
        } finally {
            connectionsLock.unlockWrite(stamp);
        }
    }

    /**
     * Halt And Catch Fire. Halt processing on this client. Implementations need to ensure they initiate state flush
     * procedures. No attempt to use this instance should be made after this method returns. Any such use may result
     * in undefined behavior.
     *
     * @param cause Failure cause
     */
    protected abstract void haltClient(@Nonnull Throwable cause);

    /**
     * Override this method to handle any command which is not handled by the base behavior.
     *
     * @param command the command to process
     * @return Next behavior to use, null if this actor should shut down.
     */
    @Nullable
    protected abstract ClientActorBehavior<T> onCommand(@Nonnull Object command);

    /**
     * Override this method to provide a backend resolver instance.
     *
     * @return a backend resolver instance
     */
    protected final @Nonnull BackendInfoResolver<T> resolver() {
        return resolver;
    }

    /**
     * Callback invoked when a new connection has been established. Implementations are expected to perform
     * preparatory tasks before the previous connection is frozen.
     *
     * @param newConn New connection
     * @return ConnectionConnectCohort which will be used to complete the process of bringing the connection up.
     */
    @GuardedBy("connectionsLock")
    @Nonnull protected abstract ConnectionConnectCohort connectionUp(@Nonnull ConnectedClientConnection<T> newConn);
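
    /*
     * Illustrative sketch only: a concrete subclass would typically freeze its per-shard client state here and
     * defer the actual replay to the returned cohort. Names other than connectionUp and ConnectionConnectCohort
     * below are hypothetical:
     *
     *     @Override
     *     protected ConnectionConnectCohort connectionUp(final ConnectedClientConnection<T> newConn) {
     *         freezeLocalState(newConn.cookie());                              // hypothetical helper
     *         return enqueued -> replayAndCreateForwarder(newConn, enqueued);  // hypothetical helper
     *     }
     */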

    private void backendConnectFinished(final Long shard, final AbstractClientConnection<T> conn,
            final T backend, final Throwable failure) {
        if (failure != null) {
            if (failure instanceof TimeoutException) {
                if (!conn.equals(connections.get(shard))) {
                    // AbstractClientConnection will remove itself when it decides there is no point in continuing,
                    // at which point we want to stop retrying
                    LOG.info("{}: stopping resolution of shard {} on stale connection {}", persistenceId(), shard, conn,
                        failure);
                    return;
                }

                LOG.debug("{}: timed out resolving shard {}, scheduling retry in {}", persistenceId(), shard,
                    RESOLVE_RETRY_DURATION, failure);
                context().executeInActor(b -> {
                    resolveConnection(shard, conn);
                    return b;
                }, RESOLVE_RETRY_DURATION);
                return;
            }

            LOG.error("{}: failed to resolve shard {}", persistenceId(), shard, failure);
            final RequestException cause;
            if (failure instanceof RequestException) {
                cause = (RequestException) failure;
            } else {
                cause = new RuntimeRequestException("Failed to resolve shard " + shard, failure);
            }

            conn.poison(cause);
            return;
        }

        LOG.info("{}: resolved shard {} to {}", persistenceId(), shard, backend);
        final long stamp = connectionsLock.writeLock();
        try {
            final Stopwatch sw = Stopwatch.createStarted();

            // Create a new connected connection
            final ConnectedClientConnection<T> newConn = new ConnectedClientConnection<>(conn.context(),
                    conn.cookie(), backend);
            LOG.info("{}: resolving connection {} to {}", persistenceId(), conn, newConn);

            // Start reconnecting without the old connection lock held
            final ConnectionConnectCohort cohort = Verify.verifyNotNull(connectionUp(newConn));

            // Lock the old connection and get a reference to its entries
            final Collection<ConnectionEntry> replayIterable = conn.startReplay();

            // Finish the connection attempt
            final ReconnectForwarder forwarder = Verify.verifyNotNull(cohort.finishReconnect(replayIterable));

            // Install the forwarder, unlocking the old connection
            conn.finishReplay(forwarder);

            // Make sure new lookups pick up the new connection
            if (!connections.replace(shard, conn, newConn)) {
                final AbstractClientConnection<T> existing = connections.get(conn.cookie());
                LOG.warn("{}: old connection {} does not match existing {}, new connection {} in limbo",
                    persistenceId(), conn, existing, newConn);
            } else {
                LOG.info("{}: replaced connection {} with {} in {}", persistenceId(), conn, newConn, sw);
            }
        } finally {
            connectionsLock.unlockWrite(stamp);
        }
    }

    void removeConnection(final AbstractClientConnection<?> conn) {
        final long stamp = connectionsLock.writeLock();
        try {
            if (!connections.remove(conn.cookie(), conn)) {
                final AbstractClientConnection<T> existing = connections.get(conn.cookie());
                if (existing != null) {
                    LOG.warn("{}: failed to remove connection {}, as it was superseded by {}", persistenceId(), conn,
                        existing);
                } else {
                    LOG.warn("{}: failed to remove connection {}, as it was not tracked", persistenceId(), conn);
                }
            } else {
                LOG.info("{}: removed connection {}", persistenceId(), conn);
            }
        } finally {
            connectionsLock.unlockWrite(stamp);
        }
    }

    @SuppressWarnings("unchecked")
    void reconnectConnection(final ConnectedClientConnection<?> oldConn,
            final ReconnectingClientConnection<?> newConn) {
        final ReconnectingClientConnection<T> conn = (ReconnectingClientConnection<T>)newConn;
        LOG.info("{}: connection {} reconnecting as {}", persistenceId(), oldConn, newConn);

        final long stamp = connectionsLock.writeLock();
        try {
            final boolean replaced = connections.replace(oldConn.cookie(), (AbstractClientConnection<T>)oldConn, conn);
            if (!replaced) {
                final AbstractClientConnection<T> existing = connections.get(oldConn.cookie());
                if (existing != null) {
                    LOG.warn("{}: failed to replace connection {}, as it was superseded by {}", persistenceId(), conn,
                        existing);
                } else {
                    LOG.warn("{}: failed to replace connection {}, as it was not tracked", persistenceId(), conn);
                }
            }
        } finally {
            connectionsLock.unlockWrite(stamp);
        }

        final Long shard = oldConn.cookie();
        LOG.info("{}: refreshing backend for shard {}", persistenceId(), shard);
        resolver().refreshBackendInfo(shard, conn.getBackendInfo().get()).whenComplete(
            (backend, failure) -> context().executeInActor(behavior -> {
                backendConnectFinished(shard, conn, backend, failure);
                return behavior;
            }));
    }

    private ConnectingClientConnection<T> createConnection(final Long shard) {
        final ConnectingClientConnection<T> conn = new ConnectingClientConnection<>(context(), shard);
        resolveConnection(shard, conn);
        return conn;
    }

    private void resolveConnection(final Long shard, final AbstractClientConnection<T> conn) {
        LOG.debug("{}: resolving shard {} connection {}", persistenceId(), shard, conn);
        resolver().getBackendInfo(shard).whenComplete((backend, failure) -> context().executeInActor(behavior -> {
            backendConnectFinished(shard, conn, backend, failure);
            return behavior;
        }));
    }
}