/*
 * Copyright (c) 2016 Cisco Systems, Inc. and others. All rights reserved.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Public License v1.0 which accompanies this distribution,
 * and is available at http://www.eclipse.org/legal/epl-v10.html
 */
package org.opendaylight.controller.cluster.access.client;

import static java.util.Objects.requireNonNull;
import com.google.common.base.Stopwatch;
import com.google.common.base.Verify;
import java.util.Collection;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import org.checkerframework.checker.lock.qual.Holding;
import org.eclipse.jdt.annotation.NonNull;
import org.eclipse.jdt.annotation.Nullable;
import org.opendaylight.controller.cluster.access.commands.NotLeaderException;
import org.opendaylight.controller.cluster.access.commands.OutOfSequenceEnvelopeException;
import org.opendaylight.controller.cluster.access.concepts.ClientIdentifier;
import org.opendaylight.controller.cluster.access.concepts.FailureEnvelope;
import org.opendaylight.controller.cluster.access.concepts.LocalHistoryIdentifier;
import org.opendaylight.controller.cluster.access.concepts.RequestException;
import org.opendaylight.controller.cluster.access.concepts.RequestFailure;
import org.opendaylight.controller.cluster.access.concepts.ResponseEnvelope;
import org.opendaylight.controller.cluster.access.concepts.RetiredGenerationException;
import org.opendaylight.controller.cluster.access.concepts.RuntimeRequestException;
import org.opendaylight.controller.cluster.access.concepts.SuccessEnvelope;
import org.opendaylight.controller.cluster.access.concepts.TransactionIdentifier;
import org.opendaylight.controller.cluster.common.actor.Dispatchers.DispatcherType;
import org.opendaylight.controller.cluster.io.FileBackedOutputStreamFactory;
import org.opendaylight.controller.cluster.messaging.MessageAssembler;
import org.opendaylight.yangtools.concepts.Identifiable;
import org.opendaylight.yangtools.concepts.Identifier;
import org.opendaylight.yangtools.concepts.Registration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.concurrent.duration.FiniteDuration;
/**
 * A behavior, which handles messages sent to a {@link AbstractClientActor}.
 */
public abstract class ClientActorBehavior<T extends BackendInfo> extends
        RecoveredClientActorBehavior<ClientActorContext> implements Identifiable<ClientIdentifier> {
    /**
     * Connection reconnect cohort, driven by this class.
     */
    protected interface ConnectionConnectCohort {
        /**
         * Finish the connection by replaying previous messages onto the new connection.
         *
         * @param enqueuedEntries Previously-enqueued entries
         * @return A {@link ReconnectForwarder} to handle any straggler messages which arrive after this method
         *         returns.
         */
        @NonNull ReconnectForwarder finishReconnect(@NonNull Collection<ConnectionEntry> enqueuedEntries);
    }
    private static final Logger LOG = LoggerFactory.getLogger(ClientActorBehavior.class);
    private static final FiniteDuration RESOLVE_RETRY_DURATION = FiniteDuration.apply(1, TimeUnit.SECONDS);
    /**
     * Map of connections to the backend. This map is concurrent to allow lookups, but given the complex operations
     * involved in connection transitions it is protected by an {@link InversibleLock}. The write side of the lock
     * is taken during connection transitions; the optimistic read side is taken when new connections are introduced
     * into the map.
     *
     * <p>
     * The lock detects potential AB/BA deadlock scenarios and will force the reader side out by throwing
     * an {@link InversibleLockException} -- which must be propagated up, releasing locks as it propagates. The
     * initial entry point causing the conflicting lookup must then call
     * {@link InversibleLockException#awaitResolution()} before retrying the operation.
     */
    // TODO: it should be possible to move these two into ClientActorContext
    private final Map<Long, AbstractClientConnection<T>> connections = new ConcurrentHashMap<>();
    private final InversibleLock connectionsLock = new InversibleLock();
    private final BackendInfoResolver<T> resolver;
    private final MessageAssembler responseMessageAssembler;
    private final Registration staleBackendInfoReg;
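
    // The response message assembler reassembles sliced responses into complete messages and re-delivers them to
    // this actor; the stale-backend registration forces a reconnect of the affected connection whenever the
    // resolver reports its backend information as stale.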
    protected ClientActorBehavior(final @NonNull ClientActorContext context,
            final @NonNull BackendInfoResolver<T> resolver) {
        super(context);
        this.resolver = requireNonNull(resolver);

        final ClientActorConfig config = context.config();
        responseMessageAssembler = MessageAssembler.builder().logContext(persistenceId())
                .fileBackedStreamFactory(new FileBackedOutputStreamFactory(config.getFileBackedStreamingThreshold(),
                    config.getTempFileDirectory()))
                .assembledMessageCallback((message, sender) -> context.self().tell(message, sender)).build();

        staleBackendInfoReg = resolver.notifyWhenBackendInfoIsStale(shard -> {
            context().executeInActor(behavior -> {
                LOG.debug("BackendInfo for shard {} is now stale", shard);
                final AbstractClientConnection<T> conn = connections.get(shard);
                if (conn instanceof ConnectedClientConnection) {
                    conn.reconnect(this, new BackendStaleException(shard));
                }
                return behavior;
            });
        });
    }
    @Override
    public final ClientIdentifier getIdentifier() {
        return context().getIdentifier();
    }

    @Override
    public void close() {
        super.close();
        responseMessageAssembler.close();
        staleBackendInfoReg.close();
    }
    /**
     * Get a connection to a shard.
     *
     * @param shard Shard cookie
     * @return Connection to a shard
     * @throws InversibleLockException if the shard is being reconnected
     */
    public final AbstractClientConnection<T> getConnection(final Long shard) {
        while (true) {
            final long stamp = connectionsLock.optimisticRead();
            final AbstractClientConnection<T> conn = connections.computeIfAbsent(shard, this::createConnection);
            if (connectionsLock.validate(stamp)) {
                // No write-lock in-between, return success
                return conn;
            }
        }
    }
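
    // Illustrative sketch only (not part of this class): per the connections javadoc above, a caller whose lookup
    // is interrupted by an InversibleLockException is expected to release its own locks, wait out the conflicting
    // transition and retry, along the lines of:
    //
    //   while (true) {
    //       try {
    //           return behavior.getConnection(shard);
    //       } catch (InversibleLockException e) {
    //           e.awaitResolution();
    //       }
    //   }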
    private AbstractClientConnection<T> getConnection(final ResponseEnvelope<?> response) {
        // Always called from actor context: no locking required
        return connections.get(extractCookie(response.getMessage().getTarget()));
    }
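
    // Dispatch order: internal commands first, then success/failure response envelopes, then reassembly of sliced
    // responses, then messages belonging to the outbound message slicer; anything unrecognized is handed to the
    // concrete behavior via onCommand().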
    @SuppressWarnings("unchecked")
    @Override
    final ClientActorBehavior<T> onReceiveCommand(final Object command) {
        if (command instanceof InternalCommand) {
            return ((InternalCommand<T>) command).execute(this);
        }

        if (command instanceof SuccessEnvelope successEnvelope) {
            return onRequestSuccess(successEnvelope);
        }

        if (command instanceof FailureEnvelope failureEnvelope) {
            return internalOnRequestFailure(failureEnvelope);
        }

        if (MessageAssembler.isHandledMessage(command)) {
            context().dispatchers().getDispatcher(DispatcherType.Serialization).execute(
                () -> responseMessageAssembler.handleMessage(command, context().self()));
            return this;
        }

        if (context().messageSlicer().handleMessage(command)) {
            return this;
        }

        return onCommand(command);
    }
    private static long extractCookie(final Identifier id) {
        if (id instanceof TransactionIdentifier transactionId) {
            return transactionId.getHistoryId().getCookie();
        } else if (id instanceof LocalHistoryIdentifier historyId) {
            return historyId.getCookie();
        } else {
            throw new IllegalArgumentException("Unhandled identifier " + id);
        }
    }
    private void onResponse(final ResponseEnvelope<?> response) {
        final AbstractClientConnection<T> connection = getConnection(response);
        if (connection != null) {
            connection.receiveResponse(response);
        } else {
            LOG.info("{}: Ignoring unknown response {}", persistenceId(), response);
        }
    }
    private ClientActorBehavior<T> onRequestSuccess(final SuccessEnvelope success) {
        onResponse(success);
        return this;
    }

    private ClientActorBehavior<T> onRequestFailure(final FailureEnvelope failure) {
        onResponse(failure);
        return this;
    }
    private ClientActorBehavior<T> internalOnRequestFailure(final FailureEnvelope command) {
        final AbstractClientConnection<T> conn = getConnection(command);
        if (conn != null) {
            /*
             * We are talking to multiple actors, which may be lagging behind our state significantly. This has
             * the effect that we may be receiving responses from a previous connection after we have created a new
             * one to a different actor.
             *
             * Since we are already replaying requests to the new actor, we want to ignore errors reported on the
             * old connection -- for example NotLeaderException, which must not cause a new reconnect. Check the
             * envelope's sessionId and if it does not match our current connection just ignore it.
             */
            final Optional<T> optBackend = conn.getBackendInfo();
            if (optBackend.isPresent() && optBackend.orElseThrow().getSessionId() != command.getSessionId()) {
                LOG.debug("{}: Mismatched current connection {} and envelope {}, ignoring response", persistenceId(),
                    conn, command);
                return this;
            }
        }

        final RequestFailure<?, ?> failure = command.getMessage();
        final RequestException cause = failure.getCause();
        if (cause instanceof RetiredGenerationException) {
            LOG.error("{}: current generation {} has been superseded", persistenceId(), getIdentifier(), cause);
            haltClient(cause);
            poison(cause);
            return null;
        }
        if (cause instanceof NotLeaderException) {
            if (conn instanceof ReconnectingClientConnection) {
                // Already reconnecting, do not churn the logs
                return this;
            } else if (conn != null) {
                LOG.info("{}: connection {} indicated no leadership, reconnecting it", persistenceId(), conn, cause);
                return conn.reconnect(this, cause);
            }
        }
        if (cause instanceof OutOfSequenceEnvelopeException) {
            if (conn instanceof ReconnectingClientConnection) {
                // Already reconnecting, do not churn the logs
                return this;
            } else if (conn != null) {
                LOG.info("{}: connection {} indicated sequencing mismatch on {} sequence {} ({}), reconnecting it",
                    persistenceId(), conn, failure.getTarget(), failure.getSequence(), command.getTxSequence(),
                    cause);
                return conn.reconnect(this, cause);
            }
        }

        return onRequestFailure(command);
    }
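
    // Poison every tracked connection with the supplied cause, clear the connection map and shut down the message
    // slicer. Used when this client can no longer make progress, e.g. when its generation has been retired.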
    private void poison(final RequestException cause) {
        final long stamp = connectionsLock.writeLock();
        try {
            for (AbstractClientConnection<T> q : connections.values()) {
                q.poison(cause);
            }

            connections.clear();
        } finally {
            connectionsLock.unlockWrite(stamp);
        }

        context().messageSlicer().close();
    }
    /**
     * Halt And Catch Fire. Halt processing on this client. Implementations need to ensure they initiate state flush
     * procedures. No attempt to use this instance should be made after this method returns. Any such use may result
     * in undefined behavior.
     *
     * @param cause Failure cause
     */
    protected abstract void haltClient(@NonNull Throwable cause);
    /**
     * Override this method to handle any command which is not handled by the base behavior.
     *
     * @param command the command to process
     * @return Next behavior to use, null if this actor should shut down.
     */
    protected abstract @Nullable ClientActorBehavior<T> onCommand(@NonNull Object command);
    /**
     * Return the backend resolver instance used by this behavior.
     *
     * @return a backend resolver instance
     */
    protected final @NonNull BackendInfoResolver<T> resolver() {
        return resolver;
    }
    /**
     * Callback invoked when a new connection has been established. Implementations are expected to perform
     * preparatory tasks before the previous connection is frozen.
     *
     * @param newConn New connection
     * @return ConnectionConnectCohort which will be used to complete the process of bringing the connection up.
     */
    @Holding("connectionsLock")
    protected abstract @NonNull ConnectionConnectCohort connectionUp(@NonNull ConnectedClientConnection<T> newConn);
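
    // Completion callback of backend resolution. On failure it either schedules a retry (resolution timeouts) or
    // poisons the stale connection; on success it creates a ConnectedClientConnection under the write lock, lets
    // the subclass prepare via connectionUp(), replays queued entries and publishes the new connection in the map.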
    private void backendConnectFinished(final Long shard, final AbstractClientConnection<T> oldConn,
            final T backend, final Throwable failure) {
        if (failure != null) {
            if (failure instanceof TimeoutException) {
                if (!oldConn.equals(connections.get(shard))) {
                    // AbstractClientConnection will remove itself when it decides there is no point in continuing,
                    // at which point we want to stop retrying
                    LOG.info("{}: stopping resolution of shard {} on stale connection {}", persistenceId(), shard,
                        oldConn, failure);
                    return;
                }

                LOG.debug("{}: timed out resolving shard {}, scheduling retry in {}", persistenceId(), shard,
                    RESOLVE_RETRY_DURATION, failure);
                context().executeInActor(b -> {
                    resolveConnection(shard, oldConn);
                    return b;
                }, RESOLVE_RETRY_DURATION);
                return;
            }

            LOG.error("{}: failed to resolve shard {}", persistenceId(), shard, failure);
            final RequestException cause;
            if (failure instanceof RequestException requestException) {
                cause = requestException;
            } else {
                cause = new RuntimeRequestException("Failed to resolve shard " + shard, failure);
            }

            oldConn.poison(cause);
            return;
        }

        LOG.info("{}: resolved shard {} to {}", persistenceId(), shard, backend);
        final long stamp = connectionsLock.writeLock();
        try {
            final Stopwatch sw = Stopwatch.createStarted();

            // Create a new connected connection
            final ConnectedClientConnection<T> newConn = new ConnectedClientConnection<>(oldConn, backend);
            LOG.info("{}: resolving connection {} to {}", persistenceId(), oldConn, newConn);

            // Start reconnecting without the old connection lock held
            final ConnectionConnectCohort cohort = Verify.verifyNotNull(connectionUp(newConn));

            // Lock the old connection and get a reference to its entries
            final Collection<ConnectionEntry> replayIterable = oldConn.startReplay();

            // Finish the connection attempt
            final ReconnectForwarder forwarder = Verify.verifyNotNull(cohort.finishReconnect(replayIterable));

            // Cancel sleep debt after entries were replayed, before the new connection starts receiving.
            newConn.cancelDebt();

            // Install the forwarder, unlocking the old connection
            oldConn.finishReplay(forwarder);

            // Make sure new lookups pick up the new connection
            if (!connections.replace(shard, oldConn, newConn)) {
                final AbstractClientConnection<T> existing = connections.get(oldConn.cookie());
                LOG.warn("{}: old connection {} does not match existing {}, new connection {} in limbo",
                    persistenceId(), oldConn, existing, newConn);
            } else {
                LOG.info("{}: replaced connection {} with {} in {}", persistenceId(), oldConn, newConn, sw);
            }
        } finally {
            connectionsLock.unlockWrite(stamp);
        }
    }
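
    // Called by a connection which has taken itself out of service (see the stale-connection note above): drop it
    // from the map and cancel any in-flight slicing targeted at its cookie.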
    void removeConnection(final AbstractClientConnection<?> conn) {
        final long stamp = connectionsLock.writeLock();
        try {
            if (!connections.remove(conn.cookie(), conn)) {
                final AbstractClientConnection<T> existing = connections.get(conn.cookie());
                if (existing != null) {
                    LOG.warn("{}: failed to remove connection {}, as it was superseded by {}", persistenceId(), conn,
                        existing);
                } else {
                    LOG.warn("{}: failed to remove connection {}, as it was not tracked", persistenceId(), conn);
                }
            } else {
                LOG.info("{}: removed connection {}", persistenceId(), conn);
                cancelSlicing(conn.cookie());
            }
        } finally {
            connectionsLock.unlockWrite(stamp);
        }
    }
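
    // Invoked when a connected connection starts reconnecting: swap the map entry under the write lock, cancel
    // slicing for the cookie and ask the resolver to refresh backend information for the shard.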
    @SuppressWarnings("unchecked")
    void reconnectConnection(final ConnectedClientConnection<?> oldConn,
            final ReconnectingClientConnection<?> newConn) {
        final ReconnectingClientConnection<T> conn = (ReconnectingClientConnection<T>) newConn;
        LOG.info("{}: connection {} reconnecting as {}", persistenceId(), oldConn, newConn);

        final long stamp = connectionsLock.writeLock();
        try {
            final boolean replaced = connections.replace(oldConn.cookie(), (AbstractClientConnection<T>) oldConn,
                conn);
            if (!replaced) {
                final AbstractClientConnection<T> existing = connections.get(oldConn.cookie());
                if (existing != null) {
                    LOG.warn("{}: failed to replace connection {}, as it was superseded by {}", persistenceId(),
                        conn, existing);
                } else {
                    LOG.warn("{}: failed to replace connection {}, as it was not tracked", persistenceId(), conn);
                }
            } else {
                cancelSlicing(oldConn.cookie());
            }
        } finally {
            connectionsLock.unlockWrite(stamp);
        }

        final Long shard = oldConn.cookie();
        LOG.info("{}: refreshing backend for shard {}", persistenceId(), shard);
        resolver().refreshBackendInfo(shard, conn.getBackendInfo().orElseThrow()).whenComplete(
            (backend, failure) -> context().executeInActor(behavior -> {
                backendConnectFinished(shard, conn, backend, failure);
                return behavior;
            }));
    }
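
    // Abort any in-progress outbound slicing whose target identifier maps to the supplied connection cookie.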
    private void cancelSlicing(final Long cookie) {
        context().messageSlicer().cancelSlicing(id -> {
            try {
                return cookie.equals(extractCookie(id));
            } catch (IllegalArgumentException e) {
                LOG.debug("extractCookie failed while cancelling slicing for cookie {}", cookie, e);
                return false;
            }
        });
    }
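
    // Invoked from getConnection(Long) via computeIfAbsent(): create a connecting (unresolved) connection and kick
    // off asynchronous backend resolution for it.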
    private ConnectingClientConnection<T> createConnection(final Long shard) {
        final ConnectingClientConnection<T> conn = new ConnectingClientConnection<>(context(), shard,
                resolver().resolveCookieName(shard));
        resolveConnection(shard, conn);
        return conn;
    }
    private void resolveConnection(final Long shard, final AbstractClientConnection<T> conn) {
        LOG.debug("{}: resolving shard {} connection {}", persistenceId(), shard, conn);
        resolver().getBackendInfo(shard).whenComplete((backend, failure) -> context().executeInActor(behavior -> {
            backendConnectFinished(shard, conn, backend, failure);
            return behavior;
        }));
    }
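
    // Internal failure cause used when the resolver reports backend information for a shard as stale, which
    // triggers a reconnect of the affected connection (see the constructor's stale-backend callback).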
    private static class BackendStaleException extends RequestException {
        private static final long serialVersionUID = 1L;

        BackendStaleException(final Long shard) {
            super("Backend for shard " + shard + " is stale");
        }

        @Override
        public boolean isRetriable() {
            return false;
        }
    }
}