40f07f137a59712fe16da23b67d5e3ea4cdffbc2
[controller.git] / opendaylight / md-sal / cds-access-client / src / main / java / org / opendaylight / controller / cluster / access / client / ClientActorBehavior.java
1 /*
2  * Copyright (c) 2016 Cisco Systems, Inc. and others.  All rights reserved.
3  *
4  * This program and the accompanying materials are made available under the
5  * terms of the Eclipse Public License v1.0 which accompanies this distribution,
6  * and is available at http://www.eclipse.org/legal/epl-v10.html
7  */
8 package org.opendaylight.controller.cluster.access.client;
9
10 import static java.util.Objects.requireNonNull;
11
12 import com.google.common.base.Stopwatch;
13 import com.google.common.base.Verify;
14 import java.util.Collection;
15 import java.util.Map;
16 import java.util.Optional;
17 import java.util.concurrent.ConcurrentHashMap;
18 import java.util.concurrent.TimeUnit;
19 import java.util.concurrent.TimeoutException;
20 import org.checkerframework.checker.lock.qual.Holding;
21 import org.eclipse.jdt.annotation.NonNull;
22 import org.eclipse.jdt.annotation.Nullable;
23 import org.opendaylight.controller.cluster.access.commands.NotLeaderException;
24 import org.opendaylight.controller.cluster.access.commands.OutOfSequenceEnvelopeException;
25 import org.opendaylight.controller.cluster.access.concepts.ClientIdentifier;
26 import org.opendaylight.controller.cluster.access.concepts.FailureEnvelope;
27 import org.opendaylight.controller.cluster.access.concepts.LocalHistoryIdentifier;
28 import org.opendaylight.controller.cluster.access.concepts.RequestException;
29 import org.opendaylight.controller.cluster.access.concepts.RequestFailure;
30 import org.opendaylight.controller.cluster.access.concepts.ResponseEnvelope;
31 import org.opendaylight.controller.cluster.access.concepts.RetiredGenerationException;
32 import org.opendaylight.controller.cluster.access.concepts.RuntimeRequestException;
33 import org.opendaylight.controller.cluster.access.concepts.SuccessEnvelope;
34 import org.opendaylight.controller.cluster.access.concepts.TransactionIdentifier;
35 import org.opendaylight.controller.cluster.common.actor.Dispatchers.DispatcherType;
36 import org.opendaylight.controller.cluster.io.FileBackedOutputStreamFactory;
37 import org.opendaylight.controller.cluster.messaging.MessageAssembler;
38 import org.opendaylight.yangtools.concepts.Identifiable;
39 import org.opendaylight.yangtools.concepts.Identifier;
40 import org.opendaylight.yangtools.concepts.Registration;
41 import org.slf4j.Logger;
42 import org.slf4j.LoggerFactory;
43 import scala.concurrent.duration.FiniteDuration;
44
45 /**
46  * A behavior, which handles messages sent to a {@link AbstractClientActor}.
47  */
48 public abstract class ClientActorBehavior<T extends BackendInfo> extends
49         RecoveredClientActorBehavior<ClientActorContext> implements Identifiable<ClientIdentifier> {
50     /**
51      * Connection reconnect cohort, driven by this class.
52      */
53     @FunctionalInterface
54     protected interface ConnectionConnectCohort {
55         /**
56          * Finish the connection by replaying previous messages onto the new connection.
57          *
58          * @param enqueuedEntries Previously-enqueued entries
59          * @return A {@link ReconnectForwarder} to handle any straggler messages which arrive after this method returns.
60          */
61         @NonNull ReconnectForwarder finishReconnect(@NonNull Collection<ConnectionEntry> enqueuedEntries);
62     }
63
64     private static final Logger LOG = LoggerFactory.getLogger(ClientActorBehavior.class);
65     private static final FiniteDuration RESOLVE_RETRY_DURATION = FiniteDuration.apply(1, TimeUnit.SECONDS);
66
67     /**
68      * Map of connections to the backend. This map is concurrent to allow lookups, but given complex operations
69      * involved in connection transitions it is protected by a {@link InversibleLock}. Write-side of the lock is taken
70      * during connection transitions. Optimistic read-side of the lock is taken when new connections are introduced
71      * into the map.
72      *
73      * <p>
74      * The lock detects potential AB/BA deadlock scenarios and will force the reader side out by throwing
75      * a {@link InversibleLockException} -- which must be propagated up, releasing locks as it propagates. The initial
76      * entry point causing the the conflicting lookup must then call {@link InversibleLockException#awaitResolution()}
77      * before retrying the operation.
78      */
79     // TODO: it should be possible to move these two into ClientActorContext
80     private final Map<Long, AbstractClientConnection<T>> connections = new ConcurrentHashMap<>();
81     private final InversibleLock connectionsLock = new InversibleLock();
82     private final BackendInfoResolver<T> resolver;
83     private final MessageAssembler responseMessageAssembler;
84     private final Registration staleBackendInfoReg;
85
86     protected ClientActorBehavior(final @NonNull ClientActorContext context,
87             final @NonNull BackendInfoResolver<T> resolver) {
88         super(context);
89         this.resolver = requireNonNull(resolver);
90
91         final ClientActorConfig config = context.config();
92         responseMessageAssembler = MessageAssembler.builder().logContext(persistenceId())
93                 .fileBackedStreamFactory(new FileBackedOutputStreamFactory(config.getFileBackedStreamingThreshold(),
94                         config.getTempFileDirectory()))
95                 .assembledMessageCallback((message, sender) -> context.self().tell(message, sender)).build();
96
97         staleBackendInfoReg = resolver.notifyWhenBackendInfoIsStale(shard -> {
98             context().executeInActor(behavior -> {
99                 LOG.debug("BackendInfo for shard {} is now stale", shard);
100                 final AbstractClientConnection<T> conn = connections.get(shard);
101                 if (conn instanceof ConnectedClientConnection) {
102                     conn.reconnect(this, new BackendStaleException(shard));
103                 }
104                 return behavior;
105             });
106         });
107     }
108
109     @Override
110     public final ClientIdentifier getIdentifier() {
111         return context().getIdentifier();
112     }
113
114     @Override
115     public void close() {
116         super.close();
117         responseMessageAssembler.close();
118         staleBackendInfoReg.close();
119     }
120
121     /**
122      * Get a connection to a shard.
123      *
124      * @param shard Shard cookie
125      * @return Connection to a shard
126      * @throws InversibleLockException if the shard is being reconnected
127      */
128     public final AbstractClientConnection<T> getConnection(final Long shard) {
129         while (true) {
130             final long stamp = connectionsLock.optimisticRead();
131             final AbstractClientConnection<T> conn = connections.computeIfAbsent(shard, this::createConnection);
132             if (connectionsLock.validate(stamp)) {
133                 // No write-lock in-between, return success
134                 return conn;
135             }
136         }
137     }
138
139     private AbstractClientConnection<T> getConnection(final ResponseEnvelope<?> response) {
140         // Always called from actor context: no locking required
141         return connections.get(extractCookie(response.getMessage().getTarget()));
142     }
143
144     @SuppressWarnings("unchecked")
145     @Override
146     final ClientActorBehavior<T> onReceiveCommand(final Object command) {
147         if (command instanceof InternalCommand) {
148             return ((InternalCommand<T>) command).execute(this);
149         }
150
151         if (command instanceof SuccessEnvelope) {
152             return onRequestSuccess((SuccessEnvelope) command);
153         }
154
155         if (command instanceof FailureEnvelope) {
156             return internalOnRequestFailure((FailureEnvelope) command);
157         }
158
159         if (MessageAssembler.isHandledMessage(command)) {
160             context().dispatchers().getDispatcher(DispatcherType.Serialization).execute(
161                 () -> responseMessageAssembler.handleMessage(command, context().self()));
162             return this;
163         }
164
165         if (context().messageSlicer().handleMessage(command)) {
166             return this;
167         }
168
169         return onCommand(command);
170     }
171
172     private static long extractCookie(final Identifier id) {
173         if (id instanceof TransactionIdentifier) {
174             return ((TransactionIdentifier) id).getHistoryId().getCookie();
175         } else if (id instanceof LocalHistoryIdentifier) {
176             return ((LocalHistoryIdentifier) id).getCookie();
177         } else {
178             throw new IllegalArgumentException("Unhandled identifier " + id);
179         }
180     }
181
182     private void onResponse(final ResponseEnvelope<?> response) {
183         final AbstractClientConnection<T> connection = getConnection(response);
184         if (connection != null) {
185             connection.receiveResponse(response);
186         } else {
187             LOG.info("{}: Ignoring unknown response {}", persistenceId(), response);
188         }
189     }
190
191     private ClientActorBehavior<T> onRequestSuccess(final SuccessEnvelope success) {
192         onResponse(success);
193         return this;
194     }
195
196     private ClientActorBehavior<T> onRequestFailure(final FailureEnvelope failure) {
197         onResponse(failure);
198         return this;
199     }
200
201     private ClientActorBehavior<T> internalOnRequestFailure(final FailureEnvelope command) {
202         final AbstractClientConnection<T> conn = getConnection(command);
203         if (conn != null) {
204             /*
205              * We are talking to multiple actors, which may be lagging behind our state significantly. This has
206              * the effect that we may be receiving responses from a previous connection after we have created a new
207              * one to a different actor.
208              *
209              * Since we are already replaying requests to the new actor, we want to ignore errors reported on the old
210              * connection -- for example NotLeaderException, which must not cause a new reconnect. Check the envelope's
211              * sessionId and if it does not match our current connection just ignore it.
212              */
213             final Optional<T> optBackend = conn.getBackendInfo();
214             if (optBackend.isPresent() && optBackend.get().getSessionId() != command.getSessionId()) {
215                 LOG.debug("{}: Mismatched current connection {} and envelope {}, ignoring response", persistenceId(),
216                     conn, command);
217                 return this;
218             }
219         }
220
221         final RequestFailure<?, ?> failure = command.getMessage();
222         final RequestException cause = failure.getCause();
223         if (cause instanceof RetiredGenerationException) {
224             LOG.error("{}: current generation {} has been superseded", persistenceId(), getIdentifier(), cause);
225             haltClient(cause);
226             poison(cause);
227             return null;
228         }
229         if (cause instanceof NotLeaderException) {
230             if (conn instanceof ReconnectingClientConnection) {
231                 // Already reconnecting, do not churn the logs
232                 return this;
233             } else if (conn != null) {
234                 LOG.info("{}: connection {} indicated no leadership, reconnecting it", persistenceId(), conn, cause);
235                 return conn.reconnect(this, cause);
236             }
237         }
238         if (cause instanceof OutOfSequenceEnvelopeException) {
239             if (conn instanceof ReconnectingClientConnection) {
240                 // Already reconnecting, do not churn the logs
241                 return this;
242             } else if (conn != null) {
243                 LOG.info("{}: connection {} indicated sequencing mismatch on {} sequence {} ({}), reconnecting it",
244                     persistenceId(), conn, failure.getTarget(), failure.getSequence(), command.getTxSequence(), cause);
245                 return conn.reconnect(this, cause);
246             }
247         }
248
249         return onRequestFailure(command);
250     }
251
252     private void poison(final RequestException cause) {
253         final long stamp = connectionsLock.writeLock();
254         try {
255             for (AbstractClientConnection<T> q : connections.values()) {
256                 q.poison(cause);
257             }
258
259             connections.clear();
260         } finally {
261             connectionsLock.unlockWrite(stamp);
262         }
263
264         context().messageSlicer().close();
265     }
266
267     /**
268      * Halt And Catch Fire. Halt processing on this client. Implementations need to ensure they initiate state flush
269      * procedures. No attempt to use this instance should be made after this method returns. Any such use may result
270      * in undefined behavior.
271      *
272      * @param cause Failure cause
273      */
274     protected abstract void haltClient(@NonNull Throwable cause);
275
276     /**
277      * Override this method to handle any command which is not handled by the base behavior.
278      *
279      * @param command the command to process
280      * @return Next behavior to use, null if this actor should shut down.
281      */
282     protected abstract @Nullable ClientActorBehavior<T> onCommand(@NonNull Object command);
283
284     /**
285      * Override this method to provide a backend resolver instance.
286      *
287      * @return a backend resolver instance
288      */
289     protected final @NonNull BackendInfoResolver<T> resolver() {
290         return resolver;
291     }
292
293     /**
294      * Callback invoked when a new connection has been established. Implementations are expected perform preparatory
295      * tasks before the previous connection is frozen.
296      *
297      * @param newConn New connection
298      * @return ConnectionConnectCohort which will be used to complete the process of bringing the connection up.
299      */
300     @Holding("connectionsLock")
301     protected abstract @NonNull ConnectionConnectCohort connectionUp(@NonNull ConnectedClientConnection<T> newConn);
302
303     private void backendConnectFinished(final Long shard, final AbstractClientConnection<T> oldConn,
304             final T backend, final Throwable failure) {
305         if (failure != null) {
306             if (failure instanceof TimeoutException) {
307                 if (!oldConn.equals(connections.get(shard))) {
308                     // AbstractClientConnection will remove itself when it decides there is no point in continuing,
309                     // at which point we want to stop retrying
310                     LOG.info("{}: stopping resolution of shard {} on stale connection {}", persistenceId(), shard,
311                         oldConn, failure);
312                     return;
313                 }
314
315                 LOG.debug("{}: timed out resolving shard {}, scheduling retry in {}", persistenceId(), shard,
316                     RESOLVE_RETRY_DURATION, failure);
317                 context().executeInActor(b -> {
318                     resolveConnection(shard, oldConn);
319                     return b;
320                 }, RESOLVE_RETRY_DURATION);
321                 return;
322             }
323
324             LOG.error("{}: failed to resolve shard {}", persistenceId(), shard, failure);
325             final RequestException cause;
326             if (failure instanceof RequestException) {
327                 cause = (RequestException) failure;
328             } else {
329                 cause = new RuntimeRequestException("Failed to resolve shard " + shard, failure);
330             }
331
332             oldConn.poison(cause);
333             return;
334         }
335
336         LOG.info("{}: resolved shard {} to {}", persistenceId(), shard, backend);
337         final long stamp = connectionsLock.writeLock();
338         try {
339             final Stopwatch sw = Stopwatch.createStarted();
340
341             // Create a new connected connection
342             final ConnectedClientConnection<T> newConn = new ConnectedClientConnection<>(oldConn, backend);
343             LOG.info("{}: resolving connection {} to {}", persistenceId(), oldConn, newConn);
344
345             // Start reconnecting without the old connection lock held
346             final ConnectionConnectCohort cohort = Verify.verifyNotNull(connectionUp(newConn));
347
348             // Lock the old connection and get a reference to its entries
349             final Collection<ConnectionEntry> replayIterable = oldConn.startReplay();
350
351             // Finish the connection attempt
352             final ReconnectForwarder forwarder = Verify.verifyNotNull(cohort.finishReconnect(replayIterable));
353
354             // Cancel sleep debt after entries were replayed, before new connection starts receiving.
355             newConn.cancelDebt();
356
357             // Install the forwarder, unlocking the old connection
358             oldConn.finishReplay(forwarder);
359
360             // Make sure new lookups pick up the new connection
361             if (!connections.replace(shard, oldConn, newConn)) {
362                 final AbstractClientConnection<T> existing = connections.get(oldConn.cookie());
363                 LOG.warn("{}: old connection {} does not match existing {}, new connection {} in limbo",
364                     persistenceId(), oldConn, existing, newConn);
365             } else {
366                 LOG.info("{}: replaced connection {} with {} in {}", persistenceId(), oldConn, newConn, sw);
367             }
368         } finally {
369             connectionsLock.unlockWrite(stamp);
370         }
371     }
372
373     void removeConnection(final AbstractClientConnection<?> conn) {
374         final long stamp = connectionsLock.writeLock();
375         try {
376             if (!connections.remove(conn.cookie(), conn)) {
377                 final AbstractClientConnection<T> existing = connections.get(conn.cookie());
378                 if (existing != null) {
379                     LOG.warn("{}: failed to remove connection {}, as it was superseded by {}", persistenceId(), conn,
380                         existing);
381                 } else {
382                     LOG.warn("{}: failed to remove connection {}, as it was not tracked", persistenceId(), conn);
383                 }
384             } else {
385                 LOG.info("{}: removed connection {}", persistenceId(), conn);
386                 cancelSlicing(conn.cookie());
387             }
388         } finally {
389             connectionsLock.unlockWrite(stamp);
390         }
391     }
392
393     @SuppressWarnings("unchecked")
394     void reconnectConnection(final ConnectedClientConnection<?> oldConn,
395             final ReconnectingClientConnection<?> newConn) {
396         final ReconnectingClientConnection<T> conn = (ReconnectingClientConnection<T>)newConn;
397         LOG.info("{}: connection {} reconnecting as {}", persistenceId(), oldConn, newConn);
398
399         final long stamp = connectionsLock.writeLock();
400         try {
401             final boolean replaced = connections.replace(oldConn.cookie(), (AbstractClientConnection<T>)oldConn, conn);
402             if (!replaced) {
403                 final AbstractClientConnection<T> existing = connections.get(oldConn.cookie());
404                 if (existing != null) {
405                     LOG.warn("{}: failed to replace connection {}, as it was superseded by {}", persistenceId(), conn,
406                         existing);
407                 } else {
408                     LOG.warn("{}: failed to replace connection {}, as it was not tracked", persistenceId(), conn);
409                 }
410             } else {
411                 cancelSlicing(oldConn.cookie());
412             }
413         } finally {
414             connectionsLock.unlockWrite(stamp);
415         }
416
417         final Long shard = oldConn.cookie();
418         LOG.info("{}: refreshing backend for shard {}", persistenceId(), shard);
419         resolver().refreshBackendInfo(shard, conn.getBackendInfo().get()).whenComplete(
420             (backend, failure) -> context().executeInActor(behavior -> {
421                 backendConnectFinished(shard, conn, backend, failure);
422                 return behavior;
423             }));
424     }
425
426     private void cancelSlicing(final Long cookie) {
427         context().messageSlicer().cancelSlicing(id -> {
428             try {
429                 return cookie.equals(extractCookie(id));
430             } catch (IllegalArgumentException e) {
431                 LOG.debug("extractCookie failed while cancelling slicing for cookie {}", cookie, e);
432                 return false;
433             }
434         });
435     }
436
437     private ConnectingClientConnection<T> createConnection(final Long shard) {
438         final ConnectingClientConnection<T> conn = new ConnectingClientConnection<>(context(), shard,
439                 resolver().resolveCookieName(shard));
440         resolveConnection(shard, conn);
441         return conn;
442     }
443
444     private void resolveConnection(final Long shard, final AbstractClientConnection<T> conn) {
445         LOG.debug("{}: resolving shard {} connection {}", persistenceId(), shard, conn);
446         resolver().getBackendInfo(shard).whenComplete((backend, failure) -> context().executeInActor(behavior -> {
447             backendConnectFinished(shard, conn, backend, failure);
448             return behavior;
449         }));
450     }
451
452     private static class BackendStaleException extends RequestException {
453         private static final long serialVersionUID = 1L;
454
455         BackendStaleException(final Long shard) {
456             super("Backend for shard " + shard + " is stale");
457         }
458
459         @Override
460         public boolean isRetriable() {
461             return false;
462         }
463     }
464 }