ddf1dc190b8ed1a4ea3c70b2a1386a078f426053
[controller.git] / opendaylight / md-sal / cds-access-client / src / main / java / org / opendaylight / controller / cluster / access / client / ClientActorBehavior.java
1 /*
2  * Copyright (c) 2016 Cisco Systems, Inc. and others.  All rights reserved.
3  *
4  * This program and the accompanying materials are made available under the
5  * terms of the Eclipse Public License v1.0 which accompanies this distribution,
6  * and is available at http://www.eclipse.org/legal/epl-v10.html
7  */
8 package org.opendaylight.controller.cluster.access.client;
9
10 import com.google.common.annotations.Beta;
11 import com.google.common.base.Preconditions;
12 import com.google.common.base.Stopwatch;
13 import com.google.common.base.Verify;
14 import java.util.Collection;
15 import java.util.Map;
16 import java.util.Optional;
17 import java.util.concurrent.ConcurrentHashMap;
18 import java.util.concurrent.TimeUnit;
19 import java.util.concurrent.TimeoutException;
20 import javax.annotation.Nonnull;
21 import javax.annotation.Nullable;
22 import javax.annotation.concurrent.GuardedBy;
23 import org.opendaylight.controller.cluster.access.commands.NotLeaderException;
24 import org.opendaylight.controller.cluster.access.commands.OutOfSequenceEnvelopeException;
25 import org.opendaylight.controller.cluster.access.concepts.ClientIdentifier;
26 import org.opendaylight.controller.cluster.access.concepts.FailureEnvelope;
27 import org.opendaylight.controller.cluster.access.concepts.LocalHistoryIdentifier;
28 import org.opendaylight.controller.cluster.access.concepts.RequestException;
29 import org.opendaylight.controller.cluster.access.concepts.RequestFailure;
30 import org.opendaylight.controller.cluster.access.concepts.ResponseEnvelope;
31 import org.opendaylight.controller.cluster.access.concepts.RetiredGenerationException;
32 import org.opendaylight.controller.cluster.access.concepts.RuntimeRequestException;
33 import org.opendaylight.controller.cluster.access.concepts.SuccessEnvelope;
34 import org.opendaylight.controller.cluster.access.concepts.TransactionIdentifier;
35 import org.opendaylight.controller.cluster.common.actor.Dispatchers.DispatcherType;
36 import org.opendaylight.controller.cluster.io.FileBackedOutputStreamFactory;
37 import org.opendaylight.controller.cluster.messaging.MessageAssembler;
38 import org.opendaylight.yangtools.concepts.Identifiable;
39 import org.opendaylight.yangtools.concepts.Identifier;
40 import org.opendaylight.yangtools.concepts.Registration;
41 import org.slf4j.Logger;
42 import org.slf4j.LoggerFactory;
43 import scala.concurrent.duration.FiniteDuration;
44
45 /**
46  * A behavior, which handles messages sent to a {@link AbstractClientActor}.
47  *
48  * @author Robert Varga
49  */
50 @Beta
51 public abstract class ClientActorBehavior<T extends BackendInfo> extends
52         RecoveredClientActorBehavior<ClientActorContext> implements Identifiable<ClientIdentifier> {
    /**
     * Connection reconnect cohort, driven by this class. An instance is obtained from
     * {@link #connectionUp(ConnectedClientConnection)} and is asked to finish the reconnect once the entries of the
     * previous connection have been made available for replay.
     */
    @FunctionalInterface
    protected interface ConnectionConnectCohort {
        /**
         * Finish the connection by replaying previous messages onto the new connection.
         *
         * @param enqueuedEntries Previously-enqueued entries
         * @return A {@link ReconnectForwarder} to handle any straggler messages which arrive after this method returns.
         */
        @Nonnull ReconnectForwarder finishReconnect(@Nonnull Collection<ConnectionEntry> enqueuedEntries);
    }
66
    private static final Logger LOG = LoggerFactory.getLogger(ClientActorBehavior.class);
    // Delay before the resolution of a shard's backend is retried after it has timed out
    private static final FiniteDuration RESOLVE_RETRY_DURATION = FiniteDuration.apply(1, TimeUnit.SECONDS);

    /**
     * Map of connections to the backend. This map is concurrent to allow lookups, but given complex operations
     * involved in connection transitions it is protected by a {@link InversibleLock}. Write-side of the lock is taken
     * during connection transitions. Optimistic read-side of the lock is taken when new connections are introduced
     * into the map.
     *
     * <p>
     * The lock detects potential AB/BA deadlock scenarios and will force the reader side out by throwing
     * a {@link InversibleLockException} -- which must be propagated up, releasing locks as it propagates. The initial
     * entry point causing the conflicting lookup must then call {@link InversibleLockException#awaitResolution()}
     * before retrying the operation.
     */
    // TODO: it should be possible to move these two into ClientActorContext
    private final Map<Long, AbstractClientConnection<T>> connections = new ConcurrentHashMap<>();
    private final InversibleLock connectionsLock = new InversibleLock();
    // Resolver used to look up and refresh the backend information of a shard
    private final BackendInfoResolver<T> resolver;
    // Handles commands recognized by MessageAssembler.isHandledMessage(); its assembled-message callback sends
    // the resulting message back to this actor
    private final MessageAssembler responseMessageAssembler;
    // Registration of the stale-backend-info callback installed by the constructor, closed in close()
    private final Registration staleBackendInfoReg;
88
89     protected ClientActorBehavior(@Nonnull final ClientActorContext context,
90             @Nonnull final BackendInfoResolver<T> resolver) {
91         super(context);
92         this.resolver = Preconditions.checkNotNull(resolver);
93
94         final ClientActorConfig config = context.config();
95         responseMessageAssembler = MessageAssembler.builder().logContext(persistenceId())
96                 .fileBackedStreamFactory(new FileBackedOutputStreamFactory(config.getFileBackedStreamingThreshold(),
97                         config.getTempFileDirectory()))
98                 .assembledMessageCallback((message, sender) -> context.self().tell(message, sender)).build();
99
100         staleBackendInfoReg = resolver.notifyWhenBackendInfoIsStale(shard -> {
101             context().executeInActor(behavior -> {
102                 LOG.debug("BackendInfo for shard {} is now stale", shard);
103                 final AbstractClientConnection<T> conn = connections.get(shard);
104                 if (conn instanceof ConnectedClientConnection) {
105                     conn.reconnect(this, new BackendStaleException(shard));
106                 }
107                 return behavior;
108             });
109         });
110     }
111
    /**
     * Return the identifier of this client, as reported by the actor context.
     *
     * @return Client identifier obtained from {@code context().getIdentifier()}
     */
    @Override
    @Nonnull
    public final ClientIdentifier getIdentifier() {
        return context().getIdentifier();
    }
117
118     @Override
119     public void close() {
120         super.close();
121         responseMessageAssembler.close();
122         staleBackendInfoReg.close();
123     }
124
125     /**
126      * Get a connection to a shard.
127      *
128      * @param shard Shard cookie
129      * @return Connection to a shard
130      * @throws InversibleLockException if the shard is being reconnected
131      */
132     public final AbstractClientConnection<T> getConnection(final Long shard) {
133         while (true) {
134             final long stamp = connectionsLock.optimisticRead();
135             final AbstractClientConnection<T> conn = connections.computeIfAbsent(shard, this::createConnection);
136             if (connectionsLock.validate(stamp)) {
137                 // No write-lock in-between, return success
138                 return conn;
139             }
140         }
141     }
142
143     private AbstractClientConnection<T> getConnection(final ResponseEnvelope<?> response) {
144         // Always called from actor context: no locking required
145         return connections.get(extractCookie(response.getMessage().getTarget()));
146     }
147
148     @SuppressWarnings("unchecked")
149     @Override
150     final ClientActorBehavior<T> onReceiveCommand(final Object command) {
151         if (command instanceof InternalCommand) {
152             return ((InternalCommand<T>) command).execute(this);
153         }
154
155         if (command instanceof SuccessEnvelope) {
156             return onRequestSuccess((SuccessEnvelope) command);
157         }
158
159         if (command instanceof FailureEnvelope) {
160             return internalOnRequestFailure((FailureEnvelope) command);
161         }
162
163         if (MessageAssembler.isHandledMessage(command)) {
164             context().dispatchers().getDispatcher(DispatcherType.Serialization).execute(
165                 () -> responseMessageAssembler.handleMessage(command, context().self()));
166             return this;
167         }
168
169         if (context().messageSlicer().handleMessage(command)) {
170             return this;
171         }
172
173         return onCommand(command);
174     }
175
176     private static long extractCookie(final Identifier id) {
177         if (id instanceof TransactionIdentifier) {
178             return ((TransactionIdentifier) id).getHistoryId().getCookie();
179         } else if (id instanceof LocalHistoryIdentifier) {
180             return ((LocalHistoryIdentifier) id).getCookie();
181         } else {
182             throw new IllegalArgumentException("Unhandled identifier " + id);
183         }
184     }
185
186     private void onResponse(final ResponseEnvelope<?> response) {
187         final AbstractClientConnection<T> connection = getConnection(response);
188         if (connection != null) {
189             connection.receiveResponse(response);
190         } else {
191             LOG.info("{}: Ignoring unknown response {}", persistenceId(), response);
192         }
193     }
194
    /**
     * Route a success envelope to the connection it belongs to.
     *
     * @param success Success envelope
     * @return This behavior
     */
    private ClientActorBehavior<T> onRequestSuccess(final SuccessEnvelope success) {
        onResponse(success);
        return this;
    }
199
    /**
     * Route a failure envelope to the connection it belongs to. This is the fall-through used by
     * {@code internalOnRequestFailure} for failures it does not handle itself.
     *
     * @param failure Failure envelope
     * @return This behavior
     */
    private ClientActorBehavior<T> onRequestFailure(final FailureEnvelope failure) {
        onResponse(failure);
        return this;
    }
204
205     private ClientActorBehavior<T> internalOnRequestFailure(final FailureEnvelope command) {
206         final AbstractClientConnection<T> conn = getConnection(command);
207         if (conn != null) {
208             /*
209              * We are talking to multiple actors, which may be lagging behind our state significantly. This has
210              * the effect that we may be receiving responses from a previous connection after we have created a new
211              * one to a different actor.
212              *
213              * Since we are already replaying requests to the new actor, we want to ignore errors reported on the old
214              * connection -- for example NotLeaderException, which must not cause a new reconnect. Check the envelope's
215              * sessionId and if it does not match our current connection just ignore it.
216              */
217             final Optional<T> optBackend = conn.getBackendInfo();
218             if (optBackend.isPresent() && optBackend.get().getSessionId() != command.getSessionId()) {
219                 LOG.debug("{}: Mismatched current connection {} and envelope {}, ignoring response", persistenceId(),
220                     conn, command);
221                 return this;
222             }
223         }
224
225         final RequestFailure<?, ?> failure = command.getMessage();
226         final RequestException cause = failure.getCause();
227         if (cause instanceof RetiredGenerationException) {
228             LOG.error("{}: current generation {} has been superseded", persistenceId(), getIdentifier(), cause);
229             haltClient(cause);
230             poison(cause);
231             return null;
232         }
233         if (cause instanceof NotLeaderException) {
234             if (conn instanceof ReconnectingClientConnection) {
235                 // Already reconnecting, do not churn the logs
236                 return this;
237             } else if (conn != null) {
238                 LOG.info("{}: connection {} indicated no leadership, reconnecting it", persistenceId(), conn, cause);
239                 return conn.reconnect(this, cause);
240             }
241         }
242         if (cause instanceof OutOfSequenceEnvelopeException) {
243             if (conn instanceof ReconnectingClientConnection) {
244                 // Already reconnecting, do not churn the logs
245                 return this;
246             } else if (conn != null) {
247                 LOG.info("{}: connection {} indicated sequencing mismatch on {} sequence {} ({}), reconnecting it",
248                     persistenceId(), conn, failure.getTarget(), failure.getSequence(), command.getTxSequence(), cause);
249                 return conn.reconnect(this, cause);
250             }
251         }
252
253         return onRequestFailure(command);
254     }
255
256     private void poison(final RequestException cause) {
257         final long stamp = connectionsLock.writeLock();
258         try {
259             for (AbstractClientConnection<T> q : connections.values()) {
260                 q.poison(cause);
261             }
262
263             connections.clear();
264         } finally {
265             connectionsLock.unlockWrite(stamp);
266         }
267
268         context().messageSlicer().close();
269     }
270
    /**
     * Halt And Catch Fire. Halt processing on this client. Implementations need to ensure they initiate state flush
     * procedures. No attempt to use this instance should be made after this method returns. Any such use may result
     * in undefined behavior.
     *
     * <p>
     * This class invokes the method when a failure caused by a {@link RetiredGenerationException} is received, just
     * before all connections are poisoned.
     *
     * @param cause Failure cause
     */
    protected abstract void haltClient(@Nonnull Throwable cause);
279
    /**
     * Override this method to handle any command which is not handled by the base behavior. The base behavior
     * itself consumes internal commands, success and failure envelopes, and messages accepted by the message
     * assembler or the message slicer.
     *
     * @param command the command to process
     * @return Next behavior to use, null if this actor should shut down.
     */
    @Nullable
    protected abstract ClientActorBehavior<T> onCommand(@Nonnull Object command);
288
    /**
     * Return the backend resolver instance this behavior was constructed with. The method is final and therefore
     * cannot be overridden; the resolver is supplied through the constructor.
     *
     * @return a backend resolver instance
     */
    protected final @Nonnull BackendInfoResolver<T> resolver() {
        return resolver;
    }
297
    /**
     * Callback invoked when a new connection has been established. Implementations are expected to perform
     * preparatory tasks before the previous connection is frozen. This class invokes the callback while holding
     * the write side of the connections lock, before replay of the previous connection's entries is started.
     *
     * @param newConn New connection
     * @return ConnectionConnectCohort which will be used to complete the process of bringing the connection up.
     */
    @GuardedBy("connectionsLock")
    @Nonnull protected abstract ConnectionConnectCohort connectionUp(@Nonnull ConnectedClientConnection<T> newConn);
307
308     private void backendConnectFinished(final Long shard, final AbstractClientConnection<T> oldConn,
309             final T backend, final Throwable failure) {
310         if (failure != null) {
311             if (failure instanceof TimeoutException) {
312                 if (!oldConn.equals(connections.get(shard))) {
313                     // AbstractClientConnection will remove itself when it decides there is no point in continuing,
314                     // at which point we want to stop retrying
315                     LOG.info("{}: stopping resolution of shard {} on stale connection {}", persistenceId(), shard,
316                         oldConn, failure);
317                     return;
318                 }
319
320                 LOG.debug("{}: timed out resolving shard {}, scheduling retry in {}", persistenceId(), shard,
321                     RESOLVE_RETRY_DURATION, failure);
322                 context().executeInActor(b -> {
323                     resolveConnection(shard, oldConn);
324                     return b;
325                 }, RESOLVE_RETRY_DURATION);
326                 return;
327             }
328
329             LOG.error("{}: failed to resolve shard {}", persistenceId(), shard, failure);
330             final RequestException cause;
331             if (failure instanceof RequestException) {
332                 cause = (RequestException) failure;
333             } else {
334                 cause = new RuntimeRequestException("Failed to resolve shard " + shard, failure);
335             }
336
337             oldConn.poison(cause);
338             return;
339         }
340
341         LOG.info("{}: resolved shard {} to {}", persistenceId(), shard, backend);
342         final long stamp = connectionsLock.writeLock();
343         try {
344             final Stopwatch sw = Stopwatch.createStarted();
345
346             // Create a new connected connection
347             final ConnectedClientConnection<T> newConn = new ConnectedClientConnection<>(oldConn, backend);
348             LOG.info("{}: resolving connection {} to {}", persistenceId(), oldConn, newConn);
349
350             // Start reconnecting without the old connection lock held
351             final ConnectionConnectCohort cohort = Verify.verifyNotNull(connectionUp(newConn));
352
353             // Lock the old connection and get a reference to its entries
354             final Collection<ConnectionEntry> replayIterable = oldConn.startReplay();
355
356             // Finish the connection attempt
357             final ReconnectForwarder forwarder = Verify.verifyNotNull(cohort.finishReconnect(replayIterable));
358
359             // Cancel sleep debt after entries were replayed, before new connection starts receiving.
360             newConn.cancelDebt();
361
362             // Install the forwarder, unlocking the old connection
363             oldConn.finishReplay(forwarder);
364
365             // Make sure new lookups pick up the new connection
366             if (!connections.replace(shard, oldConn, newConn)) {
367                 final AbstractClientConnection<T> existing = connections.get(oldConn.cookie());
368                 LOG.warn("{}: old connection {} does not match existing {}, new connection {} in limbo",
369                     persistenceId(), oldConn, existing, newConn);
370             } else {
371                 LOG.info("{}: replaced connection {} with {} in {}", persistenceId(), oldConn, newConn, sw);
372             }
373         } finally {
374             connectionsLock.unlockWrite(stamp);
375         }
376     }
377
378     void removeConnection(final AbstractClientConnection<?> conn) {
379         final long stamp = connectionsLock.writeLock();
380         try {
381             if (!connections.remove(conn.cookie(), conn)) {
382                 final AbstractClientConnection<T> existing = connections.get(conn.cookie());
383                 if (existing != null) {
384                     LOG.warn("{}: failed to remove connection {}, as it was superseded by {}", persistenceId(), conn,
385                         existing);
386                 } else {
387                     LOG.warn("{}: failed to remove connection {}, as it was not tracked", persistenceId(), conn);
388                 }
389             } else {
390                 LOG.info("{}: removed connection {}", persistenceId(), conn);
391                 cancelSlicing(conn.cookie());
392             }
393         } finally {
394             connectionsLock.unlockWrite(stamp);
395         }
396     }
397
398     @SuppressWarnings("unchecked")
399     void reconnectConnection(final ConnectedClientConnection<?> oldConn,
400             final ReconnectingClientConnection<?> newConn) {
401         final ReconnectingClientConnection<T> conn = (ReconnectingClientConnection<T>)newConn;
402         LOG.info("{}: connection {} reconnecting as {}", persistenceId(), oldConn, newConn);
403
404         final long stamp = connectionsLock.writeLock();
405         try {
406             final boolean replaced = connections.replace(oldConn.cookie(), (AbstractClientConnection<T>)oldConn, conn);
407             if (!replaced) {
408                 final AbstractClientConnection<T> existing = connections.get(oldConn.cookie());
409                 if (existing != null) {
410                     LOG.warn("{}: failed to replace connection {}, as it was superseded by {}", persistenceId(), conn,
411                         existing);
412                 } else {
413                     LOG.warn("{}: failed to replace connection {}, as it was not tracked", persistenceId(), conn);
414                 }
415             } else {
416                 cancelSlicing(oldConn.cookie());
417             }
418         } finally {
419             connectionsLock.unlockWrite(stamp);
420         }
421
422         final Long shard = oldConn.cookie();
423         LOG.info("{}: refreshing backend for shard {}", persistenceId(), shard);
424         resolver().refreshBackendInfo(shard, conn.getBackendInfo().get()).whenComplete(
425             (backend, failure) -> context().executeInActor(behavior -> {
426                 backendConnectFinished(shard, conn, backend, failure);
427                 return behavior;
428             }));
429     }
430
431     private void cancelSlicing(final Long cookie) {
432         context().messageSlicer().cancelSlicing(id -> {
433             try {
434                 return cookie.equals(extractCookie(id));
435             } catch (IllegalArgumentException e) {
436                 LOG.debug("extractCookie failed while cancelling slicing for cookie {}", cookie, e);
437                 return false;
438             }
439         });
440     }
441
    /**
     * Create a new connecting connection for a shard and start resolving its backend. Used as the mapping function
     * when {@code getConnection(Long)} finds no tracked connection for the shard.
     *
     * @param shard Shard cookie
     * @return Newly-created connection
     */
    private ConnectingClientConnection<T> createConnection(final Long shard) {
        final ConnectingClientConnection<T> conn = new ConnectingClientConnection<>(context(), shard);
        resolveConnection(shard, conn);
        return conn;
    }
447
448     private void resolveConnection(final Long shard, final AbstractClientConnection<T> conn) {
449         LOG.debug("{}: resolving shard {} connection {}", persistenceId(), shard, conn);
450         resolver().getBackendInfo(shard).whenComplete((backend, failure) -> context().executeInActor(behavior -> {
451             backendConnectFinished(shard, conn, backend, failure);
452             return behavior;
453         }));
454     }
455
    /**
     * Cause used when a connected connection is reconnected because the resolver reported the backend information
     * of its shard as stale.
     */
    private static class BackendStaleException extends RequestException {
        private static final long serialVersionUID = 1L;

        /**
         * Create the exception for a shard.
         *
         * @param shard Shard cookie, included in the exception message
         */
        BackendStaleException(final Long shard) {
            super("Backend for shard " + shard + " is stale");
        }

        /**
         * This exception always reports itself as not retriable.
         *
         * @return false
         */
        @Override
        public boolean isRetriable() {
            return false;
        }
    }
468 }