Slice front-end request messages
[controller.git] / opendaylight / md-sal / cds-access-client / src / main / java / org / opendaylight / controller / cluster / access / client / ClientActorBehavior.java
1 /*
2  * Copyright (c) 2016 Cisco Systems, Inc. and others.  All rights reserved.
3  *
4  * This program and the accompanying materials are made available under the
5  * terms of the Eclipse Public License v1.0 which accompanies this distribution,
6  * and is available at http://www.eclipse.org/legal/epl-v10.html
7  */
8 package org.opendaylight.controller.cluster.access.client;
9
10 import com.google.common.annotations.Beta;
11 import com.google.common.base.Preconditions;
12 import com.google.common.base.Stopwatch;
13 import com.google.common.base.Verify;
14 import java.util.Collection;
15 import java.util.Map;
16 import java.util.Optional;
17 import java.util.concurrent.ConcurrentHashMap;
18 import java.util.concurrent.TimeUnit;
19 import java.util.concurrent.TimeoutException;
20 import javax.annotation.Nonnull;
21 import javax.annotation.Nullable;
22 import javax.annotation.concurrent.GuardedBy;
23 import org.opendaylight.controller.cluster.access.commands.NotLeaderException;
24 import org.opendaylight.controller.cluster.access.commands.OutOfSequenceEnvelopeException;
25 import org.opendaylight.controller.cluster.access.concepts.ClientIdentifier;
26 import org.opendaylight.controller.cluster.access.concepts.FailureEnvelope;
27 import org.opendaylight.controller.cluster.access.concepts.LocalHistoryIdentifier;
28 import org.opendaylight.controller.cluster.access.concepts.RequestException;
29 import org.opendaylight.controller.cluster.access.concepts.RequestFailure;
30 import org.opendaylight.controller.cluster.access.concepts.ResponseEnvelope;
31 import org.opendaylight.controller.cluster.access.concepts.RetiredGenerationException;
32 import org.opendaylight.controller.cluster.access.concepts.RuntimeRequestException;
33 import org.opendaylight.controller.cluster.access.concepts.SuccessEnvelope;
34 import org.opendaylight.controller.cluster.access.concepts.TransactionIdentifier;
35 import org.opendaylight.controller.cluster.common.actor.Dispatchers.DispatcherType;
36 import org.opendaylight.controller.cluster.io.FileBackedOutputStreamFactory;
37 import org.opendaylight.controller.cluster.messaging.MessageAssembler;
38 import org.opendaylight.yangtools.concepts.Identifiable;
39 import org.opendaylight.yangtools.concepts.Identifier;
40 import org.slf4j.Logger;
41 import org.slf4j.LoggerFactory;
42 import scala.concurrent.duration.FiniteDuration;
43
44 /**
45  * A behavior, which handles messages sent to a {@link AbstractClientActor}.
46  *
47  * @author Robert Varga
48  */
49 @Beta
50 public abstract class ClientActorBehavior<T extends BackendInfo> extends
51         RecoveredClientActorBehavior<ClientActorContext> implements Identifiable<ClientIdentifier> {
52     /**
53      * Connection reconnect cohort, driven by this class.
54      */
55     @FunctionalInterface
56     protected interface ConnectionConnectCohort {
57         /**
58          * Finish the connection by replaying previous messages onto the new connection.
59          *
60          * @param enqueuedEntries Previously-enqueued entries
61          * @return A {@link ReconnectForwarder} to handle any straggler messages which arrive after this method returns.
62          */
63         @Nonnull ReconnectForwarder finishReconnect(@Nonnull Collection<ConnectionEntry> enqueuedEntries);
64     }
65
66     private static final Logger LOG = LoggerFactory.getLogger(ClientActorBehavior.class);
67     private static final FiniteDuration RESOLVE_RETRY_DURATION = FiniteDuration.apply(1, TimeUnit.SECONDS);
68
69     /**
70      * Map of connections to the backend. This map is concurrent to allow lookups, but given complex operations
71      * involved in connection transitions it is protected by a {@link InversibleLock}. Write-side of the lock is taken
72      * during connection transitions. Optimistic read-side of the lock is taken when new connections are introduced
73      * into the map.
74      *
75      * <p>
76      * The lock detects potential AB/BA deadlock scenarios and will force the reader side out by throwing
77      * a {@link InversibleLockException} -- which must be propagated up, releasing locks as it propagates. The initial
78      * entry point causing the the conflicting lookup must then call {@link InversibleLockException#awaitResolution()}
79      * before retrying the operation.
80      */
81     // TODO: it should be possible to move these two into ClientActorContext
82     private final Map<Long, AbstractClientConnection<T>> connections = new ConcurrentHashMap<>();
83     private final InversibleLock connectionsLock = new InversibleLock();
84     private final BackendInfoResolver<T> resolver;
85     private final MessageAssembler responseMessageAssembler;
86
87     protected ClientActorBehavior(@Nonnull final ClientActorContext context,
88             @Nonnull final BackendInfoResolver<T> resolver) {
89         super(context);
90         this.resolver = Preconditions.checkNotNull(resolver);
91
92         final ClientActorConfig config = context.config();
93         responseMessageAssembler = MessageAssembler.builder().logContext(persistenceId())
94                 .fileBackedStreamFactory(new FileBackedOutputStreamFactory(config.getFileBackedStreamingThreshold(),
95                         config.getTempFileDirectory()))
96                 .assembledMessageCallback((message, sender) -> context.self().tell(message, sender)).build();
97     }
98
99     @Override
100     @Nonnull
101     public final ClientIdentifier getIdentifier() {
102         return context().getIdentifier();
103     }
104
105     @Override
106     public void close() {
107         responseMessageAssembler.close();
108     }
109
110     /**
111      * Get a connection to a shard.
112      *
113      * @param shard Shard cookie
114      * @return Connection to a shard
115      * @throws InversibleLockException if the shard is being reconnected
116      */
117     public final AbstractClientConnection<T> getConnection(final Long shard) {
118         while (true) {
119             final long stamp = connectionsLock.optimisticRead();
120             final AbstractClientConnection<T> conn = connections.computeIfAbsent(shard, this::createConnection);
121             if (connectionsLock.validate(stamp)) {
122                 // No write-lock in-between, return success
123                 return conn;
124             }
125         }
126     }
127
128     private AbstractClientConnection<T> getConnection(final ResponseEnvelope<?> response) {
129         // Always called from actor context: no locking required
130         return connections.get(extractCookie(response.getMessage().getTarget()));
131     }
132
133     @SuppressWarnings("unchecked")
134     @Override
135     final ClientActorBehavior<T> onReceiveCommand(final Object command) {
136         if (command instanceof InternalCommand) {
137             return ((InternalCommand<T>) command).execute(this);
138         }
139
140         if (command instanceof SuccessEnvelope) {
141             return onRequestSuccess((SuccessEnvelope) command);
142         }
143
144         if (command instanceof FailureEnvelope) {
145             return internalOnRequestFailure((FailureEnvelope) command);
146         }
147
148         if (MessageAssembler.isHandledMessage(command)) {
149             context().dispatchers().getDispatcher(DispatcherType.Serialization).execute(
150                 () -> responseMessageAssembler.handleMessage(command, context().self()));
151             return this;
152         }
153
154         if (context().messageSlicer().handleMessage(command)) {
155             return this;
156         }
157
158         return onCommand(command);
159     }
160
161     private static long extractCookie(final Identifier id) {
162         if (id instanceof TransactionIdentifier) {
163             return ((TransactionIdentifier) id).getHistoryId().getCookie();
164         } else if (id instanceof LocalHistoryIdentifier) {
165             return ((LocalHistoryIdentifier) id).getCookie();
166         } else {
167             throw new IllegalArgumentException("Unhandled identifier " + id);
168         }
169     }
170
171     private void onResponse(final ResponseEnvelope<?> response) {
172         final AbstractClientConnection<T> connection = getConnection(response);
173         if (connection != null) {
174             connection.receiveResponse(response);
175         } else {
176             LOG.info("{}: Ignoring unknown response {}", persistenceId(), response);
177         }
178     }
179
180     private ClientActorBehavior<T> onRequestSuccess(final SuccessEnvelope success) {
181         onResponse(success);
182         return this;
183     }
184
185     private ClientActorBehavior<T> onRequestFailure(final FailureEnvelope failure) {
186         onResponse(failure);
187         return this;
188     }
189
190     private ClientActorBehavior<T> internalOnRequestFailure(final FailureEnvelope command) {
191         final AbstractClientConnection<T> conn = getConnection(command);
192         if (conn != null) {
193             /*
194              * We are talking to multiple actors, which may be lagging behind our state significantly. This has
195              * the effect that we may be receiving responses from a previous connection after we have created a new
196              * one to a different actor.
197              *
198              * Since we are already replaying requests to the new actor, we want to ignore errors reported on the old
199              * connection -- for example NotLeaderException, which must not cause a new reconnect. Check the envelope's
200              * sessionId and if it does not match our current connection just ignore it.
201              */
202             final Optional<T> optBackend = conn.getBackendInfo();
203             if (optBackend.isPresent() && optBackend.get().getSessionId() != command.getSessionId()) {
204                 LOG.debug("{}: Mismatched current connection {} and envelope {}, ignoring response", persistenceId(),
205                     conn, command);
206                 return this;
207             }
208         }
209
210         final RequestFailure<?, ?> failure = command.getMessage();
211         final RequestException cause = failure.getCause();
212         if (cause instanceof RetiredGenerationException) {
213             LOG.error("{}: current generation {} has been superseded", persistenceId(), getIdentifier(), cause);
214             haltClient(cause);
215             poison(cause);
216             return null;
217         }
218         if (cause instanceof NotLeaderException) {
219             if (conn instanceof ReconnectingClientConnection) {
220                 // Already reconnecting, do not churn the logs
221                 return this;
222             } else if (conn != null) {
223                 LOG.info("{}: connection {} indicated no leadership, reconnecting it", persistenceId(), conn, cause);
224                 return conn.reconnect(this, cause);
225             }
226         }
227         if (cause instanceof OutOfSequenceEnvelopeException) {
228             if (conn instanceof ReconnectingClientConnection) {
229                 // Already reconnecting, do not churn the logs
230                 return this;
231             } else if (conn != null) {
232                 LOG.info("{}: connection {} indicated sequencing mismatch on {} sequence {} ({}), reconnecting it",
233                     persistenceId(), conn, failure.getTarget(), failure.getSequence(), command.getTxSequence(), cause);
234                 return conn.reconnect(this, cause);
235             }
236         }
237
238         return onRequestFailure(command);
239     }
240
241     private void poison(final RequestException cause) {
242         final long stamp = connectionsLock.writeLock();
243         try {
244             for (AbstractClientConnection<T> q : connections.values()) {
245                 q.poison(cause);
246             }
247
248             connections.clear();
249         } finally {
250             connectionsLock.unlockWrite(stamp);
251         }
252
253         context().messageSlicer().close();
254     }
255
256     /**
257      * Halt And Catch Fire. Halt processing on this client. Implementations need to ensure they initiate state flush
258      * procedures. No attempt to use this instance should be made after this method returns. Any such use may result
259      * in undefined behavior.
260      *
261      * @param cause Failure cause
262      */
263     protected abstract void haltClient(@Nonnull Throwable cause);
264
265     /**
266      * Override this method to handle any command which is not handled by the base behavior.
267      *
268      * @param command the command to process
269      * @return Next behavior to use, null if this actor should shut down.
270      */
271     @Nullable
272     protected abstract ClientActorBehavior<T> onCommand(@Nonnull Object command);
273
274     /**
275      * Override this method to provide a backend resolver instance.
276      *
277      * @return a backend resolver instance
278      */
279     protected final @Nonnull BackendInfoResolver<T> resolver() {
280         return resolver;
281     }
282
283     /**
284      * Callback invoked when a new connection has been established. Implementations are expected perform preparatory
285      * tasks before the previous connection is frozen.
286      *
287      * @param newConn New connection
288      * @return ConnectionConnectCohort which will be used to complete the process of bringing the connection up.
289      */
290     @GuardedBy("connectionsLock")
291     @Nonnull protected abstract ConnectionConnectCohort connectionUp(@Nonnull ConnectedClientConnection<T> newConn);
292
293     private void backendConnectFinished(final Long shard, final AbstractClientConnection<T> oldConn,
294             final T backend, final Throwable failure) {
295         if (failure != null) {
296             if (failure instanceof TimeoutException) {
297                 if (!oldConn.equals(connections.get(shard))) {
298                     // AbstractClientConnection will remove itself when it decides there is no point in continuing,
299                     // at which point we want to stop retrying
300                     LOG.info("{}: stopping resolution of shard {} on stale connection {}", persistenceId(), shard,
301                         oldConn, failure);
302                     return;
303                 }
304
305                 LOG.debug("{}: timed out resolving shard {}, scheduling retry in {}", persistenceId(), shard,
306                     RESOLVE_RETRY_DURATION, failure);
307                 context().executeInActor(b -> {
308                     resolveConnection(shard, oldConn);
309                     return b;
310                 }, RESOLVE_RETRY_DURATION);
311                 return;
312             }
313
314             LOG.error("{}: failed to resolve shard {}", persistenceId(), shard, failure);
315             final RequestException cause;
316             if (failure instanceof RequestException) {
317                 cause = (RequestException) failure;
318             } else {
319                 cause = new RuntimeRequestException("Failed to resolve shard " + shard, failure);
320             }
321
322             oldConn.poison(cause);
323             return;
324         }
325
326         LOG.info("{}: resolved shard {} to {}", persistenceId(), shard, backend);
327         final long stamp = connectionsLock.writeLock();
328         try {
329             final Stopwatch sw = Stopwatch.createStarted();
330
331             // Create a new connected connection
332             final ConnectedClientConnection<T> newConn = new ConnectedClientConnection<>(oldConn, backend);
333             LOG.info("{}: resolving connection {} to {}", persistenceId(), oldConn, newConn);
334
335             // Start reconnecting without the old connection lock held
336             final ConnectionConnectCohort cohort = Verify.verifyNotNull(connectionUp(newConn));
337
338             // Lock the old connection and get a reference to its entries
339             final Collection<ConnectionEntry> replayIterable = oldConn.startReplay();
340
341             // Finish the connection attempt
342             final ReconnectForwarder forwarder = Verify.verifyNotNull(cohort.finishReconnect(replayIterable));
343
344             // Cancel sleep debt after entries were replayed, before new connection starts receiving.
345             newConn.cancelDebt();
346
347             // Install the forwarder, unlocking the old connection
348             oldConn.finishReplay(forwarder);
349
350             // Make sure new lookups pick up the new connection
351             if (!connections.replace(shard, oldConn, newConn)) {
352                 final AbstractClientConnection<T> existing = connections.get(oldConn.cookie());
353                 LOG.warn("{}: old connection {} does not match existing {}, new connection {} in limbo",
354                     persistenceId(), oldConn, existing, newConn);
355             } else {
356                 LOG.info("{}: replaced connection {} with {} in {}", persistenceId(), oldConn, newConn, sw);
357             }
358         } finally {
359             connectionsLock.unlockWrite(stamp);
360         }
361     }
362
363     void removeConnection(final AbstractClientConnection<?> conn) {
364         final long stamp = connectionsLock.writeLock();
365         try {
366             if (!connections.remove(conn.cookie(), conn)) {
367                 final AbstractClientConnection<T> existing = connections.get(conn.cookie());
368                 if (existing != null) {
369                     LOG.warn("{}: failed to remove connection {}, as it was superseded by {}", persistenceId(), conn,
370                         existing);
371                 } else {
372                     LOG.warn("{}: failed to remove connection {}, as it was not tracked", persistenceId(), conn);
373                 }
374             } else {
375                 LOG.info("{}: removed connection {}", persistenceId(), conn);
376                 cancelSlicing(conn.cookie());
377             }
378         } finally {
379             connectionsLock.unlockWrite(stamp);
380         }
381     }
382
383     @SuppressWarnings("unchecked")
384     void reconnectConnection(final ConnectedClientConnection<?> oldConn,
385             final ReconnectingClientConnection<?> newConn) {
386         final ReconnectingClientConnection<T> conn = (ReconnectingClientConnection<T>)newConn;
387         LOG.info("{}: connection {} reconnecting as {}", persistenceId(), oldConn, newConn);
388
389         final long stamp = connectionsLock.writeLock();
390         try {
391             final boolean replaced = connections.replace(oldConn.cookie(), (AbstractClientConnection<T>)oldConn, conn);
392             if (!replaced) {
393                 final AbstractClientConnection<T> existing = connections.get(oldConn.cookie());
394                 if (existing != null) {
395                     LOG.warn("{}: failed to replace connection {}, as it was superseded by {}", persistenceId(), conn,
396                         existing);
397                 } else {
398                     LOG.warn("{}: failed to replace connection {}, as it was not tracked", persistenceId(), conn);
399                 }
400             } else {
401                 cancelSlicing(oldConn.cookie());
402             }
403         } finally {
404             connectionsLock.unlockWrite(stamp);
405         }
406
407         final Long shard = oldConn.cookie();
408         LOG.info("{}: refreshing backend for shard {}", persistenceId(), shard);
409         resolver().refreshBackendInfo(shard, conn.getBackendInfo().get()).whenComplete(
410             (backend, failure) -> context().executeInActor(behavior -> {
411                 backendConnectFinished(shard, conn, backend, failure);
412                 return behavior;
413             }));
414     }
415
416     private void cancelSlicing(final Long cookie) {
417         context().messageSlicer().cancelSlicing(id -> {
418             try {
419                 return cookie.equals(extractCookie(id));
420             } catch (IllegalArgumentException e) {
421                 LOG.debug("extractCookie failed while cancelling slicing for cookie {}: {}", cookie, e);
422                 return false;
423             }
424         });
425     }
426
427     private ConnectingClientConnection<T> createConnection(final Long shard) {
428         final ConnectingClientConnection<T> conn = new ConnectingClientConnection<>(context(), shard);
429         resolveConnection(shard, conn);
430         return conn;
431     }
432
433     private void resolveConnection(final Long shard, final AbstractClientConnection<T> conn) {
434         LOG.debug("{}: resolving shard {} connection {}", persistenceId(), shard, conn);
435         resolver().getBackendInfo(shard).whenComplete((backend, failure) -> context().executeInActor(behavior -> {
436             backendConnectFinished(shard, conn, backend, failure);
437             return behavior;
438         }));
439     }
440 }

©2013 OpenDaylight, A Linux Foundation Collaborative Project. All Rights Reserved.
OpenDaylight is a registered trademark of The OpenDaylight Project, Inc.
Linux Foundation and OpenDaylight are registered trademarks of the Linux Foundation.
Linux is a registered trademark of Linus Torvalds.