BUG-8402: Separate out OutOfOrderRequestException
controller.git: opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/ClientActorBehavior.java
/*
 * Copyright (c) 2016 Cisco Systems, Inc. and others.  All rights reserved.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Public License v1.0 which accompanies this distribution,
 * and is available at http://www.eclipse.org/legal/epl-v10.html
 */
package org.opendaylight.controller.cluster.access.client;

import com.google.common.annotations.Beta;
import com.google.common.base.Preconditions;
import com.google.common.base.Verify;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import javax.annotation.concurrent.GuardedBy;
import org.opendaylight.controller.cluster.access.commands.NotLeaderException;
import org.opendaylight.controller.cluster.access.commands.OutOfSequenceEnvelopeException;
import org.opendaylight.controller.cluster.access.concepts.ClientIdentifier;
import org.opendaylight.controller.cluster.access.concepts.FailureEnvelope;
import org.opendaylight.controller.cluster.access.concepts.LocalHistoryIdentifier;
import org.opendaylight.controller.cluster.access.concepts.RequestException;
import org.opendaylight.controller.cluster.access.concepts.RequestFailure;
import org.opendaylight.controller.cluster.access.concepts.ResponseEnvelope;
import org.opendaylight.controller.cluster.access.concepts.RetiredGenerationException;
import org.opendaylight.controller.cluster.access.concepts.RuntimeRequestException;
import org.opendaylight.controller.cluster.access.concepts.SuccessEnvelope;
import org.opendaylight.controller.cluster.access.concepts.TransactionIdentifier;
import org.opendaylight.yangtools.concepts.Identifiable;
import org.opendaylight.yangtools.concepts.WritableIdentifier;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.concurrent.duration.FiniteDuration;

/**
 * A behavior which handles messages sent to an {@link AbstractClientActor}.
 *
 * @author Robert Varga
 */
@Beta
public abstract class ClientActorBehavior<T extends BackendInfo> extends
        RecoveredClientActorBehavior<ClientActorContext> implements Identifiable<ClientIdentifier> {
    /**
     * Connection reconnect cohort, driven by this class.
     */
    @FunctionalInterface
    protected interface ConnectionConnectCohort {
        /**
         * Finish the connection by replaying previous messages onto the new connection.
         *
         * @param enqueuedEntries Previously-enqueued entries
         * @return A {@link ReconnectForwarder} to handle any straggler messages which arrive after this method returns.
         */
        @Nonnull ReconnectForwarder finishReconnect(@Nonnull Iterable<ConnectionEntry> enqueuedEntries);
    }
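
    // Illustrative sketch only (not part of the upstream class): a connectionUp() implementation is
    // expected to return a cohort which replays the enqueued entries onto the new connection and hands
    // back a ReconnectForwarder for any stragglers. Assuming hypothetical replayOnto() and
    // ExampleForwarder helpers, such a cohort could look roughly like:
    //
    //     return enqueuedEntries -> {
    //         enqueuedEntries.forEach(entry -> replayOnto(newConn, entry)); // replayOnto() is hypothetical
    //         return new ExampleForwarder(newConn);                         // ExampleForwarder is hypothetical
    //     };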

    private static final Logger LOG = LoggerFactory.getLogger(ClientActorBehavior.class);
    private static final FiniteDuration RESOLVE_RETRY_DURATION = FiniteDuration.apply(5, TimeUnit.SECONDS);

    /**
     * Map of connections to the backend. This map is concurrent to allow lookups, but given complex operations
     * involved in connection transitions it is protected by an {@link InversibleLock}. Write-side of the lock is taken
     * during connection transitions. Optimistic read-side of the lock is taken when new connections are introduced
     * into the map.
     *
     * <p>
     * The lock detects potential AB/BA deadlock scenarios and will force the reader side out by throwing
     * an {@link InversibleLockException} -- which must be propagated up, releasing locks as it propagates. The initial
     * entry point causing the conflicting lookup must then call {@link InversibleLockException#awaitResolution()}
     * before retrying the operation.
     */
    // TODO: it should be possible to move these two into ClientActorContext
    private final Map<Long, AbstractClientConnection<T>> connections = new ConcurrentHashMap<>();
    private final InversibleLock connectionsLock = new InversibleLock();
    private final BackendInfoResolver<T> resolver;

    protected ClientActorBehavior(@Nonnull final ClientActorContext context,
            @Nonnull final BackendInfoResolver<T> resolver) {
        super(context);
        this.resolver = Preconditions.checkNotNull(resolver);
    }

    @Override
    @Nonnull
    public final ClientIdentifier getIdentifier() {
        return context().getIdentifier();
    }

    /**
     * Get a connection to a shard.
     *
     * @param shard Shard cookie
     * @return Connection to a shard
     * @throws InversibleLockException if the shard is being reconnected
     */
    public final AbstractClientConnection<T> getConnection(final Long shard) {
        while (true) {
            final long stamp = connectionsLock.optimisticRead();
            final AbstractClientConnection<T> conn = connections.computeIfAbsent(shard, this::createConnection);
            if (connectionsLock.validate(stamp)) {
                // No write-lock in-between, return success
                return conn;
            }
        }
    }
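
    // Illustrative usage sketch (not part of the upstream class): callers performing lookups outside the
    // actor context are expected to honour the InversibleLockException contract documented above, i.e.
    // release their own locks, await resolution and retry. Assuming a hypothetical request-submission
    // step, that loop looks roughly like:
    //
    //     while (true) {
    //         try {
    //             final AbstractClientConnection<T> conn = behavior.getConnection(shard);
    //             // ... hypothetical: enqueue the request on conn ...
    //             return;
    //         } catch (InversibleLockException e) {
    //             e.awaitResolution();
    //         }
    //     }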

    private AbstractClientConnection<T> getConnection(final ResponseEnvelope<?> response) {
        // Always called from actor context: no locking required
        return connections.get(extractCookie(response.getMessage().getTarget()));
    }

    @SuppressWarnings("unchecked")
    @Override
    final ClientActorBehavior<T> onReceiveCommand(final Object command) {
        if (command instanceof InternalCommand) {
            return ((InternalCommand<T>) command).execute(this);
        }
        if (command instanceof SuccessEnvelope) {
            return onRequestSuccess((SuccessEnvelope) command);
        }
        if (command instanceof FailureEnvelope) {
            return internalOnRequestFailure((FailureEnvelope) command);
        }

        return onCommand(command);
    }

    private static long extractCookie(final WritableIdentifier id) {
        if (id instanceof TransactionIdentifier) {
            return ((TransactionIdentifier) id).getHistoryId().getCookie();
        } else if (id instanceof LocalHistoryIdentifier) {
            return ((LocalHistoryIdentifier) id).getCookie();
        } else {
            throw new IllegalArgumentException("Unhandled identifier " + id);
        }
    }

    private void onResponse(final ResponseEnvelope<?> response) {
        final AbstractClientConnection<T> connection = getConnection(response);
        if (connection != null) {
            connection.receiveResponse(response);
        } else {
            LOG.info("{}: Ignoring unknown response {}", persistenceId(), response);
        }
    }

    private ClientActorBehavior<T> onRequestSuccess(final SuccessEnvelope success) {
        onResponse(success);
        return this;
    }

    private ClientActorBehavior<T> onRequestFailure(final FailureEnvelope failure) {
        onResponse(failure);
        return this;
    }

    private ClientActorBehavior<T> internalOnRequestFailure(final FailureEnvelope command) {
        final RequestFailure<?, ?> failure = command.getMessage();
        final RequestException cause = failure.getCause();
        if (cause instanceof RetiredGenerationException) {
            LOG.error("{}: current generation {} has been superseded", persistenceId(), getIdentifier(), cause);
            haltClient(cause);
            poison(cause);
            return null;
        }
        if (cause instanceof NotLeaderException) {
            final AbstractClientConnection<T> conn = getConnection(command);
            if (conn instanceof ReconnectingClientConnection) {
                // Already reconnecting, do not churn the logs
                return this;
            } else if (conn != null) {
                LOG.info("{}: connection {} indicated no leadership, reconnecting it", persistenceId(), conn, cause);
                return conn.reconnect(this);
            }
        }
        if (cause instanceof OutOfSequenceEnvelopeException) {
            final AbstractClientConnection<T> conn = getConnection(command);
            if (conn instanceof ReconnectingClientConnection) {
                // Already reconnecting, do not churn the logs
                return this;
            } else if (conn != null) {
                LOG.info("{}: connection {} indicated sequencing mismatch on {} sequence {}, reconnecting it",
                    persistenceId(), conn, failure.getTarget(), failure.getSequence(), cause);
                return conn.reconnect(this);
            }
        }

        return onRequestFailure(command);
    }

    private void poison(final RequestException cause) {
        final long stamp = connectionsLock.writeLock();
        try {
            for (AbstractClientConnection<T> q : connections.values()) {
                q.poison(cause);
            }

            connections.clear();
        } finally {
            connectionsLock.unlockWrite(stamp);
        }
    }

    /**
     * Halt And Catch Fire. Halt processing on this client. Implementations need to ensure they initiate state flush
     * procedures. No attempt to use this instance should be made after this method returns. Any such use may result
     * in undefined behavior.
     *
     * @param cause Failure cause
     */
    protected abstract void haltClient(@Nonnull Throwable cause);

    /**
     * Override this method to handle any command which is not handled by the base behavior.
     *
     * @param command the command to process
     * @return Next behavior to use, null if this actor should shut down.
     */
    @Nullable
    protected abstract ClientActorBehavior<T> onCommand(@Nonnull Object command);

    /**
     * Override this method to provide a backend resolver instance.
     *
     * @return a backend resolver instance
     */
    protected final @Nonnull BackendInfoResolver<T> resolver() {
        return resolver;
    }

    /**
     * Callback invoked when a new connection has been established. Implementations are expected to perform
     * preparatory tasks before the previous connection is frozen.
     *
     * @param newConn New connection
     * @return ConnectionConnectCohort which will be used to complete the process of bringing the connection up.
     */
    @GuardedBy("connectionsLock")
    @Nonnull protected abstract ConnectionConnectCohort connectionUp(@Nonnull ConnectedClientConnection<T> newConn);

    private void backendConnectFinished(final Long shard, final AbstractClientConnection<T> conn,
            final T backend, final Throwable failure) {
        if (failure != null) {
            if (failure instanceof TimeoutException) {
                if (!conn.equals(connections.get(shard))) {
                    // AbstractClientConnection will remove itself when it decides there is no point in continuing,
                    // at which point we want to stop retrying
                    LOG.info("{}: stopping resolution of shard {} on stale connection {}", persistenceId(), shard, conn,
                        failure);
                    return;
                }

                LOG.debug("{}: timed out resolving shard {}, scheduling retry in {}", persistenceId(), shard,
                    RESOLVE_RETRY_DURATION, failure);
                context().executeInActor(b -> {
                    resolveConnection(shard, conn);
                    return b;
                }, RESOLVE_RETRY_DURATION);
                return;
            }

            LOG.error("{}: failed to resolve shard {}", persistenceId(), shard, failure);
            final RequestException cause;
            if (failure instanceof RequestException) {
                cause = (RequestException) failure;
            } else {
                cause = new RuntimeRequestException("Failed to resolve shard " + shard, failure);
            }

            conn.poison(cause);
            return;
        }

        LOG.info("{}: resolved shard {} to {}", persistenceId(), shard, backend);
        final long stamp = connectionsLock.writeLock();
        try {
            // Create a new connected connection
            final ConnectedClientConnection<T> newConn = new ConnectedClientConnection<>(conn.context(),
                    conn.cookie(), backend);
            LOG.info("{}: resolving connection {} to {}", persistenceId(), conn, newConn);

            // Start reconnecting without the old connection lock held
            final ConnectionConnectCohort cohort = Verify.verifyNotNull(connectionUp(newConn));

            // Lock the old connection and get a reference to its entries
            final Iterable<ConnectionEntry> replayIterable = conn.startReplay();

            // Finish the connection attempt
            final ReconnectForwarder forwarder = Verify.verifyNotNull(cohort.finishReconnect(replayIterable));

            // Install the forwarder, unlocking the old connection
            conn.finishReplay(forwarder);

            // Make sure new lookups pick up the new connection
            connections.replace(shard, conn, newConn);
            LOG.info("{}: replaced connection {} with {}", persistenceId(), conn, newConn);
        } finally {
            connectionsLock.unlockWrite(stamp);
        }
    }

    void removeConnection(final AbstractClientConnection<?> conn) {
        connections.remove(conn.cookie(), conn);
        LOG.debug("{}: removed connection {}", persistenceId(), conn);
    }

    @SuppressWarnings("unchecked")
    void reconnectConnection(final ConnectedClientConnection<?> oldConn,
            final ReconnectingClientConnection<?> newConn) {
        final ReconnectingClientConnection<T> conn = (ReconnectingClientConnection<T>) newConn;
        LOG.info("{}: connection {} reconnecting as {}", persistenceId(), oldConn, newConn);

        final boolean replaced = connections.replace(oldConn.cookie(), (AbstractClientConnection<T>) oldConn, conn);
        if (!replaced) {
            final AbstractClientConnection<T> existing = connections.get(oldConn.cookie());
            LOG.warn("{}: old connection {} does not match existing {}, new connection {} in limbo", persistenceId(),
                oldConn, existing, newConn);
        }

        final Long shard = oldConn.cookie();
        LOG.info("{}: refreshing backend for shard {}", persistenceId(), shard);
        resolver().refreshBackendInfo(shard, conn.getBackendInfo().get()).whenComplete(
            (backend, failure) -> context().executeInActor(behavior -> {
                backendConnectFinished(shard, conn, backend, failure);
                return behavior;
            }));
    }

    private ConnectingClientConnection<T> createConnection(final Long shard) {
        final ConnectingClientConnection<T> conn = new ConnectingClientConnection<>(context(), shard);
        resolveConnection(shard, conn);
        return conn;
    }

    private void resolveConnection(final Long shard, final AbstractClientConnection<T> conn) {
        LOG.debug("{}: resolving shard {} connection {}", persistenceId(), shard, conn);
        resolver().getBackendInfo(shard).whenComplete((backend, failure) -> context().executeInActor(behavior -> {
            backendConnectFinished(shard, conn, backend, failure);
            return behavior;
        }));
    }
}