BUG-8445: check sessionId before propagating failures
[controller.git] opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/ClientActorBehavior.java
/*
 * Copyright (c) 2016 Cisco Systems, Inc. and others.  All rights reserved.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Public License v1.0 which accompanies this distribution,
 * and is available at http://www.eclipse.org/legal/epl-v10.html
 */
package org.opendaylight.controller.cluster.access.client;

import com.google.common.annotations.Beta;
import com.google.common.base.Preconditions;
import com.google.common.base.Stopwatch;
import com.google.common.base.Verify;
import java.util.Collection;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import javax.annotation.concurrent.GuardedBy;
import org.opendaylight.controller.cluster.access.commands.NotLeaderException;
import org.opendaylight.controller.cluster.access.commands.OutOfSequenceEnvelopeException;
import org.opendaylight.controller.cluster.access.concepts.ClientIdentifier;
import org.opendaylight.controller.cluster.access.concepts.FailureEnvelope;
import org.opendaylight.controller.cluster.access.concepts.LocalHistoryIdentifier;
import org.opendaylight.controller.cluster.access.concepts.RequestException;
import org.opendaylight.controller.cluster.access.concepts.RequestFailure;
import org.opendaylight.controller.cluster.access.concepts.ResponseEnvelope;
import org.opendaylight.controller.cluster.access.concepts.RetiredGenerationException;
import org.opendaylight.controller.cluster.access.concepts.RuntimeRequestException;
import org.opendaylight.controller.cluster.access.concepts.SuccessEnvelope;
import org.opendaylight.controller.cluster.access.concepts.TransactionIdentifier;
import org.opendaylight.yangtools.concepts.Identifiable;
import org.opendaylight.yangtools.concepts.WritableIdentifier;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.concurrent.duration.FiniteDuration;

/**
 * A behavior which handles messages sent to an {@link AbstractClientActor}.
 *
 * @author Robert Varga
 */
@Beta
public abstract class ClientActorBehavior<T extends BackendInfo> extends
        RecoveredClientActorBehavior<ClientActorContext> implements Identifiable<ClientIdentifier> {
    /**
     * Connection reconnect cohort, driven by this class.
     */
    @FunctionalInterface
    protected interface ConnectionConnectCohort {
        /**
         * Finish the connection by replaying previous messages onto the new connection.
         *
         * @param enqueuedEntries Previously-enqueued entries
         * @return A {@link ReconnectForwarder} to handle any straggler messages which arrive after this method returns.
         */
        @Nonnull ReconnectForwarder finishReconnect(@Nonnull Collection<ConnectionEntry> enqueuedEntries);
    }
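
    /*
     * Illustrative sketch only, not part of the original class: a cohort is normally supplied by
     * connectionUp(ConnectedClientConnection) as a lambda which replays the enqueued entries onto the new
     * connection and returns the forwarder used for stragglers. Assuming a forwarder implementation such as
     * SimpleReconnectForwarder exists in this package and takes the successor connection, it could look like:
     *
     *     final ConnectionConnectCohort cohort = enqueuedEntries -> {
     *         // replay enqueuedEntries onto newConn here (implementation-specific)
     *         return new SimpleReconnectForwarder(newConn);
     *     };
     */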

    private static final Logger LOG = LoggerFactory.getLogger(ClientActorBehavior.class);
    private static final FiniteDuration RESOLVE_RETRY_DURATION = FiniteDuration.apply(5, TimeUnit.SECONDS);

    /**
     * Map of connections to the backend. This map is concurrent to allow lookups, but given the complex operations
     * involved in connection transitions it is protected by an {@link InversibleLock}. Write-side of the lock is
     * taken during connection transitions. Optimistic read-side of the lock is taken when new connections are
     * introduced into the map.
     *
     * <p>
     * The lock detects potential AB/BA deadlock scenarios and will force the reader side out by throwing
     * an {@link InversibleLockException} -- which must be propagated up, releasing locks as it propagates. The
     * initial entry point causing the conflicting lookup must then call
     * {@link InversibleLockException#awaitResolution()} before retrying the operation.
     */
    // TODO: it should be possible to move these two into ClientActorContext
    private final Map<Long, AbstractClientConnection<T>> connections = new ConcurrentHashMap<>();
    private final InversibleLock connectionsLock = new InversibleLock();
    private final BackendInfoResolver<T> resolver;

    protected ClientActorBehavior(@Nonnull final ClientActorContext context,
            @Nonnull final BackendInfoResolver<T> resolver) {
        super(context);
        this.resolver = Preconditions.checkNotNull(resolver);
    }

    @Override
    @Nonnull
    public final ClientIdentifier getIdentifier() {
        return context().getIdentifier();
    }

    /**
     * Get a connection to a shard.
     *
     * @param shard Shard cookie
     * @return Connection to a shard
     * @throws InversibleLockException if the shard is being reconnected
     */
    public final AbstractClientConnection<T> getConnection(final Long shard) {
        while (true) {
            final long stamp = connectionsLock.optimisticRead();
            final AbstractClientConnection<T> conn = connections.computeIfAbsent(shard, this::createConnection);
            if (connectionsLock.validate(stamp)) {
                // No write-lock in-between, return success
                return conn;
            }
        }
    }
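
    /*
     * Illustrative caller-side sketch only (not part of the original file): as documented above, getConnection()
     * may throw InversibleLockException while the shard is being reconnected. Per the javadoc on the connections
     * map, the caller is expected to propagate the exception up, releasing its own locks, then await resolution
     * and retry, roughly:
     *
     *     AbstractClientConnection<T> conn;
     *     while (true) {
     *         try {
     *             conn = getConnection(shard);
     *             break;
     *         } catch (InversibleLockException e) {
     *             e.awaitResolution();
     *         }
     *     }
     */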

    private AbstractClientConnection<T> getConnection(final ResponseEnvelope<?> response) {
        // Always called from actor context: no locking required
        return connections.get(extractCookie(response.getMessage().getTarget()));
    }

    @SuppressWarnings("unchecked")
    @Override
    final ClientActorBehavior<T> onReceiveCommand(final Object command) {
        if (command instanceof InternalCommand) {
            return ((InternalCommand<T>) command).execute(this);
        }
        if (command instanceof SuccessEnvelope) {
            return onRequestSuccess((SuccessEnvelope) command);
        }
        if (command instanceof FailureEnvelope) {
            return internalOnRequestFailure((FailureEnvelope) command);
        }

        return onCommand(command);
    }

    private static long extractCookie(final WritableIdentifier id) {
        if (id instanceof TransactionIdentifier) {
            return ((TransactionIdentifier) id).getHistoryId().getCookie();
        } else if (id instanceof LocalHistoryIdentifier) {
            return ((LocalHistoryIdentifier) id).getCookie();
        } else {
            throw new IllegalArgumentException("Unhandled identifier " + id);
        }
    }

    private void onResponse(final ResponseEnvelope<?> response) {
        final AbstractClientConnection<T> connection = getConnection(response);
        if (connection != null) {
            connection.receiveResponse(response);
        } else {
            LOG.info("{}: Ignoring unknown response {}", persistenceId(), response);
        }
    }

    private ClientActorBehavior<T> onRequestSuccess(final SuccessEnvelope success) {
        onResponse(success);
        return this;
    }

    private ClientActorBehavior<T> onRequestFailure(final FailureEnvelope failure) {
        onResponse(failure);
        return this;
    }

    private ClientActorBehavior<T> internalOnRequestFailure(final FailureEnvelope command) {
        final AbstractClientConnection<T> conn = getConnection(command);
        if (conn != null) {
            /*
             * We are talking to multiple actors, which may be lagging behind our state significantly. This has
             * the effect that we may be receiving responses from a previous connection after we have created a new
             * one to a different actor.
             *
             * Since we are already replaying requests to the new actor, we want to ignore errors reported on the old
             * connection -- for example NotLeaderException, which must not cause a new reconnect. Check the envelope's
             * sessionId and if it does not match our current connection just ignore it.
             */
            final Optional<T> optBackend = conn.getBackendInfo();
            if (optBackend.isPresent() && optBackend.get().getSessionId() != command.getSessionId()) {
                LOG.debug("{}: Mismatched current connection {} and envelope {}, ignoring response", persistenceId(),
                    conn, command);
                return this;
            }
        }

        final RequestFailure<?, ?> failure = command.getMessage();
        final RequestException cause = failure.getCause();
        if (cause instanceof RetiredGenerationException) {
            LOG.error("{}: current generation {} has been superseded", persistenceId(), getIdentifier(), cause);
            haltClient(cause);
            poison(cause);
            return null;
        }
        if (cause instanceof NotLeaderException) {
            if (conn instanceof ReconnectingClientConnection) {
                // Already reconnecting, do not churn the logs
                return this;
            } else if (conn != null) {
                LOG.info("{}: connection {} indicated no leadership, reconnecting it", persistenceId(), conn, cause);
                return conn.reconnect(this, cause);
            }
        }
        if (cause instanceof OutOfSequenceEnvelopeException) {
            if (conn instanceof ReconnectingClientConnection) {
                // Already reconnecting, do not churn the logs
                return this;
            } else if (conn != null) {
                LOG.info("{}: connection {} indicated sequencing mismatch on {} sequence {}, reconnecting it",
                    persistenceId(), conn, failure.getTarget(), failure.getSequence(), cause);
                return conn.reconnect(this, cause);
            }
        }

        return onRequestFailure(command);
    }

    private void poison(final RequestException cause) {
        final long stamp = connectionsLock.writeLock();
        try {
            for (AbstractClientConnection<T> q : connections.values()) {
                q.poison(cause);
            }

            connections.clear();
        } finally {
            connectionsLock.unlockWrite(stamp);
        }
    }

    /**
     * Halt And Catch Fire. Halt processing on this client. Implementations need to ensure they initiate state flush
     * procedures. No attempt to use this instance should be made after this method returns. Any such use may result
     * in undefined behavior.
     *
     * @param cause Failure cause
     */
    protected abstract void haltClient(@Nonnull Throwable cause);

    /**
     * Override this method to handle any command which is not handled by the base behavior.
     *
     * @param command the command to process
     * @return Next behavior to use, null if this actor should shut down.
     */
    @Nullable
    protected abstract ClientActorBehavior<T> onCommand(@Nonnull Object command);

    /**
     * Override this method to provide a backend resolver instance.
     *
     * @return a backend resolver instance
     */
    protected final @Nonnull BackendInfoResolver<T> resolver() {
        return resolver;
    }

    /**
     * Callback invoked when a new connection has been established. Implementations are expected to perform
     * preparatory tasks before the previous connection is frozen.
     *
     * @param newConn New connection
     * @return ConnectionConnectCohort which will be used to complete the process of bringing the connection up.
     */
    @GuardedBy("connectionsLock")
    @Nonnull protected abstract ConnectionConnectCohort connectionUp(@Nonnull ConnectedClientConnection<T> newConn);
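
    /*
     * Illustrative sketch only: a concrete subclass would typically freeze its per-shard client state here and
     * defer the actual replay to the returned cohort. Names other than connectionUp and ConnectionConnectCohort
     * below are hypothetical:
     *
     *     @Override
     *     protected ConnectionConnectCohort connectionUp(final ConnectedClientConnection<T> newConn) {
     *         freezeLocalState(newConn.cookie());                              // hypothetical helper
     *         return enqueued -> replayAndCreateForwarder(newConn, enqueued);  // hypothetical helper
     *     }
     */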

    private void backendConnectFinished(final Long shard, final AbstractClientConnection<T> conn,
            final T backend, final Throwable failure) {
        if (failure != null) {
            if (failure instanceof TimeoutException) {
                if (!conn.equals(connections.get(shard))) {
                    // AbstractClientConnection will remove itself when it decides there is no point in continuing,
                    // at which point we want to stop retrying
                    LOG.info("{}: stopping resolution of shard {} on stale connection {}", persistenceId(), shard, conn,
                        failure);
                    return;
                }

                LOG.debug("{}: timed out resolving shard {}, scheduling retry in {}", persistenceId(), shard,
                    RESOLVE_RETRY_DURATION, failure);
                context().executeInActor(b -> {
                    resolveConnection(shard, conn);
                    return b;
                }, RESOLVE_RETRY_DURATION);
                return;
            }

            LOG.error("{}: failed to resolve shard {}", persistenceId(), shard, failure);
            final RequestException cause;
            if (failure instanceof RequestException) {
                cause = (RequestException) failure;
            } else {
                cause = new RuntimeRequestException("Failed to resolve shard " + shard, failure);
            }

            conn.poison(cause);
            return;
        }

        LOG.info("{}: resolved shard {} to {}", persistenceId(), shard, backend);
        final long stamp = connectionsLock.writeLock();
        try {
            final Stopwatch sw = Stopwatch.createStarted();

            // Create a new connected connection
            final ConnectedClientConnection<T> newConn = new ConnectedClientConnection<>(conn.context(),
                    conn.cookie(), backend);
            LOG.info("{}: resolving connection {} to {}", persistenceId(), conn, newConn);

            // Start reconnecting without the old connection lock held
            final ConnectionConnectCohort cohort = Verify.verifyNotNull(connectionUp(newConn));

            // Lock the old connection and get a reference to its entries
            final Collection<ConnectionEntry> replayIterable = conn.startReplay();

            // Finish the connection attempt
            final ReconnectForwarder forwarder = Verify.verifyNotNull(cohort.finishReconnect(replayIterable));

            // Install the forwarder, unlocking the old connection
            conn.finishReplay(forwarder);

            // Make sure new lookups pick up the new connection
            if (!connections.replace(shard, conn, newConn)) {
                final AbstractClientConnection<T> existing = connections.get(conn.cookie());
                LOG.warn("{}: old connection {} does not match existing {}, new connection {} in limbo",
                    persistenceId(), conn, existing, newConn);
            } else {
                LOG.info("{}: replaced connection {} with {} in {}", persistenceId(), conn, newConn, sw);
            }
        } finally {
            connectionsLock.unlockWrite(stamp);
        }
    }

    void removeConnection(final AbstractClientConnection<?> conn) {
        final long stamp = connectionsLock.writeLock();
        try {
            if (!connections.remove(conn.cookie(), conn)) {
                final AbstractClientConnection<T> existing = connections.get(conn.cookie());
                if (existing != null) {
                    LOG.warn("{}: failed to remove connection {}, as it was superseded by {}", persistenceId(), conn,
                        existing);
                } else {
                    LOG.warn("{}: failed to remove connection {}, as it was not tracked", persistenceId(), conn);
                }
            } else {
                LOG.info("{}: removed connection {}", persistenceId(), conn);
            }
        } finally {
            connectionsLock.unlockWrite(stamp);
        }
    }

    @SuppressWarnings("unchecked")
    void reconnectConnection(final ConnectedClientConnection<?> oldConn,
            final ReconnectingClientConnection<?> newConn) {
        final ReconnectingClientConnection<T> conn = (ReconnectingClientConnection<T>)newConn;
        LOG.info("{}: connection {} reconnecting as {}", persistenceId(), oldConn, newConn);

        final long stamp = connectionsLock.writeLock();
        try {
            final boolean replaced = connections.replace(oldConn.cookie(), (AbstractClientConnection<T>)oldConn, conn);
            if (!replaced) {
                final AbstractClientConnection<T> existing = connections.get(oldConn.cookie());
                if (existing != null) {
                    LOG.warn("{}: failed to replace connection {}, as it was superseded by {}", persistenceId(), conn,
                        existing);
                } else {
                    LOG.warn("{}: failed to replace connection {}, as it was not tracked", persistenceId(), conn);
                }
            }
        } finally {
            connectionsLock.unlockWrite(stamp);
        }

        final Long shard = oldConn.cookie();
        LOG.info("{}: refreshing backend for shard {}", persistenceId(), shard);
        resolver().refreshBackendInfo(shard, conn.getBackendInfo().get()).whenComplete(
            (backend, failure) -> context().executeInActor(behavior -> {
                backendConnectFinished(shard, conn, backend, failure);
                return behavior;
            }));
    }

    private ConnectingClientConnection<T> createConnection(final Long shard) {
        final ConnectingClientConnection<T> conn = new ConnectingClientConnection<>(context(), shard);
        resolveConnection(shard, conn);
        return conn;
    }

    private void resolveConnection(final Long shard, final AbstractClientConnection<T> conn) {
        LOG.debug("{}: resolving shard {} connection {}", persistenceId(), shard, conn);
        resolver().getBackendInfo(shard).whenComplete((backend, failure) -> context().executeInActor(behavior -> {
            backendConnectFinished(shard, conn, backend, failure);
            return behavior;
        }));
    }
}