BUG-8494: Cleanup clustering-it-provider
[controller.git] / opendaylight / md-sal / cds-access-client / src / main / java / org / opendaylight / controller / cluster / access / client / ClientActorBehavior.java
1 /*
2  * Copyright (c) 2016 Cisco Systems, Inc. and others.  All rights reserved.
3  *
4  * This program and the accompanying materials are made available under the
5  * terms of the Eclipse Public License v1.0 which accompanies this distribution,
6  * and is available at http://www.eclipse.org/legal/epl-v10.html
7  */
8 package org.opendaylight.controller.cluster.access.client;
9
10 import com.google.common.annotations.Beta;
11 import com.google.common.base.Preconditions;
12 import com.google.common.base.Stopwatch;
13 import com.google.common.base.Verify;
14 import java.util.Collection;
15 import java.util.Map;
16 import java.util.concurrent.ConcurrentHashMap;
17 import java.util.concurrent.TimeUnit;
18 import java.util.concurrent.TimeoutException;
19 import javax.annotation.Nonnull;
20 import javax.annotation.Nullable;
21 import javax.annotation.concurrent.GuardedBy;
22 import org.opendaylight.controller.cluster.access.commands.NotLeaderException;
23 import org.opendaylight.controller.cluster.access.commands.OutOfSequenceEnvelopeException;
24 import org.opendaylight.controller.cluster.access.concepts.ClientIdentifier;
25 import org.opendaylight.controller.cluster.access.concepts.FailureEnvelope;
26 import org.opendaylight.controller.cluster.access.concepts.LocalHistoryIdentifier;
27 import org.opendaylight.controller.cluster.access.concepts.RequestException;
28 import org.opendaylight.controller.cluster.access.concepts.RequestFailure;
29 import org.opendaylight.controller.cluster.access.concepts.ResponseEnvelope;
30 import org.opendaylight.controller.cluster.access.concepts.RetiredGenerationException;
31 import org.opendaylight.controller.cluster.access.concepts.RuntimeRequestException;
32 import org.opendaylight.controller.cluster.access.concepts.SuccessEnvelope;
33 import org.opendaylight.controller.cluster.access.concepts.TransactionIdentifier;
34 import org.opendaylight.yangtools.concepts.Identifiable;
35 import org.opendaylight.yangtools.concepts.WritableIdentifier;
36 import org.slf4j.Logger;
37 import org.slf4j.LoggerFactory;
38 import scala.concurrent.duration.FiniteDuration;
39
40 /**
41  * A behavior, which handles messages sent to a {@link AbstractClientActor}.
42  *
43  * @author Robert Varga
44  */
45 @Beta
46 public abstract class ClientActorBehavior<T extends BackendInfo> extends
47         RecoveredClientActorBehavior<ClientActorContext> implements Identifiable<ClientIdentifier> {
48     /**
49      * Connection reconnect cohort, driven by this class.
50      */
51     @FunctionalInterface
52     protected interface ConnectionConnectCohort {
53         /**
54          * Finish the connection by replaying previous messages onto the new connection.
55          *
56          * @param enqueuedEntries Previously-enqueued entries
57          * @return A {@link ReconnectForwarder} to handle any straggler messages which arrive after this method returns.
58          */
59         @Nonnull ReconnectForwarder finishReconnect(@Nonnull Collection<ConnectionEntry> enqueuedEntries);
60     }
61
62     private static final Logger LOG = LoggerFactory.getLogger(ClientActorBehavior.class);
63     private static final FiniteDuration RESOLVE_RETRY_DURATION = FiniteDuration.apply(5, TimeUnit.SECONDS);
64
65     /**
66      * Map of connections to the backend. This map is concurrent to allow lookups, but given complex operations
67      * involved in connection transitions it is protected by a {@link InversibleLock}. Write-side of the lock is taken
68      * during connection transitions. Optimistic read-side of the lock is taken when new connections are introduced
69      * into the map.
70      *
71      * <p>
72      * The lock detects potential AB/BA deadlock scenarios and will force the reader side out by throwing
73      * a {@link InversibleLockException} -- which must be propagated up, releasing locks as it propagates. The initial
74      * entry point causing the the conflicting lookup must then call {@link InversibleLockException#awaitResolution()}
75      * before retrying the operation.
76      */
77     // TODO: it should be possible to move these two into ClientActorContext
78     private final Map<Long, AbstractClientConnection<T>> connections = new ConcurrentHashMap<>();
79     private final InversibleLock connectionsLock = new InversibleLock();
80     private final BackendInfoResolver<T> resolver;
81
82     protected ClientActorBehavior(@Nonnull final ClientActorContext context,
83             @Nonnull final BackendInfoResolver<T> resolver) {
84         super(context);
85         this.resolver = Preconditions.checkNotNull(resolver);
86     }
87
88     @Override
89     @Nonnull
90     public final ClientIdentifier getIdentifier() {
91         return context().getIdentifier();
92     }
93
94     /**
95      * Get a connection to a shard.
96      *
97      * @param shard Shard cookie
98      * @return Connection to a shard
99      * @throws InversibleLockException if the shard is being reconnected
100      */
101     public final AbstractClientConnection<T> getConnection(final Long shard) {
102         while (true) {
103             final long stamp = connectionsLock.optimisticRead();
104             final AbstractClientConnection<T> conn = connections.computeIfAbsent(shard, this::createConnection);
105             if (connectionsLock.validate(stamp)) {
106                 // No write-lock in-between, return success
107                 return conn;
108             }
109         }
110     }
111
112     private AbstractClientConnection<T> getConnection(final ResponseEnvelope<?> response) {
113         // Always called from actor context: no locking required
114         return connections.get(extractCookie(response.getMessage().getTarget()));
115     }
116
117     @SuppressWarnings("unchecked")
118     @Override
119     final ClientActorBehavior<T> onReceiveCommand(final Object command) {
120         if (command instanceof InternalCommand) {
121             return ((InternalCommand<T>) command).execute(this);
122         }
123         if (command instanceof SuccessEnvelope) {
124             return onRequestSuccess((SuccessEnvelope) command);
125         }
126         if (command instanceof FailureEnvelope) {
127             return internalOnRequestFailure((FailureEnvelope) command);
128         }
129
130         return onCommand(command);
131     }
132
133     private static long extractCookie(final WritableIdentifier id) {
134         if (id instanceof TransactionIdentifier) {
135             return ((TransactionIdentifier) id).getHistoryId().getCookie();
136         } else if (id instanceof LocalHistoryIdentifier) {
137             return ((LocalHistoryIdentifier) id).getCookie();
138         } else {
139             throw new IllegalArgumentException("Unhandled identifier " + id);
140         }
141     }
142
143     private void onResponse(final ResponseEnvelope<?> response) {
144         final AbstractClientConnection<T> connection = getConnection(response);
145         if (connection != null) {
146             connection.receiveResponse(response);
147         } else {
148             LOG.info("{}: Ignoring unknown response {}", persistenceId(), response);
149         }
150     }
151
152     private ClientActorBehavior<T> onRequestSuccess(final SuccessEnvelope success) {
153         onResponse(success);
154         return this;
155     }
156
157     private ClientActorBehavior<T> onRequestFailure(final FailureEnvelope failure) {
158         onResponse(failure);
159         return this;
160     }
161
162     private ClientActorBehavior<T> internalOnRequestFailure(final FailureEnvelope command) {
163         final RequestFailure<?, ?> failure = command.getMessage();
164         final RequestException cause = failure.getCause();
165         if (cause instanceof RetiredGenerationException) {
166             LOG.error("{}: current generation {} has been superseded", persistenceId(), getIdentifier(), cause);
167             haltClient(cause);
168             poison(cause);
169             return null;
170         }
171         if (cause instanceof NotLeaderException) {
172             final AbstractClientConnection<T> conn = getConnection(command);
173             if (conn instanceof ReconnectingClientConnection) {
174                 // Already reconnecting, do not churn the logs
175                 return this;
176             } else if (conn != null) {
177                 LOG.info("{}: connection {} indicated no leadership, reconnecting it", persistenceId(), conn, cause);
178                 return conn.reconnect(this, cause);
179             }
180         }
181         if (cause instanceof OutOfSequenceEnvelopeException) {
182             final AbstractClientConnection<T> conn = getConnection(command);
183             if (conn instanceof ReconnectingClientConnection) {
184                 // Already reconnecting, do not churn the logs
185                 return this;
186             } else if (conn != null) {
187                 LOG.info("{}: connection {} indicated no sequencing mismatch on {} sequence {}, reconnecting it",
188                     persistenceId(), conn, failure.getTarget(), failure.getSequence(), cause);
189                 return conn.reconnect(this, cause);
190             }
191         }
192
193         return onRequestFailure(command);
194     }
195
196     private void poison(final RequestException cause) {
197         final long stamp = connectionsLock.writeLock();
198         try {
199             for (AbstractClientConnection<T> q : connections.values()) {
200                 q.poison(cause);
201             }
202
203             connections.clear();
204         } finally {
205             connectionsLock.unlockWrite(stamp);
206         }
207     }
208
209     /**
210      * Halt And Catch Fire. Halt processing on this client. Implementations need to ensure they initiate state flush
211      * procedures. No attempt to use this instance should be made after this method returns. Any such use may result
212      * in undefined behavior.
213      *
214      * @param cause Failure cause
215      */
216     protected abstract void haltClient(@Nonnull Throwable cause);
217
218     /**
219      * Override this method to handle any command which is not handled by the base behavior.
220      *
221      * @param command the command to process
222      * @return Next behavior to use, null if this actor should shut down.
223      */
224     @Nullable
225     protected abstract ClientActorBehavior<T> onCommand(@Nonnull Object command);
226
227     /**
228      * Override this method to provide a backend resolver instance.
229      *
230      * @return a backend resolver instance
231      */
232     protected final @Nonnull BackendInfoResolver<T> resolver() {
233         return resolver;
234     }
235
236     /**
237      * Callback invoked when a new connection has been established. Implementations are expected perform preparatory
238      * tasks before the previous connection is frozen.
239      *
240      * @param newConn New connection
241      * @return ConnectionConnectCohort which will be used to complete the process of bringing the connection up.
242      */
243     @GuardedBy("connectionsLock")
244     @Nonnull protected abstract ConnectionConnectCohort connectionUp(@Nonnull ConnectedClientConnection<T> newConn);
245
246     private void backendConnectFinished(final Long shard, final AbstractClientConnection<T> conn,
247             final T backend, final Throwable failure) {
248         if (failure != null) {
249             if (failure instanceof TimeoutException) {
250                 if (!conn.equals(connections.get(shard))) {
251                     // AbstractClientConnection will remove itself when it decides there is no point in continuing,
252                     // at which point we want to stop retrying
253                     LOG.info("{}: stopping resolution of shard {} on stale connection {}", persistenceId(), shard, conn,
254                         failure);
255                     return;
256                 }
257
258                 LOG.debug("{}: timed out resolving shard {}, scheduling retry in {}", persistenceId(), shard,
259                     RESOLVE_RETRY_DURATION, failure);
260                 context().executeInActor(b -> {
261                     resolveConnection(shard, conn);
262                     return b;
263                 }, RESOLVE_RETRY_DURATION);
264                 return;
265             }
266
267             LOG.error("{}: failed to resolve shard {}", persistenceId(), shard, failure);
268             final RequestException cause;
269             if (failure instanceof RequestException) {
270                 cause = (RequestException) failure;
271             } else {
272                 cause = new RuntimeRequestException("Failed to resolve shard " + shard, failure);
273             }
274
275             conn.poison(cause);
276             return;
277         }
278
279         LOG.info("{}: resolved shard {} to {}", persistenceId(), shard, backend);
280         final long stamp = connectionsLock.writeLock();
281         try {
282             final Stopwatch sw = Stopwatch.createStarted();
283
284             // Create a new connected connection
285             final ConnectedClientConnection<T> newConn = new ConnectedClientConnection<>(conn.context(),
286                     conn.cookie(), backend);
287             LOG.info("{}: resolving connection {} to {}", persistenceId(), conn, newConn);
288
289             // Start reconnecting without the old connection lock held
290             final ConnectionConnectCohort cohort = Verify.verifyNotNull(connectionUp(newConn));
291
292             // Lock the old connection and get a reference to its entries
293             final Collection<ConnectionEntry> replayIterable = conn.startReplay();
294
295             // Finish the connection attempt
296             final ReconnectForwarder forwarder = Verify.verifyNotNull(cohort.finishReconnect(replayIterable));
297
298             // Install the forwarder, unlocking the old connection
299             conn.finishReplay(forwarder);
300
301             // Make sure new lookups pick up the new connection
302             if (!connections.replace(shard, conn, newConn)) {
303                 final AbstractClientConnection<T> existing = connections.get(conn.cookie());
304                 LOG.warn("{}: old connection {} does not match existing {}, new connection {} in limbo",
305                     persistenceId(), conn, existing, newConn);
306             } else {
307                 LOG.info("{}: replaced connection {} with {} in {}", persistenceId(), conn, newConn, sw);
308             }
309         } finally {
310             connectionsLock.unlockWrite(stamp);
311         }
312     }
313
314     void removeConnection(final AbstractClientConnection<?> conn) {
315         final long stamp = connectionsLock.writeLock();
316         try {
317             if (!connections.remove(conn.cookie(), conn)) {
318                 final AbstractClientConnection<T> existing = connections.get(conn.cookie());
319                 if (existing != null) {
320                     LOG.warn("{}: failed to remove connection {}, as it was superseded by {}", persistenceId(), conn,
321                         existing);
322                 } else {
323                     LOG.warn("{}: failed to remove connection {}, as it was not tracked", persistenceId(), conn);
324                 }
325             } else {
326                 LOG.info("{}: removed connection {}", persistenceId(), conn);
327             }
328         } finally {
329             connectionsLock.unlockWrite(stamp);
330         }
331     }
332
333     @SuppressWarnings("unchecked")
334     void reconnectConnection(final ConnectedClientConnection<?> oldConn,
335             final ReconnectingClientConnection<?> newConn) {
336         final ReconnectingClientConnection<T> conn = (ReconnectingClientConnection<T>)newConn;
337         LOG.info("{}: connection {} reconnecting as {}", persistenceId(), oldConn, newConn);
338
339         final long stamp = connectionsLock.writeLock();
340         try {
341             final boolean replaced = connections.replace(oldConn.cookie(), (AbstractClientConnection<T>)oldConn, conn);
342             if (!replaced) {
343                 final AbstractClientConnection<T> existing = connections.get(oldConn.cookie());
344                 if (existing != null) {
345                     LOG.warn("{}: failed to replace connection {}, as it was superseded by {}", persistenceId(), conn,
346                         existing);
347                 } else {
348                     LOG.warn("{}: failed to replace connection {}, as it was not tracked", persistenceId(), conn);
349                 }
350             }
351         } finally {
352             connectionsLock.unlockWrite(stamp);
353         }
354
355         final Long shard = oldConn.cookie();
356         LOG.info("{}: refreshing backend for shard {}", persistenceId(), shard);
357         resolver().refreshBackendInfo(shard, conn.getBackendInfo().get()).whenComplete(
358             (backend, failure) -> context().executeInActor(behavior -> {
359                 backendConnectFinished(shard, conn, backend, failure);
360                 return behavior;
361             }));
362     }
363
364     private ConnectingClientConnection<T> createConnection(final Long shard) {
365         final ConnectingClientConnection<T> conn = new ConnectingClientConnection<>(context(), shard);
366         resolveConnection(shard, conn);
367         return conn;
368     }
369
370     private void resolveConnection(final Long shard, final AbstractClientConnection<T> conn) {
371         LOG.debug("{}: resolving shard {} connection {}", persistenceId(), shard, conn);
372         resolver().getBackendInfo(shard).whenComplete((backend, failure) -> context().executeInActor(behavior -> {
373             backendConnectFinished(shard, conn, backend, failure);
374             return behavior;
375         }));
376     }
377 }