BUG-8403: guard against ConcurrentModificationException
[controller.git] / opendaylight / md-sal / cds-access-client / src / main / java / org / opendaylight / controller / cluster / access / client / ClientActorBehavior.java
1 /*
2  * Copyright (c) 2016 Cisco Systems, Inc. and others.  All rights reserved.
3  *
4  * This program and the accompanying materials are made available under the
5  * terms of the Eclipse Public License v1.0 which accompanies this distribution,
6  * and is available at http://www.eclipse.org/legal/epl-v10.html
7  */
8 package org.opendaylight.controller.cluster.access.client;
9
10 import com.google.common.annotations.Beta;
11 import com.google.common.base.Preconditions;
12 import com.google.common.base.Verify;
13 import java.util.Collection;
14 import java.util.Map;
15 import java.util.concurrent.ConcurrentHashMap;
16 import java.util.concurrent.TimeUnit;
17 import java.util.concurrent.TimeoutException;
18 import javax.annotation.Nonnull;
19 import javax.annotation.Nullable;
20 import javax.annotation.concurrent.GuardedBy;
21 import org.opendaylight.controller.cluster.access.commands.NotLeaderException;
22 import org.opendaylight.controller.cluster.access.commands.OutOfSequenceEnvelopeException;
23 import org.opendaylight.controller.cluster.access.concepts.ClientIdentifier;
24 import org.opendaylight.controller.cluster.access.concepts.FailureEnvelope;
25 import org.opendaylight.controller.cluster.access.concepts.LocalHistoryIdentifier;
26 import org.opendaylight.controller.cluster.access.concepts.RequestException;
27 import org.opendaylight.controller.cluster.access.concepts.RequestFailure;
28 import org.opendaylight.controller.cluster.access.concepts.ResponseEnvelope;
29 import org.opendaylight.controller.cluster.access.concepts.RetiredGenerationException;
30 import org.opendaylight.controller.cluster.access.concepts.RuntimeRequestException;
31 import org.opendaylight.controller.cluster.access.concepts.SuccessEnvelope;
32 import org.opendaylight.controller.cluster.access.concepts.TransactionIdentifier;
33 import org.opendaylight.yangtools.concepts.Identifiable;
34 import org.opendaylight.yangtools.concepts.WritableIdentifier;
35 import org.slf4j.Logger;
36 import org.slf4j.LoggerFactory;
37 import scala.concurrent.duration.FiniteDuration;
38
39 /**
40  * A behavior, which handles messages sent to a {@link AbstractClientActor}.
41  *
42  * @author Robert Varga
43  */
44 @Beta
45 public abstract class ClientActorBehavior<T extends BackendInfo> extends
46         RecoveredClientActorBehavior<ClientActorContext> implements Identifiable<ClientIdentifier> {
47     /**
48      * Connection reconnect cohort, driven by this class.
49      */
50     @FunctionalInterface
51     protected interface ConnectionConnectCohort {
52         /**
53          * Finish the connection by replaying previous messages onto the new connection.
54          *
55          * @param enqueuedEntries Previously-enqueued entries
56          * @return A {@link ReconnectForwarder} to handle any straggler messages which arrive after this method returns.
57          */
58         @Nonnull ReconnectForwarder finishReconnect(@Nonnull Collection<ConnectionEntry> enqueuedEntries);
59     }
60
61     private static final Logger LOG = LoggerFactory.getLogger(ClientActorBehavior.class);
62     private static final FiniteDuration RESOLVE_RETRY_DURATION = FiniteDuration.apply(5, TimeUnit.SECONDS);
63
64     /**
65      * Map of connections to the backend. This map is concurrent to allow lookups, but given complex operations
66      * involved in connection transitions it is protected by a {@link InversibleLock}. Write-side of the lock is taken
67      * during connection transitions. Optimistic read-side of the lock is taken when new connections are introduced
68      * into the map.
69      *
70      * <p>
71      * The lock detects potential AB/BA deadlock scenarios and will force the reader side out by throwing
72      * a {@link InversibleLockException} -- which must be propagated up, releasing locks as it propagates. The initial
73      * entry point causing the conflicting lookup must then call {@link InversibleLockException#awaitResolution()}
74      * before retrying the operation.
75      */
76     // TODO: it should be possible to move these two into ClientActorContext
77     private final Map<Long, AbstractClientConnection<T>> connections = new ConcurrentHashMap<>();
78     private final InversibleLock connectionsLock = new InversibleLock();
79     private final BackendInfoResolver<T> resolver;
80
81     protected ClientActorBehavior(@Nonnull final ClientActorContext context,
82             @Nonnull final BackendInfoResolver<T> resolver) {
83         super(context);
84         this.resolver = Preconditions.checkNotNull(resolver);
85     }
86
87     @Override
88     @Nonnull
89     public final ClientIdentifier getIdentifier() {
90         return context().getIdentifier();
91     }
92
93     /**
94      * Get a connection to a shard.
95      *
96      * @param shard Shard cookie
97      * @return Connection to a shard
98      * @throws InversibleLockException if the shard is being reconnected
99      */
100     public final AbstractClientConnection<T> getConnection(final Long shard) {
101         while (true) {
102             final long stamp = connectionsLock.optimisticRead();
103             final AbstractClientConnection<T> conn = connections.computeIfAbsent(shard, this::createConnection);
104             if (connectionsLock.validate(stamp)) {
105                 // No write-lock in-between, return success
106                 return conn;
107             }
108         }
109     }
110
111     private AbstractClientConnection<T> getConnection(final ResponseEnvelope<?> response) {
112         // Always called from actor context: no locking required
113         return connections.get(extractCookie(response.getMessage().getTarget()));
114     }
115
116     @SuppressWarnings("unchecked")
117     @Override
118     final ClientActorBehavior<T> onReceiveCommand(final Object command) {
119         if (command instanceof InternalCommand) {
120             return ((InternalCommand<T>) command).execute(this);
121         }
122         if (command instanceof SuccessEnvelope) {
123             return onRequestSuccess((SuccessEnvelope) command);
124         }
125         if (command instanceof FailureEnvelope) {
126             return internalOnRequestFailure((FailureEnvelope) command);
127         }
128
129         return onCommand(command);
130     }
131
132     private static long extractCookie(final WritableIdentifier id) {
133         if (id instanceof TransactionIdentifier) {
134             return ((TransactionIdentifier) id).getHistoryId().getCookie();
135         } else if (id instanceof LocalHistoryIdentifier) {
136             return ((LocalHistoryIdentifier) id).getCookie();
137         } else {
138             throw new IllegalArgumentException("Unhandled identifier " + id);
139         }
140     }
141
142     private void onResponse(final ResponseEnvelope<?> response) {
143         final AbstractClientConnection<T> connection = getConnection(response);
144         if (connection != null) {
145             connection.receiveResponse(response);
146         } else {
147             LOG.info("{}: Ignoring unknown response {}", persistenceId(), response);
148         }
149     }
150
151     private ClientActorBehavior<T> onRequestSuccess(final SuccessEnvelope success) {
152         onResponse(success);
153         return this;
154     }
155
156     private ClientActorBehavior<T> onRequestFailure(final FailureEnvelope failure) {
157         onResponse(failure);
158         return this;
159     }
160
161     private ClientActorBehavior<T> internalOnRequestFailure(final FailureEnvelope command) {
162         final RequestFailure<?, ?> failure = command.getMessage();
163         final RequestException cause = failure.getCause();
164         if (cause instanceof RetiredGenerationException) {
165             LOG.error("{}: current generation {} has been superseded", persistenceId(), getIdentifier(), cause);
166             haltClient(cause);
167             poison(cause);
168             return null;
169         }
170         if (cause instanceof NotLeaderException) {
171             final AbstractClientConnection<T> conn = getConnection(command);
172             if (conn instanceof ReconnectingClientConnection) {
173                 // Already reconnecting, do not churn the logs
174                 return this;
175             } else if (conn != null) {
176                 LOG.info("{}: connection {} indicated no leadership, reconnecting it", persistenceId(), conn, cause);
177                 return conn.reconnect(this, cause);
178             }
179         }
180         if (cause instanceof OutOfSequenceEnvelopeException) {
181             final AbstractClientConnection<T> conn = getConnection(command);
182             if (conn instanceof ReconnectingClientConnection) {
183                 // Already reconnecting, do not churn the logs
184                 return this;
185             } else if (conn != null) {
186                 LOG.info("{}: connection {} indicated no sequencing mismatch on {} sequence {}, reconnecting it",
187                     persistenceId(), conn, failure.getTarget(), failure.getSequence(), cause);
188                 return conn.reconnect(this, cause);
189             }
190         }
191
192         return onRequestFailure(command);
193     }
194
195     private void poison(final RequestException cause) {
196         final long stamp = connectionsLock.writeLock();
197         try {
198             for (AbstractClientConnection<T> q : connections.values()) {
199                 q.poison(cause);
200             }
201
202             connections.clear();
203         } finally {
204             connectionsLock.unlockWrite(stamp);
205         }
206     }
207
208     /**
209      * Halt And Catch Fire. Halt processing on this client. Implementations need to ensure they initiate state flush
210      * procedures. No attempt to use this instance should be made after this method returns. Any such use may result
211      * in undefined behavior.
212      *
213      * @param cause Failure cause
214      */
215     protected abstract void haltClient(@Nonnull Throwable cause);
216
217     /**
218      * Override this method to handle any command which is not handled by the base behavior.
219      *
220      * @param command the command to process
221      * @return Next behavior to use, null if this actor should shut down.
222      */
223     @Nullable
224     protected abstract ClientActorBehavior<T> onCommand(@Nonnull Object command);
225
226     /**
227      * Override this method to provide a backend resolver instance.
228      *
229      * @return a backend resolver instance
230      */
231     protected final @Nonnull BackendInfoResolver<T> resolver() {
232         return resolver;
233     }
234
235     /**
236      * Callback invoked when a new connection has been established. Implementations are expected perform preparatory
237      * tasks before the previous connection is frozen.
238      *
239      * @param newConn New connection
240      * @return ConnectionConnectCohort which will be used to complete the process of bringing the connection up.
241      */
242     @GuardedBy("connectionsLock")
243     @Nonnull protected abstract ConnectionConnectCohort connectionUp(@Nonnull ConnectedClientConnection<T> newConn);
244
245     private void backendConnectFinished(final Long shard, final AbstractClientConnection<T> conn,
246             final T backend, final Throwable failure) {
247         if (failure != null) {
248             if (failure instanceof TimeoutException) {
249                 if (!conn.equals(connections.get(shard))) {
250                     // AbstractClientConnection will remove itself when it decides there is no point in continuing,
251                     // at which point we want to stop retrying
252                     LOG.info("{}: stopping resolution of shard {} on stale connection {}", persistenceId(), shard, conn,
253                         failure);
254                     return;
255                 }
256
257                 LOG.debug("{}: timed out resolving shard {}, scheduling retry in {}", persistenceId(), shard,
258                     RESOLVE_RETRY_DURATION, failure);
259                 context().executeInActor(b -> {
260                     resolveConnection(shard, conn);
261                     return b;
262                 }, RESOLVE_RETRY_DURATION);
263                 return;
264             }
265
266             LOG.error("{}: failed to resolve shard {}", persistenceId(), shard, failure);
267             final RequestException cause;
268             if (failure instanceof RequestException) {
269                 cause = (RequestException) failure;
270             } else {
271                 cause = new RuntimeRequestException("Failed to resolve shard " + shard, failure);
272             }
273
274             conn.poison(cause);
275             return;
276         }
277
278         LOG.info("{}: resolved shard {} to {}", persistenceId(), shard, backend);
279         final long stamp = connectionsLock.writeLock();
280         try {
281             // Create a new connected connection
282             final ConnectedClientConnection<T> newConn = new ConnectedClientConnection<>(conn.context(),
283                     conn.cookie(), backend);
284             LOG.info("{}: resolving connection {} to {}", persistenceId(), conn, newConn);
285
286             // Start reconnecting without the old connection lock held
287             final ConnectionConnectCohort cohort = Verify.verifyNotNull(connectionUp(newConn));
288
289             // Lock the old connection and get a reference to its entries
290             final Collection<ConnectionEntry> replayIterable = conn.startReplay();
291
292             // Finish the connection attempt
293             final ReconnectForwarder forwarder = Verify.verifyNotNull(cohort.finishReconnect(replayIterable));
294
295             // Install the forwarder, unlocking the old connection
296             conn.finishReplay(forwarder);
297
298             // Make sure new lookups pick up the new connection
299             connections.replace(shard, conn, newConn);
300             LOG.info("{}: replaced connection {} with {}", persistenceId(), conn, newConn);
301         } finally {
302             connectionsLock.unlockWrite(stamp);
303         }
304     }
305
306     void removeConnection(final AbstractClientConnection<?> conn) {
307         connections.remove(conn.cookie(), conn);
308         LOG.debug("{}: removed connection {}", persistenceId(), conn);
309     }
310
311     @SuppressWarnings("unchecked")
312     void reconnectConnection(final ConnectedClientConnection<?> oldConn,
313             final ReconnectingClientConnection<?> newConn) {
314         final ReconnectingClientConnection<T> conn = (ReconnectingClientConnection<T>)newConn;
315         LOG.info("{}: connection {} reconnecting as {}", persistenceId(), oldConn, newConn);
316
317         final boolean replaced = connections.replace(oldConn.cookie(), (AbstractClientConnection<T>)oldConn, conn);
318         if (!replaced) {
319             final AbstractClientConnection<T> existing = connections.get(oldConn.cookie());
320             LOG.warn("{}: old connection {} does not match existing {}, new connection {} in limbo", persistenceId(),
321                 oldConn, existing, newConn);
322         }
323
324         final Long shard = oldConn.cookie();
325         LOG.info("{}: refreshing backend for shard {}", persistenceId(), shard);
326         resolver().refreshBackendInfo(shard, conn.getBackendInfo().get()).whenComplete(
327             (backend, failure) -> context().executeInActor(behavior -> {
328                 backendConnectFinished(shard, conn, backend, failure);
329                 return behavior;
330             }));
331     }
332
333     private ConnectingClientConnection<T> createConnection(final Long shard) {
334         final ConnectingClientConnection<T> conn = new ConnectingClientConnection<>(context(), shard);
335         resolveConnection(shard, conn);
336         return conn;
337     }
338
339     private void resolveConnection(final Long shard, final AbstractClientConnection<T> conn) {
340         LOG.debug("{}: resolving shard {} connection {}", persistenceId(), shard, conn);
341         resolver().getBackendInfo(shard).whenComplete((backend, failure) -> context().executeInActor(behavior -> {
342             backendConnectFinished(shard, conn, backend, failure);
343             return behavior;
344         }));
345     }
346 }