BUG-5280: Correct reconnect retry logic
[controller.git] / opendaylight / md-sal / cds-access-client / src / main / java / org / opendaylight / controller / cluster / access / client / ClientActorBehavior.java
1 /*
2  * Copyright (c) 2016 Cisco Systems, Inc. and others.  All rights reserved.
3  *
4  * This program and the accompanying materials are made available under the
5  * terms of the Eclipse Public License v1.0 which accompanies this distribution,
6  * and is available at http://www.eclipse.org/legal/epl-v10.html
7  */
8 package org.opendaylight.controller.cluster.access.client;
9
10 import com.google.common.annotations.Beta;
11 import com.google.common.base.Preconditions;
12 import com.google.common.base.Verify;
13 import java.util.Map;
14 import java.util.concurrent.ConcurrentHashMap;
15 import java.util.concurrent.TimeUnit;
16 import java.util.concurrent.TimeoutException;
17 import javax.annotation.Nonnull;
18 import javax.annotation.Nullable;
19 import javax.annotation.concurrent.GuardedBy;
20 import org.opendaylight.controller.cluster.access.concepts.ClientIdentifier;
21 import org.opendaylight.controller.cluster.access.concepts.FailureEnvelope;
22 import org.opendaylight.controller.cluster.access.concepts.LocalHistoryIdentifier;
23 import org.opendaylight.controller.cluster.access.concepts.RequestException;
24 import org.opendaylight.controller.cluster.access.concepts.RequestFailure;
25 import org.opendaylight.controller.cluster.access.concepts.ResponseEnvelope;
26 import org.opendaylight.controller.cluster.access.concepts.RetiredGenerationException;
27 import org.opendaylight.controller.cluster.access.concepts.RuntimeRequestException;
28 import org.opendaylight.controller.cluster.access.concepts.SuccessEnvelope;
29 import org.opendaylight.controller.cluster.access.concepts.TransactionIdentifier;
30 import org.opendaylight.yangtools.concepts.Identifiable;
31 import org.opendaylight.yangtools.concepts.WritableIdentifier;
32 import org.slf4j.Logger;
33 import org.slf4j.LoggerFactory;
34 import scala.concurrent.duration.FiniteDuration;
35
36 /**
37  * A behavior, which handles messages sent to a {@link AbstractClientActor}.
38  *
39  * @author Robert Varga
40  */
41 @Beta
42 public abstract class ClientActorBehavior<T extends BackendInfo> extends
43         RecoveredClientActorBehavior<ClientActorContext> implements Identifiable<ClientIdentifier> {
    /**
     * Connection reconnect cohort, driven by this class. An instance is obtained from
     * {@link #connectionUp(ConnectedClientConnection)} once a backend has been resolved and is then asked to finish
     * the reconnect with the entries of the previous connection.
     */
    @FunctionalInterface
    protected interface ConnectionConnectCohort {
        /**
         * Finish the connection by replaying previous messages onto the new connection.
         *
         * @param enqueuedEntries Previously-enqueued entries
         * @return A {@link ReconnectForwarder} to handle any straggler messages which arrive after this method returns.
         */
        @Nonnull ReconnectForwarder finishReconnect(@Nonnull Iterable<ConnectionEntry> enqueuedEntries);
    }
57
    private static final Logger LOG = LoggerFactory.getLogger(ClientActorBehavior.class);
    // Delay applied before backend resolution is attempted again after it has timed out
    private static final FiniteDuration RESOLVE_RETRY_DURATION = FiniteDuration.apply(5, TimeUnit.SECONDS);

    /**
     * Map of connections to the backend. This map is concurrent to allow lookups, but given complex operations
     * involved in connection transitions it is protected by an {@link InversibleLock}. Write-side of the lock is taken
     * during connection transitions. Optimistic read-side of the lock is taken when new connections are introduced
     * into the map.
     *
     * <p>
     * The lock detects potential AB/BA deadlock scenarios and will force the reader side out by throwing
     * a {@link InversibleLockException} -- which must be propagated up, releasing locks as it propagates. The initial
     * entry point causing the conflicting lookup must then call {@link InversibleLockException#awaitResolution()}
     * before retrying the operation.
     */
    // TODO: it should be possible to move these two into ClientActorContext
    private final Map<Long, AbstractClientConnection<T>> connections = new ConcurrentHashMap<>();
    private final InversibleLock connectionsLock = new InversibleLock();
    // Resolver supplied at construction, used to look up and refresh backend information for a shard
    private final BackendInfoResolver<T> resolver;
77
    /**
     * Create a new behavior bound to the specified actor context.
     *
     * @param context actor context, passed on to the superclass constructor
     * @param resolver resolver used to look up backend information, must not be null
     * @throws NullPointerException if {@code resolver} is null
     */
    protected ClientActorBehavior(@Nonnull final ClientActorContext context,
            @Nonnull final BackendInfoResolver<T> resolver) {
        super(context);
        this.resolver = Preconditions.checkNotNull(resolver);
    }
83
    /**
     * Return the identifier of this client, as reported by the actor context.
     *
     * @return client identifier
     */
    @Override
    @Nonnull
    public final ClientIdentifier getIdentifier() {
        return context().getIdentifier();
    }
89
90     /**
91      * Get a connection to a shard.
92      *
93      * @param shard Shard cookie
94      * @return Connection to a shard
95      * @throws InversibleLockException if the shard is being reconnected
96      */
97     public final AbstractClientConnection<T> getConnection(final Long shard) {
98         while (true) {
99             final long stamp = connectionsLock.optimisticRead();
100             final AbstractClientConnection<T> conn = connections.computeIfAbsent(shard, this::createConnection);
101             if (connectionsLock.validate(stamp)) {
102                 // No write-lock in-between, return success
103                 return conn;
104             }
105         }
106     }
107
108     @SuppressWarnings("unchecked")
109     @Override
110     final ClientActorBehavior<T> onReceiveCommand(final Object command) {
111         if (command instanceof InternalCommand) {
112             return ((InternalCommand<T>) command).execute(this);
113         }
114         if (command instanceof SuccessEnvelope) {
115             return onRequestSuccess((SuccessEnvelope) command);
116         }
117         if (command instanceof FailureEnvelope) {
118             return internalOnRequestFailure((FailureEnvelope) command);
119         }
120
121         return onCommand(command);
122     }
123
124     private static long extractCookie(final WritableIdentifier id) {
125         if (id instanceof TransactionIdentifier) {
126             return ((TransactionIdentifier) id).getHistoryId().getCookie();
127         } else if (id instanceof LocalHistoryIdentifier) {
128             return ((LocalHistoryIdentifier) id).getCookie();
129         } else {
130             throw new IllegalArgumentException("Unhandled identifier " + id);
131         }
132     }
133
134     private void onResponse(final ResponseEnvelope<?> response) {
135         final long cookie = extractCookie(response.getMessage().getTarget());
136         final AbstractClientConnection<T> connection = connections.get(cookie);
137         if (connection != null) {
138             connection.receiveResponse(response);
139         } else {
140             LOG.info("{}: Ignoring unknown response {}", persistenceId(), response);
141         }
142     }
143
    /**
     * Handle a success envelope by passing it to {@link #onResponse(ResponseEnvelope)}.
     *
     * @param success success envelope
     * @return this behavior
     */
    private ClientActorBehavior<T> onRequestSuccess(final SuccessEnvelope success) {
        onResponse(success);
        return this;
    }
148
    /**
     * Handle a failure envelope by passing it to {@link #onResponse(ResponseEnvelope)}.
     *
     * @param failure failure envelope
     * @return this behavior
     */
    private ClientActorBehavior<T> onRequestFailure(final FailureEnvelope failure) {
        onResponse(failure);
        return this;
    }
153
154     private ClientActorBehavior<T> internalOnRequestFailure(final FailureEnvelope command) {
155         final RequestFailure<?, ?> failure = command.getMessage();
156         final RequestException cause = failure.getCause();
157         if (cause instanceof RetiredGenerationException) {
158             LOG.error("{}: current generation {} has been superseded", persistenceId(), getIdentifier(), cause);
159             haltClient(cause);
160             poison(cause);
161             return null;
162         }
163
164         return onRequestFailure(command);
165     }
166
167     private void poison(final RequestException cause) {
168         final long stamp = connectionsLock.writeLock();
169         try {
170             for (AbstractClientConnection<T> q : connections.values()) {
171                 q.poison(cause);
172             }
173
174             connections.clear();
175         } finally {
176             connectionsLock.unlockWrite(stamp);
177         }
178     }
179
    /**
     * Halt And Catch Fire. Halt processing on this client. Implementations need to ensure they initiate state flush
     * procedures. No attempt to use this instance should be made after this method returns. Any such use may result
     * in undefined behavior.
     *
     * <p>
     * Within this class the method is invoked when a request fails with a {@link RetiredGenerationException},
     * before the tracked connections are poisoned.
     *
     * @param cause Failure cause
     */
    protected abstract void haltClient(@Nonnull Throwable cause);
188
    /**
     * Override this method to handle any command which is not handled by the base behavior. It is invoked for
     * every received command which is not an {@link InternalCommand}, a {@link SuccessEnvelope} or
     * a {@link FailureEnvelope}.
     *
     * @param command the command to process
     * @return Next behavior to use, null if this actor should shut down.
     */
    @Nullable
    protected abstract ClientActorBehavior<T> onCommand(@Nonnull Object command);
197
    /**
     * Return the backend resolver instance which was supplied when this behavior was constructed. This method is
     * final and therefore cannot be overridden by subclasses.
     *
     * @return a backend resolver instance
     */
    protected final @Nonnull BackendInfoResolver<T> resolver() {
        return resolver;
    }
206
    /**
     * Callback invoked when a new connection has been established. Implementations are expected to perform
     * preparatory tasks before the previous connection is frozen. This class invokes the callback with the
     * write-side of the connections lock held, before replay of the previous connection is started.
     *
     * @param newConn New connection
     * @return ConnectionConnectCohort which will be used to complete the process of bringing the connection up.
     */
    @GuardedBy("connectionsLock")
    @Nonnull protected abstract ConnectionConnectCohort connectionUp(@Nonnull ConnectedClientConnection<T> newConn);
216
217     private void backendConnectFinished(final Long shard, final AbstractClientConnection<T> conn,
218             final T backend, final Throwable failure) {
219         if (failure != null) {
220             if (failure instanceof TimeoutException) {
221                 if (!conn.equals(connections.get(shard))) {
222                     // AbstractClientConnection will remove itself when it decides there is no point in continuing,
223                     // at which point we want to stop retrying
224                     LOG.info("{}: stopping resolution of shard {} on stale connection {}", persistenceId(), shard, conn,
225                         failure);
226                     return;
227                 }
228
229                 LOG.debug("{}: timed out resolving shard {}, scheduling retry in {}", persistenceId(), shard,
230                     RESOLVE_RETRY_DURATION, failure);
231                 context().executeInActor(b -> {
232                     resolveConnection(shard, conn);
233                     return b;
234                 }, RESOLVE_RETRY_DURATION);
235                 return;
236             }
237
238             LOG.error("{}: failed to resolve shard {}", persistenceId(), shard, failure);
239             final RequestException cause;
240             if (failure instanceof RequestException) {
241                 cause = (RequestException) failure;
242             } else {
243                 cause = new RuntimeRequestException("Failed to resolve shard " + shard, failure);
244             }
245
246             conn.poison(cause);
247             return;
248         }
249
250         LOG.debug("{}: resolved shard {} to {}", persistenceId(), shard, backend);
251         final long stamp = connectionsLock.writeLock();
252         try {
253             // Create a new connected connection
254             final ConnectedClientConnection<T> newConn = new ConnectedClientConnection<>(conn.context(),
255                     conn.cookie(), backend);
256             LOG.debug("{}: resolving connection {} to {}", persistenceId(), conn, newConn);
257
258             // Start reconnecting without the old connection lock held
259             final ConnectionConnectCohort cohort = Verify.verifyNotNull(connectionUp(newConn));
260
261             // Lock the old connection and get a reference to its entries
262             final Iterable<ConnectionEntry> replayIterable = conn.startReplay();
263
264             // Finish the connection attempt
265             final ReconnectForwarder forwarder = Verify.verifyNotNull(cohort.finishReconnect(replayIterable));
266
267             // Install the forwarder, unlocking the old connection
268             conn.finishReplay(forwarder);
269
270             // Make sure new lookups pick up the new connection
271             connections.replace(shard, conn, newConn);
272             LOG.debug("{}: replaced connection {} with {}", persistenceId(), conn, newConn);
273         } finally {
274             connectionsLock.unlockWrite(stamp);
275         }
276     }
277
278     void removeConnection(final AbstractClientConnection<?> conn) {
279         connections.remove(conn.cookie(), conn);
280         LOG.debug("{}: removed connection {}", persistenceId(), conn);
281     }
282
283     @SuppressWarnings("unchecked")
284     void reconnectConnection(final ConnectedClientConnection<?> oldConn,
285             final ReconnectingClientConnection<?> newConn) {
286         final ReconnectingClientConnection<T> conn = (ReconnectingClientConnection<T>)newConn;
287         connections.replace(oldConn.cookie(), (AbstractClientConnection<T>)oldConn, conn);
288         LOG.debug("{}: connection {} reconnecting as {}", persistenceId(), oldConn, newConn);
289
290         final Long shard = oldConn.cookie();
291         resolver().refreshBackendInfo(shard, conn.getBackendInfo().get()).whenComplete(
292             (backend, failure) -> context().executeInActor(behavior -> {
293                 backendConnectFinished(shard, conn, backend, failure);
294                 return behavior;
295             }));
296     }
297
298     private ConnectingClientConnection<T> createConnection(final Long shard) {
299         final ConnectingClientConnection<T> conn = new ConnectingClientConnection<>(context(), shard);
300         resolveConnection(shard, conn);
301         return conn;
302     }
303
304     private void resolveConnection(final Long shard, final AbstractClientConnection<T> conn) {
305         LOG.debug("{}: resolving shard {} connection {}", persistenceId(), shard, conn);
306         resolver().getBackendInfo(shard).whenComplete((backend, failure) -> context().executeInActor(behavior -> {
307             backendConnectFinished(shard, conn, backend, failure);
308             return behavior;
309         }));
310     }
311 }

©2013 OpenDaylight, A Linux Foundation Collaborative Project. All Rights Reserved.
OpenDaylight is a registered trademark of The OpenDaylight Project, Inc.
Linux Foundation and OpenDaylight are registered trademarks of the Linux Foundation.
Linux is a registered trademark of Linus Torvalds.