From 15a67bd103c2dc32f28139a7295ac84143c20d0c Mon Sep 17 00:00:00 2001 From: Robert Varga Date: Tue, 30 May 2017 11:41:58 +0200 Subject: [PATCH] BUG-8494: Cap queue sleep time Inconsistency in transmit queue backpressure has caused an observed delay of over 44 minutes in CSIT. Thile this is an artefact of a bug, we should not delay application thread for large periods of time, as that can cause failures in the application (if it has other time-bound duties like BGP keepalive messages and similar). Restrict the sleep time to 5 seconds, emitting an INFO-level messages when we reach this level of sleeping. This should not typically not occur, as the backpressure should be kicking in much sooner, smooting out the delay curve. Change-Id: Ie5f148248caa71791bdda71ddd7e33e5733aa7f8 Signed-off-by: Robert Varga --- .../client/AbstractClientConnection.java | 31 +++++++++++++++++-- .../access/client/ClientActorBehavior.java | 5 ++- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/AbstractClientConnection.java b/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/AbstractClientConnection.java index 32becc040d..4425c43362 100644 --- a/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/AbstractClientConnection.java +++ b/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/AbstractClientConnection.java @@ -72,6 +72,10 @@ public abstract class AbstractClientConnection { // Emit a debug entry if we sleep for more that this amount private static final long DEBUG_DELAY_NANOS = TimeUnit.MILLISECONDS.toNanos(100); + // Upper bound on the time a thread is forced to sleep to keep queue size under control + private static final long MAX_DELAY_SECONDS = 5; + private static final long MAX_DELAY_NANOS = TimeUnit.SECONDS.toNanos(MAX_DELAY_SECONDS); + private final Lock lock = new ReentrantLock(); private final ClientActorContext context; @GuardedBy("lock") @@ -133,10 +137,17 @@ public abstract class AbstractClientConnection { */ public final void sendRequest(final Request request, final Consumer> callback) { final long now = currentTime(); - final long delay = enqueueEntry(new ConnectionEntry(request, callback, now), now); + long delay = enqueueEntry(new ConnectionEntry(request, callback, now), now); try { - if (delay >= DEBUG_DELAY_NANOS && LOG.isDebugEnabled()) { - LOG.debug("Sleeping for {}ms", TimeUnit.NANOSECONDS.toMillis(delay)); + if (delay >= DEBUG_DELAY_NANOS) { + if (delay > MAX_DELAY_NANOS) { + LOG.info("Capping {} throttle delay from {} to {} seconds", this, + TimeUnit.NANOSECONDS.toSeconds(delay), MAX_DELAY_SECONDS); + delay = MAX_DELAY_NANOS; + } + if (LOG.isDebugEnabled()) { + LOG.debug("Sleeping for {}ms", TimeUnit.NANOSECONDS.toMillis(delay)); + } } TimeUnit.NANOSECONDS.sleep(delay); } catch (InterruptedException e) { @@ -172,6 +183,20 @@ public abstract class AbstractClientConnection { @GuardedBy("lock") final void finishReplay(final ReconnectForwarder forwarder) { setForwarder(forwarder); + + /* + * The process of replaying all messages may have taken a significant chunk of time, depending on type + * of messages, queue depth and available processing power. In extreme situations this may have already + * exceeded BACKEND_ALIVE_TIMEOUT_NANOS, in which case we are running the risk of not making reasonable forward + * progress before we start a reconnect cycle. + * + * Note that the timer is armed after we have sent the first message, hence we should be seeing a response + * from the backend before we see a timeout, simply due to how the mailbox operates. + * + * At any rate, reset the timestamp once we complete reconnection (which an atomic transition from the + * perspective of outside world), as that makes it a bit easier to reason about timing of events. + */ + lastReceivedTicks = currentTime(); lock.unlock(); } diff --git a/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/ClientActorBehavior.java b/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/ClientActorBehavior.java index 9ba118ed6b..554ffe97c7 100644 --- a/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/ClientActorBehavior.java +++ b/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/ClientActorBehavior.java @@ -9,6 +9,7 @@ package org.opendaylight.controller.cluster.access.client; import com.google.common.annotations.Beta; import com.google.common.base.Preconditions; +import com.google.common.base.Stopwatch; import com.google.common.base.Verify; import java.util.Collection; import java.util.Map; @@ -278,6 +279,8 @@ public abstract class ClientActorBehavior extends LOG.info("{}: resolved shard {} to {}", persistenceId(), shard, backend); final long stamp = connectionsLock.writeLock(); try { + final Stopwatch sw = Stopwatch.createStarted(); + // Create a new connected connection final ConnectedClientConnection newConn = new ConnectedClientConnection<>(conn.context(), conn.cookie(), backend); @@ -301,7 +304,7 @@ public abstract class ClientActorBehavior extends LOG.warn("{}: old connection {} does not match existing {}, new connection {} in limbo", persistenceId(), conn, existing, newConn); } else { - LOG.info("{}: replaced connection {} with {}", persistenceId(), conn, newConn); + LOG.info("{}: replaced connection {} with {} in {}", persistenceId(), conn, newConn, sw); } } finally { connectionsLock.unlockWrite(stamp); -- 2.36.6