From 5a04e042052a3c13fa4677f236a7107a801c7421 Mon Sep 17 00:00:00 2001 From: Robert Varga Date: Mon, 31 Jul 2017 15:54:12 +0200 Subject: [PATCH] BUG-8898: do not invoke timeouts directly Request timeouts are occuring with the connection lock held, at which point the connection can be at the tail of a successor chain: oldestConnection -> olderConnection -> connection If the callback being invoked attempts to transmit an entry, we will end up attempting to lock the entire chain. This would not be a problem except that if there is a concurrent attempt to lock the entire chain it ends up holding the lock of oldestConnection and it is waiting for the lock on connection -- which will only be released once the callback finishes executing, but that in turn waits for oldestConnection to be unlocked -- a classic AB/BA deadlock. This patch alleviates the problem by deferring callback execution via executeInActor, i.e. the timeout will be delivered at as part of normal message processing. Change-Id: I237908cf214bcdfd477fe0212d09b207a0c2cdbf Signed-off-by: Robert Varga (cherry picked from commit 4367f456f3c7a30c8ee9c7bca738b3e120a4e1d1) --- .../access/client/AbstractClientConnection.java | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/AbstractClientConnection.java b/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/AbstractClientConnection.java index afee418fd6..c9be5be548 100644 --- a/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/AbstractClientConnection.java +++ b/opendaylight/md-sal/cds-access-client/src/main/java/org/opendaylight/controller/cluster/access/client/AbstractClientConnection.java @@ -403,9 +403,7 @@ public abstract class AbstractClientConnection { queue.remove(now); LOG.debug("{}: Connection {} timed out entry {}", context.persistenceId(), this, head); - final double time = beenOpen * 1.0 / 1_000_000_000; - head.complete(head.getRequest().toRequestFailure( - new RequestTimeoutException("Timed out after " + time + "seconds"))); + timeoutEntry(head, beenOpen); } LOG.debug("Connection {} timed out {} tasks", this, tasksTimedOut); @@ -416,6 +414,19 @@ public abstract class AbstractClientConnection { return Optional.empty(); } + private void timeoutEntry(final ConnectionEntry entry, final long beenOpen) { + // Timeouts needs to be re-scheduled on actor thread because we are holding the lock on the current queue, + // which may be the tail of a successor chain. This is a problem if the callback attempts to send a request + // because that will attempt to lock the chain from the start, potentially causing a deadlock if there is + // a concurrent attempt to transmit. + context.executeInActor(current -> { + final double time = beenOpen * 1.0 / 1_000_000_000; + entry.complete(entry.getRequest().toRequestFailure( + new RequestTimeoutException("Timed out after " + time + "seconds"))); + return current; + }); + } + final void poison(final RequestException cause) { lock.lock(); try { -- 2.36.6