Fix NetconfNodeHandler interactions 81/107281/1
authorRobert Varga <robert.varga@pantheon.tech>
Sat, 5 Aug 2023 23:08:28 +0000 (01:08 +0200)
committerRobert Varga <robert.varga@pantheon.tech>
Mon, 7 Aug 2023 08:50:32 +0000 (10:50 +0200)
There are a couple of failures in interacting with RemoteDeviceHandler
API (which is completely undocumented).

The API design (and interactions in implementations) assume that:
- onDeviceConnected()/onDeviceDisconnected() manage a reconnecting
  process -- i.e. when 'disconnected' it is assumed the device might
  come back again
- onDeviceFailed() is the hard alternative, indicating the node is node
  coming back

This is then picked up and routed through NetconfTopologyDeviceSalFacade
to NetconfDeviceTopologyAdapter, which makes assumptions and propagates
them to the datastore.

JIRA: NETCONF-1097
Change-Id: Ib148a617b52e775261212aa3a809d1ab450d7d13
Signed-off-by: Ivan Hrasko <ivan.hrasko@pantheon.tech>
Signed-off-by: Robert Varga <robert.varga@pantheon.tech>
(cherry picked from commit 9db827875364d13611ad3b00f24e42ec259417f4)

apps/netconf-topology/src/main/java/org/opendaylight/netconf/topology/spi/ConnectGivenUpException.java [new file with mode: 0644]
apps/netconf-topology/src/main/java/org/opendaylight/netconf/topology/spi/NetconfNodeHandler.java
apps/netconf-topology/src/test/java/org/opendaylight/netconf/topology/spi/NetconfNodeHandlerTest.java

diff --git a/apps/netconf-topology/src/main/java/org/opendaylight/netconf/topology/spi/ConnectGivenUpException.java b/apps/netconf-topology/src/main/java/org/opendaylight/netconf/topology/spi/ConnectGivenUpException.java
new file mode 100644 (file)
index 0000000..d8f5e20
--- /dev/null
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2023 PANTHEON.tech, s.r.o. and others.  All rights reserved.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License v1.0 which accompanies this distribution,
+ * and is available at http://www.eclipse.org/legal/epl-v10.html
+ */
+package org.opendaylight.netconf.topology.spi;
+
+import static java.util.Objects.requireNonNull;
+
+import org.opendaylight.netconf.client.mdsal.api.RemoteDeviceHandler;
+
+/**
+ * Reported to {@link RemoteDeviceHandler#onDeviceFailed(Throwable)} downstream of {@link NetconfNodeHandler}. Indicates
+ * the corresponding NETCONF topology node will never become connected.
+ */
+public final class ConnectGivenUpException extends Exception {
+    @java.io.Serial
+    private static final long serialVersionUID = 1L;
+
+    public ConnectGivenUpException(final String message) {
+        super(requireNonNull(message));
+    }
+}
index 204681a41cc79df05865511258a8a6c99138653f..47f980c66039ce8fa1c47e8e1077254aab9e1039 100644 (file)
@@ -159,8 +159,6 @@ public final class NetconfNodeHandler extends AbstractRegistration implements Re
     }
 
     private void connectComplete(final Future<?> future) {
-        final Throwable cause;
-
         // Locked manipulation of internal state
         synchronized (this) {
             // A quick sanity check
@@ -170,7 +168,7 @@ public final class NetconfNodeHandler extends AbstractRegistration implements Re
             }
 
             currentTask = null;
-            cause = future.cause();
+            final var cause = future.cause();
             if (cause == null || cause instanceof CancellationException) {
                 // Success or cancellation, nothing else to do.
                 // In case of success the rest of the setup is driven by RemoteDeviceHandler callbacks
@@ -181,7 +179,7 @@ public final class NetconfNodeHandler extends AbstractRegistration implements Re
         }
 
         // We are invoking callbacks, do not hold locks
-        onDeviceFailed(cause);
+        reconnectOrFail();
     }
 
     @Override
@@ -208,14 +206,14 @@ public final class NetconfNodeHandler extends AbstractRegistration implements Re
     @Override
     public void onDeviceDisconnected() {
         delegate.onDeviceDisconnected();
-        scheduleReconnect();
+        reconnectOrFail();
     }
 
     @Override
     public void onDeviceFailed(final Throwable throwable) {
+        // We have not reported onDeviceConnected(), so from the view of delete we are still connecting
         LOG.debug("Connection attempt failed", throwable);
-        delegate.onDeviceFailed(throwable);
-        scheduleReconnect();
+        reconnectOrFail();
     }
 
     @Override
@@ -223,9 +221,16 @@ public final class NetconfNodeHandler extends AbstractRegistration implements Re
         delegate.onNotification(domNotification);
     }
 
-    private synchronized void scheduleReconnect() {
+    private void reconnectOrFail() {
+        final var ex = scheduleReconnect();
+        if (ex != null) {
+            delegate.onDeviceFailed(ex);
+        }
+    }
+
+    private synchronized Exception scheduleReconnect() {
         if (isClosed()) {
-            return;
+            return null;
         }
 
         final long delayMillis;
@@ -233,7 +238,7 @@ public final class NetconfNodeHandler extends AbstractRegistration implements Re
         // We have exceeded the number of connection attempts
         if (maxAttempts > 0 && attempts >= maxAttempts) {
             LOG.info("Failed to connect {} after {} attempts, not attempting", deviceId, attempts);
-            return;
+            return new ConnectGivenUpException("Given up connecting " + deviceId + " after " + attempts + " attempts");
         }
 
         // First connection attempt gets initialized to minimum sleep, each subsequent is exponentially backed off
@@ -250,14 +255,12 @@ public final class NetconfNodeHandler extends AbstractRegistration implements Re
         lastSleep = delayMillis;
         LOG.debug("Retrying {} connection attempt {} after {} milliseconds", deviceId, attempts, delayMillis);
 
-        // If we are not sleeping at all, return an already-succeeded future
-        if (delayMillis == 0) {
-            lockedConnect();
-            return;
-        }
-
-        // Schedule a task for the right time. It will also clear the flag.
+        // Schedule a task for the right time. We always go through the executor to eliminate the special case of
+        // immediate reconnect. While we could check and got to lockedConnect(), it makes for a rare special case.
+        // That special case makes for more code paths to test and introduces additional uncertainty as to whether
+        // the attempt was executed on on this thread or not.
         currentTask = eventExecutor.schedule(this::reconnect, delayMillis, TimeUnit.MILLISECONDS);
+        return null;
     }
 
     private synchronized void reconnect() {
index d83137ddc6cfe9434972c8250a87502e9e719024..ed319dc5e71bfe7ee5a604c180d830084afa769b 100644 (file)
@@ -7,6 +7,8 @@
  */
 package org.opendaylight.netconf.topology.spi;
 
+import static org.hamcrest.CoreMatchers.instanceOf;
+import static org.hamcrest.MatcherAssert.assertThat;
 import static org.junit.Assert.assertEquals;
 import static org.mockito.ArgumentMatchers.any;
 import static org.mockito.ArgumentMatchers.eq;
@@ -122,7 +124,7 @@ public class NetconfNodeHandlerTest {
     }
 
     @Before
-    public void setUp() {
+    public void before() {
         // Instantiate the handler
         handler = new NetconfNodeHandler(clientDispatcher, eventExecutor, keepaliveExecutor, BASE_SCHEMAS,
             schemaManager, processingExecutor,
@@ -158,6 +160,24 @@ public class NetconfNodeHandlerTest {
         assertEquals(0, handler.attempts());
     }
 
+    @Test
+    public void failedSchemaCausesReconnect() {
+        assertSuccessfulConnect();
+        assertEquals(1, handler.attempts());
+
+        // Note: this will count as a second attempt
+        doReturn(scheduleFuture).when(eventExecutor).schedule(scheduleCaptor.capture(), eq(150L),
+            eq(TimeUnit.MILLISECONDS));
+        handler.onDeviceFailed(new AssertionError("schema failure"));
+
+        assertEquals(2, handler.attempts());
+
+        // and when we run the task, we get a clientDispatcher invocation, but attempts are still the same
+        scheduleCaptor.getValue().run();
+        verify(clientDispatcher, times(2)).createClient(any());
+        assertEquals(2, handler.attempts());
+    }
+
     @Test
     public void downAfterUpCausesReconnect() {
         // Let's borrow common bits
@@ -185,12 +205,9 @@ public class NetconfNodeHandlerTest {
         handler.connect();
         assertEquals(1, handler.attempts());
 
-        // FIXME: NETCONF-1097 remove this stubbing
-        final var firstFailure = new AssertionError("first");
-        doNothing().when(delegate).onDeviceFailed(firstFailure);
         doReturn(scheduleFuture).when(eventExecutor).schedule(scheduleCaptor.capture(), eq(150L),
             eq(TimeUnit.MILLISECONDS));
-        firstPromise.setFailure(firstFailure);
+        firstPromise.setFailure(new AssertionError("first"));
 
         assertEquals(2, handler.attempts());
 
@@ -200,9 +217,10 @@ public class NetconfNodeHandlerTest {
         assertEquals(2, handler.attempts());
 
         // now report the second failure
-        final var secondFailure = new AssertionError("second");
-        doNothing().when(delegate).onDeviceFailed(secondFailure);
-        secondPromise.setFailure(secondFailure);
+        final var throwableCaptor = ArgumentCaptor.forClass(Throwable.class);
+        doNothing().when(delegate).onDeviceFailed(throwableCaptor.capture());
+        secondPromise.setFailure(new AssertionError("second"));
+        assertThat(throwableCaptor.getValue(), instanceOf(ConnectGivenUpException.class));
 
         // but nothing else happens
         assertEquals(2, handler.attempts());