Restart downed nodes. 27/98927/7
authorTomas Cere <tomas.cere@pantheon.tech>
Wed, 8 Dec 2021 11:40:40 +0000 (12:40 +0100)
committerTomas Cere <tomas.cere@pantheon.tech>
Mon, 17 Jan 2022 12:18:57 +0000 (13:18 +0100)
Nodes can be downed by the split-brain resolver (SBR); for example, when a
node is isolated, SBR makes the decision to down the minority. We therefore
need to make sure downed nodes can come back up automatically, unless we
have another mechanism in place for bringing up downed nodes.

JIRA: CONTROLLER-2025
Change-Id: I23d3ca2cee471c51d0eadc6c426461aa6eef193d
Signed-off-by: Tomas Cere <tomas.cere@pantheon.tech>
opendaylight/md-sal/sal-clustering-commons/src/main/java/org/opendaylight/controller/cluster/common/actor/QuarantinedMonitorActor.java

index dfafb82..77dcba5 100644 (file)
@@ -11,6 +11,8 @@ package org.opendaylight.controller.cluster.common.actor;
 import akka.actor.Address;
 import akka.actor.Props;
 import akka.actor.UntypedAbstractActor;
 import akka.actor.Address;
 import akka.actor.Props;
 import akka.actor.UntypedAbstractActor;
+import akka.cluster.Cluster;
+import akka.cluster.ClusterEvent;
 import akka.japi.Effect;
 import akka.remote.AssociationErrorEvent;
 import akka.remote.RemotingLifecycleEvent;
 import akka.japi.Effect;
 import akka.remote.AssociationErrorEvent;
 import akka.remote.RemotingLifecycleEvent;
@@ -37,7 +39,7 @@ public class QuarantinedMonitorActor extends UntypedAbstractActor {
     private final Effect callback;
     private boolean quarantined;
 
     private final Effect callback;
     private boolean quarantined;
 
-    private Set<Address> addressSet = new HashSet<>();
+    private final Set<Address> addressSet = new HashSet<>();
     private int count = 0;
 
     protected QuarantinedMonitorActor(final Effect callback) {
     private int count = 0;
 
     protected QuarantinedMonitorActor(final Effect callback) {
@@ -46,6 +48,7 @@ public class QuarantinedMonitorActor extends UntypedAbstractActor {
         LOG.debug("Created QuarantinedMonitorActor");
 
         getContext().system().eventStream().subscribe(getSelf(), RemotingLifecycleEvent.class);
         LOG.debug("Created QuarantinedMonitorActor");
 
         getContext().system().eventStream().subscribe(getSelf(), RemotingLifecycleEvent.class);
+        getContext().system().eventStream().subscribe(getSelf(), ClusterEvent.MemberDowned.class);
     }
 
     @Override
     }
 
     @Override
@@ -71,10 +74,10 @@ public class QuarantinedMonitorActor extends UntypedAbstractActor {
             // execute the callback
             callback.apply();
         } else  if (message instanceof AssociationErrorEvent) {
             // execute the callback
             callback.apply();
         } else  if (message instanceof AssociationErrorEvent) {
-            String errorMessage = message.toString();
+            final String errorMessage = message.toString();
             LOG.trace("errorMessage:{}", errorMessage);
             if (errorMessage.contains("The remote system has a UID that has been quarantined")) {
             LOG.trace("errorMessage:{}", errorMessage);
             if (errorMessage.contains("The remote system has a UID that has been quarantined")) {
-                Address address = ((AssociationErrorEvent) message).getRemoteAddress();
+                final Address address = ((AssociationErrorEvent) message).getRemoteAddress();
                 addressSet.add(address);
                 count++;
                 LOG.trace("address:{} addressSet: {} count:{}", address, addressSet, count);
                 addressSet.add(address);
                 count++;
                 LOG.trace("address:{} addressSet: {} count:{}", address, addressSet, count);
@@ -92,6 +95,13 @@ public class QuarantinedMonitorActor extends UntypedAbstractActor {
                 count = 0;
                 addressSet.clear();
             }
                 count = 0;
                 addressSet.clear();
             }
+        } else if (message instanceof ClusterEvent.MemberDowned) {
+            final ClusterEvent.MemberDowned event = (ClusterEvent.MemberDowned) message;
+            if (Cluster.get(getContext().system()).selfMember().equals(event.member())) {
+                LOG.warn("This member has been downed, restarting");
+
+                callback.apply();
+            }
         }
     }
 
         }
     }