From 46fedf6c2d91ea96c453c059c3b2b7a6c16c370f Mon Sep 17 00:00:00 2001 From: Tomas Cere Date: Wed, 8 Dec 2021 12:40:40 +0100 Subject: [PATCH] Restart downed nodes. Nodes can be downed by the Split Brain Resolver (SBR); for example, when a node is isolated, SBR decides to down the minority. We therefore need to make sure downed nodes come back up automatically unless another mechanism is in place for bringing them up. JIRA: CONTROLLER-2025 Change-Id: I23d3ca2cee471c51d0eadc6c426461aa6eef193d Signed-off-by: Tomas Cere --- .../common/actor/QuarantinedMonitorActor.java | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/opendaylight/md-sal/sal-clustering-commons/src/main/java/org/opendaylight/controller/cluster/common/actor/QuarantinedMonitorActor.java b/opendaylight/md-sal/sal-clustering-commons/src/main/java/org/opendaylight/controller/cluster/common/actor/QuarantinedMonitorActor.java index dfafb82b61..77dcba564d 100644 --- a/opendaylight/md-sal/sal-clustering-commons/src/main/java/org/opendaylight/controller/cluster/common/actor/QuarantinedMonitorActor.java +++ b/opendaylight/md-sal/sal-clustering-commons/src/main/java/org/opendaylight/controller/cluster/common/actor/QuarantinedMonitorActor.java @@ -11,6 +11,8 @@ package org.opendaylight.controller.cluster.common.actor; import akka.actor.Address; import akka.actor.Props; import akka.actor.UntypedAbstractActor; +import akka.cluster.Cluster; +import akka.cluster.ClusterEvent; import akka.japi.Effect; import akka.remote.AssociationErrorEvent; import akka.remote.RemotingLifecycleEvent; @@ -37,7 +39,7 @@ public class QuarantinedMonitorActor extends UntypedAbstractActor { private final Effect callback; private boolean quarantined; - private Set<Address>
addressSet = new HashSet<>(); + private final Set<Address>
addressSet = new HashSet<>(); private int count = 0; protected QuarantinedMonitorActor(final Effect callback) { @@ -46,6 +48,7 @@ public class QuarantinedMonitorActor extends UntypedAbstractActor { LOG.debug("Created QuarantinedMonitorActor"); getContext().system().eventStream().subscribe(getSelf(), RemotingLifecycleEvent.class); + getContext().system().eventStream().subscribe(getSelf(), ClusterEvent.MemberDowned.class); } @Override @@ -71,10 +74,10 @@ public class QuarantinedMonitorActor extends UntypedAbstractActor { // execute the callback callback.apply(); } else if (message instanceof AssociationErrorEvent) { - String errorMessage = message.toString(); + final String errorMessage = message.toString(); LOG.trace("errorMessage:{}", errorMessage); if (errorMessage.contains("The remote system has a UID that has been quarantined")) { - Address address = ((AssociationErrorEvent) message).getRemoteAddress(); + final Address address = ((AssociationErrorEvent) message).getRemoteAddress(); addressSet.add(address); count++; LOG.trace("address:{} addressSet: {} count:{}", address, addressSet, count); @@ -92,6 +95,13 @@ public class QuarantinedMonitorActor extends UntypedAbstractActor { count = 0; addressSet.clear(); } + } else if (message instanceof ClusterEvent.MemberDowned) { + final ClusterEvent.MemberDowned event = (ClusterEvent.MemberDowned) message; + if (Cluster.get(getContext().system()).selfMember().equals(event.member())) { + LOG.warn("This member has been downed, restarting"); + + callback.apply(); + } } } -- 2.36.6