Do not bump follower term while it is isolated 63/89863/3
authorTomas Cere <tomas.cere@pantheon.tech>
Fri, 22 May 2020 11:11:04 +0000 (13:11 +0200)
committerRobert Varga <nite@hq.sk>
Mon, 25 May 2020 09:20:47 +0000 (09:20 +0000)
When a follower becomes isolated (i.e. it cannot reach any of its peers),
it starts bumping its term on every election timeout. Once the cluster heals, it
would then disrupt the cluster because it has a higher term than the healthy part.
We can avoid this by not triggering new elections on followers that cannot
reach other peers while they are unreachable.

JIRA: CONTROLLER-1935
Change-Id: I8ee9f333740637ba5569287d405540f374bbc4bc
Signed-off-by: Tomas Cere <tomas.cere@pantheon.tech>
opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/behaviors/Follower.java

index b642ee43a563588f7a95c08e4b97495b94526186..288ce32a64ec21286adda9cf3672f7b53d1bf506 100644 (file)
@@ -18,6 +18,7 @@ import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Stopwatch;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Optional;
 import java.util.Set;
@@ -506,6 +507,10 @@ public class Follower extends AbstractRaftActorBehavior {
                 if (isLeaderAvailabilityKnown() && lastLeaderMessageInterval < maxElectionTimeout) {
                     log.debug("{}: Received ElectionTimeout but leader appears to be available", logName());
                     scheduleElection(electionDuration());
+                } else if (isThisFollowerIsolated()) {
+                    log.debug("{}: this follower is isolated. Do not switch to Candidate for now.", logName());
+                    setLeaderId(null);
+                    scheduleElection(electionDuration());
                 } else {
                     log.debug("{}: Received ElectionTimeout - switching to Candidate", logName());
                     return internalSwitchBehavior(RaftState.Candidate);
@@ -575,6 +580,40 @@ public class Follower extends AbstractRaftActorBehavior {
         return false;
     }
 
+    private boolean isThisFollowerIsolated() {
+        final Optional<Cluster> maybeCluster = context.getCluster();
+        if (!maybeCluster.isPresent()) {
+            return false;
+        }
+
+        final Cluster cluster = maybeCluster.get();
+        final Member selfMember = cluster.selfMember();
+
+        final CurrentClusterState state = cluster.state();
+        final Set<Member> unreachable = state.getUnreachable();
+        final Iterable<Member> members = state.getMembers();
+
+        log.debug("{}: Checking if this node is isolated in the cluster unreachable set {},"
+                        + "all members {} self member: {}", logName(), unreachable, members, selfMember);
+
+        // no unreachable peers means we cannot be isolated
+        if (unreachable.size() == 0) {
+            return false;
+        }
+
+        final Set<Member> membersToCheck = new HashSet<>();
+        members.forEach(membersToCheck::add);
+
+        membersToCheck.removeAll(unreachable);
+
+        // check if the only member not unreachable is us
+        if (membersToCheck.size() == 1 && membersToCheck.iterator().next().equals(selfMember)) {
+            return true;
+        }
+
+        return false;
+    }
+
     private void handleInstallSnapshot(final ActorRef sender, final InstallSnapshot installSnapshot) {
 
         log.debug("{}: handleInstallSnapshot: {}", logName(), installSnapshot);