From: Tomas Cere Date: Fri, 22 May 2020 11:11:04 +0000 (+0200) Subject: Do not bump follower term while it is isolated X-Git-Tag: v2.0.1~14 X-Git-Url: https://git.opendaylight.org/gerrit/gitweb?p=controller.git;a=commitdiff_plain;h=d3d04140ac3d7a2a90a3b953cc4ea27fca2bfc32;ds=sidebyside Do not bump follower term while it is isolated When a follower gets into a state that its isolated(ie cannot reach either peer) it starts bumping its term every election timeout. Once the cluster heals it would then disrupt the cluster because it has a higher term then the healthy part. We can avoid this by not triggering new elections on followers that cannot reach other peers while they are unreachable. JIRA: CONTROLLER-1935 Change-Id: I8ee9f333740637ba5569287d405540f374bbc4bc Signed-off-by: Tomas Cere --- diff --git a/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/behaviors/Follower.java b/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/behaviors/Follower.java index b642ee43a5..288ce32a64 100644 --- a/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/behaviors/Follower.java +++ b/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/behaviors/Follower.java @@ -18,6 +18,7 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Stopwatch; import java.io.IOException; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; import java.util.Optional; import java.util.Set; @@ -506,6 +507,10 @@ public class Follower extends AbstractRaftActorBehavior { if (isLeaderAvailabilityKnown() && lastLeaderMessageInterval < maxElectionTimeout) { log.debug("{}: Received ElectionTimeout but leader appears to be available", logName()); scheduleElection(electionDuration()); + } else if (isThisFollowerIsolated()) { + log.debug("{}: this follower is isolated. Do not switch to Candidate for now.", logName()); + setLeaderId(null); + scheduleElection(electionDuration()); } else { log.debug("{}: Received ElectionTimeout - switching to Candidate", logName()); return internalSwitchBehavior(RaftState.Candidate); @@ -575,6 +580,40 @@ public class Follower extends AbstractRaftActorBehavior { return false; } + private boolean isThisFollowerIsolated() { + final Optional maybeCluster = context.getCluster(); + if (!maybeCluster.isPresent()) { + return false; + } + + final Cluster cluster = maybeCluster.get(); + final Member selfMember = cluster.selfMember(); + + final CurrentClusterState state = cluster.state(); + final Set unreachable = state.getUnreachable(); + final Iterable members = state.getMembers(); + + log.debug("{}: Checking if this node is isolated in the cluster unreachable set {}," + + "all members {} self member: {}", logName(), unreachable, members, selfMember); + + // no unreachable peers means we cannot be isolated + if (unreachable.size() == 0) { + return false; + } + + final Set membersToCheck = new HashSet<>(); + members.forEach(membersToCheck::add); + + membersToCheck.removeAll(unreachable); + + // check if the only member not unreachable is us + if (membersToCheck.size() == 1 && membersToCheck.iterator().next().equals(selfMember)) { + return true; + } + + return false; + } + private void handleInstallSnapshot(final ActorRef sender, final InstallSnapshot installSnapshot) { log.debug("{}: handleInstallSnapshot: {}", logName(), installSnapshot);