From 88e2974b8d391d6e91a6338b0a1b8dbf966a8a71 Mon Sep 17 00:00:00 2001 From: Tom Pantelis Date: Mon, 28 Aug 2017 15:44:31 -0400 Subject: [PATCH] Fix intermitent testFollowerResyncWith*LeaderRestart failure NonVotingFollowerIntegrationTest#testFollowerResyncWithOneMoreLeaderLogEntryAfterNonPersistentLeaderRestart fails intermittently: NonVotingFollowerIntegrationTest.testFollowerResyncWithOneMoreLeaderLogEntryAfterNonPersistentLeaderRestart:233 Did not receive message of type class org.opendaylight.controller.cluster.raft.base.messages.SnapshotComplete This seems to be a side-effect of https://git.opendaylight.org/gerrit/#/c/62255/ which changes the timing a bit such that an install snapshot doesn't occur on the follower which should happen in order to completely re-sycnc it with the leader - instead it ends up removing the stale out-of-sync entries and appending the new ones from the leader which gets the journal up-to-date but the stale entries had already been applied to the state which leaves the state out-of-sync with journal. I added an additional check in the follower to force the leader to install a snapshot if the first out-of-sync log entry index <= the lastAppliedIndex which means the entries to be removed have already been applied to the state. Change-Id: Ic3815a694a8531d9f7f42f19ad8978d52fc902b3 Signed-off-by: Tom Pantelis --- .../controller/cluster/raft/behaviors/Follower.java | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/behaviors/Follower.java b/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/behaviors/Follower.java index e1f440dace..4fa0e161db 100644 --- a/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/behaviors/Follower.java +++ b/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/behaviors/Follower.java @@ -253,10 +253,13 @@ public class Follower extends AbstractRaftActorBehavior { } if (!context.getRaftPolicy().applyModificationToStateBeforeConsensus()) { - log.info("{}: Removing entries from log starting at {}", logName(), matchEntry.getIndex()); + log.info("{}: Removing entries from log starting at {}, commitIndex: {}, lastApplied: {}", + logName(), matchEntry.getIndex(), context.getCommitIndex(), context.getLastApplied()); - // Entries do not match so remove all subsequent entries - if (!context.getReplicatedLog().removeFromAndPersist(matchEntry.getIndex())) { + // Entries do not match so remove all subsequent entries but only if the existing entries haven't + // been applied to the state yet. + if (matchEntry.getIndex() <= context.getLastApplied() + || !context.getReplicatedLog().removeFromAndPersist(matchEntry.getIndex())) { // Could not remove the entries - this means the matchEntry index must be in the // snapshot and not the log. In this case the prior entries are part of the state // so we must send back a reply to force a snapshot to completely re-sync the -- 2.36.6