From 23b10ec4ddfdd9348c2abe7dbcfbed3b49db3dc6 Mon Sep 17 00:00:00 2001 From: Tom Pantelis Date: Fri, 3 Jun 2016 16:04:28 -0400 Subject: [PATCH] Remove snapshot after startup and fix related bug Fixed an issue in the follower out-of-sync integrity checking where is needs to take into account that the previous index may be in the snapshot. A similar issue was seen with other inegrity checks. These issues were indirectly related to the snapshot after startup that was introduced in Be. I think this snapshot is unsafe b/c the replicatedToAllIndex hasn't been determined yet which I think may cause other issues with the trimming after snapshot completion, as the logic takes replicatedToAllIndex into account. And there may be other lurking bugs. I thinks it's safer to let the normal snapshot logic handle it. The reason for the snapshot after startup was to avoid having to recover the same journal entries again on restart that were just recovered. However in reality, in production, servers aren't commonly restarted and typically go weeks/months in between restarts. By the time of the next restart there would likely have been another snapshot and an arbitrary amount of new journal entries to recover so it really doesn't add much value. Change-Id: I4f8197911f9258804c470b3608f669fca707a425 Signed-off-by: Tom Pantelis --- .../controller/cluster/raft/RaftActor.java | 7 ---- .../cluster/raft/behaviors/Follower.java | 8 ++-- ...ftActorServerConfigurationSupportTest.java | 18 +++------ .../cluster/raft/RaftActorTest.java | 37 ------------------- ...eplicationAndSnapshotsIntegrationTest.java | 6 +-- 5 files changed, 11 insertions(+), 65 deletions(-) diff --git a/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/RaftActor.java b/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/RaftActor.java index 933fde98b9..602a76ba42 100644 --- a/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/RaftActor.java +++ b/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/RaftActor.java @@ -170,13 +170,6 @@ public abstract class RaftActor extends AbstractUntypedPersistentActor { initializeBehavior(); raftRecovery = null; - - if (context.getReplicatedLog().size() > 0) { - self().tell(new InitiateCaptureSnapshot(), self()); - LOG.info("{}: Snapshot capture initiated after recovery", persistenceId()); - } else { - LOG.info("{}: Snapshot capture NOT initiated after recovery, journal empty", persistenceId()); - } } } diff --git a/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/behaviors/Follower.java b/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/behaviors/Follower.java index da10f41ab3..a9d1b8233e 100644 --- a/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/behaviors/Follower.java +++ b/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/behaviors/Follower.java @@ -95,7 +95,7 @@ public class Follower extends AbstractRaftActorBehavior { } private boolean isLogEntryPresent(long index){ - if(index == context.getReplicatedLog().getSnapshotIndex()){ + if(context.getReplicatedLog().isInSnapshot(index)) { return true; } @@ -312,8 +312,7 @@ public class Follower extends AbstractRaftActorBehavior { logName(), prevLogTerm, appendEntries.getPrevLogTerm()); } else if(appendEntries.getPrevLogIndex() == -1 && appendEntries.getPrevLogTerm() == -1 && appendEntries.getReplicatedToAllIndex() != -1 - && !isLogEntryPresent(appendEntries.getReplicatedToAllIndex()) - && !context.getReplicatedLog().isInSnapshot(appendEntries.getReplicatedToAllIndex())) { + && !isLogEntryPresent(appendEntries.getReplicatedToAllIndex())) { // This append entry comes from a leader who has it's log aggressively trimmed and so does not have // the previous entry in it's in-memory journal @@ -322,8 +321,7 @@ public class Follower extends AbstractRaftActorBehavior { logName(), appendEntries.getReplicatedToAllIndex()); } else if(appendEntries.getPrevLogIndex() == -1 && appendEntries.getPrevLogTerm() == -1 && appendEntries.getReplicatedToAllIndex() != -1 && numLogEntries > 0 - && !isLogEntryPresent(appendEntries.getEntries().get(0).getIndex() - 1) - && !context.getReplicatedLog().isInSnapshot(appendEntries.getEntries().get(0).getIndex() - 1)) { + && !isLogEntryPresent(appendEntries.getEntries().get(0).getIndex() - 1)) { LOG.debug( "{}: Cannot append entries because the calculated previousIndex {} was not found in the in-memory journal", logName(), appendEntries.getEntries().get(0).getIndex() - 1); diff --git a/opendaylight/md-sal/sal-akka-raft/src/test/java/org/opendaylight/controller/cluster/raft/RaftActorServerConfigurationSupportTest.java b/opendaylight/md-sal/sal-akka-raft/src/test/java/org/opendaylight/controller/cluster/raft/RaftActorServerConfigurationSupportTest.java index 1a43dfe829..816e5b2170 100644 --- a/opendaylight/md-sal/sal-akka-raft/src/test/java/org/opendaylight/controller/cluster/raft/RaftActorServerConfigurationSupportTest.java +++ b/opendaylight/md-sal/sal-akka-raft/src/test/java/org/opendaylight/controller/cluster/raft/RaftActorServerConfigurationSupportTest.java @@ -41,7 +41,6 @@ import org.opendaylight.controller.cluster.raft.base.messages.ApplyState; import org.opendaylight.controller.cluster.raft.base.messages.CaptureSnapshotReply; import org.opendaylight.controller.cluster.raft.base.messages.ElectionTimeout; import org.opendaylight.controller.cluster.raft.base.messages.InitiateCaptureSnapshot; -import org.opendaylight.controller.cluster.raft.base.messages.SnapshotComplete; import org.opendaylight.controller.cluster.raft.base.messages.UpdateElectionTerm; import org.opendaylight.controller.cluster.raft.behaviors.AbstractLeader; import org.opendaylight.controller.cluster.raft.behaviors.Follower; @@ -1140,8 +1139,8 @@ public class RaftActorServerConfigurationSupportTest extends AbstractActorTest { PERSISTENT, node2Collector).withDispatcher(Dispatchers.DefaultDispatcherId()), node2ID); CollectingMockRaftActor node2RaftActor = node2RaftActorRef.underlyingActor(); - // Wait for snapshot after recovery - MessageCollectorActor.expectFirstMatching(node1Collector, SnapshotComplete.class); + node1RaftActor.waitForInitializeBehaviorComplete(); + node2RaftActor.waitForInitializeBehaviorComplete(); // Verify the intended server config was loaded and applied. verifyServerConfigurationPayloadEntry(node1RaftActor.getRaftActorContext().getReplicatedLog(), @@ -1151,7 +1150,9 @@ public class RaftActorServerConfigurationSupportTest extends AbstractActorTest { assertEquals("getRaftState", RaftState.Follower, node1RaftActor.getRaftState()); assertEquals("getLeaderId", null, node1RaftActor.getLeaderId()); - MessageCollectorActor.expectFirstMatching(node2Collector, SnapshotComplete.class); + verifyServerConfigurationPayloadEntry(node2RaftActor.getRaftActorContext().getReplicatedLog(), + nonVotingServer(node1ID), nonVotingServer(node2ID), votingServer("downNode1"), + votingServer("downNode2")); assertEquals("isVotingMember", false, node2RaftActor.getRaftActorContext().isVotingMember()); // For the test, we send a ChangeServersVotingStatus message to node1 to flip the voting states for @@ -1236,9 +1237,6 @@ public class RaftActorServerConfigurationSupportTest extends AbstractActorTest { PERSISTENT, node2Collector).withDispatcher(Dispatchers.DefaultDispatcherId()), node2ID); CollectingMockRaftActor node2RaftActor = node2RaftActorRef.underlyingActor(); - // Wait for snapshot after recovery - MessageCollectorActor.expectFirstMatching(node1Collector, SnapshotComplete.class); - // Send a ChangeServersVotingStatus message to node1 to change mode1 to voting. This should cause // node1 to try to elect itself as leader in order to apply the new server config. But we'll drop // RequestVote messages in node2 which should cause node1 to time out and revert back to the previous @@ -1301,9 +1299,6 @@ public class RaftActorServerConfigurationSupportTest extends AbstractActorTest { PERSISTENT, node2Collector).withDispatcher(Dispatchers.DefaultDispatcherId()), node2ID); CollectingMockRaftActor node2RaftActor = node2RaftActorRef.underlyingActor(); - // Wait for snapshot after recovery - MessageCollectorActor.expectFirstMatching(node1Collector, SnapshotComplete.class); - // Send a ChangeServersVotingStatus message to node1 to change mode1 to voting. This should cause // node1 to try to elect itself as leader in order to apply the new server config. However node1's log // is behind node2's so node2 should not grant node1's vote. This should cause node1 to time out and @@ -1368,9 +1363,6 @@ public class RaftActorServerConfigurationSupportTest extends AbstractActorTest { PERSISTENT, node2Collector).withDispatcher(Dispatchers.DefaultDispatcherId()), node2ID); CollectingMockRaftActor node2RaftActor = node2RaftActorRef.underlyingActor(); - // Wait for snapshot after recovery - MessageCollectorActor.expectFirstMatching(node1Collector, SnapshotComplete.class); - // Send a ChangeServersVotingStatus message to node1 to change node1 to voting. This should cause // node1 to try to elect itself as leader in order to apply the new server config. But we'll drop // RequestVote messages in node2 and make it the leader so node1 should forward the server change diff --git a/opendaylight/md-sal/sal-akka-raft/src/test/java/org/opendaylight/controller/cluster/raft/RaftActorTest.java b/opendaylight/md-sal/sal-akka-raft/src/test/java/org/opendaylight/controller/cluster/raft/RaftActorTest.java index 48b555faff..6565c59b5f 100644 --- a/opendaylight/md-sal/sal-akka-raft/src/test/java/org/opendaylight/controller/cluster/raft/RaftActorTest.java +++ b/opendaylight/md-sal/sal-akka-raft/src/test/java/org/opendaylight/controller/cluster/raft/RaftActorTest.java @@ -960,40 +960,6 @@ public class RaftActorTest extends AbstractActorTest { }}; } - @Test - public void testRaftActorOnRecoverySnapshot() throws Exception { - TEST_LOG.info("testRaftActorOnRecoverySnapshot"); - - new JavaTestKit(getSystem()) {{ - String persistenceId = factory.generateActorId("follower-"); - - DefaultConfigParamsImpl config = new DefaultConfigParamsImpl(); - - // Set the heartbeat interval high to essentially disable election otherwise the test - // may fail if the actor is switched to Leader - config.setHeartBeatInterval(new FiniteDuration(1, TimeUnit.DAYS)); - - ImmutableMap peerAddresses = ImmutableMap.builder().put("member1", "address").build(); - - // Create mock ReplicatedLogEntry - ReplicatedLogEntry replLogEntry = new MockRaftActorContext.MockReplicatedLogEntry(1,1, - new MockRaftActorContext.MockPayload("F", 1)); - - InMemoryJournal.addEntry(persistenceId, 1, replLogEntry); - - TestActorRef ref = factory.createTestActor( - MockRaftActor.props(persistenceId, peerAddresses, config)); - - MockRaftActor mockRaftActor = ref.underlyingActor(); - - mockRaftActor.waitForRecoveryComplete(); - - mockRaftActor.waitForInitializeBehaviorComplete(); - - verify(mockRaftActor.snapshotCohortDelegate, timeout(5000)).createSnapshot(any(ActorRef.class)); - }}; - } - @Test public void testSwitchBehavior(){ String persistenceId = factory.generateActorId("leader-"); @@ -1131,9 +1097,6 @@ public class RaftActorTest extends AbstractActorTest { mockRaftActor.waitForRecoveryComplete(); - // Wait for snapshot after recovery - verify(mockRaftActor.snapshotCohortDelegate, timeout(5000)).createSnapshot(any(ActorRef.class)); - mockRaftActor.snapshotCohortDelegate = mock(RaftActorSnapshotCohort.class); raftActorRef.tell(GetSnapshot.INSTANCE, kit.getRef()); diff --git a/opendaylight/md-sal/sal-akka-raft/src/test/java/org/opendaylight/controller/cluster/raft/ReplicationAndSnapshotsIntegrationTest.java b/opendaylight/md-sal/sal-akka-raft/src/test/java/org/opendaylight/controller/cluster/raft/ReplicationAndSnapshotsIntegrationTest.java index e48234c0a3..c658ef0804 100644 --- a/opendaylight/md-sal/sal-akka-raft/src/test/java/org/opendaylight/controller/cluster/raft/ReplicationAndSnapshotsIntegrationTest.java +++ b/opendaylight/md-sal/sal-akka-raft/src/test/java/org/opendaylight/controller/cluster/raft/ReplicationAndSnapshotsIntegrationTest.java @@ -188,9 +188,9 @@ public class ReplicationAndSnapshotsIntegrationTest extends AbstractRaftActorInt // Verify the persisted snapshot in the leader. This should reflect the advanced snapshot index as // the last applied log entry (2) even though the leader hasn't yet advanced its cached snapshot index. List persistedSnapshots = InMemorySnapshotStore.getSnapshots(leaderId, Snapshot.class); - assertEquals("Persisted snapshots size", 2, persistedSnapshots.size()); - verifySnapshot("Persisted", persistedSnapshots.get(1), initialTerm, 2, currentTerm, 3); - List unAppliedEntry = persistedSnapshots.get(1).getUnAppliedEntries(); + assertEquals("Persisted snapshots size", 1, persistedSnapshots.size()); + verifySnapshot("Persisted", persistedSnapshots.get(0), initialTerm, 2, currentTerm, 3); + List unAppliedEntry = persistedSnapshots.get(0).getUnAppliedEntries(); assertEquals("Persisted Snapshot getUnAppliedEntries size", 1, unAppliedEntry.size()); verifyReplicatedLogEntry(unAppliedEntry.get(0), currentTerm, 3, payload3); -- 2.36.6