Remove snapshot after startup and fix related bug 53/39853/2
authorTom Pantelis <tpanteli@brocade.com>
Fri, 3 Jun 2016 20:04:28 +0000 (16:04 -0400)
committerTom Pantelis <tpanteli@brocade.com>
Sat, 4 Jun 2016 13:08:47 +0000 (09:08 -0400)
Fixed an issue in the follower out-of-sync integrity checking where is
needs to take into account that the previous index may be in the
snapshot. A similar issue was seen with other inegrity checks.

These issues were indirectly related to the snapshot after startup that
was introduced in Be. I think this snapshot is unsafe b/c the
replicatedToAllIndex hasn't been determined yet which I think may cause
other issues with the trimming after snapshot completion, as the logic
takes replicatedToAllIndex into account. And there may be other lurking
bugs. I thinks it's safer to let the normal snapshot logic handle it.

The reason for the snapshot after startup was to avoid having to recover
the same journal entries again on restart that were just recovered. However
in reality, in production, servers aren't commonly restarted and
typically go weeks/months in between restarts. By the time of the next
restart there would likely have been another snapshot and an arbitrary amount
of  new journal entries to recover so it really doesn't add much value.

Change-Id: Ie14148e5dbde3e93deafc5943278aea8c9bb3e75
Signed-off-by: Tom Pantelis <tpanteli@brocade.com>
opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/RaftActor.java
opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/behaviors/Follower.java
opendaylight/md-sal/sal-akka-raft/src/test/java/org/opendaylight/controller/cluster/raft/RaftActorTest.java
opendaylight/md-sal/sal-akka-raft/src/test/java/org/opendaylight/controller/cluster/raft/ReplicationAndSnapshotsIntegrationTest.java

index f85fa9bfe347bf21688ded7cdcaf5655a4570370..dd986fbbacfa15a297c20e2ae418059c5722cad0 100644 (file)
@@ -182,13 +182,6 @@ public abstract class RaftActor extends AbstractUntypedPersistentActor {
             initializeBehavior();
 
             raftRecovery = null;
-
-            if (context.getReplicatedLog().size() > 0) {
-                self().tell(new InitiateCaptureSnapshot(), self());
-                LOG.info("{}: Snapshot capture initiated after recovery", persistenceId());
-            } else {
-                LOG.info("{}: Snapshot capture NOT initiated after recovery, journal empty", persistenceId());
-            }
         }
     }
 
index 82c6bafedd19b7a4337f802539ade9211b0280a0..1290abdff3e4ef0840a5081fe00be633a41b707c 100644 (file)
@@ -72,7 +72,7 @@ public class Follower extends AbstractRaftActorBehavior {
     }
 
     private boolean isLogEntryPresent(long index){
-        if(index == context.getReplicatedLog().getSnapshotIndex()){
+        if(context.getReplicatedLog().isInSnapshot(index)) {
             return true;
         }
 
@@ -290,8 +290,7 @@ public class Follower extends AbstractRaftActorBehavior {
                     logName(), prevLogTerm, appendEntries.getPrevLogTerm());
         } else if(appendEntries.getPrevLogIndex() == -1 && appendEntries.getPrevLogTerm() == -1
                 && appendEntries.getReplicatedToAllIndex() != -1
-                && !isLogEntryPresent(appendEntries.getReplicatedToAllIndex())
-                && !context.getReplicatedLog().isInSnapshot(appendEntries.getReplicatedToAllIndex())) {
+                && !isLogEntryPresent(appendEntries.getReplicatedToAllIndex())) {
             // This append entry comes from a leader who has it's log aggressively trimmed and so does not have
             // the previous entry in it's in-memory journal
 
@@ -300,8 +299,7 @@ public class Follower extends AbstractRaftActorBehavior {
                     logName(), appendEntries.getReplicatedToAllIndex());
         } else if(appendEntries.getPrevLogIndex() == -1 && appendEntries.getPrevLogTerm() == -1
                 && appendEntries.getReplicatedToAllIndex() != -1 && numLogEntries > 0
-                && !isLogEntryPresent(appendEntries.getEntries().get(0).getIndex() - 1)
-                && !context.getReplicatedLog().isInSnapshot(appendEntries.getEntries().get(0).getIndex() - 1)) {
+                && !isLogEntryPresent(appendEntries.getEntries().get(0).getIndex() - 1)) {
             LOG.debug(
                     "{}: Cannot append entries because the calculated previousIndex {} was not found in the in-memory journal",
                     logName(), appendEntries.getEntries().get(0).getIndex() - 1);
index f4846d2589ad050632b47bc24c15b451e818b218..1ec197de13ef6606b609abfefbf8dc5fe91c81f2 100644 (file)
@@ -962,40 +962,6 @@ public class RaftActorTest extends AbstractActorTest {
         }};
     }
 
-    @Test
-    public void testRaftActorOnRecoverySnapshot() throws Exception {
-        TEST_LOG.info("testRaftActorOnRecoverySnapshot");
-
-        new JavaTestKit(getSystem()) {{
-                String persistenceId = factory.generateActorId("follower-");
-
-                DefaultConfigParamsImpl config = new DefaultConfigParamsImpl();
-
-                // Set the heartbeat interval high to essentially disable election otherwise the test
-                // may fail if the actor is switched to Leader
-                config.setHeartBeatInterval(new FiniteDuration(1, TimeUnit.DAYS));
-
-                ImmutableMap<String, String> peerAddresses = ImmutableMap.<String, String>builder().put("member1", "address").build();
-
-                // Create mock ReplicatedLogEntry
-                ReplicatedLogEntry replLogEntry = new MockRaftActorContext.MockReplicatedLogEntry(1,1,
-                        new MockRaftActorContext.MockPayload("F", 1));
-
-                InMemoryJournal.addEntry(persistenceId, 1, replLogEntry);
-
-                TestActorRef<MockRaftActor> ref = factory.createTestActor(
-                        MockRaftActor.props(persistenceId, peerAddresses, config));
-
-                MockRaftActor mockRaftActor = ref.underlyingActor();
-
-                mockRaftActor.waitForRecoveryComplete();
-
-                mockRaftActor.waitForInitializeBehaviorComplete();
-
-                verify(mockRaftActor.snapshotCohortDelegate, timeout(5000)).createSnapshot(any(ActorRef.class));
-            }};
-    }
-
     @Test
     public void testSwitchBehavior(){
         String persistenceId = factory.generateActorId("leader-");
@@ -1133,9 +1099,6 @@ public class RaftActorTest extends AbstractActorTest {
 
         mockRaftActor.waitForRecoveryComplete();
 
-        // Wait for snapshot after recovery
-        verify(mockRaftActor.snapshotCohortDelegate, timeout(5000)).createSnapshot(any(ActorRef.class));
-
         mockRaftActor.snapshotCohortDelegate = mock(RaftActorSnapshotCohort.class);
 
         raftActorRef.tell(GetSnapshot.INSTANCE, kit.getRef());
index 6334173cc6ff61d3041ab373b713d477d8cf4c88..c37aee71d3d1ad214779f72fcde9e72f21b710ca 100644 (file)
@@ -188,9 +188,9 @@ public class ReplicationAndSnapshotsIntegrationTest extends AbstractRaftActorInt
         // Verify the persisted snapshot in the leader. This should reflect the advanced snapshot index as
         // the last applied log entry (2) even though the leader hasn't yet advanced its cached snapshot index.
         List<Snapshot> persistedSnapshots = InMemorySnapshotStore.getSnapshots(leaderId, Snapshot.class);
-        assertEquals("Persisted snapshots size", 2, persistedSnapshots.size());
-        verifySnapshot("Persisted", persistedSnapshots.get(1), initialTerm, 2, currentTerm, 3);
-        List<ReplicatedLogEntry> unAppliedEntry = persistedSnapshots.get(1).getUnAppliedEntries();
+        assertEquals("Persisted snapshots size", 1, persistedSnapshots.size());
+        verifySnapshot("Persisted", persistedSnapshots.get(0), initialTerm, 2, currentTerm, 3);
+        List<ReplicatedLogEntry> unAppliedEntry = persistedSnapshots.get(0).getUnAppliedEntries();
         assertEquals("Persisted Snapshot getUnAppliedEntries size", 1, unAppliedEntry.size());
         verifyReplicatedLogEntry(unAppliedEntry.get(0), currentTerm, 3, payload3);