Bug 6587: Retain state when transitioning between Leader and IsolatedLeader
diff --git a/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/behaviors/AbstractLeader.java b/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/behaviors/AbstractLeader.java
index 5219ebb3107c1a62f36a3dc8b285be72dad88532..b241e0a67a4f81118b1592e25bc1011e336318dc 100644
--- a/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/behaviors/AbstractLeader.java
+++ b/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/behaviors/AbstractLeader.java
@@ -35,7 +35,6 @@ import org.opendaylight.controller.cluster.raft.PeerInfo;
 import org.opendaylight.controller.cluster.raft.RaftActorContext;
 import org.opendaylight.controller.cluster.raft.RaftState;
 import org.opendaylight.controller.cluster.raft.ReplicatedLogEntry;
-import org.opendaylight.controller.cluster.raft.ServerConfigurationPayload;
 import org.opendaylight.controller.cluster.raft.Snapshot;
 import org.opendaylight.controller.cluster.raft.VotingState;
 import org.opendaylight.controller.cluster.raft.base.messages.Replicate;
@@ -48,6 +47,7 @@ import org.opendaylight.controller.cluster.raft.messages.InstallSnapshotReply;
 import org.opendaylight.controller.cluster.raft.messages.RaftRPC;
 import org.opendaylight.controller.cluster.raft.messages.RequestVoteReply;
 import org.opendaylight.controller.cluster.raft.messages.UnInitializedFollowerSnapshotReply;
+import org.opendaylight.controller.cluster.raft.persisted.ServerConfigurationPayload;
 import scala.concurrent.duration.FiniteDuration;
 
 /**
@@ -96,23 +96,29 @@ public abstract class AbstractLeader extends AbstractRaftActorBehavior {
     private final Queue<ClientRequestTracker> trackers = new LinkedList<>();
 
     private Cancellable heartbeatSchedule = null;
-    private Optional<SnapshotHolder> snapshot;
+    private Optional<SnapshotHolder> snapshot = Optional.absent();
     private int minReplicationCount;
 
-    protected AbstractLeader(RaftActorContext context, RaftState state) {
+    protected AbstractLeader(RaftActorContext context, RaftState state,
+            @Nullable AbstractLeader initializeFromLeader) {
         super(context, state);
 
-        for(PeerInfo peerInfo: context.getPeers()) {
-            FollowerLogInformation followerLogInformation = new FollowerLogInformationImpl(peerInfo, -1, context);
-            followerToLog.put(peerInfo.getId(), followerLogInformation);
+        if(initializeFromLeader != null) {
+            followerToLog.putAll(initializeFromLeader.followerToLog);
+            mapFollowerToSnapshot.putAll(initializeFromLeader.mapFollowerToSnapshot);
+            snapshot = initializeFromLeader.snapshot;
+            trackers.addAll(initializeFromLeader.trackers);
+        } else {
+            for(PeerInfo peerInfo: context.getPeers()) {
+                FollowerLogInformation followerLogInformation = new FollowerLogInformationImpl(peerInfo, -1, context);
+                followerToLog.put(peerInfo.getId(), followerLogInformation);
+            }
         }
 
         LOG.debug("{}: Election: Leader has following peers: {}", logName(), getFollowerIds());
 
         updateMinReplicaCount();
 
-        snapshot = Optional.absent();
-
         // Immediately schedule a heartbeat
         // Upon election: send initial empty AppendEntries RPCs
         // (heartbeat) to each server; repeat during idle periods to
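
The three-argument constructor added above lets a successor leader behavior (e.g. on a Leader to IsolatedLeader transition) copy the follower bookkeeping, in-flight snapshot, and client request trackers from its predecessor instead of rebuilding them. A minimal, self-contained sketch of the same copy-or-initialize pattern, with illustrative names rather than the real Leader/IsolatedLeader types:

    import java.util.HashMap;
    import java.util.Map;

    // Illustrative only: mirrors the copy-or-initialize pattern of the new
    // AbstractLeader constructor, not the actual class.
    class FollowerBookkeeping {
        private final Map<String, Long> matchIndexByFollower = new HashMap<>();

        FollowerBookkeeping(Iterable<String> peerIds, FollowerBookkeeping previous) {
            if (previous != null) {
                // Retain state accumulated by the prior leader behavior.
                matchIndexByFollower.putAll(previous.matchIndexByFollower);
            } else {
                // Fresh election: start every follower at "unknown" (-1).
                for (String id : peerIds) {
                    matchIndexByFollower.put(id, -1L);
                }
            }
        }

        FollowerBookkeeping(Iterable<String> peerIds) {
            this(peerIds, null);
        }
    }
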
@@ -123,6 +129,10 @@ public abstract class AbstractLeader extends AbstractRaftActorBehavior {
         scheduleHeartBeat(context.getConfigParams().getHeartBeatInterval());
     }
 
+    protected AbstractLeader(RaftActorContext context, RaftState state) {
+        this(context, state, null);
+    }
+
     /**
      * Return an immutable collection of follower identifiers.
      *
@@ -218,13 +228,36 @@ public abstract class AbstractLeader extends AbstractRaftActorBehavior {
         followerLogInformation.setRaftVersion(appendEntriesReply.getRaftVersion());
 
         boolean updated = false;
-        if (appendEntriesReply.isSuccess()) {
+        if(appendEntriesReply.getLogLastIndex() > context.getReplicatedLog().lastIndex()) {
+            // The follower's log is actually ahead of the leader's log. Normally this doesn't happen
+            // in raft as a node cannot become leader if its log is behind another's. However, the
+            // non-voting semantics deviate a bit from raft. Only voting members participate in
+            // elections and can become leader so it's possible for a non-voting follower to be ahead
+            // of the leader. This can happen if persistence is disabled and all voting members are
+            // restarted. In this case, the voting leader will start out with an empty log however
+            // the non-voting followers still retain the previous data in memory. On the first
+            // AppendEntries, the non-voting follower returns a successful reply because the prevLogIndex
+            // sent by the leader is -1 and thus the integrity checks pass. However the follower's returned
+            // lastLogIndex may be higher in which case we want to reset the follower by installing a
+            // snapshot. It's also possible that the follower's last log index is behind the leader's.
+            // However in this case the log terms won't match and the logs will conflict - this is handled
+            // elsewhere.
+            LOG.debug("{}: handleAppendEntriesReply: follower {} lastIndex {} is ahead of our lastIndex {} - forcing install snaphot",
+                    logName(), followerLogInformation.getId(), appendEntriesReply.getLogLastIndex(),
+                    context.getReplicatedLog().lastIndex());
+
+            followerLogInformation.setMatchIndex(-1);
+            followerLogInformation.setNextIndex(-1);
+
+            initiateCaptureSnapshot(followerId);
+            updated = true;
+        } else if (appendEntriesReply.isSuccess()) {
             updated = updateFollowerLogInformation(followerLogInformation, appendEntriesReply);
         } else {
             LOG.debug("{}: handleAppendEntriesReply: received unsuccessful reply: {}", logName(), appendEntriesReply);
 
             long followerLastLogIndex = appendEntriesReply.getLogLastIndex();
-            ReplicatedLogEntry followersLastLogEntry = context.getReplicatedLog().get(followerLastLogIndex);
+            long followersLastLogTerm = getLogEntryTerm(followerLastLogIndex);
             if(appendEntriesReply.isForceInstallSnapshot()) {
                 // Reset the followers match and next index. This is to signal that this follower has nothing
                 // in common with this Leader and so would require a snapshot to be installed
@@ -233,8 +266,8 @@ public abstract class AbstractLeader extends AbstractRaftActorBehavior {
 
                 // Force initiate a snapshot capture
                 initiateCaptureSnapshot(followerId);
-            } else if(followerLastLogIndex < 0 || (followersLastLogEntry != null &&
-                    followersLastLogEntry.getTerm() == appendEntriesReply.getLogLastTerm())) {
+            } else if(followerLastLogIndex < 0 || (followersLastLogTerm >= 0 &&
+                    followersLastLogTerm == appendEntriesReply.getLogLastTerm())) {
                 // The follower's log is empty or the last entry is present in the leader's journal
                 // and the terms match so the follower is just behind the leader's journal from
                 // the last snapshot, if any. We'll catch up the follower quickly by starting at the
@@ -242,11 +275,11 @@ public abstract class AbstractLeader extends AbstractRaftActorBehavior {
 
                 updated = updateFollowerLogInformation(followerLogInformation, appendEntriesReply);
             } else {
-                // TODO: When we find that the follower is out of sync with the
-                // Leader we simply decrement that followers next index by 1.
-                // Would it be possible to do better than this? The RAFT spec
-                // does not explicitly deal with it but may be something for us to
-                // think about.
+                // The follower's log conflicts with leader's log so decrement follower's next index by 1
+                // in an attempt to find where the logs match.
+
+                LOG.debug("{}: follower's last log term {} conflicts with the leader's {} - dec next index",
+                        logName(), appendEntriesReply.getLogLastTerm(), followersLastLogTerm);
 
                 followerLogInformation.decrNextIndex();
             }
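
The branches added to handleAppendEntriesReply above boil down to the decision order sketched below. This is a simplified paraphrase (primitive parameters stand in for AppendEntriesReply and the leader's log, and the forceInstallSnapshot case is omitted), not the actual method:

    // Simplified paraphrase of the reply-handling order; not the real signatures.
    class ReplyHandlingSketch {
        static String classify(boolean success, long followerLastIndex, long followerLastTerm,
                long leaderLastIndex, long leaderTermAtFollowerLastIndex) {
            if (followerLastIndex > leaderLastIndex) {
                // A non-voting follower may be ahead after the voting members
                // restart without persistence; reset it via a snapshot.
                return "install snapshot";
            } else if (success) {
                return "advance matchIndex/nextIndex";
            } else if (followerLastIndex < 0
                    || (leaderTermAtFollowerLastIndex >= 0
                        && leaderTermAtFollowerLastIndex == followerLastTerm)) {
                // Empty follower log, or terms match: catch up from the follower's last index.
                return "resync from follower's last index";
            } else {
                // Terms conflict: probe backwards one entry at a time.
                return "decrement nextIndex";
            }
        }
    }
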
@@ -267,7 +300,7 @@ public abstract class AbstractLeader extends AbstractRaftActorBehavior {
             LOG.trace("{}: checking Nth index {}", logName(), N);
             for (FollowerLogInformation info : followerToLog.values()) {
                 final PeerInfo peerInfo = context.getPeerInfo(info.getId());
-                if(info.getMatchIndex() >= N && (peerInfo != null && peerInfo.isVoting())) {
+                if(info.getMatchIndex() >= N && peerInfo != null && peerInfo.isVoting()) {
                     replicatedCount++;
                 } else if(LOG.isTraceEnabled()) {
                     LOG.trace("{}: Not counting follower {} - matchIndex: {}, {}", logName(), info.getId(),
@@ -321,6 +354,7 @@ public abstract class AbstractLeader extends AbstractRaftActorBehavior {
 
         //Send the next log entry immediately, if possible, no need to wait for heartbeat to trigger that event
         sendUpdatesToFollower(followerId, followerLogInformation, false, !updated);
+
         return this;
     }
 
@@ -373,11 +407,9 @@ public abstract class AbstractLeader extends AbstractRaftActorBehavior {
     protected void beforeSendHeartbeat(){}
 
     @Override
-    public RaftActorBehavior handleMessage(ActorRef sender, Object originalMessage) {
+    public RaftActorBehavior handleMessage(ActorRef sender, Object message) {
         Preconditions.checkNotNull(sender, "sender should not be null");
 
-        Object message = fromSerializableMessage(originalMessage);
-
         if (message instanceof RaftRPC) {
             RaftRPC rpc = (RaftRPC) message;
             // If RPC request or response contains term T > currentTerm:
@@ -505,18 +537,17 @@ public abstract class AbstractLeader extends AbstractRaftActorBehavior {
     private void replicate(Replicate replicate) {
         long logIndex = replicate.getReplicatedLogEntry().getIndex();
 
-        LOG.debug("{}: Replicate message: identifier: {}, logIndex: {}", logName(),
-                replicate.getIdentifier(), logIndex);
+        LOG.debug("{}: Replicate message: identifier: {}, logIndex: {}, payload: {}", logName(),
+                replicate.getIdentifier(), logIndex, replicate.getReplicatedLogEntry().getData().getClass());
 
         // Create a tracker entry we will use this later to notify the
         // client actor
-        trackers.add(
-            new ClientRequestTrackerImpl(replicate.getClientActor(),
-                replicate.getIdentifier(),
-                logIndex)
-        );
+        if(replicate.getClientActor() != null) {
+            trackers.add(new ClientRequestTrackerImpl(replicate.getClientActor(), replicate.getIdentifier(),
+                    logIndex));
+        }
 
-        boolean applyModificationToState = followerToLog.isEmpty()
+        boolean applyModificationToState = !context.anyVotingPeers()
                 || context.getRaftPolicy().applyModificationToStateBeforeConsensus();
 
         if(applyModificationToState){
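
Two behavioral changes in replicate() above: a ClientRequestTracker is only registered when the Replicate message carries a client actor, and the entry is applied to the local state immediately when there are no voting peers to form a quorum with (or the raft policy allows applying before consensus). A small sketch of that guard, using stand-in booleans rather than the real RaftActorContext calls:

    // Stand-ins for context.anyVotingPeers() and
    // context.getRaftPolicy().applyModificationToStateBeforeConsensus().
    class ApplyGuardSketch {
        static boolean shouldApplyImmediately(boolean anyVotingPeers, boolean applyBeforeConsensus) {
            // No voting peers means there is no quorum to wait for,
            // so the leader applies the entry to its own state right away.
            return !anyVotingPeers || applyBeforeConsensus;
        }
    }
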
@@ -596,7 +627,7 @@ public abstract class AbstractLeader extends AbstractRaftActorBehavior {
                     // then snapshot should be sent
 
                     if (LOG.isDebugEnabled()) {
-                        LOG.debug(String.format("%s: InitiateInstallSnapshot to follower: %s," +
+                        LOG.debug(String.format("%s: InitiateInstallSnapshot to follower: %s, " +
                                     "follower-nextIndex: %d, leader-snapshot-index: %d,  " +
                                     "leader-last-index: %d", logName(), followerId,
                                     followerNextIndex, leaderSnapShotIndex, leaderLastIndex));
@@ -626,8 +657,8 @@ public abstract class AbstractLeader extends AbstractRaftActorBehavior {
     private void sendAppendEntriesToFollower(ActorSelection followerActor, long followerNextIndex,
         List<ReplicatedLogEntry> entries, String followerId) {
         AppendEntries appendEntries = new AppendEntries(currentTerm(), context.getId(),
-            prevLogIndex(followerNextIndex),
-            prevLogTerm(followerNextIndex), entries,
+            getLogEntryIndex(followerNextIndex - 1),
+            getLogEntryTerm(followerNextIndex - 1), entries,
             context.getCommitIndex(), super.getReplicatedToAllIndex(), context.getPayloadVersion());
 
         if(!entries.isEmpty() || LOG.isTraceEnabled()) {
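
The AppendEntries construction above now computes the previous entry's index and term from followerNextIndex - 1 via the getLogEntryIndex/getLogEntryTerm helpers. A toy illustration of that lookup against a simple in-memory list of terms (an assumption for illustration, not the actual ReplicatedLog or behavior helpers):

    import java.util.List;

    // Toy lookup only: a plain list of terms indexed by log index stands in
    // for the real journal helpers used above.
    class PrevEntrySketch {
        static long prevLogTerm(List<Long> termsByIndex, long followerNextIndex) {
            long prevIndex = followerNextIndex - 1;
            if (prevIndex < 0 || prevIndex >= termsByIndex.size()) {
                return -1L; // no previous entry known
            }
            return termsByIndex.get((int) prevIndex);
        }
    }
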
@@ -669,9 +700,9 @@ public abstract class AbstractLeader extends AbstractRaftActorBehavior {
         // If the follower's nextIndex is -1 then we might as well send it a snapshot
         // Otherwise send it a snapshot only if the nextIndex is not present in the log but is present
         // in the snapshot
-        return (nextIndex == -1 ||
+        return nextIndex == -1 ||
                 (!context.getReplicatedLog().isPresent(nextIndex)
-                        && context.getReplicatedLog().isInSnapshot(nextIndex)));
+                        && context.getReplicatedLog().isInSnapshot(nextIndex));
 
     }
 
@@ -811,7 +842,7 @@ public abstract class AbstractLeader extends AbstractRaftActorBehavior {
                 }
             }
         }
-        return (minPresent != 0);
+        return minPresent != 0;
     }
 
     /**
@@ -833,8 +864,8 @@ public abstract class AbstractLeader extends AbstractRaftActorBehavior {
         public FollowerToSnapshot(ByteString snapshotBytes) {
             this.snapshotBytes = snapshotBytes;
             int size = snapshotBytes.size();
-            totalChunks = ( size / context.getConfigParams().getSnapshotChunkSize()) +
-                ((size % context.getConfigParams().getSnapshotChunkSize()) > 0 ? 1 : 0);
+            totalChunks = (size / context.getConfigParams().getSnapshotChunkSize()) +
+                (size % context.getConfigParams().getSnapshotChunkSize() > 0 ? 1 : 0);
             if(LOG.isDebugEnabled()) {
                 LOG.debug("{}: Snapshot {} bytes, total chunks to send:{}",
                         logName(), size, totalChunks);
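
The totalChunks expression above is a ceiling division of the snapshot size by the configured chunk size. A worked example with assumed values (not the actual default chunk size):

    public class ChunkCountExample {
        public static void main(String[] args) {
            // Same form as the totalChunks computation above:
            // 2,500,000 bytes split into 1,000,000-byte chunks -> 3 chunks.
            int size = 2_500_000;
            int chunkSize = 1_000_000;
            int totalChunks = (size / chunkSize) + (size % chunkSize > 0 ? 1 : 0);
            System.out.println(totalChunks); // prints 3
        }
    }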