BUG 2585 : Make Election Timeout Factor configurable 58/14158/2
authorMoiz Raja <moraja@cisco.com>
Thu, 15 Jan 2015 00:21:22 +0000 (16:21 -0800)
committerMoiz Raja <moraja@cisco.com>
Tue, 20 Jan 2015 02:53:48 +0000 (18:53 -0800)
Keeping the Heartbeat interval small and increasing the timeout
factor allows us to skip a few heartbeats if necessary without
causing Followers to become Candidates.

Simply increasing the heartbeat interval has the negative effect
of causing replication messages for out-of-date followers to go
more slowly, so making the election factor configurable will be helpful
in keeping the cluster state stable.

Change-Id: Iae8105d65bba4a37987bfddb9f22d9d4d862a1fd
Signed-off-by: Moiz Raja <moraja@cisco.com>
opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/ConfigParams.java
opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/DefaultConfigParamsImpl.java
opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/RaftActor.java
opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/behaviors/AbstractRaftActorBehavior.java
opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/cluster/datastore/DatastoreContext.java
opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/config/yang/config/distributed_datastore_provider/DistributedConfigDataStoreProviderModule.java
opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/config/yang/config/distributed_datastore_provider/DistributedOperationalDataStoreProviderModule.java
opendaylight/md-sal/sal-distributed-datastore/src/main/yang/distributed-datastore-provider.yang

index 4245cf10f778fc81fe3181b0db5d9b825ea23a4c..78a1335d58a2ed7cacf78572425be7beb20274ca 100644 (file)
@@ -76,4 +76,12 @@ public interface ConfigParams {
      * @return FiniteDuration
      */
     FiniteDuration getIsolatedCheckInterval();
+
+
+    /**
+     * Returns the multiplication factor used to determine the shard election timeout.
+     * @return the factor by which the heartbeat duration is multiplied to obtain the election timeout
+     */
+    long getElectionTimeoutFactor();
+
 }
index 3a6bdbf0a3ec75151313c6b9a00143ea1d15cbd0..86867e1d040ee84396450ee72f6097093aecd70e 100644 (file)
@@ -39,7 +39,6 @@ public class DefaultConfigParamsImpl implements ConfigParams {
     public static final FiniteDuration HEART_BEAT_INTERVAL =
         new FiniteDuration(100, TimeUnit.MILLISECONDS);
 
-
     private FiniteDuration heartBeatInterval = HEART_BEAT_INTERVAL;
     private long snapshotBatchCount = SNAPSHOT_BATCH_COUNT;
     private int journalRecoveryLogBatchSize = JOURNAL_RECOVERY_LOG_BATCH_SIZE;
@@ -50,6 +49,8 @@ public class DefaultConfigParamsImpl implements ConfigParams {
     // in-memory journal can use before it needs to snapshot
     private int snapshotDataThresholdPercentage = 12;
 
+    private long electionTimeoutFactor = 2;
+
     public void setHeartBeatInterval(FiniteDuration heartBeatInterval) {
         this.heartBeatInterval = heartBeatInterval;
     }
@@ -70,6 +71,10 @@ public class DefaultConfigParamsImpl implements ConfigParams {
         this.isolatedLeaderCheckInterval = isolatedLeaderCheckInterval;
     }
 
+    public void setElectionTimeoutFactor(long electionTimeoutFactor){ // multiplier later applied to the heartbeat interval
+        this.electionTimeoutFactor = electionTimeoutFactor; // stored as-is; no range check is performed here
+    }
+
     @Override
     public long getSnapshotBatchCount() {
         return snapshotBatchCount;
@@ -88,8 +93,7 @@ public class DefaultConfigParamsImpl implements ConfigParams {
 
     @Override
     public FiniteDuration getElectionTimeOutInterval() {
-        // returns 2 times the heart beat interval
-        return getHeartBeatInterval().$times(2);
+        return getHeartBeatInterval().$times(electionTimeoutFactor); // heartbeat interval scaled by the configurable factor instead of a fixed 2
     }
 
     @Override
@@ -111,4 +115,9 @@ public class DefaultConfigParamsImpl implements ConfigParams {
     public FiniteDuration getIsolatedCheckInterval() {
         return isolatedLeaderCheckInterval;
     }
+
+    @Override
+    public long getElectionTimeoutFactor() {
+        return electionTimeoutFactor; // field is initialized to 2 and only changed through setElectionTimeoutFactor
+    }
 }
index 3b8469207798952298a6f304a2b07e18153e4ea2..0bf5c911bb29880a9f8c625c28da79c3af04678a 100644 (file)
@@ -170,9 +170,7 @@ public abstract class RaftActor extends AbstractUntypedPersistentActor {
 
                 onRecoveryComplete();
 
-                RaftActorBehavior oldBehavior = currentBehavior;
-                currentBehavior = new Follower(context);
-                handleBehaviorChange(oldBehavior, currentBehavior);
+                initializeBehavior();
             }
         }
     }
@@ -271,8 +269,16 @@ public abstract class RaftActor extends AbstractUntypedPersistentActor {
             replicatedLog.lastIndex(), replicatedLog.snapshotIndex,
             replicatedLog.snapshotTerm, replicatedLog.size());
 
+        initializeBehavior();
+    }
+
+    protected void initializeBehavior(){ // protected hook that installs the initial behavior
+        changeCurrentBehavior(new Follower(context)); // the actor always starts out as a Follower here
+    }
+
+    protected void changeCurrentBehavior(RaftActorBehavior newBehavior){ // replaces the current behavior and reports the transition
         RaftActorBehavior oldBehavior = currentBehavior;
-        currentBehavior = new Follower(context);
+        currentBehavior = newBehavior; // caller chooses the behavior; the previous one is passed to handleBehaviorChange below
         handleBehaviorChange(oldBehavior, currentBehavior);
     }
 
index f235221da940d4cb21c491e78b9624c2afa18a80..04462be0420eaa3f0504be0523b7c9371128273d 100644 (file)
@@ -11,6 +11,8 @@ package org.opendaylight.controller.cluster.raft.behaviors;
 import akka.actor.ActorRef;
 import akka.actor.Cancellable;
 import akka.event.LoggingAdapter;
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
 import org.opendaylight.controller.cluster.raft.ClientRequestTracker;
 import org.opendaylight.controller.cluster.raft.RaftActorContext;
 import org.opendaylight.controller.cluster.raft.ReplicatedLogEntry;
@@ -24,9 +26,6 @@ import org.opendaylight.controller.cluster.raft.messages.RequestVote;
 import org.opendaylight.controller.cluster.raft.messages.RequestVoteReply;
 import scala.concurrent.duration.FiniteDuration;
 
-import java.util.Random;
-import java.util.concurrent.TimeUnit;
-
 /**
  * Abstract class that represents the behavior of a RaftActor
  * <p/>
@@ -202,7 +201,7 @@ public abstract class AbstractRaftActorBehavior implements RaftActorBehavior {
     protected FiniteDuration electionDuration() {
         long variance = new Random().nextInt(context.getConfigParams().getElectionTimeVariance());
         return context.getConfigParams().getElectionTimeOutInterval().$plus(
-            new FiniteDuration(variance, TimeUnit.MILLISECONDS));
+                new FiniteDuration(variance, TimeUnit.MILLISECONDS)); // random variance in milliseconds added on top of the configured election timeout
     }
 
     /**
index daba3fdf8ac18ac889c348b1af017c679603ebb3..01e42dbb8e92400b7f4af73d55257d6f9ebbe4a2 100644 (file)
@@ -36,13 +36,14 @@ public class DatastoreContext {
     private final Timeout shardLeaderElectionTimeout;
     private final boolean persistent;
     private final ConfigurationReader configurationReader;
+    private final long shardElectionTimeoutFactor;
 
     private DatastoreContext(InMemoryDOMDataStoreConfigProperties dataStoreProperties,
             ConfigParams shardRaftConfig, String dataStoreMXBeanType, int operationTimeoutInSeconds,
             Duration shardTransactionIdleTimeout, int shardTransactionCommitTimeoutInSeconds,
             int shardTransactionCommitQueueCapacity, Timeout shardInitializationTimeout,
             Timeout shardLeaderElectionTimeout,
-            boolean persistent, ConfigurationReader configurationReader) {
+            boolean persistent, ConfigurationReader configurationReader, long shardElectionTimeoutFactor) {
         this.dataStoreProperties = dataStoreProperties;
         this.shardRaftConfig = shardRaftConfig;
         this.dataStoreMXBeanType = dataStoreMXBeanType;
@@ -54,6 +55,7 @@ public class DatastoreContext {
         this.shardLeaderElectionTimeout = shardLeaderElectionTimeout;
         this.persistent = persistent;
         this.configurationReader = configurationReader;
+        this.shardElectionTimeoutFactor = shardElectionTimeoutFactor;
     }
 
     public static Builder newBuilder() {
@@ -104,6 +106,10 @@ public class DatastoreContext {
         return configurationReader;
     }
 
+    public long getShardElectionTimeoutFactor(){ // exposes the factor this context was built with
+        return this.shardElectionTimeoutFactor; // final field assigned once in the constructor
+    }
+
     public static class Builder {
         private InMemoryDOMDataStoreConfigProperties dataStoreProperties;
         private Duration shardTransactionIdleTimeout = Duration.create(10, TimeUnit.MINUTES);
@@ -120,6 +126,7 @@ public class DatastoreContext {
         private ConfigurationReader configurationReader = new FileConfigurationReader();
         private int shardIsolatedLeaderCheckIntervalInMillis = shardHeartbeatIntervalInMillis * 10;
         private int shardSnapshotDataThresholdPercentage = 12;
+        private long shardElectionTimeoutFactor = 2;
 
         public Builder shardTransactionIdleTimeout(Duration shardTransactionIdleTimeout) {
             this.shardTransactionIdleTimeout = shardTransactionIdleTimeout;
@@ -197,6 +204,11 @@ public class DatastoreContext {
             return this;
         }
 
+        public Builder shardElectionTimeoutFactor(long shardElectionTimeoutFactor){ // overrides the builder default of 2
+            this.shardElectionTimeoutFactor = shardElectionTimeoutFactor; // build() passes this to both the raft config and the DatastoreContext
+            return this; // returns the builder so calls can be chained
+        }
+
 
         public DatastoreContext build() {
             DefaultConfigParamsImpl raftConfig = new DefaultConfigParamsImpl();
@@ -205,6 +217,7 @@ public class DatastoreContext {
             raftConfig.setJournalRecoveryLogBatchSize(shardJournalRecoveryLogBatchSize);
             raftConfig.setSnapshotBatchCount(shardSnapshotBatchCount);
             raftConfig.setSnapshotDataThresholdPercentage(shardSnapshotDataThresholdPercentage);
+            raftConfig.setElectionTimeoutFactor(shardElectionTimeoutFactor);
             raftConfig.setIsolatedLeaderCheckInterval(
                 new FiniteDuration(shardIsolatedLeaderCheckIntervalInMillis, TimeUnit.MILLISECONDS));
 
@@ -212,7 +225,7 @@ public class DatastoreContext {
                     operationTimeoutInSeconds, shardTransactionIdleTimeout,
                     shardTransactionCommitTimeoutInSeconds, shardTransactionCommitQueueCapacity,
                     shardInitializationTimeout, shardLeaderElectionTimeout,
-                    persistent, configurationReader);
+                    persistent, configurationReader, shardElectionTimeoutFactor);
         }
     }
 }
index 2db487952b01cdeac2ea1567d0bc5fe793b88f30..711c6a37b5c8e9aff85dd91db8d5041be67e8d4c 100644 (file)
@@ -66,6 +66,7 @@ public class DistributedConfigDataStoreProviderModule extends
                 .persistent(props.getPersistent().booleanValue())
                 .shardIsolatedLeaderCheckIntervalInMillis(
                     props.getShardIsolatedLeaderCheckIntervalInMillis().getValue())
+                .shardElectionTimeoutFactor(props.getShardElectionTimeoutFactor().getValue())
                 .build();
 
         return DistributedDataStoreFactory.createInstance("config", getConfigSchemaServiceDependency(),
index 866807e141cf9cc4781a36e0d03f4f84f97cd264..d9df06df1c852e873225b670df7e3fd352213ef9 100644 (file)
@@ -66,6 +66,7 @@ public class DistributedOperationalDataStoreProviderModule extends
                 .persistent(props.getPersistent().booleanValue())
                 .shardIsolatedLeaderCheckIntervalInMillis(
                     props.getShardIsolatedLeaderCheckIntervalInMillis().getValue())
+                .shardElectionTimeoutFactor(props.getShardElectionTimeoutFactor().getValue())
                 .build();
 
         return DistributedDataStoreFactory.createInstance("operational",
index 367d4f45e235209dd3a07b51107dec6198d1c157..46cd50d0c158b6a7e316364a588d28f8364fffce 100644 (file)
@@ -110,6 +110,13 @@ module distributed-datastore-provider {
             description "The interval at which a shard will send a heart beat message to its remote shard.";
          }
 
+         leaf shard-election-timeout-factor {
+            default 2;
+            type non-zero-uint32-type;
+            description "The multiplication factor to be used to determine shard election timeout. The shard election timeout
+                         is determined by multiplying shard-heartbeat-interval-in-millis with the shard-election-timeout-factor";
+         }
+
          leaf operation-timeout-in-seconds {
             default 5;
             type operation-timeout-type;