From e5f441207182bcadf0a047c77e04af197ad6a7ae Mon Sep 17 00:00:00 2001 From: Moiz Raja Date: Wed, 14 Jan 2015 16:21:22 -0800 Subject: [PATCH] BUG 2585 : Make Election Timeout Factor configurable Keeping the Heartbeat interval small and increasing the timeout factor allows us to skip a few heartbeats if necessary without causing Followers to become Candidates. Simply increasing the heartbeat interval has the negative effect of causing replication messages for out-of-date followers to go slower, so making the election factor configurable will be helpful in keeping the cluster state stable. Change-Id: Iae8105d65bba4a37987bfddb9f22d9d4d862a1fd Signed-off-by: Moiz Raja --- .../controller/cluster/raft/ConfigParams.java | 8 ++++++++ .../cluster/raft/DefaultConfigParamsImpl.java | 15 ++++++++++++--- .../controller/cluster/raft/RaftActor.java | 14 ++++++++++---- .../behaviors/AbstractRaftActorBehavior.java | 7 +++---- .../cluster/datastore/DatastoreContext.java | 17 +++++++++++++++-- ...istributedConfigDataStoreProviderModule.java | 1 + ...butedOperationalDataStoreProviderModule.java | 1 + .../yang/distributed-datastore-provider.yang | 7 +++++++ 8 files changed, 57 insertions(+), 13 deletions(-) diff --git a/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/ConfigParams.java b/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/ConfigParams.java index 4245cf10f7..78a1335d58 100644 --- a/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/ConfigParams.java +++ b/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/ConfigParams.java @@ -76,4 +76,12 @@ public interface ConfigParams { * @return FiniteDuration */ FiniteDuration getIsolatedCheckInterval(); + + + /** + * The multiplication factor to be used to determine shard election timeout. 
The election timeout + * is determined by multiplying the election timeout factor with the heartbeat duration. + */ + long getElectionTimeoutFactor(); + } diff --git a/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/DefaultConfigParamsImpl.java b/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/DefaultConfigParamsImpl.java index 3a6bdbf0a3..86867e1d04 100644 --- a/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/DefaultConfigParamsImpl.java +++ b/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/DefaultConfigParamsImpl.java @@ -39,7 +39,6 @@ public class DefaultConfigParamsImpl implements ConfigParams { public static final FiniteDuration HEART_BEAT_INTERVAL = new FiniteDuration(100, TimeUnit.MILLISECONDS); - private FiniteDuration heartBeatInterval = HEART_BEAT_INTERVAL; private long snapshotBatchCount = SNAPSHOT_BATCH_COUNT; private int journalRecoveryLogBatchSize = JOURNAL_RECOVERY_LOG_BATCH_SIZE; @@ -50,6 +49,8 @@ public class DefaultConfigParamsImpl implements ConfigParams { // in-memory journal can use before it needs to snapshot private int snapshotDataThresholdPercentage = 12; + private long electionTimeoutFactor = 2; + public void setHeartBeatInterval(FiniteDuration heartBeatInterval) { this.heartBeatInterval = heartBeatInterval; } @@ -70,6 +71,10 @@ public class DefaultConfigParamsImpl implements ConfigParams { this.isolatedLeaderCheckInterval = isolatedLeaderCheckInterval; } + public void setElectionTimeoutFactor(long electionTimeoutFactor){ + this.electionTimeoutFactor = electionTimeoutFactor; + } + @Override public long getSnapshotBatchCount() { return snapshotBatchCount; @@ -88,8 +93,7 @@ public class DefaultConfigParamsImpl implements ConfigParams { @Override public FiniteDuration getElectionTimeOutInterval() { - // returns 2 times the heart beat interval - return 
getHeartBeatInterval().$times(2); + return getHeartBeatInterval().$times(electionTimeoutFactor); } @Override @@ -111,4 +115,9 @@ public class DefaultConfigParamsImpl implements ConfigParams { public FiniteDuration getIsolatedCheckInterval() { return isolatedLeaderCheckInterval; } + + @Override + public long getElectionTimeoutFactor() { + return electionTimeoutFactor; + } } diff --git a/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/RaftActor.java b/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/RaftActor.java index 3b84692077..0bf5c911bb 100644 --- a/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/RaftActor.java +++ b/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/RaftActor.java @@ -170,9 +170,7 @@ public abstract class RaftActor extends AbstractUntypedPersistentActor { onRecoveryComplete(); - RaftActorBehavior oldBehavior = currentBehavior; - currentBehavior = new Follower(context); - handleBehaviorChange(oldBehavior, currentBehavior); + initializeBehavior(); } } } @@ -271,8 +269,16 @@ public abstract class RaftActor extends AbstractUntypedPersistentActor { replicatedLog.lastIndex(), replicatedLog.snapshotIndex, replicatedLog.snapshotTerm, replicatedLog.size()); + initializeBehavior(); + } + + protected void initializeBehavior(){ + changeCurrentBehavior(new Follower(context)); + } + + protected void changeCurrentBehavior(RaftActorBehavior newBehavior){ RaftActorBehavior oldBehavior = currentBehavior; - currentBehavior = new Follower(context); + currentBehavior = newBehavior; handleBehaviorChange(oldBehavior, currentBehavior); } diff --git a/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/behaviors/AbstractRaftActorBehavior.java b/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/behaviors/AbstractRaftActorBehavior.java 
index f235221da9..04462be042 100644 --- a/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/behaviors/AbstractRaftActorBehavior.java +++ b/opendaylight/md-sal/sal-akka-raft/src/main/java/org/opendaylight/controller/cluster/raft/behaviors/AbstractRaftActorBehavior.java @@ -11,6 +11,8 @@ package org.opendaylight.controller.cluster.raft.behaviors; import akka.actor.ActorRef; import akka.actor.Cancellable; import akka.event.LoggingAdapter; +import java.util.Random; +import java.util.concurrent.TimeUnit; import org.opendaylight.controller.cluster.raft.ClientRequestTracker; import org.opendaylight.controller.cluster.raft.RaftActorContext; import org.opendaylight.controller.cluster.raft.ReplicatedLogEntry; @@ -24,9 +26,6 @@ import org.opendaylight.controller.cluster.raft.messages.RequestVote; import org.opendaylight.controller.cluster.raft.messages.RequestVoteReply; import scala.concurrent.duration.FiniteDuration; -import java.util.Random; -import java.util.concurrent.TimeUnit; - /** * Abstract class that represents the behavior of a RaftActor *

@@ -202,7 +201,7 @@ public abstract class AbstractRaftActorBehavior implements RaftActorBehavior { protected FiniteDuration electionDuration() { long variance = new Random().nextInt(context.getConfigParams().getElectionTimeVariance()); return context.getConfigParams().getElectionTimeOutInterval().$plus( - new FiniteDuration(variance, TimeUnit.MILLISECONDS)); + new FiniteDuration(variance, TimeUnit.MILLISECONDS)); } /** diff --git a/opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/cluster/datastore/DatastoreContext.java b/opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/cluster/datastore/DatastoreContext.java index daba3fdf8a..01e42dbb8e 100644 --- a/opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/cluster/datastore/DatastoreContext.java +++ b/opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/cluster/datastore/DatastoreContext.java @@ -36,13 +36,14 @@ public class DatastoreContext { private final Timeout shardLeaderElectionTimeout; private final boolean persistent; private final ConfigurationReader configurationReader; + private final long shardElectionTimeoutFactor; private DatastoreContext(InMemoryDOMDataStoreConfigProperties dataStoreProperties, ConfigParams shardRaftConfig, String dataStoreMXBeanType, int operationTimeoutInSeconds, Duration shardTransactionIdleTimeout, int shardTransactionCommitTimeoutInSeconds, int shardTransactionCommitQueueCapacity, Timeout shardInitializationTimeout, Timeout shardLeaderElectionTimeout, - boolean persistent, ConfigurationReader configurationReader) { + boolean persistent, ConfigurationReader configurationReader, long shardElectionTimeoutFactor) { this.dataStoreProperties = dataStoreProperties; this.shardRaftConfig = shardRaftConfig; this.dataStoreMXBeanType = dataStoreMXBeanType; @@ -54,6 +55,7 @@ public class DatastoreContext { this.shardLeaderElectionTimeout = 
shardLeaderElectionTimeout; this.persistent = persistent; this.configurationReader = configurationReader; + this.shardElectionTimeoutFactor = shardElectionTimeoutFactor; } public static Builder newBuilder() { @@ -104,6 +106,10 @@ public class DatastoreContext { return configurationReader; } + public long getShardElectionTimeoutFactor(){ + return this.shardElectionTimeoutFactor; + } + public static class Builder { private InMemoryDOMDataStoreConfigProperties dataStoreProperties; private Duration shardTransactionIdleTimeout = Duration.create(10, TimeUnit.MINUTES); @@ -120,6 +126,7 @@ public class DatastoreContext { private ConfigurationReader configurationReader = new FileConfigurationReader(); private int shardIsolatedLeaderCheckIntervalInMillis = shardHeartbeatIntervalInMillis * 10; private int shardSnapshotDataThresholdPercentage = 12; + private long shardElectionTimeoutFactor = 2; public Builder shardTransactionIdleTimeout(Duration shardTransactionIdleTimeout) { this.shardTransactionIdleTimeout = shardTransactionIdleTimeout; @@ -197,6 +204,11 @@ public class DatastoreContext { return this; } + public Builder shardElectionTimeoutFactor(long shardElectionTimeoutFactor){ + this.shardElectionTimeoutFactor = shardElectionTimeoutFactor; + return this; + } + public DatastoreContext build() { DefaultConfigParamsImpl raftConfig = new DefaultConfigParamsImpl(); @@ -205,6 +217,7 @@ public class DatastoreContext { raftConfig.setJournalRecoveryLogBatchSize(shardJournalRecoveryLogBatchSize); raftConfig.setSnapshotBatchCount(shardSnapshotBatchCount); raftConfig.setSnapshotDataThresholdPercentage(shardSnapshotDataThresholdPercentage); + raftConfig.setElectionTimeoutFactor(shardElectionTimeoutFactor); raftConfig.setIsolatedLeaderCheckInterval( new FiniteDuration(shardIsolatedLeaderCheckIntervalInMillis, TimeUnit.MILLISECONDS)); @@ -212,7 +225,7 @@ public class DatastoreContext { operationTimeoutInSeconds, shardTransactionIdleTimeout, shardTransactionCommitTimeoutInSeconds, 
shardTransactionCommitQueueCapacity, shardInitializationTimeout, shardLeaderElectionTimeout, - persistent, configurationReader); + persistent, configurationReader, shardElectionTimeoutFactor); } } } diff --git a/opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/config/yang/config/distributed_datastore_provider/DistributedConfigDataStoreProviderModule.java b/opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/config/yang/config/distributed_datastore_provider/DistributedConfigDataStoreProviderModule.java index 2db487952b..711c6a37b5 100644 --- a/opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/config/yang/config/distributed_datastore_provider/DistributedConfigDataStoreProviderModule.java +++ b/opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/config/yang/config/distributed_datastore_provider/DistributedConfigDataStoreProviderModule.java @@ -66,6 +66,7 @@ public class DistributedConfigDataStoreProviderModule extends .persistent(props.getPersistent().booleanValue()) .shardIsolatedLeaderCheckIntervalInMillis( props.getShardIsolatedLeaderCheckIntervalInMillis().getValue()) + .shardElectionTimeoutFactor(props.getShardElectionTimeoutFactor().getValue()) .build(); return DistributedDataStoreFactory.createInstance("config", getConfigSchemaServiceDependency(), diff --git a/opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/config/yang/config/distributed_datastore_provider/DistributedOperationalDataStoreProviderModule.java b/opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/config/yang/config/distributed_datastore_provider/DistributedOperationalDataStoreProviderModule.java index 866807e141..d9df06df1c 100644 --- 
a/opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/config/yang/config/distributed_datastore_provider/DistributedOperationalDataStoreProviderModule.java +++ b/opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/config/yang/config/distributed_datastore_provider/DistributedOperationalDataStoreProviderModule.java @@ -66,6 +66,7 @@ public class DistributedOperationalDataStoreProviderModule extends .persistent(props.getPersistent().booleanValue()) .shardIsolatedLeaderCheckIntervalInMillis( props.getShardIsolatedLeaderCheckIntervalInMillis().getValue()) + .shardElectionTimeoutFactor(props.getShardElectionTimeoutFactor().getValue()) .build(); return DistributedDataStoreFactory.createInstance("operational", diff --git a/opendaylight/md-sal/sal-distributed-datastore/src/main/yang/distributed-datastore-provider.yang b/opendaylight/md-sal/sal-distributed-datastore/src/main/yang/distributed-datastore-provider.yang index 367d4f45e2..46cd50d0c1 100644 --- a/opendaylight/md-sal/sal-distributed-datastore/src/main/yang/distributed-datastore-provider.yang +++ b/opendaylight/md-sal/sal-distributed-datastore/src/main/yang/distributed-datastore-provider.yang @@ -110,6 +110,13 @@ module distributed-datastore-provider { description "The interval at which a shard will send a heart beat message to its remote shard."; } + leaf shard-election-timeout-factor { + default 2; + type non-zero-uint32-type; + description "The multiplication factor to be used to determine shard election timeout. The shard election timeout + is determined by multiplying shard-heartbeat-interval-in-millis with the shard-election-timeout-factor"; + } + leaf operation-timeout-in-seconds { default 5; type operation-timeout-type; -- 2.36.6