From b14856a01f6515e6063f82e5fb448f7e9a57029b Mon Sep 17 00:00:00 2001
From: Moiz Raja
Date: Mon, 4 Apr 2016 18:27:17 -0700
Subject: [PATCH] BUG 5656: Entity ownership candidates not removed
 consistently on leadership change

This patch removes the candidates for all downed members when entity
ownership shard leadership changes. This fixes a corner case where a leader
and a follower are killed simultaneously in a cluster with more than 3
nodes. In this case the old leader does not get a chance to remove the
killed follower's candidates. The new leader, however, does know that the
follower is down, so it can remove the candidates for all downed followers
on shard leadership change.

Change-Id: If28f5656e0daee40fb96a937dbd0a868b7d3645a
Signed-off-by: Moiz Raja
---
 .../entityownership/EntityOwnershipShard.java |  13 +--
 ...ributedEntityOwnershipIntegrationTest.java |  98 +++++++++++++++-
 .../src/test/resources/application.conf       | 110 ++++++++++++++++++
 .../module-shards-default-5-node.conf         |  17 +++
 4 files changed, 229 insertions(+), 9 deletions(-)
 create mode 100644 opendaylight/md-sal/sal-distributed-datastore/src/test/resources/module-shards-default-5-node.conf

diff --git a/opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/cluster/datastore/entityownership/EntityOwnershipShard.java b/opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/cluster/datastore/entityownership/EntityOwnershipShard.java
index 76a6ac1e8f..0ca4980bbf 100644
--- a/opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/cluster/datastore/entityownership/EntityOwnershipShard.java
+++ b/opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/cluster/datastore/entityownership/EntityOwnershipShard.java
@@ -279,15 +279,12 @@ class EntityOwnershipShard extends Shard {
             // This allows the strategies to be re-initialized with existing statistics maintained by
             // EntityOwnershipStatistics
             strategyConfig.clearStrategies();
 
-            // We were just elected leader. If the old leader is down, select new owners for the entities
-            // owned by the down leader.
-            String oldLeaderMemberName = peerIdToMemberNames.get(oldLeader);
-
-            LOG.debug("{}: oldLeaderMemberName: {}", persistenceId(), oldLeaderMemberName);
-
-            if(downPeerMemberNames.contains(oldLeaderMemberName)) {
-                removeCandidateFromEntities(oldLeaderMemberName);
+            // Remove the candidates for all members that are known to be down. In a cluster with more than
+            // 3 nodes, a node other than the old leader may also be down when leadership transitions, so
+            // use this event to remove the candidates for all downed nodes.
+            for(String downPeerName : downPeerMemberNames) {
+                removeCandidateFromEntities(downPeerName);
             }
         } else {
             // The leader changed - notify the coordinator to check if pending modifications need to be sent.
diff --git a/opendaylight/md-sal/sal-distributed-datastore/src/test/java/org/opendaylight/controller/cluster/datastore/entityownership/DistributedEntityOwnershipIntegrationTest.java b/opendaylight/md-sal/sal-distributed-datastore/src/test/java/org/opendaylight/controller/cluster/datastore/entityownership/DistributedEntityOwnershipIntegrationTest.java
index d23a9faa69..e049238e3b 100644
--- a/opendaylight/md-sal/sal-distributed-datastore/src/test/java/org/opendaylight/controller/cluster/datastore/entityownership/DistributedEntityOwnershipIntegrationTest.java
+++ b/opendaylight/md-sal/sal-distributed-datastore/src/test/java/org/opendaylight/controller/cluster/datastore/entityownership/DistributedEntityOwnershipIntegrationTest.java
@@ -74,6 +74,7 @@ import org.opendaylight.yangtools.yang.model.api.SchemaContext;
  */
 public class DistributedEntityOwnershipIntegrationTest {
     private static final String MODULE_SHARDS_CONFIG = "module-shards-default.conf";
+    private static final String MODULE_SHARDS_5_NODE_CONFIG = "module-shards-default-5-node.conf";
     private static final String MODULE_SHARDS_MEMBER_1_CONFIG = "module-shards-default-member-1.conf";
     private static final String ENTITY_TYPE1 = "entityType1";
     private static final String ENTITY_TYPE2 = "entityType2";
@@ -318,7 +319,7 @@ public class DistributedEntityOwnershipIntegrationTest {
         leaderNode.cleanup();
         follower1Node.waitForMemberDown("member-1");
 
-        // Re-enable elections on folower1 so it becomes the leader
+        // Re-enable elections on follower1 so it becomes the leader
 
         ActorRef follower1Shard = IntegrationTestKit.findLocalShard(follower1Node.configDataStore().
                 getActorContext(), ENTITY_OWNERSHIP_SHARD_NAME);
@@ -336,6 +337,101 @@ public class DistributedEntityOwnershipIntegrationTest {
         verifyOwner(follower1Node.configDataStore(), ENTITY2, "member-3");
     }
 
+    @Test
+    public void testLeaderAndFollowerCandidatesRemovedAfterShutdown() throws Exception {
+        followerDatastoreContextBuilder.shardElectionTimeoutFactor(5).
+                customRaftPolicyImplementation(DisableElectionsRaftPolicy.class.getName());
+
+        String name = "test";
+        MemberNode leaderNode = MemberNode.builder(memberNodes).akkaConfig("Member1").testName(name).
+                moduleShardsConfig(MODULE_SHARDS_5_NODE_CONFIG).schemaContext(SCHEMA_CONTEXT).createOperDatastore(false).
+                datastoreContextBuilder(leaderDatastoreContextBuilder).build();
+
+        MemberNode follower1Node = MemberNode.builder(memberNodes).akkaConfig("Member2").testName(name).
+                moduleShardsConfig(MODULE_SHARDS_5_NODE_CONFIG).schemaContext(SCHEMA_CONTEXT).createOperDatastore(false).
+                datastoreContextBuilder(followerDatastoreContextBuilder).build();
+
+        MemberNode follower2Node = MemberNode.builder(memberNodes).akkaConfig("Member3").testName(name).
+                moduleShardsConfig(MODULE_SHARDS_5_NODE_CONFIG).schemaContext(SCHEMA_CONTEXT).createOperDatastore(false).
+                datastoreContextBuilder(followerDatastoreContextBuilder).build();
+
+        MemberNode follower3Node = MemberNode.builder(memberNodes).akkaConfig("Member4").testName(name).
+                moduleShardsConfig(MODULE_SHARDS_5_NODE_CONFIG).schemaContext(SCHEMA_CONTEXT).createOperDatastore(false).
+                datastoreContextBuilder(followerDatastoreContextBuilder).build();
+
+        MemberNode follower4Node = MemberNode.builder(memberNodes).akkaConfig("Member5").testName(name).
+                moduleShardsConfig(MODULE_SHARDS_5_NODE_CONFIG).schemaContext(SCHEMA_CONTEXT).createOperDatastore(false).
+                datastoreContextBuilder(followerDatastoreContextBuilder).build();
+
+        DistributedDataStore leaderDistributedDataStore = leaderNode.configDataStore();
+
+        leaderDistributedDataStore.waitTillReady();
+        follower1Node.configDataStore().waitTillReady();
+        follower2Node.configDataStore().waitTillReady();
+
+        EntityOwnershipService leaderEntityOwnershipService = newOwnershipService(leaderDistributedDataStore);
+        EntityOwnershipService follower1EntityOwnershipService = newOwnershipService(follower1Node.configDataStore());
+        EntityOwnershipService follower2EntityOwnershipService = newOwnershipService(follower2Node.configDataStore());
+        EntityOwnershipService follower3EntityOwnershipService = newOwnershipService(follower3Node.configDataStore());
+        EntityOwnershipService follower4EntityOwnershipService = newOwnershipService(follower4Node.configDataStore());
+
+        leaderNode.kit().waitUntilLeader(leaderNode.configDataStore().getActorContext(), ENTITY_OWNERSHIP_SHARD_NAME);
+
+        // Register follower1 candidate for entity1 and verify it becomes owner
+
+        follower1EntityOwnershipService.registerCandidate(ENTITY1);
+        verifyOwner(leaderDistributedDataStore, ENTITY1, "member-2");
+
+        // Register leader candidate for entity1
+
+        leaderEntityOwnershipService.registerCandidate(ENTITY1);
+        verifyCandidates(leaderDistributedDataStore, ENTITY1, "member-2", "member-1");
+        verifyOwner(leaderDistributedDataStore, ENTITY1, "member-2");
+
+        // Register leader candidate for entity2 and verify it becomes owner
+
+        leaderEntityOwnershipService.registerCandidate(ENTITY2);
+        verifyOwner(leaderDistributedDataStore, ENTITY2, "member-1");
+
+        // Register follower2 candidate for entity2
+
+        follower2EntityOwnershipService.registerCandidate(ENTITY2);
+        verifyCandidates(leaderDistributedDataStore, ENTITY2, "member-1", "member-3");
+        verifyOwner(leaderDistributedDataStore, ENTITY2, "member-1");
+
+        // Register follower3 as a candidate for entity2 as well
+
+        follower3EntityOwnershipService.registerCandidate(ENTITY2);
+        verifyCandidates(leaderDistributedDataStore, ENTITY2, "member-1", "member-3", "member-4");
+        verifyOwner(leaderDistributedDataStore, ENTITY2, "member-1");
+
+
+        // Shutdown the leader and follower3 and verify their candidates are removed
+
+        leaderNode.cleanup();
+        follower3Node.cleanup();
+
+        follower1Node.waitForMemberDown("member-1");
+        follower1Node.waitForMemberDown("member-4");
+
+        // Re-enable elections on follower1 so it becomes the leader
+
+        ActorRef follower1Shard = IntegrationTestKit.findLocalShard(follower1Node.configDataStore().
+                getActorContext(), ENTITY_OWNERSHIP_SHARD_NAME);
+        follower1Shard.tell(DatastoreContext.newBuilderFrom(followerDatastoreContextBuilder.build()).
+                customRaftPolicyImplementation(null).build(), ActorRef.noSender());
+
+        MemberNode.verifyRaftState(follower1Node.configDataStore(), ENTITY_OWNERSHIP_SHARD_NAME,
+                raftState -> assertEquals("Raft state", RaftState.Leader.toString(), raftState.getRaftState()));
+
+        // Verify the prior leader's and follower3's candidates are removed
+
+        verifyCandidates(follower1Node.configDataStore(), ENTITY1, "member-2");
+        verifyCandidates(follower1Node.configDataStore(), ENTITY2, "member-3");
+        verifyOwner(follower1Node.configDataStore(), ENTITY1, "member-2");
+        verifyOwner(follower1Node.configDataStore(), ENTITY2, "member-3");
+    }
+
     /**
      * Reproduces bug 4554
      *
diff --git a/opendaylight/md-sal/sal-distributed-datastore/src/test/resources/application.conf b/opendaylight/md-sal/sal-distributed-datastore/src/test/resources/application.conf
index 42fcd0ba40..9db9900577 100644
--- a/opendaylight/md-sal/sal-distributed-datastore/src/test/resources/application.conf
+++ b/opendaylight/md-sal/sal-distributed-datastore/src/test/resources/application.conf
@@ -197,3 +197,113 @@ Member3 {
     }
   }
 }
+
+Member4 {
+  bounded-mailbox {
+    mailbox-type = "org.opendaylight.controller.cluster.common.actor.MeteredBoundedMailbox"
+    mailbox-capacity = 1000
+    mailbox-push-timeout-time = 100ms
+  }
+
+  in-memory-journal {
+    class = "org.opendaylight.controller.cluster.raft.utils.InMemoryJournal"
+  }
+
+  in-memory-snapshot-store {
+    class = "org.opendaylight.controller.cluster.raft.utils.InMemorySnapshotStore"
+    plugin-dispatcher = "akka.persistence.dispatchers.default-plugin-dispatcher"
+  }
+
+  akka {
+    persistence.snapshot-store.plugin = "in-memory-snapshot-store"
+    persistence.journal.plugin = "in-memory-journal"
+
+    loglevel = "INFO"
+
+    actor {
+      provider = "akka.cluster.ClusterActorRefProvider"
+
+      serializers {
+        java = "akka.serialization.JavaSerializer"
+        proto = "akka.remote.serialization.ProtobufSerializer"
+        readylocal = "org.opendaylight.controller.cluster.datastore.messages.ReadyLocalTransactionSerializer"
+      }
+
+      serialization-bindings {
+        "com.google.protobuf.Message" = proto
+        "org.opendaylight.controller.cluster.datastore.messages.ReadyLocalTransaction" = readylocal
+      }
+    }
+    remote {
+      log-remote-lifecycle-events = off
+      netty.tcp {
+        hostname = "127.0.0.1"
+        port = 2560
+      }
+    }
+
+    cluster {
+      auto-down-unreachable-after = 100s
+      retry-unsuccessful-join-after = 100ms
+
+      roles = [
+        "member-4"
+      ]
+    }
+  }
+}
+
+Member5 {
+  bounded-mailbox {
+    mailbox-type = "org.opendaylight.controller.cluster.common.actor.MeteredBoundedMailbox"
+    mailbox-capacity = 1000
+    mailbox-push-timeout-time = 100ms
+  }
+
+  in-memory-journal {
+    class = "org.opendaylight.controller.cluster.raft.utils.InMemoryJournal"
+  }
+
+  in-memory-snapshot-store {
+    class = "org.opendaylight.controller.cluster.raft.utils.InMemorySnapshotStore"
+    plugin-dispatcher = "akka.persistence.dispatchers.default-plugin-dispatcher"
+  }
+
+  akka {
+    persistence.snapshot-store.plugin = "in-memory-snapshot-store"
+    persistence.journal.plugin = "in-memory-journal"
+
+    loglevel = "INFO"
+
+    actor {
+      provider = "akka.cluster.ClusterActorRefProvider"
+
+      serializers {
+        java = "akka.serialization.JavaSerializer"
+        proto = "akka.remote.serialization.ProtobufSerializer"
+        readylocal = "org.opendaylight.controller.cluster.datastore.messages.ReadyLocalTransactionSerializer"
+      }
+
+      serialization-bindings {
+        "com.google.protobuf.Message" = proto
+        "org.opendaylight.controller.cluster.datastore.messages.ReadyLocalTransaction" = readylocal
+      }
+    }
+    remote {
+      log-remote-lifecycle-events = off
+      netty.tcp {
+        hostname = "127.0.0.1"
+        port = 2561
+      }
+    }
+
+    cluster {
+      auto-down-unreachable-after = 100s
+      retry-unsuccessful-join-after = 100ms
+
+      roles = [
+        "member-5"
+      ]
+    }
+  }
+}
diff --git a/opendaylight/md-sal/sal-distributed-datastore/src/test/resources/module-shards-default-5-node.conf b/opendaylight/md-sal/sal-distributed-datastore/src/test/resources/module-shards-default-5-node.conf
new file mode 100644
index 0000000000..5aff081617
--- /dev/null
+++ b/opendaylight/md-sal/sal-distributed-datastore/src/test/resources/module-shards-default-5-node.conf
@@ -0,0 +1,17 @@
+module-shards = [
+    {
+        name = "default"
+        shards = [
+            {
+                name = "default",
+                replicas = [
+                    "member-1",
+                    "member-2",
+                    "member-3",
+                    "member-4",
+                    "member-5"
+                ]
+            }
+        ]
+    }
+]
-- 
2.36.6