BUG 5656: Entity ownership candidates not removed consistently on leadership change 16/37116/3
authorMoiz Raja <moraja@cisco.com>
Tue, 5 Apr 2016 01:27:17 +0000 (18:27 -0700)
committerTom Pantelis <tpanteli@brocade.com>
Wed, 6 Apr 2016 11:26:56 +0000 (11:26 +0000)
This patch removes the candidates of all downed members when entity ownership shard
leadership changes. This fixes a corner case where a leader and a follower are both killed
simultaneously in a cluster which has greater than 3 nodes. In that case the old leader
does not get a chance to remove the killed follower's candidates. The new leader does know
that the follower is down, however, so it can remove the candidates of all downed followers
on shard leadership change.

Change-Id: If28f5656e0daee40fb96a937dbd0a868b7d3645a
Signed-off-by: Moiz Raja <moraja@cisco.com>
opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/cluster/datastore/entityownership/EntityOwnershipShard.java
opendaylight/md-sal/sal-distributed-datastore/src/test/java/org/opendaylight/controller/cluster/datastore/entityownership/DistributedEntityOwnershipIntegrationTest.java
opendaylight/md-sal/sal-distributed-datastore/src/test/resources/application.conf
opendaylight/md-sal/sal-distributed-datastore/src/test/resources/module-shards-default-5-node.conf [new file with mode: 0644]
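
For context, a minimal sketch of the idea behind the fix. The names below
(DownPeerTracker, onPeerDown, onPeerUp) are illustrative assumptions, not the
shard's actual API: the point is that the new leader keeps a set of every
member it has seen go down and prunes candidates for all of them, not just for
the old leader.

    import java.util.HashSet;
    import java.util.Set;

    // Hypothetical model of down-peer tracking; the real EntityOwnershipShard
    // maintains an equivalent set (downPeerMemberNames, visible in the diff
    // below) from cluster membership events.
    class DownPeerTracker {
        private final Set<String> downPeerMemberNames = new HashSet<>();

        // Called when a cluster member is reported unreachable or removed.
        void onPeerDown(String memberName) {
            downPeerMemberNames.add(memberName);
        }

        // Called when a member (re)joins the cluster.
        void onPeerUp(String memberName) {
            downPeerMemberNames.remove(memberName);
        }

        // On a leadership change the new leader walks this set and removes the
        // candidates of each downed member, which covers the case where the
        // old leader and a follower died at the same time.
        Set<String> downPeers() {
            return downPeerMemberNames;
        }
    }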

index 76a6ac1e8f61beaf362b3851412e612a7cc4d0f8..0ca4980bbf0df4344d187b4b86f62a1a9e37126e 100644 (file)
@@ -279,15 +279,12 @@ class EntityOwnershipShard extends Shard {
             // This allows the strategies to be re-initialized with existing statistics maintained by
             // EntityOwnershipStatistics
             strategyConfig.clearStrategies();
-            // We were just elected leader. If the old leader is down, select new owners for the entities
-            // owned by the down leader.
 
 
-            String oldLeaderMemberName = peerIdToMemberNames.get(oldLeader);
-
-            LOG.debug("{}: oldLeaderMemberName: {}", persistenceId(), oldLeaderMemberName);
-
-            if(downPeerMemberNames.contains(oldLeaderMemberName)) {
-                removeCandidateFromEntities(oldLeaderMemberName);
+            // Remove the candidates of all members that are known to be down. In a cluster with more than
+            // 3 nodes it is possible for a node other than the leader to be down when leadership transitions,
+            // so it makes sense to use this event to remove the candidates of all downed nodes.
+            for(String downPeerName : downPeerMemberNames) {
+                removeCandidateFromEntities(downPeerName);
             }
         } else {
             // The leader changed - notify the coordinator to check if pending modifications need to be sent.
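
The hunk above replaces the old-leader-only check with a loop over all downed
peers. removeCandidateFromEntities itself is not shown in this diff; the
following is a rough, hypothetical model of the pruning it performs (the real
shard keeps candidates in its data tree and delegates owner selection to its
entity ownership election strategy rather than to the simple first-candidate
rule used here):

    import java.util.ArrayList;
    import java.util.LinkedHashMap;
    import java.util.List;
    import java.util.Map;

    class CandidatePruner {
        // entity id -> ordered candidate member names; the first entry stands
        // in for the current owner in this simplified model
        private final Map<String, List<String>> candidates = new LinkedHashMap<>();

        void registerCandidate(String entity, String member) {
            candidates.computeIfAbsent(entity, e -> new ArrayList<>()).add(member);
        }

        // Analogue of removeCandidateFromEntities(downPeerName): drop the downed
        // member from every entity's candidate list and, if it was the owner,
        // promote the next remaining candidate.
        void removeCandidateFromEntities(String downMember) {
            for (Map.Entry<String, List<String>> entry : candidates.entrySet()) {
                List<String> entityCandidates = entry.getValue();
                boolean wasOwner = !entityCandidates.isEmpty()
                        && entityCandidates.get(0).equals(downMember);
                entityCandidates.remove(downMember);
                if (wasOwner && !entityCandidates.isEmpty()) {
                    System.out.println(entry.getKey() + ": new owner " + entityCandidates.get(0));
                }
            }
        }
    }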
index d23a9faa69f6abc3f6b0e4be78a559c549e87396..e049238e3b6e7cb5738b2513191cba0b4fa376e0 100644 (file)
@@ -74,6 +74,7 @@ import org.opendaylight.yangtools.yang.model.api.SchemaContext;
  */
 public class DistributedEntityOwnershipIntegrationTest {
     private static final String MODULE_SHARDS_CONFIG = "module-shards-default.conf";
+    private static final String MODULE_SHARDS_5_NODE_CONFIG = "module-shards-default-5-node.conf";
     private static final String MODULE_SHARDS_MEMBER_1_CONFIG = "module-shards-default-member-1.conf";
     private static final String ENTITY_TYPE1 = "entityType1";
     private static final String ENTITY_TYPE2 = "entityType2";
@@ -318,7 +319,7 @@ public class DistributedEntityOwnershipIntegrationTest {
         leaderNode.cleanup();
         follower1Node.waitForMemberDown("member-1");
 
-        // Re-enable elections on folower1 so it becomes the leader
+        // Re-enable elections on follower1 so it becomes the leader
 
         ActorRef follower1Shard = IntegrationTestKit.findLocalShard(follower1Node.configDataStore().
                 getActorContext(), ENTITY_OWNERSHIP_SHARD_NAME);
 
@@ -336,6 +337,101 @@ public class DistributedEntityOwnershipIntegrationTest {
         verifyOwner(follower1Node.configDataStore(), ENTITY2, "member-3");
     }
 
+    @Test
+    public void testLeaderAndFollowerCandidatesRemovedAfterShutdown() throws Exception {
+        followerDatastoreContextBuilder.shardElectionTimeoutFactor(5).
+                customRaftPolicyImplementation(DisableElectionsRaftPolicy.class.getName());
+
+        String name = "test";
+        MemberNode leaderNode = MemberNode.builder(memberNodes).akkaConfig("Member1").testName(name).
+                moduleShardsConfig(MODULE_SHARDS_5_NODE_CONFIG).schemaContext(SCHEMA_CONTEXT).createOperDatastore(false).
+                datastoreContextBuilder(leaderDatastoreContextBuilder).build();
+
+        MemberNode follower1Node = MemberNode.builder(memberNodes).akkaConfig("Member2").testName(name).
+                moduleShardsConfig(MODULE_SHARDS_5_NODE_CONFIG).schemaContext(SCHEMA_CONTEXT).createOperDatastore(false).
+                datastoreContextBuilder(followerDatastoreContextBuilder).build();
+
+        MemberNode follower2Node = MemberNode.builder(memberNodes).akkaConfig("Member3").testName(name).
+                moduleShardsConfig(MODULE_SHARDS_5_NODE_CONFIG).schemaContext(SCHEMA_CONTEXT).createOperDatastore(false).
+                datastoreContextBuilder(followerDatastoreContextBuilder).build();
+
+        MemberNode follower3Node = MemberNode.builder(memberNodes).akkaConfig("Member4").testName(name).
+                moduleShardsConfig(MODULE_SHARDS_5_NODE_CONFIG).schemaContext(SCHEMA_CONTEXT).createOperDatastore(false).
+                datastoreContextBuilder(followerDatastoreContextBuilder).build();
+
+        MemberNode follower4Node = MemberNode.builder(memberNodes).akkaConfig("Member5").testName(name).
+                moduleShardsConfig(MODULE_SHARDS_5_NODE_CONFIG).schemaContext(SCHEMA_CONTEXT).createOperDatastore(false).
+                datastoreContextBuilder(followerDatastoreContextBuilder).build();
+
+        DistributedDataStore leaderDistributedDataStore = leaderNode.configDataStore();
+
+        leaderDistributedDataStore.waitTillReady();
+        follower1Node.configDataStore().waitTillReady();
+        follower2Node.configDataStore().waitTillReady();
+
+        EntityOwnershipService leaderEntityOwnershipService = newOwnershipService(leaderDistributedDataStore);
+        EntityOwnershipService follower1EntityOwnershipService = newOwnershipService(follower1Node.configDataStore());
+        EntityOwnershipService follower2EntityOwnershipService = newOwnershipService(follower2Node.configDataStore());
+        EntityOwnershipService follower3EntityOwnershipService = newOwnershipService(follower3Node.configDataStore());
+        EntityOwnershipService follower4EntityOwnershipService = newOwnershipService(follower4Node.configDataStore());
+
+        leaderNode.kit().waitUntilLeader(leaderNode.configDataStore().getActorContext(), ENTITY_OWNERSHIP_SHARD_NAME);
+
+        // Register follower1 candidate for entity1 and verify it becomes owner
+
+        follower1EntityOwnershipService.registerCandidate(ENTITY1);
+        verifyOwner(leaderDistributedDataStore, ENTITY1, "member-2");
+
+        // Register leader candidate for entity1
+
+        leaderEntityOwnershipService.registerCandidate(ENTITY1);
+        verifyCandidates(leaderDistributedDataStore, ENTITY1, "member-2", "member-1");
+        verifyOwner(leaderDistributedDataStore, ENTITY1, "member-2");
+
+        // Register leader candidate for entity2 and verify it becomes owner
+
+        leaderEntityOwnershipService.registerCandidate(ENTITY2);
+        verifyOwner(leaderDistributedDataStore, ENTITY2, "member-1");
+
+        // Register follower2 candidate for entity2
+
+        follower2EntityOwnershipService.registerCandidate(ENTITY2);
+        verifyCandidates(leaderDistributedDataStore, ENTITY2, "member-1", "member-3");
+        verifyOwner(leaderDistributedDataStore, ENTITY2, "member-1");
+
+        // Register follower3 as a candidate for entity2 as well
+
+        follower3EntityOwnershipService.registerCandidate(ENTITY2);
+        verifyCandidates(leaderDistributedDataStore, ENTITY2, "member-1", "member-3", "member-4");
+        verifyOwner(leaderDistributedDataStore, ENTITY2, "member-1");
+
+        // Shutdown the leader and follower3; their candidates should be removed once a new leader is elected
+
+        leaderNode.cleanup();
+        follower3Node.cleanup();
+
+        follower1Node.waitForMemberDown("member-1");
+        follower1Node.waitForMemberDown("member-4");
+
+        // Re-enable elections on follower1 so it becomes the leader
+
+        ActorRef follower1Shard = IntegrationTestKit.findLocalShard(follower1Node.configDataStore().
+                getActorContext(), ENTITY_OWNERSHIP_SHARD_NAME);
+        follower1Shard.tell(DatastoreContext.newBuilderFrom(followerDatastoreContextBuilder.build()).
+                customRaftPolicyImplementation(null).build(), ActorRef.noSender());
+
+        MemberNode.verifyRaftState(follower1Node.configDataStore(), ENTITY_OWNERSHIP_SHARD_NAME,
+                raftState -> assertEquals("Raft state", RaftState.Leader.toString(), raftState.getRaftState()));
+
+        // Verify that the prior leader's and follower3's candidates are removed
+
+        verifyCandidates(follower1Node.configDataStore(), ENTITY1, "member-2");
+        verifyCandidates(follower1Node.configDataStore(), ENTITY2, "member-3");
+        verifyOwner(follower1Node.configDataStore(), ENTITY1, "member-2");
+        verifyOwner(follower1Node.configDataStore(), ENTITY2, "member-3");
+    }
+
     /**
      * Reproduces bug <a href="https://bugs.opendaylight.org/show_bug.cgi?id=4554">4554</a>
      *
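
The verifyCandidates and verifyOwner assertions used throughout the new test
have to tolerate the asynchronous shard: ownership state converges some time
after the triggering event. A minimal sketch of that polling style, assuming an
abstract reader in place of the real data-tree lookup (assertEventually and its
parameters are illustrative, not the test kit's actual API):

    import java.util.List;
    import java.util.function.Supplier;

    final class EventualAssert {
        // Retries the read until it matches the expectation or the timeout expires.
        static void assertEventually(Supplier<List<String>> reader, List<String> expected,
                long timeoutMillis) throws InterruptedException {
            long deadline = System.currentTimeMillis() + timeoutMillis;
            List<String> actual = reader.get();
            while (!expected.equals(actual)) {
                if (System.currentTimeMillis() > deadline) {
                    throw new AssertionError("Expected " + expected + " but got " + actual);
                }
                Thread.sleep(50);
                actual = reader.get();
            }
        }
    }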
index 42fcd0ba40feac65daae0a2df2c8289c7a0f4b0a..9db9900577c7d5f53e22b6cb6ae90cac5256c9df 100644 (file)
@@ -197,3 +197,113 @@ Member3 {
     }
   }
 }
+
+Member4 {
+  bounded-mailbox {
+    mailbox-type = "org.opendaylight.controller.cluster.common.actor.MeteredBoundedMailbox"
+    mailbox-capacity = 1000
+    mailbox-push-timeout-time = 100ms
+  }
+
+  in-memory-journal {
+    class = "org.opendaylight.controller.cluster.raft.utils.InMemoryJournal"
+  }
+
+  in-memory-snapshot-store {
+    class = "org.opendaylight.controller.cluster.raft.utils.InMemorySnapshotStore"
+    plugin-dispatcher = "akka.persistence.dispatchers.default-plugin-dispatcher"
+  }
+
+  akka {
+    persistence.snapshot-store.plugin = "in-memory-snapshot-store"
+    persistence.journal.plugin = "in-memory-journal"
+
+    loglevel = "INFO"
+
+    actor {
+      provider = "akka.cluster.ClusterActorRefProvider"
+
+      serializers {
+          java = "akka.serialization.JavaSerializer"
+          proto = "akka.remote.serialization.ProtobufSerializer"
+          readylocal = "org.opendaylight.controller.cluster.datastore.messages.ReadyLocalTransactionSerializer"
+      }
+
+      serialization-bindings {
+          "com.google.protobuf.Message" = proto
+          "org.opendaylight.controller.cluster.datastore.messages.ReadyLocalTransaction" = readylocal
+      }
+    }
+    remote {
+      log-remote-lifecycle-events = off
+      netty.tcp {
+        hostname = "127.0.0.1"
+        port = 2560
+      }
+    }
+
+    cluster {
+      auto-down-unreachable-after = 100s
+      retry-unsuccessful-join-after = 100ms
+
+      roles = [
+        "member-4"
+      ]
+    }
+  }
+}
+
+Member5 {
+  bounded-mailbox {
+    mailbox-type = "org.opendaylight.controller.cluster.common.actor.MeteredBoundedMailbox"
+    mailbox-capacity = 1000
+    mailbox-push-timeout-time = 100ms
+  }
+
+  in-memory-journal {
+    class = "org.opendaylight.controller.cluster.raft.utils.InMemoryJournal"
+  }
+
+  in-memory-snapshot-store {
+    class = "org.opendaylight.controller.cluster.raft.utils.InMemorySnapshotStore"
+    plugin-dispatcher = "akka.persistence.dispatchers.default-plugin-dispatcher"
+  }
+
+  akka {
+    persistence.snapshot-store.plugin = "in-memory-snapshot-store"
+    persistence.journal.plugin = "in-memory-journal"
+
+    loglevel = "INFO"
+
+    actor {
+      provider = "akka.cluster.ClusterActorRefProvider"
+
+      serializers {
+          java = "akka.serialization.JavaSerializer"
+          proto = "akka.remote.serialization.ProtobufSerializer"
+          readylocal = "org.opendaylight.controller.cluster.datastore.messages.ReadyLocalTransactionSerializer"
+      }
+
+      serialization-bindings {
+          "com.google.protobuf.Message" = proto
+          "org.opendaylight.controller.cluster.datastore.messages.ReadyLocalTransaction" = readylocal
+      }
+    }
+    remote {
+      log-remote-lifecycle-events = off
+      netty.tcp {
+        hostname = "127.0.0.1"
+        port = 2561
+      }
+    }
+
+    cluster {
+      auto-down-unreachable-after = 100s
+      retry-unsuccessful-join-after = 100ms
+
+      roles = [
+        "member-5"
+      ]
+    }
+  }
+}
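
Each MemberN block above gives one simulated cluster member its own in-memory
persistence plugins, a unique netty.tcp port and a unique cluster role. A
sketch of how such a block might be turned into an ActorSystem, assuming the
Typesafe Config API (the test's MemberNode.akkaConfig("Member4") presumably
resolves the block in a similar way; MemberSystemFactory is illustrative):

    import com.typesafe.config.Config;
    import com.typesafe.config.ConfigFactory;

    import akka.actor.ActorSystem;

    class MemberSystemFactory {
        // Loads application.conf from the classpath and selects e.g. "Member4".
        static ActorSystem newMemberSystem(String memberBlock) {
            Config config = ConfigFactory.load().getConfig(memberBlock);
            return ActorSystem.create("cluster-test", config);
        }
    }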
diff --git a/opendaylight/md-sal/sal-distributed-datastore/src/test/resources/module-shards-default-5-node.conf b/opendaylight/md-sal/sal-distributed-datastore/src/test/resources/module-shards-default-5-node.conf
new file mode 100644 (file)
index 0000000..5aff081
--- /dev/null
@@ -0,0 +1,17 @@
+module-shards = [
+    {
+        name = "default"
+        shards = [
+            {
+                name="default",
+                replicas = [
+                    "member-1",
+                    "member-2",
+                    "member-3",
+                    "member-4",
+                    "member-5"
+                ]
+            }
+        ]
+    }
+]
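
The jump from 3 to 5 replicas is what makes the new test possible: a Raft
majority is floor(n/2) + 1, so a 5-node shard can lose two members at once (the
leader plus follower3 in the test) and still elect a new leader from the three
that remain. A trivial sketch of that arithmetic:

    class Quorum {
        // Raft majority: more than half of the configured replicas.
        static int majority(int replicas) {
            return replicas / 2 + 1;
        }

        public static void main(String[] args) {
            // 5 replicas -> majority of 3, so losing member-1 and member-4
            // simultaneously still leaves a quorum.
            System.out.println(majority(5));
        }
    }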