Bug 4992: Removed old leader's candidates on leader change
[controller.git] / opendaylight / md-sal / sal-distributed-datastore / src / main / java / org / opendaylight / controller / cluster / datastore / entityownership / EntityOwnershipShard.java
index 20a69022db721dd6c1e0398c1af650b936363603..74cc6717dcb44b330d38e211a899690a39b0e316 100644 (file)
@@ -7,39 +7,64 @@
  */
 package org.opendaylight.controller.cluster.datastore.entityownership;
 
+import static org.opendaylight.controller.cluster.datastore.entityownership.EntityOwnersModel.CANDIDATE_NODE_ID;
+import static org.opendaylight.controller.cluster.datastore.entityownership.EntityOwnersModel.ENTITY_ID_NODE_ID;
+import static org.opendaylight.controller.cluster.datastore.entityownership.EntityOwnersModel.ENTITY_ID_QNAME;
+import static org.opendaylight.controller.cluster.datastore.entityownership.EntityOwnersModel.ENTITY_NODE_ID;
+import static org.opendaylight.controller.cluster.datastore.entityownership.EntityOwnersModel.ENTITY_OWNERS_PATH;
+import static org.opendaylight.controller.cluster.datastore.entityownership.EntityOwnersModel.ENTITY_OWNER_NODE_ID;
+import static org.opendaylight.controller.cluster.datastore.entityownership.EntityOwnersModel.ENTITY_OWNER_QNAME;
+import static org.opendaylight.controller.cluster.datastore.entityownership.EntityOwnersModel.ENTITY_TYPES_PATH;
+import static org.opendaylight.controller.cluster.datastore.entityownership.EntityOwnersModel.ENTITY_TYPE_NODE_ID;
+import static org.opendaylight.controller.cluster.datastore.entityownership.EntityOwnersModel.ENTITY_TYPE_QNAME;
+import static org.opendaylight.controller.cluster.datastore.entityownership.EntityOwnersModel.candidateMapEntry;
+import static org.opendaylight.controller.cluster.datastore.entityownership.EntityOwnersModel.candidateNodeKey;
+import static org.opendaylight.controller.cluster.datastore.entityownership.EntityOwnersModel.candidatePath;
+import static org.opendaylight.controller.cluster.datastore.entityownership.EntityOwnersModel.createEntity;
+import static org.opendaylight.controller.cluster.datastore.entityownership.EntityOwnersModel.entityOwnersWithCandidate;
+import akka.actor.ActorRef;
 import akka.actor.ActorSelection;
-import akka.actor.Props;
-import akka.dispatch.OnComplete;
-import akka.pattern.AskTimeoutException;
+import akka.actor.Cancellable;
 import akka.pattern.Patterns;
+import com.google.common.base.Optional;
+import com.google.common.base.Preconditions;
+import com.google.common.base.Strings;
 import java.util.ArrayList;
-import java.util.List;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Map;
+import java.util.Set;
 import java.util.concurrent.TimeUnit;
-import org.opendaylight.controller.cluster.datastore.DataStoreVersions;
 import org.opendaylight.controller.cluster.datastore.DatastoreContext;
 import org.opendaylight.controller.cluster.datastore.Shard;
+import org.opendaylight.controller.cluster.datastore.entityownership.messages.CandidateAdded;
+import org.opendaylight.controller.cluster.datastore.entityownership.messages.CandidateRemoved;
 import org.opendaylight.controller.cluster.datastore.entityownership.messages.RegisterCandidateLocal;
+import org.opendaylight.controller.cluster.datastore.entityownership.messages.RegisterListenerLocal;
+import org.opendaylight.controller.cluster.datastore.entityownership.messages.SelectOwner;
 import org.opendaylight.controller.cluster.datastore.entityownership.messages.UnregisterCandidateLocal;
+import org.opendaylight.controller.cluster.datastore.entityownership.messages.UnregisterListenerLocal;
+import org.opendaylight.controller.cluster.datastore.entityownership.selectionstrategy.EntityOwnerSelectionStrategy;
+import org.opendaylight.controller.cluster.datastore.entityownership.selectionstrategy.EntityOwnerSelectionStrategyConfig;
 import org.opendaylight.controller.cluster.datastore.identifiers.ShardIdentifier;
-import org.opendaylight.controller.cluster.datastore.identifiers.TransactionIdentifier;
 import org.opendaylight.controller.cluster.datastore.messages.BatchedModifications;
+import org.opendaylight.controller.cluster.datastore.messages.PeerDown;
+import org.opendaylight.controller.cluster.datastore.messages.PeerUp;
 import org.opendaylight.controller.cluster.datastore.messages.SuccessReply;
+import org.opendaylight.controller.cluster.datastore.modification.DeleteModification;
 import org.opendaylight.controller.cluster.datastore.modification.MergeModification;
+import org.opendaylight.controller.cluster.datastore.modification.WriteModification;
 import org.opendaylight.controller.md.sal.common.api.clustering.Entity;
-import org.opendaylight.yang.gen.v1.urn.opendaylight.params.xml.ns.yang.controller.md.sal.clustering.entity.owners.rev150804.EntityOwners;
-import org.opendaylight.yang.gen.v1.urn.opendaylight.params.xml.ns.yang.controller.md.sal.clustering.entity.owners.rev150804.entity.owners.EntityType;
-import org.opendaylight.yang.gen.v1.urn.opendaylight.params.xml.ns.yang.controller.md.sal.clustering.entity.owners.rev150804.entity.owners.entity.type.entity.Candidate;
-import org.opendaylight.yangtools.yang.common.QName;
 import org.opendaylight.yangtools.yang.data.api.YangInstanceIdentifier;
-import org.opendaylight.yangtools.yang.data.api.YangInstanceIdentifier.NodeIdentifier;
+import org.opendaylight.yangtools.yang.data.api.YangInstanceIdentifier.PathArgument;
+import org.opendaylight.yangtools.yang.data.api.schema.DataContainerChild;
 import org.opendaylight.yangtools.yang.data.api.schema.MapEntryNode;
 import org.opendaylight.yangtools.yang.data.api.schema.MapNode;
 import org.opendaylight.yangtools.yang.data.api.schema.NormalizedNode;
 import org.opendaylight.yangtools.yang.data.impl.schema.ImmutableNodes;
-import org.opendaylight.yangtools.yang.data.impl.schema.builder.impl.ImmutableContainerNodeBuilder;
-import org.opendaylight.yangtools.yang.model.api.SchemaContext;
 import scala.concurrent.Future;
+import scala.concurrent.duration.FiniteDuration;
 
 /**
  * Special Shard for EntityOwnership.
@@ -47,25 +72,32 @@ import scala.concurrent.Future;
  * @author Thomas Pantelis
  */
 class EntityOwnershipShard extends Shard {
-    static final YangInstanceIdentifier ENTITY_OWNERS_PATH = YangInstanceIdentifier.of(EntityOwners.QNAME);
-    static final  QName ENTITY_QNAME = org.opendaylight.yang.gen.v1.urn.opendaylight.params.xml.ns.yang.controller.
-            md.sal.clustering.entity.owners.rev150804.entity.owners.entity.type.Entity.QNAME;
-    static final QName CANDIDATE_NAME = QName.create(Candidate.QNAME, "name");
-    static final QName ENTITY_ID = QName.create(ENTITY_QNAME, "id");
-    static final QName ENTITY_TYPE = QName.create(EntityType.QNAME, "type");
-
-    private int transactionIDCounter = 0;
     private final String localMemberName;
-    private final List<BatchedModifications> retryModifications = new ArrayList<>();
+    private final EntityOwnershipShardCommitCoordinator commitCoordinator;
+    private final EntityOwnershipListenerSupport listenerSupport;
+    private final Set<String> downPeerMemberNames = new HashSet<>();
+    private final Map<String, String> peerIdToMemberNames = new HashMap<>();
+    private final EntityOwnerSelectionStrategyConfig strategyConfig;
+    private final Map<YangInstanceIdentifier, Cancellable> entityToScheduledOwnershipTask = new HashMap<>();
+    private final EntityOwnershipStatistics entityOwnershipStatistics;
 
     private static DatastoreContext noPersistenceDatastoreContext(DatastoreContext datastoreContext) {
         return DatastoreContext.newBuilderFrom(datastoreContext).persistent(false).build();
     }
 
-    protected EntityOwnershipShard(ShardIdentifier name, Map<String, String> peerAddresses,
-            DatastoreContext datastoreContext, SchemaContext schemaContext, String localMemberName) {
-        super(name, peerAddresses, noPersistenceDatastoreContext(datastoreContext), schemaContext);
-        this.localMemberName = localMemberName;
+    protected EntityOwnershipShard(Builder builder) {
+        super(builder);
+        this.localMemberName = builder.localMemberName;
+        this.commitCoordinator = new EntityOwnershipShardCommitCoordinator(builder.localMemberName, LOG);
+        this.listenerSupport = new EntityOwnershipListenerSupport(getContext(), persistenceId());
+        this.strategyConfig = builder.ownerSelectionStrategyConfig;
+        this.entityOwnershipStatistics = new EntityOwnershipStatistics();
+        this.entityOwnershipStatistics.init(getDataStore());
+
+        for(String peerId: getRaftActorContext().getPeerIds()) {
+            ShardIdentifier shardId = ShardIdentifier.builder().fromShardIdString(peerId).build();
+            peerIdToMemberNames.put(peerId, shardId.getMemberName());
+        }
     }
 
     @Override
@@ -73,139 +105,412 @@ class EntityOwnershipShard extends Shard {
         super.onDatastoreContext(noPersistenceDatastoreContext(context));
     }
 
+    @Override
+    protected void onRecoveryComplete() {
+        super.onRecoveryComplete();
+
+        new CandidateListChangeListener(getSelf(), persistenceId()).init(getDataStore());
+        new EntityOwnerChangeListener(localMemberName, listenerSupport).init(getDataStore());
+    }
+
     @Override
     public void onReceiveCommand(final Object message) throws Exception {
         if(message instanceof RegisterCandidateLocal) {
-            onRegisterCandidateLocal((RegisterCandidateLocal)message);
+            onRegisterCandidateLocal((RegisterCandidateLocal) message);
         } else if(message instanceof UnregisterCandidateLocal) {
             onUnregisterCandidateLocal((UnregisterCandidateLocal)message);
-        } else {
+        } else if(message instanceof CandidateAdded){
+            onCandidateAdded((CandidateAdded) message);
+        } else if(message instanceof CandidateRemoved){
+            onCandidateRemoved((CandidateRemoved) message);
+        } else if(message instanceof PeerDown) {
+            onPeerDown((PeerDown) message);
+        } else if(message instanceof PeerUp) {
+            onPeerUp((PeerUp) message);
+        } else if(message instanceof RegisterListenerLocal) {
+            onRegisterListenerLocal((RegisterListenerLocal)message);
+        } else if(message instanceof UnregisterListenerLocal) {
+            onUnregisterListenerLocal((UnregisterListenerLocal) message);
+        } else if(message instanceof SelectOwner) {
+            onSelectOwner((SelectOwner) message);
+        } else if(!commitCoordinator.handleMessage(message, this)) {
             super.onReceiveCommand(message);
         }
     }
 
-    private void onRegisterCandidateLocal(RegisterCandidateLocal registerCandidate) {
-        LOG.debug("onRegisterCandidateLocal: {}", registerCandidate);
+    private void onSelectOwner(SelectOwner selectOwner) {
+        String currentOwner = getCurrentOwner(selectOwner.getEntityPath());
+        if(Strings.isNullOrEmpty(currentOwner)) {
+            String entityType = EntityOwnersModel.entityTypeFromEntityPath(selectOwner.getEntityPath());
+            writeNewOwner(selectOwner.getEntityPath(), newOwner(selectOwner.getAllCandidates(),
+                    entityOwnershipStatistics.byEntityType(entityType),
+                    selectOwner.getOwnerSelectionStrategy()));
 
-        // TODO - add the listener locally.
+            Cancellable cancellable = entityToScheduledOwnershipTask.get(selectOwner.getEntityPath());
+            if(cancellable != null){
+                if(!cancellable.isCancelled()){
+                    cancellable.cancel();
+                }
+                entityToScheduledOwnershipTask.remove(selectOwner.getEntityPath());
+            }
+        }
+    }
 
-        BatchedModifications modifications = new BatchedModifications(
-                TransactionIdentifier.create(localMemberName, ++transactionIDCounter).toString(),
-                DataStoreVersions.CURRENT_VERSION, "");
-        modifications.setDoCommitOnReady(true);
-        modifications.setReady(true);
-        modifications.setTotalMessagesSent(1);
+    private void onRegisterCandidateLocal(RegisterCandidateLocal registerCandidate) {
+        LOG.debug("{}: onRegisterCandidateLocal: {}", persistenceId(), registerCandidate);
 
-        NormalizedNode<?, ?> entityOwners = createEntityOwnersWithCandidate(registerCandidate.getEntity(), localMemberName);
-        modifications.addModification(new MergeModification(ENTITY_OWNERS_PATH, entityOwners));
+        listenerSupport.setHasCandidateForEntity(registerCandidate.getEntity());
 
-        tryCommitModifications(modifications);
+        NormalizedNode<?, ?> entityOwners = entityOwnersWithCandidate(registerCandidate.getEntity().getType(),
+                registerCandidate.getEntity().getId(), localMemberName);
+        commitCoordinator.commitModification(new MergeModification(ENTITY_OWNERS_PATH, entityOwners), this);
 
         getSender().tell(SuccessReply.INSTANCE, getSelf());
     }
 
-    private NormalizedNode<?, ?> createEntityOwnersWithCandidate(Entity entity, String memberName) {
-        MapNode candidateNode = ImmutableNodes.mapNodeBuilder(Candidate.QNAME).addChild(
-                        ImmutableNodes.mapEntry(Candidate.QNAME, CANDIDATE_NAME, memberName)).build();
+    private void onUnregisterCandidateLocal(UnregisterCandidateLocal unregisterCandidate) {
+        LOG.debug("{}: onUnregisterCandidateLocal: {}", persistenceId(), unregisterCandidate);
 
-        MapEntryNode entityNode = ImmutableNodes.mapEntryBuilder(ENTITY_QNAME, ENTITY_ID, entity.getId()).
-                addChild(candidateNode).build();
+        Entity entity = unregisterCandidate.getEntity();
+        listenerSupport.unsetHasCandidateForEntity(entity);
 
-        MapEntryNode entityTypeNode = ImmutableNodes.mapEntryBuilder(EntityType.QNAME, ENTITY_TYPE, entity.getType()).
-                addChild(ImmutableNodes.mapNodeBuilder(ENTITY_QNAME).addChild(entityNode).build()).build();
+        YangInstanceIdentifier candidatePath = candidatePath(entity.getType(), entity.getId(), localMemberName);
+        commitCoordinator.commitModification(new DeleteModification(candidatePath), this);
 
-        return ImmutableContainerNodeBuilder.create().withNodeIdentifier(new NodeIdentifier(EntityOwners.QNAME)).
-                addChild(ImmutableNodes.mapNodeBuilder(EntityType.QNAME).addChild(entityTypeNode).build()).build();
+        getSender().tell(SuccessReply.INSTANCE, getSelf());
     }
 
-    private void tryCommitModifications(final BatchedModifications modifications) {
-        if(isLeader()) {
-            if(isIsolatedLeader()) {
-                LOG.debug("Leader is isolated - adding BatchedModifications {} for retry", modifications.getTransactionID());
+    private void onRegisterListenerLocal(final RegisterListenerLocal registerListener) {
+        LOG.debug("{}: onRegisterListenerLocal: {}", persistenceId(), registerListener);
 
-                retryModifications.add(modifications);
-            } else {
-                LOG.debug("Committing BatchedModifications {} locally", modifications.getTransactionID());
+        listenerSupport.addEntityOwnershipListener(registerListener.getEntityType(), registerListener.getListener());
+
+        getSender().tell(SuccessReply.INSTANCE, getSelf());
 
-                // Note that it's possible the commit won't get consensus and will timeout and not be applied
-                // to the state. However we don't need to retry it in that case b/c it will be committed to
-                // the journal first and, once a majority of followers come back on line and it is replicated,
-                // it will be applied at that point.
-                handleBatchedModificationsLocal(modifications, self());
+        searchForEntitiesOwnedBy(localMemberName, new EntityWalker() {
+            @Override
+            public void onEntity(MapEntryNode entityTypeNode, MapEntryNode entityNode) {
+                Optional<DataContainerChild<? extends PathArgument, ?>> possibleType =
+                        entityTypeNode.getChild(ENTITY_TYPE_NODE_ID);
+                String entityType = possibleType.isPresent() ? possibleType.get().getValue().toString() : null;
+                if (registerListener.getEntityType().equals(entityType)) {
+                    Entity entity = new Entity(entityType,
+                            (YangInstanceIdentifier) entityNode.getChild(ENTITY_ID_NODE_ID).get().getValue());
+                    listenerSupport.notifyEntityOwnershipListener(entity, false, true, true, registerListener.getListener());
+                }
             }
+        });
+    }
+
+    private void onUnregisterListenerLocal(UnregisterListenerLocal unregisterListener) {
+        LOG.debug("{}: onUnregisterListenerLocal: {}", persistenceId(), unregisterListener);
+
+        listenerSupport.removeEntityOwnershipListener(unregisterListener.getEntityType(), unregisterListener.getListener());
+
+        getSender().tell(SuccessReply.INSTANCE, getSelf());
+    }
+
+    void tryCommitModifications(final BatchedModifications modifications) {
+        if(isLeader()) {
+            LOG.debug("{}: Committing BatchedModifications {} locally", persistenceId(), modifications.getTransactionID());
+
+            // Note that it's possible the commit won't get consensus and will timeout and not be applied
+            // to the state. However we don't need to retry it in that case b/c it will be committed to
+            // the journal first and, once a majority of followers come back on line and it is replicated,
+            // it will be applied at that point.
+            handleBatchedModificationsLocal(modifications, self());
         } else {
             final ActorSelection leader = getLeader();
             if (leader != null) {
-                LOG.debug("Sending BatchedModifications {} to leader {}", modifications.getTransactionID(), leader);
+                if(LOG.isDebugEnabled()) {
+                    LOG.debug("{}: Sending BatchedModifications {} to leader {}", persistenceId(),
+                            modifications.getTransactionID(), leader);
+                }
 
                 Future<Object> future = Patterns.ask(leader, modifications, TimeUnit.SECONDS.toMillis(
                         getDatastoreContext().getShardTransactionCommitTimeoutInSeconds()));
-                future.onComplete(new OnComplete<Object>() {
-                    @Override
-                    public void onComplete(Throwable failure, Object response) {
-                        if(failure != null) {
-                            if(failure instanceof AskTimeoutException) {
-                                LOG.debug("BatchedModifications {} to leader {} timed out - retrying",
-                                        modifications.getTransactionID(), leader);
-                                tryCommitModifications(modifications);
-                            } else {
-                                LOG.error("BatchedModifications {} to leader {} failed",
-                                        modifications.getTransactionID(), leader, failure);
-                            }
-                        } else {
-                            LOG.debug("BatchedModifications {} to leader {} succeeded",
-                                    modifications.getTransactionID(), leader);
-                        }
-                    }
-                }, getContext().dispatcher());
-            } else {
-                LOG.debug("No leader - adding BatchedModifications {} for retry", modifications.getTransactionID());
 
-                retryModifications.add(modifications);
+                Patterns.pipe(future, getContext().dispatcher()).pipeTo(getSelf(), ActorRef.noSender());
             }
         }
     }
 
+    boolean hasLeader() {
+        return getLeader() != null && !isIsolatedLeader();
+    }
+
     @Override
     protected void onStateChanged() {
         super.onStateChanged();
 
-        if(!retryModifications.isEmpty() && getLeader() != null && !isIsolatedLeader()) {
-            LOG.debug("# BatchedModifications to retry {}", retryModifications.size());
+        boolean isLeader = isLeader();
+        if(LOG.isDebugEnabled()) {
+            LOG.debug("{}: onStateChanged: isLeader: {}, hasLeader: {}", persistenceId(), isLeader, hasLeader());
+        }
+
+        commitCoordinator.onStateChanged(this, isLeader);
+    }
+
+    @Override
+    protected void onLeaderChanged(String oldLeader, String newLeader) {
+        super.onLeaderChanged(oldLeader, newLeader);
+
+        boolean isLeader = isLeader();
+        LOG.debug("{}: onLeaderChanged: oldLeader: {}, newLeader: {}, isLeader: {}", persistenceId(), oldLeader,
+                newLeader, isLeader);
 
-            List<BatchedModifications> retryModificationsCopy = new ArrayList<>(retryModifications);
-            retryModifications.clear();
-            for(BatchedModifications mods: retryModificationsCopy) {
-                tryCommitModifications(mods);
+        if(isLeader) {
+            // We were just elected leader. If the old leader is down, remove it as a candidate from all
+            // entities so that new owners can be selected for the entities it owned.
+
+            String oldLeaderMemberName = peerIdToMemberNames.get(oldLeader);
+
+            LOG.debug("{}: oldLeaderMemberName: {}", persistenceId(), oldLeaderMemberName);
+
+            if(downPeerMemberNames.contains(oldLeaderMemberName)) {
+                removeCandidateFromEntities(oldLeaderMemberName);
             }
+        } else {
+            // The leader changed - notify the coordinator to check if pending modifications need to be sent.
+            // While onStateChanged also does this, this method handles the case where the shard hears from a
+            // leader and stays in the follower state. In that case no behavior state change occurs.
+            commitCoordinator.onStateChanged(this, isLeader);
         }
     }
 
-    private void onUnregisterCandidateLocal(UnregisterCandidateLocal unregisterCandidate) {
-        // TODO - implement
-        getSender().tell(SuccessReply.INSTANCE, getSelf());
+    private void onCandidateRemoved(CandidateRemoved message) {
+        LOG.debug("{}: onCandidateRemoved: {}", persistenceId(), message);
+
+        if(isLeader()) {
+            String currentOwner = getCurrentOwner(message.getEntityPath());
+            if(message.getRemovedCandidate().equals(currentOwner) || message.getRemainingCandidates().size() == 0){
+                String entityType = EntityOwnersModel.entityTypeFromEntityPath(message.getEntityPath());
+                writeNewOwner(message.getEntityPath(),
+                        newOwner(message.getRemainingCandidates(), entityOwnershipStatistics.byEntityType(entityType),
+                                getEntityOwnerElectionStrategy(message.getEntityPath())));
+            }
+        } else {
+            // We're not the leader. If the removed candidate is our local member then check if we actually
+            // have a local candidate registered. If we do then we must have been partitioned from the leader
+            // and the leader removed our candidate since the leader can't tell the difference between a
+            // temporary network partition and a node's process actually being restarted. So, in that case,
+            // re-add our candidate.
+            if(localMemberName.equals(message.getRemovedCandidate()) &&
+                    listenerSupport.hasCandidateForEntity(createEntity(message.getEntityPath()))) {
+                LOG.debug("Local candidate member was removed but a local candidate is registered for {}" +
+                    " - adding back local candidate", message.getEntityPath());
+
+                commitCoordinator.commitModification(new MergeModification(
+                        candidatePath(message.getEntityPath(), localMemberName),
+                        candidateMapEntry(localMemberName)), this);
+            }
+        }
     }
 
-    public static Props props(final ShardIdentifier name, final Map<String, String> peerAddresses,
-            final DatastoreContext datastoreContext, final SchemaContext schemaContext, final String localMemberName) {
-        return Props.create(new Creator(name, peerAddresses, datastoreContext, schemaContext, localMemberName));
+    private EntityOwnerSelectionStrategy getEntityOwnerElectionStrategy(YangInstanceIdentifier entityPath) {
+        final String entityType = EntityOwnersModel.entityTypeFromEntityPath(entityPath);
+        return strategyConfig.createStrategy(entityType);
     }
 
-    private static class Creator extends AbstractShardCreator {
-        private static final long serialVersionUID = 1L;
+    private void onCandidateAdded(CandidateAdded message) {
+        if(!isLeader()){
+            return;
+        }
+
+        LOG.debug("{}: onCandidateAdded: {}", persistenceId(), message);
+
+        // Since a node's candidate member is only added by the node itself, we can assume the node is up so
+        // remove it from the downPeerMemberNames.
+        downPeerMemberNames.remove(message.getNewCandidate());
+
+        String currentOwner = getCurrentOwner(message.getEntityPath());
+        EntityOwnerSelectionStrategy strategy = getEntityOwnerElectionStrategy(message.getEntityPath());
+        if(Strings.isNullOrEmpty(currentOwner)){
+            if(strategy.getSelectionDelayInMillis() == 0L) {
+                String entityType = EntityOwnersModel.entityTypeFromEntityPath(message.getEntityPath());
+                writeNewOwner(message.getEntityPath(), newOwner(message.getAllCandidates(),
+                        entityOwnershipStatistics.byEntityType(entityType), strategy));
+            } else {
+                scheduleOwnerSelection(message.getEntityPath(), message.getAllCandidates(), strategy);
+            }
+        }
+    }
 
-        private final String localMemberName;
+    private void onPeerDown(PeerDown peerDown) {
+        LOG.info("{}: onPeerDown: {}", persistenceId(), peerDown);
 
-        Creator(final ShardIdentifier name, final Map<String, String> peerAddresses,
-                final DatastoreContext datastoreContext, final SchemaContext schemaContext,
-                final String localMemberName) {
-            super(name, peerAddresses, datastoreContext, schemaContext);
+        String downMemberName = peerDown.getMemberName();
+        if(downPeerMemberNames.add(downMemberName) && isLeader()) {
+            // Remove the down peer as a candidate from all entities.
+            removeCandidateFromEntities(downMemberName);
+        }
+    }
+
+    private void onPeerUp(PeerUp peerUp) {
+        LOG.debug("{}: onPeerUp: {}", persistenceId(), peerUp);
+
+        peerIdToMemberNames.put(peerUp.getPeerId(), peerUp.getMemberName());
+        downPeerMemberNames.remove(peerUp.getMemberName());
+
+        // Notify the coordinator to check if pending modifications need to be sent. We do this here
+        // to handle the case where the leader's peer address isn't known yet when a prior state or
+        // leader change occurred.
+        commitCoordinator.onStateChanged(this, isLeader());
+    }
+
+    private void removeCandidateFromEntities(final String owner) {
+        final BatchedModifications modifications = commitCoordinator.newBatchedModifications();
+        searchForEntities(new EntityWalker() {
+            @Override
+            public void onEntity(MapEntryNode entityTypeNode, MapEntryNode entityNode) {
+                if (hasCandidate(entityNode, owner)) {
+                    YangInstanceIdentifier entityId =
+                            (YangInstanceIdentifier) entityNode.getIdentifier().getKeyValues().get(ENTITY_ID_QNAME);
+                    YangInstanceIdentifier candidatePath = candidatePath(
+                            entityTypeNode.getIdentifier().getKeyValues().get(ENTITY_TYPE_QNAME).toString(),
+                            entityId, owner);
+
+                    LOG.info("{}: Found entity {}, removing candidate {}, path {}", persistenceId(), entityId,
+                            owner, candidatePath);
+
+                    modifications.addModification(new DeleteModification(candidatePath));
+                }
+            }
+        });
+
+        commitCoordinator.commitModifications(modifications, this);
+    }
+
+    private static boolean hasCandidate(MapEntryNode entity, String candidateName) {
+        return ((MapNode)entity.getChild(CANDIDATE_NODE_ID).get()).getChild(candidateNodeKey(candidateName)).isPresent();
+    }
+
+    private void searchForEntitiesOwnedBy(final String owner, final EntityWalker walker) {
+        Optional<NormalizedNode<?, ?>> possibleEntityTypes = getDataStore().readNode(ENTITY_TYPES_PATH);
+        if(!possibleEntityTypes.isPresent()) {
+            return;
+        }
+
+        LOG.debug("{}: Searching for entities owned by {}", persistenceId(), owner);
+
+        searchForEntities(new EntityWalker() {
+            @Override
+            public void onEntity(MapEntryNode entityTypeNode, MapEntryNode entityNode) {
+                Optional<DataContainerChild<? extends PathArgument, ?>> possibleOwner =
+                        entityNode.getChild(ENTITY_OWNER_NODE_ID);
+                if (possibleOwner.isPresent() && owner.equals(possibleOwner.get().getValue().toString())) {
+                    walker.onEntity(entityTypeNode, entityNode);
+                }
+            }
+        });
+    }
+
+    private void searchForEntities(EntityWalker walker) {
+        Optional<NormalizedNode<?, ?>> possibleEntityTypes = getDataStore().readNode(ENTITY_TYPES_PATH);
+        if(!possibleEntityTypes.isPresent()) {
+            return;
+        }
+
+        for(MapEntryNode entityType:  ((MapNode) possibleEntityTypes.get()).getValue()) {
+            Optional<DataContainerChild<? extends PathArgument, ?>> possibleEntities =
+                    entityType.getChild(ENTITY_NODE_ID);
+            if(!possibleEntities.isPresent()) {
+                continue; // shouldn't happen but handle anyway
+            }
+
+            for(MapEntryNode entity:  ((MapNode) possibleEntities.get()).getValue()) {
+                walker.onEntity(entityType, entity);
+            }
+        }
+    }
+
+    private void writeNewOwner(YangInstanceIdentifier entityPath, String newOwner) {
+        LOG.debug("{}: Writing new owner {} for entity {}", persistenceId(), newOwner, entityPath);
+
+        commitCoordinator.commitModification(new WriteModification(entityPath.node(ENTITY_OWNER_QNAME),
+                ImmutableNodes.leafNode(ENTITY_OWNER_NODE_ID, newOwner)), this);
+    }
+
+    /**
+     * Schedules a new owner selection job, cancelling any outstanding job that has not already been cancelled.
+     *
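+     * @param entityPath path of the entity whose owner selection is being scheduled
+     * @param allCandidates the candidates passed along in the scheduled SelectOwner message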
+     */
+    public void scheduleOwnerSelection(YangInstanceIdentifier entityPath, Collection<String> allCandidates,
+                                       EntityOwnerSelectionStrategy strategy){
+        Cancellable lastScheduledTask = entityToScheduledOwnershipTask.get(entityPath);
+        if(lastScheduledTask != null && !lastScheduledTask.isCancelled()){
+            lastScheduledTask.cancel();
+        }
+        lastScheduledTask = context().system().scheduler().scheduleOnce(
+                FiniteDuration.apply(strategy.getSelectionDelayInMillis(), TimeUnit.MILLISECONDS)
+                , self(), new SelectOwner(entityPath, allCandidates, strategy)
+                , context().system().dispatcher(), self());
+
+        entityToScheduledOwnershipTask.put(entityPath, lastScheduledTask);
+    }
+
+    private String newOwner(Collection<String> candidates, Map<String, Long> statistics, EntityOwnerSelectionStrategy ownerSelectionStrategy) {
+        Collection<String> viableCandidates = getViableCandidates(candidates);
+        if(viableCandidates.size() == 0){
+            return "";
+        }
+        return ownerSelectionStrategy.newOwner(viableCandidates, statistics);
+    }
+
+    private Collection<String> getViableCandidates(Collection<String> candidates) {
+        Collection<String> viableCandidates = new ArrayList<>();
+
+        for (String candidate : candidates) {
+            if (!downPeerMemberNames.contains(candidate)) {
+                viableCandidates.add(candidate);
+            }
+        }
+        return viableCandidates;
+    }
+
+    private String getCurrentOwner(YangInstanceIdentifier entityId) {
+        Optional<NormalizedNode<?, ?>> optionalEntityOwner = getDataStore().readNode(entityId.node(ENTITY_OWNER_QNAME));
+        if(optionalEntityOwner.isPresent()){
+            return optionalEntityOwner.get().getValue().toString();
+        }
+        return null;
+    }
+
+    private static interface EntityWalker {
+        void onEntity(MapEntryNode entityTypeNode, MapEntryNode entityNode);
+    }
+
+    public static Builder newBuilder() {
+        return new Builder();
+    }
+
+    static class Builder extends Shard.AbstractBuilder<Builder, EntityOwnershipShard> {
+        private String localMemberName;
+        private EntityOwnerSelectionStrategyConfig ownerSelectionStrategyConfig;
+
+        protected Builder() {
+            super(EntityOwnershipShard.class);
+        }
+
+        Builder localMemberName(String localMemberName) {
+            checkSealed();
             this.localMemberName = localMemberName;
+            return this;
+        }
+
+        Builder ownerSelectionStrategyConfig(EntityOwnerSelectionStrategyConfig ownerSelectionStrategyConfig){
+            checkSealed();
+            this.ownerSelectionStrategyConfig = ownerSelectionStrategyConfig;
+            return this;
         }
 
         @Override
-        public Shard create() throws Exception {
-            return new EntityOwnershipShard(name, peerAddresses, datastoreContext, schemaContext, localMemberName);
+        protected void verify() {
+            super.verify();
+            Preconditions.checkNotNull(localMemberName, "localMemberName should not be null");
+            Preconditions.checkNotNull(ownerSelectionStrategyConfig, "ownerSelectionStrategyConfig should not be null");
         }
     }
 }