BUG-9145: rework singleton service group state tracking 66/63866/2
authorRobert Varga <robert.varga@pantheon.tech>
Thu, 21 Sep 2017 23:00:04 +0000 (01:00 +0200)
committerVratko Polak <vrpolak@cisco.com>
Mon, 2 Oct 2017 11:25:52 +0000 (13:25 +0200)
The beef of the issue here is that there are multiple codepaths
which end up sharing state transitions -- hence we could erroneously
end up waiting for entity release, when if fact we have already
lost it.

Rather than going the explicit FSM route, which would require quite
a few more states, rework state tracking such that each event is
evaluated against current state. This has the nice feature of making
duplicate events idempotent, as the code is all about reconciling
current and intended state.

This also disentangles the loss of ownership codepaths, which means
we can optimize behavior when partitions are involved -- specifically
jeopardy service entity ownership does not result in service shutdown,
because service liveness is actually guaranteed by cleanup entity.
Hence we can keep the services and the cleanup entity intact, leading
to less churn during partitions.

Should the cleanup entity report jeopardy, we need to bring the services
down, but we do not need to unregister the cleanup entity -- if the
partition is healed, we can start services again without the delay
incurred by EOS. If we end up losing partition healing, cleanup entity
ownership will be taken from us, at which point we also unregister.

Change-Id: Id5c8a97722b9a21114135554d92c17898dd0150e
Signed-off-by: Robert Varga <robert.varga@pantheon.tech>
singleton-service/mdsal-singleton-dom-impl/src/main/java/org/opendaylight/mdsal/singleton/dom/impl/ClusterSingletonServiceGroupImpl.java
singleton-service/mdsal-singleton-dom-impl/src/test/java/org/opendaylight/mdsal/singleton/dom/impl/AbstractDOMClusterServiceProviderTest.java
singleton-service/mdsal-singleton-dom-impl/src/test/java/org/opendaylight/mdsal/singleton/dom/impl/ClusterSingletonServiceGroupImplTest.java
singleton-service/mdsal-singleton-dom-impl/src/test/java/org/opendaylight/mdsal/singleton/dom/impl/DOMClusterSingletonServiceProviderAsyncImplTest.java
singleton-service/mdsal-singleton-dom-impl/src/test/java/org/opendaylight/mdsal/singleton/dom/impl/DOMClusterSingletonServiceProviderImplTest.java

index 279129adc012d60ff9c4ce28dfc914653338308d..877acbba4e7d73f222a51515ab3d38632d8d7751 100644 (file)
@@ -23,7 +23,9 @@ import java.util.concurrent.atomic.AtomicReference;
 import java.util.concurrent.locks.ReentrantLock;
 import javax.annotation.CheckReturnValue;
 import javax.annotation.concurrent.GuardedBy;
+import javax.annotation.concurrent.ThreadSafe;
 import org.opendaylight.mdsal.eos.common.api.CandidateAlreadyRegisteredException;
+import org.opendaylight.mdsal.eos.common.api.EntityOwnershipChangeState;
 import org.opendaylight.mdsal.eos.common.api.GenericEntity;
 import org.opendaylight.mdsal.eos.common.api.GenericEntityOwnershipCandidateRegistration;
 import org.opendaylight.mdsal.eos.common.api.GenericEntityOwnershipChange;
@@ -56,46 +58,50 @@ import org.slf4j.LoggerFactory;
  * @param <G> the GenericEntityOwnershipListener type
  * @param <S> the GenericEntityOwnershipService type
  */
+@ThreadSafe
 final class ClusterSingletonServiceGroupImpl<P extends Path<P>, E extends GenericEntity<P>,
         C extends GenericEntityOwnershipChange<P, E>,  G extends GenericEntityOwnershipListener<P, C>,
         S extends GenericEntityOwnershipService<P, E, G>> extends ClusterSingletonServiceGroup<P, E, C> {
-    private enum State {
+
+    private enum EntityState {
         /**
-         * This group has been freshly allocated and has not been started yet.
+         * This entity was never registered.
          */
-        INITIAL,
+        UNREGISTERED,
         /**
-         * Operational state. Service entity is registered, but ownership was not resolved yet.
+         * Registration exists, but we are waiting for it to resolve.
          */
         REGISTERED,
         /**
-         * Operational state. Service entity confirmed to be follower.
-         */
-        STANDBY,
-        /**
-         * Service entity acquired. Attempting to acquire cleanup entity.
+         * Registration indicated we are the owner.
          */
-        TAKING_OWNERSHIP,
+        OWNED,
         /**
-         * Both entities held and user services are being started.
+         * Registration indicated we are the owner, but global state is uncertain -- meaning there can be owners in
+         * another partition, for example.
          */
-        STARTING_SERVICES,
+        OWNED_JEOPARDY,
         /**
-         * Steady state. Both entities held and services have finished starting.
+         * Registration indicated we are not the owner. In this state we do not care about global state, therefore we
+         * do not need an UNOWNED_JEOPARDY state.
          */
-        OWNER,
+        UNOWNED,
+    }
+
+    private enum ServiceState {
         /**
-         * User services are being stopped due to either loss of an entity or a shutdown.
+         * Local services are stopped.
          */
-        STOPPING_SERVICES,
+        STOPPED,
         /**
-         * We have stopped services and are now relinquishing the cleanup entity.
+         * Local services are up and running.
          */
-        RELEASING_OWNERSHIP,
+        // FIXME: we should support async startup, which will require a STARTING state.
+        STARTED,
         /**
-         * Terminated, this group cannot be used anymore.
+         * Local services are being stopped.
          */
-        TERMINATED
+        STOPPING,
     }
 
     private static final Logger LOG = LoggerFactory.getLogger(ClusterSingletonServiceGroupImpl.class);
@@ -107,23 +113,71 @@ final class ClusterSingletonServiceGroupImpl<P extends Path<P>, E extends Generi
     private final E serviceEntity;
     private final E cleanupEntity;
 
-    private final AtomicReference<SettableFuture<Void>> closeFuture = new AtomicReference<>();
     private final ReentrantLock lock = new ReentrantLock(true);
 
     @GuardedBy("lock")
     private final List<ClusterSingletonService> serviceGroup;
 
+    /*
+     * State tracking is quite involved, as we are tracking up to four asynchronous sources of events:
+     * - user calling close()
+     * - service entity ownership
+     * - cleanup entity ownership
+     * - service shutdown future
+     *
+     * Absolutely correct solution would be a set of behaviors, which govern each state, remembering where we want to
+     * get to and what we are doing. That would result in ~15 classes which would quickly render this code unreadable
+     * due to boilerplate overhead.
+     *
+     * We therefore take a different approach, tracking state directly in this class and evaluate state transitions
+     * based on recorded bits -- without explicit representation of state machine state.
+     */
+    /**
+     * Group close future. In can only go from null to non-null reference. Whenever it is non-null, it indicates that
+     * the user has closed the group and we are converging to termination.
+     */
+    // We are using volatile get-and-set to support non-blocking close(). It may be more efficient to inline it here,
+    // as we perform a volatile read after unlocking -- that volatile read may easier on L1 cache.
+    // XXX: above needs a microbenchmark contention ever becomes a problem.
+    private final AtomicReference<SettableFuture<Void>> closeFuture = new AtomicReference<>();
+
+    /**
+     * Service (base) entity registration. This entity selects an owner candidate across nodes. Candidates proceed to
+     * acquire {@link #cleanupEntity}.
+     */
     @GuardedBy("lock")
-    private State state = State.INITIAL;
+    private GenericEntityOwnershipCandidateRegistration<P, E> serviceEntityReg = null;
+    /**
+     * Service (base) entity last reported state.
+     */
+    @GuardedBy("lock")
+    private EntityState serviceEntityState = EntityState.UNREGISTERED;
 
+    /**
+     * Cleanup (owner) entity registration. This entity guards access to service state and coordinates shutdown cleanup
+     * and startup.
+     */
     @GuardedBy("lock")
-    private List<C> capture;
+    private GenericEntityOwnershipCandidateRegistration<P, E> cleanupEntityReg;
+    /**
+     * Cleanup (owner) entity last reported state.
+     */
+    @GuardedBy("lock")
+    private EntityState cleanupEntityState = EntityState.UNREGISTERED;
 
-    /* EOS Candidate Registrations */
+    /**
+     * Optional event capture list. This field is initialized when we interact with entity ownership service, to capture
+     * events reported during EOS method invocation -- like immediate acquisition of entity when we register it. This
+     * prevents bugs from recursion.
+     */
     @GuardedBy("lock")
-    private GenericEntityOwnershipCandidateRegistration<P, E> serviceEntityReg;
+    private List<C> capture = null;
+
+    /**
+     * State of local services.
+     */
     @GuardedBy("lock")
-    private GenericEntityOwnershipCandidateRegistration<P, E> cleanupEntityReg;
+    private ServiceState localServicesState = ServiceState.STOPPED;
 
     /**
      * Class constructor. Note: last argument is reused as-is.
@@ -187,76 +241,109 @@ final class ClusterSingletonServiceGroupImpl<P extends Path<P>, E extends Generi
     }
 
     @GuardedBy("lock")
-    private void updateState(final State newState) {
-        LOG.debug("Service group {} switching from {} to {}", identifier, state, newState);
-        state = Verify.verifyNotNull(newState);
+    private void startCapture() {
+        Verify.verify(capture == null, "Service group {} is already capturing events {}", identifier, capture);
+        capture = new ArrayList<>(0);
+        LOG.debug("Service group {} started capturing events", identifier);
+    }
+
+    private List<C> endCapture() {
+        final List<C> ret = Verify.verifyNotNull(capture, "Service group {} is not currently capturing", identifier);
+        capture = null;
+        LOG.debug("Service group {} finished capturing events, {} events to process", identifier, ret.size());
+        return ret;
     }
 
     @GuardedBy("lock")
     private void lockedClose(final SettableFuture<Void> future) {
         if (serviceEntityReg != null) {
-            LOG.debug("Service group {} unregistering", identifier);
+            // We are still holding the service registration, close it now...
+            LOG.debug("Service group {} unregistering service entity {}", identifier, serviceEntity);
+            startCapture();
             serviceEntityReg.close();
             serviceEntityReg = null;
+
+            // This can potentially mutate our state, so all previous checks need to be re-validated.
+            endCapture().forEach(this::lockedOwnershipChanged);
         }
 
-        switch (state) {
-            case INITIAL:
-                // Not started: not much to do
-                terminate(future);
-                break;
-            case TERMINATED:
-                // Already done: no-op
-                break;
+        // Now check service entity state: if it is still owned, we need to wait until it is acknowledged as
+        // unregistered.
+        switch (serviceEntityState) {
             case REGISTERED:
-            case STANDBY:
-                LOG.debug("Service group {} terminated", identifier);
-                terminate(future);
-                break;
-            case OWNER:
-                // No-op, we will react to the loss of registration instead.
-                break;
-            case STOPPING_SERVICES:
-                // Waiting for services. Will resume once we get notified.
+            case UNOWNED:
+            case UNREGISTERED:
+                // We have either successfully shut down, or have never started up, proceed with termination
                 break;
-            case RELEASING_OWNERSHIP:
-                // Waiting for cleanup entity to flip, will resume afterwards.
-                break;
-            case TAKING_OWNERSHIP:
-                // Abort taking of ownership and close
-                LOG.debug("Service group {} aborting ownership bid", identifier);
-                cleanupEntityReg.close();
-                cleanupEntityReg = null;
-                updateState(State.RELEASING_OWNERSHIP);
+            case OWNED:
+                // We have unregistered, but EOS has not reported our loss of ownership. We will continue with shutdown
+                // when that loss is reported.
+                LOG.debug("Service group {} is still owned, postponing termination", identifier);
+                return;
+            case OWNED_JEOPARDY:
+                // This is a significant event, as it relates to cluster split/join operations, operators need to know
+                // we are waiting for cluster join event.
+                LOG.info("Service group {} is still owned with split cluster, postponing termination", identifier);
+                return;
+            default:
+                throw new IllegalStateException("Unhandled service entity state " + serviceEntityState);
+        }
+
+        // We do not own service entity state: we need to ensure services are stopped.
+        if (stopServices()) {
+            LOG.debug("Service group {} started shutting down services, postponing termination", identifier);
+            return;
+        }
+
+        // Local cleanup completed, release cleanup entity if needed
+        if (cleanupEntityReg != null) {
+            LOG.debug("Service group {} unregistering cleanup entity {}", identifier, cleanupEntity);
+            startCapture();
+            cleanupEntityReg.close();
+            cleanupEntityReg = null;
+
+            // This can potentially mutate our state, so all previous checks need to be re-validated.
+            endCapture().forEach(this::lockedOwnershipChanged);
+        }
+
+        switch (cleanupEntityState) {
+            case REGISTERED:
+            case UNOWNED:
+            case UNREGISTERED:
+                // We have either successfully shut down, or have never started up, proceed with termination
                 break;
+            case OWNED:
+                // We have unregistered, but EOS has not reported our loss of ownership. We will continue with shutdown
+                // when that loss is reported.
+                LOG.debug("Service group {} is still owns cleanup, postponing termination", identifier);
+                return;
+            case OWNED_JEOPARDY:
+                // This is a significant event, as it relates to cluster split/join operations, operators need to know
+                // we are waiting for cluster join event.
+                LOG.info("Service group {} is still owns cleanup with split cluster, postponing termination",
+                    identifier);
+                return;
             default:
-                throw new IllegalStateException("Unhandled state " + state);
+                throw new IllegalStateException("Unhandled cleanup entity state " + serviceEntityState);
         }
-    }
 
-    @GuardedBy("lock")
-    private void terminate(final SettableFuture<Void> future) {
-        updateState(State.TERMINATED);
-        Verify.verify(future.set(null));
+        // No registrations left and no service operations pending, we are done
+        LOG.debug("Service group {} completing termination", identifier);
+        future.set(null);
     }
 
     @Override
     void initialize() throws CandidateAlreadyRegisteredException {
-        LOG.debug("Initialization ClusterSingletonGroup {}", identifier);
-
         lock.lock();
         try {
-            Preconditions.checkState(state == State.INITIAL, "Unexpected singleton group %s state %s", identifier,
-                    state);
+            Preconditions.checkState(serviceEntityState == EntityState.UNREGISTERED,
+                    "Singleton group %s was already initilized", identifier);
 
-            // Catch events if they fire during this call
-            capture = new ArrayList<>(0);
+            LOG.debug("Initializing service group {} with services {}", identifier, serviceGroup);
+            startCapture();
             serviceEntityReg = entityOwnershipService.registerCandidate(serviceEntity);
-            state = State.REGISTERED;
-
-            final List<C> captured = capture;
-            capture = null;
-            captured.forEach(this::lockedOwnershipChanged);
+            serviceEntityState = EntityState.REGISTERED;
+            endCapture().forEach(this::lockedOwnershipChanged);
         } finally {
             lock.unlock();
         }
@@ -272,23 +359,28 @@ final class ClusterSingletonServiceGroupImpl<P extends Path<P>, E extends Generi
         Verify.verify(identifier.equals(service.getIdentifier().getValue()));
         checkNotClosed();
 
-        LOG.debug("RegisterService method call for ClusterSingletonServiceGroup {}", identifier);
-
         lock.lock();
         try {
-            Preconditions.checkState(state != State.INITIAL, "Service group %s is not initialized yet", identifier);
+            Preconditions.checkState(serviceEntityState != EntityState.UNREGISTERED,
+                    "Service group %s is not initialized yet", identifier);
+
+            LOG.debug("Adding service {} to service group {}", service, identifier);
             serviceGroup.add(service);
 
-            switch (state) {
-                case OWNER:
-                case STARTING_SERVICES:
+            switch (localServicesState) {
+                case STARTED:
+                    LOG.debug("Service group {} starting late-registered service {}", identifier, service);
                     service.instantiateServiceInstance();
                     break;
-                default:
+                case STOPPED:
+                case STOPPING:
                     break;
+                default:
+                    throw new IllegalStateException("Unhandled local services state " + localServicesState);
             }
         } finally {
             lock.unlock();
+            finishCloseIfNeeded();
         }
     }
 
@@ -310,13 +402,16 @@ final class ClusterSingletonServiceGroupImpl<P extends Path<P>, E extends Generi
             Verify.verify(serviceGroup.remove(service));
             LOG.debug("Service {} was removed from group.", service.getIdentifier().getValue());
 
-            switch (state) {
-                case OWNER:
-                case STARTING_SERVICES:
+            switch (localServicesState) {
+                case STARTED:
+                    LOG.warn("Service group {} stopping unregistered service {}", identifier, service);
                     service.closeServiceInstance();
                     break;
-                default:
+                case STOPPED:
+                case STOPPING:
                     break;
+                default:
+                    throw new IllegalStateException("Unhandled local services state " + localServicesState);
             }
 
             return false;
@@ -343,77 +438,161 @@ final class ClusterSingletonServiceGroupImpl<P extends Path<P>, E extends Generi
         }
     }
 
+    /**
+     * Handle an ownership change with the lock held. Callers are expected to handle termination conditions, this method
+     * and anything it calls must not call {@link #lockedClose(SettableFuture)}.
+     *
+     * @param ownershipChange reported change
+     */
+    @GuardedBy("lock")
     private void lockedOwnershipChanged(final C ownershipChange) {
-        if (ownershipChange.inJeopardy()) {
-            LOG.warn("Cluster Node lost connection to another cluster nodes {}", ownershipChange);
-            lostOwnership();
-            return;
-        }
-
         final E entity = ownershipChange.getEntity();
         if (serviceEntity.equals(entity)) {
-            serviceOwnershipChanged(ownershipChange);
+            serviceOwnershipChanged(ownershipChange.getState(), ownershipChange.inJeopardy());
         } else if (cleanupEntity.equals(entity)) {
-            cleanupCandidateOwnershipChanged(ownershipChange);
+            cleanupCandidateOwnershipChanged(ownershipChange.getState(), ownershipChange.inJeopardy());
         } else {
             LOG.warn("Group {} received unrecognized change {}", identifier, ownershipChange);
         }
     }
 
-    private void cleanupCandidateOwnershipChanged(final C ownershipChange) {
-        switch (ownershipChange.getState()) {
+    private void cleanupCandidateOwnershipChanged(final EntityOwnershipChangeState state, final boolean jeopardy) {
+        if (jeopardy) {
+            switch (state) {
+                case LOCAL_OWNERSHIP_GRANTED:
+                case LOCAL_OWNERSHIP_RETAINED_WITH_NO_CHANGE:
+                    if (cleanupEntityReg == null) {
+                        LOG.debug("Service group {} ignoring cleanup entity ownership when unregistered", identifier);
+                        return;
+                    }
+
+                    LOG.warn("Service group {} cleanup entity owned without certainty", identifier);
+                    cleanupEntityState = EntityState.OWNED_JEOPARDY;
+                    break;
+                case LOCAL_OWNERSHIP_LOST_NEW_OWNER:
+                case LOCAL_OWNERSHIP_LOST_NO_OWNER:
+                case REMOTE_OWNERSHIP_CHANGED:
+                case REMOTE_OWNERSHIP_LOST_NO_OWNER:
+                    LOG.info("Service group {} cleanup entity ownership uncertain", identifier);
+                    cleanupEntityState = EntityState.UNOWNED;
+                    break;
+                default:
+                    throw new IllegalStateException("Unhandled cleanup entity jeopardy change " + state);
+            }
+
+            stopServices();
+            return;
+        }
+
+        if (cleanupEntityState == EntityState.OWNED_JEOPARDY) {
+            // Pair info message with previous jeopardy
+            LOG.info("Service group {} cleanup entity ownership ascertained", identifier);
+        }
+
+        switch (state) {
             case LOCAL_OWNERSHIP_GRANTED:
-                switch (state) {
-                    case TAKING_OWNERSHIP:
-                        // SLAVE to MASTER
+            case LOCAL_OWNERSHIP_RETAINED_WITH_NO_CHANGE:
+                if (cleanupEntityReg == null) {
+                    LOG.debug("Service group {} ignoring cleanup entity ownership when unregistered", identifier);
+                    return;
+                }
+
+                cleanupEntityState = EntityState.OWNED;
+                switch (localServicesState) {
+                    case STARTED:
+                        LOG.debug("Service group {} already has local services running", identifier);
+                        break;
+                    case STOPPED:
                         startServices();
-                        return;
-                    default:
                         break;
+                    case STOPPING:
+                        LOG.debug("Service group {} has local services stopping, postponing startup", identifier);
+                        break;
+                    default:
+                        throw new IllegalStateException("Unhandled local services state " + localServicesState);
                 }
                 break;
             case LOCAL_OWNERSHIP_LOST_NEW_OWNER:
             case LOCAL_OWNERSHIP_LOST_NO_OWNER:
-                switch (state) {
-                    case RELEASING_OWNERSHIP:
-                        // Slight cheat: if we are closing down, we just need to notify the future
-                        updateState(isClosed() ? State.INITIAL : State.STANDBY);
-                        return;
-                    case STARTING_SERVICES:
-                    case OWNER:
-                    case TAKING_OWNERSHIP:
-                        LOG.warn("Group {} lost cleanup ownership in state {}", identifier, state);
-                        return;
-                    default:
-                        break;
-                }
-
+                cleanupEntityState = EntityState.UNOWNED;
+                stopServices();
                 break;
-            case LOCAL_OWNERSHIP_RETAINED_WITH_NO_CHANGE:
-            case REMOTE_OWNERSHIP_CHANGED:
             case REMOTE_OWNERSHIP_LOST_NO_OWNER:
+            case REMOTE_OWNERSHIP_CHANGED:
+                cleanupEntityState = EntityState.UNOWNED;
+                break;
             default:
+                LOG.warn("Service group {} ignoring unhandled cleanup entity change {}", identifier, state);
                 break;
         }
-
-        LOG.debug("Group {} in state {} ignoring cleanup OwnershipChange {}", identifier, state, ownershipChange);
     }
 
-    private void serviceOwnershipChanged(final C ownershipChange) {
-        switch (ownershipChange.getState()) {
+    private void serviceOwnershipChanged(final EntityOwnershipChangeState state, final boolean jeopardy) {
+        if (jeopardy) {
+            LOG.info("Service group {} service entity ownership uncertain", identifier);
+
+            // Service entity ownership is uncertain, which means we want to record the state, but we do not want
+            // to stop local services nor do anything with the cleanup entity.
+            switch (state) {
+                case LOCAL_OWNERSHIP_GRANTED:
+                case LOCAL_OWNERSHIP_RETAINED_WITH_NO_CHANGE:
+                    if (serviceEntityReg == null) {
+                        LOG.debug("Service group {} ignoring service entity ownership when unregistered", identifier);
+                        return;
+                    }
+
+                    serviceEntityState = EntityState.OWNED_JEOPARDY;
+                    break;
+                case LOCAL_OWNERSHIP_LOST_NEW_OWNER:
+                case LOCAL_OWNERSHIP_LOST_NO_OWNER:
+                case REMOTE_OWNERSHIP_CHANGED:
+                case REMOTE_OWNERSHIP_LOST_NO_OWNER:
+                    serviceEntityState = EntityState.UNOWNED;
+                    break;
+                default:
+                    throw new IllegalStateException("Unhandled cleanup entity jeopardy change " + state);
+            }
+            return;
+        }
+
+        if (serviceEntityState == EntityState.OWNED_JEOPARDY) {
+            // Pair info message with previous jeopardy
+            LOG.info("Service group {} service entity ownership ascertained", identifier);
+        }
+
+        switch (state) {
             case LOCAL_OWNERSHIP_GRANTED:
-                // SLAVE to MASTER : ownershipChange.getState().isOwner() && !ownershipChange.getState().wasOwner()
+            case LOCAL_OWNERSHIP_RETAINED_WITH_NO_CHANGE:
+                if (serviceEntityReg == null) {
+                    LOG.debug("Service group {} ignoring service entity ownership when unregistered", identifier);
+                    return;
+                }
+
+                serviceEntityState = EntityState.OWNED;
                 takeOwnership();
                 break;
             case LOCAL_OWNERSHIP_LOST_NEW_OWNER:
             case LOCAL_OWNERSHIP_LOST_NO_OWNER:
-                // MASTER to SLAVE : !ownershipChange.getState().isOwner() && ownershipChange.getState().wasOwner()
-                lostOwnership();
+                LOG.debug("Service group {} lost service entity ownership", identifier);
+                serviceEntityState = EntityState.UNOWNED;
+                if (stopServices()) {
+                    LOG.debug("Service group {} already stopping services, postponing cleanup", identifier);
+                    return;
+                }
+
+                if (cleanupEntityReg != null) {
+                    cleanupEntityReg.close();
+                    cleanupEntityReg = null;
+                }
+                break;
+            case REMOTE_OWNERSHIP_CHANGED:
+            case REMOTE_OWNERSHIP_LOST_NO_OWNER:
+                // No need to react, just update the state
+                serviceEntityState = EntityState.UNOWNED;
                 break;
             default:
-                // Not needed notifications
-                LOG.debug("Group {} in state {} not processed entity OwnershipChange {}", identifier, state,
-                    ownershipChange);
+                LOG.warn("Service group {} ignoring unhandled cleanup entity change {}", identifier, state);
+                break;
         }
     }
 
@@ -435,18 +614,21 @@ final class ClusterSingletonServiceGroupImpl<P extends Path<P>, E extends Generi
      */
     private void takeOwnership() {
         if (isClosed()) {
-            LOG.debug("Service group {} is closed, not taking ownership", identifier);
+            LOG.debug("Service group {} is closed, skipping cleanup ownership bid", identifier);
             return;
         }
 
-        LOG.debug("Group {} taking ownership", identifier);
+        LOG.debug("Service group {} registering cleanup entity", identifier);
 
-        updateState(State.TAKING_OWNERSHIP);
+        startCapture();
         try {
             cleanupEntityReg = entityOwnershipService.registerCandidate(cleanupEntity);
+            cleanupEntityState = EntityState.REGISTERED;
         } catch (CandidateAlreadyRegisteredException e) {
             LOG.error("Service group {} failed to take ownership", identifier, e);
         }
+
+        endCapture().forEach(this::lockedOwnershipChanged);
     }
 
     /*
@@ -469,84 +651,108 @@ final class ClusterSingletonServiceGroupImpl<P extends Path<P>, E extends Generi
             }
         });
 
+        localServicesState = ServiceState.STARTED;
         LOG.debug("Service group {} services started", identifier);
-        updateState(State.OWNER);
     }
 
-    /*
-     * Help method calls suspendService method for stop this single cluster-wide service instance.
-     * The last async. step has to close DoubleCandidateRegistration reference what should initialize
-     * new election for DoubleCandidateEntity.
-     */
-    private void lostOwnership() {
-        LOG.debug("Service group {} lost ownership in state {}", identifier, state);
-        switch (state) {
-            case REGISTERED:
-                updateState(State.STANDBY);
-                break;
-            case OWNER:
-                stopServices();
-                break;
-            case STARTING_SERVICES:
-            case STOPPING_SERVICES:
-                // No-op, as these will re-check state before proceeding
-                break;
-            case TAKING_OWNERSHIP:
-                cleanupEntityReg.close();
-                cleanupEntityReg = null;
-                updateState(State.STANDBY);
-                break;
-            case INITIAL:
-            case TERMINATED:
+    @SuppressWarnings("checkstyle:IllegalCatch")
+    boolean stopServices() {
+        switch (localServicesState) {
+            case STARTED:
+                localServicesState = ServiceState.STOPPING;
+
+                final List<ListenableFuture<Void>> serviceCloseFutureList = new ArrayList<>(serviceGroup.size());
+                for (final ClusterSingletonService service : serviceGroup) {
+                    final ListenableFuture<Void> future;
+
+                    try {
+                        future = service.closeServiceInstance();
+                    } catch (Exception e) {
+                        LOG.warn("Service group {} service {} failed to stop, attempting to continue", identifier,
+                            service, e);
+                        continue;
+                    }
+
+                    serviceCloseFutureList.add(future);
+                }
+
+                LOG.debug("Service group {} initiated service shutdown", identifier);
+
+                Futures.addCallback(Futures.allAsList(serviceCloseFutureList), new FutureCallback<List<Void>>() {
+                    @Override
+                    public void onFailure(final Throwable cause) {
+                        LOG.warn("Service group {} service stopping reported error", identifier, cause);
+                        onServicesStopped();
+                    }
+
+                    @Override
+                    public void onSuccess(final List<Void> nulls) {
+                        onServicesStopped();
+                    }
+                }, MoreExecutors.directExecutor());
+
+                return localServicesState == ServiceState.STOPPING;
+            case STOPPED:
+                LOG.debug("Service group {} has already stopped services", identifier);
+                return false;
+            case STOPPING:
+                LOG.debug("Service group {} is already stopping services", identifier);
+                return true;
             default:
-                LOG.info("Service group {} ignoring lost ownership in state {},", identifier, state);
-                break;
+                throw new IllegalStateException("Unhandled local services state " + localServicesState);
         }
     }
 
-    @SuppressWarnings("checkstyle:IllegalCatch")
-    void stopServices() {
-        updateState(State.STOPPING_SERVICES);
-
-        final List<ListenableFuture<Void>> serviceCloseFutureList = new ArrayList<>(serviceGroup.size());
-        for (final ClusterSingletonService service : serviceGroup) {
-            final ListenableFuture<Void> future;
+    void onServicesStopped() {
+        LOG.debug("Service group {} finished stopping services", identifier);
+        lock.lock();
+        try {
+            localServicesState = ServiceState.STOPPED;
 
-            try {
-                future = service.closeServiceInstance();
-            } catch (Exception e) {
-                LOG.warn("Service group {} service {} failed to stop, attempting to continue", identifier,
-                    service, e);
-                continue;
+            if (isClosed()) {
+                LOG.debug("Service group {} closed, skipping service restart check", identifier);
+                return;
             }
 
-            serviceCloseFutureList.add(future);
-        }
-
-        Futures.addCallback(Futures.allAsList(serviceCloseFutureList), new FutureCallback<List<Void>>() {
-            @Override
-            public void onFailure(final Throwable cause) {
-                LOG.warn("Service group {} service stopping reported error", identifier, cause);
-                onServicesStopped();
+            // If we lost the service entity while services were stopping, we need to unregister cleanup entity
+            switch (serviceEntityState) {
+                case OWNED:
+                case OWNED_JEOPARDY:
+                    // No need to churn cleanup entity
+                    break;
+                case REGISTERED:
+                case UNOWNED:
+                case UNREGISTERED:
+                    if (cleanupEntityReg != null) {
+                        startCapture();
+                        cleanupEntityReg.close();
+                        cleanupEntityReg = null;
+                        endCapture().forEach(this::lockedOwnershipChanged);
+                    }
+                    break;
+                default:
+                    throw new IllegalStateException("Unhandled service entity state" + serviceEntityState);
             }
 
-            @Override
-            public void onSuccess(final List<Void> nulls) {
-                onServicesStopped();
+            if (cleanupEntityReg == null) {
+                LOG.debug("Service group {} does not have cleanup entity registered, skipping restart check",
+                    identifier);
+                return;
             }
-        }, MoreExecutors.directExecutor());
-    }
 
-    void onServicesStopped() {
-        LOG.debug("Service group {} finished stopping services", identifier);
-        lock.lock();
-        try {
-            if (cleanupEntityReg != null) {
-                updateState(State.RELEASING_OWNERSHIP);
-                cleanupEntityReg.close();
-                cleanupEntityReg = null;
-            } else {
-                updateState(State.STANDBY);
+            // Double-check if the services should really be down
+            switch (cleanupEntityState) {
+                case OWNED:
+                    // We have finished stopping services, but we own cleanup, e.g. we should start them again.
+                    startServices();
+                    return;
+                case UNOWNED:
+                case OWNED_JEOPARDY:
+                case REGISTERED:
+                case UNREGISTERED:
+                    break;
+                default:
+                    throw new IllegalStateException("Unhandled cleanup entity state" + cleanupEntityState);
             }
         } finally {
             lock.unlock();
@@ -556,6 +762,6 @@ final class ClusterSingletonServiceGroupImpl<P extends Path<P>, E extends Generi
 
     @Override
     public String toString() {
-        return MoreObjects.toStringHelper(this).add("identifier", identifier).add("state", state).toString();
+        return MoreObjects.toStringHelper(this).add("identifier", identifier).toString();
     }
 }
index eff1025c14f9d7a7d5d6498d6291b8f801358171..d3d6fc1e62beb62315cda8ac1e7ed7e20727e955 100644 (file)
@@ -150,6 +150,15 @@ public abstract class AbstractDOMClusterServiceProviderTest {
         return new DOMEntityOwnershipChange(ENTITY, EntityOwnershipChangeState.REMOTE_OWNERSHIP_LOST_NO_OWNER);
     }
 
+    static final DOMEntityOwnershipChange getEntityToJeopardy() {
+        return new DOMEntityOwnershipChange(ENTITY, EntityOwnershipChangeState.REMOTE_OWNERSHIP_LOST_NO_OWNER, true);
+    }
+
+    static final DOMEntityOwnershipChange getEntityMasterJeopardy() {
+        return new DOMEntityOwnershipChange(ENTITY, EntityOwnershipChangeState.LOCAL_OWNERSHIP_RETAINED_WITH_NO_CHANGE,
+            true);
+    }
+
     static final DOMEntityOwnershipChange getDoubleEntityToMaster() {
         return new DOMEntityOwnershipChange(DOUBLE_ENTITY, EntityOwnershipChangeState.LOCAL_OWNERSHIP_GRANTED);
     }
@@ -162,8 +171,14 @@ public abstract class AbstractDOMClusterServiceProviderTest {
         return new DOMEntityOwnershipChange(DOUBLE_ENTITY, EntityOwnershipChangeState.LOCAL_OWNERSHIP_LOST_NEW_OWNER);
     }
 
-    static final DOMEntityOwnershipChange getEntityToJeopardy() {
-        return new DOMEntityOwnershipChange(ENTITY, EntityOwnershipChangeState.REMOTE_OWNERSHIP_LOST_NO_OWNER, true);
+    static final DOMEntityOwnershipChange getDoubleEntityToJeopardy() {
+        return new DOMEntityOwnershipChange(DOUBLE_ENTITY, EntityOwnershipChangeState.REMOTE_OWNERSHIP_LOST_NO_OWNER,
+            true);
+    }
+
+    static final DOMEntityOwnershipChange getDoubleEntityMasterJeopardy() {
+        return new DOMEntityOwnershipChange(DOUBLE_ENTITY,
+            EntityOwnershipChangeState.LOCAL_OWNERSHIP_RETAINED_WITH_NO_CHANGE, true);
     }
 
     /**
index 4f45ce379fd715658ae26af5efac10898b64329c..83ed842d504ebe388338ae3bf5860c863010d72d 100644 (file)
@@ -366,7 +366,13 @@ public class ClusterSingletonServiceGroupImplTest {
         verify(mockEosService).registerCandidate(CLOSE_ENTITY);
         singletonServiceGroup.ownershipChanged(getDoubleEntityToMaster());
         verify(mockClusterSingletonService).instantiateServiceInstance();
+
+        // Base entity in jeopardy should not matter...
         singletonServiceGroup.ownershipChanged(getEntityToJeopardy());
+        verify(mockClusterSingletonService, never()).closeServiceInstance();
+
+        // ... application state is actually guarded by cleanup
+        singletonServiceGroup.ownershipChanged(getDoubleEntityToJeopardy());
         verify(mockClusterSingletonService).closeServiceInstance();
     }
 
@@ -437,7 +443,7 @@ public class ClusterSingletonServiceGroupImplTest {
         singletonServiceGroup.ownershipChanged(getDoubleEntityToMaster());
         verify(mockClusterSingletonService).instantiateServiceInstance();
         singletonServiceGroup.ownershipChanged(getDoubleEntityToSlave());
-        verify(mockClusterSingletonService, never()).closeServiceInstance();
+        verify(mockClusterSingletonService).closeServiceInstance();
     }
 
     /**
@@ -511,6 +517,11 @@ public class ClusterSingletonServiceGroupImplTest {
                 EntityOwnershipChangeState.LOCAL_OWNERSHIP_LOST_NO_OWNER);
     }
 
+    private static GenericEntityOwnershipChange<TestInstanceIdentifier, TestEntity> getEntityToJeopardy() {
+        return new GenericEntityOwnershipChange<>(MAIN_ENTITY,
+                EntityOwnershipChangeState.REMOTE_OWNERSHIP_LOST_NO_OWNER, true);
+    }
+
     private static GenericEntityOwnershipChange<TestInstanceIdentifier, TestEntity> getDoubleEntityToMaster() {
         return new GenericEntityOwnershipChange<>(CLOSE_ENTITY, EntityOwnershipChangeState.LOCAL_OWNERSHIP_GRANTED);
     }
@@ -524,8 +535,8 @@ public class ClusterSingletonServiceGroupImplTest {
         return new GenericEntityOwnershipChange<>(CLOSE_ENTITY, EntityOwnershipChangeState.REMOTE_OWNERSHIP_CHANGED);
     }
 
-    private static GenericEntityOwnershipChange<TestInstanceIdentifier, TestEntity> getEntityToJeopardy() {
-        return new GenericEntityOwnershipChange<>(MAIN_ENTITY,
+    private static GenericEntityOwnershipChange<TestInstanceIdentifier, TestEntity> getDoubleEntityToJeopardy() {
+        return new GenericEntityOwnershipChange<>(CLOSE_ENTITY,
                 EntityOwnershipChangeState.REMOTE_OWNERSHIP_LOST_NO_OWNER, true);
     }
 }
index 7e1180fa5b0506556d9b9b2b90921ab0481ba089..fb6786d30e1f49e2e0a479efae73914a3d0eec3e 100644 (file)
@@ -114,7 +114,7 @@ public final class DOMClusterSingletonServiceProviderAsyncImplTest extends Abstr
         clusterSingletonServiceProvider.ownershipChanged(getDoubleEntityToMaster());
         assertEquals(TestClusterSingletonServiceState.STARTED, clusterSingletonService.getServiceState());
         clusterSingletonServiceProvider.ownershipChanged(getDoubleEntityToSlave());
-        assertEquals(TestClusterSingletonServiceState.STARTED, clusterSingletonService.getServiceState());
+        assertEquals(TestClusterSingletonServiceState.DESTROYED, clusterSingletonService.getServiceState());
         Thread.sleep(ASYNC_TIME_DELAY_MILLIS * 2);
         verify(mockEosDoubleEntityListReg, never()).close();
         verify(mockEntityCandReg, never()).close();
@@ -149,13 +149,15 @@ public final class DOMClusterSingletonServiceProviderAsyncImplTest extends Abstr
         assertEquals(TestClusterSingletonServiceState.INITIALIZED, clusterSingletonService.getServiceState());
         clusterSingletonServiceProvider.ownershipChanged(getDoubleEntityToMaster());
         assertEquals(TestClusterSingletonServiceState.STARTED, clusterSingletonService.getServiceState());
-        clusterSingletonServiceProvider.ownershipChanged(getEntityToJeopardy());
+        clusterSingletonServiceProvider.ownershipChanged(getEntityMasterJeopardy());
+        assertEquals(TestClusterSingletonServiceState.STARTED, clusterSingletonService.getServiceState());
+        clusterSingletonServiceProvider.ownershipChanged(getDoubleEntityMasterJeopardy());
         assertEquals(TestClusterSingletonServiceState.DESTROYED, clusterSingletonService.getServiceState());
         Thread.sleep(ASYNC_TIME_DELAY_MILLIS * 2);
         verify(mockEosEntityListReg, never()).close();
         verify(mockEosDoubleEntityListReg, never()).close();
         verify(mockEntityCandReg, never()).close();
-        verify(mockDoubleEntityCandReg).close();
+        verify(mockDoubleEntityCandReg, never()).close();
     }
 
     /**
index 67236c8047fb7d57d2683f1daa765e5132ef9a65..16ee48a95215b47262b4c4c4b07e232a8cb78da8 100644 (file)
@@ -12,7 +12,7 @@ import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.mockito.Mockito.atLeastOnce;
 import static org.mockito.Mockito.never;
-import static org.mockito.Mockito.reset;
+import static org.mockito.Mockito.times;
 import static org.mockito.Mockito.verify;
 
 import org.junit.Before;
@@ -84,7 +84,7 @@ public class DOMClusterSingletonServiceProviderImplTest extends AbstractDOMClust
         clusterSingletonServiceProvider.ownershipChanged(getDoubleEntityToMaster());
         assertEquals(TestClusterSingletonServiceState.STARTED, clusterSingletonService.getServiceState());
         clusterSingletonServiceProvider.ownershipChanged(getDoubleEntityToSlave());
-        assertEquals(TestClusterSingletonServiceState.STARTED, clusterSingletonService.getServiceState());
+        assertEquals(TestClusterSingletonServiceState.DESTROYED, clusterSingletonService.getServiceState());
         verify(mockEosDoubleEntityListReg, never()).close();
         verify(mockEntityCandReg, never()).close();
         verify(mockDoubleEntityCandReg, never()).close();
@@ -93,12 +93,11 @@ public class DOMClusterSingletonServiceProviderImplTest extends AbstractDOMClust
         verify(mockEosDoubleEntityListReg, never()).close();
         verify(mockEntityCandReg, atLeastOnce()).close();
         verify(mockDoubleEntityCandReg, never()).close();
-        assertEquals(TestClusterSingletonServiceState.STARTED, clusterSingletonService.getServiceState());
+        assertEquals(TestClusterSingletonServiceState.DESTROYED, clusterSingletonService.getServiceState());
         clusterSingletonServiceProvider.ownershipChanged(getEntityToSlave());
         verify(mockEntityCandReg, atLeastOnce()).close();
         verify(mockDoubleEntityCandReg, atLeastOnce()).close();
         verify(mockEosDoubleEntityListReg, never()).close();
-        assertEquals(TestClusterSingletonServiceState.DESTROYED, clusterSingletonService.getServiceState());
     }
 
     /**
@@ -119,12 +118,12 @@ public class DOMClusterSingletonServiceProviderImplTest extends AbstractDOMClust
         assertEquals(TestClusterSingletonServiceState.INITIALIZED, clusterSingletonService.getServiceState());
         clusterSingletonServiceProvider.ownershipChanged(getDoubleEntityToMaster());
         assertEquals(TestClusterSingletonServiceState.STARTED, clusterSingletonService.getServiceState());
-        clusterSingletonServiceProvider.ownershipChanged(getEntityToJeopardy());
+        clusterSingletonServiceProvider.ownershipChanged(getDoubleEntityToJeopardy());
         assertEquals(TestClusterSingletonServiceState.DESTROYED, clusterSingletonService.getServiceState());
         verify(mockEosEntityListReg, never()).close();
         verify(mockEosDoubleEntityListReg, never()).close();
         verify(mockEntityCandReg, never()).close();
-        verify(mockDoubleEntityCandReg).close();
+        verify(mockDoubleEntityCandReg, never()).close();
     }
 
     /**
@@ -236,9 +235,8 @@ public class DOMClusterSingletonServiceProviderImplTest extends AbstractDOMClust
         verify(mockDoubleEntityCandReg, never()).close();
 
         // Instantiate the next incarnation
-        reset(mockEos);
         reg = clusterSingletonServiceProvider.registerClusterSingletonService(clusterSingletonService2);
-        verify(mockEos, never()).registerCandidate(ENTITY);
+        verify(mockEos).registerCandidate(ENTITY);
         assertEquals(TestClusterSingletonServiceState.INITIALIZED, clusterSingletonService2.getServiceState());
 
         // Drive the old incarnation to closure, resetting mocks as needed
@@ -250,12 +248,10 @@ public class DOMClusterSingletonServiceProviderImplTest extends AbstractDOMClust
 
         // Reset mocks for reuse. The next change should see the previous group terminate and the next incarnation
         // to start coming up
-        reset(mockEntityCandReg);
-        reset(mockDoubleEntityCandReg);
         clusterSingletonServiceProvider.ownershipChanged(getDoubleEntityToSlave());
-        verify(mockEos).registerCandidate(ENTITY);
+        verify(mockEos, times(2)).registerCandidate(ENTITY);
         clusterSingletonServiceProvider.ownershipChanged(getEntityToMaster());
-        verify(mockEos).registerCandidate(DOUBLE_ENTITY);
+        verify(mockEos, times(2)).registerCandidate(DOUBLE_ENTITY);
         clusterSingletonServiceProvider.ownershipChanged(getDoubleEntityToMaster());
         assertEquals(TestClusterSingletonServiceState.STARTED, clusterSingletonService2.getServiceState());