From 6ce7a04df27c4a8d959b229c2766cdd31eb3c6a8 Mon Sep 17 00:00:00 2001 From: Robert Varga Date: Fri, 22 Sep 2017 01:00:04 +0200 Subject: [PATCH] BUG-9145: rework singleton service group state tracking The beef of the issue here is that there are multiple codepaths which end up sharing state transitions -- hence we could erroneously end up waiting for entity release, when if fact we have already lost it. Rather than going the explicit FSM route, which would require quite a few more states, rework state tracking such that each event is evaluated against current state. This has the nice feature of making duplicate events idempotent, as the code is all about reconciling current and intended state. This also disentangles the loss of ownership codepaths, which means we can optimize behavior when partitions are involved -- specifically jeopardy service entity ownership does not result in service shutdown, because service liveness is actually guaranteed by cleanup entity. Hence we can keep the services and the cleanup entity intact, leading to less churn during partitions. Should the cleanup entity report jeopardy, we need to bring the services down, but we do not need to unregister the cleanup entity -- if the partition is healed, we can start services again without the delay incurred by EOS. If we end up losing partition healing, cleanup entity ownership will be taken from us, at which point we also unregister. Change-Id: Id5c8a97722b9a21114135554d92c17898dd0150e Signed-off-by: Robert Varga (cherry picked from commit d0295fff2bf0a90d49a4e4ba08f6db933f62cf47) --- .../ClusterSingletonServiceGroupImpl.java | 592 ++++++++++++------ ...AbstractDOMClusterServiceProviderTest.java | 19 +- .../ClusterSingletonServiceGroupImplTest.java | 17 +- ...SingletonServiceProviderAsyncImplTest.java | 8 +- ...usterSingletonServiceProviderImplTest.java | 20 +- 5 files changed, 443 insertions(+), 213 deletions(-) diff --git a/singleton-service/mdsal-singleton-dom-impl/src/main/java/org/opendaylight/mdsal/singleton/dom/impl/ClusterSingletonServiceGroupImpl.java b/singleton-service/mdsal-singleton-dom-impl/src/main/java/org/opendaylight/mdsal/singleton/dom/impl/ClusterSingletonServiceGroupImpl.java index 279129adc0..877acbba4e 100644 --- a/singleton-service/mdsal-singleton-dom-impl/src/main/java/org/opendaylight/mdsal/singleton/dom/impl/ClusterSingletonServiceGroupImpl.java +++ b/singleton-service/mdsal-singleton-dom-impl/src/main/java/org/opendaylight/mdsal/singleton/dom/impl/ClusterSingletonServiceGroupImpl.java @@ -23,7 +23,9 @@ import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.locks.ReentrantLock; import javax.annotation.CheckReturnValue; import javax.annotation.concurrent.GuardedBy; +import javax.annotation.concurrent.ThreadSafe; import org.opendaylight.mdsal.eos.common.api.CandidateAlreadyRegisteredException; +import org.opendaylight.mdsal.eos.common.api.EntityOwnershipChangeState; import org.opendaylight.mdsal.eos.common.api.GenericEntity; import org.opendaylight.mdsal.eos.common.api.GenericEntityOwnershipCandidateRegistration; import org.opendaylight.mdsal.eos.common.api.GenericEntityOwnershipChange; @@ -56,46 +58,50 @@ import org.slf4j.LoggerFactory; * @param the GenericEntityOwnershipListener type * @param the GenericEntityOwnershipService type */ +@ThreadSafe final class ClusterSingletonServiceGroupImpl

, E extends GenericEntity

, C extends GenericEntityOwnershipChange, G extends GenericEntityOwnershipListener, S extends GenericEntityOwnershipService> extends ClusterSingletonServiceGroup { - private enum State { + + private enum EntityState { /** - * This group has been freshly allocated and has not been started yet. + * This entity was never registered. */ - INITIAL, + UNREGISTERED, /** - * Operational state. Service entity is registered, but ownership was not resolved yet. + * Registration exists, but we are waiting for it to resolve. */ REGISTERED, /** - * Operational state. Service entity confirmed to be follower. - */ - STANDBY, - /** - * Service entity acquired. Attempting to acquire cleanup entity. + * Registration indicated we are the owner. */ - TAKING_OWNERSHIP, + OWNED, /** - * Both entities held and user services are being started. + * Registration indicated we are the owner, but global state is uncertain -- meaning there can be owners in + * another partition, for example. */ - STARTING_SERVICES, + OWNED_JEOPARDY, /** - * Steady state. Both entities held and services have finished starting. + * Registration indicated we are not the owner. In this state we do not care about global state, therefore we + * do not need an UNOWNED_JEOPARDY state. */ - OWNER, + UNOWNED, + } + + private enum ServiceState { /** - * User services are being stopped due to either loss of an entity or a shutdown. + * Local services are stopped. */ - STOPPING_SERVICES, + STOPPED, /** - * We have stopped services and are now relinquishing the cleanup entity. + * Local services are up and running. */ - RELEASING_OWNERSHIP, + // FIXME: we should support async startup, which will require a STARTING state. + STARTED, /** - * Terminated, this group cannot be used anymore. + * Local services are being stopped. */ - TERMINATED + STOPPING, } private static final Logger LOG = LoggerFactory.getLogger(ClusterSingletonServiceGroupImpl.class); @@ -107,23 +113,71 @@ final class ClusterSingletonServiceGroupImpl

, E extends Generi private final E serviceEntity; private final E cleanupEntity; - private final AtomicReference> closeFuture = new AtomicReference<>(); private final ReentrantLock lock = new ReentrantLock(true); @GuardedBy("lock") private final List serviceGroup; + /* + * State tracking is quite involved, as we are tracking up to four asynchronous sources of events: + * - user calling close() + * - service entity ownership + * - cleanup entity ownership + * - service shutdown future + * + * Absolutely correct solution would be a set of behaviors, which govern each state, remembering where we want to + * get to and what we are doing. That would result in ~15 classes which would quickly render this code unreadable + * due to boilerplate overhead. + * + * We therefore take a different approach, tracking state directly in this class and evaluate state transitions + * based on recorded bits -- without explicit representation of state machine state. + */ + /** + * Group close future. In can only go from null to non-null reference. Whenever it is non-null, it indicates that + * the user has closed the group and we are converging to termination. + */ + // We are using volatile get-and-set to support non-blocking close(). It may be more efficient to inline it here, + // as we perform a volatile read after unlocking -- that volatile read may easier on L1 cache. + // XXX: above needs a microbenchmark contention ever becomes a problem. + private final AtomicReference> closeFuture = new AtomicReference<>(); + + /** + * Service (base) entity registration. This entity selects an owner candidate across nodes. Candidates proceed to + * acquire {@link #cleanupEntity}. + */ @GuardedBy("lock") - private State state = State.INITIAL; + private GenericEntityOwnershipCandidateRegistration serviceEntityReg = null; + /** + * Service (base) entity last reported state. + */ + @GuardedBy("lock") + private EntityState serviceEntityState = EntityState.UNREGISTERED; + /** + * Cleanup (owner) entity registration. This entity guards access to service state and coordinates shutdown cleanup + * and startup. + */ @GuardedBy("lock") - private List capture; + private GenericEntityOwnershipCandidateRegistration cleanupEntityReg; + /** + * Cleanup (owner) entity last reported state. + */ + @GuardedBy("lock") + private EntityState cleanupEntityState = EntityState.UNREGISTERED; - /* EOS Candidate Registrations */ + /** + * Optional event capture list. This field is initialized when we interact with entity ownership service, to capture + * events reported during EOS method invocation -- like immediate acquisition of entity when we register it. This + * prevents bugs from recursion. + */ @GuardedBy("lock") - private GenericEntityOwnershipCandidateRegistration serviceEntityReg; + private List capture = null; + + /** + * State of local services. + */ @GuardedBy("lock") - private GenericEntityOwnershipCandidateRegistration cleanupEntityReg; + private ServiceState localServicesState = ServiceState.STOPPED; /** * Class constructor. Note: last argument is reused as-is. @@ -187,76 +241,109 @@ final class ClusterSingletonServiceGroupImpl

, E extends Generi } @GuardedBy("lock") - private void updateState(final State newState) { - LOG.debug("Service group {} switching from {} to {}", identifier, state, newState); - state = Verify.verifyNotNull(newState); + private void startCapture() { + Verify.verify(capture == null, "Service group {} is already capturing events {}", identifier, capture); + capture = new ArrayList<>(0); + LOG.debug("Service group {} started capturing events", identifier); + } + + private List endCapture() { + final List ret = Verify.verifyNotNull(capture, "Service group {} is not currently capturing", identifier); + capture = null; + LOG.debug("Service group {} finished capturing events, {} events to process", identifier, ret.size()); + return ret; } @GuardedBy("lock") private void lockedClose(final SettableFuture future) { if (serviceEntityReg != null) { - LOG.debug("Service group {} unregistering", identifier); + // We are still holding the service registration, close it now... + LOG.debug("Service group {} unregistering service entity {}", identifier, serviceEntity); + startCapture(); serviceEntityReg.close(); serviceEntityReg = null; + + // This can potentially mutate our state, so all previous checks need to be re-validated. + endCapture().forEach(this::lockedOwnershipChanged); } - switch (state) { - case INITIAL: - // Not started: not much to do - terminate(future); - break; - case TERMINATED: - // Already done: no-op - break; + // Now check service entity state: if it is still owned, we need to wait until it is acknowledged as + // unregistered. + switch (serviceEntityState) { case REGISTERED: - case STANDBY: - LOG.debug("Service group {} terminated", identifier); - terminate(future); - break; - case OWNER: - // No-op, we will react to the loss of registration instead. - break; - case STOPPING_SERVICES: - // Waiting for services. Will resume once we get notified. + case UNOWNED: + case UNREGISTERED: + // We have either successfully shut down, or have never started up, proceed with termination break; - case RELEASING_OWNERSHIP: - // Waiting for cleanup entity to flip, will resume afterwards. - break; - case TAKING_OWNERSHIP: - // Abort taking of ownership and close - LOG.debug("Service group {} aborting ownership bid", identifier); - cleanupEntityReg.close(); - cleanupEntityReg = null; - updateState(State.RELEASING_OWNERSHIP); + case OWNED: + // We have unregistered, but EOS has not reported our loss of ownership. We will continue with shutdown + // when that loss is reported. + LOG.debug("Service group {} is still owned, postponing termination", identifier); + return; + case OWNED_JEOPARDY: + // This is a significant event, as it relates to cluster split/join operations, operators need to know + // we are waiting for cluster join event. + LOG.info("Service group {} is still owned with split cluster, postponing termination", identifier); + return; + default: + throw new IllegalStateException("Unhandled service entity state " + serviceEntityState); + } + + // We do not own service entity state: we need to ensure services are stopped. + if (stopServices()) { + LOG.debug("Service group {} started shutting down services, postponing termination", identifier); + return; + } + + // Local cleanup completed, release cleanup entity if needed + if (cleanupEntityReg != null) { + LOG.debug("Service group {} unregistering cleanup entity {}", identifier, cleanupEntity); + startCapture(); + cleanupEntityReg.close(); + cleanupEntityReg = null; + + // This can potentially mutate our state, so all previous checks need to be re-validated. + endCapture().forEach(this::lockedOwnershipChanged); + } + + switch (cleanupEntityState) { + case REGISTERED: + case UNOWNED: + case UNREGISTERED: + // We have either successfully shut down, or have never started up, proceed with termination break; + case OWNED: + // We have unregistered, but EOS has not reported our loss of ownership. We will continue with shutdown + // when that loss is reported. + LOG.debug("Service group {} is still owns cleanup, postponing termination", identifier); + return; + case OWNED_JEOPARDY: + // This is a significant event, as it relates to cluster split/join operations, operators need to know + // we are waiting for cluster join event. + LOG.info("Service group {} is still owns cleanup with split cluster, postponing termination", + identifier); + return; default: - throw new IllegalStateException("Unhandled state " + state); + throw new IllegalStateException("Unhandled cleanup entity state " + serviceEntityState); } - } - @GuardedBy("lock") - private void terminate(final SettableFuture future) { - updateState(State.TERMINATED); - Verify.verify(future.set(null)); + // No registrations left and no service operations pending, we are done + LOG.debug("Service group {} completing termination", identifier); + future.set(null); } @Override void initialize() throws CandidateAlreadyRegisteredException { - LOG.debug("Initialization ClusterSingletonGroup {}", identifier); - lock.lock(); try { - Preconditions.checkState(state == State.INITIAL, "Unexpected singleton group %s state %s", identifier, - state); + Preconditions.checkState(serviceEntityState == EntityState.UNREGISTERED, + "Singleton group %s was already initilized", identifier); - // Catch events if they fire during this call - capture = new ArrayList<>(0); + LOG.debug("Initializing service group {} with services {}", identifier, serviceGroup); + startCapture(); serviceEntityReg = entityOwnershipService.registerCandidate(serviceEntity); - state = State.REGISTERED; - - final List captured = capture; - capture = null; - captured.forEach(this::lockedOwnershipChanged); + serviceEntityState = EntityState.REGISTERED; + endCapture().forEach(this::lockedOwnershipChanged); } finally { lock.unlock(); } @@ -272,23 +359,28 @@ final class ClusterSingletonServiceGroupImpl

, E extends Generi Verify.verify(identifier.equals(service.getIdentifier().getValue())); checkNotClosed(); - LOG.debug("RegisterService method call for ClusterSingletonServiceGroup {}", identifier); - lock.lock(); try { - Preconditions.checkState(state != State.INITIAL, "Service group %s is not initialized yet", identifier); + Preconditions.checkState(serviceEntityState != EntityState.UNREGISTERED, + "Service group %s is not initialized yet", identifier); + + LOG.debug("Adding service {} to service group {}", service, identifier); serviceGroup.add(service); - switch (state) { - case OWNER: - case STARTING_SERVICES: + switch (localServicesState) { + case STARTED: + LOG.debug("Service group {} starting late-registered service {}", identifier, service); service.instantiateServiceInstance(); break; - default: + case STOPPED: + case STOPPING: break; + default: + throw new IllegalStateException("Unhandled local services state " + localServicesState); } } finally { lock.unlock(); + finishCloseIfNeeded(); } } @@ -310,13 +402,16 @@ final class ClusterSingletonServiceGroupImpl

, E extends Generi Verify.verify(serviceGroup.remove(service)); LOG.debug("Service {} was removed from group.", service.getIdentifier().getValue()); - switch (state) { - case OWNER: - case STARTING_SERVICES: + switch (localServicesState) { + case STARTED: + LOG.warn("Service group {} stopping unregistered service {}", identifier, service); service.closeServiceInstance(); break; - default: + case STOPPED: + case STOPPING: break; + default: + throw new IllegalStateException("Unhandled local services state " + localServicesState); } return false; @@ -343,77 +438,161 @@ final class ClusterSingletonServiceGroupImpl

, E extends Generi } } + /** + * Handle an ownership change with the lock held. Callers are expected to handle termination conditions, this method + * and anything it calls must not call {@link #lockedClose(SettableFuture)}. + * + * @param ownershipChange reported change + */ + @GuardedBy("lock") private void lockedOwnershipChanged(final C ownershipChange) { - if (ownershipChange.inJeopardy()) { - LOG.warn("Cluster Node lost connection to another cluster nodes {}", ownershipChange); - lostOwnership(); - return; - } - final E entity = ownershipChange.getEntity(); if (serviceEntity.equals(entity)) { - serviceOwnershipChanged(ownershipChange); + serviceOwnershipChanged(ownershipChange.getState(), ownershipChange.inJeopardy()); } else if (cleanupEntity.equals(entity)) { - cleanupCandidateOwnershipChanged(ownershipChange); + cleanupCandidateOwnershipChanged(ownershipChange.getState(), ownershipChange.inJeopardy()); } else { LOG.warn("Group {} received unrecognized change {}", identifier, ownershipChange); } } - private void cleanupCandidateOwnershipChanged(final C ownershipChange) { - switch (ownershipChange.getState()) { + private void cleanupCandidateOwnershipChanged(final EntityOwnershipChangeState state, final boolean jeopardy) { + if (jeopardy) { + switch (state) { + case LOCAL_OWNERSHIP_GRANTED: + case LOCAL_OWNERSHIP_RETAINED_WITH_NO_CHANGE: + if (cleanupEntityReg == null) { + LOG.debug("Service group {} ignoring cleanup entity ownership when unregistered", identifier); + return; + } + + LOG.warn("Service group {} cleanup entity owned without certainty", identifier); + cleanupEntityState = EntityState.OWNED_JEOPARDY; + break; + case LOCAL_OWNERSHIP_LOST_NEW_OWNER: + case LOCAL_OWNERSHIP_LOST_NO_OWNER: + case REMOTE_OWNERSHIP_CHANGED: + case REMOTE_OWNERSHIP_LOST_NO_OWNER: + LOG.info("Service group {} cleanup entity ownership uncertain", identifier); + cleanupEntityState = EntityState.UNOWNED; + break; + default: + throw new IllegalStateException("Unhandled cleanup entity jeopardy change " + state); + } + + stopServices(); + return; + } + + if (cleanupEntityState == EntityState.OWNED_JEOPARDY) { + // Pair info message with previous jeopardy + LOG.info("Service group {} cleanup entity ownership ascertained", identifier); + } + + switch (state) { case LOCAL_OWNERSHIP_GRANTED: - switch (state) { - case TAKING_OWNERSHIP: - // SLAVE to MASTER + case LOCAL_OWNERSHIP_RETAINED_WITH_NO_CHANGE: + if (cleanupEntityReg == null) { + LOG.debug("Service group {} ignoring cleanup entity ownership when unregistered", identifier); + return; + } + + cleanupEntityState = EntityState.OWNED; + switch (localServicesState) { + case STARTED: + LOG.debug("Service group {} already has local services running", identifier); + break; + case STOPPED: startServices(); - return; - default: break; + case STOPPING: + LOG.debug("Service group {} has local services stopping, postponing startup", identifier); + break; + default: + throw new IllegalStateException("Unhandled local services state " + localServicesState); } break; case LOCAL_OWNERSHIP_LOST_NEW_OWNER: case LOCAL_OWNERSHIP_LOST_NO_OWNER: - switch (state) { - case RELEASING_OWNERSHIP: - // Slight cheat: if we are closing down, we just need to notify the future - updateState(isClosed() ? State.INITIAL : State.STANDBY); - return; - case STARTING_SERVICES: - case OWNER: - case TAKING_OWNERSHIP: - LOG.warn("Group {} lost cleanup ownership in state {}", identifier, state); - return; - default: - break; - } - + cleanupEntityState = EntityState.UNOWNED; + stopServices(); break; - case LOCAL_OWNERSHIP_RETAINED_WITH_NO_CHANGE: - case REMOTE_OWNERSHIP_CHANGED: case REMOTE_OWNERSHIP_LOST_NO_OWNER: + case REMOTE_OWNERSHIP_CHANGED: + cleanupEntityState = EntityState.UNOWNED; + break; default: + LOG.warn("Service group {} ignoring unhandled cleanup entity change {}", identifier, state); break; } - - LOG.debug("Group {} in state {} ignoring cleanup OwnershipChange {}", identifier, state, ownershipChange); } - private void serviceOwnershipChanged(final C ownershipChange) { - switch (ownershipChange.getState()) { + private void serviceOwnershipChanged(final EntityOwnershipChangeState state, final boolean jeopardy) { + if (jeopardy) { + LOG.info("Service group {} service entity ownership uncertain", identifier); + + // Service entity ownership is uncertain, which means we want to record the state, but we do not want + // to stop local services nor do anything with the cleanup entity. + switch (state) { + case LOCAL_OWNERSHIP_GRANTED: + case LOCAL_OWNERSHIP_RETAINED_WITH_NO_CHANGE: + if (serviceEntityReg == null) { + LOG.debug("Service group {} ignoring service entity ownership when unregistered", identifier); + return; + } + + serviceEntityState = EntityState.OWNED_JEOPARDY; + break; + case LOCAL_OWNERSHIP_LOST_NEW_OWNER: + case LOCAL_OWNERSHIP_LOST_NO_OWNER: + case REMOTE_OWNERSHIP_CHANGED: + case REMOTE_OWNERSHIP_LOST_NO_OWNER: + serviceEntityState = EntityState.UNOWNED; + break; + default: + throw new IllegalStateException("Unhandled cleanup entity jeopardy change " + state); + } + return; + } + + if (serviceEntityState == EntityState.OWNED_JEOPARDY) { + // Pair info message with previous jeopardy + LOG.info("Service group {} service entity ownership ascertained", identifier); + } + + switch (state) { case LOCAL_OWNERSHIP_GRANTED: - // SLAVE to MASTER : ownershipChange.getState().isOwner() && !ownershipChange.getState().wasOwner() + case LOCAL_OWNERSHIP_RETAINED_WITH_NO_CHANGE: + if (serviceEntityReg == null) { + LOG.debug("Service group {} ignoring service entity ownership when unregistered", identifier); + return; + } + + serviceEntityState = EntityState.OWNED; takeOwnership(); break; case LOCAL_OWNERSHIP_LOST_NEW_OWNER: case LOCAL_OWNERSHIP_LOST_NO_OWNER: - // MASTER to SLAVE : !ownershipChange.getState().isOwner() && ownershipChange.getState().wasOwner() - lostOwnership(); + LOG.debug("Service group {} lost service entity ownership", identifier); + serviceEntityState = EntityState.UNOWNED; + if (stopServices()) { + LOG.debug("Service group {} already stopping services, postponing cleanup", identifier); + return; + } + + if (cleanupEntityReg != null) { + cleanupEntityReg.close(); + cleanupEntityReg = null; + } + break; + case REMOTE_OWNERSHIP_CHANGED: + case REMOTE_OWNERSHIP_LOST_NO_OWNER: + // No need to react, just update the state + serviceEntityState = EntityState.UNOWNED; break; default: - // Not needed notifications - LOG.debug("Group {} in state {} not processed entity OwnershipChange {}", identifier, state, - ownershipChange); + LOG.warn("Service group {} ignoring unhandled cleanup entity change {}", identifier, state); + break; } } @@ -435,18 +614,21 @@ final class ClusterSingletonServiceGroupImpl

, E extends Generi */ private void takeOwnership() { if (isClosed()) { - LOG.debug("Service group {} is closed, not taking ownership", identifier); + LOG.debug("Service group {} is closed, skipping cleanup ownership bid", identifier); return; } - LOG.debug("Group {} taking ownership", identifier); + LOG.debug("Service group {} registering cleanup entity", identifier); - updateState(State.TAKING_OWNERSHIP); + startCapture(); try { cleanupEntityReg = entityOwnershipService.registerCandidate(cleanupEntity); + cleanupEntityState = EntityState.REGISTERED; } catch (CandidateAlreadyRegisteredException e) { LOG.error("Service group {} failed to take ownership", identifier, e); } + + endCapture().forEach(this::lockedOwnershipChanged); } /* @@ -469,84 +651,108 @@ final class ClusterSingletonServiceGroupImpl

, E extends Generi } }); + localServicesState = ServiceState.STARTED; LOG.debug("Service group {} services started", identifier); - updateState(State.OWNER); } - /* - * Help method calls suspendService method for stop this single cluster-wide service instance. - * The last async. step has to close DoubleCandidateRegistration reference what should initialize - * new election for DoubleCandidateEntity. - */ - private void lostOwnership() { - LOG.debug("Service group {} lost ownership in state {}", identifier, state); - switch (state) { - case REGISTERED: - updateState(State.STANDBY); - break; - case OWNER: - stopServices(); - break; - case STARTING_SERVICES: - case STOPPING_SERVICES: - // No-op, as these will re-check state before proceeding - break; - case TAKING_OWNERSHIP: - cleanupEntityReg.close(); - cleanupEntityReg = null; - updateState(State.STANDBY); - break; - case INITIAL: - case TERMINATED: + @SuppressWarnings("checkstyle:IllegalCatch") + boolean stopServices() { + switch (localServicesState) { + case STARTED: + localServicesState = ServiceState.STOPPING; + + final List> serviceCloseFutureList = new ArrayList<>(serviceGroup.size()); + for (final ClusterSingletonService service : serviceGroup) { + final ListenableFuture future; + + try { + future = service.closeServiceInstance(); + } catch (Exception e) { + LOG.warn("Service group {} service {} failed to stop, attempting to continue", identifier, + service, e); + continue; + } + + serviceCloseFutureList.add(future); + } + + LOG.debug("Service group {} initiated service shutdown", identifier); + + Futures.addCallback(Futures.allAsList(serviceCloseFutureList), new FutureCallback>() { + @Override + public void onFailure(final Throwable cause) { + LOG.warn("Service group {} service stopping reported error", identifier, cause); + onServicesStopped(); + } + + @Override + public void onSuccess(final List nulls) { + onServicesStopped(); + } + }, MoreExecutors.directExecutor()); + + return localServicesState == ServiceState.STOPPING; + case STOPPED: + LOG.debug("Service group {} has already stopped services", identifier); + return false; + case STOPPING: + LOG.debug("Service group {} is already stopping services", identifier); + return true; default: - LOG.info("Service group {} ignoring lost ownership in state {},", identifier, state); - break; + throw new IllegalStateException("Unhandled local services state " + localServicesState); } } - @SuppressWarnings("checkstyle:IllegalCatch") - void stopServices() { - updateState(State.STOPPING_SERVICES); - - final List> serviceCloseFutureList = new ArrayList<>(serviceGroup.size()); - for (final ClusterSingletonService service : serviceGroup) { - final ListenableFuture future; + void onServicesStopped() { + LOG.debug("Service group {} finished stopping services", identifier); + lock.lock(); + try { + localServicesState = ServiceState.STOPPED; - try { - future = service.closeServiceInstance(); - } catch (Exception e) { - LOG.warn("Service group {} service {} failed to stop, attempting to continue", identifier, - service, e); - continue; + if (isClosed()) { + LOG.debug("Service group {} closed, skipping service restart check", identifier); + return; } - serviceCloseFutureList.add(future); - } - - Futures.addCallback(Futures.allAsList(serviceCloseFutureList), new FutureCallback>() { - @Override - public void onFailure(final Throwable cause) { - LOG.warn("Service group {} service stopping reported error", identifier, cause); - onServicesStopped(); + // If we lost the service entity while services were stopping, we need to unregister cleanup entity + switch (serviceEntityState) { + case OWNED: + case OWNED_JEOPARDY: + // No need to churn cleanup entity + break; + case REGISTERED: + case UNOWNED: + case UNREGISTERED: + if (cleanupEntityReg != null) { + startCapture(); + cleanupEntityReg.close(); + cleanupEntityReg = null; + endCapture().forEach(this::lockedOwnershipChanged); + } + break; + default: + throw new IllegalStateException("Unhandled service entity state" + serviceEntityState); } - @Override - public void onSuccess(final List nulls) { - onServicesStopped(); + if (cleanupEntityReg == null) { + LOG.debug("Service group {} does not have cleanup entity registered, skipping restart check", + identifier); + return; } - }, MoreExecutors.directExecutor()); - } - void onServicesStopped() { - LOG.debug("Service group {} finished stopping services", identifier); - lock.lock(); - try { - if (cleanupEntityReg != null) { - updateState(State.RELEASING_OWNERSHIP); - cleanupEntityReg.close(); - cleanupEntityReg = null; - } else { - updateState(State.STANDBY); + // Double-check if the services should really be down + switch (cleanupEntityState) { + case OWNED: + // We have finished stopping services, but we own cleanup, e.g. we should start them again. + startServices(); + return; + case UNOWNED: + case OWNED_JEOPARDY: + case REGISTERED: + case UNREGISTERED: + break; + default: + throw new IllegalStateException("Unhandled cleanup entity state" + cleanupEntityState); } } finally { lock.unlock(); @@ -556,6 +762,6 @@ final class ClusterSingletonServiceGroupImpl

, E extends Generi @Override public String toString() { - return MoreObjects.toStringHelper(this).add("identifier", identifier).add("state", state).toString(); + return MoreObjects.toStringHelper(this).add("identifier", identifier).toString(); } } diff --git a/singleton-service/mdsal-singleton-dom-impl/src/test/java/org/opendaylight/mdsal/singleton/dom/impl/AbstractDOMClusterServiceProviderTest.java b/singleton-service/mdsal-singleton-dom-impl/src/test/java/org/opendaylight/mdsal/singleton/dom/impl/AbstractDOMClusterServiceProviderTest.java index eff1025c14..d3d6fc1e62 100644 --- a/singleton-service/mdsal-singleton-dom-impl/src/test/java/org/opendaylight/mdsal/singleton/dom/impl/AbstractDOMClusterServiceProviderTest.java +++ b/singleton-service/mdsal-singleton-dom-impl/src/test/java/org/opendaylight/mdsal/singleton/dom/impl/AbstractDOMClusterServiceProviderTest.java @@ -150,6 +150,15 @@ public abstract class AbstractDOMClusterServiceProviderTest { return new DOMEntityOwnershipChange(ENTITY, EntityOwnershipChangeState.REMOTE_OWNERSHIP_LOST_NO_OWNER); } + static final DOMEntityOwnershipChange getEntityToJeopardy() { + return new DOMEntityOwnershipChange(ENTITY, EntityOwnershipChangeState.REMOTE_OWNERSHIP_LOST_NO_OWNER, true); + } + + static final DOMEntityOwnershipChange getEntityMasterJeopardy() { + return new DOMEntityOwnershipChange(ENTITY, EntityOwnershipChangeState.LOCAL_OWNERSHIP_RETAINED_WITH_NO_CHANGE, + true); + } + static final DOMEntityOwnershipChange getDoubleEntityToMaster() { return new DOMEntityOwnershipChange(DOUBLE_ENTITY, EntityOwnershipChangeState.LOCAL_OWNERSHIP_GRANTED); } @@ -162,8 +171,14 @@ public abstract class AbstractDOMClusterServiceProviderTest { return new DOMEntityOwnershipChange(DOUBLE_ENTITY, EntityOwnershipChangeState.LOCAL_OWNERSHIP_LOST_NEW_OWNER); } - static final DOMEntityOwnershipChange getEntityToJeopardy() { - return new DOMEntityOwnershipChange(ENTITY, EntityOwnershipChangeState.REMOTE_OWNERSHIP_LOST_NO_OWNER, true); + static final DOMEntityOwnershipChange getDoubleEntityToJeopardy() { + return new DOMEntityOwnershipChange(DOUBLE_ENTITY, EntityOwnershipChangeState.REMOTE_OWNERSHIP_LOST_NO_OWNER, + true); + } + + static final DOMEntityOwnershipChange getDoubleEntityMasterJeopardy() { + return new DOMEntityOwnershipChange(DOUBLE_ENTITY, + EntityOwnershipChangeState.LOCAL_OWNERSHIP_RETAINED_WITH_NO_CHANGE, true); } /** diff --git a/singleton-service/mdsal-singleton-dom-impl/src/test/java/org/opendaylight/mdsal/singleton/dom/impl/ClusterSingletonServiceGroupImplTest.java b/singleton-service/mdsal-singleton-dom-impl/src/test/java/org/opendaylight/mdsal/singleton/dom/impl/ClusterSingletonServiceGroupImplTest.java index 4f45ce379f..83ed842d50 100644 --- a/singleton-service/mdsal-singleton-dom-impl/src/test/java/org/opendaylight/mdsal/singleton/dom/impl/ClusterSingletonServiceGroupImplTest.java +++ b/singleton-service/mdsal-singleton-dom-impl/src/test/java/org/opendaylight/mdsal/singleton/dom/impl/ClusterSingletonServiceGroupImplTest.java @@ -366,7 +366,13 @@ public class ClusterSingletonServiceGroupImplTest { verify(mockEosService).registerCandidate(CLOSE_ENTITY); singletonServiceGroup.ownershipChanged(getDoubleEntityToMaster()); verify(mockClusterSingletonService).instantiateServiceInstance(); + + // Base entity in jeopardy should not matter... singletonServiceGroup.ownershipChanged(getEntityToJeopardy()); + verify(mockClusterSingletonService, never()).closeServiceInstance(); + + // ... application state is actually guarded by cleanup + singletonServiceGroup.ownershipChanged(getDoubleEntityToJeopardy()); verify(mockClusterSingletonService).closeServiceInstance(); } @@ -437,7 +443,7 @@ public class ClusterSingletonServiceGroupImplTest { singletonServiceGroup.ownershipChanged(getDoubleEntityToMaster()); verify(mockClusterSingletonService).instantiateServiceInstance(); singletonServiceGroup.ownershipChanged(getDoubleEntityToSlave()); - verify(mockClusterSingletonService, never()).closeServiceInstance(); + verify(mockClusterSingletonService).closeServiceInstance(); } /** @@ -511,6 +517,11 @@ public class ClusterSingletonServiceGroupImplTest { EntityOwnershipChangeState.LOCAL_OWNERSHIP_LOST_NO_OWNER); } + private static GenericEntityOwnershipChange getEntityToJeopardy() { + return new GenericEntityOwnershipChange<>(MAIN_ENTITY, + EntityOwnershipChangeState.REMOTE_OWNERSHIP_LOST_NO_OWNER, true); + } + private static GenericEntityOwnershipChange getDoubleEntityToMaster() { return new GenericEntityOwnershipChange<>(CLOSE_ENTITY, EntityOwnershipChangeState.LOCAL_OWNERSHIP_GRANTED); } @@ -524,8 +535,8 @@ public class ClusterSingletonServiceGroupImplTest { return new GenericEntityOwnershipChange<>(CLOSE_ENTITY, EntityOwnershipChangeState.REMOTE_OWNERSHIP_CHANGED); } - private static GenericEntityOwnershipChange getEntityToJeopardy() { - return new GenericEntityOwnershipChange<>(MAIN_ENTITY, + private static GenericEntityOwnershipChange getDoubleEntityToJeopardy() { + return new GenericEntityOwnershipChange<>(CLOSE_ENTITY, EntityOwnershipChangeState.REMOTE_OWNERSHIP_LOST_NO_OWNER, true); } } diff --git a/singleton-service/mdsal-singleton-dom-impl/src/test/java/org/opendaylight/mdsal/singleton/dom/impl/DOMClusterSingletonServiceProviderAsyncImplTest.java b/singleton-service/mdsal-singleton-dom-impl/src/test/java/org/opendaylight/mdsal/singleton/dom/impl/DOMClusterSingletonServiceProviderAsyncImplTest.java index 7e1180fa5b..fb6786d30e 100644 --- a/singleton-service/mdsal-singleton-dom-impl/src/test/java/org/opendaylight/mdsal/singleton/dom/impl/DOMClusterSingletonServiceProviderAsyncImplTest.java +++ b/singleton-service/mdsal-singleton-dom-impl/src/test/java/org/opendaylight/mdsal/singleton/dom/impl/DOMClusterSingletonServiceProviderAsyncImplTest.java @@ -114,7 +114,7 @@ public final class DOMClusterSingletonServiceProviderAsyncImplTest extends Abstr clusterSingletonServiceProvider.ownershipChanged(getDoubleEntityToMaster()); assertEquals(TestClusterSingletonServiceState.STARTED, clusterSingletonService.getServiceState()); clusterSingletonServiceProvider.ownershipChanged(getDoubleEntityToSlave()); - assertEquals(TestClusterSingletonServiceState.STARTED, clusterSingletonService.getServiceState()); + assertEquals(TestClusterSingletonServiceState.DESTROYED, clusterSingletonService.getServiceState()); Thread.sleep(ASYNC_TIME_DELAY_MILLIS * 2); verify(mockEosDoubleEntityListReg, never()).close(); verify(mockEntityCandReg, never()).close(); @@ -149,13 +149,15 @@ public final class DOMClusterSingletonServiceProviderAsyncImplTest extends Abstr assertEquals(TestClusterSingletonServiceState.INITIALIZED, clusterSingletonService.getServiceState()); clusterSingletonServiceProvider.ownershipChanged(getDoubleEntityToMaster()); assertEquals(TestClusterSingletonServiceState.STARTED, clusterSingletonService.getServiceState()); - clusterSingletonServiceProvider.ownershipChanged(getEntityToJeopardy()); + clusterSingletonServiceProvider.ownershipChanged(getEntityMasterJeopardy()); + assertEquals(TestClusterSingletonServiceState.STARTED, clusterSingletonService.getServiceState()); + clusterSingletonServiceProvider.ownershipChanged(getDoubleEntityMasterJeopardy()); assertEquals(TestClusterSingletonServiceState.DESTROYED, clusterSingletonService.getServiceState()); Thread.sleep(ASYNC_TIME_DELAY_MILLIS * 2); verify(mockEosEntityListReg, never()).close(); verify(mockEosDoubleEntityListReg, never()).close(); verify(mockEntityCandReg, never()).close(); - verify(mockDoubleEntityCandReg).close(); + verify(mockDoubleEntityCandReg, never()).close(); } /** diff --git a/singleton-service/mdsal-singleton-dom-impl/src/test/java/org/opendaylight/mdsal/singleton/dom/impl/DOMClusterSingletonServiceProviderImplTest.java b/singleton-service/mdsal-singleton-dom-impl/src/test/java/org/opendaylight/mdsal/singleton/dom/impl/DOMClusterSingletonServiceProviderImplTest.java index 67236c8047..16ee48a952 100644 --- a/singleton-service/mdsal-singleton-dom-impl/src/test/java/org/opendaylight/mdsal/singleton/dom/impl/DOMClusterSingletonServiceProviderImplTest.java +++ b/singleton-service/mdsal-singleton-dom-impl/src/test/java/org/opendaylight/mdsal/singleton/dom/impl/DOMClusterSingletonServiceProviderImplTest.java @@ -12,7 +12,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.mockito.Mockito.atLeastOnce; import static org.mockito.Mockito.never; -import static org.mockito.Mockito.reset; +import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; import org.junit.Before; @@ -84,7 +84,7 @@ public class DOMClusterSingletonServiceProviderImplTest extends AbstractDOMClust clusterSingletonServiceProvider.ownershipChanged(getDoubleEntityToMaster()); assertEquals(TestClusterSingletonServiceState.STARTED, clusterSingletonService.getServiceState()); clusterSingletonServiceProvider.ownershipChanged(getDoubleEntityToSlave()); - assertEquals(TestClusterSingletonServiceState.STARTED, clusterSingletonService.getServiceState()); + assertEquals(TestClusterSingletonServiceState.DESTROYED, clusterSingletonService.getServiceState()); verify(mockEosDoubleEntityListReg, never()).close(); verify(mockEntityCandReg, never()).close(); verify(mockDoubleEntityCandReg, never()).close(); @@ -93,12 +93,11 @@ public class DOMClusterSingletonServiceProviderImplTest extends AbstractDOMClust verify(mockEosDoubleEntityListReg, never()).close(); verify(mockEntityCandReg, atLeastOnce()).close(); verify(mockDoubleEntityCandReg, never()).close(); - assertEquals(TestClusterSingletonServiceState.STARTED, clusterSingletonService.getServiceState()); + assertEquals(TestClusterSingletonServiceState.DESTROYED, clusterSingletonService.getServiceState()); clusterSingletonServiceProvider.ownershipChanged(getEntityToSlave()); verify(mockEntityCandReg, atLeastOnce()).close(); verify(mockDoubleEntityCandReg, atLeastOnce()).close(); verify(mockEosDoubleEntityListReg, never()).close(); - assertEquals(TestClusterSingletonServiceState.DESTROYED, clusterSingletonService.getServiceState()); } /** @@ -119,12 +118,12 @@ public class DOMClusterSingletonServiceProviderImplTest extends AbstractDOMClust assertEquals(TestClusterSingletonServiceState.INITIALIZED, clusterSingletonService.getServiceState()); clusterSingletonServiceProvider.ownershipChanged(getDoubleEntityToMaster()); assertEquals(TestClusterSingletonServiceState.STARTED, clusterSingletonService.getServiceState()); - clusterSingletonServiceProvider.ownershipChanged(getEntityToJeopardy()); + clusterSingletonServiceProvider.ownershipChanged(getDoubleEntityToJeopardy()); assertEquals(TestClusterSingletonServiceState.DESTROYED, clusterSingletonService.getServiceState()); verify(mockEosEntityListReg, never()).close(); verify(mockEosDoubleEntityListReg, never()).close(); verify(mockEntityCandReg, never()).close(); - verify(mockDoubleEntityCandReg).close(); + verify(mockDoubleEntityCandReg, never()).close(); } /** @@ -236,9 +235,8 @@ public class DOMClusterSingletonServiceProviderImplTest extends AbstractDOMClust verify(mockDoubleEntityCandReg, never()).close(); // Instantiate the next incarnation - reset(mockEos); reg = clusterSingletonServiceProvider.registerClusterSingletonService(clusterSingletonService2); - verify(mockEos, never()).registerCandidate(ENTITY); + verify(mockEos).registerCandidate(ENTITY); assertEquals(TestClusterSingletonServiceState.INITIALIZED, clusterSingletonService2.getServiceState()); // Drive the old incarnation to closure, resetting mocks as needed @@ -250,12 +248,10 @@ public class DOMClusterSingletonServiceProviderImplTest extends AbstractDOMClust // Reset mocks for reuse. The next change should see the previous group terminate and the next incarnation // to start coming up - reset(mockEntityCandReg); - reset(mockDoubleEntityCandReg); clusterSingletonServiceProvider.ownershipChanged(getDoubleEntityToSlave()); - verify(mockEos).registerCandidate(ENTITY); + verify(mockEos, times(2)).registerCandidate(ENTITY); clusterSingletonServiceProvider.ownershipChanged(getEntityToMaster()); - verify(mockEos).registerCandidate(DOUBLE_ENTITY); + verify(mockEos, times(2)).registerCandidate(DOUBLE_ENTITY); clusterSingletonServiceProvider.ownershipChanged(getDoubleEntityToMaster()); assertEquals(TestClusterSingletonServiceState.STARTED, clusterSingletonService2.getServiceState()); -- 2.36.6