X-Git-Url: https://git.opendaylight.org/gerrit/gitweb?p=controller.git;a=blobdiff_plain;f=opendaylight%2Fmd-sal%2Fsal-distributed-datastore%2Fsrc%2Fmain%2Fjava%2Forg%2Fopendaylight%2Fcontroller%2Fcluster%2Fdatastore%2FShardDataTree.java;h=8f3a467d55de9cb739b6d24a7e5db6cfa34910cb;hp=5a7ce80ffd14cf61e4086bd3f3a7e7895f91f60e;hb=43130cfeb2a1ac9f733ac8a777cabb36ff1277af;hpb=925cb4a228d0fda99c7bfeb432eb25285a223887

diff --git a/opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/cluster/datastore/ShardDataTree.java b/opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/cluster/datastore/ShardDataTree.java
index 5a7ce80ffd..8f3a467d55 100644
--- a/opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/cluster/datastore/ShardDataTree.java
+++ b/opendaylight/md-sal/sal-distributed-datastore/src/main/java/org/opendaylight/controller/cluster/datastore/ShardDataTree.java
@@ -10,17 +10,20 @@ package org.opendaylight.controller.cluster.datastore;
 import akka.actor.ActorRef;
 import akka.util.Timeout;
 import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.MoreObjects;
 import com.google.common.base.Optional;
 import com.google.common.base.Preconditions;
 import com.google.common.base.Stopwatch;
+import com.google.common.base.Ticker;
 import com.google.common.base.Verify;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.ImmutableMap.Builder;
+import com.google.common.collect.Iterables;
 import com.google.common.primitives.UnsignedLong;
+import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
 import java.io.File;
 import java.io.IOException;
-import java.util.AbstractMap.SimpleEntry;
 import java.util.ArrayDeque;
 import java.util.ArrayList;
 import java.util.Collection;
@@ -32,15 +35,24 @@ import java.util.Queue;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
+import java.util.function.Consumer;
 import java.util.function.UnaryOperator;
 import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
 import javax.annotation.concurrent.NotThreadSafe;
 import org.opendaylight.controller.cluster.access.concepts.LocalHistoryIdentifier;
 import org.opendaylight.controller.cluster.access.concepts.TransactionIdentifier;
 import org.opendaylight.controller.cluster.datastore.DataTreeCohortActorRegistry.CohortRegistryCommand;
 import org.opendaylight.controller.cluster.datastore.ShardDataTreeCohort.State;
+import org.opendaylight.controller.cluster.datastore.jmx.mbeans.shard.ShardStats;
+import org.opendaylight.controller.cluster.datastore.persisted.AbortTransactionPayload;
+import org.opendaylight.controller.cluster.datastore.persisted.AbstractIdentifiablePayload;
+import org.opendaylight.controller.cluster.datastore.persisted.CloseLocalHistoryPayload;
 import org.opendaylight.controller.cluster.datastore.persisted.CommitTransactionPayload;
+import org.opendaylight.controller.cluster.datastore.persisted.CreateLocalHistoryPayload;
 import org.opendaylight.controller.cluster.datastore.persisted.MetadataShardDataTreeSnapshot;
+import org.opendaylight.controller.cluster.datastore.persisted.PurgeLocalHistoryPayload;
+import org.opendaylight.controller.cluster.datastore.persisted.PurgeTransactionPayload;
 import org.opendaylight.controller.cluster.datastore.persisted.ShardDataTreeSnapshot;
 import org.opendaylight.controller.cluster.datastore.persisted.ShardDataTreeSnapshotMetadata;
 import org.opendaylight.controller.cluster.datastore.utils.DataTreeModificationOutput;
@@ -51,7 +63,6 @@ import org.opendaylight.controller.md.sal.common.api.data.AsyncDataChangeListene
 import org.opendaylight.controller.md.sal.common.api.data.OptimisticLockFailedException;
 import org.opendaylight.controller.md.sal.common.api.data.TransactionCommitFailedException;
 import org.opendaylight.controller.md.sal.dom.api.DOMDataTreeChangeListener;
-import org.opendaylight.controller.md.sal.dom.store.impl.DataChangeListenerRegistration;
 import org.opendaylight.yangtools.concepts.Identifier;
 import org.opendaylight.yangtools.concepts.ListenerRegistration;
 import org.opendaylight.yangtools.yang.data.api.YangInstanceIdentifier;
@@ -62,9 +73,10 @@ import org.opendaylight.yangtools.yang.data.api.schema.tree.DataTreeCandidateTip
 import org.opendaylight.yangtools.yang.data.api.schema.tree.DataTreeCandidates;
 import org.opendaylight.yangtools.yang.data.api.schema.tree.DataTreeModification;
 import org.opendaylight.yangtools.yang.data.api.schema.tree.DataTreeSnapshot;
+import org.opendaylight.yangtools.yang.data.api.schema.tree.DataTreeTip;
 import org.opendaylight.yangtools.yang.data.api.schema.tree.DataValidationFailedException;
-import org.opendaylight.yangtools.yang.data.api.schema.tree.ModificationType;
 import org.opendaylight.yangtools.yang.data.api.schema.tree.TipProducingDataTree;
+import org.opendaylight.yangtools.yang.data.api.schema.tree.TipProducingDataTreeTip;
 import org.opendaylight.yangtools.yang.data.api.schema.tree.TreeType;
 import org.opendaylight.yangtools.yang.data.impl.schema.tree.InMemoryDataTreeFactory;
 import org.opendaylight.yangtools.yang.model.api.SchemaContext;
@@ -76,7 +88,8 @@ import scala.concurrent.duration.Duration;
  * Internal shard state, similar to a DOMStore, but optimized for use in the actor system,
  * e.g. it does not expose public interfaces and assumes it is only ever called from a
  * single thread.
- *<p/>
+ *
+ * <p>
  * This class is not part of the API contract and is subject to change at any time.
  */
 @NotThreadSafe
@@ -94,9 +107,25 @@ public class ShardDataTree extends ShardDataTreeTransactionParent {
     private static final Timeout COMMIT_STEP_TIMEOUT = new Timeout(Duration.create(5, TimeUnit.SECONDS));
     private static final Logger LOG = LoggerFactory.getLogger(ShardDataTree.class);
 
+    /**
+     * Process this many transactions in a single batched run. If we exceed this limit, we need to schedule later
+     * execution to finish up the batch. This is necessary in case of a long list of transactions which progress
+     * immediately through their preCommit phase -- if that happens, their completion eats up stack frames and could
+     * result in StackOverflowError.
+     */
+    private static final int MAX_TRANSACTION_BATCH = 100;
+
     private final Map<LocalHistoryIdentifier, ShardDataTreeTransactionChain> transactionChains = new HashMap<>();
     private final DataTreeCohortActorRegistry cohortRegistry = new DataTreeCohortActorRegistry();
     private final Queue<CommitEntry> pendingTransactions = new ArrayDeque<>();
+    private final Queue<CommitEntry> pendingCommits = new ArrayDeque<>();
+    private final Queue<CommitEntry> pendingFinishCommits = new ArrayDeque<>();
+
+    /**
+     * Callbacks that need to be invoked once a payload is replicated.
+     */
+    private final Map<Payload, Runnable> replicationCallbacks = new HashMap<>();
+
     private final ShardDataTreeChangeListenerPublisher treeChangeListenerPublisher;
     private final ShardDataChangeListenerPublisher dataChangeListenerPublisher;
     private final Collection<ShardDataTreeMetadata<?>> metadata;
@@ -105,9 +134,19 @@
     private final Shard shard;
     private Runnable runOnPendingTransactionsComplete;
 
+    /**
+     * Optimistic {@link DataTreeCandidate} preparation. Since our DataTree implementation is a
+     * {@link TipProducingDataTree}, each {@link DataTreeCandidate} is also a {@link DataTreeTip}, e.g. another
+     * candidate can be prepared on top of it. They still need to be committed in sequence. Here we track the current
+     * tip of the data tree, which is the last DataTreeCandidate we have in flight, or the DataTree itself.
+     */
+    private TipProducingDataTreeTip tip;
+
     private SchemaContext schemaContext;
 
-    public ShardDataTree(final Shard shard, final SchemaContext schemaContext, final TipProducingDataTree dataTree,
+    private int currentTransactionBatch;
+
+    ShardDataTree(final Shard shard, final SchemaContext schemaContext, final TipProducingDataTree dataTree,
             final ShardDataTreeChangeListenerPublisher treeChangeListenerPublisher,
             final ShardDataChangeListenerPublisher dataChangeListenerPublisher, final String logContext,
             final ShardDataTreeMetadata<?>... metadata) {
@@ -119,25 +158,33 @@
         this.dataChangeListenerPublisher = Preconditions.checkNotNull(dataChangeListenerPublisher);
         this.logContext = Preconditions.checkNotNull(logContext);
         this.metadata = ImmutableList.copyOf(metadata);
+        tip = dataTree;
     }
 
-    public ShardDataTree(final Shard shard, final SchemaContext schemaContext, final TreeType treeType,
+    ShardDataTree(final Shard shard, final SchemaContext schemaContext, final TreeType treeType,
+            final YangInstanceIdentifier root,
             final ShardDataTreeChangeListenerPublisher treeChangeListenerPublisher,
-            final ShardDataChangeListenerPublisher dataChangeListenerPublisher, final String logContext) {
-        this(shard, schemaContext, InMemoryDataTreeFactory.getInstance().create(treeType),
-                treeChangeListenerPublisher, dataChangeListenerPublisher, logContext);
+            final ShardDataChangeListenerPublisher dataChangeListenerPublisher, final String logContext,
+            final ShardDataTreeMetadata<?>... metadata) {
+        this(shard, schemaContext, InMemoryDataTreeFactory.getInstance().create(treeType, root),
+                treeChangeListenerPublisher, dataChangeListenerPublisher, logContext, metadata);
     }
 
     @VisibleForTesting
     public ShardDataTree(final Shard shard, final SchemaContext schemaContext, final TreeType treeType) {
-        this(shard, schemaContext, treeType, new DefaultShardDataTreeChangeListenerPublisher(),
-                new DefaultShardDataChangeListenerPublisher(), "");
+        this(shard, schemaContext, treeType, YangInstanceIdentifier.EMPTY,
+                new DefaultShardDataTreeChangeListenerPublisher(""),
+                new DefaultShardDataChangeListenerPublisher(""), "");
     }
 
-    String logContext() {
+    final String logContext() {
         return logContext;
     }
 
+    final Ticker ticker() {
+        return shard.ticker();
+    }
+
     public TipProducingDataTree getDataTree() {
         return dataTree;
     }
@@ -151,6 +198,10 @@
         this.schemaContext = Preconditions.checkNotNull(newSchemaContext);
     }
 
+    void resetTransactionBatch() {
+        currentTransactionBatch = 0;
+    }
+
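The tip field documented above is the heart of this change: because the shard's DataTree is tip-producing, every prepared DataTreeCandidateTip is itself a DataTreeTip, so the next transaction can validate and prepare against state that has not been committed yet. A minimal sketch of that stacking, written against the yangtools interfaces this file already imports; the class itself is illustrative and not part of the patch:

    // Illustrative only -- shows candidates being prepared on top of one
    // another; they must still be committed in sequence.
    final class TipStackingSketch {
        private final TipProducingDataTree dataTree;
        private TipProducingDataTreeTip tip;

        TipStackingSketch(final TipProducingDataTree dataTree) {
            this.dataTree = dataTree;
            this.tip = dataTree;    // nothing in flight: the tree is the tip
        }

        DataTreeCandidateTip prepare(final DataTreeModification mod) throws DataValidationFailedException {
            tip.validate(mod);
            final DataTreeCandidateTip candidate = tip.prepare(mod);
            tip = candidate;        // later transactions see this uncommitted state
            return candidate;
        }

        void commit(final DataTreeCandidateTip candidate) {
            dataTree.commit(candidate);
            if (tip == candidate) {
                tip = dataTree;     // last in-flight candidate landed: reset
            }
        }
    }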
     /**
      * Take a snapshot of current state for later recovery.
      *
@@ -171,11 +222,15 @@
         return new MetadataShardDataTreeSnapshot(rootNode, metaBuilder.build());
     }
 
+    private boolean anyPendingTransactions() {
+        return !pendingTransactions.isEmpty() || !pendingCommits.isEmpty() || !pendingFinishCommits.isEmpty();
+    }
+
     private void applySnapshot(@Nonnull final ShardDataTreeSnapshot snapshot,
             final UnaryOperator<DataTreeModification> wrapper) throws DataValidationFailedException {
         final Stopwatch elapsed = Stopwatch.createStarted();
 
-        if (!pendingTransactions.isEmpty()) {
+        if (anyPendingTransactions()) {
             LOG.warn("{}: applying state snapshot with pending transactions", logContext);
         }
 
@@ -212,7 +267,7 @@
         dataTree.commit(candidate);
         notifyListeners(candidate);
 
-        LOG.debug("{}: state snapshot applied in %s", logContext, elapsed);
+        LOG.debug("{}: state snapshot applied in {}", logContext, elapsed);
     }
 
     /**
@@ -284,8 +339,16 @@
                     ((CommitTransactionPayload) payload).getCandidate();
             applyRecoveryCandidate(e.getValue());
             allMetadataCommittedTransaction(e.getKey());
-        } else if (payload instanceof DataTreeCandidatePayload) {
-            applyRecoveryCandidate(((DataTreeCandidatePayload) payload).getCandidate());
+        } else if (payload instanceof AbortTransactionPayload) {
+            allMetadataAbortedTransaction(((AbortTransactionPayload) payload).getIdentifier());
+        } else if (payload instanceof PurgeTransactionPayload) {
+            allMetadataPurgedTransaction(((PurgeTransactionPayload) payload).getIdentifier());
+        } else if (payload instanceof CreateLocalHistoryPayload) {
+            allMetadataCreatedLocalHistory(((CreateLocalHistoryPayload) payload).getIdentifier());
+        } else if (payload instanceof CloseLocalHistoryPayload) {
+            allMetadataClosedLocalHistory(((CloseLocalHistoryPayload) payload).getIdentifier());
+        } else if (payload instanceof PurgeLocalHistoryPayload) {
+            allMetadataPurgedLocalHistory(((PurgeLocalHistoryPayload) payload).getIdentifier());
         } else {
             LOG.debug("{}: ignoring unhandled payload {}", logContext, payload);
         }
@@ -330,29 +393,74 @@
          * pre-Boron state -- which limits the number of options here.
          */
         if (payload instanceof CommitTransactionPayload) {
+            final TransactionIdentifier txId;
             if (identifier == null) {
                 final Entry<TransactionIdentifier, DataTreeCandidate> e =
                         ((CommitTransactionPayload) payload).getCandidate();
-                applyReplicatedCandidate(e.getKey(), e.getValue());
-                allMetadataCommittedTransaction(e.getKey());
+                txId = e.getKey();
+                applyReplicatedCandidate(txId, e.getValue());
             } else {
                 Verify.verify(identifier instanceof TransactionIdentifier);
-                payloadReplicationComplete((TransactionIdentifier) identifier);
+                txId = (TransactionIdentifier) identifier;
+                payloadReplicationComplete(txId);
+            }
+            allMetadataCommittedTransaction(txId);
+        } else if (payload instanceof AbortTransactionPayload) {
+            if (identifier != null) {
+                payloadReplicationComplete((AbortTransactionPayload) payload);
+            }
+            allMetadataAbortedTransaction(((AbortTransactionPayload) payload).getIdentifier());
+        } else if (payload instanceof PurgeTransactionPayload) {
+            if (identifier != null) {
+                payloadReplicationComplete((PurgeTransactionPayload) payload);
             }
+            allMetadataPurgedTransaction(((PurgeTransactionPayload) payload).getIdentifier());
+        } else if (payload instanceof CloseLocalHistoryPayload) {
+            if (identifier != null) {
+                payloadReplicationComplete((CloseLocalHistoryPayload) payload);
+            }
+            allMetadataClosedLocalHistory(((CloseLocalHistoryPayload) payload).getIdentifier());
+        } else if (payload instanceof CreateLocalHistoryPayload) {
+            if (identifier != null) {
+                payloadReplicationComplete((CreateLocalHistoryPayload) payload);
+            }
+            allMetadataCreatedLocalHistory(((CreateLocalHistoryPayload) payload).getIdentifier());
+        } else if (payload instanceof PurgeLocalHistoryPayload) {
+            if (identifier != null) {
+                payloadReplicationComplete((PurgeLocalHistoryPayload) payload);
+            }
+            allMetadataPurgedLocalHistory(((PurgeLocalHistoryPayload) payload).getIdentifier());
         } else {
             LOG.warn("{}: ignoring unhandled identifier {} payload {}", logContext, identifier, payload);
         }
     }
 
+    private void replicatePayload(final Identifier id, final Payload payload, @Nullable final Runnable callback) {
+        if (callback != null) {
+            replicationCallbacks.put(payload, callback);
+        }
+        shard.persistPayload(id, payload, true);
+    }
+
+    private void payloadReplicationComplete(final AbstractIdentifiablePayload<?> payload) {
+        final Runnable callback = replicationCallbacks.remove(payload);
+        if (callback != null) {
+            LOG.debug("{}: replication of {} completed, invoking {}", logContext, payload.getIdentifier(), callback);
+            callback.run();
+        } else {
+            LOG.debug("{}: replication of {} has no callback", logContext, payload.getIdentifier());
+        }
+    }
+
     private void payloadReplicationComplete(final TransactionIdentifier txId) {
-        final CommitEntry current = pendingTransactions.peek();
+        final CommitEntry current = pendingFinishCommits.peek();
         if (current == null) {
             LOG.warn("{}: No outstanding transactions, ignoring consensus on transaction {}", logContext, txId);
             return;
         }
 
         if (!current.cohort.getIdentifier().equals(txId)) {
-            LOG.warn("{}: Head of queue is {}, ignoring consensus on transaction {}", logContext,
+            LOG.debug("{}: Head of pendingFinishCommits queue is {}, ignoring consensus on transaction {}", logContext,
                 current.cohort.getIdentifier(), txId);
             return;
         }
 
@@ -360,17 +468,65 @@
         finishCommit(current.cohort);
     }
 
+    private void allMetadataAbortedTransaction(final TransactionIdentifier txId) {
+        for (ShardDataTreeMetadata<?> m : metadata) {
+            m.onTransactionAborted(txId);
+        }
+    }
+
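replicatePayload() and the payload-keyed payloadReplicationComplete() overload above form a fire-once callback contract: a callback is registered under the payload instance and invoked exactly once when that payload has been replicated. The same pattern in a self-contained, hypothetical form:

    // Hypothetical, stand-alone restatement of the callback contract.
    final class ReplicationCallbacksSketch<P> {
        private final Map<P, Runnable> callbacks = new HashMap<>();

        void replicate(final P payload, final Runnable callback) {
            if (callback != null) {
                callbacks.put(payload, callback);
            }
            // ... hand the payload to the persistence/replication layer ...
        }

        void replicationComplete(final P payload) {
            final Runnable callback = callbacks.remove(payload);
            if (callback != null) {
                callback.run();     // fires at most once per payload instance
            }
        }
    }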
     private void allMetadataCommittedTransaction(final TransactionIdentifier txId) {
         for (ShardDataTreeMetadata<?> m : metadata) {
             m.onTransactionCommitted(txId);
         }
     }
 
-    private ShardDataTreeTransactionChain ensureTransactionChain(final LocalHistoryIdentifier localHistoryIdentifier) {
-        ShardDataTreeTransactionChain chain = transactionChains.get(localHistoryIdentifier);
+    private void allMetadataPurgedTransaction(final TransactionIdentifier txId) {
+        for (ShardDataTreeMetadata<?> m : metadata) {
+            m.onTransactionPurged(txId);
+        }
+    }
+
+    private void allMetadataCreatedLocalHistory(final LocalHistoryIdentifier historyId) {
+        for (ShardDataTreeMetadata<?> m : metadata) {
+            m.onHistoryCreated(historyId);
+        }
+    }
+
+    private void allMetadataClosedLocalHistory(final LocalHistoryIdentifier historyId) {
+        for (ShardDataTreeMetadata<?> m : metadata) {
+            m.onHistoryClosed(historyId);
+        }
+    }
+
+    private void allMetadataPurgedLocalHistory(final LocalHistoryIdentifier historyId) {
+        for (ShardDataTreeMetadata<?> m : metadata) {
+            m.onHistoryPurged(historyId);
+        }
+    }
+
+    /**
+     * Create a transaction chain for specified history. Unlike
+     * {@link #ensureTransactionChain(LocalHistoryIdentifier)}, this method is used for re-establishing state when we
+     * are taking over as the shard leader.
+     *
+     * @param historyId Local history identifier
+     * @param closed True if the chain should be created in closed state (i.e. pending purge)
+     * @return Transaction chain handle
+     */
+    ShardDataTreeTransactionChain recreateTransactionChain(final LocalHistoryIdentifier historyId,
+            final boolean closed) {
+        final ShardDataTreeTransactionChain ret = new ShardDataTreeTransactionChain(historyId, this);
+        final ShardDataTreeTransactionChain existing = transactionChains.putIfAbsent(historyId, ret);
+        Preconditions.checkState(existing == null, "Attempted to recreate chain %s, but %s already exists", historyId,
+                existing);
+        return ret;
+    }
+
+    ShardDataTreeTransactionChain ensureTransactionChain(final LocalHistoryIdentifier historyId) {
+        ShardDataTreeTransactionChain chain = transactionChains.get(historyId);
         if (chain == null) {
-            chain = new ShardDataTreeTransactionChain(localHistoryIdentifier, this);
-            transactionChains.put(localHistoryIdentifier, chain);
+            chain = new ShardDataTreeTransactionChain(historyId, this);
+            transactionChains.put(historyId, chain);
+            replicatePayload(historyId, CreateLocalHistoryPayload.create(historyId), null);
         }
 
         return chain;
@@ -378,7 +534,7 @@
     }
 
     ReadOnlyShardDataTreeTransaction newReadOnlyTransaction(final TransactionIdentifier txId) {
         if (txId.getHistoryId().getHistoryId() == 0) {
-            return new ReadOnlyShardDataTreeTransaction(txId, dataTree.takeSnapshot());
+            return new ReadOnlyShardDataTreeTransaction(this, txId, dataTree.takeSnapshot());
         }
 
         return ensureTransactionChain(txId.getHistoryId()).newReadOnlyTransaction(txId);
@@ -395,78 +551,98 @@
 
     @VisibleForTesting
     public void notifyListeners(final DataTreeCandidate candidate) {
-        treeChangeListenerPublisher.publishChanges(candidate, logContext);
-        dataChangeListenerPublisher.publishChanges(candidate, logContext);
-    }
-
-    void notifyOfInitialData(final DataChangeListenerRegistration<AsyncDataChangeListener<YangInstanceIdentifier,
-            NormalizedNode<?, ?>>> listenerReg, final Optional<DataTreeCandidate> currentState) {
-        if (currentState.isPresent()) {
-            ShardDataChangeListenerPublisher localPublisher = dataChangeListenerPublisher.newInstance();
-            localPublisher.registerDataChangeListener(listenerReg.getPath(), listenerReg.getInstance(),
-                    listenerReg.getScope());
-            localPublisher.publishChanges(currentState.get(), logContext);
-        }
+        treeChangeListenerPublisher.publishChanges(candidate);
+        dataChangeListenerPublisher.publishChanges(candidate);
     }
 
-    void notifyOfInitialData(final YangInstanceIdentifier path, final DOMDataTreeChangeListener listener,
-            final Optional<DataTreeCandidate> currentState) {
-        if (currentState.isPresent()) {
-            ShardDataTreeChangeListenerPublisher localPublisher = treeChangeListenerPublisher.newInstance();
-            localPublisher.registerTreeChangeListener(path, listener);
-            localPublisher.publishChanges(currentState.get(), logContext);
-        }
-    }
-
-    void closeAllTransactionChains() {
+    /**
+     * Immediately purge all state relevant to leader. This includes all transaction chains and any scheduled
+     * replication callbacks.
+     */
+    void purgeLeaderState() {
         for (ShardDataTreeTransactionChain chain : transactionChains.values()) {
             chain.close();
         }
 
         transactionChains.clear();
+        replicationCallbacks.clear();
     }
 
-    void closeTransactionChain(final LocalHistoryIdentifier transactionChainId) {
-        final ShardDataTreeTransactionChain chain = transactionChains.remove(transactionChainId);
-        if (chain != null) {
-            chain.close();
-        } else {
-            LOG.debug("{}: Closing non-existent transaction chain {}", logContext, transactionChainId);
+    /**
+     * Close a single transaction chain.
+     *
+     * @param id History identifier
+     * @param callback Callback to invoke upon completion, may be null
+     */
+    void closeTransactionChain(final LocalHistoryIdentifier id, @Nullable final Runnable callback) {
+        final ShardDataTreeTransactionChain chain = transactionChains.get(id);
+        if (chain == null) {
+            LOG.debug("{}: Closing non-existent transaction chain {}", logContext, id);
+            if (callback != null) {
+                callback.run();
+            }
+            return;
         }
+
+        chain.close();
+        replicatePayload(id, CloseLocalHistoryPayload.create(id), callback);
     }
 
-    Entry<DataChangeListenerRegistration<AsyncDataChangeListener<YangInstanceIdentifier, NormalizedNode<?, ?>>>,
-            Optional<DataTreeCandidate>> registerChangeListener(final YangInstanceIdentifier path,
-            final AsyncDataChangeListener<YangInstanceIdentifier, NormalizedNode<?, ?>> listener,
-            final DataChangeScope scope) {
-        DataChangeListenerRegistration<AsyncDataChangeListener<YangInstanceIdentifier, NormalizedNode<?, ?>>> reg =
-                dataChangeListenerPublisher.registerDataChangeListener(path, listener, scope);
+    /**
+     * Purge a single transaction chain.
+     *
+     * @param id History identifier
+     * @param callback Callback to invoke upon completion, may be null
+     */
+    void purgeTransactionChain(final LocalHistoryIdentifier id, @Nullable final Runnable callback) {
+        final ShardDataTreeTransactionChain chain = transactionChains.remove(id);
+        if (chain == null) {
+            LOG.debug("{}: Purging non-existent transaction chain {}", logContext, id);
+            if (callback != null) {
+                callback.run();
+            }
+            return;
+        }
+
+        replicatePayload(id, PurgeLocalHistoryPayload.create(id), callback);
+    }
 
-        return new SimpleEntry<>(reg, readCurrentData());
+    void registerDataChangeListener(final YangInstanceIdentifier path,
+            final AsyncDataChangeListener<YangInstanceIdentifier, NormalizedNode<?, ?>> listener,
+            final DataChangeScope scope, final Optional<DataTreeCandidate> initialState,
+            final Consumer<ListenerRegistration<AsyncDataChangeListener<YangInstanceIdentifier, NormalizedNode<?, ?>>>>
+                    onRegistration) {
+        dataChangeListenerPublisher.registerDataChangeListener(path, listener, scope, initialState, onRegistration);
     }
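With ensureTransactionChain(), closeTransactionChain() and purgeTransactionChain() in place, a local history has a fully replicated lifecycle: CreateLocalHistoryPayload on first use, CloseLocalHistoryPayload when the frontend closes the chain, and PurgeLocalHistoryPayload once it can be forgotten. A hedged usage sketch of that ordering; the logging callbacks are made up:

    // Illustrative sequence only; identifiers and callbacks are invented.
    void historyLifecycle(final ShardDataTree tree, final LocalHistoryIdentifier historyId) {
        tree.ensureTransactionChain(historyId);     // replicates CreateLocalHistoryPayload
        tree.closeTransactionChain(historyId,
            () -> LOG.debug("close of {} replicated", historyId));  // CloseLocalHistoryPayload
        tree.purgeTransactionChain(historyId,
            () -> LOG.debug("purge of {} replicated", historyId));  // PurgeLocalHistoryPayload
    }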
 
-    private Optional<DataTreeCandidate> readCurrentData() {
+    Optional<DataTreeCandidate> readCurrentData() {
         final Optional<NormalizedNode<?, ?>> currentState =
                 dataTree.takeSnapshot().readNode(YangInstanceIdentifier.EMPTY);
         return currentState.isPresent() ? Optional.of(DataTreeCandidates.fromNormalizedNode(
             YangInstanceIdentifier.EMPTY, currentState.get())) : Optional.absent();
     }
 
-    public Entry<ListenerRegistration<DOMDataTreeChangeListener>, Optional<DataTreeCandidate>>
-            registerTreeChangeListener(final YangInstanceIdentifier path, final DOMDataTreeChangeListener listener) {
-        final ListenerRegistration<DOMDataTreeChangeListener> reg =
-                treeChangeListenerPublisher.registerTreeChangeListener(path, listener);
-
-        return new SimpleEntry<>(reg, readCurrentData());
+    public void registerTreeChangeListener(final YangInstanceIdentifier path, final DOMDataTreeChangeListener listener,
+            final Optional<DataTreeCandidate> initialState,
+            final Consumer<ListenerRegistration<DOMDataTreeChangeListener>> onRegistration) {
+        treeChangeListenerPublisher.registerTreeChangeListener(path, listener, initialState, onRegistration);
     }
 
     int getQueueSize() {
-        return pendingTransactions.size();
+        return pendingTransactions.size() + pendingCommits.size() + pendingFinishCommits.size();
     }
 
     @Override
-    void abortTransaction(final AbstractShardDataTreeTransaction<?> transaction) {
-        // Intentional no-op
+    void abortTransaction(final AbstractShardDataTreeTransaction<?> transaction, final Runnable callback) {
+        final TransactionIdentifier id = transaction.getIdentifier();
+        LOG.debug("{}: aborting transaction {}", logContext, id);
+        replicatePayload(id, AbortTransactionPayload.create(id), callback);
+    }
+
+    @Override
+    void abortFromTransactionActor(final AbstractShardDataTreeTransaction<?> transaction) {
+        // No-op for free-standing transactions
     }
 
     @Override
@@ -474,7 +650,12 @@
         final DataTreeModification snapshot = transaction.getSnapshot();
         snapshot.ready();
 
-        return createReadyCohort(transaction.getId(), snapshot);
+        return createReadyCohort(transaction.getIdentifier(), snapshot);
+    }
+
+    void purgeTransaction(final TransactionIdentifier id, final Runnable callback) {
+        LOG.debug("{}: purging transaction {}", logContext, id);
+        replicatePayload(id, PurgeTransactionPayload.create(id), callback);
     }
 
     public Optional<NormalizedNode<?, ?>> readNode(final YangInstanceIdentifier path) {
@@ -498,6 +679,9 @@
     @VisibleForTesting
     @Deprecated
     public DataTreeCandidate commit(final DataTreeModification modification) throws DataValidationFailedException {
+        // Direct modification commit is a utility, which cannot be used while we have transactions in-flight
+        Preconditions.checkState(tip == dataTree, "Cannot modify data tree while transactions are pending");
+
         modification.ready();
         dataTree.validate(modification);
         DataTreeCandidate candidate = dataTree.prepare(modification);
@@ -506,33 +690,57 @@
     }
 
     public Collection<ShardDataTreeCohort> getAndClearPendingTransactions() {
-        Collection<ShardDataTreeCohort> ret = new ArrayList<>(pendingTransactions.size());
+        Collection<ShardDataTreeCohort> ret = new ArrayList<>(getQueueSize());
+
+        for (CommitEntry entry: pendingFinishCommits) {
+            ret.add(entry.cohort);
+        }
+
+        for (CommitEntry entry: pendingCommits) {
+            ret.add(entry.cohort);
+        }
+
         for (CommitEntry entry: pendingTransactions) {
             ret.add(entry.cohort);
         }
 
+        pendingFinishCommits.clear();
+        pendingCommits.clear();
         pendingTransactions.clear();
+        tip = dataTree;
         return ret;
     }
 
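registerTreeChangeListener() now hands both the registration object and the optional initial state to callbacks instead of returning an Entry, which lets registration complete asynchronously. An illustrative caller, assuming only the methods in this patch; the listener body and logging are made up:

    // Illustrative caller; not part of the patch.
    void watch(final ShardDataTree tree, final YangInstanceIdentifier path) {
        final DOMDataTreeChangeListener listener =
            changes -> changes.forEach(change -> LOG.debug("change: {}", change));

        tree.registerTreeChangeListener(path, listener, tree.readCurrentData(),
            reg -> LOG.debug("registered: {}", reg));
    }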
+    /**
+     * Called some time after {@link #processNextPendingTransaction()} decides to stop processing.
+     */
+    void resumeNextPendingTransaction() {
+        LOG.debug("{}: attempting to resume transaction processing", logContext);
+        processNextPending();
+    }
+
     @SuppressWarnings("checkstyle:IllegalCatch")
-    private void processNextTransaction() {
-        while (!pendingTransactions.isEmpty()) {
-            final CommitEntry entry = pendingTransactions.peek();
+    private void processNextPendingTransaction() {
+        ++currentTransactionBatch;
+        if (currentTransactionBatch > MAX_TRANSACTION_BATCH) {
+            LOG.debug("{}: Already processed {}, scheduling continuation", logContext, currentTransactionBatch);
+            shard.scheduleNextPendingTransaction();
+            return;
+        }
+
+        processNextPending(pendingTransactions, State.CAN_COMMIT_PENDING, entry -> {
             final SimpleShardDataTreeCohort cohort = entry.cohort;
             final DataTreeModification modification = cohort.getDataTreeModification();
 
-            if (cohort.getState() != State.CAN_COMMIT_PENDING) {
-                break;
-            }
-
             LOG.debug("{}: Validating transaction {}", logContext, cohort.getIdentifier());
             Exception cause;
             try {
-                dataTree.validate(modification);
+                cohort.throwCanCommitFailure();
+
+                tip.validate(modification);
                 LOG.debug("{}: Transaction {} validated", logContext, cohort.getIdentifier());
                 cohort.successfulCanCommit();
-                entry.lastAccess = shard.ticker().read();
+                entry.lastAccess = ticker().read();
                 return;
             } catch (ConflictingModificationAppliedException e) {
                 LOG.warn("{}: Store Tx {}: Conflicting modification for path {}.", logContext, cohort.getIdentifier(),
@@ -554,11 +762,46 @@
 
             // Failure path: propagate the failure, remove the transaction from the queue and loop to the next one
             pendingTransactions.poll().cohort.failedCanCommit(cause);
+        });
+    }
+
+    private void processNextPending() {
+        processNextPendingCommit();
+        processNextPendingTransaction();
+    }
+
+    private void processNextPending(final Queue<CommitEntry> queue, final State allowedState,
+            final Consumer<CommitEntry> processor) {
+        while (!queue.isEmpty()) {
+            final CommitEntry entry = queue.peek();
+            final SimpleShardDataTreeCohort cohort = entry.cohort;
+
+            if (cohort.isFailed()) {
+                LOG.debug("{}: Removing failed transaction {}", logContext, cohort.getIdentifier());
+                queue.remove();
+                continue;
+            }
+
+            if (cohort.getState() == allowedState) {
+                processor.accept(entry);
+            }
+
+            break;
         }
 
         maybeRunOperationOnPendingTransactionsComplete();
     }
 
+    private void processNextPendingCommit() {
+        processNextPending(pendingCommits, State.COMMIT_PENDING,
+            entry -> startCommit(entry.cohort, entry.cohort.getCandidate()));
+    }
+
+    private boolean peekNextPendingCommit() {
+        final CommitEntry first = pendingCommits.peek();
+        return first != null && first.cohort.getState() == State.COMMIT_PENDING;
+    }
+
     void startCanCommit(final SimpleShardDataTreeCohort cohort) {
         final SimpleShardDataTreeCohort current = pendingTransactions.peek().cohort;
         if (!cohort.equals(current)) {
@@ -566,13 +809,13 @@
             return;
         }
 
-        processNextTransaction();
+        processNextPendingTransaction();
     }
 
     private void failPreCommit(final Exception cause) {
         shard.getShardMBean().incrementFailedTransactionsCount();
         pendingTransactions.poll().cohort.failedPreCommit(cause);
-        processNextTransaction();
+        processNextPendingTransaction();
     }
 
     @SuppressWarnings("checkstyle:IllegalCatch")
@@ -582,29 +825,37 @@
         final SimpleShardDataTreeCohort current = entry.cohort;
         Verify.verify(cohort.equals(current), "Attempted to pre-commit %s while %s is pending", cohort, current);
 
-        final DataTreeCandidateTip candidate;
-        try {
-            candidate = dataTree.prepare(cohort.getDataTreeModification());
-        } catch (Exception e) {
-            failPreCommit(e);
-            return;
-        }
+        LOG.debug("{}: Preparing transaction {}", logContext, current.getIdentifier());
 
+        final DataTreeCandidateTip candidate;
         try {
+            candidate = tip.prepare(cohort.getDataTreeModification());
             cohort.userPreCommit(candidate);
-        } catch (ExecutionException | TimeoutException e) {
+        } catch (ExecutionException | TimeoutException | RuntimeException e) {
             failPreCommit(e);
             return;
         }
 
-        entry.lastAccess = shard.ticker().read();
+        // Set the tip of the data tree.
+        tip = Verify.verifyNotNull(candidate);
+
+        entry.lastAccess = ticker().read();
+
+        pendingTransactions.remove();
+        pendingCommits.add(entry);
+
+        LOG.debug("{}: Transaction {} prepared", logContext, current.getIdentifier());
+
         cohort.successfulPreCommit(candidate);
+
+        processNextPendingTransaction();
     }
 
     private void failCommit(final Exception cause) {
         shard.getShardMBean().incrementFailedTransactionsCount();
-        pendingTransactions.poll().cohort.failedCommit(cause);
-        processNextTransaction();
+        pendingFinishCommits.poll().cohort.failedCommit(cause);
+        processNextPending();
     }
 
     @SuppressWarnings("checkstyle:IllegalCatch")
@@ -614,6 +865,11 @@
 
         LOG.debug("{}: Resuming commit of transaction {}", logContext, txId);
 
+        if (tip == candidate) {
+            // All pending candidates have been committed, reset the tip to the data tree.
+            tip = dataTree;
+        }
+
         try {
             dataTree.commit(candidate);
         } catch (Exception e) {
@@ -626,72 +882,126 @@
         shard.getShardMBean().setLastCommittedTransactionTime(System.currentTimeMillis());
 
         // FIXME: propagate journal index
-        pendingTransactions.poll().cohort.successfulCommit(UnsignedLong.ZERO);
+        pendingFinishCommits.poll().cohort.successfulCommit(UnsignedLong.ZERO);
 
         LOG.trace("{}: Transaction {} committed, proceeding to notify", logContext, txId);
         notifyListeners(candidate);
 
-        processNextTransaction();
+        processNextPending();
     }
 
     void startCommit(final SimpleShardDataTreeCohort cohort, final DataTreeCandidate candidate) {
-        final CommitEntry entry = pendingTransactions.peek();
+        final CommitEntry entry = pendingCommits.peek();
         Preconditions.checkState(entry != null, "Attempted to start commit of %s when no transactions pending", cohort);
 
         final SimpleShardDataTreeCohort current = entry.cohort;
-        Verify.verify(cohort.equals(current), "Attempted to commit %s while %s is pending", cohort, current);
-
-        if (shard.canSkipPayload() || candidate.getRootNode().getModificationType() == ModificationType.UNMODIFIED) {
-            LOG.debug("{}: No replication required, proceeding to finish commit", logContext);
-            finishCommit(cohort);
+        if (!cohort.equals(current)) {
+            LOG.debug("{}: Transaction {} scheduled for commit step", logContext, cohort.getIdentifier());
            return;
         }
 
+        LOG.debug("{}: Starting commit for transaction {}", logContext, current.getIdentifier());
+
         final TransactionIdentifier txId = cohort.getIdentifier();
         final Payload payload;
         try {
             payload = CommitTransactionPayload.create(txId, candidate);
         } catch (IOException e) {
             LOG.error("{}: Failed to encode transaction {} candidate {}", logContext, txId, candidate, e);
-            pendingTransactions.poll().cohort.failedCommit(e);
+            pendingCommits.poll().cohort.failedCommit(e);
+            processNextPending();
             return;
         }
 
+        // We process next transactions pending canCommit before we call persistPayload to possibly progress
+        // subsequent transactions to the COMMIT_PENDING state so the payloads can be batched for replication. This
+        // is done for single-shard transactions that immediately transition from canCommit to preCommit to commit.
+        // Note that if the next pending transaction is progressed to COMMIT_PENDING and this method (startCommit)
+        // is called, the next transaction will not attempt to replicate b/c the current transaction is still at the
+        // head of the pendingCommits queue.
+        processNextPendingTransaction();
+
+        // After processing next pending transactions, we can now remove the current transaction from pendingCommits.
+        // Note this must be done before the call to peekNextPendingCommit below so we check the next transaction
+        // in order to properly determine the batchHint flag for the call to persistPayload.
+        pendingCommits.remove();
+        pendingFinishCommits.add(entry);
+
+        // See if the next transaction is pending commit (i.e. in the COMMIT_PENDING state) so it can be batched with
+        // this transaction for replication.
+        boolean replicationBatchHint = peekNextPendingCommit();
+
+        // Once completed, we will continue via payloadReplicationComplete
+        shard.persistPayload(txId, payload, replicationBatchHint);
+
         entry.lastAccess = shard.ticker().read();
-        shard.persistPayload(txId, payload);
+
+        LOG.debug("{}: Transaction {} submitted to persistence", logContext, txId);
+
+        // Process the next transaction pending commit, if any. If there is one it will be batched with this
+        // transaction for replication.
+        processNextPendingCommit();
+    }
+
+    Collection<ActorRef> getCohortActors() {
+        return cohortRegistry.getCohortActors();
     }
 
     void processCohortRegistryCommand(final ActorRef sender, final CohortRegistryCommand message) {
         cohortRegistry.process(sender, message);
     }
 
+    @Override
+    ShardDataTreeCohort createFailedCohort(final TransactionIdentifier txId, final DataTreeModification mod,
+            final Exception failure) {
+        SimpleShardDataTreeCohort cohort = new SimpleShardDataTreeCohort.DeadOnArrival(this, mod, txId, failure);
+        pendingTransactions.add(new CommitEntry(cohort, ticker().read()));
+        return cohort;
+    }
+
+    @Override
     ShardDataTreeCohort createReadyCohort(final TransactionIdentifier txId,
-            final DataTreeModification modification) {
-        SimpleShardDataTreeCohort cohort = new SimpleShardDataTreeCohort(this, modification, txId,
+            final DataTreeModification mod) {
+        SimpleShardDataTreeCohort cohort = new SimpleShardDataTreeCohort.Normal(this, mod, txId,
             cohortRegistry.createCohort(schemaContext, txId, COMMIT_STEP_TIMEOUT));
-        pendingTransactions.add(new CommitEntry(cohort, shard.ticker().read()));
+        pendingTransactions.add(new CommitEntry(cohort, ticker().read()));
         return cohort;
     }
 
+    // Exposed for ShardCommitCoordinator so it does not have to deal with local histories (it does not care);
+    // this mimics newReadWriteTransaction().
+    ShardDataTreeCohort newReadyCohort(final TransactionIdentifier txId, final DataTreeModification mod) {
+        if (txId.getHistoryId().getHistoryId() == 0) {
+            return createReadyCohort(txId, mod);
+        }
+
+        return ensureTransactionChain(txId.getHistoryId()).createReadyCohort(txId, mod);
+    }
+
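At this point the commit machinery is a three-stage pipeline: entries wait in pendingTransactions for canCommit/preCommit, move to pendingCommits once a candidate is prepared, and sit in pendingFinishCommits from persistPayload() until consensus. A simplified, stand-alone model of those transitions (E stands in for CommitEntry; this is not the shard code):

    // Simplified stand-alone model; E stands in for CommitEntry.
    final class CommitPipelineSketch<E> {
        private final Queue<E> pendingTransactions = new ArrayDeque<>();
        private final Queue<E> pendingCommits = new ArrayDeque<>();
        private final Queue<E> pendingFinishCommits = new ArrayDeque<>();

        void ready(final E entry) {
            pendingTransactions.add(entry);     // canCommit/preCommit happen here
        }

        void preCommitted(final E entry) {
            pendingTransactions.remove(entry);
            pendingCommits.add(entry);          // candidate prepared, tip advanced
        }

        void persisted(final E entry) {
            pendingCommits.remove(entry);
            pendingFinishCommits.add(entry);    // payload handed to persistence
        }

        E consensusReached() {
            return pendingFinishCommits.poll(); // finishCommit applies the candidate
        }
    }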
+    @SuppressFBWarnings(value = "DB_DUPLICATE_SWITCH_CLAUSES", justification = "See inline comments below.")
     void checkForExpiredTransactions(final long transactionCommitTimeoutMillis) {
         final long timeout = TimeUnit.MILLISECONDS.toNanos(transactionCommitTimeoutMillis);
-        final long now = shard.ticker().read();
-        final CommitEntry currentTx = pendingTransactions.peek();
+        final long now = ticker().read();
+
+        final Queue<CommitEntry> currentQueue = !pendingFinishCommits.isEmpty() ? pendingFinishCommits :
+            !pendingCommits.isEmpty() ? pendingCommits : pendingTransactions;
+        final CommitEntry currentTx = currentQueue.peek();
         if (currentTx != null && currentTx.lastAccess + timeout < now) {
             LOG.warn("{}: Current transaction {} has timed out after {} ms in state {}", logContext,
                 currentTx.cohort.getIdentifier(), transactionCommitTimeoutMillis, currentTx.cohort.getState());
             boolean processNext = true;
             switch (currentTx.cohort.getState()) {
                 case CAN_COMMIT_PENDING:
-                    pendingTransactions.poll().cohort.failedCanCommit(new TimeoutException());
+                    currentQueue.remove().cohort.failedCanCommit(new TimeoutException());
                     break;
                 case CAN_COMMIT_COMPLETE:
-                    pendingTransactions.poll().cohort.reportFailure(new TimeoutException());
+                    // The suppression of the FindBugs "DB_DUPLICATE_SWITCH_CLAUSES" warning pertains to this clause
+                    // whose code is duplicated with PRE_COMMIT_COMPLETE. The clauses aren't combined in case the code
+                    // in PRE_COMMIT_COMPLETE is changed.
+                    currentQueue.remove().cohort.reportFailure(new TimeoutException());
                     break;
                 case PRE_COMMIT_PENDING:
-                    pendingTransactions.poll().cohort.failedPreCommit(new TimeoutException());
+                    currentQueue.remove().cohort.failedPreCommit(new TimeoutException());
                     break;
                 case PRE_COMMIT_COMPLETE:
                     // FIXME: this is a legacy behavior problem. Three-phase commit protocol specifies that after we
@@ -711,7 +1021,7 @@
                     // In order to make the pre-commit timer working across failovers, though, we need
                     // a per-shard cluster-wide monotonic time, so a follower becoming the leader can accurately
                     // restart the timer.
-                    pendingTransactions.poll().cohort.reportFailure(new TimeoutException());
+                    currentQueue.remove().cohort.reportFailure(new TimeoutException());
                     break;
                 case COMMIT_PENDING:
                     LOG.warn("{}: Transaction {} is still committing, cannot abort", logContext,
@@ -724,47 +1034,94 @@
                 case FAILED:
                 case READY:
                 default:
-                    pendingTransactions.poll();
+                    currentQueue.remove();
             }
 
             if (processNext) {
-                processNextTransaction();
+                processNextPending();
             }
         }
     }
 
-    void startAbort(final SimpleShardDataTreeCohort cohort) {
-        final Iterator<CommitEntry> it = pendingTransactions.iterator();
+    boolean startAbort(final SimpleShardDataTreeCohort cohort) {
+        final Iterator<CommitEntry> it = Iterables.concat(pendingFinishCommits, pendingCommits,
+                pendingTransactions).iterator();
         if (!it.hasNext()) {
             LOG.debug("{}: no open transaction while attempting to abort {}", logContext, cohort.getIdentifier());
-            return;
+            return true;
         }
 
         // First entry is special, as it may already be committing
         final CommitEntry first = it.next();
         if (cohort.equals(first.cohort)) {
             if (cohort.getState() != State.COMMIT_PENDING) {
-                LOG.debug("{}: aborted head of queue {} in state {}", logContext, cohort.getIdentifier(),
+                LOG.debug("{}: aborting head of queue {} in state {}", logContext, cohort.getIdentifier(),
                     cohort.getState());
-                pendingTransactions.poll();
-                processNextTransaction();
-            } else {
-                LOG.warn("{}: transaction {} is committing, skipping abort", logContext, cohort.getIdentifier());
+
+                it.remove();
+                if (cohort.getCandidate() != null) {
+                    rebaseTransactions(it, dataTree);
+                }
+
+                processNextPending();
+                return true;
             }
 
-            return;
+            LOG.warn("{}: transaction {} is committing, skipping abort", logContext, cohort.getIdentifier());
+            return false;
         }
 
+        TipProducingDataTreeTip newTip = MoreObjects.firstNonNull(first.cohort.getCandidate(), dataTree);
         while (it.hasNext()) {
             final CommitEntry e = it.next();
             if (cohort.equals(e.cohort)) {
                 LOG.debug("{}: aborting queued transaction {}", logContext, cohort.getIdentifier());
+
                 it.remove();
-                return;
+                if (cohort.getCandidate() != null) {
+                    rebaseTransactions(it, newTip);
+                }
+
+                return true;
+            } else {
+                newTip = MoreObjects.firstNonNull(e.cohort.getCandidate(), newTip);
             }
         }
 
         LOG.debug("{}: aborted transaction {} not found in the queue", logContext, cohort.getIdentifier());
+        return true;
+    }
+
+    @SuppressWarnings("checkstyle:IllegalCatch")
+    private void rebaseTransactions(final Iterator<CommitEntry> iter, @Nonnull final TipProducingDataTreeTip newTip) {
+        tip = Preconditions.checkNotNull(newTip);
+        while (iter.hasNext()) {
+            final SimpleShardDataTreeCohort cohort = iter.next().cohort;
+            if (cohort.getState() == State.CAN_COMMIT_COMPLETE) {
+                LOG.debug("{}: Revalidating queued transaction {}", logContext, cohort.getIdentifier());
+
+                try {
+                    tip.validate(cohort.getDataTreeModification());
+                } catch (DataValidationFailedException | RuntimeException e) {
+                    LOG.debug("{}: Failed to revalidate queued transaction {}", logContext, cohort.getIdentifier(), e);
+                    cohort.reportFailure(e);
+                }
+            } else if (cohort.getState() == State.PRE_COMMIT_COMPLETE) {
+                LOG.debug("{}: Repreparing queued transaction {}", logContext, cohort.getIdentifier());
+
+                try {
+                    tip.validate(cohort.getDataTreeModification());
+                    DataTreeCandidateTip candidate = tip.prepare(cohort.getDataTreeModification());
+                    cohort.userPreCommit(candidate);
+
+                    cohort.setNewCandidate(candidate);
+                    tip = candidate;
+                } catch (ExecutionException | TimeoutException | RuntimeException | DataValidationFailedException e) {
+                    LOG.debug("{}: Failed to reprepare queued transaction {}", logContext, cohort.getIdentifier(), e);
+                    cohort.reportFailure(e);
+                }
+            }
+        }
+    }
 
     void setRunOnPendingTransactionsComplete(final Runnable operation) {
@@ -773,7 +1130,7 @@
     }
 
     private void maybeRunOperationOnPendingTransactionsComplete() {
-        if (runOnPendingTransactionsComplete != null && pendingTransactions.isEmpty()) {
+        if (runOnPendingTransactionsComplete != null && !anyPendingTransactions()) {
             LOG.debug("{}: Pending transactions complete - running operation {}", logContext,
                 runOnPendingTransactionsComplete);
 
@@ -781,4 +1138,8 @@
             runOnPendingTransactionsComplete = null;
         }
     }
+
+    ShardStats getStats() {
+        return shard.getShardMBean();
+    }
 }
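One closing observation on startAbort()/rebaseTransactions(): because candidates are stacked on the tip, aborting a transaction whose candidate others were prepared on invalidates everything queued behind it. The failure mode, traced against the yangtools API in an illustrative method that is not part of the patch:

    // Illustrative only: why aborting a prepared transaction forces a rebase.
    void abortScenario(final TipProducingDataTree dataTree, final DataTreeModification tx1,
            final DataTreeModification tx2) throws DataValidationFailedException {
        final DataTreeCandidateTip c1 = dataTree.prepare(tx1);  // tip moves to c1
        final DataTreeCandidateTip c2 = c1.prepare(tx2);        // c2 is stacked on c1
        // Aborting tx1 discards c1, leaving c2 based on state that will never
        // commit. rebaseTransactions() therefore re-validates and re-prepares
        // tx2 against the new tip -- here the data tree itself:
        dataTree.validate(tx2);                                 // may fail -> reportFailure
        final DataTreeCandidateTip fresh = dataTree.prepare(tx2);
        dataTree.commit(fresh);                                 // fresh becomes the committed state
    }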