Fix intermittent LeaderTest/CandidateTest failures 44/34544/2
authorTom Pantelis <tpanteli@brocade.com>
Thu, 11 Feb 2016 08:24:53 +0000 (03:24 -0500)
committerGerrit Code Review <gerrit@opendaylight.org>
Fri, 12 Feb 2016 15:45:41 +0000 (15:45 +0000)
The test cases in LeaderTest and CandidateTest have been failing
intermittently. A particular test in CandidateTest has recently started
failing fairly regularly on jenkins for some reason.

The common denominator is that an initial message to an actor isn't
received and goes to dead letters instead, even though the actor was
just created. This seems related to the use of ActorSelection in the raft
behavior classes, I suspect a timing issue where the underlying actor
isn't actually created/available yet via actorSelection. I had seen this
in the past and attempted to alleviate it by adding a verifyActorReady to
TestActorFactory to verify with retries that the actor can be obtained via
actorSelection.resolveOne. However it doesn't appear resolveOne works as
advertised or maybe a successful call doesn't mean a message will
succeed.

I changed verifyActorReady to send an Identify message to the
actorSelection and verify successful response. On my system LeaderTest
would usually fail within 30 test runs. After the change it ran
successfully 400 times.

Change-Id: I2da7d4a4d14c68810e87fc64b711b5c80608f5d7
Signed-off-by: Tom Pantelis <tpanteli@brocade.com>
opendaylight/md-sal/sal-akka-raft/src/test/java/org/opendaylight/controller/cluster/raft/TestActorFactory.java

index 4ca018c5e2a29634f37a716fe889367a3b0f789d..0b709aa038d5e12ad4a897c95a7aa5b448feb451 100644 (file)
@@ -17,16 +17,23 @@ package org.opendaylight.controller.cluster.raft;
  */
 
 import akka.actor.Actor;
  */
 
 import akka.actor.Actor;
+import akka.actor.ActorIdentity;
 import akka.actor.ActorRef;
 import akka.actor.ActorRef;
+import akka.actor.ActorSelection;
 import akka.actor.ActorSystem;
 import akka.actor.ActorSystem;
+import akka.actor.Identify;
 import akka.actor.PoisonPill;
 import akka.actor.Props;
 import akka.actor.PoisonPill;
 import akka.actor.Props;
+import akka.pattern.Patterns;
 import akka.testkit.JavaTestKit;
 import akka.testkit.TestActorRef;
 import akka.util.Timeout;
 import akka.testkit.JavaTestKit;
 import akka.testkit.TestActorRef;
 import akka.util.Timeout;
+import com.google.common.base.Stopwatch;
+import com.google.common.util.concurrent.Uninterruptibles;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.concurrent.TimeUnit;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.concurrent.TimeUnit;
+import org.junit.Assert;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import scala.concurrent.Await;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import scala.concurrent.Await;
@@ -98,22 +105,26 @@ public class TestActorFactory implements AutoCloseable {
     private void verifyActorReady(ActorRef actorRef) {
         // Sometimes we see messages go to dead letters soon after creation - it seems the actor isn't quite
         // in a state yet to receive messages or isn't actually created yet. This seems to happen with
     private void verifyActorReady(ActorRef actorRef) {
         // Sometimes we see messages go to dead letters soon after creation - it seems the actor isn't quite
         // in a state yet to receive messages or isn't actually created yet. This seems to happen with
-        // actorSelection so, to alleviate it, we use an actorSelection and call resolveOne with retries to
-        // ensure it's ready.
+        // actorSelection so, to alleviate it, we use an actorSelection and send an Identify message with
+        // retries to ensure it's ready.
 
 
-        int tries = 1;
-        while(true) {
+        Timeout timeout = new Timeout(100, TimeUnit.MILLISECONDS);
+        Throwable lastError = null;
+        Stopwatch sw = Stopwatch.createStarted();
+        while(sw.elapsed(TimeUnit.SECONDS) <= 10) {
             try {
             try {
-                Timeout timeout = new Timeout(100, TimeUnit.MILLISECONDS);
-                Future<ActorRef> future = system.actorSelection(actorRef.path()).resolveOne(timeout);
-                Await.ready(future, timeout.duration());
-                break;
-            } catch (Exception e) {
-                if(tries++ > 20) {
-                    throw new RuntimeException(e);
-                }
+                ActorSelection actorSelection = system.actorSelection(actorRef.path().toString());
+                Future<Object> future = Patterns.ask(actorSelection, new Identify(""), timeout);
+                ActorIdentity reply = (ActorIdentity)Await.result(future, timeout.duration());
+                Assert.assertNotNull("Identify returned null", reply.getRef());
+                return;
+            } catch (Exception | AssertionError e) {
+                Uninterruptibles.sleepUninterruptibly(100, TimeUnit.MILLISECONDS);
+                lastError = e;
             }
         }
             }
         }
+
+        throw new RuntimeException(lastError);
     }
 
     /**
     }
 
     /**