From 96bc0b882d0c51d9b58c9f87654e6d133fd9ef34 Mon Sep 17 00:00:00 2001
From: Lucas Wang <luwang@linkedin.com>
Date: Sun, 29 Jul 2018 21:06:18 -0700
Subject: [PATCH] KAFKA-7180; Fixing the flaky test
 testHWCheckpointWithFailuresSingleLogSegment

Fix the flakiness by waiting until server1 has joined the ISR before shutting down server2.

Reran the test method many times after the code change, and it no longer exhibits any flakiness.

Author: Lucas Wang <luwang@linkedin.com>

Reviewers: Mayuresh Gharat <gharatmayuresh15@gmail.com>, Dong Lin <lindong28@gmail.com>

Closes #5387 from gitlw/fixing_flacky_logrecevorytest
---
 .../test/scala/unit/kafka/server/LogRecoveryTest.scala   | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/core/src/test/scala/unit/kafka/server/LogRecoveryTest.scala b/core/src/test/scala/unit/kafka/server/LogRecoveryTest.scala
index 880950ae02c..1bd15f7b537 100755
--- a/core/src/test/scala/unit/kafka/server/LogRecoveryTest.scala
+++ b/core/src/test/scala/unit/kafka/server/LogRecoveryTest.scala
@@ -143,6 +143,15 @@ class LogRecoveryTest extends ZooKeeperTestHarness {
       leader == 0 || leader == 1)
 
     assertEquals(hw, hwFile1.read.getOrElse(topicPartition, 0L))
+    /** We plan to shutdown server2 and transfer the leadership to server1.
+      * With unclean leader election turned off, a prerequisite for the successful leadership transition
+      * is that server1 has caught up on the topicPartition, and has joined the ISR.
+      * In the line below, we wait until the condition is met before shutting down server2
+      */
+    waitUntilTrue(() => server2.replicaManager.getPartition(topicPartition).get.inSyncReplicas.size == 2,
+      "Server 1 is not able to join the ISR after restart")
+
+
     // since server 2 was never shut down, the hw value of 30 is probably not checkpointed to disk yet
     server2.shutdown()
     assertEquals(hw, hwFile2.read.getOrElse(topicPartition, 0L))