Mirror of Apache Kafka
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

458 lines
25 KiB

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#!/usr/bin/env python
# ===================================
# replica_basic_test.py
# ===================================
import inspect
import logging
import os
import pprint
import signal
import subprocess
import sys
import time
import traceback
from system_test_env import SystemTestEnv
sys.path.append(SystemTestEnv.SYSTEM_TEST_UTIL_DIR)
from setup_utils import SetupUtils
from replication_utils import ReplicationUtils
import system_test_utils
from testcase_env import TestcaseEnv
# product specific: Kafka
import kafka_system_test_utils
import metrics
class ReplicaBasicTest(ReplicationUtils, SetupUtils):
testModuleAbsPathName = os.path.realpath(__file__)
testSuiteAbsPathName = os.path.abspath(os.path.dirname(testModuleAbsPathName))
def __init__(self, systemTestEnv):
# SystemTestEnv - provides cluster level environment settings
# such as entity_id, hostname, kafka_home, java_home which
# are available in a list of dictionary named
# "clusterEntityConfigDictList"
self.systemTestEnv = systemTestEnv
super(ReplicaBasicTest, self).__init__(self)
# dict to pass user-defined attributes to logger argument: "extra"
d = {'name_of_class': self.__class__.__name__}
def signal_handler(self, signal, frame):
self.log_message("Interrupt detected - User pressed Ctrl+c")
# perform the necessary cleanup here when user presses Ctrl+c and it may be product specific
self.log_message("stopping all entities - please wait ...")
kafka_system_test_utils.stop_all_remote_running_processes(self.systemTestEnv, self.testcaseEnv)
sys.exit(1)
def runTest(self):
# ======================================================================
# get all testcase directories under this testsuite
# ======================================================================
testCasePathNameList = system_test_utils.get_dir_paths_with_prefix(
self.testSuiteAbsPathName, SystemTestEnv.SYSTEM_TEST_CASE_PREFIX)
testCasePathNameList.sort()
replicationUtils = ReplicationUtils(self)
# =============================================================
# launch each testcase one by one: testcase_1, testcase_2, ...
# =============================================================
for testCasePathName in testCasePathNameList:
skipThisTestCase = False
try:
# ======================================================================
# A new instance of TestcaseEnv to keep track of this testcase's env vars
# and initialize some env vars as testCasePathName is available now
# ======================================================================
self.testcaseEnv = TestcaseEnv(self.systemTestEnv, self)
self.testcaseEnv.testSuiteBaseDir = self.testSuiteAbsPathName
self.testcaseEnv.initWithKnownTestCasePathName(testCasePathName)
self.testcaseEnv.testcaseArgumentsDict = self.testcaseEnv.testcaseNonEntityDataDict["testcase_args"]
# ======================================================================
# SKIP if this case is IN testcase_to_skip.json or NOT IN testcase_to_run.json
# ======================================================================
testcaseDirName = self.testcaseEnv.testcaseResultsDict["_test_case_name"]
if self.systemTestEnv.printTestDescriptionsOnly:
self.testcaseEnv.printTestCaseDescription(testcaseDirName)
continue
elif self.systemTestEnv.isTestCaseToSkip(self.__class__.__name__, testcaseDirName):
self.log_message("Skipping : " + testcaseDirName)
skipThisTestCase = True
continue
else:
self.testcaseEnv.printTestCaseDescription(testcaseDirName)
system_test_utils.setup_remote_hosts_with_testcase_level_cluster_config(self.systemTestEnv, testCasePathName)
# ============================================================================== #
# ============================================================================== #
# Product Specific Testing Code Starts Here: #
# ============================================================================== #
# ============================================================================== #
# get optional testcase arguments
logRetentionTest = "false"
try:
logRetentionTest = self.testcaseEnv.testcaseArgumentsDict["log_retention_test"]
except:
pass
consumerMultiTopicsMode = "false"
try:
consumerMultiTopicsMode = self.testcaseEnv.testcaseArgumentsDict["consumer_multi_topics_mode"]
except:
pass
autoCreateTopic = "false"
try:
autoCreateTopic = self.testcaseEnv.testcaseArgumentsDict["auto_create_topic"]
except:
pass
# initialize self.testcaseEnv with user-defined environment variables (product specific)
self.testcaseEnv.userDefinedEnvVarDict["zkConnectStr"] = ""
self.testcaseEnv.userDefinedEnvVarDict["stopBackgroundProducer"] = False
self.testcaseEnv.userDefinedEnvVarDict["backgroundProducerStopped"] = False
self.testcaseEnv.userDefinedEnvVarDict["leaderElectionLatencyList"] = []
# initialize signal handler
signal.signal(signal.SIGINT, self.signal_handler)
# TestcaseEnv.testcaseConfigsList initialized by reading testcase properties file:
# system_test/<suite_name>_testsuite/testcase_<n>/testcase_<n>_properties.json
self.testcaseEnv.testcaseConfigsList = system_test_utils.get_json_list_data(
self.testcaseEnv.testcasePropJsonPathName)
# clean up data directories specified in zookeeper.properties and kafka_server_<n>.properties
kafka_system_test_utils.cleanup_data_at_remote_hosts(self.systemTestEnv, self.testcaseEnv)
# create "LOCAL" log directories for metrics, dashboards for each entity under this testcase
# for collecting logs from remote machines
kafka_system_test_utils.generate_testcase_log_dirs(self.systemTestEnv, self.testcaseEnv)
# TestcaseEnv - initialize producer & consumer config / log file pathnames
kafka_system_test_utils.init_entity_props(self.systemTestEnv, self.testcaseEnv)
# generate remote hosts log/config dirs if not exist
kafka_system_test_utils.generate_testcase_log_dirs_in_remote_hosts(self.systemTestEnv, self.testcaseEnv)
# generate properties files for zookeeper, kafka, producer, consumer:
# 1. copy system_test/<suite_name>_testsuite/config/*.properties to
# system_test/<suite_name>_testsuite/testcase_<n>/config/
# 2. update all properties files in system_test/<suite_name>_testsuite/testcase_<n>/config
# by overriding the settings specified in:
# system_test/<suite_name>_testsuite/testcase_<n>/testcase_<n>_properties.json
kafka_system_test_utils.generate_overriden_props_files(self.testSuiteAbsPathName,
self.testcaseEnv, self.systemTestEnv)
# =============================================
# preparing all entities to start the test
# =============================================
self.log_message("starting zookeepers")
kafka_system_test_utils.start_zookeepers(self.systemTestEnv, self.testcaseEnv)
self.anonLogger.info("sleeping for 2s")
time.sleep(2)
self.log_message("starting brokers")
kafka_system_test_utils.start_brokers(self.systemTestEnv, self.testcaseEnv)
self.anonLogger.info("sleeping for 5s")
time.sleep(5)
if autoCreateTopic.lower() == "false":
self.log_message("creating topics")
kafka_system_test_utils.create_topic(self.systemTestEnv, self.testcaseEnv)
self.anonLogger.info("sleeping for 5s")
time.sleep(5)
# =============================================
# start ConsoleConsumer if this is a Log Retention test
# =============================================
if logRetentionTest.lower() == "true":
self.log_message("starting consumer in the background")
kafka_system_test_utils.start_console_consumer(self.systemTestEnv, self.testcaseEnv)
time.sleep(1)
# =============================================
# starting producer
# =============================================
self.log_message("starting producer in the background")
kafka_system_test_utils.start_producer_performance(self.systemTestEnv, self.testcaseEnv, False)
msgProducingFreeTimeSec = self.testcaseEnv.testcaseArgumentsDict["message_producing_free_time_sec"]
self.anonLogger.info("sleeping for " + msgProducingFreeTimeSec + " sec to produce some messages")
time.sleep(int(msgProducingFreeTimeSec))
# =============================================
# A while-loop to bounce leader as specified
# by "num_iterations" in testcase_n_properties.json
# =============================================
i = 1
numIterations = int(self.testcaseEnv.testcaseArgumentsDict["num_iteration"])
brokerType = self.testcaseEnv.testcaseArgumentsDict["broker_type"]
bounceBrokerFlag = self.testcaseEnv.testcaseArgumentsDict["bounce_broker"]
while i <= numIterations:
self.log_message("Iteration " + str(i) + " of " + str(numIterations))
self.log_message("bounce_broker flag : " + bounceBrokerFlag)
leaderDict = None
controllerDict = None
stoppedBrokerEntityId = ""
# ==============================================
# Find out the entity id for the stopping broker
# ==============================================
if brokerType == "leader" or brokerType == "follower":
self.log_message("looking up leader")
leaderDict = kafka_system_test_utils.get_leader_attributes(self.systemTestEnv, self.testcaseEnv)
# ==========================
# leaderDict looks like this:
# ==========================
#{'entity_id': u'3',
# 'partition': '0',
# 'timestamp': 1345050255.8280001,
# 'hostname': u'localhost',
# 'topic': 'test_1',
# 'brokerid': '3'}
if brokerType == "leader":
stoppedBrokerEntityId = leaderDict["entity_id"]
self.log_message("Found leader with entity id: " + stoppedBrokerEntityId)
else: # Follower
self.log_message("looking up follower")
# a list of all brokers
brokerEntityIdList = system_test_utils.get_data_from_list_of_dicts(self.systemTestEnv.clusterEntityConfigDictList, "role", "broker", "entity_id")
# we pick the first non-leader broker as the follower
firstFollowerEntityId = None
for brokerEntityId in brokerEntityIdList:
if brokerEntityId != leaderDict["entity_id"]:
firstFollowerEntityId = brokerEntityId
break
stoppedBrokerEntityId = firstFollowerEntityId
self.log_message("Found follower with entity id: " + stoppedBrokerEntityId)
elif brokerType == "controller":
self.log_message("looking up controller")
controllerDict = kafka_system_test_utils.get_controller_attributes(self.systemTestEnv, self.testcaseEnv)
# ==========================
# controllerDict looks like this:
# ==========================
#{'entity_id': u'3',
# 'timestamp': 1345050255.8280001,
# 'hostname': u'localhost',
# 'brokerid': '3'}
stoppedBrokerEntityId = controllerDict["entity_id"]
self.log_message("Found controller with entity id: " + stoppedBrokerEntityId)
# =============================================
# Bounce the broker
# =============================================
if bounceBrokerFlag.lower() == "true":
if brokerType == "leader":
# validate to see if leader election is successful
self.log_message("validating leader election")
kafka_system_test_utils.validate_leader_election_successful(self.testcaseEnv, leaderDict, self.testcaseEnv.validationStatusDict)
# trigger leader re-election by stopping leader to get re-election latency
#reelectionLatency = kafka_system_test_utils.get_reelection_latency(self.systemTestEnv, self.testcaseEnv, leaderDict, self.leaderAttributesDict)
#latencyKeyName = "Leader Election Latency - iter " + str(i) + " brokerid " + leaderDict["brokerid"]
#self.testcaseEnv.validationStatusDict[latencyKeyName] = str("{0:.2f}".format(reelectionLatency * 1000)) + " ms"
#self.testcaseEnv.userDefinedEnvVarDict["leaderElectionLatencyList"].append("{0:.2f}".format(reelectionLatency * 1000))
elif brokerType == "follower":
# stopping Follower
self.log_message("stopping follower with entity id: " + firstFollowerEntityId)
kafka_system_test_utils.stop_remote_entity(self.systemTestEnv, firstFollowerEntityId, self.testcaseEnv.entityBrokerParentPidDict[firstFollowerEntityId])
elif brokerType == "controller":
# stopping Controller
self.log_message("stopping controller : " + controllerDict["brokerid"])
kafka_system_test_utils.stop_remote_entity(self.systemTestEnv, controllerDict["entity_id"], self.testcaseEnv.entityBrokerParentPidDict[controllerDict["entity_id"]])
brokerDownTimeInSec = 5
try:
brokerDownTimeInSec = int(self.testcaseEnv.testcaseArgumentsDict["broker_down_time_in_sec"])
except:
pass # take default
time.sleep(brokerDownTimeInSec)
# starting previously terminated broker
self.log_message("starting the previously terminated broker")
kafka_system_test_utils.start_entity_in_background(self.systemTestEnv, self.testcaseEnv, stoppedBrokerEntityId)
else:
# GC Pause simulation
pauseTime = None
try:
hostname = leaderDict["hostname"]
pauseTime = self.testcaseEnv.testcaseArgumentsDict["pause_time_in_seconds"]
parentPid = self.testcaseEnv.entityBrokerParentPidDict[leaderDict["entity_id"]]
pidStack = system_test_utils.get_remote_child_processes(hostname, parentPid)
system_test_utils.simulate_garbage_collection_pause_in_remote_process(hostname, pidStack, pauseTime)
except:
pass
self.anonLogger.info("sleeping for 60s")
time.sleep(60)
i += 1
# while loop
# update Leader Election Latency MIN/MAX to testcaseEnv.validationStatusDict
#self.testcaseEnv.validationStatusDict["Leader Election Latency MIN"] = None
#try:
# self.testcaseEnv.validationStatusDict["Leader Election Latency MIN"] = \
# min(self.testcaseEnv.userDefinedEnvVarDict["leaderElectionLatencyList"])
#except:
# pass
#
#self.testcaseEnv.validationStatusDict["Leader Election Latency MAX"] = None
#try:
# self.testcaseEnv.validationStatusDict["Leader Election Latency MAX"] = \
# max(self.testcaseEnv.userDefinedEnvVarDict["leaderElectionLatencyList"])
#except:
# pass
# =============================================
# tell producer to stop
# =============================================
self.testcaseEnv.lock.acquire()
self.testcaseEnv.userDefinedEnvVarDict["stopBackgroundProducer"] = True
time.sleep(1)
self.testcaseEnv.lock.release()
time.sleep(1)
# =============================================
# wait for producer thread's update of
# "backgroundProducerStopped" to be "True"
# =============================================
while 1:
self.testcaseEnv.lock.acquire()
self.logger.info("status of backgroundProducerStopped : [" + \
str(self.testcaseEnv.userDefinedEnvVarDict["backgroundProducerStopped"]) + "]", extra=self.d)
if self.testcaseEnv.userDefinedEnvVarDict["backgroundProducerStopped"]:
time.sleep(1)
self.logger.info("all producer threads completed", extra=self.d)
break
time.sleep(1)
self.testcaseEnv.lock.release()
time.sleep(2)
# =============================================
# collect logs from remote hosts to find the
# minimum common offset of a certain log
# segment file among all replicas
# =============================================
minStartingOffsetDict = None
if logRetentionTest.lower() == "true":
self.anonLogger.info("sleeping for 60s to make sure log truncation is completed")
time.sleep(60)
kafka_system_test_utils.collect_logs_from_remote_hosts(self.systemTestEnv, self.testcaseEnv)
minStartingOffsetDict = kafka_system_test_utils.getMinCommonStartingOffset(self.systemTestEnv, self.testcaseEnv)
print
pprint.pprint(minStartingOffsetDict)
# =============================================
# starting debug consumer
# =============================================
if consumerMultiTopicsMode.lower() == "false":
self.log_message("starting debug consumers in the background")
kafka_system_test_utils.start_simple_consumer(self.systemTestEnv, self.testcaseEnv, minStartingOffsetDict)
self.anonLogger.info("sleeping for 10s")
time.sleep(10)
# =============================================
# starting console consumer
# =============================================
if logRetentionTest.lower() == "false":
self.log_message("starting consumer in the background")
kafka_system_test_utils.start_console_consumer(self.systemTestEnv, self.testcaseEnv)
time.sleep(10)
# =============================================
# this testcase is completed - stop all entities
# =============================================
self.log_message("stopping all entities")
for entityId, parentPid in self.testcaseEnv.entityBrokerParentPidDict.items():
kafka_system_test_utils.stop_remote_entity(self.systemTestEnv, entityId, parentPid)
for entityId, parentPid in self.testcaseEnv.entityZkParentPidDict.items():
kafka_system_test_utils.stop_remote_entity(self.systemTestEnv, entityId, parentPid)
# make sure all entities are stopped
kafka_system_test_utils.ps_grep_terminate_running_entity(self.systemTestEnv)
# =============================================
# collect logs from remote hosts
# =============================================
kafka_system_test_utils.collect_logs_from_remote_hosts(self.systemTestEnv, self.testcaseEnv)
# =============================================
# validate the data matched and checksum
# =============================================
self.log_message("validating data matched")
if logRetentionTest.lower() == "true":
kafka_system_test_utils.validate_data_matched(self.systemTestEnv, self.testcaseEnv, replicationUtils)
elif consumerMultiTopicsMode.lower() == "true":
kafka_system_test_utils.validate_data_matched_in_multi_topics_from_single_consumer_producer(
self.systemTestEnv, self.testcaseEnv, replicationUtils)
else:
kafka_system_test_utils.validate_simple_consumer_data_matched_across_replicas(self.systemTestEnv, self.testcaseEnv)
kafka_system_test_utils.validate_broker_log_segment_checksum(self.systemTestEnv, self.testcaseEnv)
kafka_system_test_utils.validate_data_matched(self.systemTestEnv, self.testcaseEnv, replicationUtils)
kafka_system_test_utils.validate_index_log(self.systemTestEnv, self.testcaseEnv)
# =============================================
# draw graphs
# =============================================
metrics.draw_all_graphs(self.systemTestEnv.METRICS_PATHNAME,
self.testcaseEnv,
self.systemTestEnv.clusterEntityConfigDictList)
# build dashboard, one for each role
metrics.build_all_dashboards(self.systemTestEnv.METRICS_PATHNAME,
self.testcaseEnv.testCaseDashboardsDir,
self.systemTestEnv.clusterEntityConfigDictList)
except Exception as e:
self.log_message("Exception while running test {0}".format(e))
traceback.print_exc()
finally:
if not skipThisTestCase and not self.systemTestEnv.printTestDescriptionsOnly:
self.log_message("stopping all entities - please wait ...")
kafka_system_test_utils.stop_all_remote_running_processes(self.systemTestEnv, self.testcaseEnv)