You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
332 lines
14 KiB
332 lines
14 KiB
# Licensed to the Apache Software Foundation (ASF) under one or more |
|
# contributor license agreements. See the NOTICE file distributed with |
|
# this work for additional information regarding copyright ownership. |
|
# The ASF licenses this file to You under the Apache License, Version 2.0 |
|
# (the "License"); you may not use this file except in compliance with |
|
# the License. You may obtain a copy of the License at |
|
# |
|
# http://www.apache.org/licenses/LICENSE-2.0 |
|
# |
|
# Unless required by applicable law or agreed to in writing, software |
|
# distributed under the License is distributed on an "AS IS" BASIS, |
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
# See the License for the specific language governing permissions and |
|
# limitations under the License. |
|
|
|
import json |
|
import os.path |
|
import requests |
|
from requests.adapters import HTTPAdapter |
|
from requests.packages.urllib3 import Retry |
|
|
|
from ducktape.services.service import Service |
|
from ducktape.utils.util import wait_until |
|
from kafkatest.directory_layout.kafka_path import KafkaPathResolverMixin |
|
|
|
|
|
class TrogdorService(KafkaPathResolverMixin, Service):
    """
    A ducktape service for running the trogdor fault injection daemons.

    The first node allocated to the service runs the trogdor coordinator; every
    node added afterwards (from client_services and agent_nodes) runs an agent.

    Attributes:
        PERSISTENT_ROOT                 The root filesystem path to store service files under.
        COORDINATOR_STDOUT_STDERR       The path where we store the coordinator's stdout/stderr output.
        AGENT_STDOUT_STDERR             The path where we store the agents' stdout/stderr output.
        COORDINATOR_LOG                 The path where we store the coordinator's log4j output.
        AGENT_LOG                       The path where we store the agent's log4j output.
        AGENT_LOG4J_PROPERTIES          The path to the agent log4j.properties file for log config.
        COORDINATOR_LOG4J_PROPERTIES    The path to the coordinator log4j.properties file for log config.
        CONFIG_PATH                     The path to the trogdor configuration file.
        DEFAULT_AGENT_PORT              The default port to use for trogdor_agent daemons.
        DEFAULT_COORDINATOR_PORT        The default port to use for trogdor_coordinator daemons.
        REQUEST_TIMEOUT                 The request timeout in seconds to use for REST requests.
        REQUEST_HEADERS                 The request headers to use when communicating with trogdor.
    """

    PERSISTENT_ROOT = "/mnt/trogdor"
    COORDINATOR_STDOUT_STDERR = os.path.join(PERSISTENT_ROOT, "trogdor-coordinator-stdout-stderr.log")
    AGENT_STDOUT_STDERR = os.path.join(PERSISTENT_ROOT, "trogdor-agent-stdout-stderr.log")
    COORDINATOR_LOG = os.path.join(PERSISTENT_ROOT, "trogdor-coordinator.log")
    AGENT_LOG = os.path.join(PERSISTENT_ROOT, "trogdor-agent.log")
    COORDINATOR_LOG4J_PROPERTIES = os.path.join(PERSISTENT_ROOT, "trogdor-coordinator-log4j.properties")
    AGENT_LOG4J_PROPERTIES = os.path.join(PERSISTENT_ROOT, "trogdor-agent-log4j.properties")
    CONFIG_PATH = os.path.join(PERSISTENT_ROOT, "trogdor.conf")
    DEFAULT_AGENT_PORT = 8888
    DEFAULT_COORDINATOR_PORT = 8889
    REQUEST_TIMEOUT = 5
    REQUEST_HEADERS = {"Content-type": "application/json"}

    # Log files declared for collection; all of them are collected by default.
    logs = {
        "trogdor_coordinator_stdout_stderr": {
            "path": COORDINATOR_STDOUT_STDERR,
            "collect_default": True},
        "trogdor_agent_stdout_stderr": {
            "path": AGENT_STDOUT_STDERR,
            "collect_default": True},
        "trogdor_coordinator_log": {
            "path": COORDINATOR_LOG,
            "collect_default": True},
        "trogdor_agent_log": {
            "path": AGENT_LOG,
            "collect_default": True},
    }

    def __init__(self, context, agent_nodes=None, client_services=None,
                 agent_port=DEFAULT_AGENT_PORT, coordinator_port=DEFAULT_COORDINATOR_PORT):
        """
        Create a Trogdor service.

        :param context:             The test context.
        :param agent_nodes:         The nodes to run the agents on.
        :param client_services:     Services whose nodes we should run agents on.
        :param agent_port:          The port to use for the trogdor_agent daemons.
        :param coordinator_port:    The port to use for the trogdor_coordinator daemons.
        :raises RuntimeError:       If neither agent_nodes nor client_services supplied any node.
        """
        # Only the coordinator node is allocated by this service; the agent
        # nodes are borrowed from the caller.
        Service.__init__(self, context, num_nodes=1)
        self.coordinator_node = self.nodes[0]
        if client_services is not None:
            for client_service in client_services:
                self.nodes.extend(client_service.nodes)
        if agent_nodes is not None:
            self.nodes.extend(agent_nodes)
        # Still only the coordinator present means no agent node was supplied.
        if len(self.nodes) == 1:
            raise RuntimeError("You must supply at least one agent node to run the service on.")
        self.agent_port = agent_port
        self.coordinator_port = coordinator_port

    def free(self):
        """
        Release the coordinator node only; agent nodes are left to their owners.
        """
        # We only want to deallocate the coordinator node, not the agent nodes.  So we
        # change self.nodes to include only the coordinator node, and then invoke
        # the base class' free method.
        if self.coordinator_node is not None:
            self.nodes = [self.coordinator_node]
            self.coordinator_node = None
        Service.free(self)

    def _create_config_dict(self):
        """
        Create a dictionary with the Trogdor configuration.

        Each node is keyed by name and carries its ssh hostname plus either the
        coordinator port (for the coordinator node) or the agent port.

        :return:                The configuration dictionary.
        """
        dict_nodes = {}
        for node in self.nodes:
            node_config = {
                "hostname": node.account.ssh_hostname,
            }
            if node.name == self.coordinator_node.name:
                node_config["trogdor.coordinator.port"] = self.coordinator_port
            else:
                node_config["trogdor.agent.port"] = self.agent_port
            dict_nodes[node.name] = node_config

        return {
            "platform": "org.apache.kafka.trogdor.basic.BasicPlatform",
            "nodes": dict_nodes,
        }

    def start_node(self, node):
        """
        Write the trogdor configuration to the node and start its daemon.

        :param node:            The node to start; the coordinator node gets the
                                coordinator daemon, any other node gets an agent.
        """
        node.account.mkdirs(TrogdorService.PERSISTENT_ROOT)

        # Create the configuration file on the node.
        config_json = json.dumps(self._create_config_dict(), indent=2)
        self.logger.info("Creating configuration file %s with %s" % (TrogdorService.CONFIG_PATH, config_json))
        node.account.create_file(TrogdorService.CONFIG_PATH, config_json)

        if self.is_coordinator(node):
            self._start_coordinator_node(node)
        else:
            self._start_agent_node(node)

    def _start_coordinator_node(self, node):
        """
        Render the coordinator log4j configuration and start the coordinator daemon.

        :param node:            The coordinator node.
        """
        node.account.create_file(TrogdorService.COORDINATOR_LOG4J_PROPERTIES,
                                 self.render('log4j.properties',
                                             log_path=TrogdorService.COORDINATOR_LOG))
        self._start_trogdor_daemon("coordinator", TrogdorService.COORDINATOR_STDOUT_STDERR,
                                   TrogdorService.COORDINATOR_LOG4J_PROPERTIES,
                                   TrogdorService.COORDINATOR_LOG, node)
        self.logger.info("Started trogdor coordinator on %s." % node.name)

    def _start_agent_node(self, node):
        """
        Render the agent log4j configuration and start the agent daemon.

        :param node:            The agent node.
        """
        node.account.create_file(TrogdorService.AGENT_LOG4J_PROPERTIES,
                                 self.render('log4j.properties',
                                             log_path=TrogdorService.AGENT_LOG))
        self._start_trogdor_daemon("agent", TrogdorService.AGENT_STDOUT_STDERR,
                                   TrogdorService.AGENT_LOG4J_PROPERTIES,
                                   TrogdorService.AGENT_LOG, node)
        self.logger.info("Started trogdor agent on %s." % node.name)

    def _start_trogdor_daemon(self, daemon_name, stdout_stderr_capture_path,
                              log4j_properties_path, log_path, node):
        """
        Launch a trogdor daemon in the background and wait for its startup log line.

        :param daemon_name:                 Either "coordinator" or "agent"; used as the
                                            trogdor.sh sub-command and config flag prefix.
        :param stdout_stderr_capture_path:  File that receives the daemon's stdout and stderr.
        :param log4j_properties_path:       The log4j configuration file to point the daemon at.
        :param log_path:                    The log file watched for the startup message.
        :param node:                        The node to start the daemon on.
        """
        cmd = "export KAFKA_LOG4J_OPTS='-Dlog4j.configuration=file:%s'; " % log4j_properties_path
        cmd += "%s %s --%s.config %s --node-name %s 1>> %s 2>> %s &" % \
            (self.path.script("trogdor.sh", node),
             daemon_name,
             daemon_name,
             TrogdorService.CONFIG_PATH,
             node.name,
             stdout_stderr_capture_path,
             stdout_stderr_capture_path)
        node.account.ssh(cmd)
        # NOTE(review): the log monitor is attached after the daemon has been
        # launched; confirm monitor_log cannot miss a startup line written
        # before the monitor is set up.
        with node.account.monitor_log(log_path) as monitor:
            monitor.wait_until("Starting %s process." % daemon_name, timeout_sec=60, backoff_sec=.25,
                               err_msg=("%s on %s didn't finish startup" % (daemon_name, node.name)))

    def wait_node(self, node, timeout_sec=None):
        """
        Report whether the trogdor process on this node has exited.

        This is a single snapshot check; timeout_sec is accepted but not used.

        :param node:            The node to check.
        :param timeout_sec:     Unused.
        :return:                True if no matching java process is running on the node.
        """
        if self.is_coordinator(node):
            class_name = self.coordinator_class_name()
        else:
            class_name = self.agent_class_name()
        return len(node.account.java_pids(class_name)) == 0

    def stop_node(self, node):
        """Halt trogdor processes on this node."""
        if self.is_coordinator(node):
            node.account.kill_java_processes(self.coordinator_class_name())
        else:
            node.account.kill_java_processes(self.agent_class_name())

    def clean_node(self, node):
        """Clean up persistent state on this node - e.g. service logs, configuration files etc."""
        self.stop_node(node)
        node.account.ssh("rm -rf -- %s" % TrogdorService.PERSISTENT_ROOT)

    def _coordinator_url(self, path):
        """
        Build the coordinator REST URL for the given path.

        :param path:            The path below /coordinator/.
        :return:                The full http URL string.
        """
        return "http://%s:%d/coordinator/%s" % \
            (self.coordinator_node.account.ssh_hostname, self.coordinator_port, path)

    def request_session(self):
        """
        Creates a new request session which will retry for a while.

        The caller owns the returned session and is responsible for closing it.
        """
        session = requests.Session()
        session.mount('http://',
                      HTTPAdapter(max_retries=Retry(total=4, backoff_factor=0.3)))
        return session

    def _coordinator_request(self, verb, path, message):
        """
        Send a JSON request to the Trogdor coordinator and decode the JSON reply.

        :param verb:            The HTTP method name, e.g. "POST".
        :param path:            The URL path to use.
        :param message:         The message object to send as the JSON body.
        :return:                The response as an object.
        :raises requests.exceptions.HTTPError: If the coordinator answers with an error status.
        """
        url = self._coordinator_url(path)
        self.logger.info("%s %s %s" % (verb, url, message))
        # Use the session as a context manager so that its connection pool is
        # released once the (fully read) response has been decoded.
        with self.request_session() as session:
            response = session.request(verb, url, json=message,
                                       timeout=TrogdorService.REQUEST_TIMEOUT,
                                       headers=TrogdorService.REQUEST_HEADERS)
            response.raise_for_status()
            return response.json()

    def _coordinator_post(self, path, message):
        """
        Make a POST request to the Trogdor coordinator.

        :param path:            The URL path to use.
        :param message:         The message object to send.
        :return:                The response as an object.
        """
        return self._coordinator_request("POST", path, message)

    def _coordinator_put(self, path, message):
        """
        Make a PUT request to the Trogdor coordinator.

        :param path:            The URL path to use.
        :param message:         The message object to send.
        :return:                The response as an object.
        """
        return self._coordinator_request("PUT", path, message)

    def _coordinator_get(self, path, message):
        """
        Make a GET request to the Trogdor coordinator.

        :param path:            The URL path to use.
        :param message:         The message object to send.
        :return:                The response as an object.
        """
        return self._coordinator_request("GET", path, message)

    def create_task(self, id, spec):
        """
        Create a new task.

        :param id:              The task id.
        :param spec:            The task spec.
        :return:                A TrogdorTask handle bound to this service.
        """
        self._coordinator_post("task/create", {"id": id, "spec": spec.message()})
        return TrogdorTask(id, self)

    def stop_task(self, id):
        """
        Stop a task.

        :param id:              The task id.
        """
        self._coordinator_put("task/stop", {"id": id})

    def tasks(self):
        """
        Get the tasks which are on the coordinator.

        :returns:               The decoded coordinator response.  TrogdorTask.done
                                reads the map of task id strings to task state
                                objects from its 'tasks' key, and the 'state' and
                                optional 'error' fields from each task state.
        """
        return self._coordinator_get("tasks", {})

    def is_coordinator(self, node):
        """Return True if the given node is the coordinator node."""
        return node == self.coordinator_node

    def agent_class_name(self):
        """Return the Java main class name of the trogdor agent."""
        return "org.apache.kafka.trogdor.agent.Agent"

    def coordinator_class_name(self):
        """Return the Java main class name of the trogdor coordinator."""
        return "org.apache.kafka.trogdor.coordinator.Coordinator"
|
|
|
class TrogdorTask(object):
    """
    A handle on a single task that was submitted to the Trogdor coordinator.

    Attributes:
        PENDING_STATE       State name of a pending task.
        RUNNING_STATE       State name of a running task.
        STOPPING_STATE      State name of a task that is stopping.
        DONE_STATE          State name of a finished task.
    """

    PENDING_STATE = "PENDING"
    RUNNING_STATE = "RUNNING"
    STOPPING_STATE = "STOPPING"
    DONE_STATE = "DONE"

    def __init__(self, id, trogdor):
        """
        :param id:              The task id.
        :param trogdor:         The service object used to query and stop the task;
                                it must provide tasks() and stop_task(id).
        """
        self.id = id
        self.trogdor = trogdor

    def done(self):
        """
        Check if this task is done.

        :raises RuntimeError:   If the coordinator has no state for this task, or
                                if the task encountered an error.
        :returns:               True if the task is in DONE_STATE;
                                False if it is in a different state.
        """
        # Use .get() so that an unknown task id surfaces as the RuntimeError
        # below instead of an unhelpful KeyError.
        task_state = self.trogdor.tasks()["tasks"].get(self.id)
        if task_state is None:
            raise RuntimeError("Coordinator did not know about %s." % self.id)
        error = task_state.get("error")
        if error is None or error == "":
            return task_state["state"] == TrogdorTask.DONE_STATE
        raise RuntimeError("Failed to gracefully stop %s: got task error: %s" % (self.id, error))

    def stop(self):
        """
        Stop this task.

        Nothing is sent to the coordinator if the task is already done.

        :raises RuntimeError:   If the task encountered an error.
        """
        if self.done():
            return
        self.trogdor.stop_task(self.id)

    def wait_for_done(self, timeout_sec=360):
        """
        Block until done() returns True.

        :param timeout_sec:     How long to wait, in seconds, before giving up.
        """
        wait_until(lambda: self.done(),
                   timeout_sec=timeout_sec,
                   err_msg="%s failed to finish in the expected amount of time." % self.id)
|
|
|