# src-kafka/tests/kafkatest/services/trogdor/trogdor.py

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os.path
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3 import Retry

from ducktape.services.service import Service
from ducktape.utils.util import wait_until
from kafkatest.directory_layout.kafka_path import KafkaPathResolverMixin


class TrogdorService(KafkaPathResolverMixin, Service):
    """
    A ducktape service for running the trogdor fault injection daemons.

    The single node allocated by this service runs the coordinator.  Every node
    passed in through agent_nodes or client_services runs an agent.

    Attributes:
        PERSISTENT_ROOT                 The root filesystem path to store service files under.
        COORDINATOR_STDOUT_STDERR       The path where we store the coordinator's stdout/stderr output.
        AGENT_STDOUT_STDERR             The path where we store the agent's stdout/stderr output.
        COORDINATOR_LOG                 The path where we store the coordinator's log4j output.
        AGENT_LOG                       The path where we store the agent's log4j output.
        AGENT_LOG4J_PROPERTIES          The path to the agent log4j.properties file for log config.
        COORDINATOR_LOG4J_PROPERTIES    The path to the coordinator log4j.properties file for log config.
        CONFIG_PATH                     The path to the trogdor configuration file.
        DEFAULT_AGENT_PORT              The default port to use for trogdor_agent daemons.
        DEFAULT_COORDINATOR_PORT        The default port to use for trogdor_coordinator daemons.
        REQUEST_TIMEOUT                 The request timeout in seconds to use for REST requests.
        REQUEST_HEADERS                 The request headers to use when communicating with trogdor.
    """

    PERSISTENT_ROOT = "/mnt/trogdor"
    COORDINATOR_STDOUT_STDERR = os.path.join(PERSISTENT_ROOT, "trogdor-coordinator-stdout-stderr.log")
    AGENT_STDOUT_STDERR = os.path.join(PERSISTENT_ROOT, "trogdor-agent-stdout-stderr.log")
    COORDINATOR_LOG = os.path.join(PERSISTENT_ROOT, "trogdor-coordinator.log")
    AGENT_LOG = os.path.join(PERSISTENT_ROOT, "trogdor-agent.log")
    COORDINATOR_LOG4J_PROPERTIES = os.path.join(PERSISTENT_ROOT, "trogdor-coordinator-log4j.properties")
    AGENT_LOG4J_PROPERTIES = os.path.join(PERSISTENT_ROOT, "trogdor-agent-log4j.properties")
    CONFIG_PATH = os.path.join(PERSISTENT_ROOT, "trogdor.conf")
    DEFAULT_AGENT_PORT = 8888
    DEFAULT_COORDINATOR_PORT = 8889
    REQUEST_TIMEOUT = 5
    REQUEST_HEADERS = {"Content-type": "application/json"}

    # Log files that are collected from the nodes, keyed by a descriptive name.
    logs = {
        "trogdor_coordinator_stdout_stderr": {
            "path": COORDINATOR_STDOUT_STDERR,
            "collect_default": True},
        "trogdor_agent_stdout_stderr": {
            "path": AGENT_STDOUT_STDERR,
            "collect_default": True},
        "trogdor_coordinator_log": {
            "path": COORDINATOR_LOG,
            "collect_default": True},
        "trogdor_agent_log": {
            "path": AGENT_LOG,
            "collect_default": True},
    }

    def __init__(self, context, agent_nodes=None, client_services=None,
                 agent_port=DEFAULT_AGENT_PORT, coordinator_port=DEFAULT_COORDINATOR_PORT):
        """
        Create a Trogdor service.

        :param context:             The test context.
        :param agent_nodes:         The nodes to run the agents on.
        :param client_services:     Services whose nodes we should run agents on.
        :param agent_port:          The port to use for the trogdor_agent daemons.
        :param coordinator_port:    The port to use for the trogdor_coordinator daemons.
        :raises RuntimeError:       If no agent node was supplied.
        """
        Service.__init__(self, context, num_nodes=1)
        # The one node allocated by the base class is the coordinator; all the
        # nodes appended below are borrowed agent nodes.
        self.coordinator_node = self.nodes[0]
        if client_services is not None:
            for client_service in client_services:
                self.nodes.extend(client_service.nodes)
        if agent_nodes is not None:
            self.nodes.extend(agent_nodes)
        if len(self.nodes) == 1:
            raise RuntimeError("You must supply at least one agent node to run the service on.")
        self.agent_port = agent_port
        self.coordinator_port = coordinator_port

    def free(self):
        """
        Release the coordinator node.  Calling this more than once is harmless,
        because coordinator_node is cleared on the first call.
        """
        # We only want to deallocate the coordinator node, not the agent nodes.  So we
        # change self.nodes to include only the coordinator node, and then invoke
        # the base class' free method.
        if self.coordinator_node is not None:
            self.nodes = [self.coordinator_node]
            self.coordinator_node = None
            Service.free(self)

    def _create_config_dict(self):
        """
        Create a dictionary with the Trogdor configuration.

        Each node is listed with its hostname, plus the coordinator port for the
        coordinator node or the agent port for every other node.

        :return:            The configuration dictionary.
        """
        dict_nodes = {}
        for node in self.nodes:
            dict_nodes[node.name] = {
                "hostname": node.account.ssh_hostname,
            }
            if node.name == self.coordinator_node.name:
                dict_nodes[node.name]["trogdor.coordinator.port"] = self.coordinator_port
            else:
                dict_nodes[node.name]["trogdor.agent.port"] = self.agent_port

        return {
            "platform": "org.apache.kafka.trogdor.basic.BasicPlatform",
            "nodes": dict_nodes,
        }

    def start_node(self, node):
        """
        Write the trogdor configuration file to the node and start the daemon
        which belongs on it (coordinator or agent).

        :param node:        The node to start.
        """
        node.account.mkdirs(TrogdorService.PERSISTENT_ROOT)

        # Create the configuration file on the node.
        config_json = json.dumps(self._create_config_dict(), indent=2)
        self.logger.info("Creating configuration file %s with %s" % (TrogdorService.CONFIG_PATH, config_json))
        node.account.create_file(TrogdorService.CONFIG_PATH, config_json)

        if self.is_coordinator(node):
            self._start_coordinator_node(node)
        else:
            self._start_agent_node(node)

    def _start_coordinator_node(self, node):
        """
        Render the coordinator's log4j configuration and start the coordinator daemon.

        :param node:        The coordinator node.
        """
        node.account.create_file(TrogdorService.COORDINATOR_LOG4J_PROPERTIES,
                                 self.render('log4j.properties',
                                             log_path=TrogdorService.COORDINATOR_LOG))
        self._start_trogdor_daemon("coordinator", TrogdorService.COORDINATOR_STDOUT_STDERR,
                                   TrogdorService.COORDINATOR_LOG4J_PROPERTIES,
                                   TrogdorService.COORDINATOR_LOG, node)
        self.logger.info("Started trogdor coordinator on %s." % node.name)

    def _start_agent_node(self, node):
        """
        Render the agent's log4j configuration and start the agent daemon.

        :param node:        The agent node.
        """
        node.account.create_file(TrogdorService.AGENT_LOG4J_PROPERTIES,
                                 self.render('log4j.properties',
                                             log_path=TrogdorService.AGENT_LOG))
        self._start_trogdor_daemon("agent", TrogdorService.AGENT_STDOUT_STDERR,
                                   TrogdorService.AGENT_LOG4J_PROPERTIES,
                                   TrogdorService.AGENT_LOG, node)
        self.logger.info("Started trogdor agent on %s." % node.name)

    def _start_trogdor_daemon(self, daemon_name, stdout_stderr_capture_path,
                              log4j_properties_path, log_path, node):
        """
        Launch a trogdor daemon in the background and wait for its startup log line.

        :param daemon_name:                 Either "coordinator" or "agent".
        :param stdout_stderr_capture_path:  The file to append the daemon's stdout and stderr to.
        :param log4j_properties_path:       The log4j properties file to pass through KAFKA_LOG4J_OPTS.
        :param log_path:                    The log file to watch for the startup message.
        :param node:                        The node to start the daemon on.
        """
        cmd = "export KAFKA_LOG4J_OPTS='-Dlog4j.configuration=file:%s'; " % log4j_properties_path
        cmd += "%s %s --%s.config %s --node-name %s 1>> %s 2>> %s &" % \
               (self.path.script("trogdor.sh", node),
                daemon_name,
                daemon_name,
                TrogdorService.CONFIG_PATH,
                node.name,
                stdout_stderr_capture_path,
                stdout_stderr_capture_path)
        node.account.ssh(cmd)
        # The command above returns immediately because of the trailing '&', so
        # watch the daemon's log to find out when it has actually come up.
        with node.account.monitor_log(log_path) as monitor:
            monitor.wait_until("Starting %s process." % daemon_name, timeout_sec=60, backoff_sec=.25,
                               err_msg=("%s on %s didn't finish startup" % (daemon_name, node.name)))

    def wait_node(self, node, timeout_sec=None):
        """
        Check whether the trogdor daemon on this node has exited.

        :param node:            The node to check.
        :param timeout_sec:     Accepted for interface compatibility; this method
                                performs a single check and does not use it.
        :return:                True if java_pids reports no process for the
                                node's daemon class; False otherwise.
        """
        if self.is_coordinator(node):
            return len(node.account.java_pids(self.coordinator_class_name())) == 0
        else:
            return len(node.account.java_pids(self.agent_class_name())) == 0

    def stop_node(self, node):
        """Halt trogdor processes on this node."""
        if self.is_coordinator(node):
            node.account.kill_java_processes(self.coordinator_class_name())
        else:
            node.account.kill_java_processes(self.agent_class_name())

    def clean_node(self, node):
        """Clean up persistent state on this node - e.g. service logs, configuration files etc."""
        self.stop_node(node)
        node.account.ssh("rm -rf -- %s" % TrogdorService.PERSISTENT_ROOT)

    def _coordinator_url(self, path):
        """
        Build the URL of a coordinator REST endpoint.

        :param path:            The URL path, relative to /coordinator/.
        :return:                The full http URL as a string.
        """
        return "http://%s:%d/coordinator/%s" % \
               (self.coordinator_node.account.ssh_hostname, self.coordinator_port, path)

    def request_session(self):
        """
        Creates a new request session which will retry for a while.

        The caller owns the returned session and should close it (for example by
        using it as a context manager) once it is no longer needed.
        """
        session = requests.Session()
        session.mount('http://',
                      HTTPAdapter(max_retries=Retry(total=4, backoff_factor=0.3)))
        return session

    def _coordinator_request(self, method, path, message):
        """
        Make a REST request to the Trogdor coordinator.

        :param method:          The HTTP method name, e.g. "GET", "POST" or "PUT".
        :param path:            The URL path to use.
        :param message:         The message object to send as the JSON body.
        :return:                The response as an object.
        :raises requests.exceptions.HTTPError:  If the coordinator returned an error status.
        """
        url = self._coordinator_url(path)
        self.logger.info("%s %s %s" % (method, url, message))
        # Close the session once the request has completed so that its pooled
        # connections are released instead of lingering until garbage collection.
        with self.request_session() as session:
            response = session.request(method, url, json=message,
                                       timeout=TrogdorService.REQUEST_TIMEOUT,
                                       headers=TrogdorService.REQUEST_HEADERS)
        response.raise_for_status()
        return response.json()

    def _coordinator_post(self, path, message):
        """
        Make a POST request to the Trogdor coordinator.

        :param path:            The URL path to use.
        :param message:         The message object to send.
        :return:                The response as an object.
        """
        return self._coordinator_request("POST", path, message)

    def _coordinator_put(self, path, message):
        """
        Make a PUT request to the Trogdor coordinator.

        :param path:            The URL path to use.
        :param message:         The message object to send.
        :return:                The response as an object.
        """
        return self._coordinator_request("PUT", path, message)

    def _coordinator_get(self, path, message):
        """
        Make a GET request to the Trogdor coordinator.

        :param path:            The URL path to use.
        :param message:         The message object to send.
        :return:                The response as an object.
        """
        return self._coordinator_request("GET", path, message)

    def create_task(self, id, spec):
        """
        Create a new task.

        :param id:          The task id.
        :param spec:        The task spec.
        :return:            A TrogdorTask handle for the new task.
        """
        self._coordinator_post("task/create", {"id": id, "spec": spec.message()})
        return TrogdorTask(id, self)

    def stop_task(self, id):
        """
        Stop a task.

        :param id:          The task id.
        """
        self._coordinator_put("task/stop", {"id": id})

    def tasks(self):
        """
        Get the tasks which are on the coordinator.

        :returns:           A map of task id strings to task state objects.
                            Task state objects contain a 'spec' field with the spec
                            and a 'state' field with the state.
        """
        return self._coordinator_get("tasks", {})

    def is_coordinator(self, node):
        """Return True if the given node is the coordinator node."""
        return node == self.coordinator_node

    def agent_class_name(self):
        """Return the Java class name of the trogdor agent."""
        return "org.apache.kafka.trogdor.agent.Agent"

    def coordinator_class_name(self):
        """Return the Java class name of the trogdor coordinator."""
        return "org.apache.kafka.trogdor.coordinator.Coordinator"

class TrogdorTask(object):
    """
    A handle to a task which has been created on the Trogdor coordinator.

    Attributes:
        PENDING_STATE       The state name of a task which is pending.
        RUNNING_STATE       The state name of a task which is running.
        STOPPING_STATE      The state name of a task which is stopping.
        DONE_STATE          The state name of a task which is done.
    """

    PENDING_STATE = "PENDING"
    RUNNING_STATE = "RUNNING"
    STOPPING_STATE = "STOPPING"
    DONE_STATE = "DONE"

    def __init__(self, id, trogdor):
        """
        Create a task handle.

        :param id:          The task id.
        :param trogdor:     The object used to query and stop the task; it must
                            provide tasks() and stop_task(id).
        """
        self.id = id
        self.trogdor = trogdor

    def done(self):
        """
        Check if this task is done.

        :raises RuntimeError:       If the coordinator does not know about the task,
                                    or if the task encountered an error.
        :returns:                   True if the task is in DONE_STATE;
                                    False if it is in a different state.
        """
        # Look the task up with get() so that an unknown id reaches the
        # RuntimeError below rather than escaping as a bare KeyError.
        task_state = self.trogdor.tasks()["tasks"].get(self.id)
        if task_state is None:
            raise RuntimeError("Coordinator did not know about %s." % self.id)
        error = task_state.get("error")
        if error is None or error == "":
            return task_state["state"] == TrogdorTask.DONE_STATE
        raise RuntimeError("Failed to gracefully stop %s: got task error: %s" % (self.id, error))

    def stop(self):
        """
        Stop this task.  Nothing is sent to the coordinator if the task is already done.

        :raises RuntimeError:       If the task encountered an error.
        """
        if self.done():
            return
        self.trogdor.stop_task(self.id)

    def wait_for_done(self, timeout_sec=360):
        """
        Block until this task is done.

        :param timeout_sec:         How long to wait for done() to become true, in seconds.
        :raises RuntimeError:       If the task encountered an error.
        """
        wait_until(self.done,
                   timeout_sec=timeout_sec,
                   err_msg="%s failed to finish in the expected amount of time." % self.id)