# Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer from collections import defaultdict, namedtuple import json from threading import Thread from select import select import socket MetricKey = namedtuple('MetricKey', ['host', 'client_id', 'name', 'group', 'tags']) MetricValue = namedtuple('MetricValue', ['time', 'value']) # Python's logging library doesn't define anything more detailed than DEBUG, but we'd like a finer-grained setting for # for highly detailed messages, e.g. logging every single incoming request. TRACE = 5 class HttpMetricsCollector(object): """ HttpMetricsCollector enables collection of metrics from various Kafka clients instrumented with the PushHttpMetricsReporter. It starts a web server locally and provides the necessary configuration for clients to automatically report metrics data to this server. It also provides basic functionality for querying the recorded metrics. This class can be used either as a mixin or standalone object. """ # The port to listen on on the worker node, which will be forwarded to the port listening on this driver node REMOTE_PORT = 6789 def __init__(self, **kwargs): """ Create a new HttpMetricsCollector :param period the period, in seconds, between updates that the metrics reporter configuration should define. defaults to reporting once per second :param args: :param kwargs: """ self._http_metrics_period = kwargs.pop('period', 1) super(HttpMetricsCollector, self).__init__(**kwargs) # TODO: currently we maintain just a simple map from all key info -> value. However, some key fields are far # more common to filter on, so we'd want to index by them, e.g. host, client.id, metric name. self._http_metrics = defaultdict(list) self._httpd = HTTPServer(('', 0), _MetricsReceiver) self._httpd.parent = self self._httpd.metrics = self._http_metrics self._http_metrics_thread = Thread(target=self._run_http_metrics_httpd, name='http-metrics-thread[%s]' % str(self)) self._http_metrics_thread.start() self._forwarders = {} @property def http_metrics_url(self): """ :return: the URL to use when reporting metrics """ return "http://%s:%d" % ("localhost", self.REMOTE_PORT) @property def http_metrics_client_configs(self): """ Get client configurations that can be used to report data to this collector. Put these in a properties file for clients (e.g. console producer or consumer) to have them push metrics to this driver. Note that in some cases (e.g. streams, connect) these settings may need to be prefixed. :return: a dictionary of client configurations that will direct a client to report metrics to this collector """ return { "metric.reporters": "org.apache.kafka.tools.PushHttpMetricsReporter", "metrics.url": self.http_metrics_url, "metrics.period": self._http_metrics_period, } def start_node(self, node): local_port = self._httpd.socket.getsockname()[1] self.logger.debug('HttpMetricsCollector listening on %s', local_port) self._forwarders[self.idx(node)] = _ReverseForwarder(self.logger, node, self.REMOTE_PORT, local_port) super(HttpMetricsCollector, self).start_node(node) def stop(self): super(HttpMetricsCollector, self).stop() if self._http_metrics_thread: self.logger.debug("Shutting down metrics httpd") self._httpd.shutdown() self._http_metrics_thread.join() self.logger.debug("Finished shutting down metrics httpd") def stop_node(self, node): super(HttpMetricsCollector, self).stop_node(node) idx = self.idx(node) self._forwarders[idx].stop() del self._forwarders[idx] def metrics(self, host=None, client_id=None, name=None, group=None, tags=None): """ Get any collected metrics that match the specified parameters, yielding each as a tuple of (key, [, ...]) values. """ for k, values in self._http_metrics.iteritems(): if ((host is None or host == k.host) and (client_id is None or client_id == k.client_id) and (name is None or name == k.name) and (group is None or group == k.group) and (tags is None or tags == k.tags)): yield (k, values) def _run_http_metrics_httpd(self): self._httpd.serve_forever() class _MetricsReceiver(BaseHTTPRequestHandler): """ HTTP request handler that accepts requests from the PushHttpMetricsReporter and stores them back into the parent HttpMetricsCollector """ def log_message(self, format, *args, **kwargs): # Don't do any logging here so we get rid of the mostly useless per-request Apache log-style info that spams # the debug log pass def do_POST(self): data = self.rfile.read(int(self.headers['Content-Length'])) data = json.loads(data) self.server.parent.logger.log(TRACE, "POST %s\n\n%s\n%s", self.path, self.headers, json.dumps(data, indent=4, separators=(',', ': '))) self.send_response(204) self.end_headers() client = data['client'] host = client['host'] client_id = client['client_id'] ts = client['time'] metrics = data['metrics'] for raw_metric in metrics: name = raw_metric['name'] group = raw_metric['group'] # Convert to tuple of pairs because dicts & lists are unhashable tags = tuple([(k, v) for k, v in raw_metric['tags'].iteritems()]), value = raw_metric['value'] key = MetricKey(host=host, client_id=client_id, name=name, group=group, tags=tags) metric_value = MetricValue(time=ts, value=value) self.server.metrics[key].append(metric_value) class _ReverseForwarder(object): """ Runs reverse forwarding of a port on a node to a local port. This allows you to setup a server on the test driver that only assumes we have basic SSH access that ducktape guarantees is available for worker nodes. """ def __init__(self, logger, node, remote_port, local_port): self.logger = logger self._node = node self._local_port = local_port self._remote_port = remote_port self.logger.debug('Forwarding %s port %d to driver port %d', node, remote_port, local_port) self._stopping = False self._transport = node.account.ssh_client.get_transport() self._transport.request_port_forward('', remote_port) self._accept_thread = Thread(target=self._accept) self._accept_thread.start() def stop(self): self._stopping = True self._accept_thread.join(30) if self._accept_thread.isAlive(): raise RuntimeError("Failed to stop reverse forwarder on %s", self._node) self._transport.cancel_port_forward('', self._remote_port) def _accept(self): while not self._stopping: chan = self._transport.accept(1) if chan is None: continue thr = Thread(target=self._handler, args=(chan,)) thr.setDaemon(True) thr.start() def _handler(self, chan): sock = socket.socket() try: sock.connect(("localhost", self._local_port)) except Exception as e: self.logger.error('Forwarding request to port %d failed: %r', self._local_port, e) return self.logger.log(TRACE, 'Connected! Tunnel open %r -> %r -> %d', chan.origin_addr, chan.getpeername(), self._local_port) while True: r, w, x = select([sock, chan], [], []) if sock in r: data = sock.recv(1024) if len(data) == 0: break chan.send(data) if chan in r: data = chan.recv(1024) if len(data) == 0: break sock.send(data) chan.close() sock.close() self.logger.log(TRACE, 'Tunnel closed from %r', chan.origin_addr)