Browse Source
* Add rate limiting to tc * Feedback from PR * Add a sanity test for tc * Add iperf to vagrant scripts * Dynamically determine the network interface * Add some temp code for testing on AWS * Temp: use hostname instead of external IP * Temp: more AWS debugging * More AWS WIP * More AWS temp * Lower latency some * AWS wip * Trying this again now that ping should work * Add cluster decorator to tests * Fix broken import * Fix device name * Fix decorator arg * Remove errant import * Increase timeouts * Fix tbf command, relax assertion on latency test * Fix log line * Final bit of cleanup * Newline * Revert Trogdor retry count * PR feedback * More PR feedback * Feedback from PR * Remove unused argumentpull/7712/head
David Arthur
5 years ago
committed by
GitHub
10 changed files with 282 additions and 29 deletions
@ -0,0 +1,138 @@ |
|||||||
|
# Licensed to the Apache Software Foundation (ASF) under one or more |
||||||
|
# contributor license agreements. See the NOTICE file distributed with |
||||||
|
# this work for additional information regarding copyright ownership. |
||||||
|
# The ASF licenses this file to You under the Apache License, Version 2.0 |
||||||
|
# (the "License"); you may not use this file except in compliance with |
||||||
|
# the License. You may obtain a copy of the License at |
||||||
|
# |
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0 |
||||||
|
# |
||||||
|
# Unless required by applicable law or agreed to in writing, software |
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
# See the License for the specific language governing permissions and |
||||||
|
# limitations under the License. |
||||||
|
|
||||||
|
import re |
||||||
|
|
||||||
|
from ducktape.mark import parametrize |
||||||
|
from ducktape.mark.resource import cluster |
||||||
|
from ducktape.tests.test import Test |
||||||
|
from ducktape.utils.util import wait_until |
||||||
|
|
||||||
|
from kafkatest.services.trogdor.degraded_network_fault_spec import DegradedNetworkFaultSpec |
||||||
|
from kafkatest.services.trogdor.trogdor import TrogdorService |
||||||
|
from kafkatest.services.zookeeper import ZookeeperService |
||||||
|
|
||||||
|
|
||||||
|
class NetworkDegradeTest(Test): |
||||||
|
""" |
||||||
|
These tests ensure that the network degrade Trogdor specs (which use "tc") are working as expected in whatever |
||||||
|
environment the system tests may be running in. The linux tools "ping" and "iperf" are used for validation |
||||||
|
and need to be available along with "tc" in the test environment. |
||||||
|
""" |
||||||
|
|
||||||
|
def __init__(self, test_context): |
||||||
|
super(NetworkDegradeTest, self).__init__(test_context) |
||||||
|
self.zk = ZookeeperService(test_context, num_nodes=3) |
||||||
|
self.trogdor = TrogdorService(context=self.test_context, client_services=[self.zk]) |
||||||
|
|
||||||
|
def setUp(self): |
||||||
|
self.zk.start() |
||||||
|
self.trogdor.start() |
||||||
|
|
||||||
|
def teardown(self): |
||||||
|
self.trogdor.stop() |
||||||
|
self.zk.stop() |
||||||
|
|
||||||
|
@cluster(num_nodes=5) |
||||||
|
@parametrize(task_name="latency-100", device_name="eth0", latency_ms=50, rate_limit_kbit=0) |
||||||
|
@parametrize(task_name="latency-100-rate-1000", device_name="eth0", latency_ms=50, rate_limit_kbit=1000) |
||||||
|
def test_latency(self, task_name, device_name, latency_ms, rate_limit_kbit): |
||||||
|
spec = DegradedNetworkFaultSpec(0, 10000) |
||||||
|
for node in self.zk.nodes: |
||||||
|
spec.add_node_spec(node.name, device_name, latency_ms, rate_limit_kbit) |
||||||
|
|
||||||
|
latency = self.trogdor.create_task(task_name, spec) |
||||||
|
|
||||||
|
zk0 = self.zk.nodes[0] |
||||||
|
zk1 = self.zk.nodes[1] |
||||||
|
|
||||||
|
# Capture the ping times from the ping stdout |
||||||
|
# 64 bytes from ducker01 (172.24.0.2): icmp_seq=1 ttl=64 time=0.325 ms |
||||||
|
r = re.compile(r".*time=(?P<time>[\d.]+)\sms.*") |
||||||
|
|
||||||
|
times = [] |
||||||
|
for line in zk0.account.ssh_capture("ping -i 1 -c 20 %s" % zk1.account.hostname): |
||||||
|
self.logger.debug("Ping output: %s" % line) |
||||||
|
m = r.match(line) |
||||||
|
if m is not None and m.group("time"): |
||||||
|
times.append(float(m.group("time"))) |
||||||
|
self.logger.info("Parsed ping time of %d" % float(m.group("time"))) |
||||||
|
self.logger.debug("Captured ping times: %s" % times) |
||||||
|
|
||||||
|
# We expect to see some low ping times (before and after the task runs) as well as high ping times |
||||||
|
# (during the task). For the high time, it's twice the configured latency since both links apply the |
||||||
|
# rule, 80% for a little variance buffer |
||||||
|
high_time_ms = 0.8 * 2 * latency_ms |
||||||
|
low_time_ms = 10 |
||||||
|
slow_times = [t for t in times if t > high_time_ms] |
||||||
|
fast_times = [t for t in times if t < low_time_ms] |
||||||
|
|
||||||
|
latency.stop() |
||||||
|
latency.wait_for_done() |
||||||
|
|
||||||
|
# We captured 20 ping times. Assert that at least 5 were "fast" and 5 were "slow" |
||||||
|
assert len(slow_times) > 5, "Expected to see more slow ping times (lower than %d)" % low_time_ms |
||||||
|
assert len(fast_times) > 5, "Expected to see more fast ping times (higher than %d)" % high_time_ms |
||||||
|
|
||||||
|
@cluster(num_nodes=5) |
||||||
|
@parametrize(task_name="rate-1000", device_name="eth0", latency_ms=0, rate_limit_kbit=1000000) |
||||||
|
@parametrize(task_name="rate-1000-latency-50", device_name="eth0", latency_ms=50, rate_limit_kbit=1000000) |
||||||
|
def test_rate(self, task_name, device_name, latency_ms, rate_limit_kbit): |
||||||
|
zk0 = self.zk.nodes[0] |
||||||
|
zk1 = self.zk.nodes[1] |
||||||
|
|
||||||
|
spec = DegradedNetworkFaultSpec(0, 60000) |
||||||
|
spec.add_node_spec(zk0.name, device_name, latency_ms, rate_limit_kbit) |
||||||
|
|
||||||
|
# start the task and wait |
||||||
|
rate_limit = self.trogdor.create_task(task_name, spec) |
||||||
|
wait_until(lambda: rate_limit.running(), |
||||||
|
timeout_sec=10, |
||||||
|
err_msg="%s failed to start within 10 seconds." % rate_limit) |
||||||
|
|
||||||
|
# Run iperf server on zk1, iperf client on zk0 |
||||||
|
iperf_server = zk1.account.ssh_capture("iperf -s") |
||||||
|
|
||||||
|
# Capture the measured kbps between the two nodes. |
||||||
|
# [ 3] 0.0- 1.0 sec 2952576 KBytes 24187503 Kbits/sec |
||||||
|
r = re.compile(r"^.*\s(?P<rate>[\d.]+)\sKbits/sec$") |
||||||
|
|
||||||
|
measured_rates = [] |
||||||
|
for line in zk0.account.ssh_capture("iperf -i 1 -t 20 -f k -c %s" % zk1.account.hostname): |
||||||
|
self.logger.info("iperf output %s" % line) |
||||||
|
m = r.match(line) |
||||||
|
if m is not None: |
||||||
|
measured_rate = float(m.group("rate")) |
||||||
|
measured_rates.append(measured_rate) |
||||||
|
self.logger.info("Parsed rate of %d kbit/s from iperf" % measured_rate) |
||||||
|
|
||||||
|
# kill iperf server and consume the stdout to ensure clean exit |
||||||
|
zk1.account.kill_process("iperf") |
||||||
|
for _ in iperf_server: |
||||||
|
continue |
||||||
|
|
||||||
|
rate_limit.stop() |
||||||
|
rate_limit.wait_for_done() |
||||||
|
|
||||||
|
self.logger.info("Measured rates: %s" % measured_rates) |
||||||
|
|
||||||
|
# We expect to see measured rates within an order of magnitude of our target rate |
||||||
|
low_kbps = rate_limit_kbit / 10 |
||||||
|
high_kbps = rate_limit_kbit * 10 |
||||||
|
acceptable_rates = [r for r in measured_rates if low_kbps < r < high_kbps] |
||||||
|
|
||||||
|
msg = "Expected most of the measured rates to be within an order of magnitude of target %d." % rate_limit_kbit |
||||||
|
msg += " This means `tc` did not limit the bandwidth as expected." |
||||||
|
assert len(acceptable_rates) > 5, msg |
Loading…
Reference in new issue