
Improve batching test code
* Reduce the batching test to only a few containers that should be
  directly connected, and assert that a direct connection is established
* Remove the majority of peers and operating systems, such as Windows
  and macOS, to simplify local testing
* Add a histogram based on packet timestamp differences, in addition
  to the already existing distribution histogram
* Replace random misalignment with constant sleeping

The batching test currently uses only a distribution histogram, which
counts packets falling into buckets with a granularity of 1 second.

While this provides nice visual insight, it lacks precision: two packets
with timestamps of 0.999 and 1.001 would fall into two separate buckets.
Moreover, to derive any metric of "intervals" we would have to calculate
streaks of silence, which are imprecise.
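To illustrate the bucketing problem (a toy example, not the actual
nat-lab code):

```python
# Toy example: flooring timestamps into 1-second distribution buckets.
timestamps = [0.999, 1.001]  # two packets only 2 ms apart
buckets = [int(ts) for ts in timestamps]  # floor to the containing second
print(buckets)  # [0, 1] -- the pair is split across two separate buckets
```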

A histogram of the differences between consecutive packet timestamps
should be much more precise and provide better insights.
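A minimal sketch of the idea, assuming simple fixed-width buckets (the
real implementation lives in utils.traffic and may differ):

```python
from typing import List

def delay_histogram(timestamps: List[float], buckets: int, width_s: float = 1.0) -> List[int]:
    """Bucket the deltas between consecutive packet timestamps."""
    hs = [0] * buckets
    for prev, cur in zip(timestamps, timestamps[1:]):
        delta = cur - prev
        idx = min(int(delta / width_s), buckets - 1)  # clamp outliers into the last bucket
        hs[idx] += 1
    return hs

# The 0.999/1.001 pair now contributes a single 0.002 s delay to the
# first bucket instead of being split across two distribution buckets.
print(delay_histogram([0.999, 1.001], buckets=4))  # [1, 0, 0, 0]
```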

Signed-off-by: Lukas Pukenis <[email protected]>
LukasPukenis committed Nov 29, 2024
1 parent 03cff7b commit 03102fa
Showing 10 changed files with 285 additions and 232 deletions.
3 changes: 2 additions & 1 deletion nat-lab/pyproject.toml
@@ -85,7 +85,8 @@ markers = [
"ipv4: tests only ipv4 WG connectivity",
"ipv6: tests only ipv6 WG connectivity",
"ipv4v6: tests dual stack WG connectivity",
"batching: tests packet batching"
"batching: tests packet batching",
"utils: tests the natlab utilities",
]
filterwarnings = [
"ignore::DeprecationWarning"
3 changes: 3 additions & 0 deletions nat-lab/run_local.py
@@ -58,6 +58,9 @@ def main() -> int:
parser.add_argument(
"--linux-native", action="store_true", help="Run tests with 'linux_native' mark"
)
parser.add_argument(
"--utils", action="store_true", help="Run tests with 'utils' mark"
)
parser.add_argument("--nobuild", action="store_true", help="Don't build TCLI")
parser.add_argument("--notests", action="store_true", help="Don't run tests")
parser.add_argument(
179 changes: 83 additions & 96 deletions nat-lab/tests/test_batching.py
@@ -1,57 +1,60 @@
import asyncio
import itertools
import pytest
import random
from contextlib import AsyncExitStack
from helpers import SetupParameters, setup_environment
from itertools import zip_longest
from scapy.layers.inet import TCP, UDP # type: ignore
from scapy.layers.inet import TCP, UDP, ICMP # type: ignore
from scapy.layers.l2 import ARP # type: ignore
from timeouts import TEST_BATCHING_TIMEOUT
from typing import List, Tuple
from utils.batching import (
capture_traffic,
print_histogram,
generate_histogram_from_pcap,
)
from typing import List
from utils.asyncio_util import run_async_context
from utils.bindings import (
features_with_endpoint_providers,
FeatureLinkDetection,
FeaturePersistentKeepalive,
FeatureBatching,
EndpointProvider,
RelayState,
NodeState,
PathType,
TelioAdapterType,
)
from utils.connection import DockerConnection
from utils.connection_util import DOCKER_GW_MAP, ConnectionTag, container_id
from utils.traffic import (
capture_traffic,
render_chart,
generate_packet_distribution_histogram,
generate_packet_delay_histogram,
)

BATCHING_MISALIGN_RANGE = (0, 5) # Seconds to sleep for peers before starting
BATCHING_CAPTURE_TIME = 240 # Tied to TEST_BATCHING_TIMEOUT
BATCHING_MISALIGN_S = 7
BATCHING_CAPTURE_TIME = 120 # Tied to TEST_BATCHING_TIMEOUT


def _generate_setup_parameters(
conn_tag: ConnectionTag, adapter: TelioAdapterType, batching: bool
) -> SetupParameters:
features = features_with_endpoint_providers(
[EndpointProvider.UPNP, EndpointProvider.LOCAL, EndpointProvider.STUN]
)
features = features_with_endpoint_providers([EndpointProvider.STUN])

features.link_detection = FeatureLinkDetection(
rtt_seconds=1, no_of_pings=1, use_for_downgrade=True
)
features.batching = (
FeatureBatching(
direct_connection_threshold=35,
direct_connection_threshold=15,
trigger_effective_duration=10,
trigger_cooldown_duration=60,
)
if batching
else None
)
features.wireguard.persistent_keepalive = FeaturePersistentKeepalive(
direct=70,
proxying=70,
stun=70,
vpn=70,
direct=30,
proxying=30,
stun=30,
vpn=30,
)

return SetupParameters(
@@ -70,58 +73,6 @@ def _generate_setup_parameters(
ConnectionTag.DOCKER_CONE_CLIENT_2,
TelioAdapterType.LINUX_NATIVE_TUN,
),
(
ConnectionTag.DOCKER_SYMMETRIC_CLIENT_1,
TelioAdapterType.LINUX_NATIVE_TUN,
),
(
ConnectionTag.DOCKER_SYMMETRIC_CLIENT_2,
TelioAdapterType.LINUX_NATIVE_TUN,
),
(
ConnectionTag.DOCKER_UPNP_CLIENT_1,
TelioAdapterType.LINUX_NATIVE_TUN,
),
(
ConnectionTag.DOCKER_UPNP_CLIENT_2,
TelioAdapterType.LINUX_NATIVE_TUN,
),
(
ConnectionTag.DOCKER_SHARED_CLIENT_1,
TelioAdapterType.LINUX_NATIVE_TUN,
),
(
ConnectionTag.DOCKER_OPEN_INTERNET_CLIENT_1,
TelioAdapterType.LINUX_NATIVE_TUN,
),
(
ConnectionTag.DOCKER_OPEN_INTERNET_CLIENT_2,
TelioAdapterType.LINUX_NATIVE_TUN,
),
(
ConnectionTag.DOCKER_OPEN_INTERNET_CLIENT_DUAL_STACK,
TelioAdapterType.LINUX_NATIVE_TUN,
),
(
ConnectionTag.DOCKER_UDP_BLOCK_CLIENT_1,
TelioAdapterType.LINUX_NATIVE_TUN,
),
(
ConnectionTag.DOCKER_UDP_BLOCK_CLIENT_2,
TelioAdapterType.LINUX_NATIVE_TUN,
),
(
ConnectionTag.DOCKER_INTERNAL_SYMMETRIC_CLIENT,
TelioAdapterType.LINUX_NATIVE_TUN,
),
(ConnectionTag.DOCKER_FULLCONE_CLIENT_1, TelioAdapterType.LINUX_NATIVE_TUN),
(ConnectionTag.DOCKER_FULLCONE_CLIENT_2, TelioAdapterType.LINUX_NATIVE_TUN),
(
ConnectionTag.MAC_VM,
TelioAdapterType.NEP_TUN,
),
(ConnectionTag.WINDOWS_VM_1, TelioAdapterType.WINDOWS_NATIVE_TUN),
(ConnectionTag.WINDOWS_VM_2, TelioAdapterType.WIREGUARD_GO_TUN),
]
# This test captures histograms of network activity to evaluate the effect of local batching in libtelio.
# Since only local batching is implemented, no client-generated traffic should occur during the test.
@@ -132,44 +83,38 @@ def _generate_setup_parameters(
# not do anything about syncing the keepalives between the peers.


# TODO: Add asserts for local batching
# TODO: Implement received-data-trigger batching
@pytest.mark.asyncio
@pytest.mark.timeout(TEST_BATCHING_TIMEOUT)
@pytest.mark.parametrize(
"setup_params,misalign_sleep_range,capture_duration",
"setup_params,misalign_sleep_s,capture_duration",
[
pytest.param(
[
_generate_setup_parameters(conn_tag, adapter, False)
_generate_setup_parameters(conn_tag, adapter, True)
for conn_tag, adapter in ALL_NODES
],
BATCHING_MISALIGN_RANGE,
BATCHING_MISALIGN_S,
BATCHING_CAPTURE_TIME,
marks=[
pytest.mark.batching,
pytest.mark.mac,
pytest.mark.windows,
],
),
pytest.param(
[
_generate_setup_parameters(conn_tag, adapter, True)
_generate_setup_parameters(conn_tag, adapter, False)
for conn_tag, adapter in ALL_NODES
],
BATCHING_MISALIGN_RANGE,
BATCHING_MISALIGN_S,
BATCHING_CAPTURE_TIME,
marks=[
pytest.mark.batching,
pytest.mark.mac,
pytest.mark.windows,
],
),
],
)
async def test_batching(
setup_params: List[SetupParameters],
misalign_sleep_range: Tuple[int, int],
misalign_sleep_s: int,
capture_duration: int,
) -> None:
async with AsyncExitStack() as exit_stack:
@@ -208,29 +153,56 @@ async def test_batching(
await client.stop_device()

# misalign the peers by sleeping some before starting each node again
async def start_node_manually(client, node, sleep_min: int, sleep_max: int):
await asyncio.sleep(random.randint(sleep_min, sleep_max))
async def start_node_manually(client, node, sleep_s):
await asyncio.sleep(sleep_s)
await client.simple_start()
await client.set_meshnet_config(env.api.get_meshnet_config(node.id))

await asyncio.gather(*[
start_node_manually(
client, node, misalign_sleep_range[0], misalign_sleep_range[1]
start_node_manually(client, node, i * misalign_sleep_s)
for i, (client, node) in enumerate(cnodes)
])

await asyncio.gather(*[
await exit_stack.enter_async_context(
run_async_context(
client.wait_for_state_peer(
node.public_key, [NodeState.CONNECTED], [PathType.DIRECT]
)
)
)
for client, node in cnodes
for client, node in itertools.product(env.clients, env.nodes)
if not client.is_node(node)
])

pyro5_ports = [
int(port) for port in {client.get_proxy_port() for client in env.clients}
]

print("Pyro ports", pyro5_ports)
# In general it's not great to filter traffic, but for testing and observing
# it's crucial, since unrelated traffic distorts the results. For example,
# Pyro traffic is a constant stream of TCP packets
allow_pcap_filters = [
(
"IP46 + No Pyro5 traffic",
"No Pyro5, SSDP, ARP",
lambda p: (
(p.haslayer(UDP) or p.haslayer(TCP))
and p.sport not in pyro5_ports
and p.dport not in pyro5_ports
(
(p.haslayer(UDP) or p.haslayer(TCP))
and p.sport not in pyro5_ports
and p.dport not in pyro5_ports
)
and (
not p.haslayer(ICMP)
or p.haslayer(ICMP)
and p[ICMP].type in [0, 8]
)
and (
p.haslayer(UDP)
and p[UDP].sport != 1900
and p[UDP].dport != 1900
)
and (not p.haslayer(ARP))
),
),
]
Expand All @@ -247,9 +219,24 @@ async def start_node_manually(client, node, sleep_min: int, sleep_max: int):

pcap_paths = await asyncio.gather(*pcap_capture_tasks)

is_batching_enabled = env.clients[0].get_features().batching is not None
for container, pcap_path in zip(container_names, pcap_paths):
for filt in allow_pcap_filters:
filter_name = filt[0]
hs = generate_histogram_from_pcap(pcap_path, capture_duration, filt[1])
title = f"{container}-filter({filter_name})"
print_histogram(title, hs, max_height=12)
distribution_hs = generate_packet_distribution_histogram(
pcap_path, capture_duration, allow_pcap_filters
)
delay_hs = generate_packet_delay_histogram(
pcap_path, capture_duration, allow_pcap_filters
)

batch_str = "batch" if is_batching_enabled else "nobatch"

print(f"*** {container}-{batch_str} ***")

distribution_chart = render_chart(distribution_hs)
delay_chart = render_chart(delay_hs)

print("Distribution chart below")
print(distribution_chart)

print("Delay chart below")
print(delay_chart)
2 changes: 1 addition & 1 deletion nat-lab/tests/timeouts.py
@@ -12,4 +12,4 @@
TEST_NODE_STATE_FLICKERING_RELAY_TIMEOUT = 180
TEST_NODE_STATE_FLICKERING_DIRECT_TIMEOUT = 180
TEST_MESH_STATE_AFTER_DISCONNECTING_NODE_TIMEOUT = 300
TEST_BATCHING_TIMEOUT = 600
TEST_BATCHING_TIMEOUT = 1000
1 change: 1 addition & 0 deletions nat-lab/tests/utils/__init__.py
@@ -0,0 +1 @@
from .traffic import *