
Commit

Merge pull request NordSecurity#969 from NordSecurity/LLT-5797-batch-test-improvements

LLT-5797-batch-test-improvements
LukasPukenis authored Nov 29, 2024
2 parents e949d77 + 03102fa commit b6aff68
Showing 10 changed files with 285 additions and 232 deletions.
Empty file.
3 changes: 2 additions & 1 deletion nat-lab/pyproject.toml
@@ -85,7 +85,8 @@ markers = [
"ipv4: tests only ipv4 WG connectivity",
"ipv6: tests only ipv6 WG connectivity",
"ipv4v6: tests dual stack WG connectivity",
"batching: tests packet batching"
"batching: tests packet batching",
"utils: tests the natlab utilities",
]
filterwarnings = [
"ignore::DeprecationWarning"
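For context, registering the "utils" marker in pyproject.toml lets the new utility tests be selected without unknown-mark warnings. A minimal illustrative sketch of how such a test would be tagged (the test name and body are hypothetical, not part of this change):

    import pytest

    @pytest.mark.utils  # selected via `pytest -m utils`, matching the marker registered above
    def test_some_natlab_utility():
        assert True  # hypothetical placeholder body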
3 changes: 3 additions & 0 deletions nat-lab/run_local.py
@@ -58,6 +58,9 @@ def main() -> int:
parser.add_argument(
"--linux-native", action="store_true", help="Run tests with 'linux_native' mark"
)
parser.add_argument(
"--utils", action="store_true", help="Run tests with 'utils' mark"
)
parser.add_argument("--nobuild", action="store_true", help="Don't build TCLI")
parser.add_argument("--notests", action="store_true", help="Don't run tests")
parser.add_argument(
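The flag-to-marker translation is outside the lines shown here, but the new --utils switch presumably feeds a pytest marker expression, like the existing mark-selection flags. A hedged sketch of that mapping (function name and structure are assumptions, not code from this PR):

    # Hypothetical sketch: build a pytest `-m` expression from the parsed flags.
    def marker_expression(args) -> list:
        marks = []
        if getattr(args, "utils", False):
            marks.append("utils")
        if getattr(args, "linux_native", False):
            marks.append("linux_native")
        return ["-m", " or ".join(marks)] if marks else []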
179 changes: 83 additions & 96 deletions nat-lab/tests/test_batching.py
@@ -1,57 +1,60 @@
import asyncio
import itertools
import pytest
import random
from contextlib import AsyncExitStack
from helpers import SetupParameters, setup_environment
from itertools import zip_longest
from scapy.layers.inet import TCP, UDP # type: ignore
from scapy.layers.inet import TCP, UDP, ICMP # type: ignore
from scapy.layers.l2 import ARP # type: ignore
from timeouts import TEST_BATCHING_TIMEOUT
from typing import List, Tuple
from utils.batching import (
capture_traffic,
print_histogram,
generate_histogram_from_pcap,
)
from typing import List
from utils.asyncio_util import run_async_context
from utils.bindings import (
features_with_endpoint_providers,
FeatureLinkDetection,
FeaturePersistentKeepalive,
FeatureBatching,
EndpointProvider,
RelayState,
NodeState,
PathType,
TelioAdapterType,
)
from utils.connection import DockerConnection
from utils.connection_util import DOCKER_GW_MAP, ConnectionTag, container_id
from utils.traffic import (
capture_traffic,
render_chart,
generate_packet_distribution_histogram,
generate_packet_delay_histogram,
)

BATCHING_MISALIGN_RANGE = (0, 5) # Seconds to sleep for peers before starting
BATCHING_CAPTURE_TIME = 240 # Tied to TEST_BATCHING_TIMEOUT
BATCHING_MISALIGN_S = 7
BATCHING_CAPTURE_TIME = 120 # Tied to TEST_BATCHING_TIMEOUT


def _generate_setup_parameters(
conn_tag: ConnectionTag, adapter: TelioAdapterType, batching: bool
) -> SetupParameters:
features = features_with_endpoint_providers(
[EndpointProvider.UPNP, EndpointProvider.LOCAL, EndpointProvider.STUN]
)
features = features_with_endpoint_providers([EndpointProvider.STUN])

features.link_detection = FeatureLinkDetection(
rtt_seconds=1, no_of_pings=1, use_for_downgrade=True
)
features.batching = (
FeatureBatching(
direct_connection_threshold=35,
direct_connection_threshold=15,
trigger_effective_duration=10,
trigger_cooldown_duration=60,
)
if batching
else None
)
features.wireguard.persistent_keepalive = FeaturePersistentKeepalive(
direct=70,
proxying=70,
stun=70,
vpn=70,
direct=30,
proxying=30,
stun=30,
vpn=30,
)

return SetupParameters(
@@ -70,58 +73,6 @@ def _generate_setup_parameters(
ConnectionTag.DOCKER_CONE_CLIENT_2,
TelioAdapterType.LINUX_NATIVE_TUN,
),
(
ConnectionTag.DOCKER_SYMMETRIC_CLIENT_1,
TelioAdapterType.LINUX_NATIVE_TUN,
),
(
ConnectionTag.DOCKER_SYMMETRIC_CLIENT_2,
TelioAdapterType.LINUX_NATIVE_TUN,
),
(
ConnectionTag.DOCKER_UPNP_CLIENT_1,
TelioAdapterType.LINUX_NATIVE_TUN,
),
(
ConnectionTag.DOCKER_UPNP_CLIENT_2,
TelioAdapterType.LINUX_NATIVE_TUN,
),
(
ConnectionTag.DOCKER_SHARED_CLIENT_1,
TelioAdapterType.LINUX_NATIVE_TUN,
),
(
ConnectionTag.DOCKER_OPEN_INTERNET_CLIENT_1,
TelioAdapterType.LINUX_NATIVE_TUN,
),
(
ConnectionTag.DOCKER_OPEN_INTERNET_CLIENT_2,
TelioAdapterType.LINUX_NATIVE_TUN,
),
(
ConnectionTag.DOCKER_OPEN_INTERNET_CLIENT_DUAL_STACK,
TelioAdapterType.LINUX_NATIVE_TUN,
),
(
ConnectionTag.DOCKER_UDP_BLOCK_CLIENT_1,
TelioAdapterType.LINUX_NATIVE_TUN,
),
(
ConnectionTag.DOCKER_UDP_BLOCK_CLIENT_2,
TelioAdapterType.LINUX_NATIVE_TUN,
),
(
ConnectionTag.DOCKER_INTERNAL_SYMMETRIC_CLIENT,
TelioAdapterType.LINUX_NATIVE_TUN,
),
(ConnectionTag.DOCKER_FULLCONE_CLIENT_1, TelioAdapterType.LINUX_NATIVE_TUN),
(ConnectionTag.DOCKER_FULLCONE_CLIENT_2, TelioAdapterType.LINUX_NATIVE_TUN),
(
ConnectionTag.MAC_VM,
TelioAdapterType.NEP_TUN,
),
(ConnectionTag.WINDOWS_VM_1, TelioAdapterType.WINDOWS_NATIVE_TUN),
(ConnectionTag.WINDOWS_VM_2, TelioAdapterType.WIREGUARD_GO_TUN),
]
# This test captures histograms of network activity to evaluate the effect of local batching in libtelio.
# Since only local batching is implemented, no client-generated traffic should occur during the test.
@@ -132,44 +83,38 @@ def _generate_setup_parameters(
# not do anything about syncing the keepalives between the peers.


# TODO: Add asserts for local batching
# TODO: Implement received-data-trigger batching
@pytest.mark.asyncio
@pytest.mark.timeout(TEST_BATCHING_TIMEOUT)
@pytest.mark.parametrize(
"setup_params,misalign_sleep_range,capture_duration",
"setup_params,misalign_sleep_s,capture_duration",
[
pytest.param(
[
_generate_setup_parameters(conn_tag, adapter, False)
_generate_setup_parameters(conn_tag, adapter, True)
for conn_tag, adapter in ALL_NODES
],
BATCHING_MISALIGN_RANGE,
BATCHING_MISALIGN_S,
BATCHING_CAPTURE_TIME,
marks=[
pytest.mark.batching,
pytest.mark.mac,
pytest.mark.windows,
],
),
pytest.param(
[
_generate_setup_parameters(conn_tag, adapter, True)
_generate_setup_parameters(conn_tag, adapter, False)
for conn_tag, adapter in ALL_NODES
],
BATCHING_MISALIGN_RANGE,
BATCHING_MISALIGN_S,
BATCHING_CAPTURE_TIME,
marks=[
pytest.mark.batching,
pytest.mark.mac,
pytest.mark.windows,
],
),
],
)
async def test_batching(
setup_params: List[SetupParameters],
misalign_sleep_range: Tuple[int, int],
misalign_sleep_s: int,
capture_duration: int,
) -> None:
async with AsyncExitStack() as exit_stack:
@@ -208,29 +153,56 @@ async def test_batching(
await client.stop_device()

# misalign the peers by sleeping some before starting each node again
async def start_node_manually(client, node, sleep_min: int, sleep_max: int):
await asyncio.sleep(random.randint(sleep_min, sleep_max))
async def start_node_manually(client, node, sleep_s):
await asyncio.sleep(sleep_s)
await client.simple_start()
await client.set_meshnet_config(env.api.get_meshnet_config(node.id))

await asyncio.gather(*[
start_node_manually(
client, node, misalign_sleep_range[0], misalign_sleep_range[1]
start_node_manually(client, node, i * misalign_sleep_s)
for i, (client, node) in enumerate(cnodes)
])

await asyncio.gather(*[
await exit_stack.enter_async_context(
run_async_context(
client.wait_for_state_peer(
node.public_key, [NodeState.CONNECTED], [PathType.DIRECT]
)
)
)
for client, node in cnodes
for client, node in itertools.product(env.clients, env.nodes)
if not client.is_node(node)
])

pyro5_ports = [
int(port) for port in {client.get_proxy_port() for client in env.clients}
]

print("Pyro ports", pyro5_ports)
# In general it's not great to filter traffic but for testing and observing
# it's crucial since it distorts the results. For example Pyro traffic is a constant stream of
# TCP packets
allow_pcap_filters = [
(
"IP46 + No Pyro5 traffic",
"No Pyro5, SSDP, ARP",
lambda p: (
(p.haslayer(UDP) or p.haslayer(TCP))
and p.sport not in pyro5_ports
and p.dport not in pyro5_ports
(
(p.haslayer(UDP) or p.haslayer(TCP))
and p.sport not in pyro5_ports
and p.dport not in pyro5_ports
)
and (
not p.haslayer(ICMP)
or p.haslayer(ICMP)
and p[ICMP].type in [0, 8]
)
and (
p.haslayer(UDP)
and p[UDP].sport != 1900
and p[UDP].dport != 1900
)
and (not p.haslayer(ARP))
),
),
]
Expand All @@ -247,9 +219,24 @@ async def start_node_manually(client, node, sleep_min: int, sleep_max: int):

pcap_paths = await asyncio.gather(*pcap_capture_tasks)

is_batching_enabled = env.clients[0].get_features().batching is not None
for container, pcap_path in zip(container_names, pcap_paths):
for filt in allow_pcap_filters:
filter_name = filt[0]
hs = generate_histogram_from_pcap(pcap_path, capture_duration, filt[1])
title = f"{container}-filter({filter_name})"
print_histogram(title, hs, max_height=12)
distribution_hs = generate_packet_distribution_histogram(
pcap_path, capture_duration, allow_pcap_filters
)
delay_hs = generate_packet_delay_histogram(
pcap_path, capture_duration, allow_pcap_filters
)

batch_str = "batch" if is_batching_enabled else "nobatch"

print(f"*** {container}-{batch_str} ***")

distribution_chart = render_chart(distribution_hs)
delay_chart = render_chart(delay_hs)

print("Distribution chart below")
print(distribution_chart)

print("Delay chart below")
print(delay_chart)
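The allow filters used above are plain per-packet predicates handed to the traffic helpers. As a rough illustration of the same pattern applied directly to a capture with scapy (the pcap path, port set, and helper name below are assumptions, not part of this PR):

    from scapy.all import rdpcap  # scapy is already a nat-lab test dependency
    from scapy.layers.inet import TCP, UDP

    PYRO5_PORTS = {50000, 50001}  # example values; the test reads the real ports from the clients

    def is_relevant(p) -> bool:
        # Keep only TCP/UDP packets that are not part of the Pyro5 control channel,
        # mirroring the intent of the "No Pyro5, SSDP, ARP" filter in the test.
        if not (p.haslayer(UDP) or p.haslayer(TCP)):
            return False
        return p.sport not in PYRO5_PORTS and p.dport not in PYRO5_PORTS

    packets = [p for p in rdpcap("capture.pcap") if is_relevant(p)]
    print(f"{len(packets)} packets kept after filtering")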
2 changes: 1 addition & 1 deletion nat-lab/tests/timeouts.py
@@ -12,4 +12,4 @@
TEST_NODE_STATE_FLICKERING_RELAY_TIMEOUT = 180
TEST_NODE_STATE_FLICKERING_DIRECT_TIMEOUT = 180
TEST_MESH_STATE_AFTER_DISCONNECTING_NODE_TIMEOUT = 300
TEST_BATCHING_TIMEOUT = 600
TEST_BATCHING_TIMEOUT = 1000
1 change: 1 addition & 0 deletions nat-lab/tests/utils/__init__.py
@@ -0,0 +1 @@
from .traffic import *
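With this star re-export in place, the traffic helpers should also be importable from the package root, assuming the exported names match those imported in test_batching.py; illustrative usage only:

    # Equivalent to importing from utils.traffic directly.
    from utils import capture_traffic, render_chart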
