Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

LLT-5944: Dump a core when tcpdump is stuck #1068

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
16 changes: 11 additions & 5 deletions nat-lab/bin/kill_process_by_natlab_id
Original file line number Diff line number Diff line change
@@ -1,20 +1,26 @@
#!/usr/bin/env bash

if [[ "$#" -ne 1 ]]; then
echo "Wrong number of parameters"
if [[ "$#" -lt 1 || "$#" -gt 2 ]]; then
echo "Usage: $0 <NATLAB_ID> [--SEGV]"
exit 1
fi

NATLAB_ID=$1
SIGNAL="-TERM" # default SIGTERM

# If the second argument is '--SEGV', send SIGSEGV (signal 11) instead of SIGTERM to trigger a core dump
if [[ "$2" == "--SEGV" ]]; then
SIGNAL="-SEGV"
fi

for pid in $(ps -e -o pid=); do
if grep --null-data --text KILL_ID=${NATLAB_ID} /proc/${pid}/environ; then
cmd=$(tr -d '\000' < /proc/${pid}/cmdline || echo "N/A")
echo "$(date) Killing ${pid} ${cmd}"
kill "${pid}"
echo "$(date) Killing ${pid} ${cmd} with $SIGNAL"
kill "${SIGNAL}" "${pid}"
wait "${pid}" 2>/dev/null
exit 0
fi
done

echo "The process to kill not found"
echo "The process to kill was not found: $NATLAB_ID"
24 changes: 22 additions & 2 deletions nat-lab/tests/utils/tcpdump.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from asyncio import Event, wait_for, sleep
from config import WINDUMP_BINARY_WINDOWS
from contextlib import asynccontextmanager, AsyncExitStack
from datetime import datetime
from typing import AsyncIterator, Optional
from utils.connection import TargetOS, Connection
from utils.output_notifier import OutputNotifier
Expand Down Expand Up @@ -44,6 +45,7 @@ def __init__(
self.count = count
self.stdout = ""
self.stderr = ""
self.kill_id = "DO_NOT_KILL" + secrets.token_hex(8).upper() if session else None

self.output_notifier.notify_output("listening on", self.start_event)

Expand Down Expand Up @@ -94,7 +96,7 @@ def __init__(
# handle signals properly while `tcpdump -w file` is running; without writing
# to file, everything works fine
term_type="xterm" if self.connection.target_os == TargetOS.Mac else None,
kill_id="DO_NOT_KILL" + secrets.token_hex(8).upper() if session else None,
kill_id=self.kill_id,
)

@staticmethod
Expand Down Expand Up @@ -133,7 +135,25 @@ async def execute(self) -> None:
@asynccontextmanager
async def run(self) -> AsyncIterator["TcpDump"]:
async with self.process.run(self.on_stdout, self.on_stderr, True):
await wait_for(self.start_event.wait(), 10)
try:
await wait_for(self.start_event.wait(), 0.2)
except TimeoutError as e:
print(
datetime.now(),
"tcpdump timed out, killing it to create a coredump 🗡️",
)
if self.connection.target_os != TargetOS.Windows:
if self.kill_id:
await self.connection.create_process([
"/opt/bin/kill_process_by_natlab_id",
str(self.kill_id),
"--SEGV",
]).execute()
else:
await self.connection.create_process(
["killall", "-11", "tcpdump"]
).execute()
raise TimeoutError from e
yield self
# Windump takes so long to flush packets to stdout/file
if self.connection.target_os == TargetOS.Windows:
Expand Down
Loading