From 83c76d27c795784a79d1f6c4f6d8654790282655 Mon Sep 17 00:00:00 2001
From: Adam <103067949+AdamL-Microsoft@users.noreply.github.com>
Date: Fri, 6 Oct 2023 15:35:18 -0700
Subject: [PATCH] Revert PR #3494 "Disable `repro` and `debug` VM CLI
 commands. " (#3557)

* Revert "Revert removal of `onefuzz repro get_files...` command (#3541)"

This reverts commit acd1a365bc28a95768251acd101b595f9ddbedeb.

* Revert "Disable `repro` and `debug` VM CLI commands. (#3494)"

This reverts commit 7bcc41c67b74cd97668e169705d9af365f5c1297.
---
 src/cli/onefuzz/api.py                    | 243 +++++++++++++++++++++-
 src/cli/onefuzz/debug.py                  | 188 +++++++++++++++++
 src/integration-tests/integration-test.py |  25 ++-
 3 files changed, 445 insertions(+), 11 deletions(-)

diff --git a/src/cli/onefuzz/api.py b/src/cli/onefuzz/api.py
index fdd9bd2197..6968192642 100644
--- a/src/cli/onefuzz/api.py
+++ b/src/cli/onefuzz/api.py
@@ -9,6 +9,7 @@
 import pkgutil
 import re
 import subprocess  # nosec
+import time
 import uuid
 from enum import Enum
 from shutil import which
@@ -34,7 +35,8 @@
 
 from .__version__ import __version__
 from .azcopy import azcopy_sync
-from .backend import Backend, BackendConfig, ContainerWrapper
+from .backend import Backend, BackendConfig, ContainerWrapper, wait
+from .ssh import build_ssh_command, ssh_connect, temp_file
 
 UUID_EXPANSION = TypeVar("UUID_EXPANSION", UUID, str)
 
@@ -529,10 +531,21 @@ def _download_tasks(
 
 
 class Repro(Endpoint):
-    """Interact with repro files"""
+    """Interact with Reproduction VMs"""
 
     endpoint = "repro_vms"
 
+    def get(self, vm_id: UUID_EXPANSION) -> models.Repro:
+        """get information about a Reproduction VM"""
+        vm_id_expanded = self._disambiguate_uuid(
+            "vm_id", vm_id, lambda: [str(x.vm_id) for x in self.list()]
+        )
+
+        self.logger.debug("get repro vm: %s", vm_id_expanded)
+        return self._req_model(
+            "GET", models.Repro, data=requests.ReproGet(vm_id=vm_id_expanded)
+        )
+
     def get_files(
         self,
         report_container: primitives.Container,
@@ -540,7 +553,7 @@ def get_files(
         include_setup: bool = False,
         output_dir: primitives.Directory = primitives.Directory("."),
     ) -> None:
-        """downloads the files necessary to locally repro the crash from given report"""
+        """downloads the files necessary to locally repro the crash from a given report"""
         report_bytes = self.onefuzz.containers.files.get(report_container, report_name)
         report = json.loads(report_bytes)
 
@@ -602,6 +615,230 @@ def get_files(
             primitives.Container(setup_container), output_dir
         )
 
+    def create(
+        self, container: primitives.Container, path: str, duration: int = 24
+    ) -> models.Repro:
+        """Create a Reproduction VM from a Crash Report"""
+        self.logger.info(
+            "creating repro vm: %s %s (%d hours)", container, path, duration
+        )
+        return self._req_model(
+            "POST",
+            models.Repro,
+            data=models.ReproConfig(container=container, path=path, duration=duration),
+        )
+
+    def delete(self, vm_id: UUID_EXPANSION) -> models.Repro:
+        """Delete a Reproduction VM"""
+        vm_id_expanded = self._disambiguate_uuid(
+            "vm_id", vm_id, lambda: [str(x.vm_id) for x in self.list()]
+        )
+
+        self.logger.debug("deleting repro vm: %s", vm_id_expanded)
+        return self._req_model(
+            "DELETE", models.Repro, data=requests.ReproGet(vm_id=vm_id_expanded)
+        )
+
+    def list(self) -> List[models.Repro]:
+        """List all VMs"""
+        self.logger.debug("listing repro vms")
+        return self._req_model_list("GET", models.Repro, data=requests.ReproGet())
+
+    def _dbg_linux(
+        self, repro: models.Repro, debug_command: Optional[str]
+    ) -> Optional[str]:
+        """Launch gdb with GDB script that includes 'target remote | ssh ...'"""
+
+        if (
+            repro.auth is None
+            or repro.ip is None
+            or repro.state != enums.VmState.running
+        ):
+            raise Exception("vm setup failed: %s" % repro.state)
+
+        with build_ssh_command(
+            repro.ip, repro.auth.private_key, command="-T"
+        ) as ssh_cmd:
+            gdb_script = [
+                "target remote | %s sudo /onefuzz/bin/repro-stdout.sh"
+                % " ".join(ssh_cmd)
+            ]
+
+            if debug_command:
+                gdb_script += [debug_command, "quit"]
+
+            with temp_file("gdb.script", "\n".join(gdb_script)) as gdb_script_path:
+                dbg = ["gdb", "--silent", "--command", gdb_script_path]
+
+                if debug_command:
+                    dbg += ["--batch"]
+
+                    try:
+                        # security note: dbg is built from content coming from
+                        # the server, which is trusted in this context.
+                        return subprocess.run(  # nosec
+                            dbg, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
+                        ).stdout.decode(errors="ignore")
+                    except subprocess.CalledProcessError as err:
+                        self.logger.error(
+                            "debug failed: %s", err.output.decode(errors="ignore")
+                        )
+                        raise err
+                else:
+                    # security note: dbg is built from content coming from the
+                    # server, which is trusted in this context.
+                    subprocess.call(dbg)  # nosec
+                    return None
+
+    def _dbg_windows(
+        self,
+        repro: models.Repro,
+        debug_command: Optional[str],
+        retry_limit: Optional[int],
+    ) -> Optional[str]:
+        """Setup an SSH tunnel, then connect via CDB over SSH tunnel"""
+
+        if (
+            repro.auth is None
+            or repro.ip is None
+            or repro.state != enums.VmState.running
+        ):
+            raise Exception("vm setup failed: %s" % repro.state)
+
+        retry_count = 0
+        bind_all = which("wslpath") is not None and repro.os == enums.OS.windows
+        proxy = "*:" + REPRO_SSH_FORWARD if bind_all else REPRO_SSH_FORWARD
+        while retry_limit is None or retry_count <= retry_limit:
+            if retry_limit:
+                retry_count = retry_count + 1
+            with ssh_connect(repro.ip, repro.auth.private_key, proxy=proxy):
+                dbg = ["cdb.exe", "-remote", "tcp:port=1337,server=localhost"]
+                if debug_command:
+                    dbg_script = [debug_command, "qq"]
+                    with temp_file(
+                        "db.script", "\r\n".join(dbg_script)
+                    ) as dbg_script_path:
+                        dbg += ["-cf", _wsl_path(dbg_script_path)]
+
+                        logging.debug("launching: %s", dbg)
+                        try:
+                            # security note: dbg is built from content coming from the server,
+                            # which is trusted in this context.
+                            return subprocess.run(  # nosec
+                                dbg, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
+                            ).stdout.decode(errors="ignore")
+                        except subprocess.CalledProcessError as err:
+                            if err.returncode == 0x8007274D:
+                                self.logger.info(
+                                    "failed to connect to debug-server trying again in 10 seconds..."
+                                )
+                                time.sleep(10.0)
+                            else:
+                                self.logger.error(
+                                    "debug failed: %s",
+                                    err.output.decode(errors="ignore"),
+                                )
+                                raise err
+                else:
+                    logging.debug("launching: %s", dbg)
+                    # security note: dbg is built from content coming from the
+                    # server, which is trusted in this context.
+                    try:
+                        subprocess.check_call(dbg)  # nosec
+                        return None
+                    except subprocess.CalledProcessError as err:
+                        if err.returncode == 0x8007274D:
+                            self.logger.info(
+                                "failed to connect to debug-server trying again in 10 seconds..."
+                            )
+                            time.sleep(10.0)
+                        else:
+                            return None
+
+        if retry_limit is not None:
+            self.logger.info(
+                f"failed to connect to debug-server after {retry_limit} attempts. Please try again later "
+                + f"with onefuzz debug connect {repro.vm_id}"
+            )
+        return None
+
+    def connect(
+        self,
+        vm_id: UUID_EXPANSION,
+        delete_after_use: bool = False,
+        debug_command: Optional[str] = None,
+        retry_limit: Optional[int] = None,
+    ) -> Optional[str]:
+        """Connect to an existing Reproduction VM"""
+
+        self.logger.info("connecting to reproduction VM: %s", vm_id)
+
+        if which("ssh") is None:
+            raise Exception("unable to find ssh on local machine")
+
+        def missing_os() -> Tuple[bool, str, models.Repro]:
+            repro = self.get(vm_id)
+            return (
+                repro.os is not None,
+                "waiting for os determination",
+                repro,
+            )
+
+        repro = wait(missing_os)
+
+        if repro.os == enums.OS.windows:
+            if which("cdb.exe") is None:
+                raise Exception("unable to find cdb.exe on local machine")
+        if repro.os == enums.OS.linux:
+            if which("gdb") is None:
+                raise Exception("unable to find gdb on local machine")
+
+        def func() -> Tuple[bool, str, models.Repro]:
+            repro = self.get(vm_id)
+            state = repro.state
+            return (
+                repro.auth is not None
+                and repro.ip is not None
+                and state not in [enums.VmState.init, enums.VmState.extensions_launch],
+                "launching reproducing vm. current state: %s" % state,
+                repro,
+            )
+
+        repro = wait(func)
+        # give time for debug server to initialize
+        time.sleep(30.0)
+        result: Optional[str] = None
+        if repro.os == enums.OS.windows:
+            result = self._dbg_windows(repro, debug_command, retry_limit)
+        elif repro.os == enums.OS.linux:
+            result = self._dbg_linux(repro, debug_command)
+        else:
+            raise NotImplementedError
+
+        if delete_after_use:
+            self.logger.debug("deleting vm %s", repro.vm_id)
+            self.delete(repro.vm_id)
+
+        return result
+
+    def create_and_connect(
+        self,
+        container: primitives.Container,
+        path: str,
+        duration: int = 24,
+        delete_after_use: bool = False,
+        debug_command: Optional[str] = None,
+        retry_limit: Optional[int] = None,
+    ) -> Optional[str]:
+        """Create and connect to a Reproduction VM"""
+        repro = self.create(container, path, duration=duration)
+        return self.connect(
+            repro.vm_id,
+            delete_after_use=delete_after_use,
+            debug_command=debug_command,
+            retry_limit=retry_limit,
+        )
+
 
 class Notifications(Endpoint):
     """Interact with models.Notifications"""
diff --git a/src/cli/onefuzz/debug.py b/src/cli/onefuzz/debug.py
index 0182cb19a1..27aee5a0ee 100644
--- a/src/cli/onefuzz/debug.py
+++ b/src/cli/onefuzz/debug.py
@@ -36,6 +36,9 @@
 from onefuzz.api import UUID_EXPANSION, Command, Endpoint, Onefuzz
 
 from .azure_identity_credential_adapter import AzureIdentityCredentialAdapter
+from .backend import wait
+from .rdp import rdp_connect
+from .ssh import ssh_connect
 
 EMPTY_SHA256 = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
 ZERO_SHA256 = "0" * len(EMPTY_SHA256)
@@ -44,6 +47,143 @@
 DEFAULT_TAIL_DELAY = 10.0
 
 
+class DebugRepro(Command):
+    """Debug repro instances"""
+
+    def _disambiguate(self, vm_id: UUID_EXPANSION) -> str:
+        return str(
+            self.onefuzz.repro._disambiguate_uuid(
+                "vm_id",
+                vm_id,
+                lambda: [str(x.vm_id) for x in self.onefuzz.repro.list()],
+            )
+        )
+
+    def _info(self) -> Tuple[str, str]:
+        info = self.onefuzz.info.get()
+        return info.resource_group, info.subscription
+
+    def ssh(self, vm_id: str) -> None:
+        vm_id = self._disambiguate(vm_id)
+        repro = self.onefuzz.repro.get(vm_id)
+        if repro.ip is None:
+            raise Exception("missing IP: %s" % repro)
+        if repro.auth is None:
+            raise Exception("missing Auth: %s" % repro)
+
+        with ssh_connect(repro.ip, repro.auth.private_key, call=True):
+            pass
+
+    def rdp(self, vm_id: str) -> None:
+        vm_id = self._disambiguate(vm_id)
+        repro = self.onefuzz.repro.get(vm_id)
+        if repro.ip is None:
+            raise Exception("missing IP: %s" % repro)
+        if repro.auth is None:
+            raise Exception("missing Auth: %s" % repro)
+
+        RDP_PORT = 3389
+        with rdp_connect(repro.ip, repro.auth.password, port=RDP_PORT):
+            return
+
+
+class DebugNode(Command):
+    """Debug a specific node on a scaleset"""
+
+    def rdp(self, machine_id: UUID_EXPANSION, duration: Optional[int] = 1) -> None:
+        node = self.onefuzz.nodes.get(machine_id)
+        if node.scaleset_id is None:
+            raise Exception("node is not part of a scaleset")
+        self.onefuzz.debug.scalesets.rdp(
+            scaleset_id=node.scaleset_id, machine_id=node.machine_id, duration=duration
+        )
+
+    def ssh(self, machine_id: UUID_EXPANSION, duration: Optional[int] = 1) -> None:
+        node = self.onefuzz.nodes.get(machine_id)
+        if node.scaleset_id is None:
+            raise Exception("node is not part of a scaleset")
+        self.onefuzz.debug.scalesets.ssh(
+            scaleset_id=node.scaleset_id, machine_id=node.machine_id, duration=duration
+        )
+
+
+class DebugScaleset(Command):
+    """Debug tasks"""
+
+    def _get_proxy_setup(
+        self, scaleset_id: str, machine_id: UUID, port: int, duration: Optional[int]
+    ) -> Tuple[bool, str, Optional[Tuple[str, int]]]:
+        proxy = self.onefuzz.scaleset_proxy.create(
+            scaleset_id, machine_id, port, duration=duration
+        )
+        if proxy.ip is None:
+            return (False, "waiting on proxy ip", None)
+
+        return (True, "waiting on proxy port", (proxy.ip, proxy.forward.src_port))
+
+    def rdp(
+        self,
+        scaleset_id: str,
+        machine_id: UUID_EXPANSION,
+        duration: Optional[int] = 1,
+    ) -> None:
+        (
+            scaleset,
+            machine_id_expanded,
+        ) = self.onefuzz.scalesets._expand_scaleset_machine(
+            scaleset_id, machine_id, include_auth=True
+        )
+
+        RDP_PORT = 3389
+        setup = wait(
+            lambda: self._get_proxy_setup(
+                scaleset.scaleset_id, machine_id_expanded, RDP_PORT, duration
+            )
+        )
+        if setup is None:
+            raise Exception("no proxy for RDP port configured")
+
+        if scaleset.auth is None:
+            raise Exception("auth is not available for scaleset")
+
+        ip, port = setup
+        with rdp_connect(ip, scaleset.auth.password, port=port):
+            return
+
+    def ssh(
+        self,
+        scaleset_id: str,
+        machine_id: UUID_EXPANSION,
+        duration: Optional[int] = 1,
+        command: Optional[str] = None,
+    ) -> None:
+        (
+            scaleset,
+            machine_id_expanded,
+        ) = self.onefuzz.scalesets._expand_scaleset_machine(
+            scaleset_id, machine_id, include_auth=True
+        )
+
+        SSH_PORT = 22
+        setup = wait(
+            lambda: self._get_proxy_setup(
+                scaleset.scaleset_id, machine_id_expanded, SSH_PORT, duration
+            )
+        )
+        if setup is None:
+            raise Exception("no proxy for SSH port configured")
+
+        ip, port = setup
+
+        if scaleset.auth is None:
+            raise Exception("auth is not available for scaleset")
+
+        with ssh_connect(
+            ip, scaleset.auth.private_key, port=port, call=True, command=command
+        ):
+            return
+
+
 class DebugTask(Command):
     """Debug a specific task"""
 
@@ -70,6 +210,26 @@ def _get_node(
 
         raise Exception("unable to find scaleset node running on task")
 
+    def ssh(
+        self,
+        task_id: UUID_EXPANSION,
+        *,
+        node_id: Optional[UUID] = None,
+        duration: Optional[int] = 1,
+    ) -> None:
+        scaleset_id, node_id = self._get_node(task_id, node_id)
+        return self.onefuzz.debug.scalesets.ssh(scaleset_id, node_id, duration=duration)
+
+    def rdp(
+        self,
+        task_id: UUID_EXPANSION,
+        *,
+        node_id: Optional[UUID] = None,
+        duration: Optional[int] = 1,
+    ) -> None:
+        scaleset_id, node_id = self._get_node(task_id, node_id)
+        return self.onefuzz.debug.scalesets.rdp(scaleset_id, node_id, duration=duration)
+
     def libfuzzer_coverage(
         self,
         task_id: UUID_EXPANSION,
@@ -124,12 +284,37 @@ def _get_task(self, job_id: UUID_EXPANSION, task_type: TaskType) -> UUID:
             "unable to find task type %s for job:%s" % (task_type.name, job_id)
         )
 
+    def ssh(
+        self,
+        job_id: UUID_EXPANSION,
+        task_type: TaskType,
+        *,
+        duration: Optional[int] = 1,
+    ) -> None:
+        """SSH into the first node running the specified task type in the job"""
+        return self.onefuzz.debug.task.ssh(
+            self._get_task(job_id, task_type), duration=duration
+        )
+
+    def rdp(
+        self,
+        job_id: UUID_EXPANSION,
+        task_type: TaskType,
+        *,
+        duration: Optional[int] = 1,
+    ) -> None:
+        """RDP into the first node running the specified task type in the job"""
+        return self.onefuzz.debug.task.rdp(
+            self._get_task(job_id, task_type), duration=duration
+        )
+
 
 class DebugJob(Command):
     """Debug a specific Job"""
 
     def __init__(self, onefuzz: Any, logger: logging.Logger):
         super().__init__(onefuzz, logger)
+        self.task = DebugJobTask(onefuzz, logger)
 
     def libfuzzer_coverage(
         self,
@@ -721,7 +906,10 @@ class Debug(Command):
 
     def __init__(self, onefuzz: Any, logger: logging.Logger):
         super().__init__(onefuzz, logger)
+        self.scalesets = DebugScaleset(onefuzz, logger)
+        self.repro = DebugRepro(onefuzz, logger)
         self.job = DebugJob(onefuzz, logger)
         self.notification = DebugNotification(onefuzz, logger)
         self.task = DebugTask(onefuzz, logger)
         self.logs = DebugLog(onefuzz, logger)
+        self.node = DebugNode(onefuzz, logger)
diff --git a/src/integration-tests/integration-test.py b/src/integration-tests/integration-test.py
index 1ba572e3f3..15ffcfb9fe 100755
--- a/src/integration-tests/integration-test.py
+++ b/src/integration-tests/integration-test.py
@@ -244,7 +244,7 @@ class Integration(BaseModel):
             "--test:{extra_setup_dir}",
             "--write_test_file={extra_output_dir}/test.txt",
         ],
-        pool=PoolName("mariner"),
+        pool=PoolName("mariner")
     ),
     "windows-libfuzzer": Integration(
         template=TemplateType.libfuzzer,
@@ -401,13 +401,10 @@ def try_info_get(data: Any) -> None:
             self.of.pools.create(name, OS.linux)
             self.logger.info("creating scaleset for pool: %s", name)
             self.of.scalesets.create(
-                name,
-                pool_size,
-                region=region,
-                initial_size=pool_size,
-                image="MicrosoftCBLMariner:cbl-mariner:cbl-mariner-2-gen2:latest",
+                name, pool_size, region=region, initial_size=pool_size, image="MicrosoftCBLMariner:cbl-mariner:cbl-mariner-2-gen2:latest"
             )
 
+
 class UnmanagedPool:
     def __init__(
         self,
@@ -647,7 +644,7 @@ def launch(
             setup = Directory(os.path.join(setup, config.nested_setup_dir))
 
         job: Optional[Job] = None
-
+
         job = self.build_job(
             duration, pool, target, config, setup, target_exe, inputs
         )
@@ -1280,7 +1277,7 @@ def check_logs_for_errors(self) -> None:
 
         if seen_errors:
             raise Exception("logs included errors")
-
+
     def build_pool_name(self, os_type: str) -> PoolName:
         return PoolName(f"testpool-{os_type}-{self.test_id}")
 
@@ -1465,6 +1462,18 @@ def check_results(
             job_ids=job_ids,
         )
 
+        if skip_repro:
+            self.logger.warning("not testing crash repro")
+        else:
+            self.check_repros(
+                test_id,
+                endpoint=endpoint,
+                authority=authority,
+                client_id=client_id,
+                client_secret=client_secret,
+                job_ids=job_ids,
+            )
+
     def test_unmanaged(
         self,
         samples: Directory,
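
For reference, the restored repro flow can be driven end-to-end from the Python
SDK. The sketch below is illustrative only: it assumes a deployed OneFuzz
instance, and the report container and file names are hypothetical
placeholders, not values from this patch.

    from onefuzz.api import Onefuzz

    o = Onefuzz()

    # Create a repro VM from an existing crash report, wait for it to come up,
    # attach the platform debugger (cdb.exe for Windows targets, gdb for
    # Linux), run one command in batch mode, and delete the VM afterwards.
    output = o.repro.create_and_connect(
        "oft-reports-00000000000000000000000000000000",  # hypothetical container
        "crash-abc123.json",                             # hypothetical report path
        duration=1,             # keep the VM for one hour
        delete_after_use=True,  # tear down once the debugger exits
        debug_command="bt",     # gdb syntax; use e.g. "!analyze -v" under cdb
        retry_limit=3,          # bounded reconnects to the debug server
    )
    if output is not None:
        print(output)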
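How the Linux path attaches: _dbg_linux writes a temporary GDB script whose
"target remote" command spawns SSH as a pipe to the repro VM, where
/onefuzz/bin/repro-stdout.sh exposes the debug target on stdout. Roughly (the
exact ssh arguments come from build_ssh_command and the VM's generated
credentials, so the placeholders below are assumptions):

    # gdb.script (generated)
    target remote | ssh -T -i <private-key> <user>@<repro-vm-ip> sudo /onefuzz/bin/repro-stdout.sh
    bt      # appended only when a debug_command is supplied
    quit

    # launched as: gdb --silent --command gdb.script [--batch]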
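A note on the Windows retry check: 0x8007274D is the HRESULT encoding of the
Winsock error WSAECONNREFUSED (10061), meaning the in-VM debug server is not
accepting connections yet, which is why only that return code triggers the
sleep-and-retry loop. A one-line sanity check of the encoding:

    # the low 16 bits of the HRESULT carry the Win32/Winsock error code
    assert 0x8007274D & 0xFFFF == 10061  # WSAECONNREFUSED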