Skip to content

Commit

Permalink
Merge pull request #147 from asmacdo/bf-gpu-info-parse
Browse files Browse the repository at this point in the history
Revert back to original smon + blacken and logging
  • Loading branch information
yarikoptic authored Aug 20, 2024
2 parents 0508635 + b9cf241 commit b35abcd
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 17 deletions.
47 changes: 30 additions & 17 deletions src/con_duct/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,26 +325,39 @@ def get_system_info(self) -> None:
uid=uid, memory_total=memory_total, cpu_total=cpu_total
)
# GPU information
if shutil.which("nvidia-smi"):
if shutil.which("nvidia-smi") is not None:
lgr.debug("Checking NVIDIA GPU using nvidia-smi")
try:
gpu_info = (
subprocess.check_output(
[
"nvidia-smi",
"--query-gpu=index,name,pci.bus_id,driver_version,memory.total,compute_mode",
"--format=csv",
],
text=True,
)
.strip()
.split("\n")[1:]
out = subprocess.check_output(
[
"nvidia-smi",
"--query-gpu=index,name,pci.bus_id,driver_version,memory.total,compute_mode",
"--format=csv",
]
)
self.gpus = [
dict(zip(gpu_info[0].split(", "), gpu.split(", ")))
for gpu in gpu_info[1:]
]
except subprocess.CalledProcessError:
except subprocess.CalledProcessError as e:
lgr.warning("Error collecting gpu information: %s", str(e))
self.gpus = None
return
try:
decoded = out.decode("utf-8")
lines = decoded.strip().split("\n")
_ = lines.pop(0) # header
self.gpus = []
for line in lines:
cols = line.split(", ")
self.gpus.append(
{
"index": cols[0],
"name": cols[1],
"bus_id": cols[2],
"driver_version": cols[3],
"memory.total": cols[4],
"compute_mode": cols[5],
}
)
except Exception as e:
lgr.warning("Error parsing gpu information: %s", str(e))
self.gpus = None

def collect_sample(self) -> Optional[Sample]:
Expand Down
63 changes: 63 additions & 0 deletions test/test_report.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations
from datetime import datetime
import subprocess
from unittest import mock
import pytest
from con_duct.__main__ import (
Expand Down Expand Up @@ -160,3 +161,65 @@ def test_execution_summary_formatted(
assert "unknown" in output
# Process did not finish, we didn't set start_time, so remains nan but there
assert "wall clock time: nan" in report.execution_summary_formatted.lower()


@mock.patch("con_duct.__main__.shutil.which")
@mock.patch("con_duct.__main__.subprocess.check_output")
@mock.patch("con_duct.__main__.LogPaths")
def test_gpu_parsing_green(
    mock_log_paths: mock.MagicMock, mock_sp: mock.MagicMock, _mock_which: mock.MagicMock
) -> None:
    """Well-formed nvidia-smi CSV output yields one dict per GPU row."""
    # Fake nvidia-smi output: a header line followed by a single GPU row.
    csv_header = (
        "index, name, pci.bus_id, driver_version, memory.total [MiB], compute_mode\n"
    )
    csv_row = (
        "0, NVIDIA RTX A5500 Laptop GPU, 00000000:01:00.0, 535.183.01, 16384 MiB, Default"
    )
    # check_output is patched to hand back bytes, so encode the fixture.
    mock_sp.return_value = (csv_header + csv_row).encode("utf-8")

    expected_gpu = {
        "index": "0",
        "name": "NVIDIA RTX A5500 Laptop GPU",
        "bus_id": "00000000:01:00.0",
        "driver_version": "535.183.01",
        "memory.total": "16384 MiB",
        "compute_mode": "Default",
    }

    rep = Report("_cmd", [], mock_log_paths, EXECUTION_SUMMARY_FORMAT, clobber=False)
    rep.get_system_info()

    assert rep.gpus is not None
    assert rep.gpus == [expected_gpu]


@mock.patch("con_duct.__main__.lgr")
@mock.patch("con_duct.__main__.shutil.which")
@mock.patch("con_duct.__main__.subprocess.check_output")
@mock.patch("con_duct.__main__.LogPaths")
def test_gpu_call_error(
    mock_log_paths: mock.MagicMock,
    mock_sp: mock.MagicMock,
    _mock_which: mock.MagicMock,
    mlgr: mock.MagicMock,
) -> None:
    """A failing nvidia-smi invocation leaves ``gpus`` unset and warns once."""
    # Make the patched check_output raise as if nvidia-smi exited non-zero.
    call_failure = subprocess.CalledProcessError(1, "errrr")
    mock_sp.side_effect = call_failure

    rep = Report("_cmd", [], mock_log_paths, EXECUTION_SUMMARY_FORMAT, clobber=False)
    rep.get_system_info()

    assert rep.gpus is None
    mlgr.warning.assert_called_once()


@mock.patch("con_duct.__main__.lgr")
@mock.patch("con_duct.__main__.shutil.which")
@mock.patch("con_duct.__main__.subprocess.check_output")
@mock.patch("con_duct.__main__.LogPaths")
def test_gpu_parse_error(
    mock_log_paths: mock.MagicMock,
    mock_sp: mock.MagicMock,
    _mock_which: mock.MagicMock,
    mlgr: mock.MagicMock,
) -> None:
    """A GPU row with too few columns leaves ``gpus`` unset and warns once."""
    csv_header = (
        "index, name, pci.bus_id, driver_version, memory.total [MiB], compute_mode\n"
    )
    # Only four comma-separated values instead of the six the header names.
    short_row = "not-enough-values, 535.183.01, 16384 MiB, Default"
    mock_sp.return_value = (csv_header + short_row).encode("utf-8")

    rep = Report("_cmd", [], mock_log_paths, EXECUTION_SUMMARY_FORMAT, clobber=False)
    rep.get_system_info()

    assert rep.gpus is None
    mlgr.warning.assert_called_once()

0 comments on commit b35abcd

Please sign in to comment.