Skip to content

Commit

Permalink
Fix cloud-ci with gpu arch
Browse files Browse the repository at this point in the history
  • Loading branch information
satyaog committed Apr 2, 2024
1 parent 39fad22 commit 0f0f7fb
Show file tree
Hide file tree
Showing 8 changed files with 87 additions and 109 deletions.
18 changes: 11 additions & 7 deletions .github/workflows/cloud-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
include:
- arch: cuda
exclude: "no-cuda"
run_on: azure
run_on: azure__a100
# - arch: rocm
# exclude : "no-rocm"

Expand All @@ -34,8 +34,8 @@ jobs:
shell: bash -el {0}

env:
MILABENCH_CONFIG: "config/test.yaml"
MILABENCH_SYSTEM: "config/examples/cloud-system.yaml"
MILABENCH_CONFIG: "config/standard.yaml"
MILABENCH_SYSTEM: "config/cloud-system.yaml"
MILABENCH_BASE: "output"
MILABENCH_ARGS: ""
MILABENCH_GPU_ARCH: "${{ matrix.arch }}"
Expand All @@ -53,15 +53,18 @@ jobs:
with:
python-version: 3.9

# Follow
# https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/guides/service_principal_client_secret
# to generate a clientId as well as a clientSecret
- name: Azure login
uses: azure/login@v2
with:
creds: |
{
"clientId": "${{ secrets.ARM_CLIENT_ID }}",
"clientSecret": "${{ secrets.ARM_CLIENT_SECRET }}",
"subscriptionId": "${{ secrets.ARM_SUBSCRIPTION_ID }}",
"tenantId": "${{ secrets.ARM_TENANT_ID }}",
"clientId": "${{ secrets.ARM_CLIENT_ID }}"
"tenantId": "${{ secrets.ARM_TENANT_ID }}"
}
- name: dependencies
Expand Down Expand Up @@ -108,7 +111,7 @@ jobs:
- name: install benchmarks
run: |
poetry run milabench install
poetry run milabench install --variant ${{ matrix.arch }}
- name: prepare benchmarks
run: |
Expand All @@ -134,4 +137,5 @@ jobs:
fi
poetry run milabench cloud \
--teardown \
--run-on ${{ matrix.run_on }}
--run-on ${{ matrix.run_on }} \
--all
18 changes: 18 additions & 0 deletions config/cloud-system.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
system:
# Nodes list
nodes:
# Alias used to reference the node
- name: manager
# Use 1.1.1.1 as an ip placeholder
ip: 1.1.1.1
# Use this node as the master node or not
main: true
# User to use in remote milabench operations
user: user

# Cloud instances profiles
cloud_profiles:
azure__a100:
username: ubuntu
size: Standard_NC24ads_A100_v4
location: eastus2
7 changes: 7 additions & 0 deletions config/examples/cloud-system.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,18 @@ system:

# Cloud instances profiles
cloud_profiles:
# The cloud platform to use in the form of {PLATFORM}__{PROFILE_NAME}
azure:
# covalent-azure-plugin args
username: ubuntu
size: Standard_B1s
location: eastus2
azure__free:
username: ubuntu
size: Standard_B2ats_v2
location: eastus2
ec2:
# covalent-ec2-plugin args
username: ubuntu
instance_type: t2.micro
volume_size: 8
Expand Down
4 changes: 2 additions & 2 deletions config/test.yaml → config/examples/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@ test:
inherits: _defaults
group: test_remote
install_group: test_remote
definition: ../benchmarks/_template
definition: ../../benchmarks/_template
plan:
method: njobs
n: 1

testing:
inherits: _defaults
definition: ../benchmarks/_template
definition: ../../benchmarks/_template
group: test_remote_2
install_group: test_remote_2
plan:
Expand Down
53 changes: 33 additions & 20 deletions milabench/cli/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import sys

from coleo import Option, tooled
from omegaconf import OmegaConf
import yaml

from ..common import get_multipack
Expand All @@ -15,7 +16,16 @@
_ACTIONS = (_SETUP, _TEARDOWN, _LIST)


def manage_cloud(pack, packs, run_on, action="setup"):
def _flatten_cli_args(**kwargs):
return sum(
(
(f"--{k.replace('_', '-')}", *([v] if v else []))
for k, v in kwargs.items()
), ()
)


def manage_cloud(pack, run_on, action="setup"):
assert run_on in pack.config["system"]["cloud_profiles"]

key_map = {
Expand All @@ -28,11 +38,6 @@ def manage_cloud(pack, packs, run_on, action="setup"):

nodes = iter(enumerate(pack.config["system"]["nodes"]))

state_prefix = []
for p in packs.values():
state_prefix.append(p.config["name"])
state_prefix.append(p.config["install_variant"])

while True:
try:
i, n = next(nodes)
Expand All @@ -41,8 +46,10 @@ def manage_cloud(pack, packs, run_on, action="setup"):
except StopIteration:
break

plan_params["state_prefix"] = plan_params.get("state_prefix", None) or "-".join([str(i), *state_prefix])
plan_params["state_prefix"] = plan_params.get("state_prefix", None) or "-".join([str(i), run_on])
plan_params["state_id"] = plan_params.get("state_id", None) or pack.config["hash"]
if i > 0:
plan_params["reuse_resource_group"] = None

import milabench.cli.covalent as cv

Expand All @@ -59,16 +66,9 @@ def manage_cloud(pack, packs, run_on, action="setup"):
cmd = [
sys.executable,
"-m", cv.__name__,
run_on,
run_on.split("__")[0],
f"--{action}",
*list(
sum(
(
(f"--{k.replace('_', '-')}", v)
for k, v in plan_params.items()
), ()
)
)
*_flatten_cli_args(**plan_params)
]
p = subprocess.Popen(
cmd,
Expand Down Expand Up @@ -121,7 +121,8 @@ def _setup():

mp = get_multipack()
setup_pack = mp.setup_pack()
system_config = manage_cloud(setup_pack, mp.packs, run_on, action=_SETUP)
system_config = manage_cloud(setup_pack, run_on, action=_SETUP)
del system_config["arch"]

print(f"# hash::>{setup_pack.config['hash']}")
print(yaml.dump({"system": system_config}))
Expand All @@ -131,12 +132,24 @@ def _setup():
def _teardown():
"""Teardown a cloud infrastructure"""

# Setup cloud on target infra
# Teardown cloud instance on target infra
run_on: Option & str

mp = get_multipack()
# Teardown all cloud instances
all: Option & bool = False

overrides = {}
if all:
overrides = {
"*": OmegaConf.to_object(OmegaConf.from_dotlist([
f"system.cloud_profiles.{run_on}.state_prefix='*'",
f"system.cloud_profiles.{run_on}.state_id='*'",
]))
}

mp = get_multipack(overrides=overrides)
setup_pack = mp.setup_pack()
manage_cloud(setup_pack, mp.packs, run_on, action=_TEARDOWN)
manage_cloud(setup_pack, run_on, action=_TEARDOWN)


@tooled
Expand Down
59 changes: 6 additions & 53 deletions milabench/cli/covalent/__main__.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,12 @@
import argparse
import asyncio
import json
import os
import pathlib
import subprocess
import sys
import tempfile


def _load_venv(venv:pathlib.Path) -> dict:
activate = venv / "bin/activate"
if not activate.exists():
raise FileNotFoundError(str(activate))
env = subprocess.run(
f". '{activate}' && python3 -c 'import os ; import json ; print(json.dumps(dict(os.environ)))'",
shell=True,
capture_output=True
).stdout
return json.loads(env)


def serve(*argv):
return subprocess.run([
"covalent",
Expand Down Expand Up @@ -119,29 +106,16 @@ def lattice(argv=(), deps_bash = None):
deps_bash = None

if not argv and args.setup:
conda_prefix = "eval \"$(conda shell.bash hook)\""
conda_activate = "conda activate milabench"
deps_bash = []
for _cmd in (
f"{conda_activate} || conda create -n milabench -y",
f"{conda_activate}"
f" && conda install python={sys.version_info.major}.{sys.version_info.minor} virtualenv pip -y"
f" || >&2 echo First attempt to install python in milabench env failed",
f"{conda_activate}"
f" && conda install python={sys.version_info.major}.{sys.version_info.minor} virtualenv pip -y"
f" || conda remove -n milabench --all -y",
):
deps_bash.append(f"{conda_prefix} && ({_cmd})")
deps_bash = ct.DepsBash(deps_bash)
argv = ["conda", "env", "list"]
deps_bash = ct.DepsBash([])
# Make sure pip is installed
argv = ["python3", "-m", "pip", "freeze"]

if argv:
dispatch_id = ct.dispatch(lattice, disable_run=False)(argv, deps_bash=deps_bash)
result = ct.get_result(dispatch_id=dispatch_id, wait=True)
return_code, stdout, _ = result.result if result.result is not None else (1, "", "")

if return_code == 0 and args.setup:
assert any([l for l in stdout.split("\n") if l.startswith("milabench ")])
_executor:ct.executor.BaseExecutor = executor_cls(
**{
**_get_executor_kwargs(args),
Expand All @@ -154,7 +128,6 @@ def lattice(argv=(), deps_bash = None):
print(f"hostname::>{_executor.hostname}")
print(f"username::>{_executor.username}")
print(f"ssh_key_file::>{_executor.ssh_key_file}")
print(f"env::>{_executor.env}")
finally:
result = ct.get_result(dispatch_id=dispatch_id, wait=False) if dispatch_id else None
results_dir = result.results_dir if result else ""
Expand Down Expand Up @@ -185,29 +158,9 @@ def main(argv=None):
try:
import covalent as ct
except (KeyError, ImportError):
module = pathlib.Path(__file__).resolve().parent
cache_dir = pathlib.Path(f"/tmp/milabench/{module.name}_venv")
python3 = str(cache_dir / "bin/python3")
check_module = "import covalent"
try:
subprocess.run([python3, "-c", check_module], check=True)
except (FileNotFoundError, subprocess.CalledProcessError):
cache_dir.mkdir(parents=True, exist_ok=True)
subprocess.run([sys.executable, "-m", "virtualenv", str(cache_dir)], check=True)
subprocess.run([python3, "-m", "pip", "install", "-U", "pip"], check=True)
subprocess.run([
python3,
"-m",
"pip",
"install",
"-r",
str(module / "requirements.txt")
], stdout=sys.stderr, check=True)
subprocess.run([python3, "-c", check_module], check=True)
return subprocess.call(
[python3, __file__, *argv],
env=_load_venv(cache_dir)
)
from ..utils import run_in_module_venv
check_if_module = "import covalent"
run_in_module_venv(__file__, check_if_module, argv)

parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers()
Expand Down
8 changes: 6 additions & 2 deletions milabench/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,10 @@ def resolve_addresses(nodes):
is_local = (
("127.0.0.1" in ipaddrlist)
or (hostname in ("localhost", socket.gethostname()))
# Temporary workaround until Azure networking allows associating the
# local hostname (`hostname.split(".")[0]`) with the public FQDN
# (`hostname.split(".")[0].*.cloudapp.azure.com`)
or (hostname.split(".")[0] == socket.gethostname())
or len(ip_list.intersection(ipaddrlist)) > 0
)
node["local"] = is_local
Expand Down Expand Up @@ -227,9 +231,9 @@ def build_system_config(config_file, defaults=None, gpu=True):
config = yaml.safe_load(cf)

if defaults:
config = merge(defaults, config)
config["system"] = merge(defaults["system"], config["system"])

system = config.get("system", {})
system = config["system"]

# capacity is only required if batch resizer is enabled
if (gpu or is_autoscale_enabled()) and not "gpu" not in system:
Expand Down
Loading

0 comments on commit 0f0f7fb

Please sign in to comment.