Skip to content

Commit

Permalink
Validate min_sku_spec for models (#2078)
Browse files Browse the repository at this point in the history
* Validate min_sku_spec for models

* fix flakes

* Update

* Compare int vals

* add list of skus that failed valdn

* update logging

* update changelog and azureml-assets version

* update version
  • Loading branch information
novaturient95 authored Jan 19, 2024
1 parent 0930cf0 commit 0851f24
Show file tree
Hide file tree
Showing 6 changed files with 223 additions and 5 deletions.
18 changes: 16 additions & 2 deletions .github/workflows/assets-validation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,15 @@ defaults:
run:
shell: bash

permissions:
# Required for OIDC login to Azure
id-token: write

jobs:
validate:
name: Validate
runs-on: ubuntu-latest
environment: Testing

steps:
- name: Clone branch
Expand All @@ -47,12 +52,21 @@ jobs:
uses: actions/setup-python@v4
with:
python-version: '>=3.8'


- name: Log in to Azure
uses: azure/login@v1
with:
client-id: ${{ secrets.AZURE_CLIENT_ID }}
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}

- name: Install dependencies
run: pip install -e $scripts_azureml_assets_dir

- name: Validate assets
run: python -u $scripts_assets_dir/validate_assets.py -i "${{ github.event.inputs.asset_dirs || env.default_asset_dirs }}" -a $asset_config_filename -c "${{ steps.changed-files.outputs.all_modified_files }}" -n -I -C -b -t -e
env:
SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }}

- name: Validate source tree
run: python -u $scripts_assets_dir/validate_tree.py -i "${{ github.event.inputs.asset_dirs || env.default_asset_dirs }}"
Expand Down
4 changes: 4 additions & 0 deletions scripts/azureml-assets/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@

### 🐛 Bugs Fixed

## 1.16.31 (2024-01-19)
### 🚀 New Features
- [#2078](https://github.com/Azure/azureml-assets/pull/2078) Validate model min SKU spec

## 1.16.30 (2024-01-18)
### 🚀 New Features
- [#2141](https://github.com/Azure/azureml-assets/pull/2141) Support archiving models
Expand Down
80 changes: 80 additions & 0 deletions scripts/azureml-assets/azureml/assets/util/sku_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""SKU utils."""

import json
import requests
from azureml.assets.util import logger
from azureml.assets.util.util import retry
from azure.identity import AzureCliCredential


# Module-level cache of SKU details keyed by SKU name; stays None until the first successful fetch.
all_sku_details = None

# URL template for the vmSizes listing; the two "{}" placeholders are filled,
# in order, with the subscription ID and the location.
SKU_DETAILS_URI = (
    "https://management.azure.com"
    "/subscriptions/{}"
    "/providers/Microsoft.MachineLearningServices"
    "/locations/{}"
    "/vmSizes"
    "?api-version=2021-01-01&expandChildren=true"
)


@retry(3)
def get_all_sku_details(credential: AzureCliCredential, subscription_id: str, location: str = "eastus"):
    """Return details of all SKUs, keyed by SKU name.

    The listing is fetched once and stored in the module-level ``all_sku_details``
    cache. Once the cache is populated, later calls return it as-is, even if a
    different ``subscription_id`` or ``location`` is passed.

    Example of the returned mapping:
        {
            "Standard_A1_v2": {
                "name": "Standard_A1_v2",
                "family": "standardAv2Family",
                "vCPUs": 1,
                "gpus": 0,
                "osVhdSizeMB": 1047552,
                "maxResourceVolumeMB": 10240,
                "memoryGB": 2.0,
                ....
            },
            ....
        }

    Args:
        credential (AzureCliCredential): Credential to generate token for the request
        subscription_id (str): Subscription ID to check details in
        location (str): location to query SKU details for. Default is set to eastus

    Returns:
        dict: SKU details keyed by SKU name, built from the "amlCompute" list of the response.

    Raises:
        Exception: If the request does not return HTTP 200.
    """
    global all_sku_details
    if all_sku_details is None:
        request_timeout_seconds = 60
        vm_sizes_url = SKU_DETAILS_URI.format(subscription_id, location)
        token = credential.get_token("https://management.azure.com/.default")
        headers = {"Authorization": f"Bearer {token.token}"}
        # Without a timeout a stalled connection would block forever and never
        # reach the retry logic; with one it raises and can be retried.
        response = requests.get(vm_sizes_url, headers=headers, timeout=request_timeout_seconds)
        if response.status_code != 200:
            raise Exception(
                f"Unsuccessful request. Status code: {response.status_code}. Response: {response.text}"
            )
        sku_details_list = json.loads(response.content).get("amlCompute", [])
        all_sku_details = {sku_details["name"]: sku_details for sku_details in sku_details_list}

    return all_sku_details


def get_sku_details(credential: AzureCliCredential, SKU: str, subscription_id: str, location: str = "eastus"):
    """Look up the details of a single SKU.

    Populates the module-level ``all_sku_details`` cache through
    ``get_all_sku_details`` when it is still empty, then reads the SKU from it.

    Args:
        credential (AzureCliCredential): Credential to generate token for the request
        SKU (str): SKU to fetch detail of
        subscription_id (str): Subscription ID to check details in
        location (str): location to query SKU details for. Default is set to eastus

    Returns:
        The details entry stored for ``SKU``, or None when the SKU is not in the cache.
    """
    global all_sku_details
    cache_is_empty = all_sku_details is None
    if cache_is_empty:
        logger.print(f"Fetching all sku details for subscription: {subscription_id} and location: {location}")
        all_sku_details = get_all_sku_details(credential, subscription_id, location)

    return all_sku_details.get(SKU)
30 changes: 30 additions & 0 deletions scripts/azureml-assets/azureml/assets/util/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,3 +433,33 @@ def dump_yaml(yaml_dict: dict, file_path: str):
"""
with open(file_path, "w") as f:
yaml_dict = YAML().dump(yaml_dict, f)


def retry(times):
    """Retry Decorator.

    Calls the wrapped function until it returns without raising, making at most
    ``times`` attempts. Every failed attempt is logged as a warning; the
    exception raised by the final attempt is re-raised to the caller.

    Args:
        times (int): Maximum number of attempts for the wrapped function/method.
            Values below 1 are treated as 1 so the function is always called.

    Returns:
        Callable: Decorator that wraps a function with the retry behaviour.
    """
    # Local import keeps this block self-contained regardless of the module's import list.
    from functools import wraps

    max_attempts = max(1, times)

    def decorator(func):
        @wraps(func)
        def newfn(*args, **kwargs):
            for attempt in range(1, max_attempts + 1):
                try:
                    return func(*args, **kwargs)
                except Exception:
                    # Log the attempt that actually failed (1-based).
                    logger.log_warning(
                        "Exception thrown when attempting to run {}, attempt {} of {}".format(
                            func.__name__, attempt, max_attempts
                        )
                    )
                    if attempt == max_attempts:
                        logger.log_warning(
                            "Retried {} times when calling {}, now giving up!".format(max_attempts, func.__name__)
                        )
                        # Never swallow the final failure: surface it to the caller.
                        raise

        return newfn

    return decorator
94 changes: 92 additions & 2 deletions scripts/azureml-assets/azureml/assets/validate_assets.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

"""Validate assets."""

import os
import argparse
import json
import re
Expand All @@ -15,12 +16,14 @@
from azure.ai.ml import load_model
from azure.ai.ml.entities import Model
from azure.ai.ml.operations._run_history_constants import JobStatus
from azure.identity import AzureCliCredential

import azureml.assets as assets
import azureml.assets.util as util
from azureml.assets import PublishLocation, PublishVisibility
from azureml.assets.config import ValidationException
from azureml.assets.util import logger
from azureml.assets.util.sku_utils import get_all_sku_details

ERROR_TEMPLATE = "Validation of {file} failed: {error}"
WARNING_TEMPLATE = "Warning during validation of {file}: {warning}"
Expand All @@ -43,6 +46,15 @@
SUPPORTED_INFERENCE_SKU_FILE_NAME = "config/supported_inference_skus.json"
SUPPORTED_INFERENCE_SKU_FILE_PATH = Path(__file__).parent / SUPPORTED_INFERENCE_SKU_FILE_NAME

# Credential initialization.
# A credential might not always be available, so creation is attempted inside
# try/except; on failure a warning is logged and `credential` is left as None.
try:
    credential = AzureCliCredential()
except Exception as credential_error:
    logger.log_warning(f"exception in creating credential. {credential_error}")
    credential = None


class MLFlowModelProperties:
"""Commonly defined model properties."""
Expand Down Expand Up @@ -689,8 +701,6 @@ def validate_model_scenario(
recommended_skus = model.properties.get(recommended_skus_prop_name, "").strip()
compute_allowlists = set(model.tags.get(compute_allowlist_tags_name, []))

# TODO: add min_sku validation than just its existence

if not min_sku:
_log_error(asset_file_name_with_path, f"{min_sku_prop_name} is missing in model properties")
error_count += 1
Expand All @@ -715,9 +725,89 @@ def validate_model_scenario(
)
error_count += 1

# confirm min_sku_spec with list of supported computes
error_count += confirm_min_sku_spec(asset_file_name_with_path, min_sku_prop_name, compute_allowlists, min_sku)

return error_count


def _sku_spec_from_details(sku_details: dict) -> tuple:
    """Return (vCPUs, GPUs, memory in GB, disk in GB) read from one SKU details entry."""
    return (
        sku_details["vCPUs"],
        sku_details["gpus"],
        int(sku_details["memoryGB"]),
        int(sku_details["maxResourceVolumeMB"] / 1024),
    )


def confirm_min_sku_spec(
    asset_file_name_with_path: Path,
    min_sku_prop_name: str,
    supported_skus: set,
    min_sku_spec: str
):
    """Check that a scenario's min SKU spec matches its supported SKUs.

    The expected spec is the per-field minimum (vCPUs, GPUs, memory GB, disk GB)
    over all supported SKUs. The check is skipped (returns 0) when no credential
    or no SUBSCRIPTION_ID environment variable is available.

    Args:
        asset_file_name_with_path (Path): file path to model asset
        min_sku_prop_name (str): min sku property name for the scenario
        supported_skus (set): supported SKUs for the scenario
        min_sku_spec (str): Scenario min SKU spec, formatted as "cpus|gpus|memory|disk"

    Returns:
        int: Number of errors (0 or 1).
    """
    subscription_id = os.getenv("SUBSCRIPTION_ID", None)
    if not (credential and subscription_id):
        logger.log_warning("credential or subscription_id missing. Skipping min sku valdn")
        return 0

    # Parse the declared spec on its own so a malformed value is reported as
    # such instead of as a SKU-fetching failure.
    try:
        declared_spec = tuple(int(item) for item in min_sku_spec.split("|"))
        if len(declared_spec) != 4:
            raise ValueError(f"expected 4 '|' separated values, found {len(declared_spec)}")
    except ValueError as e:
        _log_error(
            asset_file_name_with_path,
            f"Invalid {min_sku_prop_name} '{min_sku_spec}', expected 'cpus|gpus|memory|disk' => {e}"
        )
        return 1

    # With nothing to compare against there is no meaningful minimum.
    if not supported_skus:
        _log_error(
            asset_file_name_with_path,
            f"for {min_sku_prop_name} => no supported SKUs available to validate against"
        )
        return 1

    try:
        all_sku_details = get_all_sku_details(credential, subscription_id)
        sku_specs = {}
        for sku in supported_skus:
            sku_details = all_sku_details.get(sku)
            if not sku_details:
                raise Exception(
                    f"Caught exception while checking {min_sku_prop_name}."
                    f" Either invalid sku {sku} or issue with fetching sku details"
                )
            sku_specs[sku] = _sku_spec_from_details(sku_details)
    except Exception as e:
        _log_error(asset_file_name_with_path, f"Exception in fetching SKU details => {e}")
        return 1

    # Field-wise minimum across all supported SKUs; min() keeps genuine zero
    # values (e.g. 0 GPUs, or memory that truncates to 0 GB).
    expected_spec = tuple(min(field_values) for field_values in zip(*sku_specs.values()))
    if declared_spec == expected_spec:
        return 0

    ncpus, ngpus, mem, disk = declared_spec
    min_ncpus, min_ngpus, min_cpu_mem, min_disk = expected_spec
    _log_error(
        asset_file_name_with_path,
        f"for {min_sku_prop_name} => "
        f"{ncpus}|{ngpus}|{mem}|{disk} != {min_ncpus}|{min_ngpus}|{min_cpu_mem}|{min_disk}"
    )

    # Supported SKUs with at least one field smaller than the declared min spec.
    skus_failing_valdn = [
        f"{sku}: {'|'.join(str(value) for value in spec)}"
        for sku, spec in sku_specs.items()
        if any(actual < declared for actual, declared in zip(spec, declared_spec))
    ]
    _log_error(
        asset_file_name_with_path,
        f"for {min_sku_prop_name} => "
        f"SKUs having smaller spec: {skus_failing_valdn}"
    )

    return 1


def validate_model_spec(asset_config: assets.AssetConfig) -> int:
"""Validate model spec.
Expand Down
2 changes: 1 addition & 1 deletion scripts/azureml-assets/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

setup(
name="azureml-assets",
version="1.16.30",
version="1.16.31",
description="Utilities for publishing assets to Azure Machine Learning system registries.",
author="Microsoft Corp",
packages=find_packages(),
Expand Down

0 comments on commit 0851f24

Please sign in to comment.