Fix DBCopilot environment Vulnerabilities #3217
10 fail, 1 skipped, 271 pass in 13h 55m 5s
282 tests from 27 suites in 27 files: 271 ✅ passed, 1 💤 skipped, 10 ❌ failed (⏱️ 13h 55m 5s)
Results for commit bb4277c.
Annotations
test_responsibleai_text (environment/responsibleai-text.tests.responsibleai_text_sample_test) failed
pytest-reports/environment/responsibleai-text.xml [took 30m 23s]
Raw output
AssertionError: assert 'Running' == <JobStatus.CO...: 'Completed'>
- Completed
+ Running
def test_responsibleai_text():
"""Tests a sample job using responsibleai text image as the environment."""
this_dir = Path(__file__).parent
subscription_id = os.environ.get("subscription_id")
resource_group = os.environ.get("resource_group")
workspace_name = os.environ.get("workspace")
ml_client = MLClient(
AzureCliCredential(), subscription_id, resource_group, workspace_name
)
env_name = "responsibleai-text"
env_docker_context = Environment(
build=BuildContext(path=this_dir / BUILD_CONTEXT),
name=env_name,
description="ResponsibleAI Text environment created from a Docker context.",
)
ml_client.environments.create_or_update(env_docker_context)
# create the command
job = command(
code=this_dir / JOB_SOURCE_CODE, # local path where the code is stored
command="python main.py",
environment=f"{env_name}@latest",
compute=os.environ.get("cpu_cluster"),
display_name="responsibleai-text-example",
description="A test run of the responsibleai text curated environment",
experiment_name="responsibleaiTextExperiment"
)
returned_job = ml_client.create_or_update(job)
assert returned_job is not None
# Poll until final status is reached, or timed out
timeout = time.time() + (TIMEOUT_MINUTES * 60)
while time.time() <= timeout:
current_status = ml_client.jobs.get(returned_job.name).status
if current_status in [JobStatus.COMPLETED, JobStatus.FAILED]:
break
time.sleep(30) # sleep 30 seconds
if current_status == JobStatus.FAILED:
ml_client.jobs.download(returned_job.name)
if STD_LOG.exists():
print(f"*** BEGIN {STD_LOG} ***")
with open(STD_LOG, "r") as f:
print(f.read(), end="")
print(f"*** END {STD_LOG} ***")
else:
ml_client.jobs.stream(returned_job.name)
> assert current_status == JobStatus.COMPLETED
E AssertionError: assert 'Running' == <JobStatus.CO...: 'Completed'>
E - Completed
E + Running
tests/responsibleai_text_sample_test.py:73: AssertionError
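Editor's note: the assertion fires while the job is still Running, which means the polling loop exhausted TIMEOUT_MINUTES without ever seeing a terminal state; the resulting 'Running' == 'Completed' diff is a confusing way to report a timeout. A minimal sketch of a clearer failure mode, reusing the while/else pattern the tensorflow sample tests further down already use (TIMEOUT_MINUTES, ml_client and returned_job are the names from the test above):

    timeout = time.time() + (TIMEOUT_MINUTES * 60)
    while time.time() <= timeout:
        current_status = ml_client.jobs.get(returned_job.name).status
        if current_status in [JobStatus.COMPLETED, JobStatus.FAILED]:
            break
        time.sleep(30)  # sleep 30 seconds
    else:
        # Loop ran out of time without reaching a terminal state: cancel the job
        # and fail with an explicit timeout message instead of asserting on 'Running'.
        ml_client.jobs.cancel(returned_job.name)
        raise Exception(f"Test aborted because the job took longer than {TIMEOUT_MINUTES} minutes. "
                        f"Last status was {current_status}.")
    assert current_status == JobStatus.COMPLETED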
test_tensorflow_2_12 (environment/tensorflow-2.12-cuda11.tests.tensorflow2_12_sample_test) failed
pytest-reports/environment/tensorflow-2.12-cuda11.xml [took 10m 2s]
Raw output
azure.ai.ml._ml_exceptions.JobException: Exception :
{
"error": {
"code": "UserError",
"message": "Image build failed. For more details, check log file azureml-logs/20_image_build_log.txt.",
"message_format": "Image build failed. For more details, check log file {ArtifactPath}.",
"message_parameters": {
"ArtifactPath": "azureml-logs/20_image_build_log.txt"
},
"details": [],
"inner_error": {
"code": "BadArgument",
"inner_error": {
"code": "ImageBuildFailure"
}
}
},
"correlation": {
"operation": "a7e12e719bee29b04d1eb79f75fdbf20",
"request": "03587c198c905a4f"
},
"environment": "eastus",
"location": "eastus",
"time": "2024-08-12T03:28:58.488234Z",
"component_name": "RunHistory"
}
def test_tensorflow_2_12():
"""Tests a sample job using tensorflow 2.12 as the environment."""
this_dir = Path(__file__).parent
subscription_id = os.environ.get("subscription_id")
resource_group = os.environ.get("resource_group")
workspace_name = os.environ.get("workspace")
ml_client = MLClient(
AzureCliCredential(), subscription_id, resource_group, workspace_name
)
env_name = "tensorflow-2_12-cuda11"
env_docker_context = Environment(
build=BuildContext(path=this_dir / BUILD_CONTEXT),
name=env_name,
description="Tensorflow 2.12 environment created from a Docker context.",
)
ml_client.environments.create_or_update(env_docker_context)
# create the command
job = command(
code=this_dir / JOB_SOURCE_CODE, # local path where the code is stored
command="python main.py",
environment=f"{env_name}@latest",
compute=os.environ.get("gpu_v100_cluster"),
display_name="tensorflow-mnist-example",
description="A test run of the tensorflow 2_12 curated environment",
experiment_name="tensorflow212Experiment"
)
returned_job = ml_client.create_or_update(job)
assert returned_job is not None
# Poll until final status is reached or timed out
timeout = time.time() + (TIMEOUT_MINUTES * 60)
while time.time() <= timeout:
job = ml_client.jobs.get(returned_job.name)
status = job.status
if status in [JobStatus.COMPLETED, JobStatus.FAILED]:
break
time.sleep(30) # sleep 30 seconds
else:
# Timeout
ml_client.jobs.cancel(returned_job.name)
raise Exception(f"Test aborted because the job took longer than {TIMEOUT_MINUTES} minutes. "
f"Last status was {status}.")
if status == JobStatus.FAILED:
ml_client.jobs.download(returned_job.name)
if STD_LOG.exists():
print(f"*** BEGIN {STD_LOG} ***")
with open(STD_LOG, "r") as f:
print(f.read(), end="")
print(f"*** END {STD_LOG} ***")
else:
> ml_client.jobs.stream(returned_job.name)
tests/tensorflow2_12_sample_test.py:76:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/usr/share/miniconda/envs/isolated_1723432750667/lib/python3.12/site-packages/azure/ai/ml/_telemetry/activity.py:169: in wrapper
return f(*args, **kwargs)
/usr/share/miniconda/envs/isolated_1723432750667/lib/python3.12/site-packages/azure/ai/ml/operations/_job_operations.py:490: in stream
self._stream_logs_until_completion(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
run_operations = <azure.ai.ml.operations._run_operations.RunOperations object at 0x7efdf23c7ef0>
job_resource = <azure.ai.ml._restclient.v2022_02_01_preview.models._models_py3.JobBaseData object at 0x7efdf126a630>
datastore_operations = <azure.ai.ml.operations._datastore_operations.DatastoreOperations object at 0x7efdf239da90>
raise_exception_on_failed_job = True
def stream_logs_until_completion(
run_operations: RunOperations,
job_resource: JobBaseData,
datastore_operations: DatastoreOperations = None,
raise_exception_on_failed_job=True,
) -> None:
"""Stream the experiment run output to the specified file handle.
By default the the file handle points to stdout.
:param run_operations: The run history operations class.
:type run_operations: RunOperations
:param job_resource: The job to stream
:type job_resource: JobBaseData
:param datastore_operations: Optional, the datastore operations class, used to get logs from datastore
:type datastore_operations: Optional[DatastoreOperations]
:param raise_exception_on_failed_job: Should this method fail if job fails
:type raise_exception_on_failed_job: Boolean
:return:
:rtype: None
"""
job_type = job_resource.properties.job_type
job_name = job_resource.name
studio_endpoint = job_resource.properties.services.get("Studio", None)
studio_endpoint = studio_endpoint.endpoint if studio_endpoint else None
file_handle = sys.stdout
ds_properties = None
prefix = None
if (
hasattr(job_resource.properties, "outputs")
and job_resource.properties.job_type != RestJobType.AUTO_ML
and datastore_operations
):
# Get default output location
default_output = (
job_resource.properties.outputs.get("default", None) if job_resource.properties.outputs else None
)
is_uri_folder = default_output and default_output.job_output_type == DataType.URI_FOLDER
if is_uri_folder:
output_uri = default_output.uri
# Parse the uri format
output_uri = output_uri.split("datastores/")[1]
datastore_name, prefix = output_uri.split("/", 1)
ds_properties = get_datastore_info(datastore_operations, datastore_name)
try:
file_handle.write("RunId: {}\n".format(job_name))
file_handle.write("Web View: {}\n".format(studio_endpoint))
_current_details: RunDetails = run_operations.get_run_details(job_name)
session = create_session_with_retry()
processed_logs = {}
poll_start_time = time.time()
while (
_current_details.status in RunHistoryConstants.IN_PROGRESS_STATUSES
or _current_details.status == JobStatus.FINALIZING
):
file_handle.flush()
time.sleep(_wait_before_polling(time.time() - poll_start_time))
_current_details: RunDetails = run_operations.get_run_details(job_name) # TODO use FileWatcher
if job_type.lower() in JobType.PIPELINE:
legacy_folder_name = "/logs/azureml/"
else:
legacy_folder_name = "/azureml-logs/"
_current_logs_dict = (
list_logs_in_datastore(ds_properties, prefix=prefix, legacy_log_folder_name=legacy_folder_name)
if ds_properties is not None
else _current_details.log_files
)
# Get the list of new logs available after filtering out the processed ones
available_logs = _get_sorted_filtered_logs(_current_logs_dict, job_type, processed_logs)
content = ""
for current_log in available_logs:
content = download_text_from_url(
_current_logs_dict[current_log],
session,
timeout=RunHistoryConstants._DEFAULT_GET_CONTENT_TIMEOUT,
)
_incremental_print(content, processed_logs, current_log, file_handle)
# TODO: Temporary solution to wait for all the logs to be printed in the finalizing state.
if (
_current_details.status not in RunHistoryConstants.IN_PROGRESS_STATUSES
and _current_details.status == JobStatus.FINALIZING
and "The activity completed successfully. Finalizing run..." in content
):
break
file_handle.write("\n")
file_handle.write("Execution Summary\n")
file_handle.write("=================\n")
file_handle.write("RunId: {}\n".format(job_name))
file_handle.write("Web View: {}\n".format(studio_endpoint))
warnings = _current_details.warnings
if warnings:
messages = [x.message for x in warnings if x.message]
if len(messages) > 0:
file_handle.write("\nWarnings:\n")
for message in messages:
file_handle.write(message + "\n")
file_handle.write("\n")
if _current_details.status == JobStatus.FAILED:
error = (
_current_details.error.as_dict()
if _current_details.error
else "Detailed error not set on the Run. Please check the logs for details."
)
# If we are raising the error later on, so we don't double print.
if not raise_exception_on_failed_job:
file_handle.write("\nError:\n")
file_handle.write(json.dumps(error, indent=4))
file_handle.write("\n")
else:
> raise JobException(
message="Exception : \n {} ".format(json.dumps(error, indent=4)),
target=ErrorTarget.JOB,
no_personal_data_message="Exception raised on failed job.",
)
E azure.ai.ml._ml_exceptions.JobException: Exception :
E {
E "error": {
E "code": "UserError",
E "message": "Image build failed. For more details, check log file azureml-logs/20_image_build_log.txt.",
E "message_format": "Image build failed. For more details, check log file {ArtifactPath}.",
E "message_parameters": {
E "ArtifactPath": "azureml-logs/20_image_build_log.txt"
E },
E "details": [],
E "inner_error": {
E "code": "BadArgument",
E "inner_error": {
E "code": "ImageBuildFailure"
E }
E }
E },
E "correlation": {
E "operation": "a7e12e719bee29b04d1eb79f75fdbf20",
E "request": "03587c198c905a4f"
E },
E "environment": "eastus",
E "location": "eastus",
E "time": "2024-08-12T03:28:58.488234Z",
E "component_name": "RunHistory"
E }
/usr/share/miniconda/envs/isolated_1723432750667/lib/python3.12/site-packages/azure/ai/ml/operations/_job_ops_helper.py:285: JobException
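Editor's note: this failure (and the tensorflow 2.11 one further down) is an ImageBuildFailure, so the training script never ran; the actionable detail is in the azureml-logs/20_image_build_log.txt file named in the error. A rough sketch for pulling that log locally once the stream raises, assuming the azure-ai-ml JobOperations.download keywords shown here (download_path, all) and reusing ml_client/returned_job from the test above:

    from pathlib import Path

    debug_dir = Path("image-build-debug")
    # Download everything the failed job produced, including its system logs.
    ml_client.jobs.download(returned_job.name, download_path=debug_dir, all=True)
    # Print the image build log referenced by the ImageBuildFailure error,
    # wherever it landed in the downloaded tree.
    for log in debug_dir.rglob("20_image_build_log.txt"):
        print(f"*** {log} ***")
        print(log.read_text())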
test_dataset_downloader_component[None-all-test-/home/runner/work/azureml-assets/azureml-assets/assets/aml-benchmark/scripts/da…/math.py] (component/dataset_downloader.tests.dataset_downloader.test_dataset_downloader.TestDatasetDownloaderComponent) failed
pytest-reports/component/dataset_downloader.xml [took 1m 36s]
Raw output
ValueError: The repository for math contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/math.
Please pass the argument `trust_remote_code=True` to allow custom code to be run.
trust_remote_code = None, repo_id = 'math'
def resolve_trust_remote_code(trust_remote_code: Optional[bool], repo_id: str) -> bool:
"""
Copied and adapted from Transformers
https://github.com/huggingface/transformers/blob/2098d343cc4b4b9d2aea84b3cf1eb5a1e610deff/src/transformers/dynamic_module_utils.py#L589
"""
trust_remote_code = trust_remote_code if trust_remote_code is not None else config.HF_DATASETS_TRUST_REMOTE_CODE
if trust_remote_code is None:
if config.TIME_OUT_REMOTE_CODE > 0:
try:
signal.signal(signal.SIGALRM, _raise_timeout_error)
signal.alarm(config.TIME_OUT_REMOTE_CODE)
while trust_remote_code is None:
> answer = input(
f"The repository for {repo_id} contains custom code which must be executed to correctly "
f"load the dataset. You can inspect the repository content at https://hf.co/datasets/{repo_id}.\n"
f"You can avoid this prompt in future by passing the argument `trust_remote_code=True`.\n\n"
f"Do you wish to run the custom code? [y/N] "
)
E OSError: pytest: reading from stdin while output is captured! Consider using `-s`.
/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/site-packages/datasets/load.py:120: OSError
During handling of the above exception, another exception occurred:
self = <tests.dataset_downloader.test_dataset_downloader.TestDatasetDownloaderComponent object at 0x7f4d42cdc8d0>
temp_dir = '/tmp/pytest-of-runner/pytest-0/test_dataset_downloader_compon4'
dataset_name = None, configuration = 'all', split = 'test'
script = '/home/runner/work/azureml-assets/azureml-assets/assets/aml-benchmark/scripts/data_loaders/math.py'
@pytest.mark.parametrize(
"dataset_name, configuration, split, script",
[
("xquad", "xquad.en", "validation", None),
("xquad", "xquad.en,xquad.hi", "all", None),
("xquad", "all", "all", None),
("cifar10", "all", "test", None),
(None, "all", "test", Constants.MATH_DATASET_LOADER_SCRIPT),
],
)
def test_dataset_downloader_component(
self,
temp_dir: str,
dataset_name: Union[str, None],
configuration: Union[str, None],
split: Union[str, None],
script: Union[str, None],
) -> None:
"""Dataset Downloader component test."""
ml_client = get_mlclient()
pipeline_job = self._get_pipeline_job(
dataset_name,
configuration,
split,
script,
self.test_dataset_downloader_component.__name__,
)
# submit the pipeline job
pipeline_job = ml_client.create_or_update(
pipeline_job, experiment_name=self.EXP_NAME
)
ml_client.jobs.stream(pipeline_job.name)
print(pipeline_job)
file_count = 1
path = dataset_name if dataset_name else script
if configuration == "all":
> file_count = len(get_dataset_config_names(path))
../../tests/dataset_downloader/test_dataset_downloader.py:74:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/site-packages/datasets/inspect.py:347: in get_dataset_config_names
dataset_module = dataset_module_factory(
/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/site-packages/datasets/load.py:1814: in dataset_module_factory
).get_module()
/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/site-packages/datasets/load.py:962: in get_module
trust_remote_code = resolve_trust_remote_code(self.trust_remote_code, self.name)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
trust_remote_code = None, repo_id = 'math'
def resolve_trust_remote_code(trust_remote_code: Optional[bool], repo_id: str) -> bool:
"""
Copied and adapted from Transformers
https://github.com/huggingface/transformers/blob/2098d343cc4b4b9d2aea84b3cf1eb5a1e610deff/src/transformers/dynamic_module_utils.py#L589
"""
trust_remote_code = trust_remote_code if trust_remote_code is not None else config.HF_DATASETS_TRUST_REMOTE_CODE
if trust_remote_code is None:
if config.TIME_OUT_REMOTE_CODE > 0:
try:
signal.signal(signal.SIGALRM, _raise_timeout_error)
signal.alarm(config.TIME_OUT_REMOTE_CODE)
while trust_remote_code is None:
answer = input(
f"The repository for {repo_id} contains custom code which must be executed to correctly "
f"load the dataset. You can inspect the repository content at https://hf.co/datasets/{repo_id}.\n"
f"You can avoid this prompt in future by passing the argument `trust_remote_code=True`.\n\n"
f"Do you wish to run the custom code? [y/N] "
)
if answer.lower() in ["yes", "y", "1"]:
trust_remote_code = True
elif answer.lower() in ["no", "n", "0", ""]:
trust_remote_code = False
signal.alarm(0)
except Exception:
# OS which does not support signal.SIGALRM
> raise ValueError(
f"The repository for {repo_id} contains custom code which must be executed to correctly "
f"load the dataset. You can inspect the repository content at https://hf.co/datasets/{repo_id}.\n"
f"Please pass the argument `trust_remote_code=True` to allow custom code to be run."
)
E ValueError: The repository for math contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/math.
E Please pass the argument `trust_remote_code=True` to allow custom code to be run.
/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/site-packages/datasets/load.py:133: ValueError
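Editor's note: all four dataset_downloader failures trace back to the same change in the datasets library: dataset loading scripts are no longer executed unless trust_remote_code is granted, and under pytest the interactive [y/N] prompt cannot be answered. For the component test above, a minimal sketch of opting in where the config names are resolved; whether the pinned datasets release forwards trust_remote_code through get_dataset_config_names (recent 2.x versions do) is an assumption to confirm:

    from datasets import get_dataset_config_names, load_dataset

    # `path` is the dataset name or loader script, as in the test above.
    config_names = get_dataset_config_names(path, trust_remote_code=True)
    data = load_dataset(path, config_names[0], split="test", trust_remote_code=True)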
test_invalid_input_combination[None-train-None] (component/dataset_downloader.tests.dataset_downloader.test_dataset_downloader.TestDatasetDownloaderScript) failed
pytest-reports/component/dataset_downloader.xml [took 14s]
Raw output
ValueError: Loading this dataset requires you to execute custom code contained in the dataset repository on your local machine. Please set the option `trust_remote_code=True` to permit loading of this dataset.
self = <tests.dataset_downloader.test_dataset_downloader.TestDatasetDownloaderScript object at 0x7f4d42cf2150>
dataset_name = None, split = 'train', script = None
@pytest.mark.parametrize(
"dataset_name, split, script",
[(None, "train", None), ("dataset", "test", "script")],
)
def test_invalid_input_combination(
self,
dataset_name: Union[str, None],
split: Union[str, None],
script: Union[str, None],
):
"""Test for invalid input combination."""
if dataset_name and script:
expected_exception_mssg = "Either 'dataset_name' or 'script' must be supplied; but not both."
elif not (dataset_name or script):
expected_exception_mssg = "Either 'dataset_name' or 'script' must be supplied."
# Run the script and verify the exception
try:
> self._run_downloader_script(dataset_name, None, split, script)
../../tests/dataset_downloader/test_dataset_downloader.py:156:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
../../tests/dataset_downloader/test_dataset_downloader.py:226: in _run_downloader_script
run_command(" ".join(args))
../../tests/test_utils.py:285: in run_command
result = subprocess.run(
/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/subprocess.py:548: in run
stdout, stderr = process.communicate(input, timeout=timeout)
/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/subprocess.py:1192: in communicate
stdout = self.stdout.read()
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
signum = 14
frame = <frame at 0x7f4d3be04440, file '/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/subprocess.py', line 1192, code communicate>
def _raise_timeout_error(signum, frame):
> raise ValueError(
"Loading this dataset requires you to execute custom code contained in the dataset repository on your local "
"machine. Please set the option `trust_remote_code=True` to permit loading of this dataset."
)
E ValueError: Loading this dataset requires you to execute custom code contained in the dataset repository on your local machine. Please set the option `trust_remote_code=True` to permit loading of this dataset.
/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/site-packages/datasets/load.py:102: ValueError
test_invalid_hf_dataset[winogrande-None-validation] (component/dataset_downloader.tests.dataset_downloader.test_dataset_downloader.TestDatasetDownloaderScript) failed
pytest-reports/component/dataset_downloader.xml [took 2s]
Raw output
ValueError: The repository for winogrande contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/winogrande.
Please pass the argument `trust_remote_code=True` to allow custom code to be run.
self = <tests.dataset_downloader.test_dataset_downloader.TestDatasetDownloaderScript object at 0x7f4d42cf0510>
dataset_name = 'winogrande', configuration = None, split = 'validation'
@pytest.mark.parametrize(
"dataset_name, configuration, split",
[
("squad_v2", None, "test"),
("winogrande", None, "validation"),
("some_random_name", None, "test"),
("cifar100", None, "test")
],
)
def test_invalid_hf_dataset(
self, dataset_name: str, configuration: Optional[str], split: str
):
"""Test for unsupported url file."""
expected_exception_mssg = f"Split '{split}' not available for dataset '{dataset_name}' and config '{None}'."
if dataset_name == "winogrande" and configuration is None:
expected_exception_mssg = (
f"Multiple configurations available for dataset '{dataset_name}'. Please specify either one of "
> f"the following: {get_dataset_config_names(dataset_name)} or 'all'."
../../tests/dataset_downloader/test_dataset_downloader.py:178:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/site-packages/datasets/inspect.py:347: in get_dataset_config_names
dataset_module = dataset_module_factory(
/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/site-packages/datasets/load.py:1914: in dataset_module_factory
raise e1 from None
/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/site-packages/datasets/load.py:1887: in dataset_module_factory
).get_module()
/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/site-packages/datasets/load.py:1525: in get_module
trust_remote_code = resolve_trust_remote_code(self.trust_remote_code, self.name)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
trust_remote_code = None, repo_id = 'winogrande'
def resolve_trust_remote_code(trust_remote_code: Optional[bool], repo_id: str) -> bool:
"""
Copied and adapted from Transformers
https://github.com/huggingface/transformers/blob/2098d343cc4b4b9d2aea84b3cf1eb5a1e610deff/src/transformers/dynamic_module_utils.py#L589
"""
trust_remote_code = trust_remote_code if trust_remote_code is not None else config.HF_DATASETS_TRUST_REMOTE_CODE
if trust_remote_code is None:
if config.TIME_OUT_REMOTE_CODE > 0:
try:
signal.signal(signal.SIGALRM, _raise_timeout_error)
signal.alarm(config.TIME_OUT_REMOTE_CODE)
while trust_remote_code is None:
answer = input(
f"The repository for {repo_id} contains custom code which must be executed to correctly "
f"load the dataset. You can inspect the repository content at https://hf.co/datasets/{repo_id}.\n"
f"You can avoid this prompt in future by passing the argument `trust_remote_code=True`.\n\n"
f"Do you wish to run the custom code? [y/N] "
)
if answer.lower() in ["yes", "y", "1"]:
trust_remote_code = True
elif answer.lower() in ["no", "n", "0", ""]:
trust_remote_code = False
signal.alarm(0)
except Exception:
# OS which does not support signal.SIGALRM
> raise ValueError(
f"The repository for {repo_id} contains custom code which must be executed to correctly "
f"load the dataset. You can inspect the repository content at https://hf.co/datasets/{repo_id}.\n"
f"Please pass the argument `trust_remote_code=True` to allow custom code to be run."
)
E ValueError: The repository for winogrande contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/winogrande.
E Please pass the argument `trust_remote_code=True` to allow custom code to be run.
/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/site-packages/datasets/load.py:133: ValueError
test_invalid_hf_dataset[some_random_name-None-test] (component/dataset_downloader.tests.dataset_downloader.test_dataset_downloader.TestDatasetDownloaderScript) failed
pytest-reports/component/dataset_downloader.xml [took 14s]
Raw output
ValueError: Loading this dataset requires you to execute custom code contained in the dataset repository on your local machine. Please set the option `trust_remote_code=True` to permit loading of this dataset.
self = <tests.dataset_downloader.test_dataset_downloader.TestDatasetDownloaderScript object at 0x7f4d42cebd50>
dataset_name = 'some_random_name', configuration = None, split = 'test'
@pytest.mark.parametrize(
"dataset_name, configuration, split",
[
("squad_v2", None, "test"),
("winogrande", None, "validation"),
("some_random_name", None, "test"),
("cifar100", None, "test")
],
)
def test_invalid_hf_dataset(
self, dataset_name: str, configuration: Optional[str], split: str
):
"""Test for unsupported url file."""
expected_exception_mssg = f"Split '{split}' not available for dataset '{dataset_name}' and config '{None}'."
if dataset_name == "winogrande" and configuration is None:
expected_exception_mssg = (
f"Multiple configurations available for dataset '{dataset_name}'. Please specify either one of "
f"the following: {get_dataset_config_names(dataset_name)} or 'all'."
)
elif dataset_name == "some_random_name":
expected_exception_mssg = f"FileNotFoundError: Dataset '{dataset_name}' doesn't exist on the Hub"
elif dataset_name == "cifar100":
expected_exception_mssg = "Error saving dataset to JSONL format: "
# Run the script and verify the exception
try:
> self._run_downloader_script(dataset_name, None, split, None)
../../tests/dataset_downloader/test_dataset_downloader.py:187:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
../../tests/dataset_downloader/test_dataset_downloader.py:226: in _run_downloader_script
run_command(" ".join(args))
../../tests/test_utils.py:285: in run_command
result = subprocess.run(
/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/subprocess.py:548: in run
stdout, stderr = process.communicate(input, timeout=timeout)
/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/subprocess.py:1192: in communicate
stdout = self.stdout.read()
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
signum = 14
frame = <frame at 0x7f4d4273fd40, file '/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/subprocess.py', line 1192, code communicate>
def _raise_timeout_error(signum, frame):
> raise ValueError(
"Loading this dataset requires you to execute custom code contained in the dataset repository on your local "
"machine. Please set the option `trust_remote_code=True` to permit loading of this dataset."
)
E ValueError: Loading this dataset requires you to execute custom code contained in the dataset repository on your local machine. Please set the option `trust_remote_code=True` to permit loading of this dataset.
/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/site-packages/datasets/load.py:102: ValueError
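Editor's note: the TestDatasetDownloaderScript cases hit the same trust_remote_code prompt inside a child process started through run_command, where no keyword argument can help. The traceback shows the library also reads config.HF_DATASETS_TRUST_REMOTE_CODE, so exporting the corresponding environment variable to the child process is one non-interactive way through; treating "1" as "trust" is an assumption to verify against the pinned datasets version. A sketch, with `args` standing for the command list the downloader test already builds:

    import os
    import subprocess

    # Pre-approve dataset loading scripts for the spawned downloader script
    # instead of relying on the interactive [y/N] prompt.
    env = dict(os.environ, HF_DATASETS_TRUST_REMOTE_CODE="1")
    subprocess.run(" ".join(args), shell=True, env=env, check=True)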
test_pytorch_2_2 (environment/acpt-pytorch-2.2-cuda12.1.tests.pytorch2_2_sample_test) failed
pytest-reports/environment/acpt-pytorch-2.2-cuda12.1.xml [took 35m 55s]
Raw output
azure.ai.ml.exceptions.JobException: Exception :
{
"error": {
"code": "UserError",
"message": "Execution failed. User process 'Rank 1' exited with status code 1. Please check log file 'user_logs/std_log_process_1.txt' for error details. Error: Traceback (most recent call last):\n File \"/mnt/azureml/cr/j/81b29a2e28f14fcc9a90a2d4e67dbc62/exe/wd/pretrain_glue.py\", line 27, in <module>\n from transformers import (\n File \"<frozen importlib._bootstrap>\", line 1075, in _handle_fromlist\n File \"/opt/conda/envs/ptca/lib/python3.10/site-packages/transformers/utils/import_utils.py\", line 1272, in __getattr__\n module = self._get_module(self._class_to_module[name])\n File \"/opt/conda/envs/ptca/lib/python3.10/site-packages/transformers/utils/import_utils.py\", line 1284, in _get_module\n raise RuntimeError(\nRuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):\nFailed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):\ncannot import name 'split_torch_state_dict_into_shards' from 'huggingface_hub' (/opt/conda/envs/ptca/lib/python3.10/site-packages/huggingface_hub/__init__.py)\n",
"message_parameters": {},
"details": []
},
"time": "0001-01-01T00:00:00.000Z",
"component_name": "CommonRuntime"
}
def test_pytorch_2_2():
"""Tests a sample job using pytorch 2.0 as the environment."""
this_dir = Path(__file__).parent
subscription_id = os.environ.get("subscription_id")
resource_group = os.environ.get("resource_group")
workspace_name = os.environ.get("workspace")
ml_client = MLClient(
AzureCliCredential(), subscription_id, resource_group, workspace_name
)
env_name = "acpt-pytorch-2_2-cuda12_1"
env_docker_context = Environment(
build=BuildContext(path=this_dir / BUILD_CONTEXT),
name=env_name,
description="Pytorch 2.2 environment created from a Docker context.",
)
ml_client.environments.create_or_update(env_docker_context)
# create the command
job = command(
code=this_dir / JOB_SOURCE_CODE, # local path where the code is stored
command="pip install -r requirements.txt && pip install multiprocess==0.70.15"
" && python pretrain_glue.py --tensorboard_log_dir \"/outputs/runs/\""
" --deepspeed ds_config.json --num_train_epochs 5 --output_dir outputs --disable_tqdm 1"
" --local_rank $RANK --evaluation_strategy \"epoch\" --logging_strategy \"epoch\""
" --per_device_train_batch_size 93 --gradient_accumulation_steps 1"
" --per_device_eval_batch_size 93 --learning_rate 3e-05 --adam_beta1 0.8 --adam_beta2 0.999"
" --weight_decay 3e-07 --warmup_steps 500 --fp16 --logging_steps 1000"
" --model_checkpoint \"bert-large-uncased\"",
outputs={
"output": Output(
type="uri_folder",
mode="rw_mount",
path="azureml://datastores/workspaceblobstore/paths/outputs"
)
},
environment=f"{env_name}@latest",
compute=os.environ.get("gpu_v100_cluster"),
display_name="bert-pretrain-GLUE",
description="Pretrain the BERT model on the GLUE dataset.",
experiment_name="pytorch22_Cuda121_py310_Experiment",
distribution=PyTorchDistribution(process_count_per_instance=1),
resources=JobResourceConfiguration(instance_count=2, shm_size='3100m'),
)
returned_job = ml_client.create_or_update(job)
assert returned_job is not None
# Poll until final status is reached or timed out
timeout = time.time() + (TIMEOUT_MINUTES * 60)
while time.time() <= timeout:
current_status = ml_client.jobs.get(returned_job.name).status
if current_status in ["Completed", "Failed"]:
break
time.sleep(30) # sleep 30 seconds
bashCommand = "ls"
process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
output, error = process.communicate()
print(output)
print(error)
if current_status == "Failed" or current_status == "Cancelled":
ml_client.jobs.download(returned_job.name)
if STD_LOG.exists():
print(f"*** BEGIN {STD_LOG} ***")
with open(STD_LOG, "r") as f:
print(f.read(), end="")
print(f"*** END {STD_LOG} ***")
else:
> ml_client.jobs.stream(returned_job.name)
tests/pytorch2_2_sample_test.py:92:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/usr/share/miniconda/envs/isolated_1723432738489/lib/python3.12/site-packages/azure/core/tracing/decorator.py:94: in wrapper_use_tracer
return func(*args, **kwargs)
/usr/share/miniconda/envs/isolated_1723432738489/lib/python3.12/site-packages/azure/ai/ml/operations/_job_operations.py:617: in stream
self._stream_logs_until_completion(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
run_operations = <azure.ai.ml.operations._run_operations.RunOperations object at 0x7fbe998e0350>
job_resource = <azure.ai.ml._restclient.v2022_10_01_preview.models._models_py3.JobBase object at 0x7fbe9981f1a0>
datastore_operations = <azure.ai.ml.operations._datastore_operations.DatastoreOperations object at 0x7fbe9d0c1820>
raise_exception_on_failed_job = True
def stream_logs_until_completion(
run_operations: RunOperations,
job_resource: JobBaseData,
datastore_operations: DatastoreOperations = None,
raise_exception_on_failed_job=True,
*,
requests_pipeline: HttpPipeline
) -> None:
"""Stream the experiment run output to the specified file handle. By
default the the file handle points to stdout.
:param run_operations: The run history operations class.
:type run_operations: RunOperations
:param job_resource: The job to stream
:type job_resource: JobBaseData
:param datastore_operations: Optional, the datastore operations class, used to get logs from datastore
:type datastore_operations: Optional[DatastoreOperations]
:param raise_exception_on_failed_job: Should this method fail if job fails
:type raise_exception_on_failed_job: Boolean
:return:
:rtype: None
"""
job_type = job_resource.properties.job_type
job_name = job_resource.name
studio_endpoint = job_resource.properties.services.get("Studio", None)
studio_endpoint = studio_endpoint.endpoint if studio_endpoint else None
file_handle = sys.stdout
ds_properties = None
prefix = None
if (
hasattr(job_resource.properties, "outputs")
and job_resource.properties.job_type != RestJobType.AUTO_ML
and datastore_operations
):
# Get default output location
default_output = (
job_resource.properties.outputs.get("default", None) if job_resource.properties.outputs else None
)
is_uri_folder = default_output and default_output.job_output_type == DataType.URI_FOLDER
if is_uri_folder:
output_uri = default_output.uri
# Parse the uri format
output_uri = output_uri.split("datastores/")[1]
datastore_name, prefix = output_uri.split("/", 1)
ds_properties = get_datastore_info(datastore_operations, datastore_name)
try:
file_handle.write("RunId: {}\n".format(job_name))
file_handle.write("Web View: {}\n".format(studio_endpoint))
_current_details: RunDetails = run_operations.get_run_details(job_name)
processed_logs = {}
poll_start_time = time.time()
pipeline_with_retries = create_requests_pipeline_with_retry(requests_pipeline=requests_pipeline)
while (
_current_details.status in RunHistoryConstants.IN_PROGRESS_STATUSES
or _current_details.status == JobStatus.FINALIZING
):
file_handle.flush()
time.sleep(_wait_before_polling(time.time() - poll_start_time))
_current_details: RunDetails = run_operations.get_run_details(job_name) # TODO use FileWatcher
if job_type.lower() in JobType.PIPELINE:
legacy_folder_name = "/logs/azureml/"
else:
legacy_folder_name = "/azureml-logs/"
_current_logs_dict = (
list_logs_in_datastore(
ds_properties,
prefix=prefix,
legacy_log_folder_name=legacy_folder_name,
)
if ds_properties is not None
else _current_details.log_files
)
# Get the list of new logs available after filtering out the processed ones
available_logs = _get_sorted_filtered_logs(_current_logs_dict, job_type, processed_logs)
content = ""
for current_log in available_logs:
content = download_text_from_url(
_current_logs_dict[current_log],
pipeline_with_retries,
timeout=RunHistoryConstants._DEFAULT_GET_CONTENT_TIMEOUT,
)
_incremental_print(content, processed_logs, current_log, file_handle)
# TODO: Temporary solution to wait for all the logs to be printed in the finalizing state.
if (
_current_details.status not in RunHistoryConstants.IN_PROGRESS_STATUSES
and _current_details.status == JobStatus.FINALIZING
and "The activity completed successfully. Finalizing run..." in content
):
break
file_handle.write("\n")
file_handle.write("Execution Summary\n")
file_handle.write("=================\n")
file_handle.write("RunId: {}\n".format(job_name))
file_handle.write("Web View: {}\n".format(studio_endpoint))
warnings = _current_details.warnings
if warnings:
messages = [x.message for x in warnings if x.message]
if len(messages) > 0:
file_handle.write("\nWarnings:\n")
for message in messages:
file_handle.write(message + "\n")
file_handle.write("\n")
if _current_details.status == JobStatus.FAILED:
error = (
_current_details.error.as_dict()
if _current_details.error
else "Detailed error not set on the Run. Please check the logs for details."
)
# If we are raising the error later on, so we don't double print.
if not raise_exception_on_failed_job:
file_handle.write("\nError:\n")
file_handle.write(json.dumps(error, indent=4))
file_handle.write("\n")
else:
> raise JobException(
message="Exception : \n {} ".format(json.dumps(error, indent=4)),
target=ErrorTarget.JOB,
no_personal_data_message="Exception raised on failed job.",
error_category=ErrorCategory.SYSTEM_ERROR,
)
E azure.ai.ml.exceptions.JobException: Exception :
E {
E "error": {
E "code": "UserError",
E "message": "Execution failed. User process 'Rank 1' exited with status code 1. Please check log file 'user_logs/std_log_process_1.txt' for error details. Error: Traceback (most recent call last):\n File \"/mnt/azureml/cr/j/81b29a2e28f14fcc9a90a2d4e67dbc62/exe/wd/pretrain_glue.py\", line 27, in <module>\n from transformers import (\n File \"<frozen importlib._bootstrap>\", line 1075, in _handle_fromlist\n File \"/opt/conda/envs/ptca/lib/python3.10/site-packages/transformers/utils/import_utils.py\", line 1272, in __getattr__\n module = self._get_module(self._class_to_module[name])\n File \"/opt/conda/envs/ptca/lib/python3.10/site-packages/transformers/utils/import_utils.py\", line 1284, in _get_module\n raise RuntimeError(\nRuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):\nFailed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):\ncannot import name 'split_torch_state_dict_into_shards' from 'huggingface_hub' (/opt/conda/envs/ptca/lib/python3.10/site-packages/huggingface_hub/__init__.py)\n",
E "message_parameters": {},
E "details": []
E },
E "time": "0001-01-01T00:00:00.000Z",
E "component_name": "CommonRuntime"
E }
/usr/share/miniconda/envs/isolated_1723432738489/lib/python3.12/site-packages/azure/ai/ml/operations/_job_ops_helper.py:297: JobException
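Editor's note: both acpt-pytorch failures share a single root cause: the transformers build installed at job time expects split_torch_state_dict_into_shards from huggingface_hub, which only exists in newer huggingface_hub releases than the one baked into the image, so the import fails before training starts. One sketch is to upgrade huggingface_hub alongside the other ad hoc installs in the job command; the ">=0.23.0" floor is an assumption to confirm against the transformers version actually resolved, and the same change would apply to the pytorch 1.13 job below. Only the command= argument changes:

        command="pip install -r requirements.txt"
        " && pip install multiprocess==0.70.15 'huggingface_hub>=0.23.0'"  # assumed floor; verify
        " && python pretrain_glue.py --tensorboard_log_dir \"/outputs/runs/\""
        " --deepspeed ds_config.json --num_train_epochs 5 --output_dir outputs --disable_tqdm 1"
        " --local_rank $RANK --evaluation_strategy \"epoch\" --logging_strategy \"epoch\""
        " --per_device_train_batch_size 93 --gradient_accumulation_steps 1"
        " --per_device_eval_batch_size 93 --learning_rate 3e-05 --adam_beta1 0.8 --adam_beta2 0.999"
        " --weight_decay 3e-07 --warmup_steps 500 --fp16 --logging_steps 1000"
        " --model_checkpoint \"bert-large-uncased\"",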
test_pytorch_1_13 (environment/acpt-pytorch-1.13-cuda11.7.tests.pytorch1_11_sample_test) failed
pytest-reports/environment/acpt-pytorch-1.13-cuda11.7.xml [took 43m 56s]
Raw output
azure.ai.ml.exceptions.JobException: Exception :
{
"error": {
"code": "UserError",
"message": "Execution failed. User process 'Rank 1' exited with status code 1. Please check log file 'user_logs/std_log_process_1.txt' for error details. Error: Traceback (most recent call last):\n File \"pretrain_glue.py\", line 27, in <module>\n from transformers import (\n File \"<frozen importlib._bootstrap>\", line 1039, in _handle_fromlist\n File \"/opt/conda/envs/ptca/lib/python3.8/site-packages/transformers/utils/import_utils.py\", line 1272, in __getattr__\n module = self._get_module(self._class_to_module[name])\n File \"/opt/conda/envs/ptca/lib/python3.8/site-packages/transformers/utils/import_utils.py\", line 1284, in _get_module\n raise RuntimeError(\nRuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):\nFailed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):\ncannot import name 'split_torch_state_dict_into_shards' from 'huggingface_hub' (/opt/conda/envs/ptca/lib/python3.8/site-packages/huggingface_hub/__init__.py)\n",
"message_parameters": {},
"details": []
},
"time": "0001-01-01T00:00:00.000Z",
"component_name": "CommonRuntime"
}
def test_pytorch_1_13():
"""Tests a sample job using pytorch 1.13 as the environment."""
this_dir = Path(__file__).parent
subscription_id = os.environ.get("subscription_id")
resource_group = os.environ.get("resource_group")
workspace_name = os.environ.get("workspace")
ml_client = MLClient(
AzureCliCredential(), subscription_id, resource_group, workspace_name
)
env_name = "acpt-pytorch-1_13-cuda11_7"
env_docker_context = Environment(
build=BuildContext(path=this_dir / BUILD_CONTEXT),
name=env_name,
description="Pytorch 1.13 environment created from a Docker context.",
)
ml_client.environments.create_or_update(env_docker_context)
# create the command
job = command(
code=this_dir / JOB_SOURCE_CODE, # local path where the code is stored
command="pip install -r requirements.txt"
" && python pretrain_glue.py --tensorboard_log_dir \"/outputs/runs/\""
" --deepspeed ds_config.json --num_train_epochs 5 --output_dir outputs --disable_tqdm 1"
" --local_rank $RANK --evaluation_strategy \"epoch\" --logging_strategy \"epoch\""
" --per_device_train_batch_size 93 --gradient_accumulation_steps 1"
" --per_device_eval_batch_size 93 --learning_rate 3e-05 --adam_beta1 0.8 --adam_beta2 0.999"
" --weight_decay 3e-07 --warmup_steps 500 --fp16 --logging_steps 1000"
" --model_checkpoint \"bert-large-uncased\"",
outputs={
"output": Output(
type="uri_folder",
mode="rw_mount",
path="azureml://datastores/workspaceblobstore/paths/outputs"
)
},
environment=f"{env_name}@latest",
compute=os.environ.get("gpu_v100_cluster"),
display_name="bert-pretrain-GLUE",
description="Pretrain the BERT model on the GLUE dataset.",
experiment_name="pytorch113_Cuda117_Experiment",
distribution=PyTorchDistribution(process_count_per_instance=1),
resources=JobResourceConfiguration(instance_count=2, shm_size='3100m'),
)
returned_job = ml_client.create_or_update(job)
assert returned_job is not None
# Poll until final status is reached or timed out
timeout = time.time() + (TIMEOUT_MINUTES * 60)
while time.time() <= timeout:
current_status = ml_client.jobs.get(returned_job.name).status
if current_status in ["Completed", "Failed"]:
break
time.sleep(30) # sleep 30 seconds
bashCommand = "ls"
process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
output, error = process.communicate()
print(output)
print(error)
if current_status == "Failed":
ml_client.jobs.download(returned_job.name)
if STD_LOG.exists():
print(f"*** BEGIN {STD_LOG} ***")
with open(STD_LOG, "r") as f:
print(f.read(), end="")
print(f"*** END {STD_LOG} ***")
else:
> ml_client.jobs.stream(returned_job.name)
tests/pytorch1_11_sample_test.py:92:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/usr/share/miniconda/envs/isolated_1723432714935/lib/python3.12/site-packages/azure/core/tracing/decorator.py:94: in wrapper_use_tracer
return func(*args, **kwargs)
/usr/share/miniconda/envs/isolated_1723432714935/lib/python3.12/site-packages/azure/ai/ml/operations/_job_operations.py:617: in stream
self._stream_logs_until_completion(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
run_operations = <azure.ai.ml.operations._run_operations.RunOperations object at 0x7f6ad1006ea0>
job_resource = <azure.ai.ml._restclient.v2022_10_01_preview.models._models_py3.JobBase object at 0x7f6ad3063dd0>
datastore_operations = <azure.ai.ml.operations._datastore_operations.DatastoreOperations object at 0x7f6ad47c56d0>
raise_exception_on_failed_job = True
def stream_logs_until_completion(
run_operations: RunOperations,
job_resource: JobBaseData,
datastore_operations: DatastoreOperations = None,
raise_exception_on_failed_job=True,
*,
requests_pipeline: HttpPipeline
) -> None:
"""Stream the experiment run output to the specified file handle. By
default the the file handle points to stdout.
:param run_operations: The run history operations class.
:type run_operations: RunOperations
:param job_resource: The job to stream
:type job_resource: JobBaseData
:param datastore_operations: Optional, the datastore operations class, used to get logs from datastore
:type datastore_operations: Optional[DatastoreOperations]
:param raise_exception_on_failed_job: Should this method fail if job fails
:type raise_exception_on_failed_job: Boolean
:return:
:rtype: None
"""
job_type = job_resource.properties.job_type
job_name = job_resource.name
studio_endpoint = job_resource.properties.services.get("Studio", None)
studio_endpoint = studio_endpoint.endpoint if studio_endpoint else None
file_handle = sys.stdout
ds_properties = None
prefix = None
if (
hasattr(job_resource.properties, "outputs")
and job_resource.properties.job_type != RestJobType.AUTO_ML
and datastore_operations
):
# Get default output location
default_output = (
job_resource.properties.outputs.get("default", None) if job_resource.properties.outputs else None
)
is_uri_folder = default_output and default_output.job_output_type == DataType.URI_FOLDER
if is_uri_folder:
output_uri = default_output.uri
# Parse the uri format
output_uri = output_uri.split("datastores/")[1]
datastore_name, prefix = output_uri.split("/", 1)
ds_properties = get_datastore_info(datastore_operations, datastore_name)
try:
file_handle.write("RunId: {}\n".format(job_name))
file_handle.write("Web View: {}\n".format(studio_endpoint))
_current_details: RunDetails = run_operations.get_run_details(job_name)
processed_logs = {}
poll_start_time = time.time()
pipeline_with_retries = create_requests_pipeline_with_retry(requests_pipeline=requests_pipeline)
while (
_current_details.status in RunHistoryConstants.IN_PROGRESS_STATUSES
or _current_details.status == JobStatus.FINALIZING
):
file_handle.flush()
time.sleep(_wait_before_polling(time.time() - poll_start_time))
_current_details: RunDetails = run_operations.get_run_details(job_name) # TODO use FileWatcher
if job_type.lower() in JobType.PIPELINE:
legacy_folder_name = "/logs/azureml/"
else:
legacy_folder_name = "/azureml-logs/"
_current_logs_dict = (
list_logs_in_datastore(
ds_properties,
prefix=prefix,
legacy_log_folder_name=legacy_folder_name,
)
if ds_properties is not None
else _current_details.log_files
)
# Get the list of new logs available after filtering out the processed ones
available_logs = _get_sorted_filtered_logs(_current_logs_dict, job_type, processed_logs)
content = ""
for current_log in available_logs:
content = download_text_from_url(
_current_logs_dict[current_log],
pipeline_with_retries,
timeout=RunHistoryConstants._DEFAULT_GET_CONTENT_TIMEOUT,
)
_incremental_print(content, processed_logs, current_log, file_handle)
# TODO: Temporary solution to wait for all the logs to be printed in the finalizing state.
if (
_current_details.status not in RunHistoryConstants.IN_PROGRESS_STATUSES
and _current_details.status == JobStatus.FINALIZING
and "The activity completed successfully. Finalizing run..." in content
):
break
file_handle.write("\n")
file_handle.write("Execution Summary\n")
file_handle.write("=================\n")
file_handle.write("RunId: {}\n".format(job_name))
file_handle.write("Web View: {}\n".format(studio_endpoint))
warnings = _current_details.warnings
if warnings:
messages = [x.message for x in warnings if x.message]
if len(messages) > 0:
file_handle.write("\nWarnings:\n")
for message in messages:
file_handle.write(message + "\n")
file_handle.write("\n")
if _current_details.status == JobStatus.FAILED:
error = (
_current_details.error.as_dict()
if _current_details.error
else "Detailed error not set on the Run. Please check the logs for details."
)
# If we are raising the error later on, so we don't double print.
if not raise_exception_on_failed_job:
file_handle.write("\nError:\n")
file_handle.write(json.dumps(error, indent=4))
file_handle.write("\n")
else:
> raise JobException(
message="Exception : \n {} ".format(json.dumps(error, indent=4)),
target=ErrorTarget.JOB,
no_personal_data_message="Exception raised on failed job.",
error_category=ErrorCategory.SYSTEM_ERROR,
)
E azure.ai.ml.exceptions.JobException: Exception :
E {
E "error": {
E "code": "UserError",
E "message": "Execution failed. User process 'Rank 1' exited with status code 1. Please check log file 'user_logs/std_log_process_1.txt' for error details. Error: Traceback (most recent call last):\n File \"pretrain_glue.py\", line 27, in <module>\n from transformers import (\n File \"<frozen importlib._bootstrap>\", line 1039, in _handle_fromlist\n File \"/opt/conda/envs/ptca/lib/python3.8/site-packages/transformers/utils/import_utils.py\", line 1272, in __getattr__\n module = self._get_module(self._class_to_module[name])\n File \"/opt/conda/envs/ptca/lib/python3.8/site-packages/transformers/utils/import_utils.py\", line 1284, in _get_module\n raise RuntimeError(\nRuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):\nFailed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):\ncannot import name 'split_torch_state_dict_into_shards' from 'huggingface_hub' (/opt/conda/envs/ptca/lib/python3.8/site-packages/huggingface_hub/__init__.py)\n",
E "message_parameters": {},
E "details": []
E },
E "time": "0001-01-01T00:00:00.000Z",
E "component_name": "CommonRuntime"
E }
/usr/share/miniconda/envs/isolated_1723432714935/lib/python3.12/site-packages/azure/ai/ml/operations/_job_ops_helper.py:297: JobException
test_tensorflow_2_11 (environment/tensorflow-2.11-cuda11.tests.tensorflow2_11_sample_test) failed
pytest-reports/environment/tensorflow-2.11-cuda11.xml [took 11m 26s]
Raw output
azure.ai.ml._ml_exceptions.JobException: Exception :
{
"error": {
"code": "UserError",
"message": "Image build failed. For more details, check log file azureml-logs/20_image_build_log.txt.",
"message_format": "Image build failed. For more details, check log file {ArtifactPath}.",
"message_parameters": {
"ArtifactPath": "azureml-logs/20_image_build_log.txt"
},
"details": [],
"inner_error": {
"code": "BadArgument",
"inner_error": {
"code": "ImageBuildFailure"
}
}
},
"correlation": {
"operation": "220363807c5cd69f3ae6e5e8def60fe9",
"request": "e29f83e22989c990"
},
"environment": "eastus",
"location": "eastus",
"time": "2024-08-12T03:30:10.050746Z",
"component_name": "RunHistory"
}
def test_tensorflow_2_11():
"""Tests a sample job using tensorflow 2.11 as the environment."""
this_dir = Path(__file__).parent
subscription_id = os.environ.get("subscription_id")
resource_group = os.environ.get("resource_group")
workspace_name = os.environ.get("workspace")
ml_client = MLClient(
AzureCliCredential(), subscription_id, resource_group, workspace_name
)
env_name = "tensorflow-2_11-cuda11"
env_docker_context = Environment(
build=BuildContext(path=this_dir / BUILD_CONTEXT),
name=env_name,
description="Tensorflow 2.11 environment created from a Docker context.",
)
ml_client.environments.create_or_update(env_docker_context)
# create the command
job = command(
code=this_dir / JOB_SOURCE_CODE, # local path where the code is stored
command="python main.py",
environment=f"{env_name}@latest",
compute=os.environ.get("gpu_v100_cluster"),
display_name="tensorflow-mnist-example",
description="A test run of the tensorflow 2_11 curated environment",
experiment_name="tensorflow211Experiment"
)
returned_job = ml_client.create_or_update(job)
assert returned_job is not None
# Poll until final status is reached or timed out
timeout = time.time() + (TIMEOUT_MINUTES * 60)
while time.time() <= timeout:
job = ml_client.jobs.get(returned_job.name)
status = job.status
if status in [JobStatus.COMPLETED, JobStatus.FAILED]:
break
time.sleep(30) # sleep 30 seconds
else:
# Timeout
ml_client.jobs.cancel(returned_job.name)
raise Exception(f"Test aborted because the job took longer than {TIMEOUT_MINUTES} minutes. "
f"Last status was {status}.")
if status == JobStatus.FAILED:
ml_client.jobs.download(returned_job.name)
if STD_LOG.exists():
print(f"*** BEGIN {STD_LOG} ***")
with open(STD_LOG, "r") as f:
print(f.read(), end="")
print(f"*** END {STD_LOG} ***")
else:
> ml_client.jobs.stream(returned_job.name)
tests/tensorflow2_11_sample_test.py:76:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/usr/share/miniconda/envs/isolated_1723432733547/lib/python3.12/site-packages/azure/ai/ml/_telemetry/activity.py:169: in wrapper
return f(*args, **kwargs)
/usr/share/miniconda/envs/isolated_1723432733547/lib/python3.12/site-packages/azure/ai/ml/operations/_job_operations.py:490: in stream
self._stream_logs_until_completion(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
run_operations = <azure.ai.ml.operations._run_operations.RunOperations object at 0x7f2272a32c00>
job_resource = <azure.ai.ml._restclient.v2022_02_01_preview.models._models_py3.JobBaseData object at 0x7f2272a8dee0>
datastore_operations = <azure.ai.ml.operations._datastore_operations.DatastoreOperations object at 0x7f2273bbd6a0>
raise_exception_on_failed_job = True
def stream_logs_until_completion(
run_operations: RunOperations,
job_resource: JobBaseData,
datastore_operations: DatastoreOperations = None,
raise_exception_on_failed_job=True,
) -> None:
"""Stream the experiment run output to the specified file handle.
By default the the file handle points to stdout.
:param run_operations: The run history operations class.
:type run_operations: RunOperations
:param job_resource: The job to stream
:type job_resource: JobBaseData
:param datastore_operations: Optional, the datastore operations class, used to get logs from datastore
:type datastore_operations: Optional[DatastoreOperations]
:param raise_exception_on_failed_job: Should this method fail if job fails
:type raise_exception_on_failed_job: Boolean
:return:
:rtype: None
"""
job_type = job_resource.properties.job_type
job_name = job_resource.name
studio_endpoint = job_resource.properties.services.get("Studio", None)
studio_endpoint = studio_endpoint.endpoint if studio_endpoint else None
file_handle = sys.stdout
ds_properties = None
prefix = None
if (
hasattr(job_resource.properties, "outputs")
and job_resource.properties.job_type != RestJobType.AUTO_ML
and datastore_operations
):
# Get default output location
default_output = (
job_resource.properties.outputs.get("default", None) if job_resource.properties.outputs else None
)
is_uri_folder = default_output and default_output.job_output_type == DataType.URI_FOLDER
if is_uri_folder:
output_uri = default_output.uri
# Parse the uri format
output_uri = output_uri.split("datastores/")[1]
datastore_name, prefix = output_uri.split("/", 1)
ds_properties = get_datastore_info(datastore_operations, datastore_name)
try:
file_handle.write("RunId: {}\n".format(job_name))
file_handle.write("Web View: {}\n".format(studio_endpoint))
_current_details: RunDetails = run_operations.get_run_details(job_name)
session = create_session_with_retry()
processed_logs = {}
poll_start_time = time.time()
while (
_current_details.status in RunHistoryConstants.IN_PROGRESS_STATUSES
or _current_details.status == JobStatus.FINALIZING
):
file_handle.flush()
time.sleep(_wait_before_polling(time.time() - poll_start_time))
_current_details: RunDetails = run_operations.get_run_details(job_name) # TODO use FileWatcher
if job_type.lower() in JobType.PIPELINE:
legacy_folder_name = "/logs/azureml/"
else:
legacy_folder_name = "/azureml-logs/"
_current_logs_dict = (
list_logs_in_datastore(ds_properties, prefix=prefix, legacy_log_folder_name=legacy_folder_name)
if ds_properties is not None
else _current_details.log_files
)
# Get the list of new logs available after filtering out the processed ones
available_logs = _get_sorted_filtered_logs(_current_logs_dict, job_type, processed_logs)
content = ""
for current_log in available_logs:
content = download_text_from_url(
_current_logs_dict[current_log],
session,
timeout=RunHistoryConstants._DEFAULT_GET_CONTENT_TIMEOUT,
)
_incremental_print(content, processed_logs, current_log, file_handle)
# TODO: Temporary solution to wait for all the logs to be printed in the finalizing state.
if (
_current_details.status not in RunHistoryConstants.IN_PROGRESS_STATUSES
and _current_details.status == JobStatus.FINALIZING
and "The activity completed successfully. Finalizing run..." in content
):
break
file_handle.write("\n")
file_handle.write("Execution Summary\n")
file_handle.write("=================\n")
file_handle.write("RunId: {}\n".format(job_name))
file_handle.write("Web View: {}\n".format(studio_endpoint))
warnings = _current_details.warnings
if warnings:
messages = [x.message for x in warnings if x.message]
if len(messages) > 0:
file_handle.write("\nWarnings:\n")
for message in messages:
file_handle.write(message + "\n")
file_handle.write("\n")
if _current_details.status == JobStatus.FAILED:
error = (
_current_details.error.as_dict()
if _current_details.error
else "Detailed error not set on the Run. Please check the logs for details."
)
# If we are raising the error later on, so we don't double print.
if not raise_exception_on_failed_job:
file_handle.write("\nError:\n")
file_handle.write(json.dumps(error, indent=4))
file_handle.write("\n")
else:
> raise JobException(
message="Exception : \n {} ".format(json.dumps(error, indent=4)),
target=ErrorTarget.JOB,
no_personal_data_message="Exception raised on failed job.",
)
E azure.ai.ml._ml_exceptions.JobException: Exception :
E {
E "error": {
E "code": "UserError",
E "message": "Image build failed. For more details, check log file azureml-logs/20_image_build_log.txt.",
E "message_format": "Image build failed. For more details, check log file {ArtifactPath}.",
E "message_parameters": {
E "ArtifactPath": "azureml-logs/20_image_build_log.txt"
E },
E "details": [],
E "inner_error": {
E "code": "BadArgument",
E "inner_error": {
E "code": "ImageBuildFailure"
E }
E }
E },
E "correlation": {
E "operation": "220363807c5cd69f3ae6e5e8def60fe9",
E "request": "e29f83e22989c990"
E },
E "environment": "eastus",
E "location": "eastus",
E "time": "2024-08-12T03:30:10.050746Z",
E "component_name": "RunHistory"
E }
/usr/share/miniconda/envs/isolated_1723432733547/lib/python3.12/site-packages/azure/ai/ml/operations/_job_ops_helper.py:285: JobException
test_pytorch_2_2 (environment/acpt-pytorch-2.2-cuda12.1-profiler.tests.pytorch2_2_sample_test) failed
pytest-reports/environment/acpt-pytorch-2.2-cuda12.1-profiler.xml [took 1h 0m 38s]
Raw output
AssertionError: assert 'Running' == 'Completed'
- Completed
+ Running
def test_pytorch_2_2():
"""Tests a sample job using pytorch 2.0 as the environment."""
this_dir = Path(__file__).parent
subscription_id = os.environ.get("subscription_id")
resource_group = os.environ.get("resource_group")
workspace_name = os.environ.get("workspace")
ml_client = MLClient(
AzureCliCredential(), subscription_id, resource_group, workspace_name
)
env_name = "acpt-pytorch-2_2-cuda12_1-profiler"
env_docker_context = Environment(
build=BuildContext(path=this_dir / BUILD_CONTEXT),
name=env_name,
description="Pytorch 2.2 environment created from a Docker context.",
)
ml_client.environments.create_or_update(env_docker_context)
# create the command
job = command(
code=this_dir / JOB_SOURCE_CODE, # local path where the code is stored
command="pip install -r requirements.txt && pip install multiprocess==0.70.15"
" && python pretrain_glue.py --tensorboard_log_dir \"/outputs/runs/\""
" --deepspeed ds_config.json --num_train_epochs 5 --output_dir outputs --disable_tqdm 1"
" --local_rank $RANK --evaluation_strategy \"epoch\" --logging_strategy \"epoch\""
" --per_device_train_batch_size 93 --gradient_accumulation_steps 1"
" --per_device_eval_batch_size 93 --learning_rate 3e-05 --adam_beta1 0.8 --adam_beta2 0.999"
" --weight_decay 3e-07 --warmup_steps 500 --fp16 --logging_steps 1000"
" --model_checkpoint \"bert-large-uncased\"",
outputs={
"output": Output(
type="uri_folder",
mode="rw_mount",
path="azureml://datastores/workspaceblobstore/paths/outputs"
)
},
environment=f"{env_name}@latest",
compute=os.environ.get("gpu_v100_cluster"),
display_name="bert-pretrain-GLUE",
description="Pretrain the BERT model on the GLUE dataset.",
experiment_name="pytorch22_Cuda121_py310_profiler_Experiment",
distribution=PyTorchDistribution(process_count_per_instance=1),
resources=JobResourceConfiguration(instance_count=2, shm_size='3100m'),
)
returned_job = ml_client.create_or_update(job)
assert returned_job is not None
# Poll until final status is reached or timed out
timeout = time.time() + (TIMEOUT_MINUTES * 60)
while time.time() <= timeout:
current_status = ml_client.jobs.get(returned_job.name).status
if current_status in ["Completed", "Failed"]:
break
time.sleep(30) # sleep 30 seconds
bashCommand = "ls"
process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
output, error = process.communicate()
print(output)
print(error)
if current_status == "Failed" or current_status == "Cancelled":
ml_client.jobs.download(returned_job.name)
if STD_LOG.exists():
print(f"*** BEGIN {STD_LOG} ***")
with open(STD_LOG, "r") as f:
print(f.read(), end="")
print(f"*** END {STD_LOG} ***")
else:
ml_client.jobs.stream(returned_job.name)
> assert current_status == "Completed"
E AssertionError: assert 'Running' == 'Completed'
E - Completed
E + Running
tests/pytorch2_2_sample_test.py:94: AssertionError