Fix DBCopilot environment Vulnerabilities #3217

Merged 12 commits on Aug 9, 2024

Merge branch 'xiangrao/20240801' of https://github.com/Azure/azureml-…

bb4277c
GitHub Actions / Test Results for assets-test failed Aug 12, 2024 in 0s

10 fail, 1 skipped, 271 pass in 13h 55m 5s

282 tests, 27 suites, 27 files
271 ✅ passed · 1 💤 skipped · 10 ❌ failed · 13h 55m 5s ⏱️

Results for commit bb4277c.

Annotations

Check warning on line 0 in environment/responsibleai-text.tests.responsibleai_text_sample_test

test_responsibleai_text (environment/responsibleai-text.tests.responsibleai_text_sample_test) failed

pytest-reports/environment/responsibleai-text.xml [took 30m 23s]
Raw output
AssertionError: assert 'Running' == <JobStatus.CO...: 'Completed'>
  - Completed
  + Running
def test_responsibleai_text():
        """Tests a sample job using responsibleai text image as the environment."""
        this_dir = Path(__file__).parent
    
        subscription_id = os.environ.get("subscription_id")
        resource_group = os.environ.get("resource_group")
        workspace_name = os.environ.get("workspace")
    
        ml_client = MLClient(
            AzureCliCredential(), subscription_id, resource_group, workspace_name
        )
    
        env_name = "responsibleai-text"
    
        env_docker_context = Environment(
            build=BuildContext(path=this_dir / BUILD_CONTEXT),
            name=env_name,
            description="ResponsibleAI Text environment created from a Docker context.",
        )
        ml_client.environments.create_or_update(env_docker_context)
    
        # create the command
        job = command(
            code=this_dir / JOB_SOURCE_CODE,  # local path where the code is stored
            command="python main.py",
            environment=f"{env_name}@latest",
            compute=os.environ.get("cpu_cluster"),
            display_name="responsibleai-text-example",
            description="A test run of the responsibleai text curated environment",
            experiment_name="responsibleaiTextExperiment"
        )
    
        returned_job = ml_client.create_or_update(job)
        assert returned_job is not None
    
        # Poll until final status is reached, or timed out
        timeout = time.time() + (TIMEOUT_MINUTES * 60)
        while time.time() <= timeout:
            current_status = ml_client.jobs.get(returned_job.name).status
            if current_status in [JobStatus.COMPLETED, JobStatus.FAILED]:
                break
            time.sleep(30)  # sleep 30 seconds
    
        if current_status == JobStatus.FAILED:
            ml_client.jobs.download(returned_job.name)
            if STD_LOG.exists():
                print(f"*** BEGIN {STD_LOG} ***")
                with open(STD_LOG, "r") as f:
                    print(f.read(), end="")
                print(f"*** END {STD_LOG} ***")
            else:
                ml_client.jobs.stream(returned_job.name)
    
>       assert current_status == JobStatus.COMPLETED
E       AssertionError: assert 'Running' == <JobStatus.CO...: 'Completed'>
E         - Completed
E         + Running

tests/responsibleai_text_sample_test.py:73: AssertionError
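
Why this failed: the job was still 'Running' when the polling window expired. The loop in the test exits silently on timeout, so the final assert compares 'Running' against JobStatus.COMPLETED. A minimal sketch of timeout-aware polling, assuming the ml_client, returned_job, JobStatus, and TIMEOUT_MINUTES names from the test above; the while/else pattern is the same one the tensorflow test below already uses:

    import time

    timeout = time.time() + (TIMEOUT_MINUTES * 60)
    while time.time() <= timeout:
        current_status = ml_client.jobs.get(returned_job.name).status
        if current_status in [JobStatus.COMPLETED, JobStatus.FAILED]:
            break
        time.sleep(30)  # sleep 30 seconds between polls
    else:
        # Loop ended without break: the job never reached a terminal state.
        ml_client.jobs.cancel(returned_job.name)
        raise TimeoutError(
            f"Job still '{current_status}' after {TIMEOUT_MINUTES} minutes; cancelled."
        )

    assert current_status == JobStatus.COMPLETED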

Check warning on line 0 in environment/tensorflow-2.12-cuda11.tests.tensorflow2_12_sample_test

test_tensorflow_2_12 (environment/tensorflow-2.12-cuda11.tests.tensorflow2_12_sample_test) failed

pytest-reports/environment/tensorflow-2.12-cuda11.xml [took 10m 2s]
Raw output
azure.ai.ml._ml_exceptions.JobException: Exception : 
 {
    "error": {
        "code": "UserError",
        "message": "Image build failed. For more details, check log file azureml-logs/20_image_build_log.txt.",
        "message_format": "Image build failed. For more details, check log file {ArtifactPath}.",
        "message_parameters": {
            "ArtifactPath": "azureml-logs/20_image_build_log.txt"
        },
        "details": [],
        "inner_error": {
            "code": "BadArgument",
            "inner_error": {
                "code": "ImageBuildFailure"
            }
        }
    },
    "correlation": {
        "operation": "a7e12e719bee29b04d1eb79f75fdbf20",
        "request": "03587c198c905a4f"
    },
    "environment": "eastus",
    "location": "eastus",
    "time": "2024-08-12T03:28:58.488234Z",
    "component_name": "RunHistory"
}
def test_tensorflow_2_12():
        """Tests a sample job using tensorflow 2.12 as the environment."""
        this_dir = Path(__file__).parent
    
        subscription_id = os.environ.get("subscription_id")
        resource_group = os.environ.get("resource_group")
        workspace_name = os.environ.get("workspace")
    
        ml_client = MLClient(
            AzureCliCredential(), subscription_id, resource_group, workspace_name
        )
    
        env_name = "tensorflow-2_12-cuda11"
    
        env_docker_context = Environment(
            build=BuildContext(path=this_dir / BUILD_CONTEXT),
            name=env_name,
            description="Tensorflow 2.12 environment created from a Docker context.",
        )
        ml_client.environments.create_or_update(env_docker_context)
    
        # create the command
        job = command(
            code=this_dir / JOB_SOURCE_CODE,  # local path where the code is stored
            command="python main.py",
            environment=f"{env_name}@latest",
            compute=os.environ.get("gpu_v100_cluster"),
            display_name="tensorflow-mnist-example",
            description="A test run of the tensorflow 2_12 curated environment",
            experiment_name="tensorflow212Experiment"
        )
    
        returned_job = ml_client.create_or_update(job)
        assert returned_job is not None
    
        # Poll until final status is reached or timed out
        timeout = time.time() + (TIMEOUT_MINUTES * 60)
        while time.time() <= timeout:
            job = ml_client.jobs.get(returned_job.name)
            status = job.status
            if status in [JobStatus.COMPLETED, JobStatus.FAILED]:
                break
            time.sleep(30)  # sleep 30 seconds
        else:
            # Timeout
            ml_client.jobs.cancel(returned_job.name)
            raise Exception(f"Test aborted because the job took longer than {TIMEOUT_MINUTES} minutes. "
                            f"Last status was {status}.")
    
        if status == JobStatus.FAILED:
            ml_client.jobs.download(returned_job.name)
            if STD_LOG.exists():
                print(f"*** BEGIN {STD_LOG} ***")
                with open(STD_LOG, "r") as f:
                    print(f.read(), end="")
                print(f"*** END {STD_LOG} ***")
            else:
>               ml_client.jobs.stream(returned_job.name)

tests/tensorflow2_12_sample_test.py:76: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
/usr/share/miniconda/envs/isolated_1723432750667/lib/python3.12/site-packages/azure/ai/ml/_telemetry/activity.py:169: in wrapper
    return f(*args, **kwargs)
/usr/share/miniconda/envs/isolated_1723432750667/lib/python3.12/site-packages/azure/ai/ml/operations/_job_operations.py:490: in stream
    self._stream_logs_until_completion(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

run_operations = <azure.ai.ml.operations._run_operations.RunOperations object at 0x7efdf23c7ef0>
job_resource = <azure.ai.ml._restclient.v2022_02_01_preview.models._models_py3.JobBaseData object at 0x7efdf126a630>
datastore_operations = <azure.ai.ml.operations._datastore_operations.DatastoreOperations object at 0x7efdf239da90>
raise_exception_on_failed_job = True

    def stream_logs_until_completion(
        run_operations: RunOperations,
        job_resource: JobBaseData,
        datastore_operations: DatastoreOperations = None,
        raise_exception_on_failed_job=True,
    ) -> None:
        """Stream the experiment run output to the specified file handle.
        By default the file handle points to stdout.
        :param run_operations: The run history operations class.
        :type run_operations: RunOperations
        :param job_resource: The job to stream
        :type job_resource: JobBaseData
        :param datastore_operations: Optional, the datastore operations class, used to get logs from datastore
        :type datastore_operations: Optional[DatastoreOperations]
        :param raise_exception_on_failed_job: Should this method fail if job fails
        :type raise_exception_on_failed_job: Boolean
        :return:
        :rtype: None
        """
        job_type = job_resource.properties.job_type
        job_name = job_resource.name
        studio_endpoint = job_resource.properties.services.get("Studio", None)
        studio_endpoint = studio_endpoint.endpoint if studio_endpoint else None
        file_handle = sys.stdout
        ds_properties = None
        prefix = None
        if (
            hasattr(job_resource.properties, "outputs")
            and job_resource.properties.job_type != RestJobType.AUTO_ML
            and datastore_operations
        ):
            # Get default output location
    
            default_output = (
                job_resource.properties.outputs.get("default", None) if job_resource.properties.outputs else None
            )
            is_uri_folder = default_output and default_output.job_output_type == DataType.URI_FOLDER
            if is_uri_folder:
                output_uri = default_output.uri
                # Parse the uri format
                output_uri = output_uri.split("datastores/")[1]
                datastore_name, prefix = output_uri.split("/", 1)
                ds_properties = get_datastore_info(datastore_operations, datastore_name)
    
        try:
            file_handle.write("RunId: {}\n".format(job_name))
            file_handle.write("Web View: {}\n".format(studio_endpoint))
    
            _current_details: RunDetails = run_operations.get_run_details(job_name)
            session = create_session_with_retry()
    
            processed_logs = {}
    
            poll_start_time = time.time()
            while (
                _current_details.status in RunHistoryConstants.IN_PROGRESS_STATUSES
                or _current_details.status == JobStatus.FINALIZING
            ):
                file_handle.flush()
                time.sleep(_wait_before_polling(time.time() - poll_start_time))
                _current_details: RunDetails = run_operations.get_run_details(job_name)  # TODO use FileWatcher
                if job_type.lower() in JobType.PIPELINE:
                    legacy_folder_name = "/logs/azureml/"
                else:
                    legacy_folder_name = "/azureml-logs/"
                _current_logs_dict = (
                    list_logs_in_datastore(ds_properties, prefix=prefix, legacy_log_folder_name=legacy_folder_name)
                    if ds_properties is not None
                    else _current_details.log_files
                )
                # Get the list of new logs available after filtering out the processed ones
                available_logs = _get_sorted_filtered_logs(_current_logs_dict, job_type, processed_logs)
                content = ""
                for current_log in available_logs:
                    content = download_text_from_url(
                        _current_logs_dict[current_log],
                        session,
                        timeout=RunHistoryConstants._DEFAULT_GET_CONTENT_TIMEOUT,
                    )
    
                    _incremental_print(content, processed_logs, current_log, file_handle)
    
                # TODO: Temporary solution to wait for all the logs to be printed in the finalizing state.
                if (
                    _current_details.status not in RunHistoryConstants.IN_PROGRESS_STATUSES
                    and _current_details.status == JobStatus.FINALIZING
                    and "The activity completed successfully. Finalizing run..." in content
                ):
                    break
    
            file_handle.write("\n")
            file_handle.write("Execution Summary\n")
            file_handle.write("=================\n")
            file_handle.write("RunId: {}\n".format(job_name))
            file_handle.write("Web View: {}\n".format(studio_endpoint))
    
            warnings = _current_details.warnings
            if warnings:
                messages = [x.message for x in warnings if x.message]
                if len(messages) > 0:
                    file_handle.write("\nWarnings:\n")
                    for message in messages:
                        file_handle.write(message + "\n")
                    file_handle.write("\n")
    
            if _current_details.status == JobStatus.FAILED:
                error = (
                    _current_details.error.as_dict()
                    if _current_details.error
                    else "Detailed error not set on the Run. Please check the logs for details."
                )
            # If we are raising the error later on, don't double print it here.
                if not raise_exception_on_failed_job:
                    file_handle.write("\nError:\n")
                    file_handle.write(json.dumps(error, indent=4))
                    file_handle.write("\n")
                else:
>                   raise JobException(
                        message="Exception : \n {} ".format(json.dumps(error, indent=4)),
                        target=ErrorTarget.JOB,
                        no_personal_data_message="Exception raised on failed job.",
                    )
E                   azure.ai.ml._ml_exceptions.JobException: Exception : 
E                    {
E                       "error": {
E                           "code": "UserError",
E                           "message": "Image build failed. For more details, check log file azureml-logs/20_image_build_log.txt.",
E                           "message_format": "Image build failed. For more details, check log file {ArtifactPath}.",
E                           "message_parameters": {
E                               "ArtifactPath": "azureml-logs/20_image_build_log.txt"
E                           },
E                           "details": [],
E                           "inner_error": {
E                               "code": "BadArgument",
E                               "inner_error": {
E                                   "code": "ImageBuildFailure"
E                               }
E                           }
E                       },
E                       "correlation": {
E                           "operation": "a7e12e719bee29b04d1eb79f75fdbf20",
E                           "request": "03587c198c905a4f"
E                       },
E                       "environment": "eastus",
E                       "location": "eastus",
E                       "time": "2024-08-12T03:28:58.488234Z",
E                       "component_name": "RunHistory"
E                   }

/usr/share/miniconda/envs/isolated_1723432750667/lib/python3.12/site-packages/azure/ai/ml/operations/_job_ops_helper.py:285: JobException
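
The nested inner_error codes (BadArgument, ImageBuildFailure) mean the Docker context for tensorflow-2_12-cuda11 failed to build before the job ever ran, and the details live in the azureml-logs/20_image_build_log.txt artifact named in the message. A hedged sketch for pulling that log locally, assuming the ml_client and returned_job objects from the test above; the directory layout under download_path varies between SDK versions, so the file is located by search rather than a hard-coded path:

    from pathlib import Path

    # Download all artifacts (outputs and logs) for the failed run.
    ml_client.jobs.download(returned_job.name, download_path="debug", all=True)

    # Print every copy of the image build log that was downloaded.
    for log in Path("debug").rglob("20_image_build_log.txt"):
        print(f"*** {log} ***")
        print(log.read_text())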

Check warning on line 0 in component/dataset_downloader.tests.dataset_downloader.test_dataset_downloader.TestDatasetDownloaderComponent

test_dataset_downloader_component[None-all-test-/home/runner/work/azureml-assets/azureml-assets/assets/aml-benchmark/scripts/da…/math.py] (component/dataset_downloader.tests.dataset_downloader.test_dataset_downloader.TestDatasetDownloaderComponent) failed

pytest-reports/component/dataset_downloader.xml [took 1m 36s]
Raw output
ValueError: The repository for math contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/math.
Please pass the argument `trust_remote_code=True` to allow custom code to be run.
trust_remote_code = None, repo_id = 'math'

    def resolve_trust_remote_code(trust_remote_code: Optional[bool], repo_id: str) -> bool:
        """
        Copied and adapted from Transformers
        https://github.com/huggingface/transformers/blob/2098d343cc4b4b9d2aea84b3cf1eb5a1e610deff/src/transformers/dynamic_module_utils.py#L589
        """
        trust_remote_code = trust_remote_code if trust_remote_code is not None else config.HF_DATASETS_TRUST_REMOTE_CODE
        if trust_remote_code is None:
            if config.TIME_OUT_REMOTE_CODE > 0:
                try:
                    signal.signal(signal.SIGALRM, _raise_timeout_error)
                    signal.alarm(config.TIME_OUT_REMOTE_CODE)
                    while trust_remote_code is None:
>                       answer = input(
                            f"The repository for {repo_id} contains custom code which must be executed to correctly "
                            f"load the dataset. You can inspect the repository content at https://hf.co/datasets/{repo_id}.\n"
                            f"You can avoid this prompt in future by passing the argument `trust_remote_code=True`.\n\n"
                            f"Do you wish to run the custom code? [y/N] "
                        )
E                       OSError: pytest: reading from stdin while output is captured!  Consider using `-s`.

/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/site-packages/datasets/load.py:120: OSError

During handling of the above exception, another exception occurred:

self = <tests.dataset_downloader.test_dataset_downloader.TestDatasetDownloaderComponent object at 0x7f4d42cdc8d0>
temp_dir = '/tmp/pytest-of-runner/pytest-0/test_dataset_downloader_compon4'
dataset_name = None, configuration = 'all', split = 'test'
script = '/home/runner/work/azureml-assets/azureml-assets/assets/aml-benchmark/scripts/data_loaders/math.py'

    @pytest.mark.parametrize(
        "dataset_name, configuration, split, script",
        [
            ("xquad", "xquad.en", "validation", None),
            ("xquad", "xquad.en,xquad.hi", "all", None),
            ("xquad", "all", "all", None),
            ("cifar10", "all", "test", None),
            (None, "all", "test", Constants.MATH_DATASET_LOADER_SCRIPT),
        ],
    )
    def test_dataset_downloader_component(
        self,
        temp_dir: str,
        dataset_name: Union[str, None],
        configuration: Union[str, None],
        split: Union[str, None],
        script: Union[str, None],
    ) -> None:
        """Dataset Downloader component test."""
        ml_client = get_mlclient()
    
        pipeline_job = self._get_pipeline_job(
            dataset_name,
            configuration,
            split,
            script,
            self.test_dataset_downloader_component.__name__,
        )
    
        # submit the pipeline job
        pipeline_job = ml_client.create_or_update(
            pipeline_job, experiment_name=self.EXP_NAME
        )
        ml_client.jobs.stream(pipeline_job.name)
        print(pipeline_job)
    
        file_count = 1
        path = dataset_name if dataset_name else script
        if configuration == "all":
>           file_count = len(get_dataset_config_names(path))

../../tests/dataset_downloader/test_dataset_downloader.py:74: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/site-packages/datasets/inspect.py:347: in get_dataset_config_names
    dataset_module = dataset_module_factory(
/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/site-packages/datasets/load.py:1814: in dataset_module_factory
    ).get_module()
/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/site-packages/datasets/load.py:962: in get_module
    trust_remote_code = resolve_trust_remote_code(self.trust_remote_code, self.name)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

trust_remote_code = None, repo_id = 'math'

    def resolve_trust_remote_code(trust_remote_code: Optional[bool], repo_id: str) -> bool:
        """
        Copied and adapted from Transformers
        https://github.com/huggingface/transformers/blob/2098d343cc4b4b9d2aea84b3cf1eb5a1e610deff/src/transformers/dynamic_module_utils.py#L589
        """
        trust_remote_code = trust_remote_code if trust_remote_code is not None else config.HF_DATASETS_TRUST_REMOTE_CODE
        if trust_remote_code is None:
            if config.TIME_OUT_REMOTE_CODE > 0:
                try:
                    signal.signal(signal.SIGALRM, _raise_timeout_error)
                    signal.alarm(config.TIME_OUT_REMOTE_CODE)
                    while trust_remote_code is None:
                        answer = input(
                            f"The repository for {repo_id} contains custom code which must be executed to correctly "
                            f"load the dataset. You can inspect the repository content at https://hf.co/datasets/{repo_id}.\n"
                            f"You can avoid this prompt in future by passing the argument `trust_remote_code=True`.\n\n"
                            f"Do you wish to run the custom code? [y/N] "
                        )
                        if answer.lower() in ["yes", "y", "1"]:
                            trust_remote_code = True
                        elif answer.lower() in ["no", "n", "0", ""]:
                            trust_remote_code = False
                    signal.alarm(0)
                except Exception:
                    # OS which does not support signal.SIGALRM
>                   raise ValueError(
                        f"The repository for {repo_id} contains custom code which must be executed to correctly "
                        f"load the dataset. You can inspect the repository content at https://hf.co/datasets/{repo_id}.\n"
                        f"Please pass the argument `trust_remote_code=True` to allow custom code to be run."
                    )
E                   ValueError: The repository for math contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/math.
E                   Please pass the argument `trust_remote_code=True` to allow custom code to be run.

/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/site-packages/datasets/load.py:133: ValueError
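
Root cause: recent versions of datasets prompt before executing loader code shipped inside a dataset repository. Under pytest the prompt's input() hits captured stdin and raises the OSError above, and the fallback path then raises the ValueError. The error message itself names the fix: pass trust_remote_code=True. A minimal sketch, assuming datasets >= 2.16 (where the prompt was introduced) and that get_dataset_config_names forwards extra keyword arguments to dataset_module_factory, which accepts trust_remote_code; the TestDatasetDownloaderScript failures below share this root cause:

    from datasets import get_dataset_config_names, load_dataset

    # Opt in to dataset-repository code explicitly instead of prompting.
    configs = get_dataset_config_names("math", trust_remote_code=True)
    dataset = load_dataset("math", configs[0], split="test", trust_remote_code=True)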

Check warning on line 0 in component/dataset_downloader.tests.dataset_downloader.test_dataset_downloader.TestDatasetDownloaderScript

test_invalid_input_combination[None-train-None] (component/dataset_downloader.tests.dataset_downloader.test_dataset_downloader.TestDatasetDownloaderScript) failed

pytest-reports/component/dataset_downloader.xml [took 14s]
Raw output
ValueError: Loading this dataset requires you to execute custom code contained in the dataset repository on your local machine. Please set the option `trust_remote_code=True` to permit loading of this dataset.
self = <tests.dataset_downloader.test_dataset_downloader.TestDatasetDownloaderScript object at 0x7f4d42cf2150>
dataset_name = None, split = 'train', script = None

    @pytest.mark.parametrize(
        "dataset_name, split, script",
        [(None, "train", None), ("dataset", "test", "script")],
    )
    def test_invalid_input_combination(
        self,
        dataset_name: Union[str, None],
        split: Union[str, None],
        script: Union[str, None],
    ):
        """Test for invalid input combination."""
        if dataset_name and script:
            expected_exception_mssg = "Either 'dataset_name' or 'script' must be supplied; but not both."
        elif not (dataset_name or script):
            expected_exception_mssg = "Either 'dataset_name' or 'script' must be supplied."
    
        # Run the script and verify the exception
        try:
>           self._run_downloader_script(dataset_name, None, split, script)

../../tests/dataset_downloader/test_dataset_downloader.py:156: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
../../tests/dataset_downloader/test_dataset_downloader.py:226: in _run_downloader_script
    run_command(" ".join(args))
../../tests/test_utils.py:285: in run_command
    result = subprocess.run(
/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/subprocess.py:548: in run
    stdout, stderr = process.communicate(input, timeout=timeout)
/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/subprocess.py:1192: in communicate
    stdout = self.stdout.read()
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

signum = 14
frame = <frame at 0x7f4d3be04440, file '/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/subprocess.py', line 1192, code communicate>

    def _raise_timeout_error(signum, frame):
>       raise ValueError(
            "Loading this dataset requires you to execute custom code contained in the dataset repository on your local "
            "machine. Please set the option `trust_remote_code=True` to permit loading of this dataset."
        )
E       ValueError: Loading this dataset requires you to execute custom code contained in the dataset repository on your local machine. Please set the option `trust_remote_code=True` to permit loading of this dataset.

/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/site-packages/datasets/load.py:102: ValueError
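
This one looks like collateral damage rather than an independent failure: signum = 14 is SIGALRM, and the handler in the traceback is datasets' _raise_timeout_error. The prompt logic arms signal.alarm(...) before calling input(), but the exception path seen in the previous failure never reaches signal.alarm(0), so the leaked alarm fires later, here inside subprocess.communicate() of an unrelated test. A minimal defensive sketch, assuming a pytest suite; the fixture name is hypothetical:

    import signal

    import pytest

    @pytest.fixture(autouse=True)
    def _clear_leaked_sigalrm():
        """Cancel any alarm a previous test armed but never cleared."""
        yield
        signal.alarm(0)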

Check warning on line 0 in component/dataset_downloader.tests.dataset_downloader.test_dataset_downloader.TestDatasetDownloaderScript

test_invalid_hf_dataset[winogrande-None-validation] (component/dataset_downloader.tests.dataset_downloader.test_dataset_downloader.TestDatasetDownloaderScript) failed

pytest-reports/component/dataset_downloader.xml [took 2s]
Raw output
ValueError: The repository for winogrande contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/winogrande.
Please pass the argument `trust_remote_code=True` to allow custom code to be run.
self = <tests.dataset_downloader.test_dataset_downloader.TestDatasetDownloaderScript object at 0x7f4d42cf0510>
dataset_name = 'winogrande', configuration = None, split = 'validation'

    @pytest.mark.parametrize(
        "dataset_name, configuration, split",
        [
            ("squad_v2", None, "test"),
            ("winogrande", None, "validation"),
            ("some_random_name", None, "test"),
            ("cifar100", None, "test")
        ],
    )
    def test_invalid_hf_dataset(
        self, dataset_name: str, configuration: Optional[str], split: str
    ):
        """Test for unsupported url file."""
        expected_exception_mssg = f"Split '{split}' not available for dataset '{dataset_name}' and config '{None}'."
        if dataset_name == "winogrande" and configuration is None:
            expected_exception_mssg = (
                f"Multiple configurations available for dataset '{dataset_name}'. Please specify either one of "
>               f"the following: {get_dataset_config_names(dataset_name)} or 'all'."

../../tests/dataset_downloader/test_dataset_downloader.py:178: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/site-packages/datasets/inspect.py:347: in get_dataset_config_names
    dataset_module = dataset_module_factory(
/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/site-packages/datasets/load.py:1914: in dataset_module_factory
    raise e1 from None
/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/site-packages/datasets/load.py:1887: in dataset_module_factory
    ).get_module()
/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/site-packages/datasets/load.py:1525: in get_module
    trust_remote_code = resolve_trust_remote_code(self.trust_remote_code, self.name)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

trust_remote_code = None, repo_id = 'winogrande'

    def resolve_trust_remote_code(trust_remote_code: Optional[bool], repo_id: str) -> bool:
        """
        Copied and adapted from Transformers
        https://github.com/huggingface/transformers/blob/2098d343cc4b4b9d2aea84b3cf1eb5a1e610deff/src/transformers/dynamic_module_utils.py#L589
        """
        trust_remote_code = trust_remote_code if trust_remote_code is not None else config.HF_DATASETS_TRUST_REMOTE_CODE
        if trust_remote_code is None:
            if config.TIME_OUT_REMOTE_CODE > 0:
                try:
                    signal.signal(signal.SIGALRM, _raise_timeout_error)
                    signal.alarm(config.TIME_OUT_REMOTE_CODE)
                    while trust_remote_code is None:
                        answer = input(
                            f"The repository for {repo_id} contains custom code which must be executed to correctly "
                            f"load the dataset. You can inspect the repository content at https://hf.co/datasets/{repo_id}.\n"
                            f"You can avoid this prompt in future by passing the argument `trust_remote_code=True`.\n\n"
                            f"Do you wish to run the custom code? [y/N] "
                        )
                        if answer.lower() in ["yes", "y", "1"]:
                            trust_remote_code = True
                        elif answer.lower() in ["no", "n", "0", ""]:
                            trust_remote_code = False
                    signal.alarm(0)
                except Exception:
                    # OS which does not support signal.SIGALRM
>                   raise ValueError(
                        f"The repository for {repo_id} contains custom code which must be executed to correctly "
                        f"load the dataset. You can inspect the repository content at https://hf.co/datasets/{repo_id}.\n"
                        f"Please pass the argument `trust_remote_code=True` to allow custom code to be run."
                    )
E                   ValueError: The repository for winogrande contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/winogrande.
E                   Please pass the argument `trust_remote_code=True` to allow custom code to be run.

/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/site-packages/datasets/load.py:133: ValueError

Check warning on line 0 in component/dataset_downloader.tests.dataset_downloader.test_dataset_downloader.TestDatasetDownloaderScript

test_invalid_hf_dataset[some_random_name-None-test] (component/dataset_downloader.tests.dataset_downloader.test_dataset_downloader.TestDatasetDownloaderScript) failed

pytest-reports/component/dataset_downloader.xml [took 14s]
Raw output
ValueError: Loading this dataset requires you to execute custom code contained in the dataset repository on your local machine. Please set the option `trust_remote_code=True` to permit loading of this dataset.
self = <tests.dataset_downloader.test_dataset_downloader.TestDatasetDownloaderScript object at 0x7f4d42cebd50>
dataset_name = 'some_random_name', configuration = None, split = 'test'

    @pytest.mark.parametrize(
        "dataset_name, configuration, split",
        [
            ("squad_v2", None, "test"),
            ("winogrande", None, "validation"),
            ("some_random_name", None, "test"),
            ("cifar100", None, "test")
        ],
    )
    def test_invalid_hf_dataset(
        self, dataset_name: str, configuration: Optional[str], split: str
    ):
        """Test for unsupported url file."""
        expected_exception_mssg = f"Split '{split}' not available for dataset '{dataset_name}' and config '{None}'."
        if dataset_name == "winogrande" and configuration is None:
            expected_exception_mssg = (
                f"Multiple configurations available for dataset '{dataset_name}'. Please specify either one of "
                f"the following: {get_dataset_config_names(dataset_name)} or 'all'."
            )
        elif dataset_name == "some_random_name":
            expected_exception_mssg = f"FileNotFoundError: Dataset '{dataset_name}' doesn't exist on the Hub"
        elif dataset_name == "cifar100":
            expected_exception_mssg = "Error saving dataset to JSONL format: "
    
        # Run the script and verify the exception
        try:
>           self._run_downloader_script(dataset_name, None, split, None)

../../tests/dataset_downloader/test_dataset_downloader.py:187: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
../../tests/dataset_downloader/test_dataset_downloader.py:226: in _run_downloader_script
    run_command(" ".join(args))
../../tests/test_utils.py:285: in run_command
    result = subprocess.run(
/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/subprocess.py:548: in run
    stdout, stderr = process.communicate(input, timeout=timeout)
/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/subprocess.py:1192: in communicate
    stdout = self.stdout.read()
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

signum = 14
frame = <frame at 0x7f4d4273fd40, file '/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/subprocess.py', line 1192, code communicate>

    def _raise_timeout_error(signum, frame):
>       raise ValueError(
            "Loading this dataset requires you to execute custom code contained in the dataset repository on your local "
            "machine. Please set the option `trust_remote_code=True` to permit loading of this dataset."
        )
E       ValueError: Loading this dataset requires you to execute custom code contained in the dataset repository on your local machine. Please set the option `trust_remote_code=True` to permit loading of this dataset.

/usr/share/miniconda/envs/isolated_1723432722702/lib/python3.11/site-packages/datasets/load.py:102: ValueError
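
The script-based downloader tests run through subprocess, so the opt-in cannot be passed as a call-site argument there. datasets also reads the HF_DATASETS_TRUST_REMOTE_CODE environment variable into config.HF_DATASETS_TRUST_REMOTE_CODE (visible in resolve_trust_remote_code above), which a child process inherits. A hedged sketch; the script name and arguments are hypothetical stand-ins for _run_downloader_script's actual command line:

    import os
    import subprocess

    # Inherit the current environment and add the datasets opt-in flag.
    env = {**os.environ, "HF_DATASETS_TRUST_REMOTE_CODE": "1"}
    subprocess.run(
        ["python", "dataset_downloader.py", "--dataset_name", "winogrande"],
        env=env,
        check=True,
    )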

Check warning on line 0 in environment/acpt-pytorch-2.2-cuda12.1.tests.pytorch2_2_sample_test

test_pytorch_2_2 (environment/acpt-pytorch-2.2-cuda12.1.tests.pytorch2_2_sample_test) failed

pytest-reports/environment/acpt-pytorch-2.2-cuda12.1.xml [took 35m 55s]
Raw output
azure.ai.ml.exceptions.JobException: Exception : 
 {
    "error": {
        "code": "UserError",
        "message": "Execution failed. User process 'Rank 1' exited with status code 1. Please check log file 'user_logs/std_log_process_1.txt' for error details. Error: Traceback (most recent call last):\n  File \"/mnt/azureml/cr/j/81b29a2e28f14fcc9a90a2d4e67dbc62/exe/wd/pretrain_glue.py\", line 27, in <module>\n    from transformers import (\n  File \"<frozen importlib._bootstrap>\", line 1075, in _handle_fromlist\n  File \"/opt/conda/envs/ptca/lib/python3.10/site-packages/transformers/utils/import_utils.py\", line 1272, in __getattr__\n    module = self._get_module(self._class_to_module[name])\n  File \"/opt/conda/envs/ptca/lib/python3.10/site-packages/transformers/utils/import_utils.py\", line 1284, in _get_module\n    raise RuntimeError(\nRuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):\nFailed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):\ncannot import name 'split_torch_state_dict_into_shards' from 'huggingface_hub' (/opt/conda/envs/ptca/lib/python3.10/site-packages/huggingface_hub/__init__.py)\n",
        "message_parameters": {},
        "details": []
    },
    "time": "0001-01-01T00:00:00.000Z",
    "component_name": "CommonRuntime"
}
def test_pytorch_2_2():
        """Tests a sample job using pytorch 2.0 as the environment."""
        this_dir = Path(__file__).parent
    
        subscription_id = os.environ.get("subscription_id")
        resource_group = os.environ.get("resource_group")
        workspace_name = os.environ.get("workspace")
    
        ml_client = MLClient(
            AzureCliCredential(), subscription_id, resource_group, workspace_name
        )
    
        env_name = "acpt-pytorch-2_2-cuda12_1"
    
        env_docker_context = Environment(
            build=BuildContext(path=this_dir / BUILD_CONTEXT),
            name=env_name,
            description="Pytorch 2.2 environment created from a Docker context.",
        )
        ml_client.environments.create_or_update(env_docker_context)
    
        # create the command
        job = command(
            code=this_dir / JOB_SOURCE_CODE,  # local path where the code is stored
            command="pip install -r requirements.txt && pip install multiprocess==0.70.15"
                    " && python pretrain_glue.py --tensorboard_log_dir \"/outputs/runs/\""
                    " --deepspeed ds_config.json --num_train_epochs 5 --output_dir outputs --disable_tqdm 1"
                    " --local_rank $RANK --evaluation_strategy \"epoch\" --logging_strategy \"epoch\""
                    " --per_device_train_batch_size 93 --gradient_accumulation_steps 1"
                    " --per_device_eval_batch_size 93 --learning_rate 3e-05 --adam_beta1 0.8 --adam_beta2 0.999"
                    " --weight_decay 3e-07 --warmup_steps 500 --fp16 --logging_steps 1000"
                    " --model_checkpoint \"bert-large-uncased\"",
            outputs={
                "output": Output(
                    type="uri_folder",
                    mode="rw_mount",
                    path="azureml://datastores/workspaceblobstore/paths/outputs"
                )
            },
            environment=f"{env_name}@latest",
            compute=os.environ.get("gpu_v100_cluster"),
            display_name="bert-pretrain-GLUE",
            description="Pretrain the BERT model on the GLUE dataset.",
            experiment_name="pytorch22_Cuda121_py310_Experiment",
            distribution=PyTorchDistribution(process_count_per_instance=1),
            resources=JobResourceConfiguration(instance_count=2, shm_size='3100m'),
        )
    
        returned_job = ml_client.create_or_update(job)
        assert returned_job is not None
    
        # Poll until final status is reached or timed out
        timeout = time.time() + (TIMEOUT_MINUTES * 60)
        while time.time() <= timeout:
            current_status = ml_client.jobs.get(returned_job.name).status
            if current_status in ["Completed", "Failed"]:
                break
            time.sleep(30)  # sleep 30 seconds
    
        bashCommand = "ls"
        process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
        output, error = process.communicate()
        print(output)
        print(error)
    
        if current_status == "Failed" or current_status == "Cancelled":
            ml_client.jobs.download(returned_job.name)
            if STD_LOG.exists():
                print(f"*** BEGIN {STD_LOG} ***")
                with open(STD_LOG, "r") as f:
                    print(f.read(), end="")
                print(f"*** END {STD_LOG} ***")
            else:
>               ml_client.jobs.stream(returned_job.name)

tests/pytorch2_2_sample_test.py:92: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
/usr/share/miniconda/envs/isolated_1723432738489/lib/python3.12/site-packages/azure/core/tracing/decorator.py:94: in wrapper_use_tracer
    return func(*args, **kwargs)
/usr/share/miniconda/envs/isolated_1723432738489/lib/python3.12/site-packages/azure/ai/ml/operations/_job_operations.py:617: in stream
    self._stream_logs_until_completion(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

run_operations = <azure.ai.ml.operations._run_operations.RunOperations object at 0x7fbe998e0350>
job_resource = <azure.ai.ml._restclient.v2022_10_01_preview.models._models_py3.JobBase object at 0x7fbe9981f1a0>
datastore_operations = <azure.ai.ml.operations._datastore_operations.DatastoreOperations object at 0x7fbe9d0c1820>
raise_exception_on_failed_job = True

    def stream_logs_until_completion(
        run_operations: RunOperations,
        job_resource: JobBaseData,
        datastore_operations: DatastoreOperations = None,
        raise_exception_on_failed_job=True,
        *,
        requests_pipeline: HttpPipeline
    ) -> None:
        """Stream the experiment run output to the specified file handle. By
        default the file handle points to stdout.
    
        :param run_operations: The run history operations class.
        :type run_operations: RunOperations
        :param job_resource: The job to stream
        :type job_resource: JobBaseData
        :param datastore_operations: Optional, the datastore operations class, used to get logs from datastore
        :type datastore_operations: Optional[DatastoreOperations]
        :param raise_exception_on_failed_job: Should this method fail if job fails
        :type raise_exception_on_failed_job: Boolean
        :return:
        :rtype: None
        """
        job_type = job_resource.properties.job_type
        job_name = job_resource.name
        studio_endpoint = job_resource.properties.services.get("Studio", None)
        studio_endpoint = studio_endpoint.endpoint if studio_endpoint else None
        file_handle = sys.stdout
        ds_properties = None
        prefix = None
        if (
            hasattr(job_resource.properties, "outputs")
            and job_resource.properties.job_type != RestJobType.AUTO_ML
            and datastore_operations
        ):
            # Get default output location
    
            default_output = (
                job_resource.properties.outputs.get("default", None) if job_resource.properties.outputs else None
            )
            is_uri_folder = default_output and default_output.job_output_type == DataType.URI_FOLDER
            if is_uri_folder:
                output_uri = default_output.uri
                # Parse the uri format
                output_uri = output_uri.split("datastores/")[1]
                datastore_name, prefix = output_uri.split("/", 1)
                ds_properties = get_datastore_info(datastore_operations, datastore_name)
    
        try:
            file_handle.write("RunId: {}\n".format(job_name))
            file_handle.write("Web View: {}\n".format(studio_endpoint))
    
            _current_details: RunDetails = run_operations.get_run_details(job_name)
    
            processed_logs = {}
    
            poll_start_time = time.time()
            pipeline_with_retries = create_requests_pipeline_with_retry(requests_pipeline=requests_pipeline)
            while (
                _current_details.status in RunHistoryConstants.IN_PROGRESS_STATUSES
                or _current_details.status == JobStatus.FINALIZING
            ):
                file_handle.flush()
                time.sleep(_wait_before_polling(time.time() - poll_start_time))
                _current_details: RunDetails = run_operations.get_run_details(job_name)  # TODO use FileWatcher
                if job_type.lower() in JobType.PIPELINE:
                    legacy_folder_name = "/logs/azureml/"
                else:
                    legacy_folder_name = "/azureml-logs/"
                _current_logs_dict = (
                    list_logs_in_datastore(
                        ds_properties,
                        prefix=prefix,
                        legacy_log_folder_name=legacy_folder_name,
                    )
                    if ds_properties is not None
                    else _current_details.log_files
                )
                # Get the list of new logs available after filtering out the processed ones
                available_logs = _get_sorted_filtered_logs(_current_logs_dict, job_type, processed_logs)
                content = ""
                for current_log in available_logs:
                    content = download_text_from_url(
                        _current_logs_dict[current_log],
                        pipeline_with_retries,
                        timeout=RunHistoryConstants._DEFAULT_GET_CONTENT_TIMEOUT,
                    )
    
                    _incremental_print(content, processed_logs, current_log, file_handle)
    
                # TODO: Temporary solution to wait for all the logs to be printed in the finalizing state.
                if (
                    _current_details.status not in RunHistoryConstants.IN_PROGRESS_STATUSES
                    and _current_details.status == JobStatus.FINALIZING
                    and "The activity completed successfully. Finalizing run..." in content
                ):
                    break
    
            file_handle.write("\n")
            file_handle.write("Execution Summary\n")
            file_handle.write("=================\n")
            file_handle.write("RunId: {}\n".format(job_name))
            file_handle.write("Web View: {}\n".format(studio_endpoint))
    
            warnings = _current_details.warnings
            if warnings:
                messages = [x.message for x in warnings if x.message]
                if len(messages) > 0:
                    file_handle.write("\nWarnings:\n")
                    for message in messages:
                        file_handle.write(message + "\n")
                    file_handle.write("\n")
    
            if _current_details.status == JobStatus.FAILED:
                error = (
                    _current_details.error.as_dict()
                    if _current_details.error
                    else "Detailed error not set on the Run. Please check the logs for details."
                )
            # If we are raising the error later on, don't double print it here.
                if not raise_exception_on_failed_job:
                    file_handle.write("\nError:\n")
                    file_handle.write(json.dumps(error, indent=4))
                    file_handle.write("\n")
                else:
>                   raise JobException(
                        message="Exception : \n {} ".format(json.dumps(error, indent=4)),
                        target=ErrorTarget.JOB,
                        no_personal_data_message="Exception raised on failed job.",
                        error_category=ErrorCategory.SYSTEM_ERROR,
                    )
E                   azure.ai.ml.exceptions.JobException: Exception : 
E                    {
E                       "error": {
E                           "code": "UserError",
E                           "message": "Execution failed. User process 'Rank 1' exited with status code 1. Please check log file 'user_logs/std_log_process_1.txt' for error details. Error: Traceback (most recent call last):\n  File \"/mnt/azureml/cr/j/81b29a2e28f14fcc9a90a2d4e67dbc62/exe/wd/pretrain_glue.py\", line 27, in <module>\n    from transformers import (\n  File \"<frozen importlib._bootstrap>\", line 1075, in _handle_fromlist\n  File \"/opt/conda/envs/ptca/lib/python3.10/site-packages/transformers/utils/import_utils.py\", line 1272, in __getattr__\n    module = self._get_module(self._class_to_module[name])\n  File \"/opt/conda/envs/ptca/lib/python3.10/site-packages/transformers/utils/import_utils.py\", line 1284, in _get_module\n    raise RuntimeError(\nRuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):\nFailed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):\ncannot import name 'split_torch_state_dict_into_shards' from 'huggingface_hub' (/opt/conda/envs/ptca/lib/python3.10/site-packages/huggingface_hub/__init__.py)\n",
E                           "message_parameters": {},
E                           "details": []
E                       },
E                       "time": "0001-01-01T00:00:00.000Z",
E                       "component_name": "CommonRuntime"
E                   }

/usr/share/miniconda/envs/isolated_1723432738489/lib/python3.12/site-packages/azure/ai/ml/operations/_job_ops_helper.py:297: JobException
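
This run and the acpt-pytorch-1.13 run below fail for the same reason: the transformers build in the image expects split_torch_state_dict_into_shards from huggingface_hub, which reportedly first shipped in huggingface_hub 0.23.0, while the image carries an older release. A hedged fix sketch that pins a compatible huggingface_hub in the job's inline pip step; the exact version floor is an assumption to verify against the image's transformers version:

    command = (
        "pip install -r requirements.txt 'huggingface_hub>=0.23.0'"
        " && pip install multiprocess==0.70.15"
        " && python pretrain_glue.py --tensorboard_log_dir \"/outputs/runs/\""
        # ... remaining arguments unchanged from the test above ...
    )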

Check warning on line 0 in environment/acpt-pytorch-1.13-cuda11.7.tests.pytorch1_11_sample_test

test_pytorch_1_13 (environment/acpt-pytorch-1.13-cuda11.7.tests.pytorch1_11_sample_test) failed

pytest-reports/environment/acpt-pytorch-1.13-cuda11.7.xml [took 43m 56s]
Raw output
azure.ai.ml.exceptions.JobException: Exception : 
 {
    "error": {
        "code": "UserError",
        "message": "Execution failed. User process 'Rank 1' exited with status code 1. Please check log file 'user_logs/std_log_process_1.txt' for error details. Error: Traceback (most recent call last):\n  File \"pretrain_glue.py\", line 27, in <module>\n    from transformers import (\n  File \"<frozen importlib._bootstrap>\", line 1039, in _handle_fromlist\n  File \"/opt/conda/envs/ptca/lib/python3.8/site-packages/transformers/utils/import_utils.py\", line 1272, in __getattr__\n    module = self._get_module(self._class_to_module[name])\n  File \"/opt/conda/envs/ptca/lib/python3.8/site-packages/transformers/utils/import_utils.py\", line 1284, in _get_module\n    raise RuntimeError(\nRuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):\nFailed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):\ncannot import name 'split_torch_state_dict_into_shards' from 'huggingface_hub' (/opt/conda/envs/ptca/lib/python3.8/site-packages/huggingface_hub/__init__.py)\n",
        "message_parameters": {},
        "details": []
    },
    "time": "0001-01-01T00:00:00.000Z",
    "component_name": "CommonRuntime"
}
def test_pytorch_1_13():
        """Tests a sample job using pytorch 1.13 as the environment."""
        this_dir = Path(__file__).parent
    
        subscription_id = os.environ.get("subscription_id")
        resource_group = os.environ.get("resource_group")
        workspace_name = os.environ.get("workspace")
    
        ml_client = MLClient(
            AzureCliCredential(), subscription_id, resource_group, workspace_name
        )
    
        env_name = "acpt-pytorch-1_13-cuda11_7"
    
        env_docker_context = Environment(
            build=BuildContext(path=this_dir / BUILD_CONTEXT),
            name=env_name,
            description="Pytorch 1.13 environment created from a Docker context.",
        )
        ml_client.environments.create_or_update(env_docker_context)
    
        # create the command
        job = command(
            code=this_dir / JOB_SOURCE_CODE,  # local path where the code is stored
            command="pip install -r requirements.txt"
                    " && python pretrain_glue.py --tensorboard_log_dir \"/outputs/runs/\""
                    " --deepspeed ds_config.json --num_train_epochs 5 --output_dir outputs --disable_tqdm 1"
                    " --local_rank $RANK --evaluation_strategy \"epoch\" --logging_strategy \"epoch\""
                    " --per_device_train_batch_size 93 --gradient_accumulation_steps 1"
                    " --per_device_eval_batch_size 93 --learning_rate 3e-05 --adam_beta1 0.8 --adam_beta2 0.999"
                    " --weight_decay 3e-07 --warmup_steps 500 --fp16 --logging_steps 1000"
                    " --model_checkpoint \"bert-large-uncased\"",
            outputs={
                "output": Output(
                    type="uri_folder",
                    mode="rw_mount",
                    path="azureml://datastores/workspaceblobstore/paths/outputs"
                )
            },
            environment=f"{env_name}@latest",
            compute=os.environ.get("gpu_v100_cluster"),
            display_name="bert-pretrain-GLUE",
            description="Pretrain the BERT model on the GLUE dataset.",
            experiment_name="pytorch113_Cuda117_Experiment",
            distribution=PyTorchDistribution(process_count_per_instance=1),
            resources=JobResourceConfiguration(instance_count=2, shm_size='3100m'),
        )
    
        returned_job = ml_client.create_or_update(job)
        assert returned_job is not None
    
        # Poll until final status is reached or timed out
        timeout = time.time() + (TIMEOUT_MINUTES * 60)
        while time.time() <= timeout:
            current_status = ml_client.jobs.get(returned_job.name).status
            if current_status in ["Completed", "Failed"]:
                break
            time.sleep(30)  # sleep 30 seconds
    
        bashCommand = "ls"
        process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
        output, error = process.communicate()
        print(output)
        print(error)
    
        if current_status == "Failed":
            ml_client.jobs.download(returned_job.name)
            if STD_LOG.exists():
                print(f"*** BEGIN {STD_LOG} ***")
                with open(STD_LOG, "r") as f:
                    print(f.read(), end="")
                print(f"*** END {STD_LOG} ***")
            else:
>               ml_client.jobs.stream(returned_job.name)

tests/pytorch1_11_sample_test.py:92: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
/usr/share/miniconda/envs/isolated_1723432714935/lib/python3.12/site-packages/azure/core/tracing/decorator.py:94: in wrapper_use_tracer
    return func(*args, **kwargs)
/usr/share/miniconda/envs/isolated_1723432714935/lib/python3.12/site-packages/azure/ai/ml/operations/_job_operations.py:617: in stream
    self._stream_logs_until_completion(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

run_operations = <azure.ai.ml.operations._run_operations.RunOperations object at 0x7f6ad1006ea0>
job_resource = <azure.ai.ml._restclient.v2022_10_01_preview.models._models_py3.JobBase object at 0x7f6ad3063dd0>
datastore_operations = <azure.ai.ml.operations._datastore_operations.DatastoreOperations object at 0x7f6ad47c56d0>
raise_exception_on_failed_job = True

    def stream_logs_until_completion(
        run_operations: RunOperations,
        job_resource: JobBaseData,
        datastore_operations: DatastoreOperations = None,
        raise_exception_on_failed_job=True,
        *,
        requests_pipeline: HttpPipeline
    ) -> None:
        """Stream the experiment run output to the specified file handle. By
        default the file handle points to stdout.
    
        :param run_operations: The run history operations class.
        :type run_operations: RunOperations
        :param job_resource: The job to stream
        :type job_resource: JobBaseData
        :param datastore_operations: Optional, the datastore operations class, used to get logs from datastore
        :type datastore_operations: Optional[DatastoreOperations]
        :param raise_exception_on_failed_job: Should this method fail if job fails
        :type raise_exception_on_failed_job: Boolean
        :return:
        :rtype: None
        """
        job_type = job_resource.properties.job_type
        job_name = job_resource.name
        studio_endpoint = job_resource.properties.services.get("Studio", None)
        studio_endpoint = studio_endpoint.endpoint if studio_endpoint else None
        file_handle = sys.stdout
        ds_properties = None
        prefix = None
        if (
            hasattr(job_resource.properties, "outputs")
            and job_resource.properties.job_type != RestJobType.AUTO_ML
            and datastore_operations
        ):
            # Get default output location
    
            default_output = (
                job_resource.properties.outputs.get("default", None) if job_resource.properties.outputs else None
            )
            is_uri_folder = default_output and default_output.job_output_type == DataType.URI_FOLDER
            if is_uri_folder:
                output_uri = default_output.uri
                # Parse the uri format
                output_uri = output_uri.split("datastores/")[1]
                datastore_name, prefix = output_uri.split("/", 1)
                ds_properties = get_datastore_info(datastore_operations, datastore_name)
    
        try:
            file_handle.write("RunId: {}\n".format(job_name))
            file_handle.write("Web View: {}\n".format(studio_endpoint))
    
            _current_details: RunDetails = run_operations.get_run_details(job_name)
    
            processed_logs = {}
    
            poll_start_time = time.time()
            pipeline_with_retries = create_requests_pipeline_with_retry(requests_pipeline=requests_pipeline)
            while (
                _current_details.status in RunHistoryConstants.IN_PROGRESS_STATUSES
                or _current_details.status == JobStatus.FINALIZING
            ):
                file_handle.flush()
                time.sleep(_wait_before_polling(time.time() - poll_start_time))
                _current_details: RunDetails = run_operations.get_run_details(job_name)  # TODO use FileWatcher
                if job_type.lower() in JobType.PIPELINE:
                    legacy_folder_name = "/logs/azureml/"
                else:
                    legacy_folder_name = "/azureml-logs/"
                _current_logs_dict = (
                    list_logs_in_datastore(
                        ds_properties,
                        prefix=prefix,
                        legacy_log_folder_name=legacy_folder_name,
                    )
                    if ds_properties is not None
                    else _current_details.log_files
                )
                # Get the list of new logs available after filtering out the processed ones
                available_logs = _get_sorted_filtered_logs(_current_logs_dict, job_type, processed_logs)
                content = ""
                for current_log in available_logs:
                    content = download_text_from_url(
                        _current_logs_dict[current_log],
                        pipeline_with_retries,
                        timeout=RunHistoryConstants._DEFAULT_GET_CONTENT_TIMEOUT,
                    )
    
                    _incremental_print(content, processed_logs, current_log, file_handle)
    
                # TODO: Temporary solution to wait for all the logs to be printed in the finalizing state.
                if (
                    _current_details.status not in RunHistoryConstants.IN_PROGRESS_STATUSES
                    and _current_details.status == JobStatus.FINALIZING
                    and "The activity completed successfully. Finalizing run..." in content
                ):
                    break
    
            file_handle.write("\n")
            file_handle.write("Execution Summary\n")
            file_handle.write("=================\n")
            file_handle.write("RunId: {}\n".format(job_name))
            file_handle.write("Web View: {}\n".format(studio_endpoint))
    
            warnings = _current_details.warnings
            if warnings:
                messages = [x.message for x in warnings if x.message]
                if len(messages) > 0:
                    file_handle.write("\nWarnings:\n")
                    for message in messages:
                        file_handle.write(message + "\n")
                    file_handle.write("\n")
    
            if _current_details.status == JobStatus.FAILED:
                error = (
                    _current_details.error.as_dict()
                    if _current_details.error
                    else "Detailed error not set on the Run. Please check the logs for details."
                )
            # If we are raising the error later on, don't double print it here.
                if not raise_exception_on_failed_job:
                    file_handle.write("\nError:\n")
                    file_handle.write(json.dumps(error, indent=4))
                    file_handle.write("\n")
                else:
>                   raise JobException(
                        message="Exception : \n {} ".format(json.dumps(error, indent=4)),
                        target=ErrorTarget.JOB,
                        no_personal_data_message="Exception raised on failed job.",
                        error_category=ErrorCategory.SYSTEM_ERROR,
                    )
E                   azure.ai.ml.exceptions.JobException: Exception : 
E                    {
E                       "error": {
E                           "code": "UserError",
E                           "message": "Execution failed. User process 'Rank 1' exited with status code 1. Please check log file 'user_logs/std_log_process_1.txt' for error details. Error: Traceback (most recent call last):\n  File \"pretrain_glue.py\", line 27, in <module>\n    from transformers import (\n  File \"<frozen importlib._bootstrap>\", line 1039, in _handle_fromlist\n  File \"/opt/conda/envs/ptca/lib/python3.8/site-packages/transformers/utils/import_utils.py\", line 1272, in __getattr__\n    module = self._get_module(self._class_to_module[name])\n  File \"/opt/conda/envs/ptca/lib/python3.8/site-packages/transformers/utils/import_utils.py\", line 1284, in _get_module\n    raise RuntimeError(\nRuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):\nFailed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):\ncannot import name 'split_torch_state_dict_into_shards' from 'huggingface_hub' (/opt/conda/envs/ptca/lib/python3.8/site-packages/huggingface_hub/__init__.py)\n",
E                           "message_parameters": {},
E                           "details": []
E                       },
E                       "time": "0001-01-01T00:00:00.000Z",
E                       "component_name": "CommonRuntime"
E                   }

/usr/share/miniconda/envs/isolated_1723432714935/lib/python3.12/site-packages/azure/ai/ml/operations/_job_ops_helper.py:297: JobException
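
The root cause of this first failure is version skew between transformers and huggingface_hub inside the image: split_torch_state_dict_into_shards appears to have been introduced around huggingface_hub 0.23, so the transformers build in the environment cannot import it from the older hub package. Below is a minimal sketch of a fail-fast guard the job script could run before importing transformers; the 0.23.0 floor and the availability of the packaging module are assumptions, not verified against this image.

# A minimal sketch, assuming huggingface_hub 0.23.0 is the first release that
# ships split_torch_state_dict_into_shards and that `packaging` is installed.
# Running this before `import transformers` surfaces a clear message instead
# of the deep lazy-import RuntimeError seen in the log above.
import importlib.metadata

from packaging.version import Version

MIN_HUB_VERSION = Version("0.23.0")  # assumed floor for the missing helper


def check_huggingface_hub_version() -> None:
    """Raise early if huggingface_hub predates the helper transformers needs."""
    installed = Version(importlib.metadata.version("huggingface_hub"))
    if installed < MIN_HUB_VERSION:
        raise RuntimeError(
            f"huggingface_hub {installed} predates split_torch_state_dict_into_shards; "
            f"pin 'huggingface_hub>={MIN_HUB_VERSION}' in requirements.txt."
        )


check_huggingface_hub_version()

Pinning the floor in the image's requirements.txt rather than at run time would fix the same skew without touching the pretrain_glue.py entry point.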

test_tensorflow_2_11 (environment/tensorflow-2.11-cuda11.tests.tensorflow2_11_sample_test) failed

pytest-reports/environment/tensorflow-2.11-cuda11.xml [took 11m 26s]
Raw output
azure.ai.ml._ml_exceptions.JobException: Exception : 
 {
    "error": {
        "code": "UserError",
        "message": "Image build failed. For more details, check log file azureml-logs/20_image_build_log.txt.",
        "message_format": "Image build failed. For more details, check log file {ArtifactPath}.",
        "message_parameters": {
            "ArtifactPath": "azureml-logs/20_image_build_log.txt"
        },
        "details": [],
        "inner_error": {
            "code": "BadArgument",
            "inner_error": {
                "code": "ImageBuildFailure"
            }
        }
    },
    "correlation": {
        "operation": "220363807c5cd69f3ae6e5e8def60fe9",
        "request": "e29f83e22989c990"
    },
    "environment": "eastus",
    "location": "eastus",
    "time": "2024-08-12T03:30:10.050746Z",
    "component_name": "RunHistory"
}
def test_tensorflow_2_11():
        """Tests a sample job using tensorflow 2.11 as the environment."""
        this_dir = Path(__file__).parent
    
        subscription_id = os.environ.get("subscription_id")
        resource_group = os.environ.get("resource_group")
        workspace_name = os.environ.get("workspace")
    
        ml_client = MLClient(
            AzureCliCredential(), subscription_id, resource_group, workspace_name
        )
    
        env_name = "tensorflow-2_11-cuda11"
    
        env_docker_context = Environment(
            build=BuildContext(path=this_dir / BUILD_CONTEXT),
            name=env_name,
            description="Tensorflow 2.11 environment created from a Docker context.",
        )
        ml_client.environments.create_or_update(env_docker_context)
    
        # create the command
        job = command(
            code=this_dir / JOB_SOURCE_CODE,  # local path where the code is stored
            command="python main.py",
            environment=f"{env_name}@latest",
            compute=os.environ.get("gpu_v100_cluster"),
            display_name="tensorflow-mnist-example",
            description="A test run of the tensorflow 2_11 curated environment",
            experiment_name="tensorflow211Experiment"
        )
    
        returned_job = ml_client.create_or_update(job)
        assert returned_job is not None
    
        # Poll until final status is reached or timed out
        timeout = time.time() + (TIMEOUT_MINUTES * 60)
        while time.time() <= timeout:
            job = ml_client.jobs.get(returned_job.name)
            status = job.status
            if status in [JobStatus.COMPLETED, JobStatus.FAILED]:
                break
            time.sleep(30)  # sleep 30 seconds
        else:
            # Timeout
            ml_client.jobs.cancel(returned_job.name)
            raise Exception(f"Test aborted because the job took longer than {TIMEOUT_MINUTES} minutes. "
                            f"Last status was {status}.")
    
        if status == JobStatus.FAILED:
            ml_client.jobs.download(returned_job.name)
            if STD_LOG.exists():
                print(f"*** BEGIN {STD_LOG} ***")
                with open(STD_LOG, "r") as f:
                    print(f.read(), end="")
                print(f"*** END {STD_LOG} ***")
            else:
>               ml_client.jobs.stream(returned_job.name)

tests/tensorflow2_11_sample_test.py:76: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
/usr/share/miniconda/envs/isolated_1723432733547/lib/python3.12/site-packages/azure/ai/ml/_telemetry/activity.py:169: in wrapper
    return f(*args, **kwargs)
/usr/share/miniconda/envs/isolated_1723432733547/lib/python3.12/site-packages/azure/ai/ml/operations/_job_operations.py:490: in stream
    self._stream_logs_until_completion(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

run_operations = <azure.ai.ml.operations._run_operations.RunOperations object at 0x7f2272a32c00>
job_resource = <azure.ai.ml._restclient.v2022_02_01_preview.models._models_py3.JobBaseData object at 0x7f2272a8dee0>
datastore_operations = <azure.ai.ml.operations._datastore_operations.DatastoreOperations object at 0x7f2273bbd6a0>
raise_exception_on_failed_job = True

    def stream_logs_until_completion(
        run_operations: RunOperations,
        job_resource: JobBaseData,
        datastore_operations: DatastoreOperations = None,
        raise_exception_on_failed_job=True,
    ) -> None:
        """Stream the experiment run output to the specified file handle.
        By default the file handle points to stdout.
        :param run_operations: The run history operations class.
        :type run_operations: RunOperations
        :param job_resource: The job to stream
        :type job_resource: JobBaseData
        :param datastore_operations: Optional, the datastore operations class, used to get logs from datastore
        :type datastore_operations: Optional[DatastoreOperations]
        :param raise_exception_on_failed_job: Should this method fail if job fails
        :type raise_exception_on_failed_job: Boolean
        :return:
        :rtype: None
        """
        job_type = job_resource.properties.job_type
        job_name = job_resource.name
        studio_endpoint = job_resource.properties.services.get("Studio", None)
        studio_endpoint = studio_endpoint.endpoint if studio_endpoint else None
        file_handle = sys.stdout
        ds_properties = None
        prefix = None
        if (
            hasattr(job_resource.properties, "outputs")
            and job_resource.properties.job_type != RestJobType.AUTO_ML
            and datastore_operations
        ):
            # Get default output location
    
            default_output = (
                job_resource.properties.outputs.get("default", None) if job_resource.properties.outputs else None
            )
            is_uri_folder = default_output and default_output.job_output_type == DataType.URI_FOLDER
            if is_uri_folder:
                output_uri = default_output.uri
                # Parse the uri format
                output_uri = output_uri.split("datastores/")[1]
                datastore_name, prefix = output_uri.split("/", 1)
                ds_properties = get_datastore_info(datastore_operations, datastore_name)
    
        try:
            file_handle.write("RunId: {}\n".format(job_name))
            file_handle.write("Web View: {}\n".format(studio_endpoint))
    
            _current_details: RunDetails = run_operations.get_run_details(job_name)
            session = create_session_with_retry()
    
            processed_logs = {}
    
            poll_start_time = time.time()
            while (
                _current_details.status in RunHistoryConstants.IN_PROGRESS_STATUSES
                or _current_details.status == JobStatus.FINALIZING
            ):
                file_handle.flush()
                time.sleep(_wait_before_polling(time.time() - poll_start_time))
                _current_details: RunDetails = run_operations.get_run_details(job_name)  # TODO use FileWatcher
                if job_type.lower() in JobType.PIPELINE:
                    legacy_folder_name = "/logs/azureml/"
                else:
                    legacy_folder_name = "/azureml-logs/"
                _current_logs_dict = (
                    list_logs_in_datastore(ds_properties, prefix=prefix, legacy_log_folder_name=legacy_folder_name)
                    if ds_properties is not None
                    else _current_details.log_files
                )
                # Get the list of new logs available after filtering out the processed ones
                available_logs = _get_sorted_filtered_logs(_current_logs_dict, job_type, processed_logs)
                content = ""
                for current_log in available_logs:
                    content = download_text_from_url(
                        _current_logs_dict[current_log],
                        session,
                        timeout=RunHistoryConstants._DEFAULT_GET_CONTENT_TIMEOUT,
                    )
    
                    _incremental_print(content, processed_logs, current_log, file_handle)
    
                # TODO: Temporary solution to wait for all the logs to be printed in the finalizing state.
                if (
                    _current_details.status not in RunHistoryConstants.IN_PROGRESS_STATUSES
                    and _current_details.status == JobStatus.FINALIZING
                    and "The activity completed successfully. Finalizing run..." in content
                ):
                    break
    
            file_handle.write("\n")
            file_handle.write("Execution Summary\n")
            file_handle.write("=================\n")
            file_handle.write("RunId: {}\n".format(job_name))
            file_handle.write("Web View: {}\n".format(studio_endpoint))
    
            warnings = _current_details.warnings
            if warnings:
                messages = [x.message for x in warnings if x.message]
                if len(messages) > 0:
                    file_handle.write("\nWarnings:\n")
                    for message in messages:
                        file_handle.write(message + "\n")
                    file_handle.write("\n")
    
            if _current_details.status == JobStatus.FAILED:
                error = (
                    _current_details.error.as_dict()
                    if _current_details.error
                    else "Detailed error not set on the Run. Please check the logs for details."
                )
            # If we are raising the error later on, don't double print it here.
                if not raise_exception_on_failed_job:
                    file_handle.write("\nError:\n")
                    file_handle.write(json.dumps(error, indent=4))
                    file_handle.write("\n")
                else:
>                   raise JobException(
                        message="Exception : \n {} ".format(json.dumps(error, indent=4)),
                        target=ErrorTarget.JOB,
                        no_personal_data_message="Exception raised on failed job.",
                    )
E                   azure.ai.ml._ml_exceptions.JobException: Exception : 
E                    {
E                       "error": {
E                           "code": "UserError",
E                           "message": "Image build failed. For more details, check log file azureml-logs/20_image_build_log.txt.",
E                           "message_format": "Image build failed. For more details, check log file {ArtifactPath}.",
E                           "message_parameters": {
E                               "ArtifactPath": "azureml-logs/20_image_build_log.txt"
E                           },
E                           "details": [],
E                           "inner_error": {
E                               "code": "BadArgument",
E                               "inner_error": {
E                                   "code": "ImageBuildFailure"
E                               }
E                           }
E                       },
E                       "correlation": {
E                           "operation": "220363807c5cd69f3ae6e5e8def60fe9",
E                           "request": "e29f83e22989c990"
E                       },
E                       "environment": "eastus",
E                       "location": "eastus",
E                       "time": "2024-08-12T03:30:10.050746Z",
E                       "component_name": "RunHistory"
E                   }

/usr/share/miniconda/envs/isolated_1723432733547/lib/python3.12/site-packages/azure/ai/ml/operations/_job_ops_helper.py:285: JobException
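
Because this failure is an ImageBuildFailure, the useful detail lives in the azureml-logs/20_image_build_log.txt artifact named in the error payload, not in the user_logs std_log the test's STD_LOG branch prints, so that branch never surfaces it. Below is a minimal sketch of dumping the build log after ml_client.jobs.download(returned_job.name) has run; searching from the download root is an assumption, since the local artifact layout varies by SDK version.

# A minimal sketch, assuming the job artifacts were already fetched with
# ml_client.jobs.download(returned_job.name). The local directory layout is
# not guaranteed, so search for the build log rather than hard-coding a path.
from pathlib import Path

BUILD_LOG_NAME = "20_image_build_log.txt"  # artifact named in the error payload


def print_image_build_log(download_root: str = ".") -> None:
    """Locate and print the Docker image build log after an ImageBuildFailure."""
    matches = sorted(Path(download_root).rglob(BUILD_LOG_NAME))
    if not matches:
        print(f"{BUILD_LOG_NAME} not found under {download_root!r}")
        return
    for log_path in matches:
        print(f"*** BEGIN {log_path} ***")
        print(log_path.read_text(), end="")
        print(f"*** END {log_path} ***")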

test_pytorch_2_2 (environment/acpt-pytorch-2.2-cuda12.1-profiler.tests.pytorch2_2_sample_test) failed

pytest-reports/environment/acpt-pytorch-2.2-cuda12.1-profiler.xml [took 1h 0m 38s]
Raw output
AssertionError: assert 'Running' == 'Completed'
  - Completed
  + Running
def test_pytorch_2_2():
        """Tests a sample job using pytorch 2.0 as the environment."""
        this_dir = Path(__file__).parent
    
        subscription_id = os.environ.get("subscription_id")
        resource_group = os.environ.get("resource_group")
        workspace_name = os.environ.get("workspace")
    
        ml_client = MLClient(
            AzureCliCredential(), subscription_id, resource_group, workspace_name
        )
    
        env_name = "acpt-pytorch-2_2-cuda12_1-profiler"
    
        env_docker_context = Environment(
            build=BuildContext(path=this_dir / BUILD_CONTEXT),
            name=env_name,
            description="Pytorch 2.2 environment created from a Docker context.",
        )
        ml_client.environments.create_or_update(env_docker_context)
    
        # create the command
        job = command(
            code=this_dir / JOB_SOURCE_CODE,  # local path where the code is stored
            command="pip install -r requirements.txt && pip install multiprocess==0.70.15"
                    " && python pretrain_glue.py --tensorboard_log_dir \"/outputs/runs/\""
                    " --deepspeed ds_config.json --num_train_epochs 5 --output_dir outputs --disable_tqdm 1"
                    " --local_rank $RANK --evaluation_strategy \"epoch\" --logging_strategy \"epoch\""
                    " --per_device_train_batch_size 93 --gradient_accumulation_steps 1"
                    " --per_device_eval_batch_size 93 --learning_rate 3e-05 --adam_beta1 0.8 --adam_beta2 0.999"
                    " --weight_decay 3e-07 --warmup_steps 500 --fp16 --logging_steps 1000"
                    " --model_checkpoint \"bert-large-uncased\"",
            outputs={
                "output": Output(
                    type="uri_folder",
                    mode="rw_mount",
                    path="azureml://datastores/workspaceblobstore/paths/outputs"
                )
            },
            environment=f"{env_name}@latest",
            compute=os.environ.get("gpu_v100_cluster"),
            display_name="bert-pretrain-GLUE",
            description="Pretrain the BERT model on the GLUE dataset.",
            experiment_name="pytorch22_Cuda121_py310_profiler_Experiment",
            distribution=PyTorchDistribution(process_count_per_instance=1),
            resources=JobResourceConfiguration(instance_count=2, shm_size='3100m'),
        )
    
        returned_job = ml_client.create_or_update(job)
        assert returned_job is not None
    
        # Poll until final status is reached or timed out
        timeout = time.time() + (TIMEOUT_MINUTES * 60)
        while time.time() <= timeout:
            current_status = ml_client.jobs.get(returned_job.name).status
            if current_status in ["Completed", "Failed"]:
                break
            time.sleep(30)  # sleep 30 seconds
    
        bashCommand = "ls"
        process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
        output, error = process.communicate()
        print(output)
        print(error)
    
        if current_status == "Failed" or current_status == "Cancelled":
            ml_client.jobs.download(returned_job.name)
            if STD_LOG.exists():
                print(f"*** BEGIN {STD_LOG} ***")
                with open(STD_LOG, "r") as f:
                    print(f.read(), end="")
                print(f"*** END {STD_LOG} ***")
            else:
                ml_client.jobs.stream(returned_job.name)
    
>       assert current_status == "Completed"
E       AssertionError: assert 'Running' == 'Completed'
E         - Completed
E         + Running

tests/pytorch2_2_sample_test.py:94: AssertionError
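
The 'Running' vs 'Completed' mismatch means the poll loop exhausted TIMEOUT_MINUTES while the job was still in progress: this test, unlike the tensorflow test above, neither cancels the job nor distinguishes a timeout from a failure, so the final assert fires against a non-terminal status and leaves the job running. Below is a minimal sketch of the same while/else guard the tensorflow test already uses, adapted to this test's string statuses; names such as ml_client, returned_job, and TIMEOUT_MINUTES are assumed to be in scope.

# A minimal sketch of the timeout guard: cancel the job and fail loudly
# instead of falling through to the status assertion while still Running.
import time

timeout = time.time() + (TIMEOUT_MINUTES * 60)
while time.time() <= timeout:
    current_status = ml_client.jobs.get(returned_job.name).status
    if current_status in ["Completed", "Failed"]:
        break
    time.sleep(30)  # sleep 30 seconds
else:
    # The while condition went false without a break, so no terminal
    # status was ever observed: cancel the job and abort the test.
    ml_client.jobs.cancel(returned_job.name)
    raise Exception(
        f"Test aborted because the job took longer than {TIMEOUT_MINUTES} minutes. "
        f"Last status was {current_status}."
    )

With this guard in place, the trailing assert current_status == "Completed" can only ever see a terminal status, and a timed-out run reports itself as a timeout rather than as the misleading 'Running' == 'Completed' comparison above.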