diff --git a/.github/workflows/benchmark_nightly.yml b/.github/workflows/benchmark_nightly.yml index b2da7cbb5b..df86056cd2 100644 --- a/.github/workflows/benchmark_nightly.yml +++ b/.github/workflows/benchmark_nightly.yml @@ -2,15 +2,18 @@ name: Benchmark torchserve nightly on: # run every day at 2:15am - schedule: - - cron: '15 02 * * *' + # schedule: + # - cron: '15 02 * * *' + push: + branches: + - "ci_logs" jobs: nightly: strategy: fail-fast: false matrix: - hardware: [cpu, gpu, inf2] + hardware: [gpu] #[cpu, gpu, inf2] runs-on: - self-hosted - ${{ matrix.hardware }} diff --git a/.github/workflows/benchmark_torch_compile_nightly.yml b/.github/workflows/benchmark_torch_compile_nightly.yml index 310a7a5d00..f11152994c 100644 --- a/.github/workflows/benchmark_torch_compile_nightly.yml +++ b/.github/workflows/benchmark_torch_compile_nightly.yml @@ -3,7 +3,10 @@ name: Benchmark torch.compile models nightly on: # run every day at 9:15pm schedule: - - cron: '15 21 * * *' + - cron: '15 02 * * *' + # push: + # branches: + # - "ci_logs" jobs: nightly: diff --git a/benchmarks/auto_benchmark.py b/benchmarks/auto_benchmark.py index c0d132dd2a..54aa935a08 100644 --- a/benchmarks/auto_benchmark.py +++ b/benchmarks/auto_benchmark.py @@ -11,6 +11,7 @@ MODEL_JSON_CONFIG_PATH = CWD + "/model_json_config" BENCHMARK_TMP_PATH = "/tmp/benchmark" BENCHMARK_REPORT_PATH = "/tmp/ts_benchmark" +BENCHMARK_REPORT_PATH_TEST = "/tmp/ts_benchmark/fail" TS_LOGS_PATH = CWD + "/logs" MODEL_STORE = "/tmp/model_store" WF_STORE = "/tmp/wf_store" @@ -136,8 +137,8 @@ def install_torchserve(skip_ts_install, hw, ts_version, nightly): return # git checkout branch if it is needed - cmd = "git checkout master && git reset --hard && git clean -dffx . && git pull --rebase" - execute(cmd, wait=True) + # cmd = "git checkout master && git reset --hard && git clean -dffx . && git pull --rebase" + # execute(cmd, wait=True) print("successfully reset git") ts_install_cmd = None @@ -159,6 +160,7 @@ def install_torchserve(skip_ts_install, hw, ts_version, nightly): if nightly: cmd += " --nightly_torch" execute(cmd, wait=True) + print("successfully install install_dependencies.py") # install torchserve @@ -212,10 +214,23 @@ def run_benchmark(bm_config): # generate stats metrics from ab_report.csv bm_model = model_json_config[0 : -len(".json")] - gen_metrics_json.gen_metric( - "{}/ab_report.csv".format(BENCHMARK_TMP_PATH), - "{}/logs/stats_metrics.json".format(BENCHMARK_TMP_PATH), - ) + try: + gen_metrics_json.gen_metric( + "{}/ab_report.csv".format(BENCHMARK_TMP_PATH), + "{}/logs/stats_metrics.json".format(BENCHMARK_TMP_PATH), + ) + except Exception as e: + bm_model_log_path = "{}/{}".format(BENCHMARK_REPORT_PATH_TEST, bm_model) + os.makedirs(bm_model_log_path, exist_ok=True) + + cmd = "tar -cvzf {}/logs.tar.gz {}".format( + bm_model_log_path, TS_LOGS_PATH + ) + execute(cmd, wait=True) + + print(f"An error occurred: {e}") + if "report_cmd" in bm_config: + execute(bm_config["report_cmd"], wait=True) # load stats metrics to remote metrics storage if "metrics_cmd" in bm_config: diff --git a/benchmarks/benchmark_config_gpu.yaml b/benchmarks/benchmark_config_gpu.yaml index 8d9bc8e39c..8d02417b42 100644 --- a/benchmarks/benchmark_config_gpu.yaml +++ b/benchmarks/benchmark_config_gpu.yaml @@ -3,18 +3,19 @@ # - nightly: "2022.3.16" # - release: "0.5.3" # Nightly build will be installed if "ts_version" is not specifiged -# ts_version: -# branch: &ts_version "master" +ts_version: + branch: &ts_version "ci_logs" + # a list of model configure yaml files defined in benchmarks/models_config # or a list of model configure yaml files with full path models: - "bert_multi_gpu.yaml" - "bert_multi_gpu_better_transformer.yaml" - - "bert_multi_gpu_no_better_transformer.yaml" - - "fastrcnn.yaml" - - "mnist.yaml" - - "vgg16.yaml" + # - "bert_multi_gpu_no_better_transformer.yaml" + # - "fastrcnn.yaml" + # - "mnist.yaml" + # - "vgg16.yaml" # - "wf_dog_breed.yaml" # benchmark on "cpu" or "gpu". @@ -28,11 +29,11 @@ hardware: &hardware "gpu" # - keep the values order as the same as the command definition. # - set up the command before enabling `metrics_cmd`. # For example, aws client and AWS credentials need to be setup before trying this example. -metrics_cmd: - - "cmd": "aws cloudwatch put-metric-data" - - "--namespace": ["torchserve_benchmark_nightly_", *hardware] - - "--region": "us-east-2" - - "--metric-data": 'file:///tmp/benchmark/logs/stats_metrics.json' +# metrics_cmd: +# - "cmd": "aws cloudwatch put-metric-data" +# - "--namespace": ["torchserve_benchmark_nightly_", *hardware] +# - "--region": "us-east-2" +# - "--metric-data": 'file:///tmp/benchmark/logs/stats_metrics.json' # load report to remote storage or local different path if "report_cmd" is set. # the command line to load report to remote storage. @@ -48,4 +49,4 @@ metrics_cmd: report_cmd: - "cmd": "aws s3 cp --recursive" - "source": '/tmp/ts_benchmark/' - - "dest": ['s3://torchserve-benchmark/nightly', "today()", *hardware] + - "dest": ['s3://torchserve-benchmark/nightly', "today()", "test", *hardware] diff --git a/benchmarks/benchmark_config_torch_compile_gpu.yaml b/benchmarks/benchmark_config_torch_compile_gpu.yaml index 97cf53351f..57fc3124ce 100644 --- a/benchmarks/benchmark_config_torch_compile_gpu.yaml +++ b/benchmarks/benchmark_config_torch_compile_gpu.yaml @@ -3,15 +3,15 @@ # - nightly: "2022.3.16" # - release: "0.5.3" # Nightly build will be installed if "ts_version" is not specifiged -#ts_version: -# branch: &ts_version "master" +ts_version: + branch: &ts_version "ci_logs" # a list of model configure yaml files defined in benchmarks/models_config # or a list of model configure yaml files with full path models: - "bert_torch_compile_gpu.yaml" - - "resnet50_torch_compile_gpu.yaml" - - "sam_fast_torch_compile_gpu_best_latency.yaml" + # - "resnet50_torch_compile_gpu.yaml" + # - "sam_fast_torch_compile_gpu_best_latency.yaml" # benchmark on "cpu" or "gpu". # "cpu" is set if "hardware" is not specified @@ -24,11 +24,11 @@ hardware: &hardware "gpu" # - keep the values order as the same as the command definition. # - set up the command before enabling `metrics_cmd`. # For example, aws client and AWS credentials need to be setup before trying this example. -metrics_cmd: - - "cmd": "aws cloudwatch put-metric-data" - - "--namespace": ["torchserve_benchmark_nightly_torch_compile_", *hardware] - - "--region": "us-east-2" - - "--metric-data": 'file:///tmp/benchmark/logs/stats_metrics.json' +# metrics_cmd: +# - "cmd": "aws cloudwatch put-metric-data" +# - "--namespace": ["torchserve_benchmark_nightly_torch_compile_", *hardware] +# - "--region": "us-east-2" +# - "--metric-data": 'file:///tmp/benchmark/logs/stats_metrics.json' # load report to remote storage or local different path if "report_cmd" is set. # the command line to load report to remote storage. diff --git a/benchmarks/models_config/bert_multi_gpu_better_transformer.yaml b/benchmarks/models_config/bert_multi_gpu_better_transformer.yaml index c476aad5eb..6ec677088c 100644 --- a/benchmarks/models_config/bert_multi_gpu_better_transformer.yaml +++ b/benchmarks/models_config/bert_multi_gpu_better_transformer.yaml @@ -2,7 +2,8 @@ bert_bt: eager_mode: benchmark_engine: "ab" - url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification-BT.mar + # url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification-BT.mar + url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification-ERROR.mar workers: - 4 batch_delay: 100 diff --git a/ts_scripts/install_dependencies.py b/ts_scripts/install_dependencies.py index 20bd76599a..0dd1d6883d 100644 --- a/ts_scripts/install_dependencies.py +++ b/ts_scripts/install_dependencies.py @@ -140,6 +140,7 @@ def install_python_packages(self, cuda_version, requirements_file_path, nightly) os.system( f"pip3 install numpy --pre torch torchvision torchaudio torchtext --index-url https://download.pytorch.org/whl/nightly/{pt_nightly}" ) + elif args.skip_torch_install: print("Skipping Torch installation") else: