From a9d0cc0f70de59610563b0e1ec52ece954329ae1 Mon Sep 17 00:00:00 2001 From: Eric Passmore Date: Wed, 6 Dec 2023 15:25:53 -0800 Subject: [PATCH 1/2] updated docs and tests for scripted process --- .pylintrc | 4 ++ README.md | 38 ++++++--------- orchestration-service/test/run-pytest.sh | 1 + orchestration-service/web_service.py | 2 +- replay-client/test/run.sh | 1 + scripts/orchestrator-bootstrap.sh | 2 +- scripts/process_orchestration_log.py | 62 ++++++++++++++++++++++++ 7 files changed, 85 insertions(+), 25 deletions(-) create mode 100644 scripts/process_orchestration_log.py diff --git a/.pylintrc b/.pylintrc index 36bac97..eda2e2d 100644 --- a/.pylintrc +++ b/.pylintrc @@ -1,3 +1,7 @@ +[MASTER] +# mac pylint error could not find numpy +init-hook='import sys; sys.path.append("/usr/local/lib/python3.10/site-packages")' + [MESSAGES CONTROL] disable=duplicate-code diff --git a/README.md b/README.md index fdfb074..cb32ab4 100644 --- a/README.md +++ b/README.md @@ -21,33 +21,24 @@ Select `LowEndOrchestrator` and use the default template. ![OrchTemplaceSelect](docs/images/CDOrchTemplateSelect.png) ## Updating Orchestrator Job Configuration -By default the setup will spin up a webservice with [test data](meta-data/test-simple-jobs.json). To change the job configuration you need to create your own JSON configuration, and restart the service to use the new JSON. +By default the setup will spin up a webservice with [Production Run from Nov 2023](meta-data/full-production-run-20231130.json). To change the job configuration you need to create your own JSON configuration, and restart the service to use the new JSON. - Create your own JSON following the example formate from `test-simple-jobs.json` - Upload the file to the orchestrator node - Log into the orchestrator node as `ubuntu` user - Kill the existing service named `python3 ... 
webservice.py` -- Restart with your configuration `python3 $HOME/replay-test/orchestration-service/web_service.py --config my-config.json --host 0.0.0.0 &` +- Restart with your configuration `python3 $HOME/replay-test/orchestration-service/web_service.py --config my-config.json --host 0.0.0.0 --log ~/orch-complete-timings.log &` ## Replay Setup -You can spin up as many replay nodes as you need. Each replay node is designed to use one replay slice configuration as provided in the JSON configuration file. If you have 100 replay slices configured you can utilize up to 100 replay hosts. +You can spin up as many replay nodes as you need. Replay nodes will continuously pick and process new jobs. Each replay host works on one job at a time before picking up the next job. Therefore a small number of replay hosts will process all the jobs given enough time. For example, if there are 100 replay slices configured, at most 100 replay hosts, and as few as 1 replay host, may be utilized. -To setup your orchestrator node. Go to EC2 Instances -![CDEC2Instance](docs/images/CDEC2Instance.png) - -Select launch instance from template -![LaunchTemplace](docs/images/CDLaunchTemplate.png) - -Select `ChickenReplayHost` and use the default template. -![ReplayTemplaceSelect](docs/images/CDReplayTemplateSelect.png) - -Once your replay host is setup you need to ssh into the host and start the job. -- Grab the private IP of the orchestrator node -- SSH in as user `enf-replay` -- Run `$HOME/replay-test/replay-client/start-nodeos-run-replay X.X.X.X` - - replacing the argument with the orchestrator node private IP - - optionally provide a second argument for the orchestrator webservice port +To run the replay nodes, ssh into the orchestrator node and run [run-replay-instance.sh](scripts/run-replay-instance.sh). The script takes two arguments: the first is the number of replay hosts to spin up. The second argument indicates this is a dry run, in which case the hosts are not started. 
+``` +ssh -i private.key -l ubuntu orchestor +cd replay-test +scripts/run-replay-instance.sh 10 [DRY-RUN] +``` -**Alternative**: you can start a replay node on the command line from the orchestrator node. See [an example](scripts/run-replay-instance.sh). +**Note**: It is important to run this script, as it injects the IP address of the orchestrator node into the replay nodes. Without this script you would need to manually update all the replay nodes with the IP address of the orchestrator. ## Web Dashboard You can see the status of jobs, configuration, and summary of replay status by using the webservice on the orchestrator node. Navigate to `http://orchestor.example.com:4000/`. @@ -55,16 +46,16 @@ You can see the status of jobs, configuration, and summary of replay status by u Many HTTP calls support HTML, JSON, and Text responses. Look at [HTTP Service Calls](docs/http-service-calls.md) for other URL options and Accept encoding options. ## Termination of Replay Nodes -Replay nodes are not automatically terminated. To save on hosting costs, it is advisable to terminate the nodes after the replay tests are completed. +Replay nodes are not automatically terminated. To save on hosting costs, it is advisable to terminate the nodes after the replay tests are completed. Termination can be accomplished using the AWS dashboard. ## Testing For testing options see [Running Tests](docs/running-tests.md) ## Generating Manifests -The python script `replay-test/scripts/generate_manifest_from_eosnation.py` will build a manifest off the list of eos nation snapshots. +The python script `replay-test/scripts/generate_manifest_from_eosnation.py` will build a manifest off the list of eos nation snapshots. 
A manifest may be validated for valid JSON and a contiguous block range using the [validate_manifest.py](scripts/validate_manifest.py) script -Redirect of stdout is needed to separate the debug messages printed on stderr -`python3 generate_manifest_from_eosnation.py --source-net mainnet > full-mainnet-run.json 1> ./manifest-config.json` +Redirect of stdout is recommended to separate the debug messages printed on stderr +`python3 generate_manifest_from_eosnation.py --source-net mainnet 1> ./manifest-config.json` ### Options In this release `block-space-between-slices`, `max-block-height`, and `min-block-height` are experimental. @@ -76,3 +67,4 @@ In this release `block-space-between-slices`, `max-block-height`, and `min-block - `--block-space-between-slices` Min number of blocks between slices, cuts down on the number of slices created - `--max-block-height` Limits manifest by not processing starting block ranges above value - `--min-block-height` Limits manifest by not processing starting block ranges below value +- `--debug` Prints out internal status messages diff --git a/orchestration-service/test/run-pytest.sh b/orchestration-service/test/run-pytest.sh index caeeddb..dcc8780 100755 --- a/orchestration-service/test/run-pytest.sh +++ b/orchestration-service/test/run-pytest.sh @@ -42,3 +42,4 @@ fi # shutdown service clean up file kill "$WEB_SERVICE_PID" rm ../../meta-data/test-modify-jobs.json +rm orchestration.log diff --git a/orchestration-service/web_service.py b/orchestration-service/web_service.py index 85cbf6d..4ab2c1a 100644 --- a/orchestration-service/web_service.py +++ b/orchestration-service/web_service.py @@ -287,7 +287,7 @@ def application(request): parser.add_argument('--config', '-c', type=str, help='Path to config json') parser.add_argument('--port', type=int, default=4000, help='Port for web service') parser.add_argument('--host', type=str, default='0.0.0.0', help='Listening service name or ip') - parser.add_argument('--log', type=str, 
default="~/orchestration.log", + parser.add_argument('--log', type=str, default="orchestration.log", help="log file for service") args = parser.parse_args() diff --git a/replay-client/test/run.sh b/replay-client/test/run.sh index 9b5527d..dd4bf7d 100755 --- a/replay-client/test/run.sh +++ b/replay-client/test/run.sh @@ -55,3 +55,4 @@ echo "CONFIG OPERATIONS TESTS PASSED" # shutdown service and cleanup kill "$WEB_SERVICE_PID" rm ../../meta-data/test-modify-jobs.json +rm orchestration.log diff --git a/scripts/orchestrator-bootstrap.sh b/scripts/orchestrator-bootstrap.sh index 3144218..dcff53f 100755 --- a/scripts/orchestrator-bootstrap.sh +++ b/scripts/orchestrator-bootstrap.sh @@ -14,7 +14,7 @@ rm -rf /tmp/aws /tmp/awscliv2.zip ## git scripts for enf-user ## sudo -i -u "${USER}" git clone https://github.com/eosnetworkfoundation/replay-test -sudo -i -u "${USER}" pip install datetime argparse werkzeug bs4 +sudo -i -u "${USER}" pip install datetime argparse werkzeug bs4 numpy ## startup service in background ## sudo -i -u "${USER}" python3 /home/"${USER}"/replay-test/orchestration-service/web_service.py \ diff --git a/scripts/process_orchestration_log.py b/scripts/process_orchestration_log.py new file mode 100644 index 0000000..4e15ebe --- /dev/null +++ b/scripts/process_orchestration_log.py @@ -0,0 +1,62 @@ +"""Parse Orchestration File and Calculate Job Elapsed Time""" +import argparse +from datetime import datetime +import statistics +import numpy as np + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='helper script to extract elapsed timing from log') + parser.add_argument('--log', type=str, help='path to config file') + + args = parser.parse_args() + timings = [] + + # Open the file and read log_entry by log_entry + with open(args.log, 'r', encoding='utf-8') as file: + for log_entry in file: + # Check if the specific phrase is in the current log_entry + if "OrchWebSrv INFO Completed Job" in log_entry: + # Print the log_entry or 
perform other actions + for part in log_entry.split(','): + if 'starttime' in part: + starttimestr = part.split(': ', 1)[1] + starttime = datetime.strptime(starttimestr, '%Y-%m-%dT%H:%M:%S') + elif 'endtime' in part: + endtimestr = part.split(': ', 1)[1] + endtime = datetime.strptime(endtimestr, '%Y-%m-%dT%H:%M:%S') + elif 'jobid' in part: + jobid = part.split(': ', 1)[1].strip() + # calc elapsed time + timedelta = endtime - starttime + # Convert the difference to total seconds + total_minutes = int(timedelta.total_seconds())/60 + timings.append(total_minutes) + #print(f"Job {jobid} elapsed time in minutes {total_minutes}") + + # Calculate average (mean) + average = statistics.mean(timings) + + # Calculate standard deviation + std_dev = statistics.stdev(timings) + + # Calculate median + median = statistics.median(timings) + + # Calculate the 75th and 90th percentiles + percentile_75 = np.percentile(timings, 75) + percentile_90 = np.percentile(timings, 90) + + # get longest + longest = max(timings) + + # Print the results + print("JOB TIMING ALL TIMES IN MINUTES") + print("-------------------------------") + print(f"Number of Jobs: {len(timings)}") + print(f"Average: {round(average,2)}") + print(f"Standard Deviation: {round(std_dev,2)}") + print(f"Median: {round(median,2)}") + print(f"75th Percentile: {round(percentile_75,2)}") + print(f"90th Percentile: {round(percentile_90,2)}") + print(f"Longest Running Job {round(longest,2)} mins") From 303c5c28fe545dbc45896ad039e32d19201fc0d5 Mon Sep 17 00:00:00 2001 From: Eric Passmore Date: Fri, 8 Dec 2023 13:27:40 -0800 Subject: [PATCH 2/2] added config id and snapshot to complete log --- orchestration-service/web_service.py | 6 ++-- scripts/process_orchestration_log.py | 50 ++++++++++++++++++++-------- 2 files changed, 40 insertions(+), 16 deletions(-) diff --git a/orchestration-service/web_service.py b/orchestration-service/web_service.py index 4ab2c1a..9eaca73 100644 --- a/orchestration-service/web_service.py +++ 
b/orchestration-service/web_service.py @@ -150,8 +150,10 @@ def application(request): # log timings for completed jobs if data['status'] == 'COMPLETE': # pylint: disable=used-before-assignment - logger.info("Completed Job, starttime: %s, endtime: %s, jobid: %s", - data['start_time'], data['end_time'], data['job_id']) + logger.info("Completed Job, starttime: %s, endtime: %s,\ + jobid: %s, config: %s, snapshot: %s", + data['start_time'], data['end_time'], + data['job_id'], data['replay_slice_id'], data['snapshot_path']) # check bool success for set_job to ensure valid data if jobs.set_job(data): stringified = str( diff --git a/scripts/process_orchestration_log.py b/scripts/process_orchestration_log.py index 4e15ebe..6b4c05d 100644 --- a/scripts/process_orchestration_log.py +++ b/scripts/process_orchestration_log.py @@ -17,38 +17,44 @@ for log_entry in file: # Check if the specific phrase is in the current log_entry if "OrchWebSrv INFO Completed Job" in log_entry: + complete_record = {} # Print the log_entry or perform other actions for part in log_entry.split(','): if 'starttime' in part: starttimestr = part.split(': ', 1)[1] - starttime = datetime.strptime(starttimestr, '%Y-%m-%dT%H:%M:%S') + complete_record['starttime'] = datetime.strptime( + starttimestr, '%Y-%m-%dT%H:%M:%S') elif 'endtime' in part: endtimestr = part.split(': ', 1)[1] - endtime = datetime.strptime(endtimestr, '%Y-%m-%dT%H:%M:%S') + complete_record['endtime'] = datetime.strptime( + endtimestr, '%Y-%m-%dT%H:%M:%S') elif 'jobid' in part: - jobid = part.split(': ', 1)[1].strip() + complete_record['jobid'] = part.split(': ', 1)[1].strip() + elif 'config' in part: + complete_record['config'] = part.split(': ', 1)[1].strip() + elif 'snapshot' in part: + complete_record['snapshot'] = part.split(': ', 1)[1].strip() # calc elapsed time - timedelta = endtime - starttime - # Convert the difference to total seconds - total_minutes = int(timedelta.total_seconds())/60 - timings.append(total_minutes) - 
#print(f"Job {jobid} elapsed time in minutes {total_minutes}") + timedelta = complete_record['endtime'] - complete_record['starttime'] + # Convert the difference to total minutes + complete_record['total_minutes'] = int(timedelta.total_seconds())/60 + timings.append(complete_record) # Calculate average (mean) - average = statistics.mean(timings) + average = statistics.mean(list(record['total_minutes'] for record in timings)) # Calculate standard deviation - std_dev = statistics.stdev(timings) + std_dev = statistics.stdev(list(record['total_minutes'] for record in timings)) # Calculate median - median = statistics.median(timings) + median = statistics.median(list(record['total_minutes'] for record in timings)) # Calculate the 75th and 90th percentiles - percentile_75 = np.percentile(timings, 75) - percentile_90 = np.percentile(timings, 90) + percentile_75 = np.percentile(list(record['total_minutes'] for record in timings), 75) + percentile_90 = np.percentile(list(record['total_minutes'] for record in timings), 90) # get longest - longest = max(timings) + longest = max(list(record['total_minutes'] for record in timings)) # Print the results print("JOB TIMING ALL TIMES IN MINUTES") @@ -60,3 +66,19 @@ print(f"75th Percentile: {round(percentile_75,2)}") print(f"90th Percentile: {round(percentile_90,2)}") print(f"Longest Running Job {round(longest,2)} mins") + + if std_dev > average: + print("\nLONG RUNNING JOBS TOP 90%") + print("-------------------------") + for record in timings: + if record['total_minutes'] > percentile_90: + # when config and snapshot exist print full + # field check for backwards compat + if 'config' in record and 'snapshot' in record \ + and record['config'] and record['snapshot']: + print(f"Job {record['jobid']}\ + running time {round(record['total_minutes'],2)}\ + config {record['config']} with snapshot {record['snapshot']}") + else: + print(f"Job {record['jobid']}\ + running time {round(record['total_minutes'],2)}")