import os
import subprocess
from datetime import datetime
import numpy as np
from google.cloud import aiplatform, storage
from google.cloud.aiplatform import hyperparameter_tuning as hpt
from google.cloud.aiplatform.models import Prediction
from google.cloud.storage.bucket import Bucket
from transformers import AutoTokenizer
PROJECT_NAME = "ageless-wall-364306"
BUCKET_NAME = "ageless-wall-364306-vertex-ai"
REPOSITORY_NAME = "vertex-ai-images"
LOCATION = "europe-west4"
# This assumes a Docker repository named 'vertex-ai-images' already exists in Artifact Registry.
IMAGE_URI = (
f"{LOCATION}-docker.pkg.dev/{PROJECT_NAME}/{REPOSITORY_NAME}/tweet_eval:hypertune"
)
# Timestamp suffix so each training job gets a unique name.
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
def upload_to_gcs():
    """Upload the local train/test/validation CSVs to the GCS bucket."""
    gcs_client = storage.Client(PROJECT_NAME)
    gcs_bucket: Bucket = gcs_client.bucket(BUCKET_NAME)
    train_blob = gcs_bucket.blob("tweet_eval_emotions/data/train/train.csv")
    test_blob = gcs_bucket.blob("tweet_eval_emotions/data/test/test.csv")
    validation_blob = gcs_bucket.blob(
        "tweet_eval_emotions/data/validation/validation.csv"
    )
    train_blob.upload_from_filename("train.csv")
    test_blob.upload_from_filename("test.csv")
    validation_blob.upload_from_filename("validation.csv")


# upload_to_gcs()  # run once; the function takes no arguments
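# A minimal sketch to sanity-check the upload: list the blobs under the data
# prefix (assumes the bucket and prefix above; uncomment after running upload_to_gcs()).
# for blob in storage.Client(PROJECT_NAME).list_blobs(
#     BUCKET_NAME, prefix="tweet_eval_emotions/data/"
# ):
#     print(blob.name, blob.size)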
aiplatform.init(project=PROJECT_NAME, location=LOCATION, staging_bucket=BUCKET_NAME)
"""
RUN ONLY ONCE THIS TAKES FOREVER
Actually run this command on the gcp console "gcloud auth configure-docker europe-west4-docker.pkg.dev"
and after that run these commands below in the cloud console. Don't bother with running these locally in python code.
Or run this code in Vertex AI workbench and as a bonus not deal with credentials we set above
https://cloud.google.com/artifact-registry/docs/docker/pushing-and-pulling#auth
"""
# build_result = subprocess.run(['docker', 'build', '-t', IMAGE_URI, 'custom_training_docker'])
# push_result = subprocess.run(['docker', 'push', IMAGE_URI], check=True)
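# Alternative sketch: build the image remotely with Cloud Build instead of a local
# Docker daemon (assumes the gcloud CLI is installed and authenticated for this project):
# subprocess.run(
#     ["gcloud", "builds", "submit", "custom_training_docker", "--tag", IMAGE_URI],
#     check=True,
# )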
# The first worker pool is the chief; the second holds the workers. With 1 chief and
# 2 workers, training runs on all three replicas.
# https://codelabs.developers.google.com/vertex_multiworker_training#5
container_spec = {
    "image_uri": IMAGE_URI,
    "args": [
        f"--project_name={PROJECT_NAME}",
        f"--bucket_name={BUCKET_NAME}",
        "--train_path=tweet_eval_emotions/data/train/train.csv",
        "--test_path=tweet_eval_emotions/data/test/test.csv",
        "--validation_path=tweet_eval_emotions/data/validation/validation.csv",
        "--distribute=multiworker",
        "--batch_size=32",
        "--hp=True",
    ],
}
machine_spec = {
    "machine_type": "n1-standard-4",
    # "accelerator_type": "NVIDIA_TESLA_T4",
    # "accelerator_count": 2,
}
worker_pool_specs = [
    {
        "machine_spec": machine_spec,
        "replica_count": 1,
        "container_spec": container_spec,
    },
    {
        "machine_spec": machine_spec,
        "replica_count": 2,
        "container_spec": container_spec,
    },
]
JOB_NAME = "custom_nlp_training-hyperparameter-job-" + TIMESTAMP
custom_job = aiplatform.CustomJob(
    display_name=JOB_NAME, project=PROJECT_NAME, worker_pool_specs=worker_pool_specs
)
metric_spec = {"accuracy": "maximize"}
parameter_spec = {
    "lr": hpt.DoubleParameterSpec(min=0.001, max=1, scale="log"),
    "epochs": hpt.IntegerParameterSpec(min=1, max=3, scale="linear"),
}
hp_job = aiplatform.HyperparameterTuningJob(
    display_name=JOB_NAME,
    custom_job=custom_job,
    metric_spec=metric_spec,
    parameter_spec=parameter_spec,
    max_trial_count=2,
    parallel_trial_count=2,
    project=PROJECT_NAME,
)
hp_job.run()
metrics = [trial.final_measurement.metrics[0].value for trial in hp_job.trials]
best_trial = hp_job.trials[metrics.index(max(metrics))]
best_accuracy = float(best_trial.final_measurement.metrics[0].value)
best_values = {param.parameter_id: param.value for param in best_trial.parameters}
print("Best trial:", best_trial)
print("Best accuracy:", best_accuracy)
print("Best hyperparameters:", best_values)
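# A more defensive variant of the selection above (an assumption: a trial may fail
# and end without a final_measurement, which would break the list comprehension):
# completed = [t for t in hp_job.trials if t.final_measurement and t.final_measurement.metrics]
# best_trial = max(completed, key=lambda t: float(t.final_measurement.metrics[0].value))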
MACHINE_TYPE = "n1-standard"
VCPU = "4"
TRAIN_COMPUTE = MACHINE_TYPE + "-" + VCPU
print("Train machine type", TRAIN_COMPUTE)
# The train and deploy image URIs differ by more than the /training vs /prediction
# path segment: the training image name starts with tf, the prediction image with tf2.
DEPLOY_IMAGE = "europe-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-9:latest"
print("Deployment:", DEPLOY_IMAGE)
container_job = aiplatform.CustomContainerTrainingJob(
    display_name=f"custom_nlp_training_{TIMESTAMP}",
    container_uri=IMAGE_URI,
    model_serving_container_image_uri=DEPLOY_IMAGE,
    project=PROJECT_NAME,
)
container_spec["args"].pop()
container_spec["args"].append(f"--hp=False")
container_spec["args"].append(f"--lr={best_values['lr']}")
container_spec["args"].append(f"--epochs={int(best_values['epochs'])}")
model = container_job.run(
    model_display_name=f"tweet_eval_{TIMESTAMP}",
    args=container_spec["args"],
    replica_count=3,
    machine_type=TRAIN_COMPUTE,
    sync=True,
)
VCPU = "4"
DEPLOY_COMPUTE = MACHINE_TYPE + "-" + VCPU
print("Deploy machine type", DEPLOY_COMPUTE)
# Create an endpoint
endpoint = model.deploy(machine_type=DEPLOY_COMPUTE, sync=True)
tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator")
example_text = tokenizer("I love you", truncation=True, padding="max_length")
# The serving signature does not expect token_type_ids, so drop them.
example_text.pop("token_type_ids")
# Get a prediction from the endpoint; convert the BatchEncoding to a plain dict
# so it serializes cleanly as a JSON instance.
prediction: Prediction = endpoint.predict(instances=[dict(example_text)])
print(prediction.predictions[0])
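# The model is assumed to return raw logits; a quick softmax turns them into
# per-class probabilities for inspection (a sketch, not part of the original flow).
logits = np.asarray(prediction.predictions[0], dtype=np.float64)
probs = np.exp(logits - logits.max())
probs /= probs.sum()
print("class probabilities:", probs)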
index = np.argmax(prediction.predictions[0])
id_to_label = {0: "anger", 1: "joy", 2: "optimism", 3: "sadness"}  # tweet_eval emotion labels
print(id_to_label[index])
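# Optional cleanup sketch to avoid idle serving costs (uncomment once done testing;
# undeploy_all and delete are standard aiplatform SDK methods):
# endpoint.undeploy_all()
# endpoint.delete()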