Skip to content

Commit

Permalink
revert provisioner error
Browse files (browse the repository at this point in the history)
  • Loading branch information
Michaelvll committed Jan 19, 2025
1 parent c590d5a commit 56ee47b
Showing 1 changed file with 7 additions and 9 deletions.
16 changes: 7 additions & 9 deletions sky/provision/gcp/instance_utils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -38,7 +38,7 @@
r'The resource \'projects/.*/global/firewalls/.*\' was not found')


def _retry_on_gcp_or_provision_exception(
def _retry_on_gcp_http_exception(
regex: Optional[str] = None,
max_retries: int = GCP_MAX_RETRIES,
retry_interval_s: int = GCP_RETRY_INTERVAL_SECONDS,
Expand All @@ -57,11 +57,9 @@ def try_catch_exc():
except Exception as e: # pylint: disable=broad-except
if (isinstance(e, gcp.http_error_exception()) and
(regex is None or re.search(regex, str(e)))):
logger.error(
f'Retrying for gcp.http_error_exception: {e}')
return e
if isinstance(e, common.ProvisionerError):
err_msg = str(e.errors)
if regex is None or re.search(regex, err_msg):
return e
raise

for _ in range(max_retries):
Expand Down Expand Up @@ -434,7 +432,7 @@ def wait_for_operation(cls,
logger.debug(
f'Waiting GCP operation {operation["name"]} to be ready ...')

@_retry_on_gcp_or_provision_exception(
@_retry_on_gcp_http_exception(
f'Failed to wait for operation {operation["name"]}')
def call_operation(fn, timeout: int):
request = fn(
Expand Down Expand Up @@ -620,7 +618,7 @@ def create_or_update_firewall_rule(
# newly created instances, it may fail with the following error:
# "Labels fingerprint either invalid or resource labels have changed"
# We should retry until the labels are set successfully.
@_retry_on_gcp_or_provision_exception('Labels fingerprint either invalid')
@_retry_on_gcp_http_exception('Labels fingerprint either invalid')
def set_labels(cls, project_id: str, availability_zone: str, node_id: str,
labels: dict) -> None:
node = cls.load_resource().instances().get(
Expand Down Expand Up @@ -1219,7 +1217,7 @@ def wait_for_operation(cls,
"""Poll for TPU operation until finished."""
del project_id, region, zone # unused

@_retry_on_gcp_or_provision_exception(
@_retry_on_gcp_http_exception(
f'Failed to wait for operation {operation["name"]}')
def call_operation(fn, timeout: int):
request = fn(name=operation['name'])
Expand Down Expand Up @@ -1387,7 +1385,7 @@ def get_vpc_name(
f'Failed to get VPC name for instance {instance}') from e

@classmethod
@_retry_on_gcp_or_provision_exception('unable to queue the operation')
@_retry_on_gcp_http_exception('unable to queue the operation')
def set_labels(cls, project_id: str, availability_zone: str, node_id: str,
labels: dict) -> None:
while True:
Expand Down

0 comments on commit 56ee47b

Please sign in to comment.