From 706d4cabcf3d660e112d78efc75cc862e8fb29d1 Mon Sep 17 00:00:00 2001 From: Natalia Kaczor Date: Fri, 7 Feb 2025 06:42:46 -0800 Subject: [PATCH] Merge "Fix building report with customized stepd names + add disable_paging parameters to the log queries" -- Branch commit log -- commit b488c91ad7786b03c8e4ca07340bd58b64b1d061 Author: Natalia Kaczor Date: 2025-02-05T17:40:29Z Fix building report with customized stepd names + add disable_paging parameters to the log queries Change-Id: I8dfb09564f1535130342c9ebf01077f222e2cca0 GitOrigin-RevId: 32ba4f6cfbada92d2a70cd5ddc8b96c90c006257 --- gcpdiag/queries/logs.py | 18 +- .../lb/snapshots/unhealthy_backends.txt | 8 +- gcpdiag/runbook/lb/ssl_certificates.py | 179 +++++----- gcpdiag/runbook/lb/unhealthy_backends.py | 321 +++++++++++------- 4 files changed, 312 insertions(+), 214 deletions(-) diff --git a/gcpdiag/queries/logs.py b/gcpdiag/queries/logs.py index ee7c6b4a8..331e488f6 100644 --- a/gcpdiag/queries/logs.py +++ b/gcpdiag/queries/logs.py @@ -44,11 +44,12 @@ from typing import (Any, Deque, Dict, List, Mapping, Optional, Sequence, Set, Tuple, Union) +import apiclient.errors import dateutil.parser import ratelimit from boltons.iterutils import get_path -from gcpdiag import caching, config, models +from gcpdiag import caching, config, models, utils from gcpdiag.queries import apis @@ -162,7 +163,12 @@ def query(project_id: str, resource_type: str, log_name: str, period=config.get('logging_ratelimit_period_seconds')) def _ratelimited_execute(req): """Wrapper to req.execute() with rate limiting to avoid hitting quotas.""" - return req.execute(num_retries=config.API_RETRIES) + try: + return req.execute(num_retries=config.API_RETRIES) + except apiclient.errors.HttpError as err: + logging.error('failed to execute logging request for request %s. Error: %s', + req, err) + raise utils.GcpApiError(err) from err def _execute_query_job(job: _LogsQueryJob): @@ -240,7 +246,11 @@ def _execute_query_job(job: _LogsQueryJob): @caching.cached_api_call -def realtime_query(project_id, filter_str, start_time, end_time): +def realtime_query(project_id, + filter_str, + start_time, + end_time, + disable_paging=False): """Intended for use in only runbooks. use logs.query() for lint rules.""" logging_api = apis.get_api('logging', 'v2', project_id) @@ -283,6 +293,8 @@ def realtime_query(project_id, filter_str, start_time, end_time): 'maximum query runtime for log query reached (project: %s, query: %s).', project_id, filter_str.replace('\n', ' AND ')) return deque + if disable_paging: + break req = logging_api.entries().list_next(req, res) if req is not None: logging.info('still fetching logs (project: %s, max wait: %ds)', diff --git a/gcpdiag/runbook/lb/snapshots/unhealthy_backends.txt b/gcpdiag/runbook/lb/snapshots/unhealthy_backends.txt index 848b8c307..d364ccd03 100644 --- a/gcpdiag/runbook/lb/snapshots/unhealthy_backends.txt +++ b/gcpdiag/runbook/lb/snapshots/unhealthy_backends.txt @@ -49,13 +49,13 @@ lb/unhealthy-backends: Load Balancer Unhealthy Backends Analyzer. Group https://www.googleapis.com/compute/v1/projects/gcpdiag-lb2-aaaa/zones/us-east1-b/instanceGroups/lb-backend-example has 2/2 unhealthy backends -[GATEWAY]: Check if health check logging is enabled. +[GATEWAY]: Verify health check logging enabled for backend service "web-backend-service" in region "global". - gcpdiag-lb2-aaaa/http-basic-check [OK] [REASON] Health check logging is enabled for health check projects/gcpdiag-lb1-aaaa/global/healthChecks/http-basic-check. -[GATEWAY]: Look for the latest health check logs and based on that decide what to do next. +[GATEWAY]: Analyze latest health check log for backend service "web-backend-service" in region "global". [AUTOMATED STEP]: Analyze TIMEOUT health check logs for backend service "web-backend-service" in region "global". - gcpdiag-lb2-aaaa/web-backend-service [UNCERTAIN] @@ -276,13 +276,13 @@ lb/unhealthy-backends: Load Balancer Unhealthy Backends Analyzer. Group https://www.googleapis.com/compute/v1/projects/gcpdiag-lb2-aaaa/zones/europe-west4-b/networkEndpointGroups/neg1 has 1/1 unhealthy backends -[GATEWAY]: Check if health check logging is enabled. +[GATEWAY]: Verify health check logging enabled for backend service "backend-service-2" in region "europe-west4". - gcpdiag-lb2-aaaa/tcp-basic-check-2 [OK] [REASON] Health check logging is enabled for health check projects/gcpdiag-lb2-aaaa/regions/europe-west4/healthChecks/tcp-basic-check-2. -[GATEWAY]: Look for the latest health check logs and based on that decide what to do next. +[GATEWAY]: Analyze latest health check log for backend service "backend-service-2" in region "europe-west4". [AUTOMATED STEP]: Analyze TIMEOUT health check logs for backend service "backend-service-2" in region "europe-west4". - gcpdiag-lb2-aaaa/backend-service-2 [UNCERTAIN] diff --git a/gcpdiag/runbook/lb/ssl_certificates.py b/gcpdiag/runbook/lb/ssl_certificates.py index bb0d8b5d5..366ca46f0 100644 --- a/gcpdiag/runbook/lb/ssl_certificates.py +++ b/gcpdiag/runbook/lb/ssl_certificates.py @@ -70,15 +70,26 @@ class SslCertificates(runbook.DiagnosticTree): def build_tree(self): """Construct the diagnostic tree with appropriate steps.""" + project_id = op.get(flags.PROJECT_ID) + certificate_name = op.get(flags.CERTIFICATE_NAME) # Instantiate your step classes start = SslCertificatesStart() + start.project_id = project_id + start.certificate_name = certificate_name # add them to your tree self.add_start(start) # you can create custom steps to define unique logic cert_status = AnalyzeCertificateStatus() + cert_status.project_id = project_id + cert_status.certificate_name = certificate_name # Describe the step relationships self.add_step(parent=start, child=cert_status) - self.add_step(parent=cert_status, child=AnalyzeDomainStatuses()) + + analyze_domain_statuses = AnalyzeDomainStatuses() + analyze_domain_statuses.project_id = project_id + analyze_domain_statuses.certificate_name = certificate_name + + self.add_step(parent=cert_status, child=analyze_domain_statuses) # Ending your runbook self.add_end(SslCertificatesEnd()) @@ -87,31 +98,33 @@ class SslCertificatesStart(runbook.StartStep): """Verify the existence type and status of the SSL certificate.""" template = 'ssl_certificates::confirmation' + project_id: str + certificate_name: str @property def name(self): return (f'Verify the existence and status of the SSL certificate' - f' "{op.get(flags.CERTIFICATE_NAME)}".') + f' "{self.certificate_name}".') def execute(self): """Verifies the existence type and status of the SSL certificate.""" - proj = crm.get_project(op.get(flags.PROJECT_ID)) + proj = crm.get_project(self.project_id) - if not apis.is_enabled(op.context.project_id, 'compute'): + if not apis.is_enabled(self.project_id, 'compute'): op.add_skipped(proj, reason='Compute API is not enabled') return # Early exit if Compute API is disabled try: - op.info(f'name: {op.get(flags.CERTIFICATE_NAME)}') - certificate = lb.get_ssl_certificate(op.context.project_id, - op.get(flags.CERTIFICATE_NAME)) + op.info(f'name: {self.certificate_name}') + certificate = lb.get_ssl_certificate(self.project_id, + self.certificate_name) except googleapiclient.errors.HttpError: op.add_skipped( proj, reason=op.prep_msg( op.SKIPPED_REASON, - name=op.get(flags.CERTIFICATE_NAME), - project_id=op.get(flags.PROJECT_ID), + name=self.certificate_name, + project_id=self.project_id, ), ) return # Early exit if certificate doesn't exist @@ -119,20 +132,19 @@ def execute(self): op.add_skipped( proj, reason=op.prep_msg(op.SKIPPED_REASON_ALT1, - name=op.get(flags.CERTIFICATE_NAME)), + name=self.certificate_name), ) return # Early exit if certificate is not Google-managed if certificate.status == 'ACTIVE': op.add_ok( proj, - reason=op.prep_msg(op.SUCCESS_REASON, - name=op.get(flags.CERTIFICATE_NAME)), + reason=op.prep_msg(op.SUCCESS_REASON, name=self.certificate_name), ) else: op.add_failed(proj, reason=op.prep_msg(op.FAILURE_REASON, - name=op.get(flags.CERTIFICATE_NAME)), + name=self.certificate_name), remediation='') @@ -140,23 +152,23 @@ class AnalyzeCertificateStatus(runbook.Gateway): """Analyze the status of the Google-managed certificate.""" template = 'ssl_certificates::cert_status' + project_id: str + certificate_name: str @property def name(self): return (f'Analyze the status of the Google-managed SSL certificate' - f' "{op.get(flags.CERTIFICATE_NAME)}".') + f' "{self.certificate_name}".') def execute(self): """Checks the status of the Google-managed certificate.""" - certificate = lb.get_ssl_certificate(op.context.project_id, - op.get(flags.CERTIFICATE_NAME)) + certificate = lb.get_ssl_certificate(self.project_id, self.certificate_name) op.add_metadata('certificateStatus', certificate.status) if certificate.status == 'PROVISIONING_FAILED_PERMANENTLY': op.add_failed( certificate, - reason=op.prep_msg(op.FAILURE_REASON, - name=op.get(flags.CERTIFICATE_NAME)), + reason=op.prep_msg(op.FAILURE_REASON, name=self.certificate_name), remediation=op.prep_msg(op.FAILURE_REMEDIATION), ) return @@ -165,7 +177,7 @@ def execute(self): certificate, reason=op.prep_msg( op.UNCERTAIN_REASON, - name=op.get(flags.CERTIFICATE_NAME), + name=self.certificate_name, status=certificate.status, context=('Further investigation into the status of each domain is' ' necessary.'), @@ -177,7 +189,7 @@ def execute(self): certificate, reason=op.prep_msg( op.UNCERTAIN_REASON, - name=op.get(flags.CERTIFICATE_NAME), + name=self.certificate_name, status=certificate.status, context=( 'This typically occurs when the load balancer or DNS' @@ -191,7 +203,7 @@ def execute(self): certificate, reason=op.prep_msg( op.SUCCESS_REASON, - name=op.get(flags.CERTIFICATE_NAME), + name=self.certificate_name, status=certificate.status, ), ) @@ -201,17 +213,18 @@ class AnalyzeDomainStatuses(runbook.Gateway): """Check the status of each individual domain associated with the SSL certificate.""" template = 'ssl_certificates::domain_status' + project_id: str + certificate_name: str @property def name(self): return ( f'Check the status of each individual domain associated with the SSL' - f' certificate "{op.get(flags.CERTIFICATE_NAME)}".') + f' certificate "{self.certificate_name}".') def execute(self): """Checks the status of each individual domain associated with the SSL certificate.""" - certificate = lb.get_ssl_certificate(op.context.project_id, - op.get(flags.CERTIFICATE_NAME)) + certificate = lb.get_ssl_certificate(self.project_id, self.certificate_name) failed_not_visible_domains = [] failed_caa_domains = [] @@ -229,34 +242,38 @@ def execute(self): if failed_not_visible_domains: step = AnalyzeFailedNotVisibleDomains() + step.project_id = self.project_id step.domains = failed_not_visible_domains - step.certificate_name = op.get(flags.CERTIFICATE_NAME) + step.certificate_name = self.certificate_name self.add_child(step) if failed_caa_domains: step = AnalyzeFailedCaaCheck() + step.project_id = self.project_id step.domains = failed_caa_domains - step.certificate_name = op.get(flags.CERTIFICATE_NAME) + step.certificate_name = self.certificate_name self.add_child(step) if failed_rate_limited_domains: step = AnalyzeRateLimitedDomains() + step.project_id = self.project_id step.domains = failed_rate_limited_domains - step.certificate_name = op.get(flags.CERTIFICATE_NAME) + step.certificate_name = self.certificate_name self.add_child(step) if provisioning_domains: step = AnalyzeProvisioningDomains() + step.project_id = self.project_id step.domains = provisioning_domains - step.certificate_name = op.get(flags.CERTIFICATE_NAME) + step.certificate_name = self.certificate_name self.add_child(step) if failed_not_visible_domains or provisioning_domains: step = CheckCertificateAttachment() - step.certificate_name = op.get(flags.CERTIFICATE_NAME) + step.project_id = self.project_id + step.certificate_name = self.certificate_name self.add_child(step) if (not failed_not_visible_domains and not failed_caa_domains and not failed_rate_limited_domains and not provisioning_domains): op.add_ok( certificate, - reason=op.prep_msg(op.SUCCESS_REASON, - name=op.get(flags.CERTIFICATE_NAME)), + reason=op.prep_msg(op.SUCCESS_REASON, name=self.certificate_name), ) @@ -264,7 +281,7 @@ class AnalyzeFailedNotVisibleDomains(runbook.Step): """Analyze domains in "FAILED_NOT_VISIBLE" state.""" template = 'ssl_certificates::failed_not_visible_domains' - + project_id: str domains: List[str] certificate_name: str @@ -275,8 +292,7 @@ def name(self): def execute(self): """Analyzes domains in "FAILED_NOT_VISIBLE" state.""" - certificate = lb.get_ssl_certificate(op.context.project_id, - op.get(flags.CERTIFICATE_NAME)) + certificate = lb.get_ssl_certificate(self.project_id, self.certificate_name) op.add_failed( certificate, reason=op.prep_msg( @@ -293,7 +309,7 @@ class AnalyzeProvisioningDomains(runbook.Step): """Analyze domains in "PROVISIONING" state.""" template = 'ssl_certificates::provisioning_domains' - + project_id: str domains: List[str] certificate_name: str @@ -304,8 +320,7 @@ def name(self): def execute(self): """Analyzes domains in "PROVISIONING" state.""" - certificate = lb.get_ssl_certificate(op.context.project_id, - op.get(flags.CERTIFICATE_NAME)) + certificate = lb.get_ssl_certificate(self.project_id, self.certificate_name) op.add_uncertain( certificate, reason=op.prep_msg( @@ -322,7 +337,7 @@ class AnalyzeRateLimitedDomains(runbook.Step): """Analyze domains in "FAILED_RATE_LIMITED" state.""" template = 'ssl_certificates::failed_rate_limited_domains' - + project_id: str domains: List[str] certificate_name: str @@ -334,8 +349,7 @@ def name(self): def execute(self): """Analyzes domains in "FAILED_RATE_LIMITED" state.""" - certificate = lb.get_ssl_certificate(op.context.project_id, - op.get(flags.CERTIFICATE_NAME)) + certificate = lb.get_ssl_certificate(self.project_id, self.certificate_name) op.add_failed( certificate, reason=op.prep_msg(op.FAILURE_REASON, @@ -350,7 +364,7 @@ class AnalyzeFailedCaaCheck(runbook.Step): """Analyze domains in "FAILED_CAA_CHECKING" or "FAILED_CAA_FORBIDDEN" state.""" template = 'ssl_certificates::failed_caa_check_domains' - + project_id: str domains: List[str] certificate_name: str @@ -362,8 +376,7 @@ def name(self): def execute(self): """Analyzes domains in "FAILED_CAA_CHECKING" or "FAILED_CAA_FORBIDDEN" state.""" - certificate = lb.get_ssl_certificate(op.context.project_id, - op.get(flags.CERTIFICATE_NAME)) + certificate = lb.get_ssl_certificate(self.project_id, self.certificate_name) op.add_failed( certificate, reason=op.prep_msg(op.FAILURE_REASON, @@ -382,7 +395,7 @@ class CheckCertificateAttachment(runbook.Gateway): """ template = 'ssl_certificates::check_certificate_attachment' - + project_id: str certificate_name: str @property @@ -392,12 +405,11 @@ def name(self): def execute(self): """Checks if the SSL certificate is attached to a target proxy.""" - certificate = lb.get_ssl_certificate(op.context.project_id, - op.get(flags.CERTIFICATE_NAME)) + certificate = lb.get_ssl_certificate(self.project_id, self.certificate_name) try: - target_https_proxies = lb.get_target_https_proxies(op.context.project_id) - target_ssl_proxies = lb.get_target_ssl_proxies(op.context.project_id) + target_https_proxies = lb.get_target_https_proxies(self.project_id) + target_ssl_proxies = lb.get_target_ssl_proxies(self.project_id) except googleapiclient.errors.HttpError as e: op.add_skipped( certificate, @@ -413,14 +425,13 @@ def execute(self): if not target_proxies_with_certificate: op.add_failed( certificate, - reason=op.prep_msg(op.FAILURE_REASON, - name=op.get(flags.CERTIFICATE_NAME)), + reason=op.prep_msg(op.FAILURE_REASON, name=self.certificate_name), remediation=op.prep_msg(op.FAILURE_REMEDIATION), ) return try: - forwarding_rules = lb.get_forwarding_rules(op.context.project_id) + forwarding_rules = lb.get_forwarding_rules(self.project_id) except ValueError as e: op.add_skipped( certificate, @@ -460,7 +471,7 @@ def execute(self): certificate, reason=op.prep_msg( op.SUCCESS_REASON, - name=op.get(flags.CERTIFICATE_NAME), + name=self.certificate_name, target_proxies=', '.join( [tp.full_path for tp in used_target_proxies_with_certificate]), ), @@ -469,28 +480,30 @@ def execute(self): for domain in certificate.domain_status.keys(): if certificate.domain_status[domain] != 'ACTIVE': verify_dns_records = VerifyDnsRecords() + verify_dns_records.project_id = self.project_id verify_dns_records.domain = domain - verify_dns_records.certificate_name = op.get(flags.CERTIFICATE_NAME) + verify_dns_records.certificate_name = self.certificate_name verify_dns_records.forwarding_rules_with_certificate = ( forwarding_rules_with_certificate) self.add_child(verify_dns_records) verify_forwarding_rules_port = VerifyForwardingRulesPort() - verify_forwarding_rules_port.certificate_name = op.get( - flags.CERTIFICATE_NAME) + verify_forwarding_rules_port.project_id = self.project_id + verify_forwarding_rules_port.certificate_name = self.certificate_name verify_forwarding_rules_port.forwarding_rules_with_certificate = ( forwarding_rules_with_certificate) self.add_child(verify_forwarding_rules_port) verify_no_certificate_map_conflict = VerifyNoCertificateMapConflict() - verify_no_certificate_map_conflict.certificate_name = op.get( - flags.CERTIFICATE_NAME) + verify_no_certificate_map_conflict.project_id = self.project_id + verify_no_certificate_map_conflict.certificate_name = self.certificate_name verify_no_certificate_map_conflict.target_proxies_with_certificate = ( target_proxies_with_certificate) self.add_child(verify_no_certificate_map_conflict) check_provisioning_time = CheckProvisioningTime() - check_provisioning_time.certificate_name = op.get(flags.CERTIFICATE_NAME) + check_provisioning_time.project_id = self.project_id + check_provisioning_time.certificate_name = self.certificate_name check_provisioning_time.target_proxies_with_certificate = ( target_proxies_with_certificate) check_provisioning_time.forwarding_rules_with_certificate = ( @@ -502,7 +515,7 @@ class VerifyDnsRecords(runbook.Gateway): """Check the DNS records for specific domain associated with the SSL certificate.""" template = 'ssl_certificates::verify_dns_records' - + project_id: str forwarding_rules_with_certificate: List[lb.ForwardingRules] domain: str certificate_name: str @@ -514,8 +527,7 @@ def name(self): f' SSL certificate "{self.certificate_name}".') def execute(self): - certificate = lb.get_ssl_certificate(op.context.project_id, - self.certificate_name) + certificate = lb.get_ssl_certificate(self.project_id, self.certificate_name) ip_addresses = dns.find_dns_records(self.domain) op.add_metadata('domain', self.domain) @@ -545,7 +557,7 @@ def execute(self): op.SUCCESS_REASON, domain=self.domain, ip_addresses=', '.join(ip_addresses_pointing_to_lb), - name=op.get(flags.CERTIFICATE_NAME), + name=self.certificate_name, ), ) elif ip_addresses_pointing_to_lb and unresolved_ip_addresses: @@ -554,7 +566,7 @@ def execute(self): reason=op.prep_msg( op.UNCERTAIN_REASON, domain=self.domain, - name=op.get(flags.CERTIFICATE_NAME), + name=self.certificate_name, unresolved_ip_addresses=', '.join(unresolved_ip_addresses), resolved_ip_addresses=', '.join(ip_addresses_pointing_to_lb), ), @@ -562,7 +574,7 @@ def execute(self): op.UNCERTAIN_REMEDIATION, domain=self.domain, fr_ip_message=fr_ip_message, - name=op.get(flags.CERTIFICATE_NAME), + name=self.certificate_name, ), ) elif unresolved_ip_addresses: @@ -572,13 +584,13 @@ def execute(self): op.FAILURE_REASON, domain=self.domain, unresolved_ip_addresses=', '.join(unresolved_ip_addresses), - name=op.get(flags.CERTIFICATE_NAME), + name=self.certificate_name, ), remediation=op.prep_msg( op.FAILURE_REMEDIATION, domain=self.domain, fr_ip_message=fr_ip_message, - name=op.get(flags.CERTIFICATE_NAME), + name=self.certificate_name, ), ) else: @@ -589,7 +601,7 @@ def execute(self): op.FAILURE_REMEDIATION, domain=self.domain, fr_ip_message=fr_ip_message, - name=op.get(flags.CERTIFICATE_NAME), + name=self.certificate_name, ), ) @@ -602,7 +614,7 @@ class VerifyForwardingRulesPort(runbook.Step): """ template = 'ssl_certificates::verify_forwarding_rules_port' - + project_id: str forwarding_rules_with_certificate: List[lb.ForwardingRules] certificate_name: str @@ -613,8 +625,7 @@ def name(self): def execute(self): """Checks if the load balancer is configured to listen on port 443.""" - certificate = lb.get_ssl_certificate(op.context.project_id, - op.get(flags.CERTIFICATE_NAME)) + certificate = lb.get_ssl_certificate(self.project_id, self.certificate_name) # Group forwarding rules by IP address frs_by_ip = {} @@ -639,13 +650,12 @@ def execute(self): misconfigured_entities='\n'.join(misconfigured_entities), ), remediation=op.prep_msg(op.FAILURE_REMEDIATION, - name=op.get(flags.CERTIFICATE_NAME)), + name=self.certificate_name), ) else: op.add_ok( certificate, - reason=op.prep_msg(op.SUCCESS_REASON, - name=op.get(flags.CERTIFICATE_NAME)), + reason=op.prep_msg(op.SUCCESS_REASON, name=self.certificate_name), ) def is_port_in_range(self, port: int, port_range: str): @@ -661,7 +671,7 @@ class VerifyNoCertificateMapConflict(runbook.Step): """Checks for conflicting certificate map set on a target proxy.""" template = 'ssl_certificates::verify_no_certificate_map_conflict' - + project_id: str target_proxies_with_certificate: List[TargetProxy] certificate_name: str @@ -673,8 +683,7 @@ def name(self): def execute(self): """Checks for conflicting certificate map set on a target proxy.""" - certificate = lb.get_ssl_certificate(op.context.project_id, - op.get(flags.CERTIFICATE_NAME)) + certificate = lb.get_ssl_certificate(self.project_id, self.certificate_name) conflicting_target_proxies = [] for target_proxy in self.target_proxies_with_certificate: @@ -700,8 +709,7 @@ def execute(self): else: op.add_ok( certificate, - reason=op.prep_msg(op.SUCCESS_REASON, - name=op.get(flags.CERTIFICATE_NAME)), + reason=op.prep_msg(op.SUCCESS_REASON, name=self.certificate_name), ) @@ -709,7 +717,7 @@ class CheckProvisioningTime(runbook.Step): """Checks if the SSL certificate associated resources has been updated recently.""" template = 'ssl_certificates::check_provisioning_time' - + project_id: str target_proxies_with_certificate: List[TargetProxy] forwarding_rules_with_certificate: List[lb.ForwardingRules] certificate_name: str @@ -721,8 +729,7 @@ def name(self): def execute(self): """Checks if the SSL certificate associated resources has been updated recently.""" - certificate = lb.get_ssl_certificate(op.context.project_id, - op.get(flags.CERTIFICATE_NAME)) + certificate = lb.get_ssl_certificate(self.project_id, self.certificate_name) recently_changed = [] @@ -733,7 +740,7 @@ def execute(self): protoPayload.methodName=~"(forwardingRules|globalForwardingRules).(patch|update|insert)" """.format(forwarding_rule.region, forwarding_rule.id) serial_log_entries = logs.realtime_query( - project_id=op.get(flags.PROJECT_ID), + project_id=self.project_id, filter_str=filter_str, start_time=datetime.now() - timedelta(days=1), end_time=datetime.now(), @@ -766,7 +773,7 @@ def execute(self): # This should never happen raise ValueError(f'Unsupported target proxy type: {type(target_proxy)}') serial_log_entries = logs.realtime_query( - project_id=op.get(flags.PROJECT_ID), + project_id=self.project_id, filter_str=filter_str, start_time=datetime.now() - timedelta(days=1), end_time=datetime.now(), @@ -785,17 +792,17 @@ def execute(self): reason=op.prep_msg( op.UNCERTAIN_REASON, recently_changed='\n'.join(recently_changed), - name=op.get(flags.CERTIFICATE_NAME), + name=self.certificate_name, ), remediation=op.prep_msg(op.UNCERTAIN_REMEDIATION, - name=op.get(flags.CERTIFICATE_NAME)), + name=self.certificate_name), ) else: op.add_ok( certificate, reason=op.prep_msg( op.SUCCESS_REASON, - name=op.get(flags.CERTIFICATE_NAME), + name=self.certificate_name, ), ) diff --git a/gcpdiag/runbook/lb/unhealthy_backends.py b/gcpdiag/runbook/lb/unhealthy_backends.py index e464c2caa..7b6bae70e 100644 --- a/gcpdiag/runbook/lb/unhealthy_backends.py +++ b/gcpdiag/runbook/lb/unhealthy_backends.py @@ -94,22 +94,46 @@ class UnhealthyBackends(runbook.DiagnosticTree): def build_tree(self): """Building Decision Tree""" + project_id = op.get(flags.PROJECT_ID) + backend_service_name = op.get(flags.BACKEND_SERVICE_NAME) + region = op.get(flags.REGION, 'global') + start = UnhealthyBackendsStart() + start.project_id = project_id + start.backend_service_name = backend_service_name + start.region = region + self.add_start(start) logging_check = VerifyHealthCheckLoggingEnabled() + logging_check.project_id = project_id + logging_check.backend_service_name = backend_service_name + logging_check.region = region self.add_step(parent=start, child=logging_check) port_check = ValidateBackendServicePortConfiguration() + port_check.project_id = project_id + port_check.backend_service_name = backend_service_name + port_check.region = region self.add_step(parent=start, child=port_check) protocol_check = ValidateBackendServiceProtocolConfiguration() + protocol_check.project_id = project_id + protocol_check.backend_service_name = backend_service_name + protocol_check.region = region self.add_step(parent=start, child=protocol_check) firewall_check = VerifyFirewallRules() + firewall_check.project_id = project_id + firewall_check.backend_service_name = backend_service_name + firewall_check.region = region self.add_step(parent=start, child=firewall_check) vm_performance_check = CheckVmPerformance() + vm_performance_check.project_id = project_id + vm_performance_check.backend_service_name = backend_service_name + vm_performance_check.region = region + self.add_step(parent=start, child=vm_performance_check) # Ending your runbook @@ -120,54 +144,54 @@ class UnhealthyBackendsStart(runbook.StartStep): """Start step for Unhealthy Backends runbook.""" template = 'unhealthy_backends::confirmation' + project_id: str + backend_service_name: str + region: str @property def name(self): - region = op.get(flags.REGION, 'global') return (f'Analyze unhealthy backends for backend service' - f' "{op.get(flags.BACKEND_SERVICE_NAME)}" in region' - f' "{region}".') + f' "{self.backend_service_name}" in region' + f' "{self.region}".') def execute(self): """Checks the health of a specified load balancer's backends.""" - proj = crm.get_project(op.get(flags.PROJECT_ID)) + proj = crm.get_project(self.project_id) - if not apis.is_enabled(op.context.project_id, 'compute'): + if not apis.is_enabled(self.project_id, 'compute'): op.add_skipped(proj, reason='Compute API is not enabled') return # Early exit if Compute API is disabled try: - op.info(f'name: {op.get(flags.BACKEND_SERVICE_NAME)}, region:' - f" {op.get(flags.REGION, 'global')}") + op.info(f'name: {self.backend_service_name}, region:' + f' {self.region}') backend_service = lb.get_backend_service( - op.context.project_id, - op.get(flags.BACKEND_SERVICE_NAME), - op.get(flags.REGION, 'global'), + self.project_id, + self.backend_service_name, + self.region, ) except googleapiclient.errors.HttpError: op.add_skipped( proj, - reason=( - f'Backend service {op.get(flags.BACKEND_SERVICE_NAME)} does not' - f" exist in scope {op.get(flags.REGION, 'global')} or project" - f' {op.get(flags.PROJECT_ID)}'), + reason=(f'Backend service {self.backend_service_name} does not' + f' exist in scope {self.region} or project' + f' {self.project_id}'), ) return # Early exit if load balancer doesn't exist backend_health_statuses = lb.get_backend_service_health( - op.context.project_id, - op.get(flags.BACKEND_SERVICE_NAME), - op.get(flags.REGION), + self.project_id, + self.backend_service_name, + self.region, ) if not backend_health_statuses: op.add_skipped( proj, - reason=( - f'Backend service {op.get(flags.BACKEND_SERVICE_NAME)} does not' - f" have any backends in scope {op.get(flags.REGION, 'global')} or" - f' project {op.get(flags.PROJECT_ID)}'), + reason=(f'Backend service {self.backend_service_name} does not' + f' have any backends in scope {self.region} or' + f' project {self.project_id}'), ) return # Early exit if load balancer doesn't have any backends @@ -193,8 +217,8 @@ def execute(self): resource=backend_service, reason=op.prep_msg( op.FAILURE_REASON, - name=op.get(flags.BACKEND_SERVICE_NAME), - region=op.get(flags.REGION, 'global'), + name=self.backend_service_name, + region=self.region, detailed_reason=detailed_reason, ), remediation='', @@ -204,8 +228,8 @@ def execute(self): resource=backend_service, reason=op.prep_msg( op.SUCCESS_REASON, - name=op.get(flags.BACKEND_SERVICE_NAME), - region=op.get(flags.REGION, 'global'), + name=self.backend_service_name, + region=self.region, ), ) @@ -214,13 +238,15 @@ class CheckVmPerformance(runbook.CompositeStep): """Checks if the instances performance is degraded.""" template = 'unhealthy_backends::vm_performance' + project_id: str + backend_service_name: str + region: str @property def name(self): - region = op.get(flags.REGION, 'global') return (f'Check VMs performance for unhealthy backends in backend service' - f' "{op.get(flags.BACKEND_SERVICE_NAME)}" in region' - f' "{region}".') + f' "{self.backend_service_name}" in region' + f' "{self.region}".') def execute(self): """Checks if the VM performance is degraded. @@ -229,9 +255,9 @@ def execute(self): memory and cpu utilization are being checked. """ backend_health_statuses = lb.get_backend_service_health( - op.get(flags.PROJECT_ID), - op.get(flags.BACKEND_SERVICE_NAME), - op.get(flags.REGION), + self.project_id, + self.backend_service_name, + self.region, ) instances_to_analyze_by_group = {} @@ -276,33 +302,34 @@ class VerifyFirewallRules(runbook.Step): """Checks if firewall rules are configured correctly.""" template = 'unhealthy_backends::firewall_rules' + project_id: str + backend_service_name: str + region: str @property def name(self): - region = op.get(flags.REGION, 'global') return (f'Verify firewall rules allow health checks for backend service' - f' "{op.get(flags.BACKEND_SERVICE_NAME)}" in region' - f' "{region}".') + f' "{self.backend_service_name}" in region' + f' "{self.region}".') def execute(self): """Checks if firewall rules are configured correctly.""" - if not apis.is_enabled(op.context.project_id, 'recommender'): + if not apis.is_enabled(self.project_id, 'recommender'): op.add_skipped( - crm.get_project(op.context.project_id), + crm.get_project(self.project_id), reason=( 'Checking firewall rules requires Recommender API to be enabled'), ) return # Early exit if Recommender API is disabled backend_service = lb.get_backend_service( - op.context.project_id, - op.get(flags.BACKEND_SERVICE_NAME), - op.get(flags.REGION), + self.project_id, + self.backend_service_name, + self.region, ) used_by_refs = backend_service.used_by_refs - insights = lb.get_lb_insights_for_a_project(op.context.project_id, - op.get(flags.REGION, 'global')) + insights = lb.get_lb_insights_for_a_project(self.project_id, self.region) for insight in insights: if insight.is_firewall_rule_insight and insight.details.get( 'loadBalancerUri'): @@ -341,32 +368,33 @@ class ValidateBackendServicePortConfiguration(runbook.Step): """Checks if health check sends probe requests to the different port than serving port.""" template = 'unhealthy_backends::port_mismatch' + project_id: str + backend_service_name: str + region: str @property def name(self): - region = op.get(flags.REGION, 'global') return (f'Validate port configuration for backend service' - f' "{op.get(flags.BACKEND_SERVICE_NAME)}" in region' - f' "{region}".') + f' "{self.backend_service_name}" in region' + f' "{self.region}".') def execute(self): """Checks if health check sends probe requests to the different port than serving port.""" - if not apis.is_enabled(op.context.project_id, 'recommender'): + if not apis.is_enabled(self.project_id, 'recommender'): op.add_skipped( - crm.get_project(op.context.project_id), + crm.get_project(self.project_id), reason=('Checking port configuration requires Recommender API to be' ' enabled'), ) return # Early exit if Recommender API is disabled backend_service = lb.get_backend_service( - op.context.project_id, - op.get(flags.BACKEND_SERVICE_NAME), - op.get(flags.REGION), + self.project_id, + self.backend_service_name, + self.region, ) - igs = gce.get_instance_groups(op.context) - insights = lb.get_lb_insights_for_a_project(op.context.project_id, - op.get(flags.REGION, 'global')) + igs = gce.get_instance_groups(models.Context(project_id=self.project_id)) + insights = lb.get_lb_insights_for_a_project(self.project_id, self.region) for insight in insights: if insight.is_health_check_port_mismatch_insight: for info in insight.details.get('backendServiceInfos'): @@ -419,25 +447,27 @@ class ValidateBackendServiceProtocolConfiguration(runbook.Step): """Checks if health check uses the same protocol as backend service for serving traffic.""" template = 'unhealthy_backends::protocol_mismatch' + project_id: str + backend_service_name: str + region: str @property def name(self): - region = op.get(flags.REGION, 'global') return (f'Validate protocol configuration for backend service' - f' "{op.get(flags.BACKEND_SERVICE_NAME)}" in region' - f' "{region}".') + f' "{self.backend_service_name}" in region' + f' "{self.region}".') def execute(self): """Checks if health check uses the same protocol as backend service for serving traffic.""" backend_service = lb.get_backend_service( - op.context.project_id, - op.get(flags.BACKEND_SERVICE_NAME), - op.get(flags.REGION), + self.project_id, + self.backend_service_name, + self.region, ) health_check = gce.get_health_check( - op.context.project_id, + self.project_id, backend_service.health_check, backend_service.region, ) @@ -472,16 +502,25 @@ class VerifyHealthCheckLoggingEnabled(runbook.Gateway): """Check if health check logging is enabled.""" template = 'unhealthy_backends::logging_enabled' + project_id: str + backend_service_name: str + region: str + + @property + def name(self): + return (f'Verify health check logging enabled for backend service' + f' "{self.backend_service_name}" in region' + f' "{self.region}".') def execute(self): """Check if health check logging is enabled.""" backend_service = lb.get_backend_service( - op.context.project_id, - op.get(flags.BACKEND_SERVICE_NAME), - op.get(flags.REGION), + self.project_id, + self.backend_service_name, + self.region, ) health_check = gce.get_health_check( - op.context.project_id, + self.project_id, backend_service.health_check, backend_service.region, ) @@ -491,12 +530,21 @@ def execute(self): health_check, reason=op.prep_msg(op.SUCCESS_REASON, hc_url=health_check.full_path), ) - self.add_child(AnalyzeLatestHealthCheckLog()) - self.add_child(CheckPastHealthCheckSuccess()) + analyze_latest_hc_log = AnalyzeLatestHealthCheckLog() + analyze_latest_hc_log.project_id = self.project_id + analyze_latest_hc_log.backend_service_name = self.backend_service_name + analyze_latest_hc_log.region = self.region + self.add_child(analyze_latest_hc_log) + + check_past_hc_success = CheckPastHealthCheckSuccess() + check_past_hc_success.project_id = self.project_id + check_past_hc_success.backend_service_name = self.backend_service_name + check_past_hc_success.region = self.region + self.add_child(check_past_hc_success) else: additional_flags = '' - if op.get(flags.REGION): - additional_flags = f'--region={op.get(flags.REGION)} ' + if self.region != 'global': + additional_flags = f'--region={self.region} ' op.add_uncertain( backend_service, reason=op.prep_msg(op.UNCERTAIN_REASON, @@ -514,18 +562,28 @@ class AnalyzeLatestHealthCheckLog(runbook.Gateway): """Look for the latest health check logs and based on that decide what to do next.""" template = 'unhealthy_backends::health_check_log' + project_id: str + backend_service_name: str + region: str + + @property + def name(self): + + return (f'Analyze latest health check log for backend service' + f' "{self.backend_service_name}" in region' + f' "{self.region}".') def execute(self): """Look for the latest health check logs and based on that decide what to do next.""" backend_service = lb.get_backend_service( - op.context.project_id, - op.get(flags.BACKEND_SERVICE_NAME), - op.get(flags.REGION), + self.project_id, + self.backend_service_name, + self.region, ) health_checks_states = lb.get_backend_service_health( - op.context.project_id, - op.get(flags.BACKEND_SERVICE_NAME), - op.get(flags.REGION), + self.project_id, + self.backend_service_name, + self.region, ) # Find all groups that have at least one unhealthy instance @@ -553,27 +611,26 @@ def execute(self): filter_str = """resource.type="gce_instance_group" log_name="projects/{}/logs/compute.googleapis.com%2Fhealthchecks" resource.labels.instance_group_name="{}" - resource.labels.location=~{} + resource.labels.location=~"{}" jsonPayload.healthCheckProbeResult.healthState="UNHEALTHY" - """.format(op.get(flags.PROJECT_ID), resource_name, - location) + """.format(self.project_id, resource_name, location) elif resource_type == 'networkEndpointGroups': network_endpoint_group = _get_zonal_network_endpoint_group( - op.get(flags.PROJECT_ID), location, resource_name) + self.project_id, location, resource_name) if network_endpoint_group: filter_str = """resource.type="gce_network_endpoint_group" log_name="projects/{}/logs/compute.googleapis.com%2Fhealthchecks" resource.labels.network_endpoint_group_id="{}" resource.labels.zone={} jsonPayload.healthCheckProbeResult.healthState="UNHEALTHY" - """.format(op.get(flags.PROJECT_ID), - network_endpoint_group.id, location) + """.format(self.project_id, network_endpoint_group.id, + location) else: op.add_skipped( resource=backend_service, reason=( f'Network endpoint group {resource_name} in zone {location} ' - f'does not exist in project {op.get(flags.PROJECT_ID)}'), + f'does not exist in project {self.project_id}'), ) continue else: @@ -581,15 +638,16 @@ def execute(self): resource=backend_service, reason=(f'Unsupported resource type {resource_type} for group' f' {group} in backend service' - f' {op.get(flags.BACKEND_SERVICE_NAME)} in scope' - f" {op.get(flags.REGION, 'global')}"), + f' {self.backend_service_name} in scope' + f' {self.region}'), ) continue serial_log_entries = logs.realtime_query( - project_id=op.get(flags.PROJECT_ID), + project_id=self.project_id, filter_str=filter_str, start_time=datetime.now() - timedelta(days=14), end_time=datetime.now(), + disable_paging=True, ) if serial_log_entries: @@ -608,14 +666,25 @@ def execute(self): if detailed_health_states.get('TIMEOUT'): timeout_hc_log_step = AnalyzeTimeoutHealthCheckLog() + timeout_hc_log_step.project_id = self.project_id + timeout_hc_log_step.backend_service_name = self.backend_service_name + timeout_hc_log_step.region = self.region timeout_hc_log_step.logs = detailed_health_states.get('TIMEOUT') self.add_child(timeout_hc_log_step) if detailed_health_states.get('UNHEALTHY'): unhealthy_hc_log_step = AnalyzeUnhealthyHealthCheckLog() + unhealthy_hc_log_step.project_id = self.project_id + unhealthy_hc_log_step.backend_service_name = self.backend_service_name + unhealthy_hc_log_step.region = self.region unhealthy_hc_log_step.logs = detailed_health_states.get('UNHEALTHY') self.add_child(unhealthy_hc_log_step) if detailed_health_states.get('UNKNOWN'): - self.add_child(AnalyzeUnknownHealthCheckLog()) + unknown_hc_log_step = AnalyzeUnknownHealthCheckLog() + unknown_hc_log_step.project_id = self.project_id + unknown_hc_log_step.backend_service_name = self.backend_service_name + unknown_hc_log_step.region = self.region + + self.add_child(unknown_hc_log_step) class AnalyzeTimeoutHealthCheckLog(runbook.Step): @@ -623,20 +692,22 @@ class AnalyzeTimeoutHealthCheckLog(runbook.Step): logs: list[dict] template = 'unhealthy_backends::timeout_hc_state_log' + project_id: str + backend_service_name: str + region: str @property def name(self): - region = op.get(flags.REGION, 'global') return (f'Analyze TIMEOUT health check logs for backend service' - f' "{op.get(flags.BACKEND_SERVICE_NAME)}" in region' - f' "{region}".') + f' "{self.backend_service_name}" in region' + f' "{self.region}".') def execute(self): """Analyzes logs with the detailed health check state TIMEOUT""" backend_service = lb.get_backend_service( - op.context.project_id, - op.get(flags.BACKEND_SERVICE_NAME), - op.get(flags.REGION), + self.project_id, + self.backend_service_name, + self.region, ) if not self.logs: @@ -647,7 +718,7 @@ def execute(self): return health_check = gce.get_health_check( - op.context.project_id, + self.project_id, backend_service.health_check, backend_service.region, ) @@ -684,21 +755,23 @@ class AnalyzeUnhealthyHealthCheckLog(runbook.Step): template = 'unhealthy_backends::unhealthy_hc_state_log' logs: list[dict] + project_id: str + backend_service_name: str + region: str @property def name(self): - region = op.get(flags.REGION, 'global') return (f'Analyze UNHEALTHY health check logs for backend service' - f' "{op.get(flags.BACKEND_SERVICE_NAME)}" in region' - f' "{region}".') + f' "{self.backend_service_name}" in region' + f' "{self.region}".') def execute(self): """Analyzes logs with detailed health state UNHEALTHY.""" backend_service = lb.get_backend_service( - op.context.project_id, - op.get(flags.BACKEND_SERVICE_NAME), - op.get(flags.REGION), + self.project_id, + self.backend_service_name, + self.region, ) if not self.logs: @@ -709,7 +782,7 @@ def execute(self): return health_check = gce.get_health_check( - op.context.project_id, + self.project_id, backend_service.health_check, backend_service.region, ) @@ -742,20 +815,22 @@ class AnalyzeUnknownHealthCheckLog(runbook.Step): """Analyze logs with detailed health state UNKNOWN.""" template = 'unhealthy_backends::unknown_hc_state_log' + project_id: str + backend_service_name: str + region: str @property def name(self): - region = op.get(flags.REGION, 'global') return (f'Analyze UNKNOWN health check logs for backend service' - f' "{op.get(flags.BACKEND_SERVICE_NAME)}" in region' - f' "{region}".') + f' "{self.backend_service_name}" in region' + f' "{self.region}".') def execute(self): """Analyze logs with detailed health state UNKNOWN.""" backend_service = lb.get_backend_service( - op.context.project_id, - op.get(flags.BACKEND_SERVICE_NAME), - op.get(flags.REGION), + self.project_id, + self.backend_service_name, + self.region, ) op.add_uncertain( backend_service, @@ -769,26 +844,28 @@ class CheckPastHealthCheckSuccess(runbook.Step): """Checks if the health check has worked successfully in the past.""" template = 'unhealthy_backends::past_hc_success' + project_id: str + backend_service_name: str + region: str @property def name(self): - region = op.get(flags.REGION, 'global') return (f'Check past health check success for backend service' - f' "{op.get(flags.BACKEND_SERVICE_NAME)}" in region' - f' "{region}".') + f' "{self.backend_service_name}" in region' + f' "{self.region}".') def execute(self): """Checks if the health check has worked successfully in the past.""" backend_service = lb.get_backend_service( - op.get(flags.PROJECT_ID), - op.get(flags.BACKEND_SERVICE_NAME), - op.get(flags.REGION), + self.project_id, + self.backend_service_name, + self.region, ) health_checks_states = lb.get_backend_service_health( - op.context.project_id, - op.get(flags.BACKEND_SERVICE_NAME), - op.get(flags.REGION), + self.project_id, + self.backend_service_name, + self.region, ) unhealthy_groups = { @@ -818,10 +895,10 @@ def execute(self): jsonPayload.healthCheckProbeResult.previousHealthState="HEALTHY" jsonPayload.healthCheckProbeResult.detailedHealthState="TIMEOUT" OR "UNHEALTHY" OR "UNKNOWN" """.format( - op.get(flags.PROJECT_ID), resource_name, location) + self.project_id, resource_name, location) elif resource_type == 'networkEndpointGroups': network_endpoint_group = _get_zonal_network_endpoint_group( - op.get(flags.PROJECT_ID), location, resource_name) + self.project_id, location, resource_name) if network_endpoint_group: filter_str = """resource.type="gce_network_endpoint_group" log_name="projects/{}/logs/compute.googleapis.com%2Fhealthchecks" @@ -830,13 +907,13 @@ def execute(self): jsonPayload.healthCheckProbeResult.previousHealthState="HEALTHY" jsonPayload.healthCheckProbeResult.detailedHealthState="TIMEOUT" OR "UNHEALTHY" OR "UNKNOWN" """.format( - op.get(flags.PROJECT_ID), network_endpoint_group.id, location) + self.project_id, network_endpoint_group.id, location) else: op.add_skipped( resource=group, reason=( f'Network endpoint group {resource_name} in zone {location} ' - f'does not exist in project {op.get(flags.PROJECT_ID)}'), + f'does not exist in project {self.project_id}'), ) continue else: @@ -844,16 +921,17 @@ def execute(self): resource=group, reason=(f'Unsupported resource type {resource_type} for group' f' {group} in backend service' - f' {op.get(flags.BACKEND_SERVICE_NAME)} in scope' - f" {op.get(flags.REGION, 'global')}"), + f' {self.backend_service_name} in scope' + f' {self.region}'), ) continue serial_log_entries = logs.realtime_query( - project_id=op.get(flags.PROJECT_ID), + project_id=self.project_id, filter_str=filter_str, start_time=datetime.now() - timedelta(days=14), end_time=datetime.now(), + disable_paging=True, ) if serial_log_entries: @@ -895,11 +973,12 @@ def execute(self): """Finalize unhealthy backends diagnostics.""" if not config.get(flags.INTERACTIVE_MODE): region = op.get(flags.REGION, 'global') + backend_service_name = op.get(flags.BACKEND_SERVICE_NAME) response = op.prompt( kind=op.CONFIRMATION, message=( 'Are you still experiencing health check issues on the backend' - f' service {op.get(flags.BACKEND_SERVICE_NAME)} in region' + f' service {backend_service_name} in region' f' {region}?'), choice_msg='Enter an option: ', )