Skip to content

Commit

Permalink
[CMK & Health State] Health Check refactoring (#3535)
Browse files Browse the repository at this point in the history
* Changes in CosmosHealthCheck.

* Adding the concept of Degrated to health checks.

* Update Healthcase Shared components to 7.0.20.
Update Health Check to follow the same standards as DICOM service.

* Pending call to AddCustomerKeyValidationBackgroundService.

* Adding more properties to IFhirRuntimeConfiguration.

* Updating error codes.
Reusing method to identify selected data store.

* Fix error codes in tests.
  • Loading branch information
fhibf authored Sep 29, 2023
1 parent 90c537e commit 2742a05
Show file tree
Hide file tree
Showing 11 changed files with 216 additions and 49 deletions.
3 changes: 2 additions & 1 deletion Directory.Packages.props
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<Project>
<!-- Shared dependencies versions.-->
<PropertyGroup>
<HealthcareSharedPackageVersion>7.0.15</HealthcareSharedPackageVersion>
<HealthcareSharedPackageVersion>7.0.20</HealthcareSharedPackageVersion>
<Hl7FhirVersion>4.3.0</Hl7FhirVersion>
</PropertyGroup>

Expand Down Expand Up @@ -94,6 +94,7 @@
<PackageVersion Include="Microsoft.Health.Api" Version="$(HealthcareSharedPackageVersion)" />
<PackageVersion Include="Microsoft.Health.Client" Version="$(HealthcareSharedPackageVersion)" />
<PackageVersion Include="Microsoft.Health.Core" Version="$(HealthcareSharedPackageVersion)" />
<PackageVersion Include="Microsoft.Health.Encryption" Version="$(HealthcareSharedPackageVersion)" />
<PackageVersion Include="Microsoft.Health.Extensions.BuildTimeCodeGenerator" Version="$(HealthcareSharedPackageVersion)" />
<PackageVersion Include="Microsoft.Health.Extensions.DependencyInjection" Version="$(HealthcareSharedPackageVersion)" />
<PackageVersion Include="Microsoft.Health.Fhir.Anonymizer.R4.Core" Version="3.1.0.62" />
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// -------------------------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
// -------------------------------------------------------------------------------------------------

namespace Microsoft.Health.Fhir.Core.Features.Health
{
public enum FhirHealthErrorCode
{
Error408, // External cancellation requested.
Error412, // Customer Managed Key is not available.
Error429, // Rate-limit exceptions.
Error500, // Not expected exceptions.
Error501, // Operation canceled.
}
}
21 changes: 19 additions & 2 deletions src/Microsoft.Health.Fhir.Core/Features/KnownDataStores.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,29 @@
// Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
// -------------------------------------------------------------------------------------------------

using System;
using EnsureThat;

namespace Microsoft.Health.Fhir.Core.Features
{
public static class KnownDataStores
{
public const string CosmosDb = "CosmosDb";
public const string CosmosDb = nameof(CosmosDb);

public const string SqlServer = nameof(SqlServer);

public static bool IsCosmosDbDataStore(string dataStoreName)
{
EnsureArg.IsNotNullOrWhiteSpace(dataStoreName, nameof(dataStoreName));

return string.Equals(CosmosDb, dataStoreName, StringComparison.OrdinalIgnoreCase);
}

public static bool IsSqlServerDataStore(string dataStoreName)
{
EnsureArg.IsNotNullOrWhiteSpace(dataStoreName, nameof(dataStoreName));

public const string SqlServer = "SqlServer";
return string.Equals(SqlServer, dataStoreName, StringComparison.OrdinalIgnoreCase);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ public async Task ExecuteAsync(ExportJobRecord exportJobRecord, WeakETag weakETa

string connectionHash = string.IsNullOrEmpty(_exportJobConfiguration.StorageAccountConnection) ?
string.Empty :
Health.Core.Extensions.StringExtensions.ComputeHash(_exportJobConfiguration.StorageAccountConnection);
Microsoft.Health.Core.Extensions.StringExtensions.ComputeHash(_exportJobConfiguration.StorageAccountConnection);

if (string.IsNullOrEmpty(exportJobRecord.StorageAccountUri))
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,18 @@
// Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
// -------------------------------------------------------------------------------------------------

using Microsoft.Health.Fhir.Core.Features;

namespace Microsoft.Health.Fhir.Core.Registration
{
public class AzureApiForFhirRuntimeConfiguration : IFhirRuntimeConfiguration
{
public string DataStore => KnownDataStores.CosmosDb;

public bool IsSelectiveSearchParameterSupported => false;

public bool IsExportBackgroundWorkedSupported => true;

public bool IsCustomerKeyValidationBackgroudWorkerSupported => false;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,18 @@
// Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
// -------------------------------------------------------------------------------------------------

using Microsoft.Health.Fhir.Core.Features;

namespace Microsoft.Health.Fhir.Core.Registration
{
public class AzureHealthDataServicesRuntimeConfiguration : IFhirRuntimeConfiguration
{
public string DataStore => KnownDataStores.SqlServer;

public bool IsSelectiveSearchParameterSupported => true;

public bool IsExportBackgroundWorkedSupported => false;

public bool IsCustomerKeyValidationBackgroudWorkerSupported => true;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,21 @@ namespace Microsoft.Health.Fhir.Core.Registration
{
public interface IFhirRuntimeConfiguration
{
string DataStore { get; }

/// <summary>
/// Selective Search Parameter.
/// </summary>
bool IsSelectiveSearchParameterSupported { get; }

/// <summary>
/// Export background worker.
/// </summary>
bool IsExportBackgroundWorkedSupported { get; }

/// <summary>
/// Customer Key Validation background worker keeps running and checking the health of customer managed key.
/// </summary>
bool IsCustomerKeyValidationBackgroudWorkerSupported { get; }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Options;
using Microsoft.Health.Abstractions.Exceptions;
using Microsoft.Health.Core.Features.Health;
using Microsoft.Health.Fhir.Core.Features.Health;
using Microsoft.Health.Fhir.CosmosDb.Configs;
using Microsoft.Health.Fhir.CosmosDb.Features.Storage;
using Microsoft.Health.Fhir.Tests.Common;
Expand Down Expand Up @@ -122,13 +124,16 @@ public async Task GivenCosmosAccessIsForbidden_IsClientCmkError_WhenHealthIsChec

HealthCheckResult result = await _healthCheck.CheckHealthAsync(new HealthCheckContext());

Assert.Equal(HealthStatus.Unhealthy, result.Status);
Assert.Contains("customer-managed key is not available", result.Description);
Assert.Equal(HealthStatus.Degraded, result.Status);

Assert.NotNull(result.Data);
Assert.True(result.Data.ContainsKey("IsCustomerManagedKeyError"));
Assert.True((bool)result.Data["IsCustomerManagedKeyError"]);
Assert.NotNull(result.Exception);
Assert.Equal(cosmosException, result.Exception);
Assert.True(result.Data.Any());

Assert.True(result.Data.ContainsKey("Reason"));
Assert.Equal(HealthStatusReason.CustomerManagedKeyAccessLost, result.Data["Reason"]);

Assert.True(result.Data.ContainsKey("Error"));
Assert.Equal(FhirHealthErrorCode.Error412.ToString(), result.Data["Error"]);
}
}

Expand All @@ -147,9 +152,12 @@ public async Task GivenCosmosAccessIsForbidden_IsNotClientCmkError_WhenHealthIsC
HealthCheckResult result = await _healthCheck.CheckHealthAsync(new HealthCheckContext());

Assert.Equal(HealthStatus.Unhealthy, result.Status);
Assert.DoesNotContain("customer-managed key is not available", result.Description);
Assert.False(result.Data.Any());
Assert.Null(result.Exception);

Assert.NotNull(result.Data);
Assert.True(result.Data.Any());

Assert.True(result.Data.ContainsKey("Error"));
Assert.Equal(FhirHealthErrorCode.Error500.ToString(), result.Data["Error"]);
}

[Fact]
Expand All @@ -160,8 +168,10 @@ public async Task GivenCosmosDbWithTooManyRequests_WhenHealthIsChecked_ThenHealt

HealthCheckResult result = await _healthCheck.CheckHealthAsync(new HealthCheckContext());

Assert.Equal(HealthStatus.Healthy, result.Status);
Assert.Contains("rate limit", result.Description);
Assert.Equal(HealthStatus.Degraded, result.Status);

Assert.True(result.Data.ContainsKey("Error"));
Assert.Equal(FhirHealthErrorCode.Error429.ToString(), result.Data["Error"]);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,20 @@
using Microsoft.Extensions.Diagnostics.HealthChecks;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using Microsoft.Health.Core.Features.Health;
using Microsoft.Health.Extensions.DependencyInjection;
using Microsoft.Health.Fhir.Core.Extensions;
using Microsoft.Health.Fhir.Core.Features.Health;
using Microsoft.Health.Fhir.CosmosDb.Configs;
using Microsoft.Health.Fhir.CosmosDb.Features.Storage;

namespace Microsoft.Health.Fhir.CosmosDb.Features.Health
{
public class CosmosHealthCheck : IHealthCheck
{
private const string UnhealthyDescription = "The store is unhealthy.";
private const string DegradedDescription = "The health of the store has degraded.";

private readonly IScoped<Container> _container;
private readonly CosmosDataStoreConfiguration _configuration;
private readonly CosmosCollectionConfiguration _cosmosCollectionConfiguration;
Expand Down Expand Up @@ -69,7 +74,7 @@ public async Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context
using (CancellationTokenSource operationTokenSource = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken, timeBasedTokenSource.Token))
{
await _testProvider.PerformTestAsync(_container.Value, _configuration, _cosmosCollectionConfiguration, operationTokenSource.Token);
return HealthCheckResult.Healthy("Successfully connected to the data store.");
return HealthCheckResult.Healthy("Successfully connected.");
}
}
catch (CosmosOperationCanceledException coce)
Expand All @@ -80,36 +85,94 @@ public async Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context

if (cancellationToken.IsCancellationRequested)
{
// Handling an extenal cancellation.
// No reasons to retry as the cancellation was external to the health check.

_logger.LogWarning(coce, "Failed to connect to the data store. External cancellation requested.");
return HealthCheckResult.Unhealthy("Failed to connect to the data store. External cancellation requested.");

return HealthCheckResult.Unhealthy(
description: UnhealthyDescription,
data: new Dictionary<string, object>
{
{ "Reason", HealthStatusReason.ServiceUnavailable },
{ "Error", FhirHealthErrorCode.Error408.ToString() },
});
}
else if (attempt >= maxNumberAttempts)
{
_logger.LogWarning(coce, "Failed to connect to the data store. There were {NumberOfAttempts} attempts to connect to the data store, but they suffered a '{ExceptionType}'.", attempt, nameof(CosmosOperationCanceledException));
return HealthCheckResult.Unhealthy("Failed to connect to the data store. Operation canceled.");
// This is a very rare situation. This condition indicates that multiple attempts to connect to the data store happened, but they were not successful.

_logger.LogWarning(
coce,
"Failed to connect to the data store. There were {NumberOfAttempts} attempts to connect to the data store, but they suffered a '{ExceptionType}'.",
attempt,
nameof(CosmosOperationCanceledException));

return HealthCheckResult.Unhealthy(
description: UnhealthyDescription,
data: new Dictionary<string, object>
{
{ "Reason", HealthStatusReason.ServiceUnavailable },
{ "Error", FhirHealthErrorCode.Error501.ToString() },
});
}
else
{
// Number of attempts not reached. Allow retry.
_logger.LogWarning(coce, "Failed to connect to the data store. Attempt {NumberOfAttempts}. '{ExceptionType}'.", attempt, nameof(CosmosOperationCanceledException));

_logger.LogWarning(
coce,
"Failed to connect to the data store. Attempt {NumberOfAttempts}. '{ExceptionType}'.",
attempt,
nameof(CosmosOperationCanceledException));
}
}
catch (CosmosException ex) when (ex.IsCmkClientError())
{
return HealthCheckResult.Unhealthy(
"Connection to the data store was unsuccesful because the client's customer-managed key is not available.",
exception: ex,
new Dictionary<string, object>() { { "IsCustomerManagedKeyError", true } });
// Handling CMK errors.

_logger.LogWarning(
ex,
"Connection to the data store was unsuccesful because the client's customer-managed key is not available.");

return HealthCheckResult.Degraded(
description: DegradedDescription,
data: new Dictionary<string, object>
{
{ "Reason", HealthStatusReason.CustomerManagedKeyAccessLost },
{ "Error", FhirHealthErrorCode.Error412.ToString() },
});
}
catch (Exception ex) when (ex.IsRequestRateExceeded())
{
return HealthCheckResult.Healthy("Connection to the data store was successful, however, the rate limit has been exceeded.");
// Handling request rate exceptions.

_logger.LogWarning(
ex,
"Failed to connect to the data store. Rate limit has been exceeded.");

return HealthCheckResult.Degraded(
description: DegradedDescription,
data: new Dictionary<string, object>
{
{ "Reason", HealthStatusReason.ServiceDegraded },
{ "Error", FhirHealthErrorCode.Error429.ToString() },
});
}
catch (Exception ex)
{
_logger.LogWarning(ex, "Failed to connect to the data store.");
// Handling other exceptions.

const string message = "Failed to connect to the data store.";
_logger.LogWarning(ex, message);

return HealthCheckResult.Unhealthy("Failed to connect to the data store.");
return HealthCheckResult.Unhealthy(
description: UnhealthyDescription,
data: new Dictionary<string, object>
{
{ "Reason", HealthStatusReason.ServiceUnavailable },
{ "Error", FhirHealthErrorCode.Error500.ToString() },
});
}
}
while (true);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,22 +121,23 @@ public static IFhirServerBuilder AddFhirServer(
/// <summary>
/// Adds background worker services.
/// </summary>
/// <param name="fhirServerBuilder">The FHIR server builder.</param>
/// <param name="addExportWorker">Whether to add the background worker for export jobs</param>
/// <param name="fhirServerBuilder">FHIR server builder.</param>
/// <param name="runtimeConfiguration">FHIR Runtime Configuration</param>
/// <returns>The builder.</returns>
public static IFhirServerBuilder AddBackgroundWorkers(
this IFhirServerBuilder fhirServerBuilder,
bool addExportWorker)
IFhirRuntimeConfiguration runtimeConfiguration)
{
EnsureArg.IsNotNull(fhirServerBuilder, nameof(fhirServerBuilder));
EnsureArg.IsNotNull(runtimeConfiguration, nameof(runtimeConfiguration));

if (addExportWorker)
fhirServerBuilder.Services.AddHostedService<ReindexJobWorkerBackgroundService>();

if (runtimeConfiguration.IsExportBackgroundWorkedSupported)
{
fhirServerBuilder.Services.AddHostedService<LegacyExportJobWorkerBackgroundService>();
}

fhirServerBuilder.Services.AddHostedService<ReindexJobWorkerBackgroundService>();

return fhirServerBuilder;
}

Expand Down
Loading

0 comments on commit 2742a05

Please sign in to comment.