diff --git a/cloud/blockstore/config/storage.proto b/cloud/blockstore/config/storage.proto index ad82a894ac..69ee1743ae 100644 --- a/cloud/blockstore/config/storage.proto +++ b/cloud/blockstore/config/storage.proto @@ -1107,4 +1107,7 @@ message TStorageServiceConfig // When enabled, tag "use-intermediate-write-buffer" will be added // after scrubbing finds a mismatch optional bool AutomaticallyEnableBufferCopyingAfterChecksumMismatch = 403; + + // Timeout for TDestroyVolumeActor (in milliseconds) + optional uint32 DestroyVolumeTimeout = 404; } diff --git a/cloud/blockstore/libs/storage/api/volume.h b/cloud/blockstore/libs/storage/api/volume.h index 1e2e18e6fd..515bf915ae 100644 --- a/cloud/blockstore/libs/storage/api/volume.h +++ b/cloud/blockstore/libs/storage/api/volume.h @@ -35,6 +35,7 @@ namespace NCloud::NBlockStore::NStorage { xxx(UpdateVolumeParams, __VA_ARGS__) \ xxx(ChangeStorageConfig, __VA_ARGS__) \ xxx(GetStorageConfig, __VA_ARGS__) \ + xxx(GracefulShutdown, __VA_ARGS__) \ // BLOCKSTORE_VOLUME_REQUESTS @@ -331,6 +332,9 @@ struct TEvVolume EvGetStorageConfigRequest = EvBegin + 58, EvGetStorageConfigResponse = EvBegin + 59, + EvGracefulShutdownRequest = EvBegin + 60, + EvGracefulShutdownResponse = EvBegin + 61, + EvEnd }; diff --git a/cloud/blockstore/libs/storage/core/config.cpp b/cloud/blockstore/libs/storage/core/config.cpp index 2df913d52b..aef73d58c6 100644 --- a/cloud/blockstore/libs/storage/core/config.cpp +++ b/cloud/blockstore/libs/storage/core/config.cpp @@ -528,6 +528,8 @@ TDuration MSeconds(ui32 value) xxx(EnableToChangeStatesFromDiskRegistryMonpage, bool, false )\ xxx(EnableToChangeErrorStatesFromDiskRegistryMonpage, bool, false )\ xxx(CalculateSplittedUsedQuotaMetric, bool, false )\ + \ + xxx(DestroyVolumeTimeout, TDuration, Seconds(30) )\ // BLOCKSTORE_STORAGE_CONFIG_RW #define BLOCKSTORE_STORAGE_CONFIG(xxx) \ diff --git a/cloud/blockstore/libs/storage/core/config.h b/cloud/blockstore/libs/storage/core/config.h index 440b68ef0a..9bf4c01659 
100644 --- a/cloud/blockstore/libs/storage/core/config.h +++ b/cloud/blockstore/libs/storage/core/config.h @@ -633,6 +633,7 @@ class TStorageConfig bool GetYdbViewerServiceEnabled() const; bool GetAutomaticallyEnableBufferCopyingAfterChecksumMismatch() const; + [[nodiscard]] TDuration GetDestroyVolumeTimeout() const; }; ui64 GetAllocationUnit( diff --git a/cloud/blockstore/libs/storage/protos/volume.proto b/cloud/blockstore/libs/storage/protos/volume.proto index dfae6356a6..5b98af3f8c 100644 --- a/cloud/blockstore/libs/storage/protos/volume.proto +++ b/cloud/blockstore/libs/storage/protos/volume.proto @@ -652,3 +652,24 @@ message TGetStorageConfigResponse // Result Storage config. NProto.TStorageServiceConfig StorageConfig = 3; } + +//////////////////////////////////////////////////////////////////////////////// +// GracefulShutdown request/response. Sent to a volume before it is destroyed. + +message TGracefulShutdownRequest +{ + // Optional request headers. + THeaders Headers = 1; + + // Label of the volume to shut down. + string DiskId = 2; +} + +message TGracefulShutdownResponse +{ + // Optional error, set only if error happened. + NCloud.NProto.TError Error = 1; + + // Request traces. 
+ NCloud.NProto.TTraceInfo Trace = 2; +} diff --git a/cloud/blockstore/libs/storage/service/service_actor_destroy.cpp b/cloud/blockstore/libs/storage/service/service_actor_destroy.cpp index b78674d6e2..f35d0ab831 100644 --- a/cloud/blockstore/libs/storage/service/service_actor_destroy.cpp +++ b/cloud/blockstore/libs/storage/service/service_actor_destroy.cpp @@ -34,6 +34,7 @@ class TDestroyVolumeActor final const bool DestroyIfBroken; const bool Sync; const ui64 FillGeneration; + const TDuration Timeout; bool IsDiskRegistryBased = false; bool VolumeNotFoundInSS = false; @@ -47,7 +48,8 @@ class TDestroyVolumeActor final TString diskId, bool destroyIfBroken, bool sync, - ui64 fillGeneration); + ui64 fillGeneration, + TDuration timeout); void Bootstrap(const TActorContext& ctx); @@ -57,6 +59,7 @@ class TDestroyVolumeActor final void NotifyDiskRegistry(const TActorContext& ctx); void StatVolume(const TActorContext& ctx); void DeallocateDisk(const TActorContext& ctx); + void GracefulShutdown(const TActorContext& ctx); NProto::TError CheckIfDestructionIsAllowed() const; void HandleModifyResponse( @@ -79,6 +82,15 @@ class TDestroyVolumeActor final const TEvDiskRegistry::TEvDeallocateDiskResponse::TPtr& ev, const TActorContext& ctx); + void HandleGracefulShutdownResponse( + const TEvVolume::TEvGracefulShutdownResponse::TPtr& + ev, + const TActorContext& ctx); + + void HandleTimeout( + const TEvents::TEvWakeup::TPtr& ev, + const TActorContext& ctx); + void ReplyAndDie(const TActorContext& ctx, NProto::TError error); private: @@ -95,7 +107,8 @@ TDestroyVolumeActor::TDestroyVolumeActor( TString diskId, bool destroyIfBroken, bool sync, - ui64 fillGeneration) + ui64 fillGeneration, + TDuration timeout) : Sender(sender) , Cookie(cookie) , AttachedDiskDestructionTimeout(attachedDiskDestructionTimeout) @@ -105,10 +118,12 @@ TDestroyVolumeActor::TDestroyVolumeActor( , DestroyIfBroken(destroyIfBroken) , Sync(sync) , FillGeneration(fillGeneration) + , Timeout(timeout) {} void 
TDestroyVolumeActor::Bootstrap(const TActorContext& ctx) { + ctx.Schedule(Timeout, new TEvents::TEvWakeup()); if (DestroyIfBroken) { WaitReady(ctx); } else { @@ -180,6 +195,13 @@ void TDestroyVolumeActor::DeallocateDisk(const TActorContext& ctx) NCloud::Send(ctx, MakeDiskRegistryProxyServiceId(), std::move(request)); } +void TDestroyVolumeActor::GracefulShutdown(const TActorContext& ctx) +{ + auto request = std::make_unique<TEvVolume::TEvGracefulShutdownRequest>(); + request->Record.SetDiskId(DiskId); + NCloud::Send(ctx, MakeVolumeProxyServiceId(), std::move(request)); // reply is handled by HandleGracefulShutdownResponse +} + NProto::TError TDestroyVolumeActor::CheckIfDestructionIsAllowed() const { const auto& prefixes = DestructionAllowedOnlyForDisksWithIdPrefixes; @@ -270,9 +292,16 @@ void TDestroyVolumeActor::HandleMarkDiskForCleanupResponse( // disk is broken and will be removed by DR at some point if (error.GetCode() == E_NOT_FOUND) { - LOG_INFO(ctx, TBlockStoreComponents::SERVICE, - "volume %s not found in registry", DiskId.Quote().data()); - } else if (HasError(error)) { + LOG_INFO( + ctx, + TBlockStoreComponents::SERVICE, + "volume %s not found in registry", + DiskId.Quote().data()); + DestroyVolume(ctx); + return; + } + + if (HasError(error)) { LOG_ERROR(ctx, TBlockStoreComponents::SERVICE, "Volume %s: unable to notify DR about disk destruction: %s", DiskId.Quote().data(), @@ -282,7 +311,7 @@ void TDestroyVolumeActor::HandleMarkDiskForCleanupResponse( return; } - DestroyVolume(ctx); + GracefulShutdown(ctx); } void TDestroyVolumeActor::HandleDeallocateDiskResponse( @@ -383,6 +412,45 @@ void TDestroyVolumeActor::HandleStatVolumeResponse( } } +void TDestroyVolumeActor::HandleGracefulShutdownResponse( + const TEvVolume::TEvGracefulShutdownResponse::TPtr& ev, + const TActorContext& ctx) +{ + const auto* msg = ev->Get(); + + if (auto error = msg->GetError(); HasError(error)) { + LOG_ERROR( + ctx, + TBlockStoreComponents::SERVICE, + "Volume %s: unable to gracefully stop volume: %s", + DiskId.Quote().data(), + FormatError(error).data()); + 
ReplyAndDie(ctx, std::move(error)); + return; + } + + DestroyVolume(ctx); +} + +void TDestroyVolumeActor::HandleTimeout( + const TEvents::TEvWakeup::TPtr& ev, + const TActorContext& ctx) +{ + Y_UNUSED(ev); + + LOG_ERROR( + ctx, + TBlockStoreComponents::SERVICE, + "Timeout destroy volume request, diskId = %s, destroyIfBroken = %d, " + "sync = %d", + DiskId.Quote().data(), // %s requires a C string; a TString must not be passed through varargs + DestroyIfBroken, + Sync); + + ReplyAndDie(ctx, MakeError(E_TIMEOUT, "Timeout")); +} + void TDestroyVolumeActor::ReplyAndDie( const TActorContext& ctx, NProto::TError error) @@ -412,6 +480,12 @@ STFUNC(TDestroyVolumeActor::StateWork) TEvService::TEvStatVolumeResponse, HandleStatVolumeResponse); + HFunc( + TEvVolume::TEvGracefulShutdownResponse, + HandleGracefulShutdownResponse); + + HFunc(TEvents::TEvWakeup, HandleTimeout); // wakeup is scheduled in Bootstrap + default: HandleUnexpectedEvent(ev, TBlockStoreComponents::SERVICE); break; @@ -449,7 +523,8 @@ void TServiceActor::HandleDestroyVolume( diskId, destroyIfBroken, sync, - fillGeneration); + fillGeneration, + Config->GetDestroyVolumeTimeout()); } } // namespace NCloud::NBlockStore::NStorage diff --git a/cloud/blockstore/libs/storage/volume/testlib/test_env.cpp b/cloud/blockstore/libs/storage/volume/testlib/test_env.cpp index 87f58a1fac..98be0f8bc8 100644 --- a/cloud/blockstore/libs/storage/volume/testlib/test_env.cpp +++ b/cloud/blockstore/libs/storage/volume/testlib/test_env.cpp @@ -551,6 +551,12 @@ TVolumeClient::CreateReadMetaHistoryRequest() return std::make_unique(); } +std::unique_ptr<TEvVolume::TEvGracefulShutdownRequest> +TVolumeClient::CreateGracefulShutdownRequest() +{ + return std::make_unique<TEvVolume::TEvGracefulShutdownRequest>(); +} + void TVolumeClient::SendRemoteHttpInfo( const TString& params, HTTP_METHOD method) diff --git a/cloud/blockstore/libs/storage/volume/testlib/test_env.h b/cloud/blockstore/libs/storage/volume/testlib/test_env.h index 6aaadd6f87..f3bef0a49b 100644 --- a/cloud/blockstore/libs/storage/volume/testlib/test_env.h +++ b/cloud/blockstore/libs/storage/volume/testlib/test_env.h @@ -460,6 +460,9 @@ class TVolumeClient std::unique_ptr 
CreateReadMetaHistoryRequest(); + std::unique_ptr<TEvVolume::TEvGracefulShutdownRequest> + CreateGracefulShutdownRequest(); + void SendRemoteHttpInfo( const TString& params, HTTP_METHOD method); diff --git a/cloud/blockstore/libs/storage/volume/volume_actor.cpp b/cloud/blockstore/libs/storage/volume/volume_actor.cpp index 2f5901fbd2..c2786e9cb9 100644 --- a/cloud/blockstore/libs/storage/volume/volume_actor.cpp +++ b/cloud/blockstore/libs/storage/volume/volume_actor.cpp @@ -1119,8 +1119,9 @@ STFUNC(TVolumeActor::StateZombie) IgnoreFunc(TEvPartition::TEvWaitReadyResponse); - IgnoreFunc(TEvents::TEvPoisonPill); - IgnoreFunc(TEvents::TEvPoisonTaken); + HFunc(TEvents::TEvPoisonPill, HandlePoisonPill); + HFunc(TEvents::TEvPoisonTaken, HandlePoisonTaken); + HFunc(TEvTablet::TEvTabletStop, HandleTabletStop); IgnoreFunc(TEvLocal::TEvTabletMetrics); diff --git a/cloud/blockstore/libs/storage/volume/volume_actor_startstop.cpp b/cloud/blockstore/libs/storage/volume/volume_actor_startstop.cpp index 002fe1e903..59b70fb3f6 100644 --- a/cloud/blockstore/libs/storage/volume/volume_actor_startstop.cpp +++ b/cloud/blockstore/libs/storage/volume/volume_actor_startstop.cpp @@ -346,6 +346,51 @@ void TVolumeActor::StartPartitionsForGc(const TActorContext& ctx) PartitionsStartedReason = EPartitionsStartedReason::STARTED_FOR_GC; } +void TVolumeActor::HandleGracefulShutdown( + const TEvVolume::TEvGracefulShutdownRequest::TPtr& ev, + const TActorContext& ctx) +{ + if (!State->GetDiskRegistryBasedPartitionActor()) { + LOG_ERROR( + ctx, + TBlockStoreComponents::VOLUME, + "[%lu] GracefulShutdown request was sent to " + "non-DR based disk", + TabletID()); + + NCloud::Reply( + ctx, + *ev, + std::make_unique<TEvVolume::TEvGracefulShutdownResponse>( + MakeError(E_NOT_IMPLEMENTED, "request is not supported"))); + return; + } + + LOG_INFO( + ctx, + TBlockStoreComponents::VOLUME, + "[%lu] Stop Partition before volume destruction", + TabletID()); + + auto reqInfo = + CreateRequestInfo(ev->Sender, ev->Cookie, ev->Get()->CallContext); + StopPartitions( + ctx, + [reqInfo = 
std::move(reqInfo)](const auto& ctx) + { + NCloud::Reply( + ctx, + *reqInfo, + std::make_unique<TEvVolume::TEvGracefulShutdownResponse>()); // success reply, sent from the partition-stopped callback + }); + + TerminateTransactions(ctx); + KillActors(ctx); + CancelRequests(ctx); + + BecomeAux(ctx, STATE_ZOMBIE); +} + void TVolumeActor::StopPartitions( const TActorContext& ctx, TDiskRegistryBasedPartitionStoppedCallback onPartitionStopped) diff --git a/cloud/blockstore/libs/storage/volume/volume_ut.cpp b/cloud/blockstore/libs/storage/volume/volume_ut.cpp index f1c9f2287c..74db76b2d2 100644 --- a/cloud/blockstore/libs/storage/volume/volume_ut.cpp +++ b/cloud/blockstore/libs/storage/volume/volume_ut.cpp @@ -8148,6 +8148,57 @@ Y_UNIT_TEST_SUITE(TVolumeTest) UNIT_ASSERT(partitionsStopped); } + Y_UNIT_TEST(ShouldGracefulyShutdownVolume) + { + auto runtime = PrepareTestActorRuntime(); + TVolumeClient volume(*runtime); + + bool partitionsStopped = false; + runtime->SetEventFilter( + [&](TTestActorRuntimeBase&, TAutoPtr<IEventHandle>& event) + { + switch (event->GetTypeRewrite()) { + // Poison pill sent to DR based partition actor. + case TEvents::TEvPoisonPill::EventType: { + partitionsStopped = true; + break; + } + } + return false; + }); + + volume.UpdateVolumeConfig( + // default arguments + 0, + 0, + 0, + 0, + false, + 1, + NCloud::NProto::STORAGE_MEDIA_SSD_NONREPLICATED, + 1024, + "vol0", + "cloud", + "folder", + 1 // partitions count + ); + volume.RebootTablet(); + + auto clientInfo = CreateVolumeClientInfo( + NProto::VOLUME_ACCESS_READ_WRITE, + NProto::VOLUME_MOUNT_LOCAL, + false); + + volume.GracefulShutdown(); + UNIT_ASSERT(partitionsStopped); + + // Check that the volume is in zombie state after TEvGracefulShutdownRequest + // and rejects requests. + volume.SendGetVolumeInfoRequest(); + auto response = volume.RecvGetVolumeInfoResponse(); + UNIT_ASSERT_VALUES_EQUAL(response->GetStatus(), E_REJECTED); + } + Y_UNIT_TEST(ShouldReturnClientsAndHostnameInStatVolumeResponse) { auto runtime = PrepareTestActorRuntime();