Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 2542: add message to stop DR based partition before volume destruction #2863

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
3 changes: 3 additions & 0 deletions cloud/blockstore/config/storage.proto
Original file line number Diff line number Diff line change
Expand Up @@ -1103,4 +1103,7 @@ message TStorageServiceConfig
optional uint32 ForcedCompactionRangeCountPerRun = 401;

optional bool YdbViewerServiceEnabled = 402;

// Timeout for TDestroyVolumeActor (in milliseconds)
optional uint32 DestroyVolumeTimeout = 403;
}
4 changes: 4 additions & 0 deletions cloud/blockstore/libs/storage/api/volume.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ namespace NCloud::NBlockStore::NStorage {
xxx(UpdateVolumeParams, __VA_ARGS__) \
xxx(ChangeStorageConfig, __VA_ARGS__) \
xxx(GetStorageConfig, __VA_ARGS__) \
xxx(GracefulShutdown, __VA_ARGS__) \

// BLOCKSTORE_VOLUME_REQUESTS

Expand Down Expand Up @@ -331,6 +332,9 @@ struct TEvVolume
EvGetStorageConfigRequest = EvBegin + 58,
EvGetStorageConfigResponse = EvBegin + 59,

EvGracefulShutdownRequest = EvBegin + 60,
EvGracefulShutdownResponse = EvBegin + 61,

EvEnd
};

Expand Down
2 changes: 2 additions & 0 deletions cloud/blockstore/libs/storage/core/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -527,6 +527,8 @@ TDuration MSeconds(ui32 value)
xxx(EnableToChangeStatesFromDiskRegistryMonpage, bool, false )\
xxx(EnableToChangeErrorStatesFromDiskRegistryMonpage, bool, false )\
xxx(CalculateSplittedUsedQuotaMetric, bool, false )\
\
xxx(DestroyVolumeTimeout, TDuration, Seconds(30) )\
// BLOCKSTORE_STORAGE_CONFIG_RW

#define BLOCKSTORE_STORAGE_CONFIG(xxx) \
Expand Down
1 change: 1 addition & 0 deletions cloud/blockstore/libs/storage/core/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -631,6 +631,7 @@ class TStorageConfig
[[nodiscard]] bool GetCalculateSplittedUsedQuotaMetric() const;

bool GetYdbViewerServiceEnabled() const;
[[nodiscard]] TDuration GetDestroyVolumeTimeout() const;
};

ui64 GetAllocationUnit(
Expand Down
22 changes: 22 additions & 0 deletions cloud/blockstore/libs/storage/protos/volume.proto
Original file line number Diff line number Diff line change
Expand Up @@ -652,3 +652,25 @@ message TGetStorageConfigResponse
// Result Storage config.
NProto.TStorageServiceConfig StorageConfig = 3;
}

////////////////////////////////////////////////////////////////////////////////
// GracefulShutdown request/response.

// Asks a volume to stop its partitions ahead of volume destruction.
message TGracefulShutdownRequest
{
// Optional request headers.
THeaders Headers = 1;

// Label of the volume to shut down gracefully (TDestroyVolumeActor sets it
// to the id of the disk being destroyed).
string DiskId = 2;
}

// Reply to TGracefulShutdownRequest; carries only the status and traces.
message TGracefulShutdownResponse
{
// Optional error, set only if error happened.
NCloud.NProto.TError Error = 1;

// Request traces.
NCloud.NProto.TTraceInfo Trace = 2;
}
85 changes: 78 additions & 7 deletions cloud/blockstore/libs/storage/service/service_actor_destroy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ class TDestroyVolumeActor final
const bool DestroyIfBroken;
const bool Sync;
const ui64 FillGeneration;
const TDuration Timeout;

bool IsDiskRegistryBased = false;
bool VolumeNotFoundInSS = false;
Expand All @@ -47,7 +48,8 @@ class TDestroyVolumeActor final
TString diskId,
bool destroyIfBroken,
bool sync,
ui64 fillGeneration);
ui64 fillGeneration,
TDuration timeout);

void Bootstrap(const TActorContext& ctx);

Expand All @@ -57,6 +59,7 @@ class TDestroyVolumeActor final
void NotifyDiskRegistry(const TActorContext& ctx);
void StatVolume(const TActorContext& ctx);
void DeallocateDisk(const TActorContext& ctx);
void GracefulShutdown(const TActorContext& ctx);
NProto::TError CheckIfDestructionIsAllowed() const;

void HandleModifyResponse(
Expand All @@ -79,6 +82,15 @@ class TDestroyVolumeActor final
const TEvDiskRegistry::TEvDeallocateDiskResponse::TPtr& ev,
const TActorContext& ctx);

void HandleGracefulShutdownResponse(
const TEvVolume::TEvGracefulShutdownResponse::TPtr&
ev,
const TActorContext& ctx);

void HandleTimeout(
const TEvents::TEvWakeup::TPtr& ev,
const TActorContext& ctx);

void ReplyAndDie(const TActorContext& ctx, NProto::TError error);

private:
Expand All @@ -95,7 +107,8 @@ TDestroyVolumeActor::TDestroyVolumeActor(
TString diskId,
bool destroyIfBroken,
bool sync,
ui64 fillGeneration)
ui64 fillGeneration,
TDuration timeout)
: Sender(sender)
, Cookie(cookie)
, AttachedDiskDestructionTimeout(attachedDiskDestructionTimeout)
Expand All @@ -105,10 +118,12 @@ TDestroyVolumeActor::TDestroyVolumeActor(
, DestroyIfBroken(destroyIfBroken)
, Sync(sync)
, FillGeneration(fillGeneration)
, Timeout(timeout)
{}

void TDestroyVolumeActor::Bootstrap(const TActorContext& ctx)
{
ctx.Schedule(Timeout, new TEvents::TEvWakeup());
if (DestroyIfBroken) {
WaitReady(ctx);
} else {
Expand Down Expand Up @@ -180,6 +195,13 @@ void TDestroyVolumeActor::DeallocateDisk(const TActorContext& ctx)
NCloud::Send(ctx, MakeDiskRegistryProxyServiceId(), std::move(request));
}

void TDestroyVolumeActor::GracefulShutdown(const TActorContext& ctx)
{
auto request = std::make_unique<TEvVolume::TEvGracefulShutdownRequest>();
request->Record.SetDiskId(DiskId);
NCloud::Send(ctx, MakeVolumeProxyServiceId(), std::move(request));
}

NProto::TError TDestroyVolumeActor::CheckIfDestructionIsAllowed() const
{
const auto& prefixes = DestructionAllowedOnlyForDisksWithIdPrefixes;
Expand Down Expand Up @@ -270,9 +292,16 @@ void TDestroyVolumeActor::HandleMarkDiskForCleanupResponse(

// disk is broken and will be removed by DR at some point
if (error.GetCode() == E_NOT_FOUND) {
LOG_INFO(ctx, TBlockStoreComponents::SERVICE,
"volume %s not found in registry", DiskId.Quote().data());
} else if (HasError(error)) {
LOG_INFO(
ctx,
TBlockStoreComponents::SERVICE,
"volume %s not found in registry",
DiskId.Quote().data());
DestroyVolume(ctx);
return;
}

if (HasError(error)) {
LOG_ERROR(ctx, TBlockStoreComponents::SERVICE,
"Volume %s: unable to notify DR about disk destruction: %s",
DiskId.Quote().data(),
Expand All @@ -282,7 +311,7 @@ void TDestroyVolumeActor::HandleMarkDiskForCleanupResponse(
return;
}

DestroyVolume(ctx);
GracefulShutdown(ctx);
}

void TDestroyVolumeActor::HandleDeallocateDiskResponse(
Expand Down Expand Up @@ -383,6 +412,41 @@ void TDestroyVolumeActor::HandleStatVolumeResponse(
}
}

// Continues the destruction pipeline once the volume has answered the
// GracefulShutdown request: on success the volume is destroyed, on failure
// the error is logged and returned to the original requester.
void TDestroyVolumeActor::HandleGracefulShutdownResponse(
    const TEvVolume::TEvGracefulShutdownResponse::TPtr& ev,
    const TActorContext& ctx)
{
    // Copy the error so it can be moved into ReplyAndDie below.
    auto shutdownError = ev->Get()->GetError();

    if (!HasError(shutdownError)) {
        DestroyVolume(ctx);
        return;
    }

    LOG_ERROR(
        ctx,
        TBlockStoreComponents::SERVICE,
        "Volume %s: unable to gracefully stop volume: %s",
        DiskId.Quote().data(),
        FormatError(shutdownError).data());

    ReplyAndDie(ctx, std::move(shutdownError));
}

// Handles the TEvWakeup scheduled in Bootstrap: the whole destroy request
// did not finish within Timeout, so the actor replies with E_TIMEOUT and
// terminates.
void TDestroyVolumeActor::HandleTimeout(
    const TEvents::TEvWakeup::TPtr& ev,
    const TActorContext& ctx)
{
    Y_UNUSED(ev);

    // Include the disk id so the timed-out request can be identified in
    // logs, like the other log lines of this actor.
    LOG_ERROR(
        ctx,
        TBlockStoreComponents::SERVICE,
        "Volume %s: timeout destroy volume request",
        DiskId.Quote().data());

    ReplyAndDie(ctx, MakeError(E_TIMEOUT, "Timeout"));
}

void TDestroyVolumeActor::ReplyAndDie(
const TActorContext& ctx,
NProto::TError error)
Expand Down Expand Up @@ -412,6 +476,12 @@ STFUNC(TDestroyVolumeActor::StateWork)
TEvService::TEvStatVolumeResponse,
HandleStatVolumeResponse);

HFunc(
TEvVolume::TEvGracefulShutdownResponse,
HandleGracefulShutdownResponse);

HFunc(TEvents::TEvWakeup, HandleTimeout);

default:
HandleUnexpectedEvent(ev, TBlockStoreComponents::SERVICE);
break;
Expand Down Expand Up @@ -449,7 +519,8 @@ void TServiceActor::HandleDestroyVolume(
diskId,
destroyIfBroken,
sync,
fillGeneration);
fillGeneration,
Config->GetDestroyVolumeTimeout());
}

} // namespace NCloud::NBlockStore::NStorage
6 changes: 6 additions & 0 deletions cloud/blockstore/libs/storage/volume/testlib/test_env.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -551,6 +551,12 @@ TVolumeClient::CreateReadMetaHistoryRequest()
return std::make_unique<TEvVolumePrivate::TEvReadMetaHistoryRequest>();
}

std::unique_ptr<TEvVolume::TEvGracefulShutdownRequest>
TVolumeClient::CreateGracefulShutdownRequest()
{
return std::make_unique<TEvVolume::TEvGracefulShutdownRequest>();
}

void TVolumeClient::SendRemoteHttpInfo(
const TString& params,
HTTP_METHOD method)
Expand Down
3 changes: 3 additions & 0 deletions cloud/blockstore/libs/storage/volume/testlib/test_env.h
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,9 @@ class TVolumeClient
std::unique_ptr<TEvVolumePrivate::TEvReadMetaHistoryRequest>
CreateReadMetaHistoryRequest();

std::unique_ptr<TEvVolume::TEvGracefulShutdownRequest>
CreateGracefulShutdownRequest();

void SendRemoteHttpInfo(
const TString& params,
HTTP_METHOD method);
Expand Down
5 changes: 3 additions & 2 deletions cloud/blockstore/libs/storage/volume/volume_actor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1119,8 +1119,9 @@ STFUNC(TVolumeActor::StateZombie)

IgnoreFunc(TEvPartition::TEvWaitReadyResponse);

IgnoreFunc(TEvents::TEvPoisonPill);
IgnoreFunc(TEvents::TEvPoisonTaken);
HFunc(TEvents::TEvPoisonPill, HandlePoisonPill);
HFunc(TEvents::TEvPoisonTaken, HandlePoisonTaken);
HFunc(TEvTablet::TEvTabletStop, HandleTabletStop);

IgnoreFunc(TEvLocal::TEvTabletMetrics);

Expand Down
45 changes: 45 additions & 0 deletions cloud/blockstore/libs/storage/volume/volume_actor_startstop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,51 @@ void TVolumeActor::StartPartitionsForGc(const TActorContext& ctx)
PartitionsStartedReason = EPartitionsStartedReason::STARTED_FOR_GC;
}

// Handles TEvGracefulShutdownRequest: stops the disk-registry-based
// partition and moves the volume actor to the zombie state.
//
// If the volume has no disk-registry-based partition actor, the request is
// answered with E_NOT_IMPLEMENTED and nothing else happens. Otherwise the
// response is sent from the StopPartitions callback.
void TVolumeActor::HandleGracefulShutdown(
    const TEvVolume::TEvGracefulShutdownRequest::TPtr& ev,
    const TActorContext& ctx)
{
    if (!State->GetDiskRegistryBasedPartitionActor()) {
        LOG_ERROR(
            ctx,
            TBlockStoreComponents::VOLUME,
            "[%lu] GracefulShutdown request was send to "
            "not DR based disk",
            TabletID());

        NCloud::Reply(
            ctx,
            *ev,
            std::make_unique<TEvVolume::TEvGracefulShutdownResponse>(
                MakeError(E_NOT_IMPLEMENTED, "request not supported")));
        return;
    }

    LOG_INFO(
        ctx,
        TBlockStoreComponents::VOLUME,
        "[%lu] Stop Partition before volume destruction",
        TabletID());

    auto reqInfo =
        CreateRequestInfo(ev->Sender, ev->Cookie, ev->Get()->CallContext);

    // The callback owns the request info and sends the response.
    StopPartitions(
        ctx,
        [reqInfo = std::move(reqInfo)](const auto& ctx)
        {
            NCloud::Reply(
                ctx,
                *reqInfo,
                std::make_unique<TEvVolume::TEvGracefulShutdownResponse>());
        });

    // Tear down the remaining activity before switching state.
    TerminateTransactions(ctx);
    KillActors(ctx);
    CancelRequests(ctx);

    BecomeAux(ctx, STATE_ZOMBIE);
}

void TVolumeActor::StopPartitions(
const TActorContext& ctx,
TDiskRegistryBasedPartitionStoppedCallback onPartitionStopped)
Expand Down
51 changes: 51 additions & 0 deletions cloud/blockstore/libs/storage/volume/volume_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8148,6 +8148,57 @@ Y_UNIT_TEST_SUITE(TVolumeTest)
UNIT_ASSERT(partitionsStopped);
}

// Checks that TEvGracefulShutdownRequest makes the volume send a poison pill
// (stopping its DR based partition) and leaves the volume rejecting
// subsequent requests.
Y_UNIT_TEST(ShouldGracefulyShutdownVolume)
{
    auto runtime = PrepareTestActorRuntime();
    TVolumeClient volume(*runtime);

    // Set by the filter below whenever any TEvPoisonPill is observed.
    bool partitionsStopped = false;
    runtime->SetEventFilter(
        [&](TTestActorRuntimeBase&, TAutoPtr<IEventHandle>& event)
        {
            switch (event->GetTypeRewrite()) {
                // Poison pill send to DR based partition actor.
                case TEvents::TEvPoisonPill::EventType: {
                    partitionsStopped = true;
                    break;
                }
            }
            return false;
        });

    volume.UpdateVolumeConfig(
        // default arguments
        0,
        0,
        0,
        0,
        false,
        1,
        NCloud::NProto::STORAGE_MEDIA_SSD_NONREPLICATED,
        1024,
        "vol0",
        "cloud",
        "folder",
        1   // partitions count
    );
    volume.RebootTablet();

    // The filter matches every TEvPoisonPill, not only the one addressed to
    // the partition actor, and it has been active during the setup above.
    // NOTE(review): the tablet reboot presumably emits poison pills too -
    // confirm. Reset the flag so the assertion below can only be satisfied
    // by a pill observed during GracefulShutdown itself.
    partitionsStopped = false;

    volume.GracefulShutdown();
    UNIT_ASSERT(partitionsStopped);

    // Check that volume after TEvGracefulShutdownRequest
    // in zombie state and rejects requests.
    volume.SendGetVolumeInfoRequest();
    auto response = volume.RecvGetVolumeInfoResponse();
    UNIT_ASSERT_VALUES_EQUAL(response->GetStatus(), E_REJECTED);
}

Y_UNIT_TEST(ShouldReturnClientsAndHostnameInStatVolumeResponse)
{
auto runtime = PrepareTestActorRuntime();
Expand Down
Loading