From 5dae14a2c2f1cfca0285f7082492a8e88ddfe5ac Mon Sep 17 00:00:00 2001 From: sebastianburckhardt Date: Fri, 27 Sep 2024 09:04:15 -0700 Subject: [PATCH 1/5] revise checkpoint injection state machine and add tracing --- .../StorageLayer/Faster/CheckpointInjector.cs | 69 +++++++++++++------ .../StorageLayer/Faster/StoreWorker.cs | 8 ++- 2 files changed, 52 insertions(+), 25 deletions(-) diff --git a/src/DurableTask.Netherite/StorageLayer/Faster/CheckpointInjector.cs b/src/DurableTask.Netherite/StorageLayer/Faster/CheckpointInjector.cs index c441df7a..87918e19 100644 --- a/src/DurableTask.Netherite/StorageLayer/Faster/CheckpointInjector.cs +++ b/src/DurableTask.Netherite/StorageLayer/Faster/CheckpointInjector.cs @@ -32,50 +32,75 @@ public CheckpointInjector(TestHooks testHooks) this.testHooks = testHooks; } - internal bool CheckpointDue(LogAccessor log, out StoreWorker.CheckpointTrigger trigger, out long? compactUntil) + internal bool CheckpointDue(LogAccessor log, out StoreWorker.CheckpointTrigger trigger, out long? compactUntil, FasterTraceHelper traceHelper) { if (this.handler != null) { try { + traceHelper.FasterProgress("CheckpointInjector: running handler"); + (trigger, compactUntil) = this.handler(log); - this.handler = null; + this.handler = null; // do not run the same handler again + + traceHelper.FasterProgress($"CheckpointInjector: trigger={trigger} compactUntil={compactUntil}"); if (trigger == StoreWorker.CheckpointTrigger.None) { - this.SequenceComplete(log); + this.SequenceComplete(log, traceHelper); } - } - catch(Exception e) + + return (trigger != StoreWorker.CheckpointTrigger.None); + } + catch (Exception e) { - this.continuation.SetException(e); - this.continuation = null; - throw; + traceHelper.FasterProgress($"CheckpointInjector: handler faulted: {e}"); + + if (this.continuation.TrySetException(e)) + { + traceHelper.FasterProgress("CheckpointInjector: handler continuation released with exception"); + } + else + { + traceHelper.FasterProgress("CheckpointInjector: handler continuation already progressed"); + } } } + + trigger = StoreWorker.CheckpointTrigger.None; + compactUntil = null; + return false; + } + + internal void SequenceComplete(LogAccessor log, FasterTraceHelper traceHelper) + { + traceHelper.FasterProgress("CheckpointInjector: sequence complete"); + + if (this.continuation.TrySetResult(log)) + { + traceHelper.FasterProgress("CheckpointInjector: handler continuation released"); + } else { - trigger = StoreWorker.CheckpointTrigger.None; - compactUntil = null; + traceHelper.FasterProgress("CheckpointInjector: handler continuation already progressed"); } - - return (trigger != StoreWorker.CheckpointTrigger.None); - } - - internal void SequenceComplete(LogAccessor log) - { - this.continuation?.SetResult(log); - this.continuation = null; } - internal void CompactionComplete(IPartitionErrorHandler errorHandler) + internal void CompactionComplete(IPartitionErrorHandler errorHandler, FasterTraceHelper traceHelper) { if (this.InjectFaultAfterCompaction) { errorHandler.HandleError("CheckpointInjector", "inject failure after compaction", null, true, false); - this.InjectFaultAfterCompaction = false; - this.continuation?.SetResult(null); - this.continuation = null; + this.InjectFaultAfterCompaction = false; // do not do this again unless requested again + + if (this.continuation.TrySetResult(null)) + { + traceHelper.FasterProgress("CheckpointInjector: handler continuation released"); + } + else + { + traceHelper.FasterProgress("CheckpointInjector: handler continuation already progressed"); + } } } diff --git a/src/DurableTask.Netherite/StorageLayer/Faster/StoreWorker.cs b/src/DurableTask.Netherite/StorageLayer/Faster/StoreWorker.cs index 99bb2f71..4b0238b9 100644 --- a/src/DurableTask.Netherite/StorageLayer/Faster/StoreWorker.cs +++ b/src/DurableTask.Netherite/StorageLayer/Faster/StoreWorker.cs @@ -345,7 +345,7 @@ bool CheckpointDue(out CheckpointTrigger trigger, out long? compactUntil) // in a test setting, let the test decide when to checkpoint or compact if (this.partition.Settings.TestHooks?.CheckpointInjector != null) { - return this.partition.Settings.TestHooks.CheckpointInjector.CheckpointDue((this.store as FasterKV).Log, out trigger, out compactUntil); + return this.partition.Settings.TestHooks.CheckpointInjector.CheckpointDue((this.store as FasterKV).Log, out trigger, out compactUntil, this.traceHelper); } trigger = CheckpointTrigger.None; @@ -446,11 +446,13 @@ async ValueTask RunCheckpointingStateMachine() GC.Collect(); this.traceHelper.FasterProgress("Checkpointing state machine: resetting to initial state"); + + this.partition.Settings.TestHooks?.CheckpointInjector?.SequenceComplete((this.store as FasterKV).Log, this.traceHelper); + // we have reached the end of the state machine transitions this.pendingStoreCheckpoint = null; this.pendingCheckpointTrigger = CheckpointTrigger.None; this.ScheduleNextIdleCheckpointTime(); - this.partition.Settings.TestHooks?.CheckpointInjector?.SequenceComplete((this.store as FasterKV).Log); } } else if (this.pendingIndexCheckpoint != null) @@ -672,7 +674,7 @@ protected override async Task Process(IList batch) if (target.HasValue) { target = await this.store.RunCompactionAsync(target.Value); - this.partition.Settings.TestHooks?.CheckpointInjector?.CompactionComplete(this.partition.ErrorHandler); + this.partition.Settings.TestHooks?.CheckpointInjector?.CompactionComplete(this.partition.ErrorHandler, this.traceHelper); } this.Notify(); From 0588bc00c0720a8dd630a4da338e3254b4b38c1b Mon Sep 17 00:00:00 2001 From: sebastianburckhardt Date: Fri, 27 Sep 2024 11:27:58 -0700 Subject: [PATCH 2/5] do not include timer tests in concurrent test scenario since timing is inherently nondeterministic --- test/DurableTask.Netherite.Tests/ConcurrentTestsFaster.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/DurableTask.Netherite.Tests/ConcurrentTestsFaster.cs b/test/DurableTask.Netherite.Tests/ConcurrentTestsFaster.cs index 7ed1c752..d5322fdf 100644 --- a/test/DurableTask.Netherite.Tests/ConcurrentTestsFaster.cs +++ b/test/DurableTask.Netherite.Tests/ConcurrentTestsFaster.cs @@ -109,7 +109,7 @@ async Task RunAsync() var tests = new List<(string, Task)>(); - foreach ((string name, Task task) in scenarios.StartAllScenarios(includeTimers: true, includeLarge: true)) + foreach ((string name, Task task) in scenarios.StartAllScenarios(includeTimers: false, includeLarge: true)) { Trace.WriteLine($"TestProgress: Adding {name}"); tests.Add((name, task)); From 38ee164b4dbe3c9fdee3141e91446f664a06bf0f Mon Sep 17 00:00:00 2001 From: sebastianburckhardt Date: Mon, 30 Sep 2024 08:52:24 -0700 Subject: [PATCH 3/5] fix version checking in CacheDebugger and FasterKV, to avoid false alarms that trip our unit tests --- .../StorageLayer/Faster/CacheDebugger.cs | 1 + .../StorageLayer/Faster/FasterKV.cs | 27 ++++++++++++++++--- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/src/DurableTask.Netherite/StorageLayer/Faster/CacheDebugger.cs b/src/DurableTask.Netherite/StorageLayer/Faster/CacheDebugger.cs index 6f39d6fc..5360c424 100644 --- a/src/DurableTask.Netherite/StorageLayer/Faster/CacheDebugger.cs +++ b/src/DurableTask.Netherite/StorageLayer/Faster/CacheDebugger.cs @@ -49,6 +49,7 @@ public enum CacheEvent PostCopyUpdate, SingleWriterUpsert, SingleWriterCopyToTail, + SingleWriterCopyToTailFromOutput, SingleWriterCopyToReadCache, SingleWriterCompaction, PostSingleWriterUpsert, diff --git a/src/DurableTask.Netherite/StorageLayer/Faster/FasterKV.cs b/src/DurableTask.Netherite/StorageLayer/Faster/FasterKV.cs index 1ae1b218..19057d28 100644 --- a/src/DurableTask.Netherite/StorageLayer/Faster/FasterKV.cs +++ b/src/DurableTask.Netherite/StorageLayer/Faster/FasterKV.cs @@ -1828,6 +1828,7 @@ public override void Serialize(ref Value obj) public struct Output { public object Val; + public int Version; public TrackedObject Read(FasterKV store, string eventId) { @@ -2024,6 +2025,7 @@ bool IFunctions.SingleReader(ref Key dst.Val = trackedObject; } + dst.Version = src.Version; this.stats.Read++; return true; } @@ -2054,12 +2056,15 @@ bool IFunctions.ConcurrentReader(ref } dst.Val = trackedObject; + dst.Version = value.Version; this.stats.Read++; return true; } bool IFunctions.SingleWriter(ref Key key, ref EffectTracker input, ref Value src, ref Value dst, ref Output output, ref UpsertInfo info, WriteReason reason) { + bool takeValueFromOutput = false; + switch (reason) { case WriteReason.Upsert: @@ -2076,7 +2081,15 @@ bool IFunctions.SingleWriter(ref Key break; case WriteReason.CopyToTail: - this.cacheDebugger?.Record(key.Val, CacheDebugger.CacheEvent.SingleWriterCopyToTail, src.Version, default, info.Address); + takeValueFromOutput = (output.Val != null); // we have observed that src.Val is null sometimes, so if present, we use output instead + if (takeValueFromOutput) + { + this.cacheDebugger?.Record(key.Val, CacheDebugger.CacheEvent.SingleWriterCopyToTailFromOutput, output.Version, default, info.Address); + } + else + { + this.cacheDebugger?.Record(key.Val, CacheDebugger.CacheEvent.SingleWriterCopyToTail, src.Version, default, info.Address); + } break; case WriteReason.Compaction: @@ -2088,8 +2101,16 @@ bool IFunctions.SingleWriter(ref Key this.cacheDebugger?.Fail("Invalid WriteReason in SingleWriter", key); break; } - dst.Val = output.Val ?? src.Val; - dst.Version = src.Version; + if (takeValueFromOutput) + { + dst.Val = output.Val; + dst.Version = output.Version; + } + else + { + dst.Val = src.Val; + dst.Version = src.Version; + } this.cacheDebugger?.ValidateObjectVersion(dst, key.Val); return true; } From ba4156ec112da7c6c5da14984ea37bacc9688138 Mon Sep 17 00:00:00 2001 From: sebastianburckhardt Date: Mon, 30 Sep 2024 10:16:22 -0700 Subject: [PATCH 4/5] improve text in error message --- src/DurableTask.Netherite/StorageLayer/Faster/CacheDebugger.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/DurableTask.Netherite/StorageLayer/Faster/CacheDebugger.cs b/src/DurableTask.Netherite/StorageLayer/Faster/CacheDebugger.cs index 5360c424..b76e1179 100644 --- a/src/DurableTask.Netherite/StorageLayer/Faster/CacheDebugger.cs +++ b/src/DurableTask.Netherite/StorageLayer/Faster/CacheDebugger.cs @@ -425,7 +425,7 @@ internal void ValidateObjectVersion(FasterKV.Value val, TrackedObjectKey key) if (val.Version != versionOfObject) { var info = this.GetObjectInfo(key); - this.Fail($"incorrect version: model=v{val.Version} actual=v{versionOfObject} obj={val.Val} cacheEvents={info.PrintCacheEvents()}"); + this.Fail($"incorrect version: field=v{val.Version} object=v{versionOfObject} obj={val.Val} cacheEvents={info.PrintCacheEvents()}"); } } From c0b50f6d05aa29dfac22253fd68a8d73c97750db Mon Sep 17 00:00:00 2001 From: sebastianburckhardt Date: Wed, 2 Oct 2024 10:59:32 -0700 Subject: [PATCH 5/5] address PR feedback (update comment) --- src/DurableTask.Netherite/StorageLayer/Faster/FasterKV.cs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/DurableTask.Netherite/StorageLayer/Faster/FasterKV.cs b/src/DurableTask.Netherite/StorageLayer/Faster/FasterKV.cs index 19057d28..a7b323c0 100644 --- a/src/DurableTask.Netherite/StorageLayer/Faster/FasterKV.cs +++ b/src/DurableTask.Netherite/StorageLayer/Faster/FasterKV.cs @@ -2081,7 +2081,10 @@ bool IFunctions.SingleWriter(ref Key break; case WriteReason.CopyToTail: - takeValueFromOutput = (output.Val != null); // we have observed that src.Val is null sometimes, so if present, we use output instead + // we have empirically observed that src does sometimes not contain the correct value (is null) + // we are not sure if this is a bug in FASTER or intended behavior + // as a workaround for those situations, we are passing the source value in the output parameter, which seems to work o.k. + takeValueFromOutput = (output.Val != null); if (takeValueFromOutput) { this.cacheDebugger?.Record(key.Val, CacheDebugger.CacheEvent.SingleWriterCopyToTailFromOutput, output.Version, default, info.Address);