-
Notifications
You must be signed in to change notification settings - Fork 273
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Retry platform-level errors in the isolated process for .NET isolated #2922
Changes from all commits
a335871
f2942b2
f2ed1e9
d6bb8e4
d333530
d1a3bca
72428b9
3aaa919
079f112
065fdd7
090f8e7
a70d014
2ff4b79
68e7503
1cdd4b9
c927268
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -138,10 +138,15 @@ await this.LifeCycleNotificationHelper.OrchestratorStartingAsync( | |
|
||
byte[] triggerReturnValueBytes = Convert.FromBase64String(triggerReturnValue); | ||
P.OrchestratorResponse response = P.OrchestratorResponse.Parser.ParseFrom(triggerReturnValueBytes); | ||
|
||
// TrySetResult may throw if a platform-level error is encountered (like an out of memory exception). | ||
context.SetResult( | ||
response.Actions.Select(ProtobufUtils.ToOrchestratorAction), | ||
response.CustomStatus); | ||
|
||
// Here we throw if the orchestrator completed with an application-level error. When we do this, | ||
// the function's result type will be of type `OrchestrationFailureException` which is reserved | ||
// for application-level errors that do not need to be re-tried. | ||
context.ThrowIfFailed(); | ||
}, | ||
#pragma warning restore CS0618 // Type or member is obsolete (not intended for general public use) | ||
|
@@ -159,6 +164,19 @@ await this.LifeCycleNotificationHelper.OrchestratorStartingAsync( | |
// Re-throw so we can abort this invocation. | ||
this.HostLifetimeService.OnStopping.ThrowIfCancellationRequested(); | ||
} | ||
|
||
// we abort the invocation on "platform level errors" such as: | ||
// - a timeout | ||
// - an out of memory exception | ||
// - a worker process exit | ||
if (functionResult.Exception is Host.FunctionTimeoutException | ||
|| functionResult.Exception?.InnerException is SessionAbortedException // see RemoteOrchestrationContext.TrySetResultInternal for details on OOM-handling | ||
|| (functionResult.Exception?.InnerException?.GetType().ToString().Contains("WorkerProcessExitException") ?? false)) | ||
{ | ||
// TODO: the `WorkerProcessExitException` type is not exposed in our dependencies, it's part of WebJobs.Host.Script. | ||
// Should we add that dependency or should it be exposed in WebJobs.Host? | ||
Comment on lines
+174
to
+177
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't love the string-based comparison, but it'll be string-based unless we add There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
throw functionResult.Exception; | ||
} | ||
} | ||
catch (Exception hostRuntimeException) | ||
{ | ||
|
@@ -238,8 +256,7 @@ await this.LifeCycleNotificationHelper.OrchestratorFailedAsync( | |
else | ||
{ | ||
// the function failed for some other reason | ||
|
||
string exceptionDetails = functionResult.Exception.ToString(); | ||
string exceptionDetails = functionResult.Exception?.ToString() ?? "Framework-internal message: exception details could not be extracted"; | ||
|
||
this.TraceHelper.FunctionFailed( | ||
this.Options.HubName, | ||
|
@@ -258,7 +275,7 @@ await this.LifeCycleNotificationHelper.OrchestratorFailedAsync( | |
|
||
orchestratorResult = OrchestratorExecutionResult.ForFailure( | ||
message: $"Function '{functionName}' failed with an unhandled exception.", | ||
functionResult.Exception); | ||
functionResult.Exception ?? new Exception($"Function '{functionName}' failed with an unknown unhandled exception")); | ||
} | ||
|
||
// Send the result of the orchestrator function to the DTFx dispatch pipeline. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
| ||
Microsoft Visual Studio Solution File, Format Version 12.00 | ||
# Visual Studio Version 17 | ||
VisualStudioVersion = 17.5.002.0 | ||
MinimumVisualStudioVersion = 10.0.40219.1 | ||
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "DotNetIsolated", "DotNetIsolated.csproj", "{B2DBA49D-9D25-46DB-8968-15D5E83B4060}" | ||
EndProject | ||
Global | ||
GlobalSection(SolutionConfigurationPlatforms) = preSolution | ||
Debug|Any CPU = Debug|Any CPU | ||
Release|Any CPU = Release|Any CPU | ||
EndGlobalSection | ||
GlobalSection(ProjectConfigurationPlatforms) = postSolution | ||
{B2DBA49D-9D25-46DB-8968-15D5E83B4060}.Debug|Any CPU.ActiveCfg = Debug|Any CPU | ||
{B2DBA49D-9D25-46DB-8968-15D5E83B4060}.Debug|Any CPU.Build.0 = Debug|Any CPU | ||
{B2DBA49D-9D25-46DB-8968-15D5E83B4060}.Release|Any CPU.ActiveCfg = Release|Any CPU | ||
{B2DBA49D-9D25-46DB-8968-15D5E83B4060}.Release|Any CPU.Build.0 = Release|Any CPU | ||
EndGlobalSection | ||
GlobalSection(SolutionProperties) = preSolution | ||
HideSolutionNode = FALSE | ||
EndGlobalSection | ||
GlobalSection(ExtensibilityGlobals) = postSolution | ||
SolutionGuid = {0954D7B4-582F-4F85-AE3E-5D503FB07DB1} | ||
EndGlobalSection | ||
EndGlobal |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,165 @@ | ||
using Microsoft.Azure.Functions.Worker; | ||
using Microsoft.Azure.Functions.Worker.Http; | ||
using Microsoft.DurableTask; | ||
using Microsoft.DurableTask.Client; | ||
using Microsoft.Extensions.Logging; | ||
using System; | ||
|
||
namespace FaultOrchestrators | ||
{ | ||
public static class FaultyOrchestrators | ||
{ | ||
// Test orchestrator that deliberately exhausts memory the first time it runs and completes | ||
// successfully on the next run. A marker file on disk is used to tell the two runs apart. | ||
[Function(nameof(OOMOrchestrator))] | ||
public static Task OOMOrchestrator( | ||
[OrchestrationTrigger] TaskOrchestrationContext context) | ||
{ | ||
// this orchestrator is not deterministic, on purpose. | ||
// we use the non-determinism to force an OOM exception on only the first replay | ||
|
||
// check if a file named "replayEvidence" exists in source code directory, create it if it does not. | ||
// From experience, this code runs in `<sourceCodePath>/bin/output/`, so we store the file two directories above. | ||
// We do this because the /bin/output/ directory gets overridden during the build process, which happens automatically | ||
// when `func host start` is re-invoked. | ||
string evidenceFile = System.IO.Path.Combine(System.IO.Directory.GetCurrentDirectory(), "..", "..", "replayEvidence"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would suggest giving these replayEvidence files test-specific names to avoid conflicts should multiple retry tests run simultaneously. Even if today they are run sequentially, this may not always be true There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You're right there's a pending improvement here. I chose not to do this for now because, based on how this is written today, the sequential checks protect us here, and we have some pressure to release. But I'm in favor about making this more robust long term. |
||
bool isTheFirstReplay = !System.IO.File.Exists(evidenceFile); | ||
if (isTheFirstReplay) | ||
{ | ||
// create the marker before triggering the failure so that the next run takes the else-branch | ||
System.IO.File.Create(evidenceFile).Close(); | ||
|
||
// force the process to run out of memory | ||
List<byte[]> data = new List<byte[]>(); | ||
|
||
for (int i = 0; i < 10000000; i++) | ||
{ | ||
// each array is 1 GiB (1024 * 1024 * 1024 bytes) and stays referenced by `data` | ||
data.Add(new byte[1024 * 1024 * 1024]); | ||
} | ||
|
||
// we expect the code to never reach this statement, it should OOM. | ||
// we throw just in case the code does not run out of memory. This should fail the test | ||
throw new Exception("this should never be reached"); | ||
} | ||
else { | ||
// if it's not the first replay, delete the evidence file and return | ||
System.IO.File.Delete(evidenceFile); | ||
return Task.CompletedTask; | ||
} | ||
} | ||
|
||
/// <summary>
/// Test orchestrator that terminates the process abruptly the first time it runs
/// and completes successfully the next time.
/// </summary>
/// <remarks>
/// Intentionally non-deterministic: a marker file named "replayEvidence", placed two
/// directories above the current working directory, records that the crash has already
/// been triggered. The marker lives outside the working directory because, per the
/// original author's notes, the build output directory is overwritten when
/// <c>func host start</c> is re-invoked.
/// </remarks>
/// <param name="context">The orchestration context (not used by this orchestrator).</param>
/// <returns>A completed task when the marker file already exists.</returns>
[Function(nameof(ProcessExitOrchestrator))]
public static Task ProcessExitOrchestrator(
    [OrchestrationTrigger] TaskOrchestrationContext context)
{
    string markerPath = System.IO.Path.Combine(
        System.IO.Directory.GetCurrentDirectory(),
        "..",
        "..",
        "replayEvidence");

    // Marker present: the crash already happened on an earlier run, so clean up and finish.
    if (System.IO.File.Exists(markerPath))
    {
        System.IO.File.Delete(markerPath);
        return Task.CompletedTask;
    }

    // First run: leave the (empty) marker behind, then bring the process down.
    using (System.IO.File.Create(markerPath))
    {
    }

    Environment.FailFast("Simulating crash!");

    // Not expected to execute; keeps the compiler satisfied that every path returns or throws.
    throw new Exception("this should never be reached");
}
|
||
/// <summary>
/// Test orchestrator that blocks for one minute the first time it runs, with the intent
/// of triggering a function timeout, and completes successfully the next time.
/// </summary>
/// <remarks>
/// Intentionally non-deterministic: a marker file named "replayEvidence", placed two
/// directories above the current working directory, records that the slow run has already
/// happened. The marker lives outside the working directory because, per the original
/// author's notes, the build output directory is overwritten when
/// <c>func host start</c> is re-invoked.
/// </remarks>
/// <param name="context">The orchestration context (not used by this orchestrator).</param>
/// <returns>A completed task when the marker file already exists.</returns>
/// <exception cref="Exception">
/// Thrown if the one-minute wait finishes without the invocation being aborted;
/// this is meant to fail the test.
/// </exception>
[Function(nameof(TimeoutOrchestrator))]
public static Task TimeoutOrchestrator(
    [OrchestrationTrigger] TaskOrchestrationContext context)
{
    string markerPath = System.IO.Path.Combine(
        System.IO.Directory.GetCurrentDirectory(),
        "..",
        "..",
        "replayEvidence");

    // Marker present: the slow run already happened, so clean up and finish.
    if (System.IO.File.Exists(markerPath))
    {
        System.IO.File.Delete(markerPath);
        return Task.CompletedTask;
    }

    // First run: leave the (empty) marker behind, then block the thread for a full minute.
    using (System.IO.File.Create(markerPath))
    {
    }

    TimeSpan stall = TimeSpan.FromMinutes(1);
    System.Threading.Thread.Sleep(stall);

    // Reaching this point means no timeout interrupted the wait; fail loudly.
    throw new Exception("this should never be reached");
}
|
||
/// <summary>
/// HTTP starter for <see cref="OOMOrchestrator"/>: schedules a new instance and replies with
/// the check-status response for it.
/// </summary>
/// <remarks>
/// Nothing is read from the request body; the orchestrator is scheduled without input.
/// See https://learn.microsoft.com/azure/azure-functions/durable/durable-functions-http-api#start-orchestration
/// </remarks>
/// <param name="req">The incoming HTTP request (GET or POST, anonymous).</param>
/// <param name="client">Durable client used to schedule the orchestration.</param>
/// <param name="executionContext">Function context used to obtain a logger.</param>
/// <returns>An HTTP 202 response with an instance management payload.</returns>
[Function("durable_HttpStartOOMOrchestrator")]
public static async Task<HttpResponseData> HttpStartOOMOrchestrator(
    [HttpTrigger(AuthorizationLevel.Anonymous, "get", "post")] HttpRequestData req,
    [DurableClient] DurableTaskClient client,
    FunctionContext executionContext)
{
    ILogger log = executionContext.GetLogger("durable_HttpStartOOMOrchestrator");

    string orchestrationId = await client.ScheduleNewOrchestrationInstanceAsync(nameof(OOMOrchestrator));
    log.LogInformation("Started orchestration with ID = '{instanceId}'.", orchestrationId);

    HttpResponseData statusResponse = await client.CreateCheckStatusResponseAsync(req, orchestrationId);
    return statusResponse;
}
|
||
/// <summary>
/// HTTP starter for <see cref="ProcessExitOrchestrator"/>: schedules a new instance and replies
/// with the check-status response for it.
/// </summary>
/// <remarks>
/// Nothing is read from the request body; the orchestrator is scheduled without input.
/// See https://learn.microsoft.com/azure/azure-functions/durable/durable-functions-http-api#start-orchestration
/// </remarks>
/// <param name="req">The incoming HTTP request (GET or POST, anonymous).</param>
/// <param name="client">Durable client used to schedule the orchestration.</param>
/// <param name="executionContext">Function context used to obtain a logger.</param>
/// <returns>An HTTP 202 response with an instance management payload.</returns>
[Function("durable_HttpStartProcessExitOrchestrator")]
public static async Task<HttpResponseData> HttpStartProcessExitOrchestrator(
    [HttpTrigger(AuthorizationLevel.Anonymous, "get", "post")] HttpRequestData req,
    [DurableClient] DurableTaskClient client,
    FunctionContext executionContext)
{
    ILogger log = executionContext.GetLogger("durable_HttpStartProcessExitOrchestrator");

    string orchestrationId = await client.ScheduleNewOrchestrationInstanceAsync(nameof(ProcessExitOrchestrator));
    log.LogInformation("Started orchestration with ID = '{instanceId}'.", orchestrationId);

    HttpResponseData statusResponse = await client.CreateCheckStatusResponseAsync(req, orchestrationId);
    return statusResponse;
}
|
||
// HTTP starter for TimeoutOrchestrator: schedules a new instance, logs its ID and returns | ||
// the check-status response for that instance. | ||
[Function("durable_HttpStartTimeoutOrchestrator")] | ||
public static async Task<HttpResponseData> HttpStartTimeoutOrchestrator( | ||
[HttpTrigger(AuthorizationLevel.Anonymous, "get", "post")] HttpRequestData req, | ||
[DurableClient] DurableTaskClient client, | ||
FunctionContext executionContext) | ||
{ | ||
ILogger logger = executionContext.GetLogger("durable_HttpStartTimeoutOrchestrator"); | ||
|
||
// Schedule a new TimeoutOrchestrator instance; nothing is read from the request content. | ||
string instanceId = await client.ScheduleNewOrchestrationInstanceAsync( | ||
nameof(TimeoutOrchestrator)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Rather than replicating the same HTTP function once for each scenario, it might make more sense to have a single HTTP trigger that takes an orchestrator function name as a parameter. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't feel strongly here, but my personal preference is to have specific HTTP endpoints for each orchestrator. When testing locally, it saves me time to just copy-paste the entire URL to trigger the orchestrator. But it's no big deal. If you feel strongly, I'll generalize it. Just noting this was a deliberate choice. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't feel strongly so I'll leave it up to you, but it would enable all the usual benefits of code sharing (easier to add new tests, better consistency b/c only one place to make updates, etc.) |
||
|
||
logger.LogInformation("Started orchestration with ID = '{instanceId}'.", instanceId); | ||
|
||
// Returns an HTTP 202 response with an instance management payload. | ||
// See https://learn.microsoft.com/azure/azure-functions/durable/durable-functions-http-api#start-orchestration | ||
return await client.CreateCheckStatusResponseAsync(req, instanceId); | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,5 +7,14 @@ | |
"excludedTypes": "Request" | ||
} | ||
} | ||
} | ||
}, | ||
"extensions": { | ||
"durableTask": { | ||
"storageProvider": { | ||
"maxQueuePollingInterval": "00:00:01", | ||
"controlQueueVisibilityTimeout": "00:01:00" | ||
Comment on lines
+14
to
+15
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This just helps make sure we retry orchestrator replays fast enough after a platform-level error. |
||
} | ||
} | ||
}, | ||
"functionTimeout": "00:00:30" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We don't time out sooner because our OOM-based test needs time to run out of memory. |
||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Did you verify that `WorkerProcessExitException` is actually available here? Looking at the function host code, I don't see the exception actually being thrown: https://github.com/Azure/azure-functions-host/blob/1088f24c3ae3a6275f18cbf091fa525c2477be91/src/WebJobs.Script/Workers/ProcessManagement/WorkerProcess.cs#L179-L184
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, that's how we stumbled upon this error type in the first place. We found this error by forcing a process exit (`Environment.FailFast`) and placing a breakpoint at this position.
But yeah, I admit that I also don't see the explicit throw here. We can investigate further, or I can show you the runtime behavior through the debugger.