Skip to content

Commit

Permalink
[application_controller] Stop starting applications when terminating …
Browse files Browse the repository at this point in the history
…node

Applications that didn't finish starting could be left in a half-broken
state during node shutdown. In particular, their dependencies would be
already shut down, while the application itself was running, likely
with many errors.
The behaviour is to replicate what happens if the application finished
starting - with timeout set, we'll respect this timeout; otherwise
we'll wait forever.
  • Loading branch information
michalmuskala committed Feb 25, 2025
1 parent 48b3ad5 commit 7f0ee75
Show file tree
Hide file tree
Showing 5 changed files with 173 additions and 4 deletions.
25 changes: 25 additions & 0 deletions lib/kernel/src/application_controller.erl
Original file line number Diff line number Diff line change
Expand Up @@ -1241,11 +1241,36 @@ terminate(Reason, S) ->
undefined -> infinity;
{ok,T} -> T
end,
[terminate_starting(AppName, Id, ShutdownTimeout)
|| {AppName, _RestartType, _Type, Id} <- S#state.starting, is_pid(Id)],
[terminate_started(Id, ShutdownTimeout)
|| {_AppName, Id} <- S#state.running, is_pid(Id)],
true = ets:delete(ac_tab),
ok.

terminate_starting(AppName, Starter, ShutdownTimeout) ->
receive
%% starter died before replying
{'EXIT', Starter, _} ->
ok;
{'$gen_cast', {application_started, AppName, {ok, Id}}} when is_pid(Id) ->
terminate_started(Id, ShutdownTimeout);
{'$gen_cast', {application_started, AppName, _}} ->
ok;
%% We need to handle any gen_server:call here
%% and reply to them so that they don't deadlock
{'$gen_call', From, _Msg} ->
gen_server:reply(From, {error, terminating}),
terminate_starting(AppName, Starter, ShutdownTimeout)
after ShutdownTimeout ->
Ref = erlang:monitor(process, Starter),
unlink(Starter),
exit(Starter, kill),
receive
{'DOWN', Ref, process, Starter, _} -> ok
end
end.

terminate_started(Id, ShutdownTimeout) ->
Ref = erlang:monitor(process, Id),
unlink(Id),
Expand Down
102 changes: 100 additions & 2 deletions lib/kernel/test/application_SUITE.erl
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@
-export([config_change/1, persistent_env/1, invalid_app_file/1,
distr_changed_tc1/1, distr_changed_tc2/1,
ensure_started/1, ensure_all_started/1,
shutdown_func/1, do_shutdown/1, shutdown_timeout/1,
shutdown_func/1, do_shutdown/1, shutdown_timeout/1,
shutdown_starting/1, shutdown_starting_timeout/1,
shutdown_starting_application_call/1,
shutdown_application_call/1,shutdown_deadlock/1,
config_relative_paths/1, handle_many_config_files/1,
format_log_1/1, format_log_2/1,
Expand All @@ -61,7 +63,8 @@ all() ->
script_start, nodedown_start, permit_false_start_local,
permit_false_start_dist, get_key, get_env, ensure_all_started,
set_env, set_env_persistent, set_env_errors, get_supervisor,
{group, distr_changed}, config_change, shutdown_func, shutdown_timeout,
{group, distr_changed}, config_change, shutdown_func, shutdown_timeout,
shutdown_starting, shutdown_starting_timeout, shutdown_starting_application_call,
shutdown_application_call, shutdown_deadlock, config_relative_paths, optional_applications,
persistent_env, handle_many_config_files, format_log_1, format_log_2,
configfd_bash, configfd_port_program, invalid_app_file].
Expand Down Expand Up @@ -2578,6 +2581,101 @@ shutdown_timeout(Config) when is_list(Config) ->
end,
ok.

%%%-----------------------------------------------------------------
%%% Test that we wait for starting applications to start
%%% before we start terminating already started
%%%-----------------------------------------------------------------
shutdown_starting(Config) when is_list(Config) ->
DataDir = proplists:get_value(data_dir,Config),
{ok,Cp1} = start_node(?MODULE_STRING++"_shutdown_starting"),
wait_for_ready_net(),
rpc:call(Cp1, code, add_path, [filename:join([DataDir,slow])]),
ok = rpc:call(Cp1, application, load, [slow]),
ok = rpc:call(Cp1, application, set_env, [slow, controller, self()]),
rpc:cast(Cp1, application, start, [slow]),

receive
{server_starting, Server} ->
rpc:cast(Cp1, init, stop, []),
Server ! continue
after 10000 ->
ct:fail("timeout 10 sec: application didn't start")
end,
receive
server_terminating ->
ok
after 10000 ->
ct:fail("timeout 10 sec: node didn't stop starting application")
end.

%%%-----------------------------------------------------------------
%%% Tests that we don't deadlock onwait for starting applications to start
%%% before we start terminating already started
%%%-----------------------------------------------------------------
shutdown_starting_timeout(Config) when is_list(Config) ->
DataDir = proplists:get_value(data_dir,Config),
{ok,Cp1} = start_node(?MODULE_STRING++"_shutdown_starting"),

Check warning on line 2617 in lib/kernel/test/application_SUITE.erl

View workflow job for this annotation

GitHub Actions / CT Test Results

shutdown_starting_timeout failed

artifacts/Unit Test Results/kernel_junit.xml [took 0s]
Raw output
Test shutdown_starting_timeout in application_SUITE failed!
{{badmatch,{error,{already_running,application_SUITE_shutdown_starting@be001c86f0bc}}},
 [{application_SUITE,shutdown_starting_timeout,1,
                     [{file,"application_SUITE.erl"},{line,2617}]},
  {test_server,ts_tc,3,[{file,"test_server.erl"},{line,1794}]},
  {test_server,run_test_case_eval1,6,[{file,"test_server.erl"},{line,1303}]},
  {test_server,run_test_case_eval,9,[{file,"test_server.erl"},{line,1235}]}]}
wait_for_ready_net(),
ok = rpc:call(Cp1, application, set_env, [kernel, shutdown_timeout, 1000]),
rpc:call(Cp1, code, add_path, [filename:join([DataDir,slow])]),
ok = rpc:call(Cp1, application, load, [slow]),
ok = rpc:call(Cp1, application, set_env, [slow, controller, self()]),
rpc:cast(Cp1, application, start, [slow]),

ok = net_kernel:monitor_nodes(true),
_ = rpc:call(Cp1, init, stop, []),
receive
{nodedown,Cp1} ->
ok
after 10000 ->
ct:fail("timeout 10 sec: node termination hangs")
end,
ok.

%%%-----------------------------------------------------------------
%%% Tests that we don't deadlock if we call application:set_env
%%% when terminating an application that didn't finish starting
%%%-----------------------------------------------------------------
shutdown_starting_application_call(Config) when is_list(Config) ->
DataDir = proplists:get_value(data_dir,Config),
{ok,Cp1} = start_node(?MODULE_STRING++"_shutdown_starting"),

Check warning on line 2641 in lib/kernel/test/application_SUITE.erl

View workflow job for this annotation

GitHub Actions / CT Test Results

shutdown_starting_application_call failed

artifacts/Unit Test Results/kernel_junit.xml [took 0s]
Raw output
Test shutdown_starting_application_call in application_SUITE failed!
{{badmatch,{error,{already_running,application_SUITE_shutdown_starting@be001c86f0bc}}},
 [{application_SUITE,shutdown_starting_application_call,1,
                     [{file,"application_SUITE.erl"},{line,2641}]},
  {test_server,ts_tc,3,[{file,"test_server.erl"},{line,1794}]},
  {test_server,run_test_case_eval1,6,[{file,"test_server.erl"},{line,1303}]},
  {test_server,run_test_case_eval,9,[{file,"test_server.erl"},{line,1235}]}]}
wait_for_ready_net(),
rpc:call(Cp1, code, add_path, [filename:dirname(code:which(?MODULE))]),
rpc:call(Cp1, code, add_path, [filename:join([DataDir,slow])]),
ok = rpc:call(Cp1, application, load, [slow]),
ok = rpc:call(Cp1, application, set_env, [slow, controller, self()]),
Terminate = fun() ->
try application:set_env(slow, ab, b, [{timeout, infinity}, {persistent, true}])
catch
exit:terminating -> ok
end
end,
ok = rpc:call(Cp1, application, set_env, [slow, terminate, Terminate]),
rpc:cast(Cp1, application, start, [slow]),

receive
{server_starting, Server} ->
Server ! continue
after 10000 ->
ct:fail("timeout 10 sec: application didn't start")
end,

ok = net_kernel:monitor_nodes(true),
_ = rpc:call(Cp1, init, stop, []),
receive
server_terminating ->
ok
after 10000 ->
ct:fail("timeout 10 sec: node didn't stop starting application")
end,
receive
{nodedown,Cp1} ->
ok
after 10000 ->
ct:fail("timeout 10 sec: node termination hangs")
end,
ok.

%%%-----------------------------------------------------------------
%%% Test that we do not cause a deadlock if we call
%%% application:set_env or application:ensure_started
Expand Down
7 changes: 5 additions & 2 deletions lib/kernel/test/application_SUITE_data/Makefile.src
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ EFLAGS=+debug_info
all: app_start_error.@EMULATOR@ trans_abnormal_sup.@EMULATOR@ \
trans_normal_sup.@EMULATOR@ transient.@EMULATOR@ \
group_leader_sup.@EMULATOR@ group_leader.@EMULATOR@ \
deadlock/deadlock.@EMULATOR@
deadlock/deadlock.@EMULATOR@ slow/slow.@EMULATOR@

app_start_error.@EMULATOR@: app_start_error.erl
erlc $(EFLAGS) app_start_error.erl
Expand All @@ -24,4 +24,7 @@ group_leader_sup.@EMULATOR@: group_leader_sup.erl
erlc $(EFLAGS) group_leader_sup.erl

deadlock/deadlock.@EMULATOR@: deadlock/deadlock.erl
erlc $(EFLAGS) -o deadlock deadlock/deadlock.erl
erlc $(EFLAGS) -o deadlock deadlock/deadlock.erl

slow/slow.@EMULATOR@: slow/slow.erl
erlc $(EFLAGS) -o slow slow/slow.erl
9 changes: 9 additions & 0 deletions lib/kernel/test/application_SUITE_data/slow/slow.app
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{application, slow, [
{vsn, "1"},
{registered, []},
{applications, [kernel, stdlib]},
{modules, [slow]},
{mod, {slow, []}},
{env, []}
]}.

34 changes: 34 additions & 0 deletions lib/kernel/test/application_SUITE_data/slow/slow.erl
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
-module(slow).
-behaviour(application).
-behaviour(supervisor).
-behaviour(gen_server).
-compile(export_all).


%%%-----------------------------------------------------------------
%%% application callbacks
start(_StartType, _StartArgs) ->
supervisor:start_link({local, ?MODULE}, ?MODULE, supervisor).

stop(_State) ->
ok.

init(supervisor) ->
Child = #{id => main, start => {gen_server, start_link, [?MODULE, server, []]}},
{ok, {#{}, [Child]}};
init(server) ->
{ok, Controller} = application:get_env(slow, controller),
process_flag(trap_exit, true),
Controller ! {server_starting, self()},
receive
continue -> ok
end,
{ok, #{controller => Controller}}.

terminate(_Reason, #{controller := Controller}) ->
case application:get_env(slow, terminate) of
{ok, Fun} -> Fun();
_ -> ok
end,
Controller ! server_terminating,
ok.

0 comments on commit 7f0ee75

Please sign in to comment.