Skip to content

Commit

Permalink
Emergency Profile Dumps (tgstation#75924)
Browse files Browse the repository at this point in the history
## About The Pull Request

Adds some hooks to the MC that detect if something ate a ton of real
time last tick, and reacts by dumping our current profile into a file

It's really frustrating to see a spike in td in our performance logs,
but see no reason in the profile because it's only taken every 5
minutes. This resolves that

I'm throwing this up so mso can give it a look over, not sure if I want
to use defines or configs here, taking suggestions

🆑
server: Adds a system to emergency dump profiles if too much time passes
between ticks
config: Added configs that control how often emergency profiles are
allowed to dump, alongside the threshold for what counts as too much
time between ticks
/🆑
  • Loading branch information
LemonInTheDark authored Jun 23, 2023
1 parent 4a212ad commit d93e81c
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 7 deletions.
6 changes: 6 additions & 0 deletions code/controllers/configuration/entries/general.dm
Original file line number Diff line number Diff line change
Expand Up @@ -626,6 +626,12 @@

/datum/config_entry/flag/auto_profile

/datum/config_entry/number/drift_dump_threshold
default = 4 SECONDS

/datum/config_entry/number/drift_profile_delay
default = 15 SECONDS

/datum/config_entry/string/centcom_ban_db // URL for the CentCom Galactic Ban DB API

/datum/config_entry/string/centcom_source_whitelist
Expand Down
21 changes: 20 additions & 1 deletion code/controllers/master.dm
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ GLOBAL_REAL(Master, /datum/controller/master)
var/init_timeofday
var/init_time
var/tickdrift = 0
/// Tickdrift as of last tick, w no averaging going on
var/olddrift = 0

/// How long is the MC sleeping between runs, read only (set by Loop() based off of anti-tick-contention heuristics)
var/sleep_delta = 1
Expand Down Expand Up @@ -60,6 +62,10 @@ GLOBAL_REAL(Master, /datum/controller/master)
/// Outside of initialization, returns null.
var/current_initializing_subsystem = null

/// The last decisecond we force dumped profiling information
/// Used to avoid spamming profile reads since they can be expensive (string memes)
var/last_profiled = 0

var/static/restart_clear = 0
var/static/restart_timeout = 0
var/static/restart_count = 0
Expand Down Expand Up @@ -442,9 +448,14 @@ GLOBAL_REAL(Master, /datum/controller/master)
canary.use_variable()
//the actual loop.
while (1)
tickdrift = max(0, MC_AVERAGE_FAST(tickdrift, (((REALTIMEOFDAY - init_timeofday) - (world.time - init_time)) / world.tick_lag)))
var/newdrift = ((REALTIMEOFDAY - init_timeofday) - (world.time - init_time)) / world.tick_lag
tickdrift = max(0, MC_AVERAGE_FAST(tickdrift, newdrift))
var/starting_tick_usage = TICK_USAGE

if(newdrift - olddrift >= CONFIG_GET(number/drift_dump_threshold))
AttemptProfileDump(CONFIG_GET(number/drift_profile_delay))
olddrift = newdrift

if (init_stage != init_stage_completed)
return MC_LOOP_RTN_NEWSTAGES
if (processing <= 0)
Expand Down Expand Up @@ -803,3 +814,11 @@ GLOBAL_REAL(Master, /datum/controller/master)
for (var/thing in subsystems)
var/datum/controller/subsystem/SS = thing
SS.OnConfigLoad()

/// Attempts to dump our current profile info into a file, triggered if the MC thinks shit is going down
/// Accepts a delay in deciseconds of how long ago our last dump can be, this saves causing performance problems ourselves
/datum/controller/master/proc/AttemptProfileDump(delay)
if(REALTIMEOFDAY - last_profiled <= delay)
return FALSE
last_profiled = REALTIMEOFDAY
SSprofiler.DumpFile(allow_yield = FALSE)
19 changes: 13 additions & 6 deletions code/controllers/subsystem/profiler.dm
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,20 @@ SUBSYSTEM_DEF(profiler)
StopProfiling() //Stop the early start profiler
return SS_INIT_SUCCESS

/datum/controller/subsystem/profiler/fire()
/datum/controller/subsystem/profiler/OnConfigLoad()
if(CONFIG_GET(flag/auto_profile))
DumpFile()
StartProfiling()
can_fire = TRUE
else
StopProfiling()
can_fire = FALSE

/datum/controller/subsystem/profiler/fire()
DumpFile()

/datum/controller/subsystem/profiler/Shutdown()
if(CONFIG_GET(flag/auto_profile))
DumpFile()
DumpFile(allow_yield = FALSE)
world.Profile(PROFILE_CLEAR, type = "sendmaps")
return ..()

Expand All @@ -39,13 +46,13 @@ SUBSYSTEM_DEF(profiler)
world.Profile(PROFILE_STOP)
world.Profile(PROFILE_STOP, type = "sendmaps")


/datum/controller/subsystem/profiler/proc/DumpFile()
/datum/controller/subsystem/profiler/proc/DumpFile(allow_yield = TRUE)
var/timer = TICK_USAGE_REAL
var/current_profile_data = world.Profile(PROFILE_REFRESH, format = "json")
var/current_sendmaps_data = world.Profile(PROFILE_REFRESH, type = "sendmaps", format="json")
fetch_cost = MC_AVERAGE(fetch_cost, TICK_DELTA_TO_MS(TICK_USAGE_REAL - timer))
CHECK_TICK
if(allow_yield)
CHECK_TICK

if(!length(current_profile_data)) //Would be nice to have explicit proc to check this
stack_trace("Warning, profiling stopped manually before dump.")
Expand Down
6 changes: 6 additions & 0 deletions config/config.txt
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,12 @@ DEFAULT_VIEW_SQUARE 15x15
## Enable automatic profiling - Byond 513.1506 and newer only.
#AUTO_PROFILE

## Threshold (in deciseconds) for real time between ticks before we start dumping profiles
DRIFT_DUMP_THRESHOLD 40

## How long to wait (in deciseconds) after a profile dump before logging another tickdrift sourced one
DRIFT_PROFILE_DELAY 150

## Uncomment to enable global ban DB using the provided URL. The API should expect to receive a ckey at the end of the URL.
## More API details can be found here: https://centcom.melonmesa.com
#CENTCOM_BAN_DB https://centcom.melonmesa.com/ban/search
Expand Down

0 comments on commit d93e81c

Please sign in to comment.