-
Notifications
You must be signed in to change notification settings - Fork 21
/
Copy paththrash-protect.py
executable file
·655 lines (582 loc) · 27.4 KB
/
thrash-protect.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
#!/usr/bin/python3
### Simple-Stupid user-space program protecting a linux host from thrashing.
### See the README for details.
### Project home: https://github.com/tobixen/thrash-protect
### This is a rapid prototype implementation. I'm considering to implement in C.
## This was written for python3 (there exists a python24-branch, but
## it won't be maintained). python3 is not available on a lot of
## servers, and the names shimmed below seem to be the only snags when
## running on python 2.7:
from __future__ import with_statement
## Compatibility shims: python 2.7 lacks these two OSError subclasses,
## so fall back to the closest exception type available there.
try:
    ProcessLookupError
except NameError:
    ProcessLookupError = OSError

try:
    FileNotFoundError
except NameError:
    FileNotFoundError = IOError

## Module metadata
__version__ = "0.14.3"
__author__ = "Tobias Brox"
__copyright__ = "Copyright 2013-2021, Tobias Brox"
__license__ = "GPL"
__maintainer__ = "Tobias Brox"
__email__ = "[email protected]"
__product__ = "thrash-protect"
## subprocess.check_output is not available in python 2.6. this is used in a
## non-critical part of the script, already inside a try-except-scope, so the
## import has been moved there to allow the script to work on servers without 2.7 installed.
#from subprocess import check_output
from os import getenv, kill, getpid, unlink, getpgid, getsid, getpid, getppid
from collections import namedtuple
import time
from datetime import datetime
import glob
import signal
import logging
import random ## for the test_mode
#########################
## Configuration section
#########################
class config:
    """
    Collection of configuration variables.  (Those are still really
    global variables, but it looks a bit neater to access
    config.bits_per_byte than bits_per_byte.  Perhaps we'll parse a
    config file and initiate some object with the name config in
    some future version)

    Every value is read from the environment once, when the class
    body is executed.
    """
    ## debug
    ## NOTE: any non-empty string (even "0") enables these two flags
    debug_logging = getenv('THRASH_PROTECT_DEBUG_LOGGING', False)

    ## will check the state and warn if thrash_protects attempts unfreezing a process that is running or freezing a process that is already suspended
    debug_checkstate = getenv('THRASH_PROTECT_DEBUG_CHECKSTATE', False)

    ## Normal sleep interval, in seconds.
    interval = float(getenv('THRASH_PROTECT_INTERVAL', '0.5'))

    ## max acceptable time delta in one iteration
    max_acceptable_time_delta = interval/8.0

    ## Number of acceptable page swaps during the above interval
    swap_page_threshold = int(getenv('THRASH_PROTECT_SWAP_PAGE_THRESHOLD', '4'))

    ## After X number of major pagefaults, we should initiate a process scanning
    pgmajfault_scan_threshold = int(getenv('THRASH_PROTECT_PGMAJFAULT_SCAN_THRESHOLD', swap_page_threshold*4))

    ## process name whitelist
    ## The lists are split on any whitespace, so that an unset or
    ## sloppily spaced variable never yields an empty-string entry
    ## (which would match a process with an empty command name).
    cmd_whitelist = getenv('THRASH_PROTECT_CMD_WHITELIST', '')
    cmd_whitelist = cmd_whitelist.split() if cmd_whitelist else ['sshd', 'bash', 'xinit', 'X', 'spectrwm', 'screen', 'SCREEN', 'mutt', 'ssh', 'xterm', 'rxvt', 'urxvt', 'Xorg.bin', 'Xorg', 'systemd-journal']
    cmd_blacklist = getenv('THRASH_PROTECT_CMD_BLACKLIST', '').split()
    cmd_jobctrllist = getenv('THRASH_PROTECT_CMD_JOBCTRLLIST', 'bash sudo').split()
    blacklist_score_multiplier = int(getenv('THRASH_PROTECT_BLACKLIST_SCORE_MULTIPLIER', '16'))
    ## (the environment variable really is named ..._MULTIPLIER; kept for compatibility)
    whitelist_score_divider = int(getenv('THRASH_PROTECT_WHITELIST_SCORE_MULTIPLIER', str(blacklist_score_multiplier*4)))

    ## Unfreezing processes: Ratio of POP compared to GET (integer)
    unfreeze_pop_ratio = int(getenv('THRASH_PROTECT_UNFREEZE_POP_RATIO', '5'))

    ## test_mode - if test_mode and not random.getrandbits(test_mode), then pretend we're thrashed
    test_mode = int(getenv('THRASH_PROTECT_TEST_MODE', '0'))

    ## ADVANCED LOGGING OPTIONS

    ## When freezing a process, enables logging of username, CPU usage, memory usage and command string
    ## (will spawn ps, so some overhead costs)
    log_user_data_on_freeze = int(getenv('THRASH_PROTECT_LOG_USER_DATA_ON_FREEZE', '0'))

    ## Log the extra process data on unfreeze (when the extra overhead cost is probably harmless)
    log_user_data_on_unfreeze = int(getenv('THRASH_PROTECT_LOG_USER_DATA_ON_UNFREEZE', '1'))

    ## Enable human-readable date format instead of UNIX timestamp
    date_human_readable = int(getenv('THRASH_PROTECT_DATE_HUMAN_READABLE', '1'))
class SystemState:
    """A "system state" is a collection of observed and calculated
    variables at a specific point of time.  We'll probably never have
    more than two instantiated objects - "last" and "current".  (This
    class replaces a bunch of global variables from version 0.8 -
    let's hope that the overhead in instantiation and garbage
    collection will be insignificant)
    """

    ## File the kernel counters are read from.  Kept as a class
    ## attribute so that it can be overridden on an instance.
    vmstat_path = '/proc/vmstat'

    def __init__(self):
        """Take a snapshot of the clock and of the swap/pagefault counters."""
        self.timestamp = time.time()
        self.pagefaults = self.get_pagefaults()
        self.swapcount = self.get_swapcount()
        self.cooldown_counter = 0
        self.unfrozen_pid = None
        self.timer_alert = False

    def get_pagefaults(self):
        """
        Return the pgmajfault counter from vmstat as an int.

        Returns 0 if the file has no pgmajfault line (the previous
        implementation would spin forever in that case, as readline()
        returns '' and never None at end of file).
        """
        with open(self.vmstat_path, 'r') as vmstat:
            for line in vmstat:
                fields = line.split()
                ## split on whitespace rather than slicing at a fixed
                ## column - the old fixed offset dropped the first digit
                if len(fields) >= 2 and fields[0] == 'pgmajfault':
                    return int(fields[1])
        return 0

    def get_swapcount(self):
        """
        Return the swap counters from vmstat as a tuple
        (pswpin, pswpout), in the order they appear in the file.
        """
        ret = []
        with open(self.vmstat_path, 'r') as vmstat:
            for line in vmstat:
                fields = line.split()
                if len(fields) >= 2 and fields[0] in ('pswpin', 'pswpout'):
                    ret.append(int(fields[1]))
        return tuple(ret)

    def check_swap_threshold(self, prev):
        """
        Compare this state with the previous state `prev`, update
        self.cooldown_counter accordingly and return True if the swap
        traffic since `prev` is above the configured threshold.

        As a side effect config.max_acceptable_time_delta is autotuned
        up or down by 10% depending on whether prev.timer_alert agreed
        with the swap observation.
        """
        self.cooldown_counter = prev.cooldown_counter
        if config.test_mode and not random.getrandbits(config.test_mode):
            self.cooldown_counter = prev.cooldown_counter+1
            return True

        ## will return True if we have bidirectional traffic to swap,
        ## or if we have a big one-directional flow of data.
        ##
        ## * if both swap counters are above the swap_page_threshold, trigger
        ##
        ## * if one of the swap counters is quite much above the
        ##   swap_page_threshold, while the other is 0, we should trigger
        ##
        ## the below algorithm seems to satisfy those two criterias, though
        ## I'm not much happy with the arbitrary constant "0.1" being thrown
        ## in.
        ret = (
            ((self.swapcount[0]-prev.swapcount[0]+0.1)/config.swap_page_threshold) *
            ((self.swapcount[1]-prev.swapcount[1]+0.1)/config.swap_page_threshold)
            > 1.0)

        ## Increase or decrease the busy-counter ... or keep it where it is
        if ret:
            ## thrashing alert, increase the counter
            self.cooldown_counter = prev.cooldown_counter+1
            if not prev.timer_alert:
                logging.debug("potential thrashing detected, but we got no timing alarm. Perhaps max_acceptable_time_delta should be tweaked down")
                config.max_acceptable_time_delta/=1.1
        elif prev.cooldown_counter and prev.swapcount==self.swapcount and self.timestamp-prev.timestamp>=self.get_sleep_interval():
            ## not busy at all, and we have slept since the previous check. Decrease counter.
            self.cooldown_counter = prev.cooldown_counter-1
            if prev.timer_alert:
                logging.debug("we got a timer alert, even if the system is not busy. Increasing the timer alert threshold")
                config.max_acceptable_time_delta*=1.1
        else:
            logging.debug("prev.swapcount==self.swapcount: %s, self.timestamp-prev.timestamp>=self.get_sleep_interval(): %s, self.timestamp-prev.timestamp: %s, self.get_sleep_interval(): %s" % (prev.swapcount==self.swapcount, self.timestamp-prev.timestamp>=self.get_sleep_interval(), self.timestamp-prev.timestamp, self.get_sleep_interval()))
            ## some swapin or swapout has been observed, or we haven't slept since previous run. Keep the cooldown counter steady.
            ## (Hm - we risk that process A gets frozen but never unfrozen due to process B generating swap activity?)
        return ret

    def get_sleep_interval(self):
        """Return the sleep time in seconds; it shrinks as cooldown_counter grows."""
        return config.interval/(self.cooldown_counter + 1.0)

    def check_delay(self, expected_delay=0):
        """
        If the code execution takes a too long time it may be that we're thrashing and this process has been swapped out.
        (TODO: detect possible problem: wrong tuning of max_acceptable_time_delta causes this to always trigger)

        Returns False (and bumps cooldown_counter by 2 and sets
        timer_alert) if more than config.max_acceptable_time_delta
        seconds have passed beyond `expected_delay` since this state
        was created; returns True otherwise.
        """
        global frozen_pids
        delta = time.time() - self.timestamp - expected_delay
        if delta > config.max_acceptable_time_delta:
            logging.info("relatively big time delta observed. interval: %s cooldown_counter: %s expected delay: %s max acceptable delta: %s delta: %s time: %s frozen pids: %s. (this message is to be expected every now and then as the max acceptable delta parameter is autotuned)" % (config.interval, self.cooldown_counter, expected_delay, config.max_acceptable_time_delta, delta, time.time(), frozen_pids))
            self.cooldown_counter += 2
            self.timer_alert = True
            return False
        return True
class ProcessSelector:
    """Base class for process selector classes.

    A selector offers scan(), which searches for a process (group)
    that is suitable for suspension, and update(), which lets the
    selector refresh whatever state it keeps.  Only scan() has to be
    implemented by subclasses.
    """

    ## the subset of /proc/<pid>/stat that the selectors care about
    procstat = namedtuple('procstat', ('cmd', 'state', 'majflt', 'ppid'))

    def scan(self):
        raise NotImplementedError()

    def update(self, prev, curr):
        pass

    def readStat(self, sfn):
        """
        Like readStat_(), but returns None instead of raising when the
        stat file or the process is gone.
        """
        try:
            return self.readStat_(sfn)
        except (FileNotFoundError, ProcessLookupError):
            return None

    def readStat_(self, sfn):
        """
        helper method - reads the stats file and returns a procstat
        tuple (cmd, state, majflt, ppid).  sfn is either a pid (int)
        or the path to a stat file.
        """
        if isinstance(sfn, int):
            sfn = "/proc/%s/stat" % sfn
        with open(sfn, 'rb') as stat_file:
            raw = stat_file.read().decode('utf-8', 'ignore')
        ## The command is enclosed in parentheses and may itself
        ## contain spaces and parentheses, hence cut at the first "("
        ## and the last ")".
        after_pid = raw.split("(", 1)[1]
        cmd_and_rest = after_pid.rsplit(")", 1)
        cmd = cmd_and_rest[0]
        ## the text after ")" starts with a space, so drop the empty first item
        fields = cmd_and_rest[1].split(' ')[1:]
        return self.procstat(cmd, fields[0], int(fields[9]), int(fields[1]))

    def checkParents(self, pid, ppid=None):
        """
        helper method - find a tuple of pids that should be suspended,
        given a pid (and for optimization reasons, ppid if it's
        already known).  Ancestors come first in the tuple.

        If a process running under an interactive bash session gets
        suspended, the bash job control kicks in and causes havoc.
        Hence, if the cmd of the parent process is listed in
        config.cmd_jobctrllist, the parent is included as well.
        """
        if ppid is None:
            stats = self.readStat(pid)
            if not stats:
                return ()
            ppid = stats.ppid
        if ppid > 1:
            pstats = self.readStat(ppid)
            if pstats and pstats.cmd in config.cmd_jobctrllist:
                return self.checkParents(ppid, pstats.ppid) + (pid,)
        return (pid,)
class OOMScoreProcessSelector(ProcessSelector):
    """
    Class containing one method for selecting a process to freeze,
    based on oom_score.  No stored state required.
    """
    def scan(self):
        """
        Walk through /proc/*/oom_score and return the pid group (as
        given by checkParents) of the process with the highest
        adjusted score, or None if no candidate was found.

        Stopped processes, this process and its parent are never
        selected.  The score of whitelisted commands is divided by
        config.whitelist_score_divider, the score of blacklisted
        commands is multiplied by config.blacklist_score_multiplier.
        """
        own_pids = (getpid(), getppid())
        worst_score = 0
        worstpid = None
        for fn in glob.glob('/proc/*/oom_score'):
            try:
                pid = int(fn.split('/')[2])
            except ValueError:
                ## /proc/self and friends
                continue
            try:
                with open(fn, 'r') as oom_score_file:
                    oom_score = int(oom_score_file.readline())
            except (FileNotFoundError, ProcessLookupError, ValueError):
                ## The process exited between the glob and the read.
                ## Depending on timing this shows up as a missing
                ## file, a failing read or an empty line.
                continue
            stats = self.readStat(pid)
            if not stats:
                continue
            if 'T' in stats.state:
                logging.debug("oom_score: %s, cmd: %s, pid: %s, state: %s - no touch" % (oom_score, stats.cmd, pid, stats.state))
                continue
            if oom_score > 0:
                logging.debug("oom_score: %s, cmd: %s, pid: %s" % (oom_score, stats.cmd, pid))
                if stats.cmd in config.cmd_whitelist:
                    logging.debug("whitelisted process %s %s %s" % (pid, stats.cmd, oom_score))
                    oom_score /= config.whitelist_score_divider
                if stats.cmd in config.cmd_blacklist:
                    oom_score *= config.blacklist_score_multiplier
                if oom_score > worst_score:
                    ## ignore self
                    if pid in own_pids:
                        continue
                    worst_score = oom_score
                    worstpid = (pid, stats.ppid)
        logging.debug("oom scan completed - selected pid: %s" % (worstpid and worstpid[0]))
        if worstpid is None:
            return None
        return self.checkParents(*worstpid)
class LastFrozenProcessSelector(ProcessSelector):
    """Selector that simply proposes the most recently unfrozen
    process (group) for freezing again.

    Rationale: if the system starts thrashing right after a process
    was resumed, that process is a likely culprit - and looking it up
    is very cheap.  The downside is that the same process may end up
    being selected over and over again.
    """
    def __init__(self):
        self.last_unfrozen_pid = None

    def update(self, prev, cur):
        """Remember the pid group unfrozen in the current iteration, if any."""
        if cur.unfrozen_pid:
            self.last_unfrozen_pid = cur.unfrozen_pid

    def scan(self):
        """
        Return the last unfrozen pid group, or None if there is none,
        if it is already in frozen_pids, or if none of its processes
        exist anymore.
        """
        candidate = self.last_unfrozen_pid
        logging.debug("last unfrozen_pid is %s" % candidate)
        if candidate in frozen_pids:
            logging.debug("last unfrozen_pid is already frozen")
            return None
        logging.debug("last unfrozen process return - selected pid: %s" % candidate)
        ## it may have exited already, in that case we should purge the record
        if candidate and not any(self.readStat(pid) for pid in candidate):
            self.last_unfrozen_pid = None
        return self.last_unfrozen_pid
class PageFaultingProcessSelector(ProcessSelector):
    """
    Selects the process that have had most page faults since previous
    run.  This method have two problems; it is relatively expensive in
    terms of memory usage since it needs to keep counts of the page
    faults for every process, secondly, "page fault" is not equivalent
    with "swap".  (When a process is started, loading the program code
    into memory is usually postponed - when pages that aren't loaded
    yet are needed, it's also a "page fault")
    """
    def __init__(self):
        ## pid -> major page fault count seen at the previous scan
        self.pagefault_by_pid = {}
        self.cooldown_counter = 0

    def update(self, prev, cur):
        """
        Copy the cooldown counter from the current state and refresh
        the per-process counters if the system-wide number of major
        page faults grew by more than config.pgmajfault_scan_threshold.
        """
        self.cooldown_counter = cur.cooldown_counter
        if cur.pagefaults - prev.pagefaults > config.pgmajfault_scan_threshold:
            ## If we've had a lot of major page faults, refresh our state
            ## on major page faults.
            self.scan()

    def scan(self):
        """
        Walk through /proc/*/stat, update the stored page fault
        counters and return the pid group (as given by checkParents)
        of the process whose counter grew the most, or None if the
        growth is below the threshold.

        Stopped processes, this process and its parent (the latter for
        consistency with the oom_score based selector) are never
        selected.
        """
        own_pids = (getpid(), getppid())
        worst_score = 0
        worstpid = None
        live_pids = set()
        for fn in glob.glob('/proc/*/stat'):
            try:
                pid = int(fn.split('/')[2])
            except ValueError:
                ## /proc/self and friends
                continue
            stats = self.readStat(fn)
            if not stats:
                continue
            live_pids.add(pid)
            if stats.majflt > 0 and 'T' not in stats.state:
                prev = self.pagefault_by_pid.get(pid, 0)
                self.pagefault_by_pid[pid] = stats.majflt
                diff = stats.majflt - prev
                if config.test_mode:
                    diff += random.getrandbits(3)
                if not diff:
                    continue
                if stats.cmd in config.cmd_blacklist:
                    diff *= config.blacklist_score_multiplier
                if stats.cmd in config.cmd_whitelist:
                    logging.debug("whitelisted process %s %s %s" % (pid, stats.cmd, diff))
                    diff /= config.whitelist_score_divider
                if diff > worst_score:
                    ## ignore self
                    if pid in own_pids:
                        continue
                    worst_score = diff
                    worstpid = (pid, stats.ppid)
                    logging.debug("pagefault score: %s, cmd: %s, pid: %s" % (diff, stats.cmd, pid))

        ## Garbage collection: forget processes that no longer exist,
        ## otherwise the dict grows for as long as the daemon runs.
        ## Stopped processes are still listed in /proc, so their
        ## counters are kept.
        for gone in [p for p in self.pagefault_by_pid if p not in live_pids]:
            del self.pagefault_by_pid[gone]

        logging.debug("pagefault scan completed - selected pid: %s" % (worstpid and worstpid[0]))
        ## give a bit of protection against whitelisted and innocent processes being stopped
        ## (TODO: hardcoded constants)
        if worst_score > 4.0 / (self.cooldown_counter + 1.0):
            return self.checkParents(*worstpid)
        return None
class GlobalProcessSelector(ProcessSelector):
    """
    Round-robin over the individual process selectors.
    """
    def __init__(self):
        ## sorted from cheap to expensive. Also, it is surely smart to be quick on refreezing a recently unfrozen process if host starts thrashing again.
        self.collection = [LastFrozenProcessSelector(), OOMScoreProcessSelector(), PageFaultingProcessSelector()]
        self.scan_method_count = 0

    def update(self, prev, cur):
        """
        Forward the update to every selector.  Whenever something was
        unfrozen, the next scan starts over with the first selector.
        """
        if cur.unfrozen_pid:
            self.scan_method_count = 0
        for selector in self.collection:
            selector.update(prev, cur)

    def scan(self):
        """
        Ask the selectors, continuing where the previous scan stopped,
        until one of them finds something.  Gives up (returning None)
        after each selector has been asked once.
        """
        logging.debug("scan_processes")
        ## one attempt per selector, so that we fall back on the next method if a method fails to find anything.
        for _ in range(0, len(self.collection)):
            index = self.scan_method_count % len(self.collection)
            logging.debug("scan method: %s" % index)
            ret = self.collection[index].scan()
            self.scan_method_count += 1
            if ret:
                return ret
        logging.debug("found nothing to stop!? :-(")
def get_date_string():
    """
    Return the current time as a string for the log file - either
    formatted as "YYYY-MM-DD HH:MM:SS" or, if
    config.date_human_readable is false, as a UNIX timestamp.
    """
    if not config.date_human_readable:
        return str(time.time())
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
## returns string with detailed process information
def get_process_info(pid):
    """
    Return a one-line description (user, CPU%, MEM%, command) of the
    process `pid`, as reported by `ps`.

    This never raises for ordinary failures: if `ps` can't be run, or
    the process is gone, a fixed explanatory string is returned
    instead.
    """
    try:
        ## check_output is only available from 2.7, and compatibility
        ## with 2.6 is currently a requirement.
        ## TODO: move the import back where it belongs, eventually.
        from subprocess import check_output
        ## TODO: we should fetch this information from /proc filesystem instead of using ps
        ## An argument list is used rather than a shell command line - no
        ## point in spawning an extra shell on a host short of memory.
        ## The %d formatting makes sure that only integers reach ps.
        info = check_output(["ps", "-p", "%d" % pid, "uf"]).decode('utf-8', 'ignore')
        ## first line is the header, the second line is the process
        fields = info.split('\n')[1].split()
        if len(fields) >= 4:
            return "u:%10s CPU:%5s%% MEM:%5s%% CMD: %s" % (fields[0], fields[2], fields[3], ' '.join(fields[10:]))
        return "No information available, the process was probably killed or 'ps' returns unexpected output."
    except Exception:
        ## (not a bare except - KeyboardInterrupt and SystemExit should pass through)
        logging.error("Could not fetch process user information, the process is probably gone")
        return "problem fetching process information"
def ignore_failure(method):
    """
    Decorator for best-effort operations: any exception raised by
    `method` is logged (with traceback) at critical level and then
    dropped.

    The wrapper returns whatever `method` returns, or None if it
    failed.  KeyboardInterrupt and SystemExit are not intercepted.
    """
    from functools import wraps

    @wraps(method)
    def _try_except_pass(*args, **kwargs):
        try:
            return method(*args, **kwargs)
        except Exception:
            logging.critical("Exception ignored", exc_info=True)
            return None
    return _try_except_pass
## hard coded logic as for now. One state file and one log file.
## state file can be monitored, i.e. through nagios. todo: advanced logging
@ignore_failure
def log_frozen(pid):
    """
    Append a "frozen" entry for `pid` to the log file and rewrite the
    state file with the pids currently in frozen_pids.
    """
    with open("/var/log/thrash-protect.log", 'ab') as logfile:
        if config.log_user_data_on_freeze:
            entry = "%s - frozen pid %5s - %s - list: %s\n" % (get_date_string(), str(pid), get_process_info(pid), frozen_pids)
        else:
            entry = "%s - frozen pid %s - frozen list: %s\n" % (get_date_string(), pid, frozen_pids)
        logfile.write(entry.encode("utf-8"))

    ## NOTE(review): fixed, predictable file name in /tmp
    with open("/tmp/thrash-protect-frozen-pid-list", "w") as statefile:
        ## all pids of all groups on one line, separated by spaces
        groups = [" ".join(str(member) for member in pid_group) for pid_group in frozen_pids]
        statefile.write(" ".join(groups) + "\n")
@ignore_failure
def log_unfrozen(pid):
    """
    Append an "unfrozen" entry for `pid` to the log file and bring
    the state file up to date - it is rewritten while frozen_pids is
    non-empty and removed otherwise.
    """
    with open("/var/log/thrash-protect.log", 'ab') as logfile:
        if config.log_user_data_on_unfreeze:
            entry = "%s - unfrozen pid %5s - %s - list: %s\n" % (get_date_string(), str(pid), get_process_info(pid), frozen_pids)
        else:
            entry = "%s - unfrozen pid %s\n" % (get_date_string(), pid)
        logfile.write(entry.encode('utf-8'))

    if not frozen_pids:
        ## nothing is frozen anymore, so the state file should go away
        try:
            unlink("/tmp/thrash-protect-frozen-pid-list")
        except (FileNotFoundError, OSError):
            pass
        return

    with open("/tmp/thrash-protect-frozen-pid-list", "w") as statefile:
        ## all pids of all groups on one line, separated by spaces
        groups = [" ".join(str(member) for member in pid_group) for pid_group in frozen_pids]
        statefile.write(" ".join(groups) + "\n")
def _debug_check_state(pid, should_be_suspended=False):
    """
    Debug aid - log a warning if the observed state of process `pid`
    does not agree with `should_be_suspended`.

    A process counts as suspended when its state field contains "T".
    A process that does not exist is only reported if it was expected
    to be suspended.
    """
    procstate = ProcessSelector().readStat(pid)
    if not procstate:
        if should_be_suspended:
            ## (logging.warn is a deprecated alias of logging.warning)
            logging.warning("Pid %s should be suspended, but is gone" % pid)
        return
    is_suspended = 'T' in procstate.state
    if is_suspended != should_be_suspended:
        logging.warning("Pid %s - state: %s, should_be_suspended: %s - mismatch" % (pid, procstate, should_be_suspended))
## The state checking costs an extra read from /proc for every signal
## sent, so it's replaced by a no-op unless explicitly enabled.
debug_check_state = _debug_check_state if config.debug_checkstate else (lambda a,b: None)
def freeze_something(pids_to_freeze=None):
    """
    Suspend (SIGSTOP) the given pid or pid group.  If nothing is
    given, the global process selector is asked for a candidate.

    The group is recorded in frozen_pids and every pid in it is
    logged.  Returns the group that was handled, or an empty tuple if
    nothing was done.
    """
    global frozen_pids
    global global_process_selector
    if not pids_to_freeze:
        pids_to_freeze = global_process_selector.scan()
    if not pids_to_freeze:
        ## process disappeared. ignore failure
        logging.info("nothing to freeze found, or the process we were going to suspend has already exited")
        return ()
    ## a single pid is handled as a group of one
    if not hasattr(pids_to_freeze, '__iter__'):
        pids_to_freeze = (pids_to_freeze,)
    if getpid() in pids_to_freeze:
        logging.error("Oups. Own pid is next on the list of processes to freeze. This is very bad. Skipping.")
        return ()

    for victim in pids_to_freeze:
        try:
            debug_check_state(victim, 0)
            kill(victim, signal.SIGSTOP)
            if len(pids_to_freeze)>1:
                time.sleep(config.max_acceptable_time_delta/3)
        except ProcessLookupError:
            ## already gone, carry on with the next one
            continue

    if pids_to_freeze not in frozen_pids:
        frozen_pids.append(pids_to_freeze)

    ## Logging after freezing - as logging itself may be resource- and timeconsuming.
    ## Perhaps we should even fork it out.
    for victim in pids_to_freeze:
        logging.debug("froze pid %s" % str(victim))
        log_frozen(victim)
    return pids_to_freeze
def unfreeze_something():
    """
    Take one pid group out of frozen_pids and resume (SIGCONT) its
    processes, in reverse order.

    Returns the list of pids that were handled, or None if nothing
    was frozen.
    """
    global frozen_pids
    global num_unfreezes
    if not frozen_pids:
        return None

    ## queue or stack? Seems like both approaches are problematic
    ## ... so we take mostly from the end of the list, but from the
    ## start on every config.unfreeze_pop_ratio'th call
    if num_unfreezes % config.unfreeze_pop_ratio:
        group = frozen_pids.pop()
    else:
        group = frozen_pids.pop(0)

    ## the group can be both numeric and tuple
    if hasattr(group, '__iter__'):
        pids_to_unfreeze = list(group)
    else:
        pids_to_unfreeze = [group]
    logging.debug("pids to unfreeze: %s" % pids_to_unfreeze)

    for pid in reversed(pids_to_unfreeze):
        try:
            logging.debug("going to unfreeze %s" % str(pid))
            debug_check_state(pid, 1)
            kill(pid, signal.SIGCONT)
            if len(pids_to_unfreeze)>1:
                time.sleep(config.max_acceptable_time_delta)
        except ProcessLookupError:
            ## ignore failure
            pass
        log_unfrozen(pid)
    num_unfreezes += 1
    return pids_to_unfreeze
def _lock_process_memory():
    """
    Best-effort attempt on running mlockall().  Returns True if the
    memory of this process got locked, False otherwise.
    """
    try:
        import ctypes
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
        ## First try with flags 7, then fall back to 3 if that is refused.
        ## (presumably 7 is MCL_CURRENT|MCL_FUTURE|MCL_ONFAULT and 3 is
        ## the same without MCL_ONFAULT - verify against mlockall(2))
        for flags in (7, 3):
            if libc.mlockall(ctypes.c_int(flags)) == 0:
                return True
    except Exception:
        pass
    return False

def thrash_protect(args=None):
    """
    The main loop - never returns.

    In every iteration the system state is sampled; if swap activity
    is above the threshold something gets frozen, and if the cooldown
    counter has reached zero something gets unfrozen.  `args` is
    currently unused.
    """
    current = SystemState()
    global frozen_pids
    global global_process_selector

    ## The call is done with an explicit check of the return value -
    ## not inside an assert, as assert statements (including the call
    ## inside them) are dropped when python runs with -O.
    if not _lock_process_memory():
        logging.warning("failed to do mlockall() - this makes the program vulnerable of being swapped out in an extreme thrashing event (maybe you're not running the script as root?)", exc_info=False)

    while True:
        prev = current
        current = SystemState()
        busy = current.check_swap_threshold(prev)

        ## If we're thrashing, then freeze something.
        if busy:
            freeze_something()
        elif not current.cooldown_counter:
            ## If no swapping has been observed for a while then unfreeze something.
            current.unfrozen_pid = unfreeze_something()

        global_process_selector.update(prev, current)

        ## Sleep only if we're not busy and the iteration itself was
        ## quick enough - otherwise go straight to the next check.
        if current.check_delay() and not busy:
            sleep_interval = current.get_sleep_interval()
            logging.debug("going to sleep %s" % sleep_interval)
            time.sleep(sleep_interval)
            current.check_delay(sleep_interval)
def unfreeze_from_tmpfile(pidfile_path="/tmp/thrash-protect-frozen-pid-list"):
    """
    Cleanup - unfreezing pids from last run, if applicable

    this may arguably be harmful, if box has been rebooted, or long
    time has passed, and the pidfile actually contains processes that
    should be frozen.  At the other hand, if thrash-protect dies for
    any reason, and is instantly restarted by systemd, it's probably a
    good thing to start fresh from scratch.  (or maybe the system will
    go insta-thrashed, that would be quite bad indeed).

    `pidfile_path` is the file holding the whitespace separated pids.
    A missing file is silently ignored.  Every pid is handled on its
    own, so that one stale or invalid entry neither aborts the startup
    nor leaves the remaining processes frozen.
    """
    try:
        with open(pidfile_path, "r") as pidfile:
            logging.info("cleaning up - unfreezing pids from last run")
            pids_to_open = pidfile.read()
    except FileNotFoundError:
        return

    for token in pids_to_open.split():
        try:
            pid = int(token)
        except ValueError:
            logging.warning("ignoring invalid entry %r in %s" % (token, pidfile_path))
            continue
        if pid <= 0:
            ## For kill(), zero and negative numbers address whole
            ## process groups (or every process).  The file lives in
            ## /tmp, so its content can't be fully trusted.
            logging.warning("ignoring invalid pid %s in %s" % (pid, pidfile_path))
            continue
        try:
            kill(pid, signal.SIGCONT)
        except OSError as err:
            ## most likely the process has exited since the last run
            logging.info("could not unfreeze pid %s: %s" % (pid, err))
def cleanup():
    """
    Clean up if exiting due to an exception: resume everything in
    frozen_pids (each group in reverse order) and remove the state
    file.
    """
    global frozen_pids
    pids = (pid for pid_group in frozen_pids for pid in reversed(pid_group))
    for pid in pids:
        try:
            kill(pid, signal.SIGCONT)
        except ProcessLookupError:
            ## already gone - nothing to resume
            pass

    try:
        unlink("/tmp/thrash-protect-frozen-pid-list")
    except FileNotFoundError:
        pass
## Globals ... we've refactored most of them away, but some still remain ...
## frozen_pids: the pid groups currently suspended by us.
## freeze_something() appends to it, unfreeze_something() pops from it.
frozen_pids = []
## num_unfreezes: counts the unfreeze operations - used by
## unfreeze_something() for choosing which end of frozen_pids to pop from.
num_unfreezes = 0
## A singleton ... the selector used by freeze_something() for finding candidates
global_process_selector = GlobalProcessSelector()
def main():
    """
    Entry point: parse the command line, resume leftovers from a
    previous run and enter the main loop.  Everything frozen by us
    gets resumed on the way out, no matter how the loop ends.
    """
    ## Parsing arguments (TODO: none provided as for now. The
    ## configuration passed through environment should also be
    ## possible to pass through parameters)
    try:
        import argparse
        parser = argparse.ArgumentParser(description="protect a linux host from thrashing")
        parser.add_argument('--version', action='version', version='%(prog)s ' + __version__)
        args = parser.parse_args()
    except ImportError:
        ## argparse is only available from 2.7 and up
        args = None

    unfreeze_from_tmpfile()

    try:
        thrash_protect(args)
    finally:
        cleanup()
if __name__ == '__main__':
    ## debug messages are only emitted when asked for through the environment
    if config.debug_logging:
        logging.root.setLevel(logging.DEBUG)
    main()