From a6211707f904f2a4eefc7fc11548351034cc8bd9 Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Thu, 12 Sep 2024 09:58:30 -0700 Subject: [PATCH] Using kernel cgroup.kill and cgroup.freeze in kill plugins Summary: Commit [2 / 3] Replaces manual SIGKILLing of cgroups with the kernel killer (cgroup.kill) and cgroup freezer (cgroup.kill) to prevent proc growth once a cgroup has been selected to be sacrificed. Reviewed By: lnyng Differential Revision: D62256368 fbshipit-source-id: cc96edf6d38fa03dad136082d21d658e6404d4b5 --- src/oomd/plugins/BaseKillPlugin.cpp | 88 +++++++++++++++++++++++----- src/oomd/plugins/BaseKillPlugin.h | 8 ++- src/oomd/plugins/CorePluginsTest.cpp | 10 +++- src/oomd/util/Fs.cpp | 26 ++++++++ src/oomd/util/Fs.h | 6 ++ 5 files changed, 121 insertions(+), 17 deletions(-) diff --git a/src/oomd/plugins/BaseKillPlugin.cpp b/src/oomd/plugins/BaseKillPlugin.cpp index 536aa3c6..61c98c29 100644 --- a/src/oomd/plugins/BaseKillPlugin.cpp +++ b/src/oomd/plugins/BaseKillPlugin.cpp @@ -70,6 +70,7 @@ int BaseKillPlugin::init( argParser_.addArgument("dry", dry_); argParser_.addArgument("always_continue", alwaysContinue_); argParser_.addArgument("debug", debug_); + argParser_.addArgument("kernelkill", kernelKill_); if (!argParser_.parse(args)) { return 1; @@ -443,6 +444,25 @@ int BaseKillPlugin::getAndTryToKillPids(const CgroupContext& target) { return nrKilled; } +int BaseKillPlugin::kernelKillCgroup(const CgroupContext& target) { + if (!Fs::writeKillAt(target.fd())) { + OLOG << "Failed to write to cgroup.kill for " + << target.cgroup().absolutePath(); + return 1; + } + return 0; +} + +int BaseKillPlugin::freezeCgroup(const CgroupContext& target, int freeze) { + if (!Fs::writeFreezeAt(target.fd(), freeze)) { + OLOG << "Failed to write to cgroup.freeze for " + << target.cgroup().absolutePath(); + return 1; + } + + return 0; +} + int BaseKillPlugin::dumpMemoryStat(const CgroupContext& target) { auto stats = Fs::readFileByLine(Fs::Fd::openat(target.fd(), Fs::kMemStatFile)); @@ -474,7 +494,12 @@ int BaseKillPlugin::tryToKillCgroup( return true; } - OLOG << "Trying to kill " << cgroupPath; + if (kernelKill_) { + OLOG << "Trying to kill " << cgroupPath + << " with cgroup.freeze and cgroup.kill"; + } else { + OLOG << "Trying to kill " << cgroupPath; + } if (dumpMemoryStat(target)) { OLOG << "Failed to open " << cgroupPath << "/memory.stat"; @@ -483,26 +508,61 @@ int BaseKillPlugin::tryToKillCgroup( reportKillUuidToXattr(cgroupPath, killUuid); reportKillInitiationToXattr(cgroupPath); - while (tries--) { - // Descendent cgroups created during killing will be missed because - // getAndTryToKillPids reads cgroup children from OomdContext's cache + if (kernelKill_) { + if (freezeCgroup(target, 1)) { + return 0; + } - nrKilled += getAndTryToKillPids(target); + while (tries--) { + auto procsBeforeKill = Fs::readPidsCurrentAt(target.fd()); - if (nrKilled == lastNrKilled) { - break; + if (kernelKillCgroup(target)) { + // Killing failed, so we report the number of processes + // killed up to this point. + return nrKilled; + } + + auto procsAfterKill = Fs::readPidsCurrentAt(target.fd()); + + if (!procsBeforeKill || !procsAfterKill) { + OLOG << "Failed to read pids from " << target.cgroup().absolutePath(); + return nrKilled; + } + + if (procsAfterKill.value() == 0 || + procsBeforeKill.value() == procsAfterKill.value()) { + break; + } + + nrKilled += procsBeforeKill.value() - procsAfterKill.value(); } - // Give it a breather before killing again - // - // Don't sleep after the first round of kills b/c the majority of the - // time the sleep isn't necessary. The system responds fast enough. - if (lastNrKilled) { - std::this_thread::sleep_for(1s); + if (freezeCgroup(target, 0)) { + return nrKilled; } + } else { + while (tries--) { + // Descendent cgroups created during killing will be missed because + // getAndTryToKillPids reads cgroup children from OomdContext's cache - lastNrKilled = nrKilled; + nrKilled += getAndTryToKillPids(target); + + if (nrKilled == lastNrKilled) { + break; + } + + // Give it a breather before killing again + // + // Don't sleep after the first round of kills b/c the majority of the + // time the sleep isn't necessary. The system responds fast enough. + if (lastNrKilled) { + std::this_thread::sleep_for(1s); + } + + lastNrKilled = nrKilled; + } } + reportKillCompletionToXattr(cgroupPath, nrKilled); return nrKilled; } diff --git a/src/oomd/plugins/BaseKillPlugin.h b/src/oomd/plugins/BaseKillPlugin.h index 0a9cc327..6fe71487 100644 --- a/src/oomd/plugins/BaseKillPlugin.h +++ b/src/oomd/plugins/BaseKillPlugin.h @@ -77,6 +77,8 @@ class BaseKillPlugin : public Engine::BasePlugin { } } + int getAndTryToKillPids(const CgroupContext& target); + protected: /* * Required implementation point for kill plugins @@ -192,9 +194,10 @@ class BaseKillPlugin : public Engine::BasePlugin { virtual bool pastPrekillHookTimeout(const OomdContext& ctx) const; virtual int dumpMemoryStat(const CgroupContext& target); - private: - virtual int getAndTryToKillPids(const CgroupContext& target); + virtual int freezeCgroup(const CgroupContext& target, int freeze); + virtual int kernelKillCgroup(const CgroupContext& target); + private: enum class KillResult { SUCCESS, FAILED, @@ -249,6 +252,7 @@ class BaseKillPlugin : public Engine::BasePlugin { bool dry_{false}; bool alwaysContinue_{false}; bool debug_{false}; + bool kernelKill_{false}; struct ActivePrekillHook { public: diff --git a/src/oomd/plugins/CorePluginsTest.cpp b/src/oomd/plugins/CorePluginsTest.cpp index 50129534..c2c32742 100644 --- a/src/oomd/plugins/CorePluginsTest.cpp +++ b/src/oomd/plugins/CorePluginsTest.cpp @@ -15,6 +15,7 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ +#include #include #include @@ -48,11 +49,18 @@ std::unique_ptr createPlugin(const std::string& name) { namespace Oomd { class BaseKillPluginMock : public BaseKillPlugin { public: + /* + * Since we are just simulating killing, we don't need to freeze the cgroup + */ + int freezeCgroup(const CgroupContext& target, int freeze) override { + return 0; + } + /* * We don't actually need to dump memory.stat since we won't * actually be killing any live processes */ - int dumpMemoryStat(const CgroupContext&) override { + int dumpMemoryStat(const CgroupContext& target) override { return 0; } diff --git a/src/oomd/util/Fs.cpp b/src/oomd/util/Fs.cpp index 9423b213..74e2b89b 100644 --- a/src/oomd/util/Fs.cpp +++ b/src/oomd/util/Fs.cpp @@ -594,6 +594,14 @@ SystemMaybe Fs::readSwapMaxAt(const DirFd& dirfd) { return ret; } +SystemMaybe Fs::readPidsCurrentAt(const DirFd& dirfd) { + auto line = Fs::readFileByLine(Fs::Fd::openat(dirfd, Fs::kPidsCurr)); + if (!line) { + return SYSTEM_ERROR(line.error()); + } + return std::stoll((*line)[0]); +} + SystemMaybe> Fs::getVmstat( const std::string& path) { auto lines = readFileByLine(path); @@ -774,6 +782,24 @@ SystemMaybe Fs::writeMemReclaimAt( return noSystemError(); } +SystemMaybe Fs::writeFreezeAt(const DirFd& dirfd, int freeze) { + auto val_str = std::to_string(freeze); + auto ret = + writeControlFileAt(Fs::Fd::openat(dirfd, kCgroupFreeze, false), val_str); + if (!ret) { + return SYSTEM_ERROR(ret.error()); + } + return noSystemError(); +} + +SystemMaybe Fs::writeKillAt(const DirFd& dirfd) { + auto ret = writeControlFileAt(Fs::Fd::openat(dirfd, kCgroupKill, false), "1"); + if (!ret) { + return SYSTEM_ERROR(ret.error()); + } + return noSystemError(); +} + SystemMaybe Fs::getNrDyingDescendantsAt(const DirFd& dirfd) { auto lines = readFileByLine(Fs::Fd::openat(dirfd, kCgroupStatFile)); if (!lines) { diff --git a/src/oomd/util/Fs.h b/src/oomd/util/Fs.h index 1198991c..e712770b 100644 --- a/src/oomd/util/Fs.h +++ b/src/oomd/util/Fs.h @@ -37,6 +37,7 @@ class Fs { static constexpr auto kSubtreeControlFile = "cgroup.subtree_control"; static constexpr auto kProcsFile = "cgroup.procs"; static constexpr auto kEventsFile = "cgroup.events"; + static constexpr auto kCgroupFreeze = "cgroup.freeze"; static constexpr auto kMemCurrentFile = "memory.current"; static constexpr auto kMemPressureFile = "memory.pressure"; static constexpr auto kMemLowFile = "memory.low"; @@ -58,6 +59,8 @@ class Fs { static constexpr auto kOomdUserPreferXAttr = "user.oomd_prefer"; static constexpr auto kOomdSystemAvoidXAttr = "trusted.oomd_avoid"; static constexpr auto kOomdUserAvoidXAttr = "user.oomd_avoid"; + static constexpr auto kPidsCurr = "pids.current"; + static constexpr auto kCgroupKill = "cgroup.kill"; struct DirEnts { std::vector dirs; @@ -225,6 +228,7 @@ class Fs { static SystemMaybe readIopressureAt( const DirFd& dirfd, PressureType type = PressureType::FULL); + static SystemMaybe readPidsCurrentAt(const DirFd& dirfd); static SystemMaybe writeMemhighAt(const DirFd& dirfd, int64_t value); static SystemMaybe writeMemhightmpAt( @@ -235,6 +239,8 @@ class Fs { const DirFd& dirfd, int64_t value, std::optional swappiness); + static SystemMaybe writeFreezeAt(const DirFd& dirfd, int value); + static SystemMaybe writeKillAt(const DirFd& dirfd); static SystemMaybe getNrDyingDescendantsAt(const DirFd& dirfd); static SystemMaybe readKillPreferenceAt(const DirFd& path);