Skip to content

Commit

Permalink
cfgen: disable senpai on host that use SanDisk SD7SN6S256G
Browse files Browse the repository at this point in the history
Summary: more patch :(

Differential Revision: D58306079

fbshipit-source-id: d78f21cbe469b2897bdf6064eb09093043181e25
  • Loading branch information
Chengxiong Ruan authored and facebook-github-bot committed Jun 7, 2024
1 parent 57b302d commit d53ec09
Show file tree
Hide file tree
Showing 5 changed files with 293 additions and 1 deletion.
7 changes: 6 additions & 1 deletion src/oomd/cfgen/src/cfgen.rs
Original file line number Diff line number Diff line change
Expand Up @@ -841,6 +841,10 @@ fn on_ssd(node: &Node) -> bool {
node.storage().has_ssd_root()
}

/// Returns the result of asking the node's storage information whether it
/// has a disk whose model string is "SanDisk SD7SN6S256G".
fn on_sandisk_sd7_sn6_s2(node: &Node) -> bool {
    let storage = node.storage();
    storage.has_disk_model("SanDisk SD7SN6S256G")
}

fn io_latency_supported(node: &Node) -> bool {
// Historically, we set this to `false` when:
// 1. the host has file `/sys/fs/cgroup/io.cost.qos`
Expand Down Expand Up @@ -892,7 +896,8 @@ fn fbtax2_blacklisted_jobs(node: &Node) -> Vec<&'static str> {
}

fn senpai_targets(node: &Node) -> Option<String> {
if !on_ssd(node) {
// Replicating this logic : https://fburl.com/code/1o9lw85h
if !on_ssd(node) || on_sandisk_sd7_sn6_s2(node) {
return None;
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
@generated SignedSource<<abf5163c81fbfcbc1599a3f7ec339d4b>>
@codegen-command arc cfgen update-inputs fb-oomd
{
"fqdn": "twshared16157.35.frc1.facebook.com",
"region": "carolina",
"clusterType": "SERVICE_GENERIC_NON_MEMCACHE",
"modelId": 56441,
"kernelRelease": "6.4.3-0_fbk11_2563_g44ec95f7d7f4",
"serverType": "TYPE_I_WEB",
"experiments": [],
"cpuArchitecture": "broadwellde",
"metalosRootfs": true,
"provisioningConfig": {
"ethtoolByInterface": {
"eth0": {
"maxChannelsCombined": 32
}
},
"cpuCoreCount": 16,
"parentModelId": 56442,
"recoveryEnvironment": false,
"deviceType": "SERVER",
"datacenter": "frc1",
"cluster": "35",
"memTotal": 33468424192,
"osVersion": {
"distribution_name": "CentOS Stream",
"version": 9,
"is_in_ramdisk": false,
"is_metalos": true,
"metalos_variant": "MclassicA"
},
"pciByAddress": {
"0000:04:00.0": {
"vendor_id": 5555,
"device_id": 4117,
"class_code": 131072,
"board_part_number": "MCX4431M-GCAN_FB"
}
},
"static_smc_tiers": [],
"machine": "x86_64"
},
"bootConfig": {
"ethtoolByInterface": {
"eth0": {
"driver": "mlx5_core",
"driver_version": "6.4.3-0_fbk11_2563_g44ec95f7d7f",
"firmware_version": "14.27.2606 (FB_2510111032)",
"bus_info": "0000:04:00.0"
}
}
},
"runtimeConfig": {
"hasHighPrivCert": true,
"regionRoutableCluster": "frc3.23",
"metalos_apis": {
"EFI_MOUNT": 1,
"GENERATORS": 1,
"KDUMP_SUBVOL": 1,
"NETCONSOLE": 1,
"NETWORKD_ROUTES_IGNORE": 1,
"NETWORK_MAX_BUFFER_SIZE": 1,
"SERVICE_IMAGE_GC": 1,
"SERVICE_IMAGE_HELPER": 1,
"SYSTEMD_KEXEC_LOAD": 1,
"TMP_SUBVOL": 2,
"TW_RESCTRL_MOUNT": 1,
"TW_SUBVOL": 1
},
"block_devices": {
"block_devices": {
"sda": {
"size_bytes": 256060514304,
"is_rotational": false,
"model": "SanDisk SD7SN6S256G",
"serial": "151933400106",
"physical_block_size": 512,
"logical_block_size": 512,
"is_root": true
}
}
},
"dynamic_smc_tiers": [],
"cluster_state": "CLUSTER_IN_USE",
"installed_platforms": [
"platform009",
"platform010",
"platform010-compat"
],
"device_nics_enum": [
"ETH0",
"SVC0"
]
},
"reservationConfig": {
"active_machine_materialization_id": "64ba98d886d6b",
"current_reservation_host_profile_id": "KERNEL:kernel_6.4.3-0_fbk11_2563_g44ec95f7d7f4_base.v0|KERNEL_ARGS:kargs_consoleS057600.v1|BOOT_CONFIG:bootloader659_initrd201_swap16G_native_arch"
}
}
4 changes: 4 additions & 0 deletions src/oomd/cfgen/test/cfgen_test_manifest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ samples:
# If the query is too slow, just do serf get --fields name,storage_capacity,components[disk].is_rootdrive,components[disk].disk_obj.is_flash --limit 100 'hostnameScheme_obj.name=twshared'
# And then pick a record that has `components[disk].is_rootdrive=1` and `components[disk].disk_obj.is_flash=0`
production_host: twshared44829.07.ash9
twshared_senpai_disabled_sandisk:
# this host is a sample from the "wdb config monitor" scuba table
production_host: twshared16157.35.frc1.facebook.com
dns:
# this host is a sample from the "wdb config monitor" scuba table
production_host: dns05.17.prn3.facebook.com
Expand All @@ -36,4 +39,5 @@ samples:
# this host is a sample from the "wdb config monitor" scuba table
production_host: tw066.02.cln2.facebook.com
twpool_no_senpai:
# this host is a sample from the "wdb config monitor" scuba table
production_host: tw023.04.vll2.facebook.com
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
@generated SignedSource<<3dd8c7637bb7afa680fc168e9c49060d>>
@codegen-command arc cfgen update-outputs fb-oomd
[Service]
Environment=OOMD_ARGS='--interval 1 --config /etc/oomd2.json --drop-in-dir /run/oomd/dropin'

[Unit]
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
@generated SignedSource<<f257a21d0bc496d0529cf91334e33318>>
@codegen-command arc cfgen update-outputs fb-oomd
{
"rulesets": [
{
"name": "system overview",
"silence-logs": "engine",
"detectors": [
[
"records system stats",
{
"name": "dump_cgroup_overview",
"args": {
"cgroup": "workload.slice"
}
}
]
],
"actions": [
{
"name": "continue",
"args": {}
}
]
},
{
"name": "restart smc_proxy.service on memory threshold",
"detectors": [
[
"memory usage above",
{
"name": "memory_above",
"args": {
"cgroup": "smc_proxy.service",
"duration": "10",
"threshold_anon": "15G"
}
}
]
],
"actions": [
{
"name": "systemd_restart",
"args": {
"dry": "false",
"post_action_delay": "20",
"service": "smc_proxy.service"
}
}
]
},
{
"name": "protection against heavy workload thrashing",
"drop-in": {
"disable-on-drop-in": true,
"detectors": true,
"actions": true
},
"detectors": [
[
"sustained high workload memory pressure",
{
"name": "exists",
"args": {
"cgroup": "workload.slice/workload-tw.slice/sigrid_online_trainer*,workload.slice/workload-tw.slice/*.reservation.slice/sigrid_online_trainer*,workload.slice/workload-tw.slice/*.allotment.slice/sigrid_online_trainer*,workload.slice/workload-tw.slice/analyzer*,workload.slice/workload-tw.slice/*.reservation.slice/analyzer*,workload.slice/workload-tw.slice/*.allotment.slice/analyzer*,workload.slice/workload-tw.slice/bumblebee.*,workload.slice/workload-tw.slice/*.reservation.slice/bumblebee.*,workload.slice/workload-tw.slice/*.allotment.slice/bumblebee.*",
"negate": true
}
},
{
"name": "pressure_above",
"args": {
"cgroup": "workload.slice/workload-tw.slice",
"duration": "180",
"resource": "memory",
"threshold": "80"
}
},
{
"name": "memory_reclaim",
"args": {
"cgroup": "workload.slice/workload-tw.slice",
"duration": "10"
}
}
]
],
"actions": [
{
"name": "kill_by_pg_scan",
"args": {
"cgroup": "workload.slice/workload-tw.slice/*",
"recursive": "true"
}
}
]
},
{
"name": "protection against low swap",
"detectors": [
[
"free swap goes below 10 percent",
{
"name": "exists",
"args": {
"cgroup": "workload.slice/workload-tw.slice/sigrid_online_trainer*,workload.slice/workload-tw.slice/*.reservation.slice/sigrid_online_trainer*,workload.slice/workload-tw.slice/*.allotment.slice/sigrid_online_trainer*,workload.slice/workload-tw.slice/analyzer*,workload.slice/workload-tw.slice/*.reservation.slice/analyzer*,workload.slice/workload-tw.slice/*.allotment.slice/analyzer*,workload.slice/workload-tw.slice/bumblebee.*,workload.slice/workload-tw.slice/*.reservation.slice/bumblebee.*,workload.slice/workload-tw.slice/*.allotment.slice/bumblebee.*",
"negate": true
}
},
{
"name": "swap_free",
"args": {
"threshold_pct": "10"
}
}
]
],
"actions": [
{
"name": "kill_by_swap_usage",
"args": {
"biased_swap_kill": "true",
"cgroup": "system.slice/*,workload.slice/workload-wdb.slice/*,workload.slice/workload-tw.slice/*",
"recursive": "true"
}
}
]
},
{
"name": "senpai drop-in ruleset",
"silence-logs": "engine",
"drop-in": {
"disable-on-drop-in": true,
"actions": true
},
"detectors": [
[
"continue detector group",
{
"name": "continue",
"args": {}
}
]
],
"actions": [
{
"name": "continue",
"args": {}
}
]
},
{
"name": "tw_container drop-in ruleset",
"drop-in": {
"disable-on-drop-in": true,
"detectors": true,
"actions": true
},
"detectors": [
[
"continue",
{
"name": "stop",
"args": {}
}
]
],
"actions": [
{
"name": "continue",
"args": {}
}
],
"prekill_hook_timeout": "45"
}
],
"version": "1.0.0"
}

0 comments on commit d53ec09

Please sign in to comment.