-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
158 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
#!/usr/bin/env python | ||
import os | ||
import time | ||
import yaml | ||
import subprocess | ||
import oar.lib.tools as tools | ||
from oar.lib.configuration import Configuration | ||
from oar.lib.database import wait_db_ready | ||
from oar.lib.event import add_new_event_with_host | ||
from oar.lib.globals import get_logger, init_and_get_session, init_config, init_oar | ||
from oar.lib.node import get_nodes_with_given_sql, get_alive_nodes_with_jobs | ||
|
||
|
||
################### CUSTOMIZABLE PART #######################

# File where phoenix stores the states of the broken nodes.
# It is a YAML mapping: node name -> {"soft_reboot": <epoch seconds>} or
# {"hard_reboot": <epoch seconds>}.
PHOENIX_DBFILE = "/var/lib/oar/phoenix/oar_phoenix.db"

# Directory where logfiles are created in case of problems
# (the log itself is "<PHOENIX_LOGDIR>/oar_phoenix.log").
PHOENIX_LOGDIR = "/var/lib/oar/phoenix/"

# Command sent to reboot a node (first attempt).
# "{NODENAME}" is replaced by the node name before the command is run.
PHOENIX_SOFT_REBOOTCMD = "ssh -p 6667 {NODENAME} oardodo /sbin/reboot"

# Timeout (s) for a soft rebooted node to be considered hard rebootable
PHOENIX_SOFT_TIMEOUT = 60

# Command sent to reboot a node (second attempt).
# "{NODENAME}" is replaced by the node name before the command is run.
# NOTE(review): this command sleeps 30 s + 5 s, but send_cmd() kills the shell
# after PHOENIX_CMD_TIMEOUT (15 s). As configured, the sequence is interrupted
# after "power off" and the "power on" steps never run. Either raise
# PHOENIX_CMD_TIMEOUT above ~40 s or shorten the sleeps.
PHOENIX_HARD_REBOOTCMD = "oardodo ipmitool -I lanplus -U admin -f /etc/ipmipasswd -H {NODENAME}-bmc power off || true;sleep 30;oardodo ipmitool -I lanplus -U admin -f /etc/ipmipasswd -H {NODENAME}-bmc power on;sleep 5;oardodo ipmitool -I lanplus -U admin -f /etc/ipmipasswd -H {NODENAME}-bmc power on"

# Timeout (s) for a hard rebooted node to be considered really broken, then an email is sent
# NOTE(review): nothing in this script reads this value or sends an email yet.
PHOENIX_HARD_TIMEOUT = 60

# Max number of nodes selected per run for each kind of reboot (soft, hard).
# NOTE(review): the limit is applied separately to the soft and the hard
# selection, so one run may trigger up to twice this many reboots.
PHOENIX_MAX_REBOOTS = 20

# Timeout (s) for unix commands (applies to every command run by send_cmd)
PHOENIX_CMD_TIMEOUT = 15

# Properties of the broken nodes (SQL where clause)
PHOENIX_BROKEN_NODES = "state='Suspected' and network_address NOT IN (SELECT distinct(network_address) FROM resources where resource_id IN (SELECT resource_id FROM assigned_resources WHERE assigned_resource_index = 'CURRENT')) and network_address not like 'luke%' and network_address != 'dahu33' and network_address not like 'bigfoot%'"

################################################################################
|
||
# Function to get a DB session on OAR DB
def wait_db():
    """Open a session on the OAR database and wait until it answers.

    Reads the module-level ``config`` object, which the main script must
    have set before calling this function.

    Returns:
        The session object returned by ``init_and_get_session(config)``.

    Raises:
        SystemExit: with exit status 1 when the session cannot be created
            or the readiness check fails; the reason is written to stderr.
    """
    import sys  # local import: only this function needs it

    try:
        session = init_and_get_session(config)
        wait_db_ready(get_alive_nodes_with_jobs, args=[session])
    except Exception as e:
        # Top-level boundary: any failure here is fatal for the script.
        # sys.exit(<str>) prints the message to stderr and exits with status 1
        # (the bare exit() helper is provided by the site module and is meant
        # for interactive use).
        sys.exit(f"Failed to contact database: {e}")
    return session
|
||
# Function to send a unix command with timeout and log date in the logfile
def send_cmd(cmd, timeout=None):
    """Run *cmd* through the shell and append its output to the phoenix log.

    Args:
        cmd: Shell command line to execute (run with ``shell=True``).
        timeout: Maximum number of seconds to wait for the command.
            ``None`` (the default) means ``PHOENIX_CMD_TIMEOUT``.

    Returns:
        ``None`` when the command ran to completion, ``"Timed out!"`` when
        it was killed after *timeout* seconds, or
        ``"Exception occurred: ..."`` when anything else failed (including
        opening the logfile).
    """
    try:
        if timeout is None:
            timeout = PHOENIX_CMD_TIMEOUT
        # Timestamp of the moment the command is started.
        current_time = time.strftime("%Y-%m-%d %H:%M:%S")
        with open(f"{PHOENIX_LOGDIR}/oar_phoenix.log", "a") as logfile:
            # Popen used as a context manager: on every exit path the pipes
            # are closed and the child is waited for, so a killed shell does
            # not linger as a zombie. stderr is merged into stdout so error
            # messages are logged even when the command also printed output.
            with subprocess.Popen(
                cmd,
                shell=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
            ) as process:
                try:
                    output, _ = process.communicate(timeout=timeout)
                except subprocess.TimeoutExpired:
                    # Only the shell itself is killed; the remaining steps of
                    # a compound command line are therefore never executed.
                    process.kill()
                    logfile.write(
                        f"{current_time} - Command timed out after {timeout}s: {cmd}\n"
                    )
                    return "Timed out!"
            # errors="replace": undecodable bytes must not prevent logging.
            res = output.decode("utf-8", errors="replace") if output else ""
            if res:
                logfile.write(f"{current_time} - {res}\n")
            else:
                logfile.write(f"{current_time} - Command executed, no output\n")
        return None
    except Exception as e:
        # Best effort: report the failure to the caller instead of crashing.
        return f"Exception occurred: {str(e)}"
|
||
# Load the DB file
def load_db(file):
    """Read the phoenix state database from the YAML file *file*.

    Args:
        file: Path of the YAML file to read.

    Returns:
        The parsed YAML content, or an empty dict when the document is
        empty (``yaml.safe_load`` yields ``None`` in that case, which the
        callers cannot index or iterate).
    """
    with open(file, "r") as yamlfile:
        return yaml.safe_load(yamlfile) or {}
|
||
# Export DB to file
def save_db(file, ref):
    """Write the phoenix state database *ref* to the YAML file *file*.

    Args:
        file: Path of the YAML file to (over)write.
        ref: The data to serialize (the in-memory state mapping).
    """
    with open(file, "w") as yamlfile:
        # safe_dump mirrors the safe_load used when reading: anything that
        # could not be read back is rejected here instead of being written.
        yaml.safe_dump(ref, yamlfile)
|
||
# Initialize DB file
def init_db(file):
    """Make sure the state database *file* exists and holds a YAML mapping.

    A missing or zero-length file is (re)created through ``save_db`` with an
    empty mapping; a non-empty file is left untouched. The parent directory
    is created when it does not exist yet.

    Args:
        file: Path of the YAML state file.
    """
    parent = os.path.dirname(file)
    if parent:
        # First run on a fresh machine: the state directory may be missing.
        os.makedirs(parent, exist_ok=True)

    try:
        is_empty = os.path.getsize(file) == 0
    except FileNotFoundError:
        is_empty = True

    if is_empty:
        save_db(file, {})
|
||
# Remove nodes that are no longer broken from DB
def clean_db(db, broken_nodes):
    """Drop from *db* every node that is not listed in *broken_nodes*.

    Args:
        db: State mapping keyed by node name; modified in place.
        broken_nodes: Iterable of rows whose first element is the node name.
    """
    # A set gives constant-time membership tests instead of scanning a list
    # once per DB entry.
    still_broken = {row[0] for row in broken_nodes}
    # Iterate over a snapshot of the keys: entries are deleted while looping.
    for name in list(db):
        if name not in still_broken:
            del db[name]
|
||
# Get nodes to soft_reboot
def get_nodes_to_soft_reboot(db, broken_nodes, max_reboots=None):
    """Select the broken nodes that have not been handled yet.

    A node qualifies when its name is not a key of *db*.

    Args:
        db: State mapping keyed by node name.
        broken_nodes: Iterable of rows whose first element is the node name.
        max_reboots: Maximum number of nodes to return; ``None`` (the
            default) means ``PHOENIX_MAX_REBOOTS``.

    Returns:
        A list of node names, in the order of *broken_nodes*, holding at
        most *max_reboots* entries.
    """
    if max_reboots is None:
        max_reboots = PHOENIX_MAX_REBOOTS
    nodes = []
    for row in broken_nodes:
        # Check the limit before adding, so a limit of 0 selects nothing.
        if len(nodes) >= max_reboots:
            break
        name = row[0]
        if name not in db:
            nodes.append(name)
    return nodes
|
||
# Get nodes to hard_reboot
def get_nodes_to_hard_reboot(db, broken_nodes, max_reboots=None, soft_timeout=None):
    """Select the broken nodes whose soft reboot did not bring them back.

    A node qualifies when its *db* entry has a ``"soft_reboot"`` timestamp
    that is more than *soft_timeout* seconds old.

    Args:
        db: State mapping keyed by node name.
        broken_nodes: Iterable of rows whose first element is the node name.
        max_reboots: Maximum number of nodes to return; ``None`` (the
            default) means ``PHOENIX_MAX_REBOOTS``.
        soft_timeout: Seconds to wait after a soft reboot; ``None`` (the
            default) means ``PHOENIX_SOFT_TIMEOUT``.

    Returns:
        A list of node names, in the order of *broken_nodes*, holding at
        most *max_reboots* entries.
    """
    if max_reboots is None:
        max_reboots = PHOENIX_MAX_REBOOTS
    if soft_timeout is None:
        soft_timeout = PHOENIX_SOFT_TIMEOUT
    # Read the clock once so every node is judged against the same instant.
    now = time.time()
    nodes = []
    for row in broken_nodes:
        # Check the limit before adding, so a limit of 0 selects nothing.
        if len(nodes) >= max_reboots:
            break
        name = row[0]
        entry = db.get(name)
        # Missing, empty or None entries carry no soft-reboot timestamp.
        if not entry or "soft_reboot" not in entry:
            continue
        if now > entry["soft_reboot"] + soft_timeout:
            nodes.append(name)
    return nodes
|
||
# Soft reboot nodes
def soft_reboot_nodes(db, nodes):
    """Send the soft reboot command to each node and record it in *db*.

    For every node, a line is appended to the phoenix log, the node's *db*
    entry is replaced by ``{'soft_reboot': <current epoch time>}`` and
    ``PHOENIX_SOFT_REBOOTCMD`` (with ``{NODENAME}`` substituted) is run
    through ``send_cmd``.

    Args:
        db: State mapping keyed by node name; modified in place.
        nodes: Iterable of node names to soft reboot.
    """
    with open(f"{PHOENIX_LOGDIR}/oar_phoenix.log", "a") as logfile:
        for node in nodes:
            # Timestamp per node: each command may block for a while, so a
            # single timestamp taken before the loop would become stale.
            current_time = time.strftime("%Y-%m-%d %H:%M:%S")
            logfile.write(f"{current_time} - Soft rebooting the broken node {node}\n")
            # send_cmd() appends to the same file through its own handle;
            # flush now so this line is written before the command's output.
            logfile.flush()
            cmd = PHOENIX_SOFT_REBOOTCMD.replace("{NODENAME}", node)
            db[node] = {'soft_reboot': time.time()}
            send_cmd(cmd)
|
||
# Hard reboot nodes
def hard_reboot_nodes(db, nodes):
    """Send the hard reboot command to each node and record it in *db*.

    For every node, a line is appended to the phoenix log, the node's *db*
    entry is replaced by ``{'hard_reboot': <current epoch time>}`` (which
    discards any previous ``soft_reboot`` timestamp) and
    ``PHOENIX_HARD_REBOOTCMD`` (with ``{NODENAME}`` substituted) is run
    through ``send_cmd``.

    Args:
        db: State mapping keyed by node name; modified in place.
        nodes: Iterable of node names to hard reboot.
    """
    with open(f"{PHOENIX_LOGDIR}/oar_phoenix.log", "a") as logfile:
        for node in nodes:
            # Timestamp per node: each command may block for a while, so a
            # single timestamp taken before the loop would become stale.
            current_time = time.strftime("%Y-%m-%d %H:%M:%S")
            logfile.write(f"{current_time} - Hard rebooting the broken node {node}\n")
            # send_cmd() appends to the same file through its own handle;
            # flush now so this line is written before the command's output.
            logfile.flush()
            cmd = PHOENIX_HARD_REBOOTCMD.replace("{NODENAME}", node)
            # Plain assignment replaces the whole entry; no prior delete is
            # needed (and it cannot fail when the node has no entry).
            db[node] = {'hard_reboot': time.time()}
            send_cmd(cmd)
|
||
# Main: load the state, find the broken nodes, reboot them, save the state.
init_db(PHOENIX_DBFILE)
# A YAML document that is empty (or only "null") loads as None; the rest of
# the script needs a mapping.
db = load_db(PHOENIX_DBFILE) or {}
# "config" must stay a module-level name: wait_db() reads it.
config = init_config()
session = wait_db()
broken_nodes = get_nodes_with_given_sql(session, PHOENIX_BROKEN_NODES)
clean_db(db, broken_nodes)
# Select both batches from the same snapshot before running any command.
# The soft-reboot pass can take longer than PHOENIX_SOFT_TIMEOUT, and nodes
# soft rebooted in this run must not be hard rebooted in the same run on the
# basis of a node list queried before they were rebooted.
soft_nodes = get_nodes_to_soft_reboot(db, broken_nodes)
hard_nodes = get_nodes_to_hard_reboot(db, broken_nodes)
try:
    soft_reboot_nodes(db, soft_nodes)
    hard_reboot_nodes(db, hard_nodes)
finally:
    # Persist what was done even when a reboot step fails, so the next run
    # does not repeat reboots that were already sent.
    save_db(PHOENIX_DBFILE, db)
|