Skip to content

Commit

Permalink
[Phoenix] Python version!
Browse files Browse the repository at this point in the history
  • Loading branch information
bzizou committed Dec 12, 2023
1 parent 59e209e commit f7a637a
Showing 1 changed file with 158 additions and 0 deletions.
158 changes: 158 additions & 0 deletions oar/tools/oar_phoenix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
#!/usr/bin/env python
import os
import time
import yaml
import subprocess
import oar.lib.tools as tools
from oar.lib.configuration import Configuration
from oar.lib.database import wait_db_ready
from oar.lib.event import add_new_event_with_host
from oar.lib.globals import get_logger, init_and_get_session, init_config, init_oar
from oar.lib.node import get_nodes_with_given_sql, get_alive_nodes_with_jobs


################### CUSTOMIZABLE PART #######################

# File where phoenix stores the states of the broken nodes
# (YAML mapping: node name -> {'soft_reboot': <epoch seconds>} or
# {'hard_reboot': <epoch seconds>}).
PHOENIX_DBFILE = "/var/lib/oar/phoenix/oar_phoenix.db"

# Directory where logfiles are created in case of problems.
# The log path is built as f"{PHOENIX_LOGDIR}/oar_phoenix.log", so the
# trailing slash here yields a doubled (harmless) "//" in the path.
PHOENIX_LOGDIR = "/var/lib/oar/phoenix/"

# Command sent to reboot a node (first attempt).
# "{NODENAME}" is substituted with the node name before the command is run.
PHOENIX_SOFT_REBOOTCMD = "ssh -p 6667 {NODENAME} oardodo /sbin/reboot"

# Timeout (s) for a soft rebooted node to be considered hard rebootable
# (compared against the time.time() value stored at soft reboot).
PHOENIX_SOFT_TIMEOUT = 60

# Command sent to reboot a node (second attempt).
# "{NODENAME}" is substituted with the node name before the command is run.
PHOENIX_HARD_REBOOTCMD = "oardodo ipmitool -I lanplus -U admin -f /etc/ipmipasswd -H {NODENAME}-bmc power off || true;sleep 30;oardodo ipmitool -I lanplus -U admin -f /etc/ipmipasswd -H {NODENAME}-bmc power on;sleep 5;oardodo ipmitool -I lanplus -U admin -f /etc/ipmipasswd -H {NODENAME}-bmc power on"

# Timeout (s) for a hard rebooted node to be considered really broken, then an email is sent
# NOTE(review): this constant is not referenced anywhere in the code shown
# in this file, and no email is sent by it -- confirm whether that step is
# still to be implemented.
PHOENIX_HARD_TIMEOUT = 60

# Max number of simultaneous reboots (soft OR hard)
# The limit is applied separately by the soft and the hard node selection.
PHOENIX_MAX_REBOOTS = 20

# Timeout (s) for unix commands
# NOTE(review): send_cmd kills the shell once this timeout expires, and
# PHOENIX_HARD_REBOOTCMD contains "sleep 30"; with a 15 s limit the
# "power on" steps presumably never get to run -- confirm and adjust.
PHOENIX_CMD_TIMEOUT = 15

# Properties of the broken nodes (SQL where clause)
PHOENIX_BROKEN_NODES = "state='Suspected' and network_address NOT IN (SELECT distinct(network_address) FROM resources where resource_id IN (SELECT resource_id FROM assigned_resources WHERE assigned_resource_index = 'CURRENT')) and network_address not like 'luke%' and network_address != 'dahu33' and network_address not like 'bigfoot%'"

################################################################################

# Function to get a DB session on OAR DB
def wait_db():
    """Open a session on the OAR database and wait until it is usable.

    Reads the module-level ``config`` object, which must be assigned before
    this function is called.

    Returns:
        The session object returned by ``init_and_get_session``.

    Raises:
        SystemExit: with status 1 when creating the session or the readiness
            check raises; the error is printed first.
    """
    try:
        session = init_and_get_session(config)
        wait_db_ready(get_alive_nodes_with_jobs, args=[session])
    except Exception as e:
        print(f"Failed to contact database: {e}")
        # ``exit`` is a helper injected by the ``site`` module and is not
        # guaranteed to exist (e.g. ``python -S``); raise SystemExit directly.
        raise SystemExit(1) from e
    return session

# Function to send a unix command with timeout and log date in the logfile
def send_cmd(cmd):
    """Run a shell command with a timeout and log the outcome.

    The command is run through the shell and given PHOENIX_CMD_TIMEOUT
    seconds to finish. One timestamped line is appended to
    ``oar_phoenix.log`` in PHOENIX_LOGDIR: the command's stdout (or its
    stderr when stdout is empty), a "no output" marker, or a timeout marker.

    Args:
        cmd: Shell command line to execute.

    Returns:
        None when the command completed, "Timed out!" when it exceeded the
        timeout, or "Exception occurred: <error>" when anything else failed
        (including opening the log file).
    """
    try:
        current_time = time.strftime("%Y-%m-%d %H:%M:%S")
        with open(f"{PHOENIX_LOGDIR}/oar_phoenix.log", "a") as logfile:
            # The Popen context manager closes the pipes and reaps the child
            # on exit, so a killed command leaves neither a zombie nor open
            # file descriptors behind.
            with subprocess.Popen(
                cmd,
                shell=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            ) as process:
                try:
                    stdout, stderr = process.communicate(
                        timeout=PHOENIX_CMD_TIMEOUT
                    )
                except subprocess.TimeoutExpired:
                    # Only the shell itself is killed here.
                    process.kill()
                    # Callers ignore the return value, so record the timeout
                    # in the log as well.
                    logfile.write(f"{current_time} - Timed out!\n")
                    return "Timed out!"
            # Command output is arbitrary bytes; never let a decoding error
            # hide the result.
            raw = stdout if stdout else stderr
            res = raw.decode("utf-8", errors="replace")
            if res:
                logfile.write(f"{current_time} - {res}\n")
            else:
                logfile.write(f"{current_time} - Command executed, no output\n")
    except Exception as e:
        return f"Exception occurred: {str(e)}"

# Load the DB file
def load_db(file):
    """Load the phoenix state from a YAML file.

    Args:
        file: Path of the YAML state file.

    Returns:
        The parsed document, or an empty dict when the file holds an empty
        or null document.
    """
    with open(file, "r") as yamlfile:
        # safe_load yields None for an empty/null document; callers iterate
        # and index the result, so always hand back a mapping.
        return yaml.safe_load(yamlfile) or {}

# Export DB to file
def save_db(file, ref):
    """Write *ref* to *file* as YAML, replacing any previous content.

    Args:
        file: Path of the YAML state file.
        ref: Object to serialize.
    """
    with open(file, "w") as out:
        yaml.dump(ref, stream=out)

# Initialize DB file
def init_db(file):
    """Make sure the state file exists and holds at least an empty mapping.

    A missing or zero-length file is (re)written through ``save_db`` with an
    empty dict; a file that already has content is left untouched.

    Args:
        file: Path of the YAML state file.
    """
    # Create the parent directory on first run instead of failing on open().
    parent = os.path.dirname(file)
    if parent:
        os.makedirs(parent, exist_ok=True)

    if not os.path.exists(file) or os.path.getsize(file) == 0:
        save_db(file, {})

# Remove nodes that are no longer broken from DB
def clean_db(db, broken_nodes):
    """Drop from *db* every node that is no longer reported as broken.

    Args:
        db: State mapping keyed by node name; modified in place.
        broken_nodes: Iterable of entries whose first element is the node
            name.
    """
    # Build the lookup once as a set: O(1) membership per DB entry.
    still_broken = {node[0] for node in broken_nodes}
    # Iterate over a snapshot of the keys because entries are deleted.
    for name in list(db):
        if name not in still_broken:
            del db[name]

# Get nodes to soft_reboot
def get_nodes_to_soft_reboot(db, broken_nodes, *, max_reboots=None):
    """Select the broken nodes that have not been handled yet.

    A node qualifies when its name has no entry in *db*.

    Args:
        db: State mapping keyed by node name.
        broken_nodes: Iterable of entries whose first element is the node
            name.
        max_reboots: Maximum number of nodes to return; defaults to
            PHOENIX_MAX_REBOOTS.

    Returns:
        List of node names, in input order, at most *max_reboots* long.
    """
    limit = PHOENIX_MAX_REBOOTS if max_reboots is None else max_reboots
    nodes = []
    # Test the limit before selecting anything so a limit of 0 really means
    # "reboot nothing".
    if limit <= 0:
        return nodes
    for node in broken_nodes:
        name = node[0]
        if name not in db:
            nodes.append(name)
            if len(nodes) >= limit:
                break
    return nodes

# Get nodes to hard_reboot
def get_nodes_to_hard_reboot(db, broken_nodes, *, max_reboots=None, soft_timeout=None):
    """Select the broken nodes whose soft reboot did not bring them back.

    A node qualifies when *db* records a ``soft_reboot`` timestamp for it
    that is more than *soft_timeout* seconds old.

    Args:
        db: State mapping keyed by node name.
        broken_nodes: Iterable of entries whose first element is the node
            name.
        max_reboots: Maximum number of nodes to return; defaults to
            PHOENIX_MAX_REBOOTS.
        soft_timeout: Seconds to wait after a soft reboot; defaults to
            PHOENIX_SOFT_TIMEOUT.

    Returns:
        List of node names, in input order, at most *max_reboots* long.
    """
    limit = PHOENIX_MAX_REBOOTS if max_reboots is None else max_reboots
    timeout = PHOENIX_SOFT_TIMEOUT if soft_timeout is None else soft_timeout
    nodes = []
    # Test the limit before selecting anything so a limit of 0 really means
    # "reboot nothing".
    if limit <= 0:
        return nodes
    # One reference time for the whole pass.
    now = time.time()
    for node in broken_nodes:
        name = node[0]
        entry = db.get(name)
        # Skip unknown nodes, empty entries and nodes already hard rebooted.
        if not entry or "soft_reboot" not in entry:
            continue
        if now > entry["soft_reboot"] + timeout:
            nodes.append(name)
            if len(nodes) >= limit:
                break
    return nodes

# Soft reboot nodes
def soft_reboot_nodes(db, nodes):
    """Send the soft reboot command to each node and record it in *db*.

    For every node a line is appended to ``oar_phoenix.log`` in
    PHOENIX_LOGDIR, ``db[node]`` is set to ``{'soft_reboot': <now>}`` and
    PHOENIX_SOFT_REBOOTCMD is run through ``send_cmd`` with "{NODENAME}"
    replaced by the node name.

    Args:
        db: State mapping keyed by node name; modified in place.
        nodes: Iterable of node names to reboot.
    """
    with open(f"{PHOENIX_LOGDIR}/oar_phoenix.log", "a") as logfile:
        for node in nodes:
            # Stamp each node separately: every command may block up to the
            # command timeout, so one shared timestamp would go stale.
            current_time = time.strftime("%Y-%m-%d %H:%M:%S")
            logfile.write(f"{current_time} - Soft rebooting the broken node {node}\n")
            # send_cmd appends to the same file through its own handle;
            # flush so this line lands before the command's output.
            logfile.flush()
            cmd = PHOENIX_SOFT_REBOOTCMD.replace("{NODENAME}", node)
            db[node] = {'soft_reboot': time.time()}
            send_cmd(cmd)

# Hard reboot nodes
def hard_reboot_nodes(db, nodes):
    """Send the hard reboot command to each node and record it in *db*.

    For every node a line is appended to ``oar_phoenix.log`` in
    PHOENIX_LOGDIR, ``db[node]`` is replaced by ``{'hard_reboot': <now>}``
    and PHOENIX_HARD_REBOOTCMD is run through ``send_cmd`` with "{NODENAME}"
    replaced by the node name.

    Args:
        db: State mapping keyed by node name; modified in place.
        nodes: Iterable of node names to reboot.
    """
    with open(f"{PHOENIX_LOGDIR}/oar_phoenix.log", "a") as logfile:
        for node in nodes:
            # Stamp each node separately: every command may block up to the
            # command timeout, so one shared timestamp would go stale.
            current_time = time.strftime("%Y-%m-%d %H:%M:%S")
            logfile.write(f"{current_time} - Hard rebooting the broken node {node}\n")
            # send_cmd appends to the same file through its own handle;
            # flush so this line lands before the command's output.
            logfile.flush()
            cmd = PHOENIX_HARD_REBOOTCMD.replace("{NODENAME}", node)
            # Plain assignment replaces the previous entry, so no prior
            # ``del`` is needed (and a missing entry cannot raise KeyError).
            db[node] = {'hard_reboot': time.time()}
            send_cmd(cmd)

# Script body: load the saved state, query the broken nodes, reboot them
# and persist the updated state.
init_db(PHOENIX_DBFILE)
db = load_db(PHOENIX_DBFILE)
# ``config`` must stay a module-level name: wait_db() reads it as a global.
config = init_config()
session = wait_db()
# Nodes selected by the PHOENIX_BROKEN_NODES SQL where clause.
broken_nodes = get_nodes_with_given_sql(session, PHOENIX_BROKEN_NODES)
# Forget the nodes that are not broken any more.
clean_db(db, broken_nodes)
try:
    # First attempt for nodes not handled yet.
    nodes = get_nodes_to_soft_reboot(db, broken_nodes)
    soft_reboot_nodes(db, nodes)
    # Second attempt for nodes whose soft reboot timed out.
    nodes = get_nodes_to_hard_reboot(db, broken_nodes)
    hard_reboot_nodes(db, nodes)
finally:
    # Always persist what was recorded: if a later step raises, the reboots
    # already issued must not be forgotten and repeated on the next run.
    save_db(PHOENIX_DBFILE, db)

0 comments on commit f7a637a

Please sign in to comment.