forked from CESNET/pbs.hooks
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhook_overcommit_detector.py
87 lines (67 loc) · 2.5 KB
/
hook_overcommit_detector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import pbs
import re
import smtplib
import socket
import os
import time
def sendmail(subject, body):
    """Send a best-effort notification e-mail through the SMTP server on localhost.

    The sender is ``overcommit_detector@<fqdn of this host>`` and the
    recipient list is fixed inside the function.

    Args:
        subject: text placed in the ``Subject:`` header.
        body: plain-text message body.

    Returns:
        None.  Delivery problems (SMTP errors, or no server reachable on
        localhost) are deliberately ignored so that a failed notification
        never aborts the caller.
    """
    sender = 'overcommit_detector@' + socket.getfqdn()
    receivers = ['vchlumsky@cesnet.cz']

    # The empty element produces the blank line that separates the header
    # section from the body; without it the body is read as header text.
    message = "\n".join([
        "From: " + sender,
        "To: " + ",".join(receivers),
        "Subject: " + subject,
        "",
        body,
        "",
    ])

    try:
        smtp = smtplib.SMTP('localhost')
        try:
            smtp.sendmail(sender, receivers, message)
        finally:
            # Always release the connection, even if sending failed.
            smtp.quit()
    except (smtplib.SMTPException, socket.error):
        # Notification is best effort: SMTP protocol errors and connection
        # failures are both swallowed on purpose.
        pass
def parse_exec_vnode(exec_vnode):
    """Sum the ncpus requested per node in an exec_vnode specification.

    The value is converted with ``str()`` and split on ``+`` into chunks of
    the form ``(name:res=val:res=val...)``.  The node key is the part of the
    chunk name before the first ``.``; several chunks that map to the same
    key have their ``ncpus`` values added together.

    Args:
        exec_vnode: exec_vnode value; anything whose ``str()`` yields the
            ``(...)+(...)`` form.

    Returns:
        dict mapping node name to a dict that holds ``'ncpus'`` (int) when
        at least one chunk for that node specified it, and is empty
        otherwise.
    """
    resources = {}
    for chunk in str(exec_vnode).split("+"):
        chunk = chunk.replace("(", "").replace(")", "")
        node_name = chunk.split(":")[0].split(".")[0]
        node_resources = resources.setdefault(node_name, {})

        # The quantifier must be greedy: a lazy "+?" with nothing after it
        # stops at the first digit and would read "ncpus=16" as 1.
        match = re.search(r'ncpus=([0-9]+)', chunk)
        if match:
            node_resources['ncpus'] = (
                node_resources.get('ncpus', 0) + int(match.group(1))
            )
    return resources
# Hook body.
#
# On a RUNJOB event, for every node named in the job's exec_vnode, compare
# (ncpus already assigned on the node + ncpus this job requests there) with
# the ncpus available on the node.  If the sum is larger, the incident is
# mailed, logged to a file under /tmp together with `printjob` output for
# each job listed on the node, and the job is rejected.
try:
    e = pbs.event()
    if e.type == pbs.RUNJOB:
        j = e.job

        # Jobs in queues named R<digits>... or M<digits>... are accepted
        # without any check (presumably reservation queues -- confirm).
        # NOTE(review): accept()/reject() are expected to end the hook by
        # raising SystemExit, which is swallowed at the bottom -- confirm
        # against the PBS hook documentation.
        if j.queue and re.match('^[RM]{1}[0-9]+', j.queue.name):
            e.accept()

        resources = parse_exec_vnode(j.exec_vnode)

        for nodename, requested in resources.items():
            node = pbs.server().vnode(nodename)
            available_ncpus = node.resources_available['ncpus']
            assigned_ncpus = node.resources_assigned['ncpus']
            # A chunk that does not state ncpus is counted as one CPU.
            requested_ncpus = requested.get('ncpus', 1)

            if assigned_ncpus + requested_ncpus <= available_ncpus:
                continue

            now = time.strftime("%Y%m%d%H%M%S", time.localtime())
            filename = "/tmp/pbs_overcommit_detector_%s_%s_overcommit_by_%s" % (now, nodename, j.id)
            msg = "overcommit_detector detected attempt to overcommit node %s by job %s" % (nodename, j.id)

            sendmail("pbs overcommit detector", msg)

            # NOTE(review): these shell commands are built by string
            # interpolation; the interpolated values come from the PBS
            # server (node name, job ids), not directly from user input.
            os.system("echo \"%s\" >> %s" % (msg, filename))

            if node.jobs:
                # node.jobs is split on ',' and each entry on '/'; the part
                # before the '/' is the job id, so the same job can appear
                # several times.  Dump each job only once.
                seen_jobs = set()
                for entry in node.jobs.split(','):
                    jobid = entry.strip().split('/')[0]
                    if jobid in seen_jobs:
                        continue
                    seen_jobs.add(jobid)
                    os.system("printjob %s >> %s" % (jobid, filename))

            e.reject(msg)
except SystemExit:
    # Raised on purpose by e.accept() / e.reject(); not an error.
    pass
except Exception as err:
    # Any unexpected failure inside the hook rejects the job with the reason.
    e.reject("overcommit_detector failed: %s" % str(err))