forked from CESNET/pbs.hooks
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhook_fairshare.py
92 lines (80 loc) · 3.57 KB
/
hook_fairshare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import pbs
import os
import string
import re
# Power of 1024 selected by each size prefix letter ("" means plain bytes).
_SIZE_EXPONENTS = {
    "": 0,
    "k": 1,
    "m": 2,
    "g": 3,
    "t": 4,
    "p": 5,
    "e": 6,
    "z": 7,
    "y": 8,
}

# <digits>[prefix][unit], e.g. "512", "100b", "4gb", "10kw".
_SIZE_PATTERN = re.compile(r"([0-9]+)([kmgtpezy]?)[bw]?\Z")


def size_to_int(mem):
    """
    Converts the value of type pbs.size to integer amount of bytes.

    Parameter @mem is expected to be of type pbs.size or None; anything
    whose str() looks like "<digits>[k|m|g|t|p|e|z|y][b|w]" (case
    insensitive, surrounding whitespace ignored) is accepted, including a
    bare number without any suffix.

    Returns 0 for unset (falsy) values, otherwise the number multiplied by
    1024 ** n, where n is chosen by the prefix letter (k=1 ... y=8).

    Raises ValueError when the text cannot be parsed.

    NOTE(review): a trailing "w" unit is treated exactly like "b", which is
    what the previous slicing-based implementation did as well; confirm
    whether word-sized values need additional scaling.
    """
    # Unset values should be treated as zero
    if not mem:
        return 0

    text = str(mem).strip().lower()
    match = _SIZE_PATTERN.match(text)
    if match is None:
        raise ValueError("cannot parse size value: %r" % (mem,))

    digits, prefix = match.groups()
    return int(digits) * 1024 ** _SIZE_EXPONENTS[prefix]
# Hook body.
#  - periodic event: sum the resources available on all vnodes and publish
#    the totals as server resources_default.infrastructure_* values.
#  - execjob_begin event: on the mother superior MoM, copy the requested
#    memory of the job into resources_used["fairshare_mem"].
try:
    e = pbs.event()
    if e.type == pbs.PERIODIC:
        pbs.logmsg(pbs.LOG_DEBUG, "fairshare periodic started")

        # (resource name, True when the resource is a pbs.size value).
        # The order is the order in which the qmgr commands are issued.
        resources = (
            ("ncpus", False),
            ("mem", True),
            ("ngpus", False),
            ("scratch_local", True),
            ("scratch_shared", True),
            ("scratch_ssd", True),
        )
        totals = dict((name, 0) for name, _ in resources)

        # We divide memory values by 1024 because the implicit conversion of
        # pbs.size during division in the scheduling formula converts the
        # value to kilobytes.  Floor division keeps the totals integral on
        # Python 3 as well; true division would produce floats and values
        # such as "123.0kb" in the qmgr commands below.
        for vnode in pbs.server().vnodes():
            available = vnode.resources_available
            for name, is_size in resources:
                raw = available[name]
                if is_size:
                    # size_to_int() already maps unset values to zero
                    totals[name] += size_to_int(raw) // 1024
                elif raw:
                    totals[name] += int(raw)

        # The resources cannot be set directly so we have
        # to call os.system and set them via qmgr
        os.environ["PBSPRO_IGNORE_KERBEROS"] = ""

        # Extend PATH once only: tolerate a missing PATH and do not append
        # the same directory again if the environment survives between runs.
        pbs_bin = "/opt/pbs/bin/"
        search_path = os.environ.get("PATH", "")
        if pbs_bin not in search_path.split(":"):
            if search_path:
                os.environ["PATH"] = search_path + ":" + pbs_bin
            else:
                os.environ["PATH"] = pbs_bin

        for name, is_size in resources:
            unit = "kb" if is_size else ""
            command = (
                "qmgr -c \"set server resources_default.infrastructure_{} = {}{}\""
            ).format(name, totals[name], unit)
            # os.system only reports failure through its exit status, so
            # leave a trace in the log instead of ignoring it silently.
            if os.system(command) != 0:
                pbs.logmsg(pbs.LOG_DEBUG, "fairshare: command failed: %s" % command)

    elif e.type == pbs.EXECJOB_BEGIN:
        job = e.job
        if job.in_ms_mom():
            job.resources_used["fairshare_mem"] = job.Resource_List["mem"]
except SystemExit:
    # Let a SystemExit raised inside the hook body pass through quietly.
    pass
except Exception as err:
    e.reject("fairshare hook failed: (%s)" % str(err))