This repository has been archived by the owner on Dec 3, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmemwatcher
194 lines (157 loc) · 6.79 KB
/
memwatcher
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
#!/usr/bin/env python
import gzip
import os
import subprocess
import sys
import signal
import time
from optparse import OptionParser
from datetime import datetime
# Name of the file (created inside the work directory by lauchAsDaemon) that holds the daemon's PID;
# stopProcesses reads it remotely to know which process to kill.
pidfile = 'reswatch.pid'
# Name of the gzip-compressed file that runPSProcess appends the periodic ps snapshots to.
ps_file = 'ps.out.gz'
# File objects registered by runPSProcess so that cleanupLogFiles can close them on SIGINT/SIGTERM.
open_files = []
# Daemon code as per Stevens' "Advanced Programming in the UNIX Environment"
def createDaemon():
    """Detach the calling process from its terminal by forking twice around setsid().

    The original process and the intermediate session leader both leave via
    sys.exit(0); only the grandchild returns from this function, with its
    umask reset to 0.

    Raises:
        Exception: if either os.fork() call fails. The message has the form
            "<strerror> [<errno>]".
    """
    try:
        pid = os.fork()
    except OSError as e:
        raise Exception("%s [%d]" % (e.strerror, e.errno))

    if pid > 0:  # we are the controlling parent
        sys.exit(0)

    # We are the child.
    # Call setsid() to create a new session & process group without a controlling terminal.
    # This process becomes the session and group leader.
    os.setsid()

    # Second fork. The rationale is that this process, being a session leader,
    # *might* be able to acquire a terminal. To be paranoid we fork() a second
    # child, exit the first, and run everything in the second child. As that
    # child is not a session leader, the python process has no chance to
    # acquire a controlling terminal.
    try:
        pid = os.fork()
    except OSError as e:
        raise Exception("%s [%d]" % (e.strerror, e.errno))

    if pid > 0:
        # We are the session leader - exit so that the child can run with no
        # chance of grabbing the terminal. This is rather paranoid.
        sys.exit(0)

    # We are the child in the new session but not a session leader; work
    # continues in this process now. Reset our file mask:
    os.umask(0)
def lauchAsDaemon(workdir):
    """Daemonize, record the daemon PID in workdir, and start sampling ps output.

    Args:
        workdir: existing, writable directory. The process changes into it, so
            the pid file and the ps output file are created there.

    Raises:
        Exception: if workdir is not a directory or is not writable. This is
            checked before forking so the caller still sees the error.
    """
    # Test to be sure that workdir exists and we can write to it.
    if not (os.path.isdir(workdir) and os.access(workdir, os.W_OK)):
        # Raise an Exception *instance*: raising the tuple (Exception, "msg")
        # drops the message on Python 2 and is a TypeError on Python 3.
        raise Exception("Unable to write to work directory: %s" % (workdir,))

    # OK - after minimal testing, let's fork off and run as a daemon.
    createDaemon()

    # Now that we're running as a daemon, dump our pid.
    pid = str(os.getpid())
    os.chdir(workdir)
    with open(pidfile, 'w') as f:
        f.write(pid)

    # Register for SIGINT/SIGTERM so that we can clean up properly;
    # atexit apparently doesn't get called here.
    for sig in (signal.SIGINT, signal.SIGTERM):
        signal.signal(sig, cleanupLogFiles)

    runPSProcess()
def cleanupLogFiles(signum, frame):
    """Signal handler: close every file registered in open_files, then exit with status 0.

    Args:
        signum: number of the delivered signal (not used).
        frame: stack frame that was interrupted (not used).
    """
    for handle in open_files:
        handle.close()
    # Same effect as sys.exit(0): the interpreter unwinds via SystemExit.
    raise SystemExit(0)
def runPSProcess(sleepInt=60):
    """Append a timestamped `ps` snapshot to the gzip file ps_file, forever.

    Each iteration writes a ">>>yy:mm:dd:HH:MM:SS<<<" marker followed by the
    raw ps output, flushes, then sleeps. The loop only ends when an exception
    (including the SystemExit raised by the signal handler) unwinds it; the
    output file is closed on the way out.

    Args:
        sleepInt: seconds to sleep between two snapshots.
    """
    ps_cmd = 'ps -ewwopid,ppid,rss,vsz,pmem,pcpu,time,etime,start_time,wchan,stat,psr,args '

    # Open outside the try block: if gzip.open() itself fails there is nothing
    # to close, and the real error must not be masked by an unbound-name
    # error raised from the finally clause.
    outfile = gzip.open(ps_file, 'wb')
    open_files.append(outfile)
    try:
        while True:
            stamp = datetime.now().strftime("\n\n>>>%y:%m:%d:%H:%M:%S<<<\n")
            # The gzip stream is binary; the marker is pure ASCII, so this
            # encode is valid on both Python 2 and Python 3.
            outfile.write(stamp.encode('ascii'))

            cmd = subprocess.Popen(ps_cmd, shell=True, stdout=subprocess.PIPE)
            try:
                outfile.writelines(cmd.stdout)
            finally:
                # Release the pipe and reap the child so neither descriptors
                # nor zombie processes pile up over a long run.
                cmd.stdout.close()
                cmd.wait()

            outfile.flush()
            time.sleep(sleepInt)
    finally:
        outfile.close()
def parseArgs():
    """Parse the command line (sys.argv) for the watcher script.

    Returns:
        The (options, args) pair produced by OptionParser.parse_args().
        options carries hostfile (None unless given), work_dir, results_dir,
        stop and daemon.
    """
    parser = OptionParser(version='0.1a')
    parser.add_option('-f', '--host_file', action='store', type='string', dest='hostfile',
                      help='file listing one host:work_dir entry per line')
    parser.add_option('-d', '--work_dir', action='store', type='string', dest='work_dir',
                      help='directory the daemon writes its pid and ps output into '
                           '(default: %default)')
    parser.add_option('-r', '--results_dir', action='store', type='string', dest='results_dir',
                      help='results directory; accepted but not used by this script '
                           '(default: %default)')
    parser.add_option('--stop', action='store_true', dest='stop',
                      help='stop the watchers on every host and fetch their output')
    parser.add_option('--daemon', action='store_true', dest='daemon',
                      help='run on this machine as the sampling daemon')
    parser.set_defaults(work_dir='/data1',
                        results_dir='/data',
                        stop=False,
                        daemon=False)
    return parser.parse_args()
def parseHostfile(hostfile):
    """Read the host file and return its entries as a list of tuples.

    Each non-blank line is stripped of surrounding whitespace and split on
    ':', so a line "host:work_dir" becomes ("host", "work_dir").

    Args:
        hostfile: path of the file to read.

    Returns:
        A list of tuples, in file order. Blank lines are skipped, because an
        empty entry cannot be unpacked into (host, workdir) by the callers.
    """
    mappings = []
    with open(hostfile, 'r') as f:
        # Build the list while the file is still open; a lazy map() would be
        # consumed only after the file has been closed on Python 3.
        for line in f:
            entry = line.strip()
            if entry:
                mappings.append(tuple(entry.split(':')))
    return mappings
def launchProcess(host, workdir):
    """Install the watcher script on a remote host and start it there as a daemon.

    Steps, all run through ssh/scp: check the remote Python version, create
    <workdir>/gpsupport_reswatch, copy the local file 'mem_watcher' into it,
    and launch it with --daemon. Any failing step prints a message to stderr
    and terminates this process with exit status 1.

    Args:
        host: host name handed to ssh/scp.
        workdir: remote directory under which 'gpsupport_reswatch' is created.

    NOTE(review): host and workdir are interpolated into shell command lines
    without quoting, so the host file must come from a trusted source.
    """
    dest_dir = os.path.join(workdir, 'gpsupport_reswatch')

    # Let's try to guess at the python environment. If we have GPHOME set we'll
    # use it, otherwise we default to whatever is current in the remote shell.
    py_string = os.getenv('GPHOME', '')
    if py_string:
        py_string = 'source ' + os.path.join(py_string, 'greenplum_path.sh') + '; '

    # Quick check that the python version on the host is >= 2.6.
    try:
        subprocess.check_call("ssh -T %s '%s python -c \"import sys; sys.exit(1) if sys.hexversion < 0x020600f0 else 0\"'" % (host, py_string), shell=True)
    except subprocess.CalledProcessError as e:
        if e.returncode == 1:
            # Exit status 1 is what the remote version probe itself returns.
            sys.stderr.write('Python version on host %s is < 2.6.0. Aborting\n' % (host))
        else:
            # Any other status (ssh failure, python missing, ...) is not a
            # version problem, so do not report it as one.
            sys.stderr.write('Unable to check the Python version on host %s (exit status %d). Aborting\n'
                             % (host, e.returncode))
        sys.exit(1)

    try:
        subprocess.check_call("ssh -T %s 'mkdir %s'" % (host, dest_dir), shell=True)
    except subprocess.CalledProcessError as e:
        err = "Error when trying to create directory: " + dest_dir + " on host: " + host
        sys.stderr.write(err + '\n')
        sys.stderr.write('%s\n' % (e,))
        sys.exit(1)

    try:
        # NOTE(review): the script is looked up as 'mem_watcher' in the current
        # directory - confirm this matches the name it is installed under.
        subprocess.check_call('scp -q mem_watcher %s:%s' % (host, dest_dir), shell=True)
    except subprocess.CalledProcessError:
        err = 'Error when trying to copy script to %s:%s' % (host, dest_dir)
        sys.stderr.write(err + '\n')
        sys.exit(1)

    try:
        # SSH is doing something with its terminal handling here that is not
        # fully understood: if we don't force the creation of a pseudo TTY the
        # ssh hangs when the node process exits (forked into background).
        subprocess.check_call("ssh -qtt %s '%s %s/mem_watcher --daemon -d %s'" % (host, py_string, dest_dir, dest_dir), shell=True)
    except subprocess.CalledProcessError:
        err = 'Error when trying to launch resource watcher on host %s, aborting' % (host)
        sys.stderr.write(err + '\n')
        sys.exit(1)
def stopProcesses(host, workdir):
    """Stop the watcher on a remote host, fetch its output, and remove its directory.

    Steps, in order: kill the PID stored in the remote pid file, copy the
    remote ps output file to ./<host>.<ps_file>, then delete
    <workdir>/gpsupport_reswatch on the host. If a step fails, the error is
    reported on stderr and the remaining steps are skipped (the function
    returns instead of exiting, so other hosts are still processed).

    Args:
        host: host name handed to ssh/scp.
        workdir: remote directory that contains 'gpsupport_reswatch'.
    """
    dest_dir = os.path.join(workdir, 'gpsupport_reswatch')

    try:
        subprocess.check_call("ssh -T %s 'kill $(cat %s/%s)'" % (host, dest_dir, pidfile), shell=True)
    except subprocess.CalledProcessError as e:
        sys.stderr.write('Error stopping process on host: ' + host + '\n')
        sys.stderr.write('%s\n' % (e,))
        return

    try:
        subprocess.check_call('scp -q %s:%s/%s ./%s.%s' % (host, dest_dir, ps_file, host, ps_file), shell=True)
    except subprocess.CalledProcessError as e:
        sys.stderr.write('Error retrieving data from host: ' + host + '\n')
        sys.stderr.write('%s\n' % (e,))
        # Keep the remote directory: it still holds the only copy of the data.
        return

    try:
        subprocess.check_call("ssh -T %s 'rm -rf %s'" % (host, dest_dir), shell=True)
    except subprocess.CalledProcessError as e:
        sys.stderr.write('Error removing work directory on host: ' + host + '\n')
        sys.stderr.write('%s\n' % (e,))
        return
def main():
    """Entry point: run as the local daemon, or start/stop watchers on every listed host.

    With --daemon the process daemonizes and samples ps output using the
    work directory option. Otherwise the host file is read and each
    (host, work_dir) entry is passed to stopProcesses (with --stop) or to
    launchProcess.
    """
    (options, args) = parseArgs()

    if options.daemon:
        lauchAsDaemon(options.work_dir)
        # Daemon mode never deals with the host file.
        return

    if not options.hostfile:
        # Without this check a missing -f surfaces as an obscure error from open(None).
        sys.stderr.write('No host file given: use -f/--host_file, or --daemon to run locally\n')
        sys.exit(2)

    hostmap = parseHostfile(options.hostfile)
    if options.stop:
        action = stopProcesses
    else:
        action = launchProcess
    for mapping in hostmap:
        action(*mapping)


if __name__ == '__main__':
    main()