forked from appfirst/nagios-plugins
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck_smart.py
executable file
·394 lines (354 loc) · 17 KB
/
check_smart.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
#!/usr/bin/env python
'''
Created on Aug 27, 2012

@author: Yangming

Updated May 1, 2013
    clark@appfirst.com
    Added support for LSI MegaRAID.

Requires:
    smartctl (from smartmontools) >= 5.42
    For an Adaptec RAID controller: /usr/StorMan/arcconf >= 6.50
    For an LSI MegaRAID controller: /usr/sbin/MegaCli >= 8.07.07

Examples:
    python check_smart.py -p /usr/sbin/ -t OVERALL_HEALTH
    python check_smart.py -p /usr/sbin/ -r megaraid -t OVERALL_HEALTH
    python check_smart.py -p /usr/sbin/ -r megaraid -t SPIN_RETRY_COUNT
'''
import sys
# The stdlib ``commands`` module (Python 2) is only available on Unix, so on
# Windows ``ucommands`` is bound to the same name instead.  ucommands uses the
# python subprocess module, which is only supported in python >= 2.7, but
# Windows needs subprocess.
if sys.platform == "win32":
    import ucommands as commands
else:
    import commands
import nagios
import re
import time
from xml.dom.minidom import parseString
from nagios import CommandBasedPlugin as plugin
import argparse
class SmartAttribute(object):
    """Plain holder for one S.M.A.R.T. attribute reading of one disk.

    Every field starts out as ``None`` and is filled in by whichever parser
    produced the reading; a field the source does not provide stays ``None``.
    """

    def __init__(self):
        self.value = None       # normalized current value
        self.threshold = None   # failure threshold
        self.worst = None       # worst normalized value seen
        self.raw_value = None   # raw value

    def __repr__(self):
        return "%s(value=%r, threshold=%r, worst=%r, raw_value=%r)" % (
            type(self).__name__,
            self.value,
            self.threshold,
            self.worst,
            self.raw_value,
        )
class SmartChecker(nagios.BatchStatusPlugin):
    """Nagios plugin that reports S.M.A.R.T. health and attributes of disks.

    Data is collected by shelling out to ``smartctl`` and, depending on the
    ``--raid`` option, to ``arcconf`` (Adaptec) or ``MegaCli`` (MegaRAID).
    Attribute readings are kept in ``self.stats``, a dict mapping the
    attribute id (as a string) to ``{device: SmartAttribute}``, plus a
    ``"fetchtime"`` entry holding the epoch second of the last collection.

    ``self.parser``, ``superimpose``, ``retrieve_last_status`` and
    ``save_status`` are used here but defined outside this file (presumably
    on the ``nagios`` base class).
    """

    def __init__(self, *args, **kwargs):
        """Register the command-line options specific to this plugin."""
        super(SmartChecker, self).__init__(*args, **kwargs)
        # Hack to determine uniqueness of script defs: pre-parse only
        # -D/--disk so that its value can become the default of --unique.
        # add_help=False keeps this throw-away parser from handling
        # -h/--help itself (it would print its own one-option help and exit
        # before the real parser ever saw the arguments).
        check = argparse.ArgumentParser(add_help=False)
        check.add_argument("-D", "--disk", required=False, type=str)
        chk, _unknown = check.parse_known_args()

        if sys.platform == "win32":
            self.parser.set_defaults(rootdir="c:\\temp\\")
        self.parser.add_argument("-f", "--filename", required=False, type=str, default='pd@smartctl')
        self.parser.add_argument("-z", "--appname", required=False, type=str, default='smart')
        self.parser.add_argument("-D", "--disk", required=False, type=str)
        self.parser.add_argument("-r", "--raid", required=False, type=str, choices=["adaptec", "megaraid"], help="raid controller type")
        self.parser.add_argument("-p", "--path", required=False, type=str, default="", help="path to smartctl")
        # the interval (by sec) indicates how often this program will fetch smart info
        # if queried more frequently, it returns merely the last fetched info
        self.parser.add_argument("-i", "--interval", required=False, type=int, default=300)
        self.parser.add_argument("--unique", required=False, type=str, default=str(chk.disk))

    # ------------------------------------------------------------------
    # disk discovery
    # ------------------------------------------------------------------
    def _list_fdisk_devices(self):
        """Return the device paths named on the ``Disk <path>:`` lines of ``fdisk -l``."""
        cmd = nagios.rootify("/sbin/fdisk -l")
        output = commands.getoutput(cmd)
        return re.findall(r"(?<=Disk )((?:/[\w-]+)+)(?=:)", output)

    def _get_megaraid_disks(self, request):
        """Return one smartctl device spec per MegaRAID physical drive.

        Raises:
            nagios.StatusUnknownError: if fdisk reports no device or MegaCli
                reports no "Device Id" line.
        """
        # get list of OS device names (/dev/sda1, etc)
        devlist = self._list_fdisk_devices()
        if not devlist:
            raise nagios.StatusUnknownError(request, "Can't get disk list")
        # get list of raid controller device ids
        cmd = nagios.rootify('/usr/sbin/MegaCli -PDList -aALL | grep "Device Id"')
        output = commands.getoutput(cmd)
        # Only accept well-formed "Device Id: <n>" lines so that blank output
        # or an error message from the shell is not mistaken for a drive.
        device_ids = re.findall(r"Device Id\s*:\s*(\d+)", output)
        if not device_ids:
            raise nagios.StatusUnknownError(request, "Can't get disk list")
        # note that it does not matter which 'dev' you specify, as long as
        # it's valid. There is no mapping from 'dev' to 'device ID'
        return ["%s -d sat+megaraid,%s" % (devlist[0], did) for did in device_ids]

    def _get_disks(self, request):
        """Return the list of device specs to query.

        An entry is a device path optionally followed by smartctl options
        (for example ``/dev/sda -d sat+megaraid,4``).  The list comes from,
        in order: ``--disk``; MegaCli when ``--raid megaraid``;
        ``smartctl --scan``; ``fdisk -l`` (non-Windows only) when smartctl
        does not know ``--scan``.

        Raises:
            nagios.StatusUnknownError: if no disk list can be obtained.
        """
        if request.disk:
            return [request.disk]
        if request.raid == "megaraid":
            return self._get_megaraid_disks(request)

        output = commands.getoutput(self._get_smartctl(request) + " --scan")
        if self._validate_scan_output(request, output):
            # A scan line reads "<device spec> # <comment>"; keep the spec.
            return [line.split("#")[0].strip()
                    for line in output.split("\n") if line]
        if sys.platform != "win32":
            return self._list_fdisk_devices()
        raise nagios.StatusUnknownError(request, "Can't get disk list")

    def _get_smartctl(self, request):
        """Return the smartctl command prefix built from ``--path``."""
        smartctl = request.path + "smartctl"
        if sys.platform == "win32":
            return smartctl
        return nagios.rootify(smartctl)

    # ------------------------------------------------------------------
    # tool output validation / parsing
    # ------------------------------------------------------------------
    def _validate_scan_output(self, request, output):
        """Tell whether ``smartctl --scan`` output is usable.

        Returns False when smartctl does not recognize ``--scan``.

        Raises:
            nagios.StatusUnknownError: if smartctl itself could not be run.
        """
        not_runnable = (
            "is not recognized as an internal or external command",
            "No such file or directory",
            "command not found",
        )
        if any(marker in output for marker in not_runnable):
            raise nagios.StatusUnknownError(request, output)
        return "=======> UNRECOGNIZED OPTION: scan" not in output

    def _validate_arcconf_output(self, request, output):
        """Return True for usable arcconf output.

        Raises:
            nagios.StatusUnknownError: if the output is empty or shows that
                arcconf could not be run.
        """
        if (not output
                or "No such file or directory" in output
                or "command not found" in output):
            raise nagios.StatusUnknownError(request, "StorMan tool: arcconf is not available")
        return True

    def _validate_output(self, request, output):
        """Classify the output of a per-device smartctl call.

        Returns True when the output should be processed further and False
        when the device does not exist and should be skipped.

        Raises:
            nagios.StatusUnknownError: for any other output.
        """
        if ("=== START OF READ SMART DATA SECTION ===" in output
                or "SMART Health Status" in output):
            return True
        if "Smartctl open device:" in output and "failed: No such device" in output:
            return False
        if "SMART support is: Unavailable" in output:
            return True
        raise nagios.StatusUnknownError(request, output)

    def _parse_output(self, request, output):
        """Yield ``(attribute id, SmartAttribute)`` for each attribute row.

        The ``ID#`` header row supplies the column names; a data row is used
        only when it has exactly as many whitespace-separated fields as the
        header.  All values are kept as the strings found in the output.
        """
        columns = []
        for line in output.split('\n'):
            if not ("ID#" in line or "Pre-fail" in line or "Old_age" in line):
                continue
            fields = line.split()
            if fields[0] == "ID#":
                columns = fields
            if not (fields[0].isdigit() and len(fields) == len(columns)):
                continue
            attribute = SmartAttribute()
            # The first two columns (id and name) are not stored.
            for metric, value in zip(columns[2:], fields[2:]):
                if metric == "VALUE":
                    attribute.value = value
                elif metric == "THRESH":
                    attribute.threshold = value
                elif metric == "WORST":
                    attribute.worst = value
                elif metric == "RAW_VALUE":
                    attribute.raw_value = value
            yield fields[0], attribute

    def _extract_xml(self, request, output, tag):
        """Return the ``<tag ...>...</tag>`` document embedded in *output*.

        Raises:
            nagios.StatusUnknownError: if the element is not present, with
                the raw output as the message.
        """
        closing = "</%s>" % tag
        start = output.find("<" + tag)
        end = output.find(closing, start) if start != -1 else -1
        if end == -1:
            raise nagios.StatusUnknownError(request, output)
        return output[start:end + len(closing)]

    # ------------------------------------------------------------------
    # Adaptec support
    # ------------------------------------------------------------------
    def _detect_adaptec(self, disklist, request):
        """Map Adaptec physical drive ids to display names.

        Devices that smartctl identifies as ``Device: Adaptec <name>`` are
        removed from *disklist* (the list is modified in place).  The
        returned dict maps a drive id found in ``arcconf getconfig 1`` to
        ``"<device path>-<id>"``; it is empty when nothing could be mapped.
        """
        # map disk name to disk mount path
        adaptec_disks = {}
        for disk in set(disklist):
            cmd = nagios.rootify(self._get_smartctl(request) + " -a %s" % disk)
            output = commands.getoutput(cmd)
            results = re.findall(r"Device: Adaptec\s+(\S+)", output)
            if len(results) == 1:
                adaptec_disks[results[0]] = disk
                disklist.remove(disk)

        # map to disknum to disk name (with disknum)
        cmd = nagios.rootify("/usr/StorMan/arcconf getconfig 1")
        output = commands.getoutput(cmd)
        sections = re.findall(r"Logical device number(?:.*\n)+?\n", output, re.M)
        diskdict = {}
        for section in sections:
            names = re.findall(r"Logical device name\s*:\s*(.*)", section)
            # A logical device without a name line, or one smartctl did not
            # report, cannot be mapped to a device path; leave it out.
            if not names or names[0] not in adaptec_disks:
                continue
            for diskid in re.findall(r"Present \(\D*\d+,[^)]*?(\d+)\)", section):
                diskdict[diskid] = "%s-%s" % (adaptec_disks[names[0]], diskid)
        return diskdict

    def retrieve_adaptec_status(self, request, disklist):
        """Collect attribute readings from ``arcconf getsmartstats 1``.

        Returns ``{attribute id: {disk name: SmartAttribute}}``, or an empty
        dict when no Adaptec drive was detected.  *disklist* is modified in
        place by the detection step.

        Raises:
            nagios.StatusUnknownError: if arcconf is unavailable or its
                output carries no ``SmartStats`` element.
        """
        stats = {}
        diskdict = self._detect_adaptec(disklist, request)
        if not diskdict:
            return stats
        cmd = nagios.rootify("/usr/StorMan/arcconf getsmartstats 1")
        output = commands.getoutput(cmd)
        self._validate_arcconf_output(request, output)
        dom = parseString(self._extract_xml(request, output, "SmartStats"))
        for disknode in dom.getElementsByTagName("PhysicalDriveSmartStats"):
            drive_id = disknode.getAttribute("id")
            # Drives missing from the map are reported under their bare id
            # rather than aborting the whole check.
            disk = diskdict.get(drive_id, drive_id)
            for attrnode in disknode.getElementsByTagName("Attribute"):
                attribute = SmartAttribute()
                attribute.value = attrnode.getAttribute("normalizedCurrent")
                attribute.worst = attrnode.getAttribute("normalizedWorst")
                attribute.raw_value = attrnode.getAttribute("rawValue")
                # ids are hexadecimal in the XML; keys are decimal strings
                attrid = str(int(attrnode.getAttribute("id"), 16))
                stats.setdefault(attrid, {})[disk] = attribute
        return stats

    # ------------------------------------------------------------------
    # attribute collection and caching
    # ------------------------------------------------------------------
    def retrieve_batch_status(self, request):
        """Collect attribute readings for every disk.

        Returns ``{attribute id: {device: SmartAttribute}}``.  With
        ``--raid adaptec`` only the arcconf data is returned; otherwise each
        device spec is queried with ``smartctl -A`` and used as the key.
        """
        devicelist = self._get_disks(request)
        stats = {}

        # load the SMART info of adaptec raid controller
        if request.raid == "adaptec":
            disklist = [d.split()[0] for d in devicelist if d]
            stats.update(self.retrieve_adaptec_status(request, disklist))
            return stats

        # load the SMART info of the rest disks.
        for device_with_type in devicelist:
            if not device_with_type:
                continue
            cmd = nagios.rootify(self._get_smartctl(request) + (" -A %s" % device_with_type))
            output = commands.getoutput(cmd)
            if not self._validate_output(request, output):
                continue
            for attrid, attribute in self._parse_output(request, output):
                stats.setdefault(attrid, {})[device_with_type] = attribute
        return stats

    def _load_stats(self, request):
        """Return ``self.stats``, loading it on first use.

        The last saved status is loaded first; if it has no ``"fetchtime"``
        or is older than ``request.interval`` seconds, fresh data is
        collected, stamped and saved.
        """
        if getattr(self, "stats", None) is None:
            self.stats = self.retrieve_last_status(request)
            if ("fetchtime" not in self.stats
                    or int(time.time()) - self.stats["fetchtime"] > request.interval):
                self.stats = self.retrieve_batch_status(request)
                self.stats["fetchtime"] = int(time.time())
                self.save_status(request, self.stats)
        return self.stats

    def get_status_value(self, attr, request):
        """Return the ``{device: SmartAttribute}`` readings for attribute *attr*.

        Raises:
            nagios.StatusUnknownError: if no disk reported that attribute.
        """
        stats = self._load_stats(request)
        if attr not in stats:
            raise nagios.StatusUnknownError(request)
        return stats[attr]

    # ------------------------------------------------------------------
    # health checks
    # ------------------------------------------------------------------
    @plugin.command("OVERALL_HEALTH")
    def get_overall_health(self, request):
        """Report the smartctl overall-health verdict of every disk."""
        disklist = self._get_disks(request)
        status_code, message = self.check_health_status(request, disklist)
        return nagios.Result(request.option, status_code, message, request.appname)

    def check_health_status(self, request, disklist):
        """Run ``smartctl -H`` on each device and summarize the verdicts.

        Returns ``(status_code, message)``.  The status is CRITICAL as soon
        as one disk reports something other than ``PASSED`` or ``OK``;
        devices that do not exist or give no verdict are skipped.
        """
        message = "overall test results"
        status_code = nagios.Status.OK
        for device_with_type in disklist:
            if not device_with_type:
                continue
            disk = device_with_type.split()[0]
            cmd = nagios.rootify(self._get_smartctl(request) + " -H %s" % device_with_type)
            output = commands.getoutput(cmd)
            if not self._validate_output(request, output):
                continue
            if "SMART support is: Unavailable" in output:
                message += " %s=NOTSUPPORT" % disk
                continue
            # Two phrasings of the verdict are recognized.
            test_result = re.findall(r"(?<=SMART overall-health self-assessment test result: )(\w+)", output)
            if not test_result:
                test_result = re.findall(r"(?<=SMART Health Status: )(\w+)", output)
            if not test_result:
                continue
            message += " %s=%s" % (disk, test_result[0])
            if test_result[0] not in ("PASSED", "OK"):
                status_code = nagios.Status.CRITICAL
        return status_code, message

    def check_all_attribute(self, request, disklist):
        """Evaluate every cached attribute of every disk.

        Returns ``(status_code, message)`` where the status is the worst one
        found.  Each reading is checked against ``request.crit`` or, when
        that is not set, against the reading's own threshold.  *disklist* is
        accepted for interface compatibility and not used.
        """
        stats = self._load_stats(request)
        diskstats = {}
        status_code = nagios.Status.OK
        message = "overall health "
        for attrid, diskattr in stats.items():
            # "fetchtime" is the cache timestamp, not an attribute table.
            if attrid == "fetchtime":
                continue
            for device_with_type, attribute in diskattr.items():
                disk = device_with_type.split()[0]
                disk_status_code = diskstats.setdefault(disk, nagios.Status.OK)
                # Decide the limit per reading so that one attribute's
                # threshold is never applied to another attribute.
                critical = request.crit
                if not critical:
                    critical = attribute.threshold
                disk_status_code = self.superimpose(disk_status_code, attribute.value, request.warn, critical,
                                                    reverse=True, exclusive=True)
                if status_code < disk_status_code:
                    status_code = disk_status_code
                diskstats[disk] = disk_status_code
        if status_code > nagios.Status.OK:
            message += nagios.Status.to_status(status_code)
        for disk, stat in diskstats.items():
            if stat > nagios.Status.OK:
                message += " disk %s status %s" % (disk, stat)
        return status_code, message

    @plugin.command("ADAPTEC_HEALTH")
    def get_adaptec_health(self, request):
        """Report the SMART warning count of each Adaptec physical drive.

        The counts are read from ``arcconf getlogs 1 stats``.  The warning
        limit defaults to 1 when ``request.warn`` is not set.

        Raises:
            nagios.StatusUnknownError: on Windows, when no Adaptec drive is
                detected, or when arcconf output is unavailable or carries
                no ``ControllerLog`` element.
        """
        if sys.platform == "win32":
            raise nagios.StatusUnknownError(request, "Adaptec Health only supported on linux.")
        disklist = self._get_disks(request)
        diskdict = self._detect_adaptec(disklist, request)
        if not diskdict:
            raise nagios.StatusUnknownError(request, "No Adaptec Raid Controller detected.")

        cmd = nagios.rootify("/usr/StorMan/arcconf getlogs 1 stats")
        output = commands.getoutput(cmd)
        self._validate_arcconf_output(request, output)
        dom = parseString(self._extract_xml(request, output, "ControllerLog"))

        status_code = nagios.Status.OK
        warn = request.warn if request.warn else 1
        message = ""
        sub_perfs = []
        for statsnode in dom.getElementsByTagName("physicaldrivestats"):
            value = int(statsnode.getAttribute("smartWarnCnt"))
            drive_id = statsnode.getAttribute("id")
            # Drives missing from the map are reported under their bare id.
            disk = diskdict.get(drive_id, drive_id)
            if value > 0:
                if message == "":
                    message = "smart warnings:"
                message += " %s=%s" % (disk, value)
            status_code = self.superimpose(status_code, value, warn, request.crit)
            sub_perfs.append((disk, value))
        if message == "":
            message = "no smart warning"

        r = nagios.Result(request.option, status_code, message, request.appname)
        for disk, value in sub_perfs:
            r.add_performance_data(disk, value, warn=warn, crit=request.crit)
        return r

    # ------------------------------------------------------------------
    # single-attribute commands (the number is the S.M.A.R.T. attribute id)
    # ------------------------------------------------------------------
    @plugin.command("RAW_READ_ERROR_RATE")
    def get_raw_read_error_rate(self, request):
        """Report attribute 1 for every disk."""
        sub_perfs = self.get_status_value("1", request)
        return self.get_result(request, sub_perfs, 'raw read error rate')

    @plugin.command("SPIN_UP_TIME")
    def get_spin_up_time(self, request):
        """Report attribute 3 for every disk."""
        sub_perfs = self.get_status_value("3", request)
        return self.get_result(request, sub_perfs, 'spin up time')

    @plugin.command("REALLOCATE_SECTOR_COUNT")
    def get_reallocated_sector_ct(self, request):
        """Report attribute 5 for every disk."""
        sub_perfs = self.get_status_value("5", request)
        return self.get_result(request, sub_perfs, 'sector reallocation')

    @plugin.command("SPIN_RETRY_COUNT")
    def get_spin_retry_count(self, request):
        """Report attribute 10 for every disk."""
        sub_perfs = self.get_status_value("10", request)
        return self.get_result(request, sub_perfs, 'spin retries')

    @plugin.command("REALLOCATED_EVENT_COUNT")
    def get_reallocated_event_count(self, request):
        """Report attribute 196 for every disk."""
        sub_perfs = self.get_status_value("196", request)
        return self.get_result(request, sub_perfs, 'reallocated events')

    @plugin.command("CUR_PENDING_SECTOR")
    def get_current_pending_sector(self, request):
        """Report attribute 197 for every disk."""
        sub_perfs = self.get_status_value("197", request)
        return self.get_result(request, sub_perfs, 'current pending sectors')

    @plugin.command("OFFLINE_UNCORRECTABLE")
    def get_offline_uncorrectable(self, request):
        """Report attribute 198 for every disk."""
        sub_perfs = self.get_status_value("198", request)
        return self.get_result(request, sub_perfs, 'offline correctability')

    def get_result(self, request, sub_perfs, message):
        """Build the Result for one attribute across all disks.

        *sub_perfs* maps a device to its SmartAttribute.  Each disk's
        normalized value is folded into the overall status and added as
        performance data.  The critical limit is ``request.crit`` or, when
        that is not set, the threshold reported by that same disk.
        """
        status_code = nagios.Status.OK
        r = nagios.Result(request.option, status_code, message, request.appname)
        for disk, attribute in sub_perfs.items():
            # Chosen per disk: drives may report different thresholds for
            # the same attribute.
            critical = request.crit
            if not critical:
                critical = attribute.threshold
            status_code = self.superimpose(status_code, attribute.value, request.warn, critical, reverse=True)
            r.add_performance_data(disk, attribute.value, warn=request.warn, crit=critical)
        r.set_status_code(status_code)
        return r
# Script entry point: hand the command-line arguments (without the program
# name) to the plugin.
if __name__ == "__main__":
    SmartChecker().run(sys.argv[1:])