-
Notifications
You must be signed in to change notification settings - Fork 54
/
Copy pathcheck_snmp_raid.pl
executable file
·1721 lines (1640 loc) · 89.1 KB
/
check_snmp_raid.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/perl -wT
#
# ============================== SUMMARY =====================================
#
# Program : check_snmp_raid / check_sasraid_megaraid / check_megaraid
# Version : 2.302
# Date : Oct 16, 2013
# Author : William Leibzon - [email protected]
# Copyright: (C) 2006-2013 William Leibzon
# Summary : This is a nagios plugin to monitor Raid controller cards with SNMP
# and report status of the physical drives and logical volumes and
# additional information such as battery status, disk and volume errors.
# Licence : GPL 3 - summary below, full text at http://www.fsf.org/licenses/gpl.txt
# =========================== PROGRAM LICENSE ================================
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# ===================== INFORMATION ABOUT THIS PLUGIN ========================
#
# check_snmp_raid | check_sasraid_megaraid | check_megaraid | check_sasraid
#
# This is a Nagios plugin that uses SNMP to monitor several types of RAID cards.
# It can check status of physical drives and logical volumes, and check for disk errors.
#
# This was originally written to monitor the LSI MegaRAID series of cards, sold by LSI and
# more commonly found in Dell systems under their brand name 'PERC' (PERC3-PERC6).
# Older ones are SCSI RAID cards and newer are SAS RAID cards. New cards sold directly
# are now called MPTFusion and are supported by the plugin too. The plugin code is general
# enough that it was possible to add support for Adaptec and HP SmartArray cards.
# This was added in the 2.x version of this plugin, when it was also renamed check_snmp_raid.
# Support for more controllers may be added if you look at the MIBs and are willing to
# contribute settings for them.
#
# This plugin requires that Net::SNMP be installed on the machine performing the
# monitoring and that snmp agent be set up on the machine to be monitored.
#
# This plugin is maintained by William Leibzon and released at:
# http://william.leibzon.org/nagios/
#
# =============================== SETUP NOTES ================================
#
# Run this plugin with '-h' to see all available options.
#
# This originally started as check_megaraid plugin but now has been extended
# to work with various cards. You must specify what card with '-T' option.
# The following are acceptable types as of Apr 2013:
# megaraid|sasraid|perc3|perc4|perc5|perc6|mptfusion|sas6ir|sas6|
# adaptec|hp|smartarray|eti|ultrastor
#
# You will need to have SNMP package installed appropriate for your card.
#
# If you have SASRaid (also known as PERC5, PERC6) you need sasraid LSI package
# (lsi_mrdsnmpd unix service). This driver is available at
# http://www.lsi.com/storage_home/products_home/internal_raid/megaraid_sas/megaraid_sas_8480e/index.html
#
# For LSI Megaraid (Dell PERC3 and PERC4 cards) the driver package is
# called percsnmp and you either find it on Dell's support site or at
# http://www.lsi.com/storage_home/products_home/internal_raid/megaraid_scsi/
#
# For other cards please check with vendor you bought the card from for
# an appropriate SNMP driver package.
#
# This is a very old example of nagios config for Megaraid card check:
# (note that $USER1$ and $USER6$ are defined in resource.cfg,
# $USER1$ is path to plugin directory and $USER6$ is community string
# also "" around macros in commands.cfg are on purpose, don't forget them):
#
# define command {
# command_name check_megaraid
# command_line $USER1$/check_megaraid.pl -T megaraid -e -o -i -s 1 -H $HOSTADDRESS$ -C $USER6$ -P "$SERVICEPERFDATA$" -S "$SERVICESTATE$,$SERVICESTATETYPE$"
# }
# define service{
# host_name list of target hosts
# service_description Megaraid Status
# check_command check_megaraid
# ...
# }
#
# =========================== VERSION HISTORY ================================
#
# 0. [0.8 - ? 2002] Version 0.8 of check_megaraid plugin was released by
# John Reuning in 2002. His plugin was originally at
# http://www.ibiblio.org/john/megaraid/
#
# This was starting point for this plugin. However less than 10% of the code
# is now from original John's plugin and he has not been involved since then,
# he is now listed as contributor and not as an author. The original
# "Copyright 2002 iBiblio" has also been removed although this may still
# apply to small portions of the code. This note has been added in 2012.
#
# 1. [0.9 - ? 2006] Check consistency has been downgraded to WARNING
# 2. [0.9 - ? 2006] The message in the output is now more detailed
# 3. [0.9 - ? 2006] The number of drives is now displayed in the output
# 4. [1.1 - Feb 2007] Plugin now retrieves snmp oid for read and write errors
# and reports an alert if it's not 0 or -1
# 5. [1.2 - Feb 2007] Plugin now checks 'medium' and 'other' errors for
# all physical drives. This data is returned as performance output and
# in order to detect changes you need to send previous performance results
# as a parameter in the command to this plugin. If your nagios is set to
# send notifications after multiple subsequent non-OK alerts then you
# also need to send previous state so as to force notification
# (performance data would be same as original until non-OK HARD state)
# 6. [1.3 - Apr 2007] Reworked reporting of physical id to report it as
# "control/channel/id" when more than one controller is present or as
# "channel/id" when one controller and more than one channel
# Also for persnmp5 if you have multiple luns (which should not happen
# with disk drives) it will in theory add lun# as ".lun" to physical id
# 7. [1.35 - Apr 2007] Changed reporting of 'medium' and 'other' errors as
# WARNING. Changed output system so that old performance data is
# reported even for UNKNOWN
# 8. [1.4 - Apr 2007] Added specifying SNMP version and changed default
# to v1 because as I found out this actually gets executed faster.
# Also added capability to profile time it takes for this plugin
# to execute with "--debug_time" option
# 9. [1.45 - May 2007] Modifications to output +num of physical or logical
# drive errors when they happen instead of total number of errors
# Also plugin now reports WARNING when array is degraded but one
# of the disks is being rebuilt
# [1.46 - Jun 2007] Minor bug fixes
# 10. [1.5 - Oct 2007] Additional command-line option added to enable
# extra drive error checks I've written (see above) i.e.
# you now have to pass on "-e" option to enable checks for
# medium & other errors. In reality this was already done as option
# before as you had to pass on "-P" with old performance data to
# make use of it, but now it also has to be specifically enabled
# with '-e' or '--drive_errors" option.
# Also new option '-i' ('--extra_info') which adds more information
# in plugin output. For 1.5 this is drive rebuilt rate info.
# 11. [1.6 - Oct 2007] Additional option '-o' ('--snmp_optimize') to minimize
# number of SNMP queries when extra data is needed. When this is given
# only one extra query is made for specific OIDs that are needed
# instead of multiple SNMP table walks. Note that despite this type
# of optimization working very well for number of my other plugins,
# it is not clear if it is actually better with percsnmp or not. Use
# this at your own risk and do some trials with '--debug_time' option
# to see if it is better for you.
# 12. [1.7 - Nov 2007] Some code cleanup and addition of -O to set base oid.
# The only reason you might want this is if you modified /etc/snmp/snmpd
# to have line other than "pass .1.3.6.1.4.1.3582 /usr/sbin/percmain".
# And the only reason to do such modifications is if you have both
# PERC3/4 SCSI Megaraid card(s) and PERC5 SAS card which use sassnmp
# driver by LSI (by default that will also try to use 1.3.6.1.4.1.3582).
# 13. [1.72 - Nov 2007] Changing of megaraid OIDs to SASRAID. This is mostly
# quick hack as in the near future I plan to merge both check_megaraid
# and check_sasraid back into one plugin with -T option specifying
# what type of card you want to check
# 14. [1.75 - Dec 2007] Code fixes and merger of check_megaraid and
# check_sasraid plugins.Type -T option added to specify card type.
# 15. [1.8 - Nov 2010, release Dec 15, 2010] patch by Vitaly Pecharsky:
# Added support for mptsas devices, such as Dell SAS6i/R and other
# similar LSI Logic / Symbios Logic SAS1068E PCI-Express Fusion-MPT SAS
# (and possibly other). Use -T mptfusion|sas6|sas6ir switches for
# these cards. Both arrays (logical + physical) and standalone
# (unconfigured physical only) drive configurations are supported.
# Added explicit support for PERC 6 and PERC H700 controllers,
# which are already supported through the sasraid path.
# 16. [1.901 - Dec 25, 2011] Support for SNMP v3.
# Bunch of new options added to support SNMP v3.
# There is also an incompatible change in that default community is no longer
# 'public' - you must now specify community if you use snmp v1 or v2
# This is all for better security for those few who do use this plugin.
# 17. [1.902 - Jan 12, 2012] Minor fixes mostly in documentation.
# 18. [1.91 - Feb 8, 2012] More bug fixes with 1.9 release (forgot to include verb() function)
# 19. [1.92 - Jun 15, 2012] Bug fixed when no SNMP version was specified.
# Verb function & option updated to allow debug info go to file specified
# as a parameter to -v (now also called --debug) rather than just stdout.
# 20. [1.95 - Oct 22, 2012] New version. Patches and Additions that went here:
# a. merged pull request from goochjj (John Gooch):
# Added good_drives threshold check (new '-g' option) and info on
# make and model of physical drives which is activated with "-i" option
# b. applied patch from Robert Wikman (sent by email) that adds checks on status of
# batteries (BBU data) enabled with a new -b (--check_battery) option
# c. code cleanup and refactoring - functions moved to top and option variables renamed
# d. list of contributors section added
# [2.0 - Oct 2012] The version was originally to be released as 1.95 but with two patches
# and all the code cleanup, the number of changes is more than sub-minor and I released
# it as 2.0. However I later downgraded it back to 1.95 because for 2.0 release the plugin
# is being renamed as check_snmp_raid since it was possible to add support for Adaptec cards.
# * Code contributions for this release: John Gooch, William Leibzon, Robert Wikman *
# 21. [2.1 - Nov 22, 2012] Plugin has been renamed check_snmp_raid. Essentially this is
# first new 2.x release (see above on first released 2.0 being downgraded back to 1.95).
# Release notes for this version:
# a. Adding limited support for Adaptec RAID cards contributed by K Truong
# b. Adding limited support for HP Smart Array RAID, also contributed by K Truong
# c Code updates to make it easier to support more cards and vendors in the future
# d. Making both PHYDRV_CODES and BATTERY_CODES array contain 3 parameters as has
# been the case with LOGDRV_CODES. The first one is short code name,
# 2nd is human-readable text, and 3rd is nagios status it corresponds to.
# e. Documentation updates related to plugin renaming and many other small
# code updates
# f. Starting with 2.x the plugin is licensed under GPL 3.0 licence (was 2.0 before)
# * Code contributions for this release: William Leibzon, Khanh Truong *
# 22. [2.2 - May 3, 2013] The following are additions in this version:
# a. Added limited support for ETI UltraStor (ES1660SS and maybe others) and Synology controllers/devices
# - plugin now supports specifying and checking controller status OIDs for Fan, PowerSupply and similar
# these are all expected to have common simple ok/fail return codes
# - volume checks and temperature OIDs are specified but support for these is not written yet
# b. Added support for battery status and drive vendor and model information for Adaptec cards,
# this is contributed by Stanislav German-Evtushenko (giner on github)
# based on http://www.circitor.fr/Mibs/Html/ADAPTEC-UNIVERSAL-STORAGE-MIB.php#BatteryStatus
# c. Bug fixes, code improvements & refactoring
# - Bug fixes. Fix for debugging. Old DEBUG printfs are replaced with calls to verb() function
# - code_to_description() and code_to_nagiosstatus() functions added
# - %cardtype_map with list of card types added replacing if/elseif in check_options
# - partial rewrite of exit status code for logical and physical drives. This may introduce small
# incompatible changes for megaraid - but old code dates to workarounds for megaraid checks
# and doing manually what 3rd column of LOGDRV_CODES and PHYSDRV_CODES are now used for
# d. New -A/--label option and changes in the nagios status output
# - New -A/--label option allows to specify text to start plugin output in place of default 'Raid'
# - print_output() function has been renamed print_and_exit() and now also exits with specified alert code
# - reordering of output: # of controllers, drives, batteries first, then new additional controller status
# such as 'powersupply is ok' and last model & tasks info which are now labeled as 'tasks []' before data
# * Code contributions for this release: Michael Cook, Stanislav German-Evtushenko, William Leibzon *
# 23. [2.3 - June 28, 2013] The following are additions in this version:
# a. Applied patch by Erez Zarum to properly support multiple sasraid controllers
# . added option -m to enable retrieving extra tables for multi-controller support
# b .Imported snmp_get_table(), snmp_get_request(), set_snmp_window() functions from check_netint 2.4b3
# added options for bulksnmp support and for setting snmp msg window
# c. Code refactoring to replace direct calls to net::snmp get_table and get_request
# functions with above ones that do bulk snmp if desired
# * Code contributions for this release: William Leibzon, Erez Zarum *
# 24. [2.301 - Sep 5, 2013] Fix bug with bulk snmp variable names
# [2.302 - Oct 16, 2013] Additional small bug fix, patch by Adrian Frühwirth
#
# ========================== LIST OF CONTRIBUTORS =============================
#
# The following individuals have contributed code, patches, bug fixes and ideas to
# this plugin (listed in last-name alphabetical order):
#
# Michael Cook
# Adrian Frühwirth
# Stanislav German-Evtushenko
# Joe Gooch
# William Leibzon
# Vitaly Percharsky
# John Reuning
# Khanh Truong
# Robert Wikman
# Erez Zarum
#
# Open source community is grateful for all your contributions.
#
# ========================== START OF PROGRAM CODE ===========================
use strict;
use Getopt::Long;
use Time::HiRes qw(time);

# Release number of this plugin
my $version = "2.302";

# Net::SNMP is loaded at run time inside an eval so that a missing module
# does not abort startup; $no_snmp records whether the load failed.
our $no_snmp = 0;
eval {
    require Net::SNMP;
    Net::SNMP->import();
};
$no_snmp = 1 if $@;

# Nagios specific: try to take %ERRORS and $TIMEOUT from the nagios utils
# module found in the plugins directory.
use lib "/usr/lib/nagios/plugins";
our $TIMEOUT;
our %ERRORS;
eval {
    require utils;
    utils->import(qw(%ERRORS $TIMEOUT));
};
if ($@) {
    # utils could not be loaded: fall back to built-in values
    $TIMEOUT = 30;
    %ERRORS  = (
        'OK'        => 0,
        'WARNING'   => 1,
        'CRITICAL'  => 2,
        'UNKNOWN'   => 3,
        'DEPENDENT' => 4,
    );
}
# some defaults, most can be overridden by input parameters too
my $cardtype="sasraid"; # default card type. Note: there will not be any default in future versions
my $baseoid=""; # if left empty, set_oids() fills in the vendor default for the card type (e.g. "1.3.6.1.4.1.3582" for megaraid)
my $timeout=$TIMEOUT; # default is nagios exported $TIMEOUT variable (or the 30 second fallback set above)
my $DEBUG = 0; # to print debug messages, set this to 1
my $MAX_OUTPUTSTR = 2048; # maximum number of characters of status text printed by print_and_exit()
my $alert = "CRITICAL"; # default alert type if error condition is found
my $label = "Raid"; # Label to start output with
# SNMP authentication options and their default values
my $o_port= 161; # SNMP port
my $o_community = undef; # community - this used to default to 'public' but no more
my $o_login= undef; # Login for snmpv3
my $o_passwd= undef; # Pass for snmpv3
my $v3protocols= undef; # V3 protocol list.
my $o_authproto= 'md5'; # Auth protocol
my $o_privproto= 'des'; # Priv protocol
my $o_privpass= undef; # priv password
my $o_octetlength= undef; # SNMP Message size parameter
my $o_bulksnmp= undef; # do snmp bulk request
my $opt_snmpversion= undef; # SNMP version option, default "1" when undef
my $opt_baseoid= undef; # allows to override default $baseoid above
########## CORE PLUGIN CODE (do not change below this line) ##################
# Other option variables (filled in by GetOptions in check_options())
my $o_host = undef; # hostname
my $o_timeout= undef; # -t/--timeout value; $timeout above defaults to $TIMEOUT (utils.pm value or the 30 second fallback)
my $o_help= undef; # print help text?
my $o_version= undef; # print version
my $opt_cardtype= undef; # option to set card type i.e. 'sasraid' or 'megaraid', or 'mptfusion' or 'perc3', 'perc5' etc
my $opt_alert= undef; # what type of alert to issue
my $opt_debug= undef; # verbose mode/debug file name
my $opt_gooddrives= undef; # how many good drives should system have, less gives an alert
my $opt_perfdata= undef; # -P option to pass previous performance data (to determine if new drive failed)
my $opt_prevstate= undef; # -S option to pass previous state (to determine if new drive failed)
my $opt_debugtime= undef; # used with -P and enables comparison of how long ops take every time, not for normal operation
my $opt_drverrors= undef; # -e option. megaraid only. checks for new medium and other errors, requires previous perf data
my $opt_optimize= undef; # -o experimental option to optimize SNMP queries for faster performance
my $opt_extrainfo= undef; # -i option that gives more info on drives and their state at the expense of more queries
my $opt_battery= undef; # -b option to check if RAID card batteries (BBU) are working
my $opt_label= undef; # text to start plugin output with which overrides default "Raid"
my $opt_multcontrollers=undef; # use this option when multiple sas controllers are present
# Other global variables
my $nagios_status= "OK"; # nagios return status code, starts with "OK"
my $error= ""; # string that gets set if error is found
my %curr_perf= (); # performance vars
my %prev_perf= (); # previous performance data fed to plugin with -P
my @prev_state= (); # state based on above
my %debug_time= (); # for debugging of how long execution takes
my $session= undef; # SNMP session
my $snmp_session_v= 0; # if no snmp session, its 0, otherwise 1 2 or 3 depending on version of SNMP session opened
my $do_bulk_snmp = 0; # 1 for bulk snmp queries
# Card names accepted with -T, each resolved to the single canonical card type
# it stands for. Entries are grouped by canonical type: the word list holds the
# accepted spellings and the string on the left is the type they map to.
my %cardtype_map = (
    (map { ($_ => 'megaraid')  } qw(megaraid perc3 perc4)),
    (map { ($_ => 'sasraid')   } qw(sasraid perc5 perc6 perch700)),
    (map { ($_ => 'mptfusion') } qw(mptfusion sas6 sas6ir)),
    (map { ($_ => 'adaptec')   } qw(adaptec)),
    (map { ($_ => 'hp')        } qw(hp smartarray)),
    (map { ($_ => 'ultrastor') } qw(ultrastor eti)),
    (map { ($_ => 'synology')  } qw(synology)),
);
# These variables are set by set_oids() function based on cardtype
# only $logdrv_status_tableoid, $phydrv_status_tableoid, %LOGDRV_CODES, %PHYDRV_CODES are required
# rest are optional and may only appear for specific card type
my $logdrv_status_tableoid = undef; # logical drives status. responses to that are in %LOGDRV_CODES table
my $logdrv_task_status_tableoid = undef; # logical drive task status info. only adaptec. responses in %LOGDRV_TASK_CODES
my $logdrv_task_completion_tableoid = undef; # logical drive task completion info. similar to rebuild rate for phydrv?
my $phydrv_status_tableoid = undef; # physical drives status. responses to that are in %PHYDRV_CODES table
my $phydrv_mediumerrors_tableoid = undef; # number of medium errors on physical drives. only old scsi megaraid
my $phydrv_othererrors_tableoid = undef; # number of 'other' errors on physical drives. only old scsi megaraid
my $phydrv_vendor_tableoid = undef; # drive vendor or drive type info on physical drives
my $phydrv_product_tableoid = undef; # specific drive model or combined vendor+model on each physical drive
my $phydrv_rebuildstats_tableoid = undef; # rebuild Task Stats. For when new drive is added to existing RAID array
my $phydrv_assignedname_tableoid = undef; # TODO: name of the drive configured in the system
my $phydrv_temperature_tableoid = undef; # TODO: drives temperature
my $phydrv_count_oid = undef; # used only by sasraid to verify number of drives in the system. not a table
my $phydrv_goodcount_oid = undef; # only sasraid. number of good drives in the system. not a table
my $phydrv_badcount_oid = undef; # only sasraid. number of bad drives in the system (set_oids labels it pdDiskPredFailureCount). not a table
my $phydrv_bad2count_oid = undef; # only sasraid. number of bad drives in the system (set_oids labels it pdDiskFailureCount). not a table
my $adapter_status_tableoid = undef; # only sasraid. get adapter status overall, this includes goodcount, badcount, etc. to optimize snmp requests.
my $phydrv_controller_tableoid = undef; # only sasraid. get controller id for each drive
my $phydrv_channel_tableoid = undef; # only sasraid. get channel id (enclosure) for each drive
my $phydrv_devid_tableoid = undef; # only sasraid. get device id for each drive
my $phydrv_lunid_tableoid = undef; # only sasraid. get lun id for each drive
my $sys_temperature_oid = undef; # TODO: controller/system temperature
my $readfail_oid = undef; # only old megaraid. number of read fails. not a table
my $writefail_oid = undef; # only old megaraid. number of write fails. not a table
my $adpt_readfail_oid = undef; # only megaraid. number of read fails. not sure of the difference from above any more
my $adpt_writefail_oid = undef; # only megaraid. number of write fails. not sure of the difference from above
my $battery_status_tableoid = undef; # table to check batteries and their status
my %controller_status_oids = (); # set of additional oids to check that report controller operating status
my %controller_status_codes = (); # set of responses for above additional status tables (must be same for all tables)
# Each *_CODES hash below maps a raw SNMP value to [short name, description, Nagios status name]
my %LOGDRV_CODES = (); # Logical Drive Status responses and corresponding description and Nagios exit code
my %LOGDRV_TASK_CODES = (); # Logical Drive Task Status responses and corresponding Nagios exit code
my %PHYDRV_CODES = (); # Physical Drives Status responses and corresponding descriptions and Nagios exit code
my %BATTERY_CODES = (); # Raid Controller Battery status responses and corresponding Nagios exit codes
# Function to set values for OIDs that are used
#
# Reads the file-level $cardtype and $baseoid. When $baseoid is empty it is set
# to the vendor default for the card type; every OID variable is then built by
# appending a fixed suffix to $baseoid. Takes no arguments and returns nothing
# useful: its results are the file-level OID variables and code tables.
# Calls usage() (which exits) when $cardtype matches none of the branches.
#
# Every *_CODES / %controller_status_codes table maps a raw SNMP value to
#   [ short state name, description text, Nagios status name ]
# Column 2 is what code_to_description() returns and column 3 is what
# code_to_nagiosstatus() returns.
#
# Original author's notes on column 1 of %LOGDRV_CODES (the code acting on it is
# outside this function):
#   'optimal' is for OK status,
#   'degraded' if it is CRITICAL is forced to WARNING if drive is being rebuilt
#      and has WARNING state,
#   'initialize' & 'checkconsistency' are just regular WARNING and no longer
#      have special meaning.
#
# NOTE(review): some entries use status 'NONE', which is not a key of %ERRORS;
# presumably the caller treats it specially - confirm.
sub set_oids {
    if ($cardtype eq 'megaraid') {
        $baseoid = "1.3.6.1.4.1.3582" if $baseoid eq "";               # megaraid standard base oid
        $logdrv_status_tableoid       = $baseoid . ".1.1.2.1.3";       # megaraid logical
        $phydrv_status_tableoid       = $baseoid . ".1.1.3.1.4";       # megaraid physical
        $phydrv_mediumerrors_tableoid = $baseoid . ".1.1.3.1.12";      # megaraid medium errors
        $phydrv_othererrors_tableoid  = $baseoid . ".1.1.3.1.15";      # megaraid other errors
        $phydrv_rebuildstats_tableoid = $baseoid . ".1.1.3.1.11";
        $phydrv_product_tableoid      = $baseoid . ".1.1.3.1.8";       # megaraid drive vendor+model
        $readfail_oid                 = $baseoid . ".1.1.1.1.13";
        $writefail_oid                = $baseoid . ".1.1.1.1.14";
        $adpt_readfail_oid            = $baseoid . ".1.1.1.1.15";
        $adpt_writefail_oid           = $baseoid . ".1.1.1.1.16";
        ## Status codes for logical drives
        %LOGDRV_CODES = (
            0 => ['offline',          'drive is offline',       'NONE'     ],
            1 => ['degraded',         'array is degraded',      'CRITICAL' ],
            2 => ['optimal',          'functioning properly',   'OK'       ],
            3 => ['initialize',       'currently initializing', 'WARNING'  ],
            4 => ['checkconsistency', 'array is being checked', 'WARNING'  ],
        );
        ## Status codes for physical drives
        %PHYDRV_CODES = (
            1  => ['ready',    'ready',    'OK'],
            3  => ['online',   'online',   'OK'],
            4  => ['failed',   'failed',   'CRITICAL'],
            5  => ['rebuild',  'rebuild',  'WARNING'],   # description used to be misspelled 'reuild'
            6  => ['hotspare', 'hotspare', 'OK'],
            20 => ['nondisk',  'nondisk',  'OK'],
        );
    }
    elsif ($cardtype eq 'mptfusion') {
        $baseoid = "1.3.6.1.4.1.3582" if $baseoid eq "";               # megaraid standard base oid
        $logdrv_status_tableoid       = $baseoid . ".5.1.4.3.1.2.1.5";  # mptfusion logical
        $phydrv_status_tableoid       = $baseoid . ".5.1.4.2.1.2.1.10"; # mptfusion physical
        $phydrv_mediumerrors_tableoid = $baseoid . ".5.1.4.2.1.2.1.7";  # mptfusion medium errors
        $phydrv_othererrors_tableoid  = $baseoid . ".5.1.4.2.1.2.1.8";  # mptfusion other errors
        $phydrv_vendor_tableoid       = $baseoid . ".5.1.4.2.1.2.1.24"; # mptfusion drive vendor (this needs to be verified)
        $phydrv_product_tableoid      = $baseoid . ".5.1.4.2.1.2.1.25"; # mptfusion drive model (this needs to be verified)
        ## Status codes for logical drives
        %LOGDRV_CODES = (
            0 => ['offline',  'volume is offline',    'NONE'     ],
            1 => ['degraded', 'partially degraded',   'CRITICAL' ],
            2 => ['degraded', 'fully degraded',       'CRITICAL' ],
            3 => ['optimal',  'functioning properly', 'OK'       ]
        );
        ## Status codes for physical drives - these are for MPTFUSION
        %PHYDRV_CODES = (
            0  => ['unconfigured_good', 'unconfigured_good', 'OK'],
            1  => ['unconfigured_bad',  'unconfigured_bad',  'CRITICAL'],
            2  => ['hotspare',          'hotspare',          'OK'],
            16 => ['offline',           'offline',           'OK'],
            17 => ['failed',            'failed',            'CRITICAL'],
            20 => ['rebuild',           'rebuild',           'WARNING'],
            24 => ['online',            'online',            'OK'],
        );
    }
    elsif ($cardtype eq 'sasraid') {
        $baseoid = "1.3.6.1.4.1.3582" if $baseoid eq "";               # megaraid standard base oid
        $adapter_status_tableoid      = $baseoid . ".4.1.4.1.2.1";
        $logdrv_status_tableoid       = $baseoid . ".4.1.4.3.1.2.1.5";  # sasraid logical
        # $sas_logdrv_name_tableoid   = $baseoid . ".4.1.4.3.1.2.1.6";  # sas virtual device name
        $phydrv_status_tableoid       = $baseoid . ".4.1.4.2.1.2.1.10"; # sasraid physical
        $phydrv_mediumerrors_tableoid = $baseoid . ".4.1.4.2.1.2.1.7";  # sasraid medium errors
        $phydrv_othererrors_tableoid  = $baseoid . ".4.1.4.2.1.2.1.8";  # sasraid other errors
        $phydrv_vendor_tableoid       = $baseoid . ".4.1.4.2.1.2.1.24"; # sasraid drive vendor
        $phydrv_product_tableoid      = $baseoid . ".4.1.4.2.1.2.1.25"; # sasraid drive model
        $phydrv_controller_tableoid   = $baseoid . ".4.1.4.2.1.2.1.22"; # sasraid drive to controller id
        $phydrv_channel_tableoid      = $baseoid . ".4.1.4.2.1.2.1.18"; # sasraid drive to enclosure/channel id
        $phydrv_devid_tableoid        = $baseoid . ".4.1.4.2.1.2.1.2";  # sasraid drive to device id
        $phydrv_lunid_tableoid        = $baseoid . ".4.1.4.2.1.2.1.1";  # sasraid drive to lun id
        $phydrv_count_oid             = $baseoid . ".4.1.4.1.2.1.21";   # pdPresentCount
        $phydrv_goodcount_oid         = $baseoid . ".4.1.4.1.2.1.22";   # pdDiskPresentCount
        $phydrv_badcount_oid          = $baseoid . ".4.1.4.1.2.1.23";   # pdDiskPredFailureCount
        $phydrv_bad2count_oid         = $baseoid . ".4.1.4.1.2.1.24";   # pdDiskFailureCount
        $battery_status_tableoid      = $baseoid . ".4.1.4.1.6.2.1.27"; # battery replacement status
        ## Status codes for logical drives
        %LOGDRV_CODES = (
            0 => ['offline',  'volume is offline',    'NONE'     ],
            1 => ['degraded', 'partially degraded',   'CRITICAL' ],
            2 => ['degraded', 'fully degraded',       'CRITICAL' ],
            3 => ['optimal',  'functioning properly', 'OK'       ]
        );
        ## Status codes for physical drives - these are for SASRAID
        %PHYDRV_CODES = (
            0  => ['unconfigured_good', 'unconfigured_good', 'OK'],
            1  => ['unconfigured_bad',  'unconfigured_bad',  'CRITICAL'],
            2  => ['hotspare',          'hotspare',          'OK'],
            16 => ['offline',           'offline',           'OK'],
            17 => ['failed',            'failed',            'CRITICAL'],
            20 => ['rebuild',           'rebuild',           'WARNING'],
            24 => ['online',            'online',            'OK'],
        );
        ## Status codes for battery replacement - these are for SASRAID
        %BATTERY_CODES = (
            0 => ['ok',   'Battery OK',                'OK'],
            1 => ['fail', 'Battery needs replacement', 'WARNING']
        );
    }
    elsif ($cardtype eq 'adaptec') {
        $baseoid = "1.3.6.1.4.1.795" if $baseoid eq "";                    # Adaptec base oid
        $logdrv_status_tableoid          = $baseoid . ".14.1.1000.1.1.12"; # adaptec logical drives status
        $logdrv_task_status_tableoid     = $baseoid . ".14.1.1000.1.1.6";  # adaptec logical drive task status
        $logdrv_task_completion_tableoid = $baseoid . ".14.1.1000.1.1.7";  # adaptec logical drive task completion
        $phydrv_status_tableoid          = $baseoid . ".14.1.400.1.1.11";  # adaptec physical drive status
        $battery_status_tableoid         = $baseoid . ".14.1.201.1.1.14";  # adaptec battery status
        $phydrv_vendor_tableoid          = $baseoid . ".14.1.400.1.1.6";   # adaptec drive vendor
        $phydrv_product_tableoid         = $baseoid . ".14.1.400.1.1.7";   # adaptec drive model
        ## Status codes for logical drives
        %LOGDRV_CODES = (
            1 => ['unknown',   'array state is unknown',          'UNKNOWN'],
            2 => ['unknown',   'array state is other or unknown', 'UNKNOWN'],
            3 => ['optimal',   'array is functioning properly',   'OK'],
            4 => ['optimal',   'array is functioning properly',   'OK'],
            5 => ['degraded',  'array is impacted',               'CRITICAL'],
            6 => ['degraded',  'array is degraded',               'CRITICAL'],
            7 => ['failed',    'array failed',                    'CRITICAL'],
            8 => ['compacted', 'array is compacted',              'UNKNOWN'], # Does anybody know what "compacted" means?
        );
        ## Status codes for logical drive tasks - these codes are for ADAPTEC
        ## 1st and 3rd columns are not used so far
        %LOGDRV_TASK_CODES = (
            1  => ['unknown',        'array task status is unknown', 'UNKNOWN'],
            2  => ['other',          'other',                        'UNKNOWN'],
            3  => ['noTaskActive',   'noTaskActive',                 'OK'],
            4  => ['reconstruct',    'reconstruct',                  'WARNING'],
            5  => ['zeroInitialize', 'zeroInitialize',               'WARNING'],
            6  => ['verify',         'verify',                       'WARNING'],
            7  => ['verifyWithFix',  'verifyWithFix',                'WARNING'],
            8  => ['modification',   'modification',                 'WARNING'],
            9  => ['copyback',       'copyback',                     'WARNING'],
            10 => ['compaction',     'compaction',                   'WARNING'],
            11 => ['expansion',      'expansion',                    'WARNING'],
            12 => ['snapshotBackup', 'snapshotBackup',               'WARNING'],
        );
        ## Status codes for physical drives
        %PHYDRV_CODES = (
            1 => ['unknown', 'unknown', 'WARNING'],
            2 => ['other',   'other',   'OK'],
            3 => ['okay',    'okay',    'OK'],
            4 => ['warning', 'warning', 'WARNING'],
            5 => ['failure', 'failure', 'CRITICAL'],
        );
        ## Status codes for batteries - these codes are for ADAPTEC
        %BATTERY_CODES = (
            1 => ['unknown',           'unknown',           'UNKNOWN'],
            2 => ['other',             'other',             'WARNING'],
            3 => ['notApplicable',     'notApplicable',     'WARNING'],
            4 => ['notInstalled',      'notInstalled',      'WARNING'],
            5 => ['okay',              'Battery OK',        'OK'],
            6 => ['failed',            'failed',            'CRITICAL'],
            7 => ['charging',          'charging',          'WARNING'],
            8 => ['discharging',       'discharging',       'WARNING'],
            9 => ['inMaintenanceMode', 'inMaintenanceMode', 'WARNING']
        );
    }
    # %cardtype_map resolves both 'hp' and 'smartarray' to the canonical name 'hp',
    # so this branch has to accept that name as well as the literal 'smartarray'.
    elsif ($cardtype eq 'smartarray' || $cardtype eq 'hp') {
        $baseoid = "1.3.6.1.4.1.232" if $baseoid eq "";                # HP (SmartArray) base oid
        $logdrv_status_tableoid = $baseoid . ".3.2.3.1.1.4";
        $phydrv_status_tableoid = $baseoid . ".3.2.5.1.1.6";
        ## Status codes for logical drives
        %LOGDRV_CODES = (
            # as taken from CPQIDA-MIB
            # other(1),
            # ok(2),
            # failed(3),
            # unconfigured(4),
            # recovering(5),
            # readyForRebuild(6),
            # rebuilding(7),
            # wrongDrive(8),
            # badConnect(9),
            # overheating(10),
            # shutdown(11),
            # expanding(12),
            # notAvailable(13),
            # queuedForExpansion(14)
            1  => ['unknown',       'array state is unknown',        'UNKNOWN'],
            2  => ['optimal',       'array is functioning properly', 'OK'],
            3  => ['failed',        'array failed',                  'CRITICAL'],
            4  => ['initialize',    'array is unconfigured',         'WARNING'],
            5  => ['degraded',      'array is recovering',           'WARNING'],
            6  => ['degraded',      'array is ready for rebuild',    'WARNING'],
            7  => ['degraded',      'array is rebuilding',           'WARNING'],
            8  => ['other-failure', 'array wrong drive',             'CRITICAL'],
            9  => ['other-failure', 'array bad connect',             'CRITICAL'],
            10 => ['other-failure', 'array is overheating',          'CRITICAL'],
            11 => ['failed',        'array is shutdown',             'CRITICAL'],
            12 => ['initialize',    'array is expanding',            'WARNING'],
            13 => ['other-failure', 'array not available',           'CRITICAL'],
            14 => ['initialize',    'array queued for expansion',    'WARNING'],
        );
        ## Status codes for physical drives
        %PHYDRV_CODES = (
            1 => ['other',   'other unknown error',           'UNKNOWN'],  # maybe this should be critical in nagios?
            2 => ['okay',    'okay',                          'OK'],
            3 => ['failure', 'failure',                       'CRITICAL'],
            4 => ['warning', 'warning on predictive failure', 'WARNING'],  # predictive failure
        );
    }
    elsif ($cardtype eq 'ultrastor') {
        $baseoid = "1.3.6.1.4.1.22274" if $baseoid eq "";              # ETI base oid
        $logdrv_status_tableoid  = $baseoid . ".1.2.3.1.6";            # logical volume status
        # $voldrv_status_tableoid = $baseoid . ".1.2.2.1.6";           # ETI volume status (NOT SUPPORTED YET)
        $phydrv_status_tableoid  = $baseoid . ".1.2.1.1.5";            # physical status
        $phydrv_vendor_tableoid  = $baseoid . ".1.2.1.1.8";            # drive vendor
        $phydrv_product_tableoid = $baseoid . ".1.2.1.1.15";           # drive model
        ## Status codes for logical drives
        %LOGDRV_CODES = (
            0 => ['offline',  'volume is offline',    'OK'       ],
            1 => ['degraded', 'partially degraded',   'WARNING'  ],
            2 => ['degraded', 'fully degraded',       'CRITICAL' ],
            3 => ['optimal',  'functioning properly', 'OK'       ]
        );
        ## Status codes for physical drives
        %PHYDRV_CODES = (
            0  => ['unconfigured_good', 'unconfigured_good', 'OK'],
            1  => ['unconfigured_bad',  'unconfigured_bad',  'CRITICAL'],
            2  => ['hotspare',          'hotspare',          'OK'],
            16 => ['offline',           'offline',           'OK'],
            17 => ['failed',            'failed',            'CRITICAL'],
            20 => ['rebuild',           'rebuild',           'WARNING'],
            24 => ['online',            'online',            'OK'],
        );
        ## Controller Systems Status OIDs
        %controller_status_oids = (
            "generalstatus"  => $baseoid . ".1.1.1",
            "temperature"    => $baseoid . ".1.1.2",
            "voltage"        => $baseoid . ".1.1.3",
            "ups"            => $baseoid . ".1.1.4",
            "fan"            => $baseoid . ".1.1.5",
            "powersupply"    => $baseoid . ".1.1.6",
            "dualcontroller" => $baseoid . ".1.1.7",
        );
        ## Controller general status codes
        %controller_status_codes = (
            0 => ['good', 'ok',  'OK'       ],
            1 => ['bad',  'bad', 'CRITICAL' ],
        );
    }
    elsif ($cardtype eq 'synology') {
        $baseoid = "1.3.6.1.4.1.6574" if $baseoid eq "";               # Synology base oid
        $logdrv_status_tableoid       = $baseoid . ".3.3";             # logical volume status
        $phydrv_status_tableoid       = $baseoid . ".2.5";             # physical status
        $phydrv_vendor_tableoid       = $baseoid . ".2.4";             # not drive vendor, rather drive type (SATA,SSD)
        # NOTE(review): ".2.5" is also used for the physical status table above, and
        # ".2.2" is used for both of the next two variables - looks like copy/paste;
        # verify against the Synology MIB before relying on these values.
        $phydrv_product_tableoid      = $baseoid . ".2.5";             # drive model
        $phydrv_assignedname_tableoid = $baseoid . ".2.2";             # name of the drive configured in the system (NOT SUPPORTED YET)
        $phydrv_temperature_tableoid  = $baseoid . ".2.2";             # drive temperature (NOT SUPPORTED YET)
        $sys_temperature_oid          = $baseoid . ".1.2";             # system temperature (NOT SUPPORTED YET)
        ## Status codes for logical drives
        %LOGDRV_CODES = (
            1  => ['optimal',          'RAID is functioning normally',                    'OK'       ],
            2  => ['degraded',         'RAID is being repaired',                          'CRITICAL' ],
            3  => ['initialize',       'RAID is being migrated',                          'WARNING'  ],
            4  => ['initialize',       'RAID is being expanded',                          'WARNING'  ],
            5  => ['initialize',       'RAID is being deleted',                           'WARNING'  ],
            6  => ['initialize',       'RAID is being created',                           'WARNING'  ],
            7  => ['initialize',       'RAID is being synced',                            'WARNING'  ],
            8  => ['checkconsistency', 'parity checking of RAID array',                   'OK'       ],
            9  => ['initialize',       'RAID is being assembled',                         'WARNING'  ],
            10 => ['initialize',       'cancel operation',                                'WARNING'  ], # unsure what to put here for 1st column
            11 => ['degraded',         'RAID array is degraded but failure is tolerable', 'WARNING'  ],
            12 => ['failed',           'RAID array has crashed and now in read-only',     'CRITICAL' ],
        );
        ## Status codes for physical drives
        %PHYDRV_CODES = (
            1 => ['normal',          'disk is ok',                      'OK'],
            2 => ['initialized',     'disk has partitions and no data', 'WARNING'],
            3 => ['notinitialized',  'disk has not been initialized',   'WARNING'],
            4 => ['partitionfailed', 'partitions on disk are damaged',  'CRITICAL'],
            5 => ['crashed',         'the disk has failed',             'CRITICAL'],
        );
        ## Controller Systems Status OIDs
        %controller_status_oids = (
            "systemstatus" => $baseoid . ".1.1",
            "powersupply"  => $baseoid . ".1.3",
            "systemfan"    => $baseoid . ".1.4.1",
            "cpufan"       => $baseoid . ".1.4.2",
        );
        ## Controller general status codes
        %controller_status_codes = (
            1 => ['good', 'ok',  'OK'       ],
            2 => ['bad',  'bad', 'CRITICAL' ],
        );
    }
    else {
        usage("Specified card type $cardtype is not supported\n");
    }
}
# Return the description text (2nd array element) stored for a status code.
#   1st argument: reference to a code table (code => [name, description, nagios status])
#   2nd argument: the status code to look up
# Returns "unknown code <code>" when the table has no defined entry for the code.
sub code_to_description {
    my ($table_ref, $code) = @_;
    my %table = %$table_ref;
    return "unknown code $code" unless defined $table{$code};
    return $table{$code}[1];
}
# Return the Nagios status name (3rd array element) stored for a status code.
#   1st argument: reference to a code table (code => [name, description, nagios status])
#   2nd argument: the status code to look up
#   3rd argument (optional): a status name already in effect
# A code without a defined table entry yields the file-level $alert value.
# When a current status is passed and its %ERRORS value is higher than that of
# the looked-up status, the current status is returned instead.
sub code_to_nagiosstatus {
    my ($table_ref, $code, $current_status) = @_;
    my %table = %$table_ref;
    my $status = defined($table{$code}) ? $table{$code}[2] : $alert;
    return $status unless defined($current_status);
    return ($ERRORS{$status} < $ERRORS{$current_status}) ? $current_status : $status;
}
# verbose output for debugging (updated 06/06/12 to write to debug file if specified)
#
# Prints the given message followed by a newline, but only when $DEBUG is set.
# When $opt_debug holds a non-empty file name the message is appended to that
# file; if no file name is set, or the file cannot be opened, the message goes
# to STDOUT instead.
sub verb {
    my $t = shift;
    return unless $DEBUG;

    if (defined($opt_debug) && $opt_debug ne "") {
        # Three-argument open with a lexical handle: the file name comes from the
        # command line and must never be parsed for mode characters (with the old
        # two-argument form a name such as "&1" was treated as a descriptor dup).
        if (open(my $debug_fh, '>>', $opt_debug)) {
            print {$debug_fh} $t, "\n";
            close($debug_fh);
            return;
        }
        # could not open the debug file: fall through to STDOUT
    }
    print $t, "\n";
}
# Print "<script name> version <version>" on one line (used for -V/--version and by help()).
sub print_version {
    printf("%s version %s\n", $0, $version);
}
# display usage information
#
# Prints "Usage:", the single-line option synopsis, and the --version/--help
# forms. The synopsis is kept as one entry per option group and joined with
# single spaces, so the output is still one line.
sub print_usage {
    my @synopsis = (
        "$0",
        "[-s <snmp_version>]",
        "-H <host>",
        # closing ']' after <privp> was missing
        "(-C <snmp_community>) | (-l login -x passwd [-X pass -L <authp>,<privp>])",
        "[-p <port>]",
        "[-t <timeout>]",
        "[-O <base oid>]",
        "[-A <status label text>]",
        "[-a <alert level>]",
        "[--extra_info]",
        "[--check_battery]",
        "[--multiple_controllers]",
        "[-g <num good drives>]",
        "[--drive_errors -P <previous performance data> -S <previous state>]",
        "[-v [DebugLogFile] || -d DebugLogFile]",
        "[--debug_time]",
        "[--snmp_optimize]",
        "[--bulk_snmp_queries=<optimize|std|on|off>]",
        "[--msgsize=<num octets>]",
        # same names as the keys of %cardtype_map; closing ']' was missing
        "[-T megaraid|sasraid|perc3|perc4|perc5|perc6|perch700|mptfusion|sas6ir|sas6|adaptec|hp|smartarray|eti|ultrastor|synology]",
    );
    print "Usage:\n";
    print join(' ', @synopsis), "\n OR \n";
    print "$0 --version | $0 --help (use this to see more detailed documentation of above options)\n";
}
# Print each argument on its own line, then the usage synopsis, and exit with
# the UNKNOWN status code from %ERRORS. Does not return.
sub usage {
    my @messages = @_;
    for my $message (@messages) {
        print $message . "\n";
    }
    print_usage();
    exit($ERRORS{'UNKNOWN'});
}
# display help information
#
# Prints the version line, a short description, the usage synopsis and then the
# detailed description of every option. Only prints; it does not exit.
sub help {
    print_version();
    print "GPL 3.0 license (c) 2006-2013 William Leibzon\n";
    print "This plugin uses SNMP to check state of RAID controllers and attached drives.\n";
    print "Supported brands are: LSI, MPTFusion, Dell PERC, Adaptec, HP SmartArray, ETI Ultrastor and more.\n";
    print "\n";
    print_usage();
    print "\n";
    print "Options:\n";
    print " -h, --help\n";
    print " Display help\n";
    print " -V, --version\n";
    print " Display version\n";
    print " -A, --label <string>\n";
    print " Specify text to be printed first in nagios status line. Default label of \"Raid\"\n";
    print " -T, --controller_type <type>\n";
    print " Type of controller, specify one of:\n";
    print " megaraid|sasraid|perc3|perc4|perc5|perc6|perch700|mptfusion|sas6|sas6ir|\n";
    print " adaptec|hp|smartarray|eti|ultrastor|synology\n";
    print " (aliases: megaraid=perc3,perc4; sasraid=perc5,perc6,perch700;\n";
    print " mptfusion=sas6ir,sas6; smartarray=hp; ultrastor=eti)\n";
    print " Note: 'sasraid' has been default type if not specified for 1.x, 2.1 and 2.2\n";
    print " plugin versions. From 2.3 specifying controller type will be required!\n";
    print " -a, --alert <alert level>\n";
    print " Alert status to use if an error condition not otherwise known is found\n";
    print " accepted values are: \"crit\" and \"warn\" (defaults to crit)\n";
    print " Note: This option should not be used any more and will be deprecated in 3.x version\n";
    print " type of alert depending on SNMP status from MIB is now specified in arrays for each card type\n";
    print " (except old special cases like megaraid & sasraid drive error counts for which it still applies)\n";
    print " -m, --multiple_controllers\n";
    print " Enables better support of multiple controllers. Retrieves several additional controller->drive map\n";
    print " and status tables. Currently valid only for sasraid cards\n";
    print " -b, --check_battery\n";
    print " Check and output information on hard drive batteries (BBU) for supported cards\n";
    print " 'sasraid' and 'adaptec' card types are currently supported, more maybe added later\n";
    print " -i, --extra_info\n";
    print " Extra information in output. This may include rebuild rate, product & drive vendor names, etc\n";
    # the option is registered with GetOptions as 'good_drives'
    print " -g, --good_drives <number>\n";
    print " For sasraid check how many good drives should the system have. If its less than this, alert is issued\n";
    print " -e, --drive_errors\n";
    print " Do additional checks for medium and other errors on each drive (only megaraid cards).\n";
    print " This is about 2x as many SNMP check and so can slow plugin down.\n";
    print " !! You will need to pass to plugin previous PERF data and STATE with -P and -S options !!\n";
    print " -P, --perf <performance data>\n";
    # single quotes keep the Nagios $MACRO$ names from being interpolated by perl
    print ' The results of previous check performance data ($SERVICEPERFDATA$ macro)'."\n";
    print " which contains number of medium and other errors that were before\n";
    print " if this is not the same now then ALERT is sent\n";
    print " -S, --state <STATE,STATETYPE>\n";
    print " If you use -P and you have notifications set to be sent at > 1 alerts\n";
    print " then you need to send previous state and type (HARD or SOFT) and then\n";
    print " this plugin would continue to report non-OK state until STATETYPE changes\n";
    print " to HARD thereby making sure user receives NOTIFICATION\n";
    print " Proper use of this is have '-S ".'"$SERVICESTATE$,$SERVICESTATETYPE$"'."' in your commands.cfg\n";
    print "\nSNMP Access Options:\n";
    print " -H, --hostname <host>\n";
    print " Hostname or IP address of target to check\n";
    print " -O, --oid <base oid>\n";
    print " Base OID is normally set based on your controller and you almost never need to change it\n";
    print " unless you custom-set it different for your card (the only case I know is when you have both\n";
    print " percsnmp and sassnmp cards since normally each would want to use same megaraid OID)\n";
    print " -s, --snmp_version 1 | 2 | 2c | 3\n";
    print " Version of SNMP protocol to use (default is 1 if -C and 3 if -l specified)\n";
    print " -p, --port <port>\n";
    print " SNMP port (defaults to 161)\n";
    print " -C, --community <community>\n";
    print " SNMP community string (for SNMP v1 and v2 only)\n";
    print " -l, --login=LOGIN ; -x, --passwd=PASSWD\n";
    print " Login and auth password for snmpv3 authentication\n";
    print " If no priv password exists, implies AuthNoPriv\n";
    print " -X, --privpass=PASSWD\n";
    print " Priv password for snmpv3 (AuthPriv protocol)\n";
    print " -L, --protocols=<authproto>,<privproto>\n";
    print " <authproto> : Authentication protocol (md5|sha : default md5)\n";
    print " <privproto> : Priv protocols (des|aes : default des)\n";
    print " --octetlength=INTEGER, --msgsize=INTEGER\n";
    print " Max-size of the SNMP message. Be careful with network filters.\n";
    print " Range 484 - 65535. Default is usually 1472,1452,1460 or 1440 depending on your system.\n";
    print " If bulk_snmp_queries (see below) OR --multiple_controllers option (see above)\n";
    print " are used then, it is reset to 15 times default or minimum of 16384.\n";
    print " -t, --timeout <timeout>\n";
    print " Seconds before timing out (defaults to Nagios timeout value or 30 seconds)\n";
    print " -o, --snmp_optimize\n";
    print " Try to minimize number of SNMP queries replacing snmp_walk with retrieval of OIDs at once\n";
    print " !! EXPERIMENTAL, USE AT YOUR OWN RISK !!! Use --debug_time to check it is actually faster.\n";
    print " --bulk_snmp_queries[=optimize|std|on|off]\n";
    print " Enables or disables using GET_BULK_REQUEST to retrieve SNMP data. Options:\n";
    print " 'on' will always try to use BULK_REQUESTS\n";
    print " 'off' means do not use BULK_REQUESTS at all\n";
    print " 'std' means bulk queries to get table with snmp v2 and v3 but not get requests\n";
    print " 'optimize' means bulk queries for table and for get requests of > 30 OIDs\n";
    print " Default setting (if --bulk_snmp_queries is not specified) is 'std' without -o\n";
    print " and 'optimize' if -o option is specified. If you specify --bulk_snmp_queries\n";
    print " without text option after =, this enables 'optimize' even if -o is not used.\n";
    print "\nDebug Options:\n";
    print " --debug[=FILENAME] | --verbose[=FILENAME]\n";
    print " Enables verbose debug output printing exactly what data was retrieved from SNMP\n";
    print " This is mainly for manual checks when testing this plugin on the console\n";
    print " If filename is specified instead of STDOUT the debug data is written to that file\n";
    print " --debug_time \n";
    print " This must be used with '-P' option and measures on how long each SNMP operation takes\n";
    print " The data gets output out as 'performance' data so this can be seen in nagios, but\n";
    print " I'd not expect it to be graphed, just look at it from nagios status cgi when you need\n";
    print "\n";
}
# Parse a previous performance-data string into a hash.
# The first argument is split on whitespace; every piece matching name=<digits>
# is reported through verb() and, unless its name starts with "time_", stored
# as name => digits. Returns the resulting hash as a list.
sub process_perf {
    my %pdh;
    for my $item (split(' ', $_[0])) {
        next unless $item =~ /(.*)=(\d+)/;
        my ($name, $value) = ($1, $2);
        verb("prev_perf: $name = $value");
        # time_* entries are the --debug_time measurements and are not carried over
        next if $name =~ /^time_/;
        $pdh{$name} = $value;
    }
    return %pdh;
}
# print output status and performance data and exit
#
#   1st argument: Nagios status name; printed after $label and used as the key
#                 into %ERRORS for the process exit code
#   2nd argument: optional status text, printed after " - "
# Performance data is printed after " |" only when -P data was supplied
# ($opt_perfdata defined). Does not return.
sub print_and_exit {
    my ($out_status, $out_str) = @_;

    print "$label $out_status";
    # max number of characters is $MAX_OUTPUTSTR defined at the top, if it is set to undef this is not checked
    if (defined($out_str) && $out_str) {
        $out_str = substr($out_str, 0, $MAX_OUTPUTSTR)
            if defined($MAX_OUTPUTSTR) && length($out_str) > $MAX_OUTPUTSTR;
        print " - $out_str";
    }

    if (defined($opt_perfdata)) {
        print " |";
        # below is done to force notification on alert condition when you have notifications after 2 or more alerts
        if (scalar(keys %curr_perf) != 0
            && (!defined($opt_prevstate)
                || scalar(keys %prev_perf) == 0
                || (defined($prev_state[0]) && $prev_state[0] ne 'OK'
                    && (!defined($prev_state[1]) || $prev_state[1] eq 'HARD')))) {
            # report the values collected by this run
            print " ". $_ ."=". $curr_perf{$_} foreach keys %curr_perf;
        }
        else {
            # repeat the previous values, followed by the current error totals
            print " ". $_ ."=". $prev_perf{$_} foreach keys %prev_perf;
            print " total_merr=".$curr_perf{'total_merr'} if defined($curr_perf{'total_merr'});
            # this value used to be printed under the label 'total_merr' as well
            print " total_oerr=".$curr_perf{'total_oerr'} if defined($curr_perf{'total_oerr'});
        }
        if ($opt_debugtime) {
            print " time_".$_ ."=". $debug_time{$_} foreach keys %debug_time;
        }
    }
    print "\n";
    exit $ERRORS{$out_status};
}
# Function to parse command line arguments
sub check_options {
Getopt::Long::Configure('bundling', 'no_ignore_case');
GetOptions (
'h' => \$o_help, 'help' => \$o_help,
'V' => \$o_version, 'version' => \$o_version,
't:s' => \$o_timeout, 'timeout:s' => \$o_timeout,
'A:s' => \$opt_label, 'label:s' => \$opt_label,
'O:s' => \$opt_baseoid, 'oid:s' => \$opt_baseoid,
'a:s' => \$opt_alert, 'alert:s' => \$opt_alert,
'v:s' => \$opt_debug, 'verbose:s' => \$opt_debug,
'd:s' => \$opt_debug, 'debug:s' => \$opt_debug,
'debug_time' => \$opt_debugtime,
'P:s' => \$opt_perfdata, 'perf:s' => \$opt_perfdata,
'S:s' => \$opt_prevstate, 'state:s' => \$opt_prevstate,
'e' => \$opt_drverrors, 'drive_errors' => \$opt_drverrors,
'g:i' => \$opt_gooddrives, 'good_drives' => \$opt_gooddrives,
'o' => \$opt_optimize, 'snmp_optimize' => \$opt_optimize,
'i' => \$opt_extrainfo, 'extra_info' => \$opt_extrainfo,
'b' => \$opt_battery, 'check_battery' => \$opt_battery,
'T:s' => \$opt_cardtype, 'controller_type:s' => \$opt_cardtype,
'm' => \$opt_multcontrollers, 'multiple_controllers' => \$opt_multcontrollers,
'C:s' => \$o_community, 'community:s' => \$o_community,
's:s' => \$opt_snmpversion, 'snmp_version:s' => \$opt_snmpversion,
'H:s' => \$o_host, 'hostname:s' => \$o_host,
'p:i' => \$o_port, 'port:i' => \$o_port,
'l:s' => \$o_login, 'login:s' => \$o_login,
'x:s' => \$o_passwd, 'passwd:s' => \$o_passwd,
'X:s' => \$o_privpass, 'privpass:s' => \$o_privpass,
'L:s' => \$v3protocols, 'protocols:s' => \$v3protocols,
'msgsize:i' => \$o_octetlength, 'octetlength:i' => \$o_octetlength,
'bulk_snmp_queries:s' => \$o_bulksnmp,
);
if (defined($o_help)) { help(); exit $ERRORS{"UNKNOWN"}; };
if (defined($o_version)) { print_version(); exit $ERRORS{"UNKNOWN"}; };
# hostname
if (defined($o_host) && $o_host) {
if ($o_host =~ m/^([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+|[a-zA-Z][-a-zA-Z0-9]*(\.[a-zA-Z][-a-zA-Z0-9]*)*)$/) {
$o_host = $1;
}
else {
usage("Invalid hostname: $o_host\n");
}
}
else {
usage("Hostname or IP address not specified\n");
}
if ($no_snmp) {
print "Can't locate Net/SNMP.pm\n"; exit $ERRORS{"UNKNOWN"};
}
# snmp version parameter, default auto-detect with version 1 if community is specified
if (!defined($opt_snmpversion)) {
if (defined($o_community) && !defined($o_login) && !defined($o_passwd)) {
$opt_snmpversion = '1';
}
elsif (!defined($o_community) && defined($o_login) && defined($o_passwd)) {
$opt_snmpversion = '3';
}
else {
usage("Can not autodetect SNMP version when -C and -l are both specified\n");
}
}
if ($opt_snmpversion eq '2' || $opt_snmpversion eq '2c') {
$opt_snmpversion='2';
}
elsif ($opt_snmpversion ne '1' && $opt_snmpversion ne '3') {
usage("Invalid or unsupported value ($opt_snmpversion) for SNMP version\n");
}
if (defined($o_login) || defined($o_passwd)) {
if (defined($o_community)) { usage("Can't mix snmp v1,2c,3 protocols!\n"); }
if ($opt_snmpversion ne '3') { usage("Incorrect snmp version specified!\n"); }
}
if (defined($o_community)) {
if ($opt_snmpversion eq '3') { usage("SNMP version 3 does not use community\n"); }