This commit is contained in:
parent
ae41104f5a
commit
9d2994014e
2 changed files with 63 additions and 48 deletions
3
.pylintrc
Normal file
3
.pylintrc
Normal file
|
@ -0,0 +1,3 @@
|
|||
[MESSAGES CONTROL]
|
||||
disable=consider-using-f-string,
|
||||
invalid-name
|
|
@ -1,45 +1,46 @@
|
|||
#!/usr/bin/env python3
|
||||
#
|
||||
# Nagios plugin to check Ceph cluster state
|
||||
#
|
||||
# This plugin checks Ceph health, number of OSDs UP, number of MONs UP
|
||||
# and PGs states to determine Ceph cluster status.
|
||||
#
|
||||
# Usage: check_ceph_status [options]
|
||||
#
|
||||
# Options:
|
||||
# -h, --help show this help message and exit
|
||||
# -d, --debug
|
||||
# -b BIN, --bin=BIN Ceph binary (default : /usr/bin/ceph)
|
||||
# --conf=CONF Ceph configuration file
|
||||
# -m MON, --mon=MON Ceph monitor address[:port]
|
||||
# -i ID, --id=ID Ceph client id
|
||||
# -k KEYRING, --keyring=KEYRING
|
||||
# Ceph client keyring file
|
||||
# -w WARNLOSTOSD, --warning-lost-osd=WARNLOSTOSD
|
||||
# Warning number of non-up OSDs (default : 1)
|
||||
# -c CRITLOSTOSD, --critical-lost-osd=CRITLOSTOSD
|
||||
# Critical number of non-up OSDs (default : 2)
|
||||
# -W WARNLOSTMON, --warning-lost-mon=WARNLOSTMON
|
||||
# Warning number of non-up MONs (default : 1)
|
||||
# -C CRITLOSTMON, --critical-lost-mon=CRITLOSTMON
|
||||
# Critical number of non-up MONs (default : 2)
|
||||
#
|
||||
# Copyright (c) 2013 Benjamin Renard <brenard@zionetrix.net>
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU General Public License version 2
|
||||
# as published by the Free Software Foundation.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
#
|
||||
"""
|
||||
Nagios plugin to check Ceph cluster state
|
||||
|
||||
This plugin checks Ceph health, number of OSDs UP, number of MONs UP
|
||||
and PGs states to determine Ceph cluster status.
|
||||
|
||||
Usage: check_ceph_status [options]
|
||||
|
||||
Options:
|
||||
-h, --help show this help message and exit
|
||||
-d, --debug
|
||||
-b BIN, --bin=BIN Ceph binary (default : /usr/bin/ceph)
|
||||
--conf=CONF Ceph configuration file
|
||||
-m MON, --mon=MON Ceph monitor address[:port]
|
||||
-i ID, --id=ID Ceph client id
|
||||
-k KEYRING, --keyring=KEYRING
|
||||
Ceph client keyring file
|
||||
-w WARNLOSTOSD, --warning-lost-osd=WARNLOSTOSD
|
||||
Warning number of non-up OSDs (default : 1)
|
||||
-c CRITLOSTOSD, --critical-lost-osd=CRITLOSTOSD
|
||||
Critical number of non-up OSDs (default : 2)
|
||||
-W WARNLOSTMON, --warning-lost-mon=WARNLOSTMON
|
||||
Warning number of non-up MONs (default : 1)
|
||||
-C CRITLOSTMON, --critical-lost-mon=CRITLOSTMON
|
||||
Critical number of non-up MONs (default : 2)
|
||||
|
||||
Copyright (c) 2013 Benjamin Renard <brenard@zionetrix.net>
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU General Public License version 2
|
||||
as published by the Free Software Foundation.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
|
@ -185,6 +186,7 @@ ceph_cmd.append('status')
|
|||
ceph_cmd.append('--format=json')
|
||||
|
||||
# exec command
|
||||
# pylint: disable=consider-using-with
|
||||
p = subprocess.Popen(ceph_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
output, err = p.communicate()
|
||||
|
||||
|
@ -209,7 +211,8 @@ total_mon = data['monmap'].get('num_mons', len(data['monmap'].get('mons', [])))
|
|||
if not total_mon:
|
||||
print("UNKNOWN : fail to retreive total number of monitors")
|
||||
sys.exit(STATUS['UNKNOWN'])
|
||||
total_mon_up = len(data.get('quorum', data['health'].get('timechecks', {}).get('mons', [])))
|
||||
total_mon_up = len(data.get(
|
||||
'quorum', data['health'].get('timechecks', {}).get('mons', [])))
|
||||
if not total_mon_up:
|
||||
print("UNKNOWN : fail to retreive total number of UP monitors")
|
||||
sys.exit(STATUS['UNKNOWN'])
|
||||
|
@ -218,7 +221,8 @@ num_lost_mon = total_mon-total_mon_up
|
|||
if num_lost_mon == 0:
|
||||
monstate = "(MONs UP : %s/%s)" % (total_mon_up, total_mon)
|
||||
else:
|
||||
monstate = "%s MONs down (MONs UP : %s/%s)" % (num_lost_mon, total_mon_up, total_mon)
|
||||
monstate = "%s MONs down (MONs UP : %s/%s)" % (
|
||||
num_lost_mon, total_mon_up, total_mon)
|
||||
if num_lost_mon >= options.critlostmon:
|
||||
status = 'CRITICAL'
|
||||
elif num_lost_mon >= options.warnlostmon and status != 'CRITICAL':
|
||||
|
@ -243,15 +247,22 @@ elif num_lost_osd >= options.warnlostosd and status != 'CRITICAL':
|
|||
total_pg = data['pgmap']['num_pgs']
|
||||
pgstate = ""
|
||||
for st in data['pgmap']['pgs_by_state']:
|
||||
if re.search('(down|inconsistent|imcomplete|stale)', st['state_name'], re.IGNORECASE):
|
||||
if re.search(
|
||||
'(down|inconsistent|imcomplete|stale)', st['state_name'],
|
||||
re.IGNORECASE
|
||||
):
|
||||
status = 'CRITICAL'
|
||||
pgstate = "%s / %s PGs %s" % (pgstate, st['count'], st['state_name'])
|
||||
elif re.search('(replay|degraded|repair|recovering|backfill)', st['state_name'], re.IGNORECASE):
|
||||
elif re.search(
|
||||
'(replay|degraded|repair|recovering|backfill)', st['state_name'],
|
||||
re.IGNORECASE
|
||||
):
|
||||
if status != 'CRITICAL':
|
||||
status = "WARNING"
|
||||
pgstate = "%s / %s PGs %s" % (pgstate, st['count'], st['state_name'])
|
||||
elif st['state_name'] == "active+clean":
|
||||
pgstate = "%s / %s/%s PGs active+clean" % (pgstate, st['count'], total_pg)
|
||||
pgstate = "%s / %s/%s PGs active+clean" % (
|
||||
pgstate, st['count'], total_pg)
|
||||
|
||||
msg = "%s : %s%s %s" % (status, health, pgstate, monstate)
|
||||
|
||||
|
@ -259,5 +270,6 @@ msg = "%s : %s%s %s" % (status, health, pgstate, monstate)
|
|||
if num_lost_osd == 0:
|
||||
print("%s (OSDs UP : %s/%s)" % (msg, total_osd_up, total_osd))
|
||||
else:
|
||||
print("%s / %s OSDs down (OSDs UP : %s/%s)" % (msg, num_lost_osd, total_osd_up, total_osd))
|
||||
print("%s / %s OSDs down (OSDs UP : %s/%s)" % (
|
||||
msg, num_lost_osd, total_osd_up, total_osd))
|
||||
sys.exit(STATUS[status])
|
||||
|
|
Loading…
Reference in a new issue