check_ceph_status/check_ceph_status

276 lines
7.6 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Nagios plugin to check Ceph cluster state
This plugin checks Ceph health, the number of OSDs up, the number of MONs up
and the PG states to determine the Ceph cluster status.
Usage: check_ceph_status [options]
Options:
-h, --help show this help message and exit
-d, --debug
-b BIN, --bin=BIN Ceph binary (default : /usr/bin/ceph)
--conf=CONF Ceph configuration file
-m MON, --mon=MON Ceph monitor address[:port]
-i ID, --id=ID Ceph client id
-k KEYRING, --keyring=KEYRING
Ceph client keyring file
-w WARNLOSTOSD, --warning-lost-osd=WARNLOSTOSD
Warning number of non-up OSDs (default : 1)
-c CRITLOSTOSD, --critical-lost-osd=CRITLOSTOSD
Critical number of non-up OSDs (default : 2)
-W WARNLOSTMON, --warning-lost-mon=WARNLOSTMON
Warning number of non-up MONs (default : 1)
-C CRITLOSTMON, --critical-lost-mon=CRITLOSTMON
Critical number of non-up MONs (default : 2)
Copyright (c) 2013 Benjamin Renard <brenard@zionetrix.net>
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License version 2
as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
"""
import argparse
import json
import os
import re
import subprocess
import sys
# Defaults used when the matching command-line option is not supplied.
CEPH_COMMAND = '/usr/bin/ceph'
WARN_LOST_OSD, CRIT_LOST_OSD = 1, 2
WARN_LOST_MON, CRIT_LOST_MON = 1, 2

# Plugin state name -> process exit code reported to Nagios.
STATUS = dict(OK=0, WARNING=1, CRITICAL=2, UNKNOWN=3)
# Command-line interface.
# Value-taking options rely on argparse's own defaults (action="store",
# string values, default None) unless a keyword says otherwise.
parser = argparse.ArgumentParser()
parser.add_argument('-d', '--debug', dest="debug", action="store_true")

# How to reach the cluster.
parser.add_argument(
    '-b', '--bin', dest="bin", default=CEPH_COMMAND,
    help="Ceph binary (default : %s)" % CEPH_COMMAND)
parser.add_argument(
    '--conf', dest="conf",
    help="Ceph configuration file")
parser.add_argument(
    '-m', '--mon', dest="mon",
    help="Ceph monitor address[:port]")
parser.add_argument(
    '-i', '--id', dest="id",
    help="Ceph client id")
parser.add_argument(
    '-k', '--keyring', dest="keyring",
    help="Ceph client keyring file")

# Thresholds: number of non-up daemons at which the state is raised.
parser.add_argument(
    '-w', '--warning-lost-osd', dest="warnlostosd",
    type=int, default=WARN_LOST_OSD,
    help="Warning number of non-up OSDs (default : %s)" % WARN_LOST_OSD)
parser.add_argument(
    '-c', '--critical-lost-osd', dest="critlostosd",
    type=int, default=CRIT_LOST_OSD,
    help="Critical number of non-up OSDs (default : %s)" % CRIT_LOST_OSD)
parser.add_argument(
    '-W', '--warning-lost-mon', dest="warnlostmon",
    type=int, default=WARN_LOST_MON,
    help="Warning number of non-up MONs (default : %s)" % WARN_LOST_MON)
parser.add_argument(
    '-C', '--critical-lost-mon', dest="critlostmon",
    type=int, default=CRIT_LOST_MON,
    help="Critical number of non-up MONs (default : %s)" % CRIT_LOST_MON)

options = parser.parse_args()
# Refuse to run with paths that do not exist. The binary is always checked;
# the configuration file and the keyring only when they were supplied.
_path_checks = (
    (options.bin, True, "ERROR: ceph executable '%s' doesn't exist"),
    (options.conf, False, "ERROR: ceph conf file '%s' doesn't exist"),
    (options.keyring, False, "ERROR: keyring file '%s' doesn't exist"),
)
for _path, _always, _template in _path_checks:
    if (_always or _path) and not os.path.exists(_path):
        print(_template % _path)
        sys.exit(STATUS['UNKNOWN'])
# Assemble the argument vector: the binary, then every connection option
# that was supplied (flag followed by its value), then the sub-command.
ceph_cmd = [options.bin]
for _flag, _value in (
        ('-m', options.mon),
        ('-c', options.conf),
        ('--id', options.id),
        ('--keyring', options.keyring),
):
    if _value:
        ceph_cmd.extend((_flag, _value))
ceph_cmd += ['status', '--format=json']
# Run the ceph status command and parse its JSON answer into `data`.
# Every failure on this path is reported as UNKNOWN explicitly: an uncaught
# exception would end the interpreter with exit code 1, which is the
# WARNING code in STATUS and would misreport the cluster state.
try:
    with subprocess.Popen(
            ceph_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as p:
        output, err = p.communicate()
except OSError:
    # The binary could not be started (e.g. not executable).
    print("UNKNOWN : fail to execute ceph status command")
    sys.exit(STATUS['UNKNOWN'])
if not output:
    print("UNKNOWN : fail to execute ceph status command")
    sys.exit(STATUS['UNKNOWN'])
try:
    data = json.loads(output.decode(sys.getdefaultencoding()))
except ValueError:
    # Covers both undecodable bytes (UnicodeDecodeError) and malformed
    # JSON (json.JSONDecodeError); both are ValueError subclasses.
    print("UNKNOWN : fail to parse ceph status command output")
    sys.exit(STATUS['UNKNOWN'])
# Start from OK; this check and the ones after it only raise the severity.
status = 'OK'
# The health string is read from 'status', falling back to
# 'overall_status'. A missing 'health' section is treated like a missing
# health string (UNKNOWN) instead of raising KeyError.
_health_section = data.get('health') or {}
health = _health_section.get(
    'status', _health_section.get('overall_status'))
if not health:
    print("UNKNOWN : fail to retreive health status")
    sys.exit(STATUS['UNKNOWN'])
if health == 'HEALTH_WARN':
    status = 'WARNING'
elif health in ('HEALTH_ERR', 'HEALTH_CRIT'):
    # Ceph reports its error state as HEALTH_ERR; HEALTH_CRIT is still
    # accepted so that behaviour for any input matching the previous test
    # is unchanged.
    status = 'CRITICAL'
# Monitors: the total comes from the monmap ('num_mons', else the length of
# its 'mons' list); the number up is the length of the top-level 'quorum'
# list, else of the monitors listed under the health timechecks.
_monmap = data['monmap']
total_mon = _monmap.get('num_mons', len(_monmap.get('mons', [])))
if not total_mon:
    print("UNKNOWN : fail to retreive total number of monitors")
    sys.exit(STATUS['UNKNOWN'])

_timecheck_mons = data['health'].get('timechecks', {}).get('mons', [])
total_mon_up = len(data.get('quorum', _timecheck_mons))
if not total_mon_up:
    print("UNKNOWN : fail to retreive total number of UP monitors")
    sys.exit(STATUS['UNKNOWN'])

num_lost_mon = total_mon - total_mon_up
monstate = (
    "(MONs UP : %s/%s)" % (total_mon_up, total_mon)
    if num_lost_mon == 0 else
    "%s MONs down (MONs UP : %s/%s)" % (
        num_lost_mon, total_mon_up, total_mon)
)

# Raise the severity according to the thresholds; never lower a CRITICAL.
if num_lost_mon >= options.critlostmon:
    status = 'CRITICAL'
elif status != 'CRITICAL' and num_lost_mon >= options.warnlostmon:
    status = 'WARNING'
# OSDs: the counters live either directly in data['osdmap'] or one level
# deeper under a nested 'osdmap' key; accept both layouts.
_osd_counters = data['osdmap'].get('osdmap', data['osdmap'])

total_osd = _osd_counters.get('num_osds')
if total_osd is None:
    print("UNKNOWN : fail to retreive total number of OSD")
    sys.exit(STATUS['UNKNOWN'])

total_osd_up = _osd_counters.get('num_up_osds')
if total_osd_up is None:
    print("UNKNOWN : fail to retreive total number of UP OSD")
    sys.exit(STATUS['UNKNOWN'])

num_lost_osd = total_osd - total_osd_up

# Raise the severity according to the thresholds; never lower a CRITICAL.
if num_lost_osd >= options.critlostosd:
    status = 'CRITICAL'
elif status != 'CRITICAL' and num_lost_osd >= options.warnlostosd:
    status = 'WARNING'
# Placement groups: walk the per-state counters, raise the severity for
# problematic states and build the " / ..." fragments of the status line.
total_pg = data['pgmap']['num_pgs']

# Compiled once, outside the loop. The critical pattern previously read
# "imcomplete", so PGs in the "incomplete" state were never flagged.
_critical_pg = re.compile(
    r'(down|inconsistent|incomplete|stale)', re.IGNORECASE)
_warning_pg = re.compile(
    r'(replay|degraded|repair|recovering|backfill)', re.IGNORECASE)

_pg_fragments = []
for st in data['pgmap']['pgs_by_state']:
    _state_name = st['state_name']
    if _critical_pg.search(_state_name):
        status = 'CRITICAL'
        _pg_fragments.append(" / %s PGs %s" % (st['count'], _state_name))
    elif _warning_pg.search(_state_name):
        # Never lower a CRITICAL set by an earlier check or state.
        if status != 'CRITICAL':
            status = "WARNING"
        _pg_fragments.append(" / %s PGs %s" % (st['count'], _state_name))
    elif _state_name == "active+clean":
        _pg_fragments.append(
            " / %s/%s PGs active+clean" % (st['count'], total_pg))
    # Any other state is neither reported nor counted against the status.
pgstate = "".join(_pg_fragments)
# Emit the single status line and exit with the matching exit code.
# The OSD summary goes last; the count of lost OSDs is only spelled out
# when at least one is not up.
msg = "%s : %s%s %s" % (status, health, pgstate, monstate)
if num_lost_osd != 0:
    print("%s / %s OSDs down (OSDs UP : %s/%s)" % (
        msg, num_lost_osd, total_osd_up, total_osd))
else:
    print("%s (OSDs UP : %s/%s)" % (msg, total_osd_up, total_osd))
sys.exit(STATUS[status])