check_ceph_status/check_ceph_status

276 lines
7.6 KiB
Plaintext
Raw Normal View History

2022-12-15 18:28:21 +01:00
#!/usr/bin/env python3
2022-12-17 00:16:11 +01:00
"""
Nagios plugin to check Ceph cluster state
This plugin checks Ceph health, the number of OSDs UP, the number of MONs UP
and the PG states to determine the Ceph cluster status.
Usage: check_ceph_status [options]
Options:
-h, --help show this help message and exit
-d, --debug
-b BIN, --bin=BIN Ceph binary (default : /usr/bin/ceph)
--conf=CONF Ceph configuration file
-m MON, --mon=MON Ceph monitor address[:port]
-i ID, --id=ID Ceph client id
-k KEYRING, --keyring=KEYRING
Ceph client keyring file
-w WARNLOSTOSD, --warning-lost-osd=WARNLOSTOSD
Warning number of non-up OSDs (default : 1)
-c CRITLOSTOSD, --critical-lost-osd=CRITLOSTOSD
Critical number of non-up OSDs (default : 2)
-W WARNLOSTMON, --warning-lost-mon=WARNLOSTMON
Warning number of non-up MONs (default : 1)
-C CRITLOSTMON, --critical-lost-mon=CRITLOSTMON
Critical number of non-up MONs (default : 2)
Copyright (c) 2013 Benjamin Renard <brenard@zionetrix.net>
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License version 2
as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
"""
2022-12-15 18:28:21 +01:00
import argparse
import json
2022-12-15 18:28:21 +01:00
import os
import re
2022-12-15 18:28:21 +01:00
import subprocess
import sys
2013-12-03 23:44:34 +01:00
# Default values, used when the matching command-line option is not given.
CEPH_COMMAND = '/usr/bin/ceph'

# Number of non-up OSDs from which the check turns WARNING / CRITICAL.
WARN_LOST_OSD = 1
CRIT_LOST_OSD = 2

# Number of non-up MONs from which the check turns WARNING / CRITICAL.
WARN_LOST_MON = 1
CRIT_LOST_MON = 2

# Nagios plugin exit codes, keyed by the status name used in this script.
STATUS = {
    'OK': 0,
    'WARNING': 1,
    'CRITICAL': 2,
    'UNKNOWN': 3,
}
# Command-line interface.
# Options are described in a table and registered in table order, so the
# generated --help output lists them in the same order as before.


def _str_option(dest, text, default=None):
    """Return add_argument() keywords for an option holding a string."""
    return {
        'action': "store",
        'dest': dest,
        'help': text,
        'type': str,
        'default': default,
    }


def _int_option(dest, text, default):
    """Return add_argument() keywords for an integer threshold option."""
    return {
        'action': "store",
        'dest': dest,
        'help': text,
        'type': int,
        'default': default,
    }


_OPTION_TABLE = (
    (('-d', '--debug'),
     {'action': "store_true", 'dest': "debug", 'default': False}),
    (('-b', '--bin'),
     _str_option(
         "bin", "Ceph binary (default : %s)" % CEPH_COMMAND, CEPH_COMMAND)),
    (('--conf',),
     _str_option("conf", "Ceph configuration file")),
    (('-m', '--mon'),
     _str_option("mon", "Ceph monitor address[:port]")),
    (('-i', '--id'),
     _str_option("id", "Ceph client id")),
    (('-k', '--keyring'),
     _str_option("keyring", "Ceph client keyring file")),
    (('-w', '--warning-lost-osd'),
     _int_option(
         "warnlostosd",
         "Warning number of non-up OSDs (default : %s)" % WARN_LOST_OSD,
         WARN_LOST_OSD)),
    (('-c', '--critical-lost-osd'),
     _int_option(
         "critlostosd",
         "Critical number of non-up OSDs (default : %s)" % CRIT_LOST_OSD,
         CRIT_LOST_OSD)),
    (('-W', '--warning-lost-mon'),
     _int_option(
         "warnlostmon",
         "Warning number of non-up MONs (default : %s)" % WARN_LOST_MON,
         WARN_LOST_MON)),
    (('-C', '--critical-lost-mon'),
     _int_option(
         "critlostmon",
         "Critical number of non-up MONs (default : %s)" % CRIT_LOST_MON,
         CRIT_LOST_MON)),
)

parser = argparse.ArgumentParser()
for _flags, _keywords in _OPTION_TABLE:
    parser.add_argument(*_flags, **_keywords)
options = parser.parse_args()
# Validate the paths given on the command line before running anything.
# Each entry is (path, always_checked, message template). The ceph binary is
# always checked; the conf and keyring files only when they were supplied.
_PATH_CHECKS = (
    (options.bin, True, "ERROR: ceph executable '%s' doesn't exist"),
    (options.conf, False, "ERROR: ceph conf file '%s' doesn't exist"),
    (options.keyring, False, "ERROR: keyring file '%s' doesn't exist"),
)
for _path, _always, _template in _PATH_CHECKS:
    if not (_always or _path):
        continue
    if not os.path.exists(_path):
        print(_template % _path)
        sys.exit(STATUS['UNKNOWN'])
# Assemble the "ceph ... status --format=json" command line as an argument
# list. Connection options are added only when they were given.
ceph_cmd = [options.bin]
for _switch, _value in (
        ('-m', options.mon),
        ('-c', options.conf),
        ('--id', options.id),
        ('--keyring', options.keyring),
):
    if _value:
        ceph_cmd.extend((_switch, _value))
ceph_cmd.extend(('status', '--format=json'))
# Run the ceph command and parse its JSON output into `data`.
# Every failure in this section must end with the UNKNOWN exit code: an
# uncaught exception would make the interpreter exit with code 1, which is
# the WARNING code in the STATUS table.
try:
    with subprocess.Popen(
            ceph_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as p:
        output, err = p.communicate()
except OSError as exc:
    # e.g. the binary exists but is not executable
    print("UNKNOWN : fail to execute ceph status command (%s)" % exc)
    sys.exit(STATUS['UNKNOWN'])

if not output:
    # Report the first line of stderr, when there is one, so the operator
    # can see why the command produced nothing on stdout.
    err_lines = err.decode(
        sys.getdefaultencoding(), 'replace').strip().splitlines()
    if err_lines:
        print("UNKNOWN : fail to execute ceph status command (%s)"
              % err_lines[0])
    else:
        print("UNKNOWN : fail to execute ceph status command")
    sys.exit(STATUS['UNKNOWN'])

try:
    data = json.loads(output.decode(sys.getdefaultencoding()))
except ValueError as exc:
    # ValueError covers both UnicodeDecodeError and json.JSONDecodeError
    print("UNKNOWN : fail to parse ceph status output (%s)" % exc)
    sys.exit(STATUS['UNKNOWN'])
# Translate the overall cluster health into the initial Nagios status.
# `status` starts at OK and is only ever raised by the checks that follow.
status = 'OK'

# The health string is read from "status", falling back to "overall_status".
# A missing "health" section is handled like a missing health string.
health_section = data.get('health') or {}
health = health_section.get('status', health_section.get('overall_status'))
if not health:
    print("UNKNOWN : fail to retreive health status")
    sys.exit(STATUS['UNKNOWN'])

if health == 'HEALTH_WARN':
    status = 'WARNING'
elif health in ('HEALTH_ERR', 'HEALTH_CRIT'):
    # Ceph reports its error state as HEALTH_ERR. HEALTH_CRIT is still
    # accepted because the previous version of this check tested for it.
    status = 'CRITICAL'
# Monitors: compare the number of known monitors with the number found in
# the "quorum" list (or, failing that, in health/timechecks/mons).
mon_map = data['monmap']
total_mon = mon_map.get('num_mons', len(mon_map.get('mons', [])))
if not total_mon:
    print("UNKNOWN : fail to retreive total number of monitors")
    sys.exit(STATUS['UNKNOWN'])

mons_up = data.get(
    'quorum', data['health'].get('timechecks', {}).get('mons', []))
total_mon_up = len(mons_up)
if not total_mon_up:
    print("UNKNOWN : fail to retreive total number of UP monitors")
    sys.exit(STATUS['UNKNOWN'])

num_lost_mon = total_mon - total_mon_up
if num_lost_mon:
    monstate = "%s MONs down (MONs UP : %s/%s)" % (
        num_lost_mon, total_mon_up, total_mon)
    # NOTE(review): the source indentation was lost; the thresholds are
    # assumed to be evaluated only when at least one MON is down - confirm.
    if num_lost_mon >= options.critlostmon:
        status = 'CRITICAL'
    elif status != 'CRITICAL' and num_lost_mon >= options.warnlostmon:
        # never downgrade a CRITICAL raised by an earlier check
        status = 'WARNING'
else:
    monstate = "(MONs UP : %s/%s)" % (total_mon_up, total_mon)
# OSDs: the counters are read from data['osdmap']['osdmap'] when that nested
# key exists, otherwise directly from data['osdmap'].
osd_counters = data['osdmap'].get('osdmap', data['osdmap'])

total_osd = osd_counters.get('num_osds')
if total_osd is None:
    print("UNKNOWN : fail to retreive total number of OSD")
    sys.exit(STATUS['UNKNOWN'])

total_osd_up = osd_counters.get('num_up_osds')
if total_osd_up is None:
    print("UNKNOWN : fail to retreive total number of UP OSD")
    sys.exit(STATUS['UNKNOWN'])

# Raise the status according to the number of OSDs that are not up.
num_lost_osd = total_osd - total_osd_up
if num_lost_osd >= options.critlostosd:
    status = 'CRITICAL'
elif status != 'CRITICAL' and num_lost_osd >= options.warnlostosd:
    # never downgrade a CRITICAL raised by an earlier check
    status = 'WARNING'
# Placement groups: scan the per-state PG counts, raise the status for
# problematic states and build the PG part of the output message.
#
# Matching is a case-insensitive substring search on the state name, so a
# combined state name is caught as soon as it contains one of the keywords.
# The critical list previously contained the misspelling "imcomplete", which
# could never match the "incomplete" PG state.
PG_CRITICAL_RE = re.compile(
    r'(down|inconsistent|incomplete|stale)', re.IGNORECASE)
PG_WARNING_RE = re.compile(
    r'(replay|degraded|repair|recovering|backfill)', re.IGNORECASE)

total_pg = data['pgmap']['num_pgs']
pgstate = ""
# A missing "pgs_by_state" list is treated as "no PG state to report".
for st in data['pgmap'].get('pgs_by_state', []):
    if PG_CRITICAL_RE.search(st['state_name']):
        status = 'CRITICAL'
        pgstate = "%s / %s PGs %s" % (pgstate, st['count'], st['state_name'])
    elif PG_WARNING_RE.search(st['state_name']):
        # never downgrade a CRITICAL raised by an earlier check
        if status != 'CRITICAL':
            status = "WARNING"
        pgstate = "%s / %s PGs %s" % (pgstate, st['count'], st['state_name'])
    elif st['state_name'] == "active+clean":
        pgstate = "%s / %s/%s PGs active+clean" % (
            pgstate, st['count'], total_pg)
    # any other state is neither reported nor counted against the status
# Print the single status line and exit with the matching Nagios exit code.
msg = "%s : %s%s %s" % (status, health, pgstate, monstate)
if num_lost_osd:
    _status_line = "%s / %s OSDs down (OSDs UP : %s/%s)" % (
        msg, num_lost_osd, total_osd_up, total_osd)
else:
    _status_line = "%s (OSDs UP : %s/%s)" % (msg, total_osd_up, total_osd)
print(_status_line)
sys.exit(STATUS[status])