#!/usr/bin/env python3
"""
Nagios plugin to check Ceph cluster state

This plugin checks ceph health, number of OSDs UP, number of MONs UP
and PGs states to determine Ceph cluster status.

Usage: check_ceph_status [options]

Options:
  -h, --help            show this help message and exit
  -d, --debug
  -b BIN, --bin=BIN     Ceph binary (default : /usr/bin/ceph)
  --conf=CONF           Ceph configuration file
  -m MON, --mon=MON     Ceph monitor address[:port]
  -i ID, --id=ID        Ceph client id
  -k KEYRING, --keyring=KEYRING
                        Ceph client keyring file
  -w WARNLOSTOSD, --warning-lost-osd=WARNLOSTOSD
                        Warning number of non-up OSDs (default : 1)
  -c CRITLOSTOSD, --critical-lost-osd=CRITLOSTOSD
                        Critical number of non-up OSDs (default : 2)
  -W WARNLOSTMON, --warning-lost-mon=WARNLOSTMON
                        Warning number of non-up MONs (default : 1)
  -C CRITLOSTMON, --critical-lost-mon=CRITLOSTMON
                        Critical number of non-up MONs (default : 2)

Copyright (c) 2013 Benjamin Renard

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License version 2
as published by the Free Software Foundation.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
"""

import argparse
import json
import os
import re
import subprocess
import sys

# default ceph values
CEPH_COMMAND = '/usr/bin/ceph'
WARN_LOST_OSD = 1
CRIT_LOST_OSD = 2
WARN_LOST_MON = 1
CRIT_LOST_MON = 2

# nagios exit code
STATUS = {
    'OK': 0,
    'WARNING': 1,
    'CRITICAL': 2,
    'UNKNOWN': 3
}

# Ceph reports its error state as HEALTH_ERR. HEALTH_CRIT is not a state
# Ceph emits, but it is still accepted so that nothing the previous version
# of this plugin matched is lost.
CRITICAL_HEALTH = ('HEALTH_ERR', 'HEALTH_CRIT')

# PG state fragments that make the check CRITICAL / WARNING. PG state names
# are '+'-joined combinations (e.g. "active+degraded"), hence the substring
# search instead of an exact comparison.
CRITICAL_PG_STATES = re.compile(
    r'(down|inconsistent|incomplete|stale)', re.IGNORECASE)
WARNING_PG_STATES = re.compile(
    r'(replay|degraded|repair|recovering|backfill)', re.IGNORECASE)


def _build_parser():
    """Return the command line parser of the plugin."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-d', '--debug',
        action="store_true",
        dest="debug",
        default=False
    )
    parser.add_argument(
        '-b', '--bin',
        dest="bin",
        help="Ceph binary (default : %s)" % CEPH_COMMAND,
        type=str,
        default=CEPH_COMMAND
    )
    parser.add_argument(
        '--conf',
        dest="conf",
        help="Ceph configuration file",
        type=str,
        default=None
    )
    parser.add_argument(
        '-m', '--mon',
        dest="mon",
        help="Ceph monitor address[:port]",
        type=str,
        default=None
    )
    parser.add_argument(
        '-i', '--id',
        dest="id",
        help="Ceph client id",
        type=str,
        default=None
    )
    parser.add_argument(
        '-k', '--keyring',
        dest="keyring",
        help="Ceph client keyring file",
        type=str,
        default=None
    )
    parser.add_argument(
        '-w', '--warning-lost-osd',
        dest="warnlostosd",
        help="Warning number of non-up OSDs (default : %s)" % WARN_LOST_OSD,
        type=int,
        default=WARN_LOST_OSD
    )
    parser.add_argument(
        '-c', '--critical-lost-osd',
        dest="critlostosd",
        help="Critical number of non-up OSDs (default : %s)" % CRIT_LOST_OSD,
        type=int,
        default=CRIT_LOST_OSD
    )
    parser.add_argument(
        '-W', '--warning-lost-mon',
        dest="warnlostmon",
        help="Warning number of non-up MONs (default : %s)" % WARN_LOST_MON,
        type=int,
        default=WARN_LOST_MON
    )
    parser.add_argument(
        '-C', '--critical-lost-mon',
        dest="critlostmon",
        help="Critical number of non-up MONs (default : %s)" % CRIT_LOST_MON,
        type=int,
        default=CRIT_LOST_MON
    )
    return parser


def _check_paths(options):
    """Return an error message if a file named in options is missing.

    Returns None when the ceph binary and, if given, the configuration
    file and the keyring all exist.
    """
    if not os.path.exists(options.bin):
        return "ERROR: ceph executable '%s' doesn't exist" % options.bin
    if options.conf and not os.path.exists(options.conf):
        return "ERROR: ceph conf file '%s' doesn't exist" % options.conf
    if options.keyring and not os.path.exists(options.keyring):
        return "ERROR: keyring file '%s' doesn't exist" % options.keyring
    return None


def _build_command(options):
    """Return the ``ceph status`` command line as an argument list."""
    ceph_cmd = [options.bin]
    if options.mon:
        ceph_cmd += ['-m', options.mon]
    if options.conf:
        ceph_cmd += ['-c', options.conf]
    if options.id:
        ceph_cmd += ['--id', options.id]
    if options.keyring:
        ceph_cmd += ['--keyring', options.keyring]
    ceph_cmd += ['status', '--format=json']
    return ceph_cmd


def _escalate(current, candidate):
    """Return the more severe of two Nagios states; never downgrades."""
    if STATUS[candidate] > STATUS[current]:
        return candidate
    return current


def evaluate_cluster_status(data,
                            warnlostosd=WARN_LOST_OSD,
                            critlostosd=CRIT_LOST_OSD,
                            warnlostmon=WARN_LOST_MON,
                            critlostmon=CRIT_LOST_MON):
    """Turn the decoded output of ``ceph status`` into a Nagios result.

    Args:
        data: dict decoded from ``ceph status --format=json``. The sections
            ``health``, ``monmap``, ``osdmap`` and ``pgmap`` are read.
        warnlostosd: number of non-up OSDs from which the state is WARNING.
        critlostosd: number of non-up OSDs from which the state is CRITICAL.
        warnlostmon: number of non-up MONs from which the state is WARNING.
        critlostmon: number of non-up MONs from which the state is CRITICAL.

    Returns:
        A ``(status, message)`` tuple. ``status`` is a key of ``STATUS`` and
        ``message`` is the complete line the plugin has to print. When a
        required value cannot be found, ``status`` is ``'UNKNOWN'``.

    Raises:
        KeyError: if one of the sections listed above is missing.
    """
    health_section = data['health']
    # Two layouts are handled: 'status' is tried first, 'overall_status' is
    # the fallback.
    health = health_section.get(
        'status', health_section.get('overall_status'))
    if not health:
        return 'UNKNOWN', "UNKNOWN : fail to retreive health status"

    status = 'OK'
    if health == 'HEALTH_WARN':
        status = 'WARNING'
    elif health in CRITICAL_HEALTH:
        status = 'CRITICAL'

    # --- monitors -------------------------------------------------------
    monmap = data['monmap']
    total_mon = monmap.get('num_mons', len(monmap.get('mons', [])))
    if not total_mon:
        return ('UNKNOWN',
                "UNKNOWN : fail to retreive total number of monitors")

    total_mon_up = len(data.get(
        'quorum', health_section.get('timechecks', {}).get('mons', [])))
    if not total_mon_up:
        return ('UNKNOWN',
                "UNKNOWN : fail to retreive total number of UP monitors")

    num_lost_mon = total_mon - total_mon_up
    if num_lost_mon == 0:
        monstate = "(MONs UP : %s/%s)" % (total_mon_up, total_mon)
    else:
        monstate = "%s MONs down (MONs UP : %s/%s)" % (
            num_lost_mon, total_mon_up, total_mon)
        # Thresholds are only consulted when at least one monitor is missing.
        if num_lost_mon >= critlostmon:
            status = 'CRITICAL'
        elif num_lost_mon >= warnlostmon:
            status = _escalate(status, 'WARNING')

    # --- OSDs -----------------------------------------------------------
    # The counters are read from a nested 'osdmap' entry when there is one,
    # otherwise from the section itself.
    osdmap = data['osdmap'].get('osdmap', data['osdmap'])
    total_osd = osdmap.get('num_osds')
    if total_osd is None:
        return 'UNKNOWN', "UNKNOWN : fail to retreive total number of OSD"
    total_osd_up = osdmap.get('num_up_osds')
    if total_osd_up is None:
        return 'UNKNOWN', "UNKNOWN : fail to retreive total number of UP OSD"

    num_lost_osd = total_osd - total_osd_up
    if num_lost_osd >= critlostosd:
        status = 'CRITICAL'
    elif num_lost_osd >= warnlostosd:
        status = _escalate(status, 'WARNING')

    # --- placement groups -----------------------------------------------
    total_pg = data['pgmap']['num_pgs']
    pg_parts = []
    for pg_state in data['pgmap']['pgs_by_state']:
        name = pg_state['state_name']
        if CRITICAL_PG_STATES.search(name):
            status = 'CRITICAL'
            pg_parts.append(" / %s PGs %s" % (pg_state['count'], name))
        elif WARNING_PG_STATES.search(name):
            status = _escalate(status, 'WARNING')
            pg_parts.append(" / %s PGs %s" % (pg_state['count'], name))
        elif name == "active+clean":
            pg_parts.append(" / %s/%s PGs active+clean" % (
                pg_state['count'], total_pg))
        # Any other state changes neither the status nor the message.

    message = "%s : %s%s %s" % (status, health, "".join(pg_parts), monstate)
    if num_lost_osd == 0:
        message = "%s (OSDs UP : %s/%s)" % (message, total_osd_up, total_osd)
    else:
        message = "%s / %s OSDs down (OSDs UP : %s/%s)" % (
            message, num_lost_osd, total_osd_up, total_osd)
    return status, message


def main(argv=None):
    """Run the check and return the Nagios exit code.

    Args:
        argv: list of command line arguments; ``None`` means ``sys.argv[1:]``.

    Returns:
        One of the values of ``STATUS``. Every failure to obtain or to
        interpret the ceph output yields ``STATUS['UNKNOWN']``: an uncaught
        exception would end the interpreter with exit code 1, which Nagios
        interprets as WARNING.
    """
    options = _build_parser().parse_args(argv)

    error = _check_paths(options)
    if error:
        print(error)
        return STATUS['UNKNOWN']

    ceph_cmd = _build_command(options)
    if options.debug:
        # Debug output goes to stderr so the status line on stdout is intact.
        print("DEBUG : executing %s" % ' '.join(ceph_cmd), file=sys.stderr)

    try:
        proc = subprocess.run(
            ceph_cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False
        )
    except OSError as exc:
        print("UNKNOWN : fail to execute ceph status command (%s)" % exc)
        return STATUS['UNKNOWN']

    if options.debug and proc.stderr:
        print(
            "DEBUG : ceph stderr : %s" % proc.stderr.decode(
                sys.getdefaultencoding(), 'replace'),
            file=sys.stderr
        )

    if not proc.stdout:
        print("UNKNOWN : fail to execute ceph status command")
        return STATUS['UNKNOWN']

    try:
        data = json.loads(proc.stdout.decode(sys.getdefaultencoding()))
    except ValueError:
        # Covers both undecodable bytes and invalid JSON.
        print("UNKNOWN : fail to parse ceph status output")
        return STATUS['UNKNOWN']

    try:
        status, message = evaluate_cluster_status(
            data,
            warnlostosd=options.warnlostosd,
            critlostosd=options.critlostosd,
            warnlostmon=options.warnlostmon,
            critlostmon=options.critlostmon
        )
    except (KeyError, TypeError, AttributeError) as exc:
        # The JSON is valid but does not have the expected layout.
        print("UNKNOWN : unexpected ceph status output (%r)" % (exc,))
        return STATUS['UNKNOWN']

    print(message)
    return STATUS[status]


if __name__ == '__main__':
    sys.exit(main())