#!/bin/bash
#
# Nagios plugin to check Postgresql streaming replication state
#
# Can be used on master or on standby node
#
# Requirements:
#
#   Some CLI tools: awk, sed, bc, psql and pg_lsclusters
#
#   On master node: Slaves must be able to connect with user from recovery.conf
#                   (or user specified using -U) to database with the same name
#                   (or another specified with -D) as trust (or via md5 using
#                   password specified in ~/.pgpass). This user must have
#                   SUPERUSER privilege (need to get replication details).
#
#   On standby node: PG_USER must be able to connect locally on the database
#                    with the same name (or another specified with -D) as trust
#                    (or via md5 using password specified in ~/.pgpass).
#
# Author: Benjamin Renard <brenard@easter-eggs.com>
# Date: Wed, 04 Nov 2020 15:31:13 +0100
# Source: https://gogs.zionetrix.net/bn8/check_pg_streaming_replication
# SPDX-License-Identifier: GPL-3.0-or-later
#

# Fallback values, used when a setting is neither given on the command line
# nor auto-detected.
DEFAULT_PG_USER=postgres
DEFAULT_PG_VERSION=9.1
# Built from DEFAULT_PG_VERSION: PG_VERSION is not set yet at this point, so
# using it here would yield "/var/lib/postgresql//main".
DEFAULT_PG_MAIN=/var/lib/postgresql/$DEFAULT_PG_VERSION/main
DEFAULT_PG_PORT=5432

# Runtime settings. An empty value means "not provided": it is filled in later
# from the command-line options, auto-detection or the defaults above.
PG_USER=""
PG_VERSION=""
PG_MAIN=""
PG_MASTER_USER=""
PSQL_BIN=/usr/bin/psql
PG_LSCLUSTER_BIN=/usr/bin/pg_lsclusters
RECOVERY_CONF_FILENAME=recovery.conf
RECOVERY_CONF=""
PG_DEFAULT_PORT=""
# Application name looked up on the master when recovery.conf does not set one
PG_DEFAULT_APP_NAME=$( hostname )
PG_DB=""
# 1 = also compare the master current xlog with the last received one
CHECK_CUR_MASTER_XLOG=1
# Replay delay thresholds, in seconds
REPLAY_WARNING_DELAY=3
REPLAY_CRITICAL_DELAY=5

# 1 = print debug messages
DEBUG=0

# Print the command-line help on stdout, then terminate the script.
#
# Arguments:
#   $1 - optional exit status (default: 0), so that callers reporting an
#        invocation error can exit with a non-OK status
function usage () {
cat << EOF
Usage : $0 [-d] [-h] [options]
  -u pg_user            Specify local Postgres user (Default: try to auto-detect or use $DEFAULT_PG_USER)
  -b psql_bin           Specify psql binary path (Default: $PSQL_BIN)
  -B pg_lsclusters_bin  Specify pg_lsclusters binary path (Default: $PG_LSCLUSTER_BIN)
  -V pg_version         Specify Postgres version (Default: try to auto-detect or use $DEFAULT_PG_VERSION)
  -m pg_main            Specify Postgres main directory path (Default: try to auto-detect or use
                        $DEFAULT_PG_MAIN)
  -r recovery_conf      Specify Postgres recovery configuration file path
                        (Default: [PG_MAIN]/$RECOVERY_CONF_FILENAME)
  -U pg_master_user     Specify Postgres user to use on master (Default: user from recovery.conf file)
  -p pg_port            Specify default Postgres master TCP port (Default: same as local PostgreSQL
                        port if detected or use $DEFAULT_PG_PORT)
  -D dbname             Specify DB name on Postgres master/slave to connect on (Default: PG_USER, must
                        match with .pgpass one is used)
  -C 1/0                Enable or disable check if the current XLOG file of the master host is the same
                        of the last replay XLOG file (Default: $CHECK_CUR_MASTER_XLOG)
  -w replay_warn_delay  Specify the replay warning delay in second (Default: $REPLAY_WARNING_DELAY)
  -c replay_crit_delay  Specify the replay critical delay in second (Default: $REPLAY_CRITICAL_DELAY)
  -d                    Debug mode
  -h                    Show this message
EOF
  exit "${1:-0}"
}

# Parse the command-line options into the global settings variables.
#
# Arguments: the script arguments ("$@")
# Exits:     0 after printing the help (-h); 3 (Nagios UNKNOWN) on an unknown
#            option or a missing option argument, so that a broken check
#            command is never reported as OK.
function parse_options () {
  local OPTION OPTARG OPTIND=1
  # The leading ':' enables getopts silent mode: errors are reported through
  # the ':' and '?' cases below instead of bash's own messages.
  while getopts ":hu:b:B:V:m:r:U:p:D:C:w:c:d" OPTION
  do
    case "$OPTION" in
      u)
        PG_USER=$OPTARG
        ;;
      b)
        PSQL_BIN=$OPTARG
        ;;
      B)
        PG_LSCLUSTER_BIN=$OPTARG
        ;;
      V)
        PG_VERSION=$OPTARG
        ;;
      m)
        PG_MAIN=$OPTARG
        ;;
      r)
        RECOVERY_CONF=$OPTARG
        ;;
      U)
        PG_MASTER_USER=$OPTARG
        ;;
      p)
        PG_DEFAULT_PORT=$OPTARG
        ;;
      D)
        PG_DB=$OPTARG
        ;;
      C)
        CHECK_CUR_MASTER_XLOG=$OPTARG
        ;;
      w)
        REPLAY_WARNING_DELAY=$OPTARG
        ;;
      c)
        REPLAY_CRITICAL_DELAY=$OPTARG
        ;;
      d)
        DEBUG=1
        ;;
      h)
        usage
        ;;
      :)
        echo "UNKNOWN : Option -$OPTARG requires an argument"
        # usage terminates its shell: run it in a subshell to keep control
        # of the final exit status
        ( usage )
        exit 3
        ;;
      \?)
        echo "UNKNOWN : Unknown option -$OPTARG"
        ( usage )
        exit 3
        ;;
    esac
  done
}

parse_options "$@"

# Print a debug message on stderr when debug mode is enabled (DEBUG=1).
#
# Arguments:
#   $1 - message; backslash escapes such as \n and \t are interpreted
# Does nothing (and succeeds) when DEBUG is 0, empty or unset.
function debug() {
  if [ "${DEBUG:-0}" = "1" ]
  then
    printf '[DEBUG] %b\n' "$1" >&2
  fi
}

# Trace the settings as they stand after option parsing, i.e. before
# auto-detection and default values are applied.
debug "Starting options (before handling auto-detection/default values) :
PG_VERSION = $PG_VERSION
PG_DB = $PG_DB
PG_USER = $PG_USER
PSQL_BIN = $PSQL_BIN
PG_LSCLUSTER_BIN = $PG_LSCLUSTER_BIN
PG_MAIN = $PG_MAIN
RECOVERY_CONF = $RECOVERY_CONF
PG_DEFAULT_PORT = $PG_DEFAULT_PORT
PG_DEFAULT_APP_NAME = $PG_DEFAULT_APP_NAME
CHECK_CUR_MASTER_XLOG = $CHECK_CUR_MASTER_XLOG
REPLAY_WARNING_DELAY = $REPLAY_WARNING_DELAY
REPLAY_CRITICAL_DELAY = $REPLAY_CRITICAL_DELAY
"

# Auto-detect PostgreSQL information using pg_lsclusters
#
# Only the first cluster listed by pg_lsclusters is considered, and only the
# settings that are still empty (PG_VERSION, PG_DEFAULT_PORT, PG_USER,
# PG_MAIN) are filled in: values given on the command line always win.
# The raw pg_lsclusters line is kept in PG_CLUSTER.
function detect_cluster_settings () {
  local version port owner datadir

  if [ ! -x "$PG_LSCLUSTER_BIN" ]
  then
    debug "pg_lsclusters not found ($PG_LSCLUSTER_BIN): parameters auto-detection disabled"
    return 0
  fi

  PG_CLUSTER=$( "$PG_LSCLUSTER_BIN" -h 2>/dev/null | head -n1 )
  [ -n "$PG_CLUSTER" ] || return 0
  debug "pg_lsclusters output:\n\t$PG_CLUSTER"

  # Output example:
  # 9.6 main 5432 online,recovery postgres /var/lib/postgresql/9.6/main /var/log/postgresql/postgresql-9.6-main.log
  # Fields: version, cluster name, port, status, owner, data directory, log file
  read -r version _ port _ owner datadir _ <<< "$PG_CLUSTER"
  [ -z "$PG_VERSION" ] && PG_VERSION=$version
  [ -z "$PG_DEFAULT_PORT" ] && PG_DEFAULT_PORT=$port
  [ -z "$PG_USER" ] && PG_USER=$owner
  [ -z "$PG_MAIN" ] && PG_MAIN=$datadir
  return 0
}

detect_cluster_settings

# If auto-detection failed, use default values
[ -z "$PG_USER" ] && PG_USER="$DEFAULT_PG_USER"
[ -z "$PG_VERSION" ] && PG_VERSION="$DEFAULT_PG_VERSION"
[ -z "$PG_MAIN" ] && PG_MAIN="$DEFAULT_PG_MAIN"
[ -z "$PG_DEFAULT_PORT" ] && PG_DEFAULT_PORT="$DEFAULT_PG_PORT"

# Every invalid setting below is reported as Nagios UNKNOWN (exit status 3)

# Check PG_USER: must be set and be an existing system user
if [ -z "$PG_USER" ]
then
  echo "UNKNOWN : Postgres user not specified"
  exit 3
fi
if ! id "$PG_USER" > /dev/null 2>&1
then
  echo "UNKNOWN : Invalid Postgres user ($PG_USER)"
  exit 3
fi

# Check PSQL_BIN
if [ ! -x "$PSQL_BIN" ]
then
  echo "UNKNOWN : Invalid psql bin path ($PSQL_BIN)"
  exit 3
fi

# Check PG_MAIN
if [ ! -d "$PG_MAIN/" ]
then
  echo "UNKNOWN : Invalid Postgres main directory path ($PG_MAIN)"
  exit 3
fi

# Check RECOVERY_CONF: default to the file inside the main directory
[ -z "$RECOVERY_CONF" ] && RECOVERY_CONF="$PG_MAIN/$RECOVERY_CONF_FILENAME"

# Check PG_DEFAULT_PORT: only digits, and at least one of them
case "$PG_DEFAULT_PORT" in
  ''|*[!0-9]*)
    echo "UNKNOWN : Postgres default master TCP port must be an integer."
    exit 3
    ;;
esac

# If PG_DB is not provided with -D parameter, use PG_USER as default value
[ -z "$PG_DB" ] && PG_DB="$PG_USER"

# Run a SQL statement (or psql meta-command) on the local PostgreSQL server.
#
# psql is run through sudo as PG_USER, on database PG_DB, and never prompts
# for a password (-w).
#
# Globals:   PG_USER, PSQL_BIN, PG_DB (read)
# Arguments: $1 - statement sent to psql on its standard input
# Outputs:   psql result on stdout, tuples only (-t), in unaligned format
# Returns:   the exit status of the sudo/psql command
function psql_get () {
  local sql="$1"
  debug "Exec 'echo \"$sql\"|sudo -u $PG_USER $PSQL_BIN -d \"$PG_DB\" -w -t -P format=unaligned'"
  printf '%s\n' "$sql" | sudo -u "$PG_USER" "$PSQL_BIN" -d "$PG_DB" -w -t -P format=unaligned
}

# Run a SQL statement on the master PostgreSQL server.
#
# psql is run through sudo as the local PG_USER and connects over TCP to
# M_HOST:M_PORT as M_USER, on database PG_DB, never prompting for a
# password (-w).
#
# Globals:   PG_USER, PSQL_BIN, PG_DB, M_USER, M_HOST, M_PORT (read)
# Arguments: $1 - statement sent to psql on its standard input
# Outputs:   psql result on stdout, tuples only (-t), in unaligned format
# Returns:   the exit status of the sudo/psql command
function psql_master_get () {
  local sql="$1"
  debug "Exec 'echo \"$sql\"|sudo -u $PG_USER $PSQL_BIN -U $M_USER -h $M_HOST -w -p $M_PORT -d $PG_DB -t -P format=unaligned'"
  printf '%s\n' "$sql" | sudo -u "$PG_USER" "$PSQL_BIN" -U "$M_USER" -h "$M_HOST" -w -p "$M_PORT" -d "$PG_DB" -t -P format=unaligned
}

# Trace the effective settings, once auto-detection and default values have
# been applied.
debug "Running options :
PG_VERSION = $PG_VERSION
PG_DB = $PG_DB
PG_USER = $PG_USER
PSQL_BIN = $PSQL_BIN
PG_LSCLUSTER_BIN = $PG_LSCLUSTER_BIN
PG_MAIN = $PG_MAIN
RECOVERY_CONF = $RECOVERY_CONF
PG_DEFAULT_PORT = $PG_DEFAULT_PORT
PG_DEFAULT_APP_NAME = $PG_DEFAULT_APP_NAME
CHECK_CUR_MASTER_XLOG = $CHECK_CUR_MASTER_XLOG
REPLAY_WARNING_DELAY = $REPLAY_WARNING_DELAY
REPLAY_CRITICAL_DELAY = $REPLAY_CRITICAL_DELAY
"

# Set some stuff to PostgreSQL version
#
# Select the SQL function and column names to use for WAL positions: the
# "xlog"/"location" names are used for versions before 10, the "wal"/"lsn"
# names otherwise.
#
# Arguments:
#   $1 - PostgreSQL version (for instance 9.6 or 12); only the major part
#        (before the first dot) is compared. A value that is not a number
#        selects the "wal"/"lsn" names.
# Globals written: pg_last_wal_receive_lsn, pg_last_wal_replay_lsn,
#                  pg_current_wal_lsn, pg_wal_lsn_diff, sent_lsn, write_lsn
function set_wal_names () {
  local major="${1%%.*}"
  case "$major" in
    ''|*[!0-9]*)
      major=10
      ;;
  esac

  if [ "$major" -lt 10 ]
  then
    pg_last_wal_receive_lsn='pg_last_xlog_receive_location()'
    pg_last_wal_replay_lsn='pg_last_xlog_replay_location()'
    pg_current_wal_lsn='pg_current_xlog_location()'
    pg_wal_lsn_diff='pg_xlog_location_diff'
    sent_lsn='sent_location'
    write_lsn='write_location'
  else
    pg_last_wal_receive_lsn='pg_last_wal_receive_lsn()'
    pg_last_wal_replay_lsn='pg_last_wal_replay_lsn()'
    pg_current_wal_lsn='pg_current_wal_lsn()'
    pg_wal_lsn_diff='pg_wal_lsn_diff'
    sent_lsn='sent_lsn'
    write_lsn='write_lsn'
  fi
}

set_wal_names "$PG_VERSION"

# Postgres is running ?
# The check opens a local psql session and quits immediately; psql error
# messages are only displayed in debug mode.
if [ "${DEBUG:-0}" = "1" ]
then
  psql_get '\q'
  pg_running_status=$?
else
  psql_get '\q' 2> /dev/null
  pg_running_status=$?
fi
if [ "$pg_running_status" -ne 0 ]
then
  echo "CRITICAL : Postgres is not running !"
  exit 2
fi
debug "Postgres is running"

# Ask the local server whether it is in recovery:
# RECOVERY_MODE is 1 when pg_is_in_recovery() returns true ("t"), 0 otherwise
RECOVERY_MODE=0
[ "$( psql_get 'SELECT pg_is_in_recovery();' )" == "t" ] && RECOVERY_MODE=1

# Main check. The node role is deduced from the presence of the recovery
# configuration file: present = hot-standby node, absent = master node.
# Exit statuses follow Nagios conventions: 0 OK, 1 WARNING, 2 CRITICAL,
# 3 UNKNOWN.
# NOTE(review): PostgreSQL 12 and later no longer use recovery.conf
# (standby.signal plus postgresql.conf settings), so this detection presumably
# only fits older releases -- confirm before using it with recent servers.
if [ -f "$RECOVERY_CONF" ]
then
  debug "File recovery.conf found. Hot-standby mode."

  # Check recovery mode
  if [ "$RECOVERY_MODE" -ne 1 ]
  then
    echo "CRITICAL : Not in recovery mode while recovery.conf file found !"
    exit 2
  fi
  debug "Postgres is in recovery mode"

  LAST_XLOG_RECEIVE=$( psql_get "SELECT $pg_last_wal_receive_lsn" )
  debug "Last xlog file receive : $LAST_XLOG_RECEIVE"
  LAST_XLOG_REPLAY=$( psql_get "SELECT $pg_last_wal_replay_lsn" )
  debug "Last xlog file replay : $LAST_XLOG_REPLAY"

  # Get master connection informations from recovery.conf file
  MASTER_CONN_INFOS=$( grep -E '^ *primary_conninfo' "$RECOVERY_CONF" | sed 's/^ *primary_conninfo *= *\(.\+\) *$/\1/' )
  if [ -z "$MASTER_CONN_INFOS" ]
  then
    echo "UNKNOWN : Can't retreive master connection informations form recovery.conf file"
    exit 3
  fi
  debug "Master connection informations : $MASTER_CONN_INFOS"

  M_HOST=$( echo "$MASTER_CONN_INFOS" | grep 'host=' | sed 's/^.*host= *\([0-9a-zA-Z.-]\+\) *.*$/\1/' )
  if [ -z "$M_HOST" ]
  then
    echo "UNKNOWN : Can't retreive master host from recovery.conf file"
    exit 3
  fi
  debug "Master host : $M_HOST"

  M_PORT=$( echo "$MASTER_CONN_INFOS" | grep 'port=' | sed 's/^.*port= *\([0-9a-zA-Z.-]\+\) *.*$/\1/' )
  if [ -z "$M_PORT" ]
  then
    debug "Master port not specified, use default : $PG_DEFAULT_PORT"
    M_PORT=$PG_DEFAULT_PORT
  else
    debug "Master port : $M_PORT"
  fi

  if [ -n "$PG_MASTER_USER" ]
  then
    debug "Master user provided by command-line, use it : $PG_MASTER_USER"
    M_USER="$PG_MASTER_USER"
  else
    # "_" is part of the accepted characters: role names such as "repl_user"
    # would otherwise be truncated at the underscore
    M_USER=$( echo "$MASTER_CONN_INFOS" | grep 'user=' | sed 's/^.*user= *\([0-9a-zA-Z._-]\+\) *.*$/\1/' )
    if [ -z "$M_USER" ]
    then
      debug "Master user not specified, use default : $PG_USER"
      M_USER=$PG_USER
    else
      debug "Master user : $M_USER"
    fi
  fi

  M_APP_NAME=$( echo "$MASTER_CONN_INFOS" | grep 'application_name=' | sed "s/^.*application_name=[ \'\"]*\([^ \'\"]\+\)[ \'\"]*.*$/\1/" )
  if [ -z "$M_APP_NAME" ]
  then
    debug "Master application name not specified, use default : $PG_DEFAULT_APP_NAME"
    M_APP_NAME=$PG_DEFAULT_APP_NAME
  else
    debug "Master application name : $M_APP_NAME"
  fi

  # Get current state information from master
  M_CUR_REPL_STATE_INFO="$( psql_master_get "SELECT state, sync_state FROM pg_stat_replication WHERE application_name='$M_APP_NAME';" )"
  if [ -z "$M_CUR_REPL_STATE_INFO" ]
  then
    echo "UNKNOWN : Can't retreive current replication state information from master server"
    exit 3
  fi
  debug "Master current replication state:\n\tstate|sync_state\n\t$M_CUR_REPL_STATE_INFO"

  M_CUR_STATE=$( echo "$M_CUR_REPL_STATE_INFO" | cut -d'|' -f1 )
  debug "Master current state : $M_CUR_STATE"
  if [ "$M_CUR_STATE" != "streaming" ]
  then
    echo "CRITICAL : this host is not in streaming state according to master host (current state = '$M_CUR_STATE')"
    exit 2
  fi

  M_CUR_SYNC_STATE=$( echo "$M_CUR_REPL_STATE_INFO" | cut -d'|' -f2 )
  debug "Master current sync state : $M_CUR_SYNC_STATE"
  if [ "$M_CUR_SYNC_STATE" != "sync" ]
  then
    echo "CRITICAL : this host is not synchronized according to master host (current sync state = '$M_CUR_SYNC_STATE')"
    exit 2
  fi

  # Check current master XLOG file vs last receive XLOG file
  if [ "$CHECK_CUR_MASTER_XLOG" == "1" ]
  then
    # Get current xlog file from master, using the function name matching
    # the PostgreSQL version (the xlog-named function only exists before 10)
    M_CUR_XLOG="$( psql_master_get "SELECT $pg_current_wal_lsn" )"
    if [ -z "$M_CUR_XLOG" ]
    then
      echo "UNKNOWN : Can't retreive current xlog from master server"
      exit 3
    fi
    debug "Master current xlog : $M_CUR_XLOG"

    # Master current xlog is the last receive xlog ?
    if [ "$M_CUR_XLOG" != "$LAST_XLOG_RECEIVE" ]
    then
      echo "CRITICAL : Master current xlog is not the last receive xlog"
      exit 2
    fi
    debug "Master current xlog is the last receive xlog"
  fi

  # The last receive xlog is the last replay file ?
  if [ "$LAST_XLOG_RECEIVE" != "$LAST_XLOG_REPLAY" ]
  then
    debug "/!\ The last receive xlog is NOT the last replay file ('$LAST_XLOG_RECEIVE' / '$LAST_XLOG_REPLAY')"
    REPLAY_DELAY="$( psql_get 'SELECT EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp());' )"
    # Without a value the threshold comparisons below cannot be evaluated:
    # report it instead of silently considering the standby up to date
    if [ -z "$REPLAY_DELAY" ]
    then
      echo "UNKNOWN : last receive xlog file is not the last replay file ('$LAST_XLOG_RECEIVE' / '$LAST_XLOG_REPLAY') and replay delay can't be retrieved"
      exit 3
    fi
    debug "Replay delay is $REPLAY_DELAY second(s)"
    # bc prints 1 when the comparison is true, 0 otherwise
    if [ "$( echo "$REPLAY_DELAY >= $REPLAY_CRITICAL_DELAY" | bc -l )" = "1" ]
    then
      echo "CRITICAL : last receive xlog file is not the last replay file ('$LAST_XLOG_RECEIVE' / '$LAST_XLOG_REPLAY') and replay delay is $REPLAY_DELAY second(s)"
      exit 2
    fi
    if [ "$( echo "$REPLAY_DELAY >= $REPLAY_WARNING_DELAY" | bc -l )" = "1" ]
    then
      echo "WARNING : last receive xlog file is not the last replay file ('$LAST_XLOG_RECEIVE' / '$LAST_XLOG_REPLAY') and replay delay is $REPLAY_DELAY second(s)"
      exit 1
    fi
    debug "Replay delay is not worrying"
  else
    debug "Last receive xlog file is the last replay file"
  fi

  echo "OK : Hot-standby server is uptodate"
  exit 0
else
  debug "File recovery.conf not found. Master mode."

  # Check recovery mode
  if [ "$RECOVERY_MODE" -eq 1 ]
  then
    echo "CRITICAL : In recovery mode while recovery.conf file not found !"
    exit 2
  fi
  debug "Postgres is not in recovery mode"

  # Check standby client: one output line per connected standby, with the
  # columns separated by "|"
  STANDBY_CLIENTS=$( psql_get "SELECT application_name, client_addr, sent_location, write_location, state, sync_state, current_lag
FROM (
SELECT application_name, client_addr, sent_location, write_location, state, sync_state, current_lag
FROM (
SELECT application_name, client_addr, $sent_lsn AS sent_location, $write_lsn AS write_location, state, sync_state,
$pg_wal_lsn_diff($pg_current_wal_lsn, $write_lsn) AS current_lag
FROM pg_stat_replication
) AS s2
) AS s1" )
  if [ -z "$STANDBY_CLIENTS" ]
  then
    echo "WARNING : no stand-by client connected"
    exit 1
  fi
  # Indent every line after the first one to align the debug output
  debug "Stand-by client(s):\n\t$( printf '%s\n' "$STANDBY_CLIENTS" | sed '1!s/^/\t/' )"

  STANDBY_CLIENTS_TXT=""
  STANDBY_CLIENTS_COUNT=0
  # Read line by line (not word by word) so that values containing spaces,
  # e.g. an application name, do not break the parsing
  while IFS= read -r line
  do
    [ -n "$line" ] || continue
    STANDBY_CLIENTS_COUNT=$(( STANDBY_CLIENTS_COUNT + 1 ))

    IFS='|' read -r NAME IP SENT_LOCATION WRITE_LOCATION STATE SYNC_STATE LAG <<< "$line"
    STANDBY_CLIENTS_TXT="$STANDBY_CLIENTS_TXT\n$NAME ($IP): $STATE/$SYNC_STATE (Location: sent='$SENT_LOCATION' / write='$WRITE_LOCATION', Lag: ${LAG}b)"
  done <<< "$STANDBY_CLIENTS"

  echo -e "OK : $STANDBY_CLIENTS_COUNT stand-by client(s) connected\n$STANDBY_CLIENTS_TXT"
  exit 0
fi