check_pg_streaming_replication/check_pg_streaming_replication
2024-07-15 15:25:23 +02:00

494 lines
19 KiB
Bash
Executable file

#!/bin/bash
# vim: tabstop=4 shiftwidth=4 softtabstop=4 expandtab
#
# Nagios plugin to check PostgreSQL streaming replication state
#
# Can be used on a master node or on a standby node
#
# Requirements:
#
# Some CLI tools: sudo, awk, sed, bc, psql and pg_lsclusters
#
# On master node: Slave nodes must be able to connect with the user from
# recovery.conf / `postgresql.auto.conf` (or the user specified with -U) to the
# database with the same name (or another one specified with -D), either as
# trust or using a password specified in ~/.pgpass. This user must have the
# SUPERUSER privilege (needed to get replication details).
#
# On standby node: PG_USER must be able to connect locally to the database with
# the same name (or another one specified with -D), either as trust or using a
# password specified in ~/.pgpass.
#
# Author: Benjamin Renard <brenard@easter-eggs.com>
# Date: Mon, 03 Jun 2024 15:31:29 +0200
# Source: https://gitea.zionetrix.net/bn8/check_pg_streaming_replication
# SPDX-License-Identifier: GPL-3.0-or-later
#
# Fallback values, used when a setting is neither given on the command line
# nor auto-detected with pg_lsclusters.
DEFAULT_PG_USER=postgres
DEFAULT_PG_VERSION=9.1
# PG_VERSION is not set yet at this point: the default data directory has to
# be derived from DEFAULT_PG_VERSION, otherwise it would expand to
# /var/lib/postgresql//main.
DEFAULT_PG_MAIN=/var/lib/postgresql/$DEFAULT_PG_VERSION/main
DEFAULT_PG_PORT=5432

# Settings filled in by command-line options and/or auto-detection
# (an empty value means "not specified").
PG_USER=""
PG_VERSION=""
PG_MAIN=""
PG_MASTER_USER=""
PSQL_BIN=/usr/bin/psql
PG_LSCLUSTER_BIN=/usr/bin/pg_lsclusters
RECOVERY_CONF=""
PG_DEFAULT_PORT=""
# Application name looked up on the master when none can be found in
# primary_conninfo (nor from cluster_name).
PG_DEFAULT_APP_NAME=$( hostname )
PG_DB=""

# Check tuning: 1/0 flag, replay delay thresholds (seconds), expected
# replication state and expected mode.
CHECK_CUR_MASTER_LSN=1
REPLAY_WARNING_DELAY=3
REPLAY_CRITICAL_DELAY=5
EXPECTED_SYNC_STATE=sync
EXPECTED_MODE=auto
DEBUG=0
# Print the command-line help and terminate the script.
#
# Arguments: $1 - optional error message, printed (followed by an empty line)
#                 before the help text
# Outputs:   error message and help text on stdout
# Exits:     1 when an error message was given, 0 otherwise
function usage () {
    local failure="$1"
    local status=0

    if [[ -n "$failure" ]]; then
        echo -e "$failure\n"
        status=1
    fi

    cat << USAGE_TEXT
Usage: $0 [-d] [-h] [options]
-u pg_user Specify local Postgres user (Default: try to auto-detect or use $DEFAULT_PG_USER)
-b psql_bin Specify psql binary path (Default: $PSQL_BIN)
-B pg_lsclusters_bin Specify pg_lsclusters binary path (Default: $PG_LSCLUSTER_BIN)
-V pg_version Specify Postgres version (Default: try to auto-detect or use $DEFAULT_PG_VERSION)
-m pg_main Specify Postgres main directory path (Default: try to auto-detect or use
$DEFAULT_PG_MAIN)
-r recovery_conf Specify Postgres recovery configuration file path
(Default: [PG_MAIN]/recovery.conf on PG <= 11, [PG_MAIN]/postgresql.auto.conf on PG >= 12)
-U pg_master_user Specify Postgres user to use on master (Default: user from recovery.conf file)
-p pg_port Specify default Postgres master TCP port (Default: same as local PostgreSQL
port if detected or use $DEFAULT_PG_PORT)
-D dbname Specify DB name on Postgres master/slave to connect on (Default: PG_USER, must
match with .pgpass one is used)
-C 1/0 Enable or disable check if the current LSN of the master host is the same
of the last received LSN (Default: $CHECK_CUR_MASTER_LSN)
-w replay_warn_delay Specify the replay warning delay in second (Default: $REPLAY_WARNING_DELAY)
-c replay_crit_delay Specify the replay critical delay in second (Default: $REPLAY_CRITICAL_DELAY)
-e expected_sync_state The expected replication state ('sync' or 'async', default: $EXPECTED_SYNC_STATE)
-E expected_mode The expected mode ('master', 'hot-standby' or 'auto', default: '$EXPECTED_MODE')
-d Debug mode
-h Show this message
USAGE_TEXT

    exit $status
}
# Parse the command-line options and store them in the global settings.
#
# Arguments: the command-line arguments of the script ("$@")
# Globals:   writes PG_USER, PSQL_BIN, PG_LSCLUSTER_BIN, PG_VERSION, PG_MAIN,
#            RECOVERY_CONF, PG_MASTER_USER, PG_DEFAULT_PORT, PG_DB,
#            CHECK_CUR_MASTER_LSN, REPLAY_WARNING_DELAY, REPLAY_CRITICAL_DELAY,
#            EXPECTED_SYNC_STATE, EXPECTED_MODE and DEBUG
# Exits:     through usage(): status 0 on -h, status 1 on an unknown option,
#            a missing option argument or an invalid option value
function parse_options () {
    # OPTIND is made local so that the function always starts parsing at the
    # first argument it receives.
    local OPTION OPTIND=1
    # Delays are compared with bc later on: accept integers and decimals only.
    local -r number_re='^([0-9]+([.][0-9]*)?|[.][0-9]+)$'

    while getopts "hu:b:B:V:m:r:U:p:D:C:w:c:e:E:d" OPTION; do
        case "$OPTION" in
            u)
                PG_USER=$OPTARG
                ;;
            b)
                PSQL_BIN=$OPTARG
                ;;
            B)
                PG_LSCLUSTER_BIN=$OPTARG
                ;;
            V)
                PG_VERSION=$OPTARG
                ;;
            m)
                PG_MAIN=$OPTARG
                ;;
            r)
                RECOVERY_CONF=$OPTARG
                ;;
            U)
                PG_MASTER_USER=$OPTARG
                ;;
            p)
                PG_DEFAULT_PORT=$OPTARG
                ;;
            D)
                PG_DB=$OPTARG
                ;;
            C)
                CHECK_CUR_MASTER_LSN=$OPTARG
                ;;
            w)
                [[ "$OPTARG" =~ $number_re ]] || \
                    usage "Invalid replay warning delay '$OPTARG'. A number of seconds is expected."
                REPLAY_WARNING_DELAY=$OPTARG
                ;;
            c)
                [[ "$OPTARG" =~ $number_re ]] || \
                    usage "Invalid replay critical delay '$OPTARG'. A number of seconds is expected."
                REPLAY_CRITICAL_DELAY=$OPTARG
                ;;
            e)
                case "$OPTARG" in
                    sync|async)
                        EXPECTED_SYNC_STATE=$OPTARG
                        ;;
                    *)
                        usage "Invalid expected replication state '$OPTARG'. Possible values: sync or async."
                        ;;
                esac
                ;;
            E)
                case "$OPTARG" in
                    master|hot-standby|auto)
                        EXPECTED_MODE=$OPTARG
                        ;;
                    *)
                        usage "Invalid expected mode '$OPTARG'. Possible values: master, hot-standby or auto."
                        ;;
                esac
                ;;
            d)
                DEBUG=1
                ;;
            h)
                usage
                ;;
            \?)
                # getopts already reported the offending option on stderr.
                # An error message is passed to usage() so that the script
                # exits with a non-zero status instead of 0 (OK for Nagios).
                usage "Unknown option or missing option argument."
                ;;
        esac
    done
}
parse_options "$@"
# Print a debug message on stderr when debug mode is enabled (DEBUG=1).
#
# Arguments: $@ - message parts, joined with a space; backslash escapes such
#                 as \n and \t are interpreted (echo -e)
# Outputs:   "[DEBUG] <message>" on stderr, nothing when DEBUG is not 1
function debug() {
    if [[ $DEBUG -eq 1 ]]; then
        # "$*" rather than "$1": a message split over several arguments must
        # not be silently truncated to its first part.
        >&2 echo -e "[DEBUG] $*"
    fi
}
# Trace the settings as they are right after command-line parsing, i.e. before
# auto-detection and default values are applied: one "NAME = value" line per
# setting, followed by an empty line.
startup_report="Starting options (before handling auto-detection/default values):"
for startup_setting in \
        PG_VERSION PG_DB PG_USER PSQL_BIN PG_LSCLUSTER_BIN PG_MAIN RECOVERY_CONF \
        PG_DEFAULT_PORT PG_DEFAULT_APP_NAME CHECK_CUR_MASTER_LSN \
        REPLAY_WARNING_DELAY REPLAY_CRITICAL_DELAY EXPECTED_SYNC_STATE EXPECTED_MODE; do
    # ${!name} expands to the value of the variable whose name is in $name
    startup_report+=$'\n'"$startup_setting = ${!startup_setting}"
done
debug "$startup_report"$'\n'
unset startup_report startup_setting
# Auto-detect PostgreSQL information using pg_lsclusters.
# Only the first cluster listed is considered, and a detected value is used
# only for settings that were not given on the command line.
if [[ -x "$PG_LSCLUSTER_BIN" ]]; then
    PG_CLUSTER=$( "$PG_LSCLUSTER_BIN" -h 2>/dev/null | head -n1 )
    if [[ -n "$PG_CLUSTER" ]]; then
        debug "pg_lsclusters output:\n\t$PG_CLUSTER"
        # Output example:
        # 9.6 main 5432 online,recovery postgres /var/lib/postgresql/9.6/main /var/log/postgresql/postgresql-9.6-main.log
        # Fields: version, cluster name, port, status, owner, data directory, log file
        read -r detected_version _ detected_port _ detected_owner detected_main _ <<< "$PG_CLUSTER"
        [[ -z "$PG_VERSION" ]] && PG_VERSION=$detected_version
        [[ -z "$PG_DEFAULT_PORT" ]] && PG_DEFAULT_PORT=$detected_port
        [[ -z "$PG_USER" ]] && PG_USER=$detected_owner
        [[ -z "$PG_MAIN" ]] && PG_MAIN=$detected_main
        unset detected_version detected_port detected_owner detected_main
    fi
else
    debug "pg_lsclusters not found ($PG_LSCLUSTER_BIN): parameters auto-detection disabled"
fi

# If auto-detection failed, use default values
[[ -z "$PG_USER" ]] && PG_USER="$DEFAULT_PG_USER"
[[ -z "$PG_VERSION" ]] && PG_VERSION="$DEFAULT_PG_VERSION"
# The default main directory depends on the version actually in use (which may
# come from -V), so it is built here rather than taken from a value computed
# before PG_VERSION was known.
[[ -z "$PG_MAIN" ]] && PG_MAIN="/var/lib/postgresql/$PG_VERSION/main"
[[ -z "$PG_DEFAULT_PORT" ]] && PG_DEFAULT_PORT="$DEFAULT_PG_PORT"
# Validate the effective settings. Any invalid setting makes the plugin exit
# with status 3 (UNKNOWN).

# Check PG_USER
if [[ -z "$PG_USER" ]]; then
    echo "UNKNOWN: Postgres user not specified"
    exit 3
fi
id "$PG_USER" > /dev/null 2>&1 || { echo "UNKNOWN: Invalid Postgres user ($PG_USER)"; exit 3; }

# Check PSQL_BIN
if [[ ! -x "$PSQL_BIN" ]]; then
    echo "UNKNOWN: Invalid psql bin path ($PSQL_BIN)"
    exit 3
fi

# Check PG_MAIN
if [[ ! -d "$PG_MAIN/" ]]; then
    echo "UNKNOWN: Invalid Postgres main directory path ($PG_MAIN)"
    exit 3
fi

# Check PG_VERSION: it is used in numeric comparisons, so it must look like
# "12" or "9.6"
pg_version_re='^[0-9]+([.][0-9]+)?$'
if ! [[ "$PG_VERSION" =~ $pg_version_re ]]; then
    echo "UNKNOWN: Invalid Postgres version ($PG_VERSION)"
    exit 3
fi

# Check RECOVERY_CONF
if [[ -z "$RECOVERY_CONF" ]]; then
    # Bash arithmetic only handles integers: compare the major version only
    # ("9.6 -le 11" is an arithmetic error and would select the wrong file).
    # The 10# prefix forces base 10 in case of a leading zero.
    if (( 10#${PG_VERSION%%.*} <= 11 )); then
        RECOVERY_CONF_FILENAME="recovery.conf"
    else
        RECOVERY_CONF_FILENAME="postgresql.auto.conf"
    fi
    RECOVERY_CONF="$PG_MAIN/$RECOVERY_CONF_FILENAME"
else
    RECOVERY_CONF_FILENAME=$( basename "$RECOVERY_CONF" )
fi

# Check PG_DEFAULT_PORT (at least one digit, digits only)
pg_port_re='^[0-9]+$'
if ! [[ "$PG_DEFAULT_PORT" =~ $pg_port_re ]]; then
    echo "UNKNOWN: Postgres default master TCP port must be an integer."
    exit 3
fi

# If PG_DB is not provided with -D parameter, use PG_USER as default value
[[ -z "$PG_DB" ]] && PG_DB="$PG_USER"
# Run a SQL statement (or psql meta-command) on the local PostgreSQL server.
#
# Globals:   PG_USER (system user psql is run as, through sudo), PSQL_BIN, PG_DB
# Arguments: $1 - SQL statement, sent to psql on its standard input
# Outputs:   psql output on stdout: tuples only, unaligned format
# Returns:   the exit status of the sudo/psql command
function psql_get () {
    # local: do not clobber a global "sql" variable
    local sql="$1"
    debug "Exec 'echo \"$sql\"|sudo -u $PG_USER $PSQL_BIN -d \"$PG_DB\" -w -t -P format=unaligned'"
    # -w: never prompt for a password (a prompt would hang the check)
    sudo -u "$PG_USER" "$PSQL_BIN" -d "$PG_DB" -w -t -P format=unaligned <<< "$sql"
}
# Run a SQL statement on the master PostgreSQL server.
#
# Globals:   PG_USER (system user psql is run as, through sudo), PSQL_BIN,
#            M_USER / M_HOST / M_PORT (master connection parameters), PG_DB
# Arguments: $1 - SQL statement, sent to psql on its standard input
# Outputs:   psql output on stdout: tuples only, unaligned format
# Returns:   the exit status of the sudo/psql command
function psql_master_get () {
    # local: do not clobber a global "sql" variable
    local sql="$1"
    debug "Exec 'echo \"$sql\"|sudo -u $PG_USER $PSQL_BIN -U $M_USER -h $M_HOST -w -p $M_PORT -d $PG_DB -t -P format=unaligned'"
    # -w: never prompt for a password (a prompt would hang the check)
    sudo -u "$PG_USER" "$PSQL_BIN" -U "$M_USER" -h "$M_HOST" -w -p "$M_PORT" -d "$PG_DB" -t -P format=unaligned <<< "$sql"
}
# Trace the effective settings, once auto-detection and default values have
# been applied. The same settings as in the start-up trace are listed
# (including the expected replication state and mode) so that both traces can
# be compared line by line.
debug "Running options:
PG_VERSION = $PG_VERSION
PG_DB = $PG_DB
PG_USER = $PG_USER
PSQL_BIN = $PSQL_BIN
PG_LSCLUSTER_BIN = $PG_LSCLUSTER_BIN
PG_MAIN = $PG_MAIN
RECOVERY_CONF = $RECOVERY_CONF
PG_DEFAULT_PORT = $PG_DEFAULT_PORT
PG_DEFAULT_APP_NAME = $PG_DEFAULT_APP_NAME
CHECK_CUR_MASTER_LSN = $CHECK_CUR_MASTER_LSN
REPLAY_WARNING_DELAY = $REPLAY_WARNING_DELAY
REPLAY_CRITICAL_DELAY = $REPLAY_CRITICAL_DELAY
EXPECTED_SYNC_STATE = $EXPECTED_SYNC_STATE
EXPECTED_MODE = $EXPECTED_MODE
"
# Set some stuff to PostgreSQL version:
# names of the WAL functions and of the pg_stat_replication columns used in
# the queries below. The "wal"/"lsn" names are the baseline...
pg_last_wal_receive_lsn='pg_last_wal_receive_lsn()'
pg_last_wal_replay_lsn='pg_last_wal_replay_lsn()'
pg_current_wal_lsn='pg_current_wal_lsn()'
pg_wal_lsn_diff='pg_wal_lsn_diff'
sent_lsn='sent_lsn'
write_lsn='write_lsn'
# ... and they are replaced by their "xlog"/"location" counterparts when the
# server version is lower than 10 (bc is used as the version may be a decimal
# number such as 9.6).
if [[ $( bc -l <<< "$PG_VERSION < 10" ) -eq 1 ]]; then
    pg_last_wal_receive_lsn='pg_last_xlog_receive_location()'
    pg_last_wal_replay_lsn='pg_last_xlog_replay_location()'
    pg_current_wal_lsn='pg_current_xlog_location()'
    pg_wal_lsn_diff='pg_xlog_location_diff'
    sent_lsn='sent_location'
    write_lsn='write_location'
fi
# Postgres is running ?
# Open a psql session on the local server and quit at once: a non-zero exit
# status is reported as CRITICAL. What the probe writes on stderr is only
# shown in debug mode.
if [[ $DEBUG -ne 0 ]]; then
    psql_get '\q'
    is_running=$?
else
    psql_get '\q' 2> /dev/null
    is_running=$?
fi

[[ $is_running -eq 0 ]] || {
    echo "CRITICAL: Postgres is not running !"
    exit 2
}
debug "Postgres is running"
# RECOVERY_MODE is 1 when the local server reports that it is in recovery
# (pg_is_in_recovery() returns "t"), 0 otherwise.
RECOVERY_MODE=0
[[ "$( psql_get 'SELECT pg_is_in_recovery();' )" == "t" ]] && RECOVERY_MODE=1

# In auto mode, work out which mode this host is expected to run in:
# hot-standby when the server is in recovery or when the recovery
# configuration file defines primary_conninfo, master otherwise.
if [[ "$EXPECTED_MODE" == "auto" ]]; then
    debug "Auto-detect mode"
    if [[ $RECOVERY_MODE -eq 1 ]]; then
        debug "Postgres is in recovery mode. Hot-standby mode."
        EXPECTED_MODE="hot-standby"
    elif [[ -f "$RECOVERY_CONF" ]] && grep -qE '^\s*primary_conninfo' "$RECOVERY_CONF"; then
        debug "File $RECOVERY_CONF_FILENAME found and contain primary_conninfo. Hot-standby mode."
        EXPECTED_MODE="hot-standby"
    else
        # The message is passed as one single argument so that its second
        # half cannot be dropped by debug()
        debug "Postgres not in recovery mode and file $RECOVERY_CONF_FILENAME not found (or does not contain primary_conninfo). Master mode."
        EXPECTED_MODE="master"
    fi
fi
# Print the value of one keyword of a "keyword=value ..." connection string.
#
# Arguments: $1 - connection string (e.g. the value of primary_conninfo)
#            $2 - keyword to look for (host, port, user, application_name...)
# Outputs:   the value on stdout, without surrounding quote; the value stops
#            at the first blank or quote; the last occurrence of the keyword
#            wins; nothing is printed when the keyword is not found
function conninfo_value () {
    local conninfo="$1" keyword="$2"
    # The keyword must start the string or follow a blank/quote, so that
    # "host" does not match inside another keyword.
    grep -oE "(^|[[:space:]'\"])${keyword}[[:space:]]*=[[:space:]]*['\"]?[^[:space:]'\"]+" <<< "$conninfo" \
        | tail -n1 \
        | sed "s/^[^=]*=[[:space:]]*['\"]\{0,1\}//"
}

# Main check. Depending on the expected mode, check this host as a hot-standby
# or as a master, print the Nagios status line (plus details) and exit with
# the matching status: 0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN.
if [[ "$EXPECTED_MODE" == "hot-standby" ]]; then
    # Check recovery mode
    if [[ $RECOVERY_MODE -ne 1 ]]; then
        echo "CRITICAL: Not in recovery mode while $RECOVERY_CONF_FILENAME file found !"
        exit 2
    fi
    debug "Postgres is in recovery mode"

    # Get local current last received/replayed LSN
    LAST_RECEIVED_LSN=$( psql_get "SELECT $pg_last_wal_receive_lsn" )
    debug "Last received LSN: $LAST_RECEIVED_LSN"
    LAST_REPLAYED_LSN=$( psql_get "SELECT $pg_last_wal_replay_lsn" )
    debug "Last replayed LSN: $LAST_REPLAYED_LSN"

    # Get master connection information from primary_conninfo configuration parameter
    MASTER_CONN_INFOS=$( psql_get "SHOW primary_conninfo" )
    if [[ -z "$MASTER_CONN_INFOS" ]]; then
        echo "UNKNOWN: Can't retrieve master connection information from primary_conninfo configuration parameter"
        exit 3
    fi
    debug "Master connection information: $MASTER_CONN_INFOS"

    M_HOST=$( conninfo_value "$MASTER_CONN_INFOS" host )
    if [[ -z "$M_HOST" ]]; then
        echo "UNKNOWN: Can't retrieve master host from primary_conninfo configuration parameter"
        exit 3
    fi
    debug "Master host: $M_HOST"

    M_PORT=$( conninfo_value "$MASTER_CONN_INFOS" port )
    if [[ -z "$M_PORT" ]]; then
        debug "Master port not specified, use default: $PG_DEFAULT_PORT"
        M_PORT=$PG_DEFAULT_PORT
    else
        debug "Master port: $M_PORT"
    fi

    if [[ -n "$PG_MASTER_USER" ]]; then
        debug "Master user provided by command-line, use it: $PG_MASTER_USER"
        M_USER="$PG_MASTER_USER"
    else
        M_USER=$( conninfo_value "$MASTER_CONN_INFOS" user )
        if [[ -z "$M_USER" ]]; then
            debug "Master user not specified, use default: $PG_USER"
            M_USER=$PG_USER
        else
            debug "Master user: $M_USER"
        fi
    fi

    # Application name of this host as seen by the master: from
    # primary_conninfo, else from cluster_name (PG >= 12), else the default.
    M_APP_NAME=$( conninfo_value "$MASTER_CONN_INFOS" application_name )
    if [[ -z "$M_APP_NAME" ]]; then
        # Integer comparison on the major version only: PG_VERSION may be a
        # decimal number such as 9.6, which bash arithmetic can't handle.
        pg_major_version="${PG_VERSION%%.*}"
        if [[ "$pg_major_version" =~ ^[0-9]+$ ]] && (( 10#$pg_major_version >= 12 )); then
            debug "Master application name not specified, use cluster_name if defined"
            CLUSTER_NAME=$( psql_get "SELECT current_setting('cluster_name')" )
            debug "Cluster name: $CLUSTER_NAME"
            if [[ -n "$CLUSTER_NAME" ]]; then
                M_APP_NAME=$CLUSTER_NAME
            else
                debug "Cluster name not defined, use default: $PG_DEFAULT_APP_NAME"
                M_APP_NAME=$PG_DEFAULT_APP_NAME
            fi
        else
            debug "Master application name not specified, use default: $PG_DEFAULT_APP_NAME"
            M_APP_NAME=$PG_DEFAULT_APP_NAME
        fi
    else
        debug "Master application name: $M_APP_NAME"
    fi

    # Get current replication state information from master
    M_CUR_REPL_STATE_INFO="$( psql_master_get "SELECT state, sync_state, $sent_lsn AS sent_lsn, $write_lsn AS write_lsn FROM pg_stat_replication WHERE application_name='$M_APP_NAME';" )"
    if [[ -z "$M_CUR_REPL_STATE_INFO" ]]; then
        echo "UNKNOWN: Can't retrieve current replication state information from master server"
        exit 3
    fi
    debug "Master current replication state:\n\tstate|sync_state|sent_lsn|write_lsn\n\t$M_CUR_REPL_STATE_INFO"

    M_CUR_STATE=$( cut -d'|' -f1 <<< "$M_CUR_REPL_STATE_INFO" )
    debug "Master current state: $M_CUR_STATE"
    if [[ "$M_CUR_STATE" != "streaming" ]]; then
        echo "CRITICAL: this host is not in streaming state according to master host (current state = '$M_CUR_STATE')"
        exit 2
    fi

    M_CUR_SYNC_STATE=$( cut -d'|' -f2 <<< "$M_CUR_REPL_STATE_INFO" )
    debug "Master current sync state: $M_CUR_SYNC_STATE"
    if [[ "$M_CUR_SYNC_STATE" != "$EXPECTED_SYNC_STATE" ]]; then
        echo "CRITICAL: unexpected replication state '$M_CUR_SYNC_STATE' (expected state = '$EXPECTED_SYNC_STATE')"
        exit 2
    fi

    M_CUR_SENT_LSN=$( cut -d'|' -f3 <<< "$M_CUR_REPL_STATE_INFO" )
    M_CUR_WRITED_LSN=$( cut -d'|' -f4 <<< "$M_CUR_REPL_STATE_INFO" )
    debug "Master current last sent/writed LSN: '$M_CUR_SENT_LSN' / '$M_CUR_WRITED_LSN'"

    # Check current master LSN vs last received LSN
    if [[ "$CHECK_CUR_MASTER_LSN" == "1" ]]; then
        # Get current LSN from master
        M_CUR_LSN="$( psql_master_get "SELECT $pg_current_wal_lsn" )"
        if [[ -z "$M_CUR_LSN" ]]; then
            echo "UNKNOWN: Can't retrieve current LSN from master server"
            exit 3
        fi
        debug "Master current LSN: $M_CUR_LSN"

        # Master current LSN is the last received LSN ?
        if [[ "$M_CUR_LSN" != "$LAST_RECEIVED_LSN" ]]; then
            echo "CRITICAL: Master current LSN is not the last received LSN"
            exit 2
        fi
        debug "Master current LSN is the last received LSN"
    fi

    # The last received LSN is the last replayed ?
    REPLAY_DELAY=""
    if [[ "$LAST_RECEIVED_LSN" != "$LAST_REPLAYED_LSN" ]]; then
        debug "/!\ The last received LSN is NOT the last replayed LSN ('$LAST_RECEIVED_LSN' / '$LAST_REPLAYED_LSN')"
        REPLAY_DELAY="$( psql_get 'SELECT EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp());' )"
        if [[ -z "$REPLAY_DELAY" ]]; then
            # No usable value (presumably no replayed transaction timestamp
            # is available yet): nothing to compare, and an empty operand
            # would only make bc fail.
            debug "Replay delay is unknown"
        else
            debug "Replay delay is $REPLAY_DELAY second(s)"
            if [[ $( bc -l <<< "$REPLAY_DELAY >= $REPLAY_CRITICAL_DELAY" ) -gt 0 ]]; then
                echo "CRITICAL: last received LSN is not the last replayed ('$LAST_RECEIVED_LSN' / '$LAST_REPLAYED_LSN') and replay delay is $REPLAY_DELAY second(s)"
                exit 2
            fi
            if [[ $( bc -l <<< "$REPLAY_DELAY >= $REPLAY_WARNING_DELAY" ) -gt 0 ]]; then
                echo "WARNING: last received LSN is not the last replay file ('$LAST_RECEIVED_LSN' / '$LAST_REPLAYED_LSN') and replay delay is $REPLAY_DELAY second(s)"
                exit 1
            fi
            debug "Replay delay is not worrying"
        fi
    else
        debug "Last received LSN is the last replayed LSN"
    fi

    # The master last sent LSN is the last received (and synced) ?
    if [[ "$M_CUR_SENT_LSN" != "$LAST_RECEIVED_LSN" ]]; then
        echo "WARNING: master last sent LSN is not already received (and synced to disk) by slave. May be we have some network delay or load on slave"
        echo "Master last sent LSN: $M_CUR_SENT_LSN"
        echo "Slave last received (and synced to disk) LSN: $LAST_RECEIVED_LSN"
        exit 1
    fi

    echo "OK: Hot-standby server is up-to-date"
    echo "Replication state: $M_CUR_SYNC_STATE"
    echo "Last sent/writed LSN: '$M_CUR_SENT_LSN' / '$M_CUR_WRITED_LSN'"
    [[ -n "$REPLAY_DELAY" ]] && echo "Replay delay: ${REPLAY_DELAY}s"
    exit 0
elif [[ "$EXPECTED_MODE" == "master" ]]; then
    # Check recovery mode
    if [[ $RECOVERY_MODE -eq 1 ]]; then
        echo "CRITICAL: In recovery mode while expected mode is master!"
        exit 2
    fi
    debug "Postgres is not in recovery mode"

    # Retrieve current lsn
    CURRENT_LSN=$( psql_get "SELECT $pg_current_wal_lsn" )
    if [[ -z "$CURRENT_LSN" ]]; then
        echo "UNKNOWN: Fail to retrieve current LSN (Log Sequence Number)"
        exit 3
    fi
    debug "Current LSN: $CURRENT_LSN"

    # Check standby client
    # One row per connected stand-by client:
    # application_name|client_addr|sent_lsn|write_lsn|state|sync_state|current_lag
    STANDBY_CLIENTS=$( psql_get "SELECT application_name, client_addr, $sent_lsn AS sent_lsn, $write_lsn AS write_lsn,
            state, sync_state, $pg_wal_lsn_diff($pg_current_wal_lsn, $write_lsn) AS current_lag
        FROM pg_stat_replication" )
    if [[ -z "$STANDBY_CLIENTS" ]]; then
        echo "WARNING: no stand-by client connected"
        exit 1
    fi
    debug "Stand-by client(s):\n\t${STANDBY_CLIENTS//$'\n'/\\n\\t}"

    STANDBY_CLIENTS_TXT=""
    STANDBY_CLIENTS_COUNT=0
    CURRENT_LSN_IS_LAST_SENT=1
    # Read the result line by line: a "for" loop over the unquoted variable
    # would split a row on any blank (e.g. in an application name) and
    # expand glob characters.
    while IFS= read -r line; do
        [[ -n "$line" ]] || continue
        (( STANDBY_CLIENTS_COUNT+=1 ))
        IFS='|' read -r NAME IP SENT_LSN WRITED_LSN STATE SYNC_STATE LAG <<< "$line"
        STANDBY_CLIENTS_TXT="$STANDBY_CLIENTS_TXT\n$NAME ($IP): $STATE/$SYNC_STATE (LSN: sent='$SENT_LSN' / writed='$WRITED_LSN', Lag: ${LAG}b)"
        [[ "$SENT_LSN" != "$CURRENT_LSN" ]] && CURRENT_LSN_IS_LAST_SENT=0
    done <<< "$STANDBY_CLIENTS"

    if [[ $CURRENT_LSN_IS_LAST_SENT -eq 1 ]]; then
        echo "OK: $STANDBY_CLIENTS_COUNT stand-by client(s) connected"
        EXIT_CODE=0
    else
        echo "WARNING: current master LSN is not the last sent to stand-by client(s) connected. May be we have some load ?"
        EXIT_CODE=1
    fi
    echo "Current master LSN: $CURRENT_LSN"
    echo -e "$STANDBY_CLIENTS_TXT"
    exit $EXIT_CODE
else
    echo "UNKNOWN - Invalid mode '$EXPECTED_MODE'"
    exit 3
fi