#!/bin/bash
#
# Nagios plugin to check PostgreSQL streaming replication state
#
# Could be used on master or on standby node
#
# Requirements:
#
#   Some CLI tools: sudo, awk, sed, bc, psql and pg_lsclusters
#
#   On master node: Slaves must be able to connect with user from recovery.conf
#                   (or user specified using -U) to database with the same name
#                   (or another specified with -D) as trust (or via md5 using
#                   password specified in ~/.pgpass). This user must have
#                   SUPERUSER privilege (need to get replication details).
#
#   On standby node: PG_USER must be able to connect locally on the database
#                    with the same name (or another specified with -D) as trust
#                    (or via md5 using password specified in ~/.pgpass).
#
# Exit codes used by this script:
#   0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN
#
# NOTE(review): standby detection relies on the presence of the recovery.conf
# file. PostgreSQL 12 and later reportedly no longer use that file, in which
# case a standby would be handled by the "master" branch below and reported
# CRITICAL. Confirm the range of PostgreSQL versions this plugin must support.
#
# Author: Benjamin Renard
# Date: Wed, 04 Nov 2020 15:31:13 +0100
# Source: https://gogs.zionetrix.net/bn8/check_pg_streaming_replication
# SPDX-License-Identifier: GPL-3.0-or-later
#

# Fallback values, used when neither the command line nor pg_lsclusters
# provides the information. The default main directory depends on the
# PostgreSQL version, so DEFAULT_PG_MAIN is only computed once PG_VERSION has
# been resolved (see below).
DEFAULT_PG_USER=postgres
DEFAULT_PG_VERSION=9.1
DEFAULT_PG_PORT=5432

# Runtime parameters (empty = not specified on the command line)
PG_USER=""
PG_VERSION=""
PG_MAIN=""
PG_MASTER_USER=""
PSQL_BIN=/usr/bin/psql
PG_LSCLUSTER_BIN=/usr/bin/pg_lsclusters
RECOVERY_CONF_FILENAME=recovery.conf
RECOVERY_CONF=""
PG_DEFAULT_PORT=""
PG_DEFAULT_APP_NAME=$( hostname )
PG_DB=""
CHECK_CUR_MASTER_LSN=1
REPLAY_WARNING_DELAY=3
REPLAY_CRITICAL_DELAY=5
DEBUG=0

# Regular expressions used to validate numeric values before handing them to bc
UNSIGNED_NUMBER_RE='^[0-9]+([.][0-9]+)?$'
SIGNED_NUMBER_RE='^-?[0-9]+([.][0-9]+)?$'

# Print the help message on stdout and exit.
# Arguments: $1 - exit code (optional, default: 0)
function usage () {
    cat << EOF
Usage: $0 [-d] [-h] [options]
    -u pg_user              Specify local Postgres user (Default: try to auto-detect or use $DEFAULT_PG_USER)
    -b psql_bin             Specify psql binary path (Default: $PSQL_BIN)
    -B pg_lsclusters_bin    Specify pg_lsclusters binary path (Default: $PG_LSCLUSTER_BIN)
    -V pg_version           Specify Postgres version (Default: try to auto-detect or use $DEFAULT_PG_VERSION)
    -m pg_main              Specify Postgres main directory path (Default: try to auto-detect or use /var/lib/postgresql/[PG_VERSION]/main)
    -r recovery_conf        Specify Postgres recovery configuration file path (Default: [PG_MAIN]/$RECOVERY_CONF_FILENAME)
    -U pg_master_user       Specify Postgres user to use on master (Default: user from recovery.conf file)
    -p pg_port              Specify default Postgres master TCP port (Default: same as local PostgreSQL port if detected or use $DEFAULT_PG_PORT)
    -D dbname               Specify DB name on Postgres master/slave to connect on (Default: PG_USER, must match with .pgpass one is used)
    -C 1/0                  Enable or disable check if the current LSN of the master host is the same of the last received LSN (Default: $CHECK_CUR_MASTER_LSN)
    -w replay_warn_delay    Specify the replay warning delay in second (Default: $REPLAY_WARNING_DELAY)
    -c replay_crit_delay    Specify the replay critical delay in second (Default: $REPLAY_CRITICAL_DELAY)
    -d                      Debug mode
    -h                      Show this message
EOF
    exit "${1:-0}"
}

while getopts "hu:b:B:V:m:r:U:p:D:C:w:c:d" OPTION
do
    case "$OPTION" in
        u)
            PG_USER=$OPTARG
            ;;
        b)
            PSQL_BIN=$OPTARG
            ;;
        B)
            PG_LSCLUSTER_BIN=$OPTARG
            ;;
        V)
            PG_VERSION=$OPTARG
            ;;
        m)
            PG_MAIN=$OPTARG
            ;;
        r)
            RECOVERY_CONF=$OPTARG
            ;;
        U)
            PG_MASTER_USER=$OPTARG
            ;;
        p)
            PG_DEFAULT_PORT=$OPTARG
            ;;
        D)
            PG_DB=$OPTARG
            ;;
        C)
            CHECK_CUR_MASTER_LSN=$OPTARG
            ;;
        w)
            REPLAY_WARNING_DELAY=$OPTARG
            ;;
        c)
            REPLAY_CRITICAL_DELAY=$OPTARG
            ;;
        d)
            DEBUG=1
            ;;
        h)
            usage
            ;;
        \?)
            # getopts already printed its own diagnostic on stderr. A bad
            # invocation must not be reported as OK (exit 0) to Nagios.
            echo "UNKNOWN: Invalid command-line option"
            usage 3
            ;;
    esac
done

# Print a message on stderr when debug mode (-d) is enabled.
# Arguments: $1 - message (backslash escapes such as \n and \t are interpreted)
function debug() {
    if [ "$DEBUG" -eq 1 ]
    then
        >&2 echo -e "[DEBUG] $1"
    fi
}

debug "Starting options (before handling auto-detection/default values):
PG_VERSION = $PG_VERSION
PG_DB = $PG_DB
PG_USER = $PG_USER
PSQL_BIN = $PSQL_BIN
PG_LSCLUSTER_BIN = $PG_LSCLUSTER_BIN
PG_MAIN = $PG_MAIN
RECOVERY_CONF = $RECOVERY_CONF
PG_DEFAULT_PORT = $PG_DEFAULT_PORT
PG_DEFAULT_APP_NAME = $PG_DEFAULT_APP_NAME
CHECK_CUR_MASTER_LSN = $CHECK_CUR_MASTER_LSN
REPLAY_WARNING_DELAY = $REPLAY_WARNING_DELAY
REPLAY_CRITICAL_DELAY = $REPLAY_CRITICAL_DELAY
"

# Auto-detect PostgreSQL information using pg_lsclusters (only the first
# cluster listed is considered). Values given on the command line always win.
if [ -x "$PG_LSCLUSTER_BIN" ]
then
    PG_CLUSTER=$( "$PG_LSCLUSTER_BIN" -h 2>/dev/null|head -n1 )
    if [ -n "$PG_CLUSTER" ]
    then
        debug "pg_lsclusters output:\n\t$PG_CLUSTER"
        # Output example:
        # 9.6 main 5432 online,recovery postgres /var/lib/postgresql/9.6/main /var/log/postgresql/postgresql-9.6-main.log
        [ -z "$PG_VERSION" ] && PG_VERSION=$( echo "$PG_CLUSTER"|awk -F ' +' '{print $1}' )
        [ -z "$PG_DEFAULT_PORT" ] && PG_DEFAULT_PORT=$( echo "$PG_CLUSTER"|awk -F ' +' '{print $3}' )
        [ -z "$PG_USER" ] && PG_USER=$( echo "$PG_CLUSTER"|awk -F ' +' '{print $5}' )
        [ -z "$PG_MAIN" ] && PG_MAIN=$( echo "$PG_CLUSTER"|awk -F ' +' '{print $6}' )
    fi
else
    debug "pg_lsclusters not found ($PG_LSCLUSTER_BIN): parameters auto-detection disabled"
fi

# If auto-detection failed, use default values
[ -z "$PG_USER" ] && PG_USER="$DEFAULT_PG_USER"
[ -z "$PG_VERSION" ] && PG_VERSION="$DEFAULT_PG_VERSION"
# The default main directory is built here because it embeds the version,
# which is only known at this point.
DEFAULT_PG_MAIN="/var/lib/postgresql/$PG_VERSION/main"
[ -z "$PG_MAIN" ] && PG_MAIN="$DEFAULT_PG_MAIN"
[ -z "$PG_DEFAULT_PORT" ] && PG_DEFAULT_PORT="$DEFAULT_PG_PORT"

# Check PG_USER
if [ -z "$PG_USER" ]
then
    echo "UNKNOWN: Postgres user not specified"
    exit 3
fi
if ! id "$PG_USER" > /dev/null 2>&1
then
    echo "UNKNOWN: Invalid Postgres user ($PG_USER)"
    exit 3
fi

# Check PSQL_BIN
if [ ! -x "$PSQL_BIN" ]
then
    echo "UNKNOWN: Invalid psql bin path ($PSQL_BIN)"
    exit 3
fi

# Check PG_MAIN
if [ ! -d "$PG_MAIN/" ]
then
    echo "UNKNOWN: Invalid Postgres main directory path ($PG_MAIN)"
    exit 3
fi

# Check RECOVERY_CONF
[ -z "$RECOVERY_CONF" ] && RECOVERY_CONF="$PG_MAIN/$RECOVERY_CONF_FILENAME"

# Check PG_DEFAULT_PORT (must be a non-empty string of digits)
if ! [[ "$PG_DEFAULT_PORT" =~ ^[0-9]+$ ]]
then
    echo "UNKNOWN: Postgres default master TCP port must be an integer."
    exit 3
fi

# Check replay delays: they are interpolated into bc expressions below, so
# anything but a plain number would give a meaningless comparison.
for delay in "$REPLAY_WARNING_DELAY" "$REPLAY_CRITICAL_DELAY"
do
    if ! [[ "$delay" =~ $UNSIGNED_NUMBER_RE ]]
    then
        echo "UNKNOWN: Replay warning and critical delays must be numbers (in seconds)."
        exit 3
    fi
done

# If PG_DB is not provided with -D parameter, use PG_USER as default value
[ -z "$PG_DB" ] && PG_DB="$PG_USER"

# Run a SQL statement (or psql meta-command) on the local PostgreSQL server,
# as PG_USER via sudo.
# Arguments: $1 - SQL to execute
# Outputs:   psql result on stdout (tuples only, unaligned format)
# Returns:   exit status of psql
function psql_get () {
    local sql="$1"
    debug "Exec 'echo \"$sql\"|sudo -u $PG_USER $PSQL_BIN -d \"$PG_DB\" -w -t -P format=unaligned"
    echo "$sql"|sudo -u "$PG_USER" "$PSQL_BIN" -d "$PG_DB" -w -t -P format=unaligned
}

# Run a SQL statement on the master server (M_HOST:M_PORT) as M_USER.
# M_USER, M_HOST and M_PORT must have been set by the caller beforehand.
# Arguments: $1 - SQL to execute
# Outputs:   psql result on stdout (tuples only, unaligned format)
# Returns:   exit status of psql
function psql_master_get () {
    local sql="$1"
    debug "Exec 'echo \"$sql\"|sudo -u $PG_USER $PSQL_BIN -U $M_USER -h $M_HOST -w -p $M_PORT -d $PG_DB -t -P format=unaligned"
    echo "$sql"|sudo -u "$PG_USER" "$PSQL_BIN" -U "$M_USER" -h "$M_HOST" -w -p "$M_PORT" -d "$PG_DB" -t -P format=unaligned
}

debug "Running options:
PG_VERSION = $PG_VERSION
PG_DB = $PG_DB
PG_USER = $PG_USER
PSQL_BIN = $PSQL_BIN
PG_LSCLUSTER_BIN = $PG_LSCLUSTER_BIN
PG_MAIN = $PG_MAIN
RECOVERY_CONF = $RECOVERY_CONF
PG_DEFAULT_PORT = $PG_DEFAULT_PORT
PG_DEFAULT_APP_NAME = $PG_DEFAULT_APP_NAME
CHECK_CUR_MASTER_LSN = $CHECK_CUR_MASTER_LSN
REPLAY_WARNING_DELAY = $REPLAY_WARNING_DELAY
REPLAY_CRITICAL_DELAY = $REPLAY_CRITICAL_DELAY
"

# Select function/column names according to the PostgreSQL version: the
# "xlog"/"location" names are used before version 10, the "wal"/"lsn" ones
# otherwise (also when the version comparison fails).
if [ "$( echo "$PG_VERSION < 10" |bc -l )" = "1" ]
then
    pg_last_wal_receive_lsn='pg_last_xlog_receive_location()'
    pg_last_wal_replay_lsn='pg_last_xlog_replay_location()'
    pg_current_wal_lsn='pg_current_xlog_location()'
    pg_wal_lsn_diff='pg_xlog_location_diff'
    sent_lsn='sent_location'
    write_lsn='write_location'
else
    pg_last_wal_receive_lsn='pg_last_wal_receive_lsn()'
    pg_last_wal_replay_lsn='pg_last_wal_replay_lsn()'
    pg_current_wal_lsn='pg_current_wal_lsn()'
    pg_wal_lsn_diff='pg_wal_lsn_diff'
    sent_lsn='sent_lsn'
    write_lsn='write_lsn'
fi

# Postgres is running ?
# (psql error output is only shown in debug mode)
if [ "$DEBUG" -eq 0 ]
then
    psql_get '\q' 2> /dev/null
    PSQL_STATUS=$?
else
    psql_get '\q'
    PSQL_STATUS=$?
fi
if [ "$PSQL_STATUS" -ne 0 ]
then
    echo "CRITICAL: Postgres is not running !"
    exit 2
fi
debug "Postgres is running"

RECOVERY_MODE=0
[ "$( psql_get 'SELECT pg_is_in_recovery();' )" == "t" ] && RECOVERY_MODE=1

if [ -f "$RECOVERY_CONF" ]
then
    debug "File recovery.conf found. Hot-standby mode."

    # Check recovery mode
    if [ "$RECOVERY_MODE" -ne 1 ]
    then
        echo "CRITICAL: Not in recovery mode while recovery.conf file found !"
        exit 2
    fi
    debug "Postgres is in recovery mode"

    # Get local current last received/replayed LSN
    LAST_RECEIVED_LSN=$( psql_get "SELECT $pg_last_wal_receive_lsn" )
    debug "Last received LSN: $LAST_RECEIVED_LSN"
    LAST_REPLAYED_LSN=$( psql_get "SELECT $pg_last_wal_replay_lsn" )
    debug "Last replayed LSN: $LAST_REPLAYED_LSN"

    # Get master connection informations from recovery.conf file
    MASTER_CONN_INFOS=$( grep -E '^ *primary_conninfo' "$RECOVERY_CONF"|sed "s/^ *primary_conninfo *= *\(.\+\) *$/\1/" )
    if [ -z "$MASTER_CONN_INFOS" ]
    then
        echo "UNKNOWN: Can't retreive master connection informations form recovery.conf file"
        exit 3
    fi
    debug "Master connection informations: $MASTER_CONN_INFOS"

    M_HOST=$( echo "$MASTER_CONN_INFOS"| grep 'host=' | sed 's/^.*host= *\([0-9a-zA-Z.-]\+\) *.*$/\1/' )
    if [ -z "$M_HOST" ]
    then
        echo "UNKNOWN: Can't retreive master host from recovery.conf file"
        exit 3
    fi
    debug "Master host: $M_HOST"

    M_PORT=$( echo "$MASTER_CONN_INFOS"| grep 'port=' | sed 's/^.*port= *\([0-9a-zA-Z.-]\+\) *.*$/\1/' )
    if [ -z "$M_PORT" ]
    then
        debug "Master port not specified, use default: $PG_DEFAULT_PORT"
        M_PORT=$PG_DEFAULT_PORT
    else
        debug "Master port: $M_PORT"
    fi

    if [ -n "$PG_MASTER_USER" ]
    then
        debug "Master user provided by command-line, use it: $PG_MASTER_USER"
        M_USER="$PG_MASTER_USER"
    else
        M_USER=$( echo "$MASTER_CONN_INFOS"| grep 'user=' | sed 's/^.*user= *\([0-9a-zA-Z.-]\+\) *.*$/\1/' )
        if [ -z "$M_USER" ]
        then
            debug "Master user not specified, use default: $PG_USER"
            M_USER=$PG_USER
        else
            debug "Master user: $M_USER"
        fi
    fi

    M_APP_NAME=$( echo "$MASTER_CONN_INFOS"| grep 'application_name=' | sed "s/^.*application_name=[ \'\"]*\([^ \'\"]\+\)[ \'\"]*.*$/\1/" )
    if [ -z "$M_APP_NAME" ]
    then
        debug "Master application name not specified, use default: $PG_DEFAULT_APP_NAME"
        M_APP_NAME=$PG_DEFAULT_APP_NAME
    else
        debug "Master application name: $M_APP_NAME"
    fi

    # Get current replication state information from master
    M_CUR_REPL_STATE_INFO="$( psql_master_get "SELECT state, sync_state, $sent_lsn AS sent_lsn, $write_lsn AS write_lsn FROM pg_stat_replication WHERE application_name='$M_APP_NAME';" )"
    if [ -z "$M_CUR_REPL_STATE_INFO" ]
    then
        echo "UNKNOWN: Can't retreive current replication state information from master server"
        exit 3
    fi
    debug "Master current replication state:\n\tstate|sync_state|sent_lsn|write_lsn\n\t$M_CUR_REPL_STATE_INFO"

    M_CUR_STATE=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f1 )
    debug "Master current state: $M_CUR_STATE"
    if [ "$M_CUR_STATE" != "streaming" ]
    then
        echo "CRITICAL: this host is not in streaming state according to master host (current state = '$M_CUR_STATE')"
        exit 2
    fi

    M_CUR_SYNC_STATE=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f2 )
    debug "Master current sync state: $M_CUR_SYNC_STATE"
    if [ "$M_CUR_SYNC_STATE" != "sync" ]
    then
        echo "CRITICAL: this host is not synchronized according to master host (current sync state = '$M_CUR_SYNC_STATE')"
        exit 2
    fi

    M_CUR_SENT_LSN=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f3 )
    M_CUR_WRITED_LSN=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f4 )
    debug "Master current last sent/writed LSN: '$M_CUR_SENT_LSN' / '$M_CUR_WRITED_LSN'"

    # Check current master LSN vs last received LSN
    if [ "$CHECK_CUR_MASTER_LSN" == "1" ]
    then
        # Get current LSN from master
        M_CUR_LSN="$( psql_master_get "SELECT $pg_current_wal_lsn" )"
        if [ -z "$M_CUR_LSN" ]
        then
            echo "UNKNOWN: Can't retreive current LSN from master server"
            exit 3
        fi
        debug "Master current LSN: $M_CUR_LSN"

        # Master current LSN is the last received LSN ?
        if [ "$M_CUR_LSN" != "$LAST_RECEIVED_LSN" ]
        then
            echo "CRITICAL: Master current LSN is not the last received LSN"
            exit 2
        fi
        debug "Master current LSN is the last received LSN"
    fi

    # The last received LSN is the last replayed ?
    if [ "$LAST_RECEIVED_LSN" != "$LAST_REPLAYED_LSN" ]
    then
        debug "/!\ The last received LSN is NOT the last replayed LSN ('$LAST_RECEIVED_LSN' / '$LAST_REPLAYED_LSN')"
        REPLAY_DELAY="$( psql_get 'SELECT EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp());' )"
        debug "Replay delay is $REPLAY_DELAY second(s)"
        if [[ "$REPLAY_DELAY" =~ $SIGNED_NUMBER_RE ]]
        then
            if [ "$( echo "$REPLAY_DELAY >= $REPLAY_CRITICAL_DELAY"|bc -l )" = "1" ]
            then
                echo "CRITICAL: last received LSN is not the last replayed ('$LAST_RECEIVED_LSN' / '$LAST_REPLAYED_LSN') and replay delay is $REPLAY_DELAY second(s)"
                exit 2
            fi
            if [ "$( echo "$REPLAY_DELAY >= $REPLAY_WARNING_DELAY"|bc -l )" = "1" ]
            then
                echo "WARNING: last received LSN is not the last replay file ('$LAST_RECEIVED_LSN' / '$LAST_REPLAYED_LSN') and replay delay is $REPLAY_DELAY second(s)"
                exit 1
            fi
            debug "Replay delay is not worrying"
        else
            # The query returned nothing usable (e.g. an empty result), so
            # the delay cannot be compared with the thresholds: skip the
            # comparison instead of feeding garbage to bc.
            debug "Replay delay is not a number, thresholds not evaluated"
        fi
    else
        debug "Last received LSN is the last replayed file"
    fi

    # The master last sent LSN is the last received (and synced) ?
    if [ "$M_CUR_SENT_LSN" != "$LAST_RECEIVED_LSN" ]
    then
        echo "WARNING: master last sent LSN is not already received (and synced to disk) by slave. May be we have some network delay or load on slave"
        echo "Master last sent LSN: $M_CUR_SENT_LSN"
        echo "Slave last received (and synced to disk) LSN: $LAST_RECEIVED_LSN"
        exit 1
    fi
    echo "OK: Hot-standby server is uptodate"
    exit 0
else
    debug "File recovery.conf not found. Master mode."

    # Check recovery mode
    if [ "$RECOVERY_MODE" -eq 1 ]
    then
        echo "CRITICAL: In recovery mode while recovery.conf file not found !"
        exit 2
    fi
    debug "Postgres is not in recovery mode"

    # Retreive current lsn
    CURRENT_LSN=$( psql_get "SELECT $pg_current_wal_lsn" )
    if [ -z "$CURRENT_LSN" ]
    then
        echo "UNKNOWN: Fail to retreive current LSN (Log Sequence Number)"
        exit 3
    fi
    debug "Current LSN: $CURRENT_LSN"

    # Check standby client
    # One line per standby, 7 fields separated by '|':
    # application_name|client_addr|sent_lsn|write_lsn|state|sync_state|current_lag
    STANDBY_CLIENTS=$( psql_get "SELECT application_name, client_addr, sent_lsn, write_lsn, state, sync_state, current_lag
        FROM (
            SELECT application_name, client_addr, sent_lsn, write_lsn, state, sync_state, current_lag
            FROM (
                SELECT application_name, client_addr, $sent_lsn AS sent_lsn, $write_lsn AS write_lsn, state, sync_state,
                    $pg_wal_lsn_diff($pg_current_wal_lsn, $write_lsn) AS current_lag
                FROM pg_stat_replication
            ) AS s2
        ) AS s1" )
    if [ -z "$STANDBY_CLIENTS" ]
    then
        echo "WARNING: no stand-by client connected"
        exit 1
    fi
    # Indent every line after the first one so that the list stays aligned
    debug "Stand-by client(s):\n\t$( echo "$STANDBY_CLIENTS"|sed '2,$s/^/\t/' )"

    STANDBY_CLIENTS_TXT=""
    STANDBY_CLIENTS_COUNT=0
    CURRENT_LSN_IS_LAST_SENT=1
    # Read the result line by line: iterating with "for ... in $VAR" would
    # split on any whitespace and break on application names containing spaces.
    while IFS= read -r line
    do
        [ -z "$line" ] && continue
        IFS='|' read -r NAME IP SENT_LSN WRITED_LSN STATE SYNC_STATE LAG <<< "$line"
        STANDBY_CLIENTS_COUNT=$(( STANDBY_CLIENTS_COUNT + 1 ))
        STANDBY_CLIENTS_TXT="$STANDBY_CLIENTS_TXT\n$NAME ($IP): $STATE/$SYNC_STATE (LSN: sent='$SENT_LSN' / writed='$WRITED_LSN', Lag: ${LAG}b)"
        [ "$SENT_LSN" != "$CURRENT_LSN" ] && CURRENT_LSN_IS_LAST_SENT=0
    done <<< "$STANDBY_CLIENTS"

    if [ "$CURRENT_LSN_IS_LAST_SENT" -eq 1 ]
    then
        echo "OK: $STANDBY_CLIENTS_COUNT stand-by client(s) connected"
        EXIT_CODE=0
    else
        echo "WARNING: current master LSN is not the last sent to stand-by client(s) connected. May be we have some load ?"
        EXIT_CODE=1
    fi
    echo "Current master LSN: $CURRENT_LSN"
    echo -e "$STANDBY_CLIENTS_TXT"
    exit $EXIT_CODE
fi