From 0443f56b1d36c8c2c12aa699020b44fcb3245ddc Mon Sep 17 00:00:00 2001
From: Benjamin Renard
Date: Wed, 4 Nov 2020 16:20:41 +0100
Subject: [PATCH] Improve LSN master/slave checks

---
Review notes (free-form text in the area after the "---" separator, not
part of the commit message):

* This patch was recovered from a copy whose line breaks and indentation
  had been collapsed. Line structure and blank lines were rebuilt so that
  every hunk matches its "@@" line counts and the diffstat below
  (73 insertions, 38 deletions). Indentation (tabs) is a best guess, so
  apply it with whitespace-tolerant matching, e.g.
  "git apply --ignore-whitespace" or "patch -l".
* Content changes made during review, all on added ("+") lines:
  - the debug message of the received/replayed check now prints
    $LAST_RECEIVED_LSN instead of $M_CUR_LSN ($M_CUR_LSN is only
    assigned when -C is 1, and the message is about the received LSN);
  - the leftover "replay file" / "replayed file" wording in the WARNING
    and final debug messages now reads "replayed" / "replayed LSN",
    matching the CRITICAL message.
  The post-image blob id on the "index" line therefore no longer matches.
* Not changed, to be confirmed by the author: on a standby, the master's
  sent_lsn is read after the local last received LSN, by a separate
  query; on a master, the current LSN and the per-client sent_lsn are
  also read by separate queries. The new "!=" comparisons may therefore
  report WARNING while WAL is being generated.

 check_pg_streaming_replication | 111 ++++++++++++++++++++++-----------
 1 file changed, 73 insertions(+), 38 deletions(-)

diff --git a/check_pg_streaming_replication b/check_pg_streaming_replication
index a516bfc..e71ec82 100755
--- a/check_pg_streaming_replication
+++ b/check_pg_streaming_replication
@@ -39,7 +39,7 @@ RECOVERY_CONF=""
 PG_DEFAULT_PORT=""
 PG_DEFAULT_APP_NAME=$( hostname )
 PG_DB=""
-CHECK_CUR_MASTER_XLOG=1
+CHECK_CUR_MASTER_LSN=1
 REPLAY_WARNING_DELAY=3
 REPLAY_CRITICAL_DELAY=5

@@ -61,8 +61,8 @@ Usage: $0 [-d] [-h] [options]
 			port if detected or use $DEFAULT_PG_PORT)
 	-D dbname		Specify DB name on Postgres master/slave to connect on (Default: PG_USER, must match
 			with .pgpass one is used)
-	-C 1/0			Enable or disable check if the current XLOG file of the master host is the same
-			of the last replay XLOG file (Default: $CHECK_CUR_MASTER_XLOG)
+	-C 1/0			Enable or disable check if the current LSN of the master host is the same
+			of the last received LSN (Default: $CHECK_CUR_MASTER_LSN)
 	-w replay_warn_delay	Specify the replay warning delay in second (Default: $REPLAY_WARNING_DELAY)
 	-c replay_crit_delay	Specify the replay critical delay in second (Default: $REPLAY_CRITICAL_DELAY)
 	-d			Debug mode
@@ -102,7 +102,7 @@ do
 			PG_DB=$OPTARG
 		;;
 		C)
-			CHECK_CUR_MASTER_XLOG=$OPTARG
+			CHECK_CUR_MASTER_LSN=$OPTARG
 		;;
 		w)
 			REPLAY_WARNING_DELAY=$OPTARG
@@ -139,7 +139,7 @@ PG_MAIN = $PG_MAIN
 RECOVERY_CONF = $RECOVERY_CONF
 PG_DEFAULT_PORT = $PG_DEFAULT_PORT
 PG_DEFAULT_APP_NAME = $PG_DEFAULT_APP_NAME
-CHECK_CUR_MASTER_XLOG = $CHECK_CUR_MASTER_XLOG
+CHECK_CUR_MASTER_LSN = $CHECK_CUR_MASTER_LSN
 REPLAY_WARNING_DELAY = $REPLAY_WARNING_DELAY
 REPLAY_CRITICAL_DELAY = $REPLAY_CRITICAL_DELAY
 "
@@ -210,7 +210,7 @@ PG_MAIN = $PG_MAIN
 RECOVERY_CONF = $RECOVERY_CONF
 PG_DEFAULT_PORT = $PG_DEFAULT_PORT
 PG_DEFAULT_APP_NAME = $PG_DEFAULT_APP_NAME
-CHECK_CUR_MASTER_XLOG = $CHECK_CUR_MASTER_XLOG
+CHECK_CUR_MASTER_LSN = $CHECK_CUR_MASTER_LSN
 REPLAY_WARNING_DELAY = $REPLAY_WARNING_DELAY
 REPLAY_CRITICAL_DELAY = $REPLAY_CRITICAL_DELAY
 "
@@ -262,10 +262,11 @@ then
 	fi
 	debug "Postgres is in recovery mode"

-	LAST_XLOG_RECEIVE=$( psql_get "SELECT $pg_last_wal_receive_lsn" )
-	debug "Last xlog file receive: $LAST_XLOG_RECEIVE"
-	LAST_XLOG_REPLAY=$( psql_get "SELECT $pg_last_wal_replay_lsn" )
-	debug "Last xlog file replay: $LAST_XLOG_REPLAY"
+	# Get local current last received/replayed LSN
+	LAST_RECEIVED_LSN=$( psql_get "SELECT $pg_last_wal_receive_lsn" )
+	debug "Last received LSN: $LAST_RECEIVED_LSN"
+	LAST_REPLAYED_LSN=$( psql_get "SELECT $pg_last_wal_replay_lsn" )
+	debug "Last replayed LSN: $LAST_REPLAYED_LSN"

 	# Get master connection informations from recovery.conf file

@@ -318,14 +319,14 @@ then
 		debug "Master application name: $M_APP_NAME"
 	fi

-	# Get current state information from master
-	M_CUR_REPL_STATE_INFO="$( psql_master_get "SELECT state, sync_state FROM pg_stat_replication WHERE application_name='$M_APP_NAME';" )"
+	# Get current replication state information from master
+	M_CUR_REPL_STATE_INFO="$( psql_master_get "SELECT state, sync_state, $sent_lsn AS sent_lsn, $write_lsn AS write_lsn FROM pg_stat_replication WHERE application_name='$M_APP_NAME';" )"
 	if [ ! -n "$M_CUR_REPL_STATE_INFO" ]
 	then
 		echo "UNKNOWN: Can't retreive current replication state information from master server"
 		exit 3
 	fi
-	debug "Master current replication state:\n\tstate|sync_state\n\t$M_CUR_REPL_STATE_INFO"
+	debug "Master current replication state:\n\tstate|sync_state|sent_lsn|write_lsn\n\t$M_CUR_REPL_STATE_INFO"

 	M_CUR_STATE=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f1 )
 	debug "Master current state: $M_CUR_STATE"
@@ -343,46 +344,59 @@ then
 		exit 2
 	fi

-	# Check current master XLOG file vs last replay XLOG file
-	if [ "$CHECK_CUR_MASTER_XLOG" == "1" ]
+	M_CUR_SENT_LSN=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f3 )
+	M_CUR_WRITED_LSN=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f4 )
+	debug "Master current last sent/writed LSN: '$M_CUR_SENT_LSN' / '$M_CUR_WRITED_LSN'"
+
+	# Check current master LSN vs last received LSN
+	if [ "$CHECK_CUR_MASTER_LSN" == "1" ]
 	then
-		# Get current xlog file from master
-		M_CUR_XLOG="$( psql_master_get 'SELECT pg_current_xlog_location()' )"
-		if [ ! -n "$M_CUR_XLOG" ]
+		# Get current LSN from master
+		M_CUR_LSN="$( psql_master_get "SELECT $pg_current_wal_lsn" )"
+		if [ ! -n "$M_CUR_LSN" ]
 		then
-			echo "UNKNOWN: Can't retreive current xlog from master server"
+			echo "UNKNOWN: Can't retreive current LSN from master server"
 			exit 3
 		fi
-		debug "Master current xlog: $M_CUR_XLOG"
+		debug "Master current LSN: $M_CUR_LSN"

-		# Master current xlog is the last receive xlog ?
-		if [ "$M_CUR_XLOG" != "$LAST_XLOG_RECEIVE" ]
+		# Master current LSN is the last received LSN ?
+		if [ "$M_CUR_LSN" != "$LAST_RECEIVED_LSN" ]
 		then
-			echo "CRITICAL: Master current xlog is not the last receive xlog"
+			echo "CRITICAL: Master current LSN is not the last received LSN"
 			exit 2
 		fi
-		debug "Master current xlog is the last receive xlog"
+		debug "Master current LSN is the last received LSN"
 	fi

-	# The last receive xlog is the last replay file ?
-	if [ "$LAST_XLOG_RECEIVE" != "$LAST_XLOG_REPLAY" ]
+	# The last received LSN is the last replayed ?
+	if [ "$LAST_RECEIVED_LSN" != "$LAST_REPLAYED_LSN" ]
 	then
-		debug "/!\ The last receive xlog is NOT the last replay file ('$M_CUR_XLOG' / '$LAST_XLOG_RECEIVE')"
+		debug "/!\ The last received LSN is NOT the last replayed LSN ('$LAST_RECEIVED_LSN' / '$LAST_REPLAYED_LSN')"
 		REPLAY_DELAY="$( psql_get 'SELECT EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp());' )"
 		debug "Replay delay is $REPLAY_DELAY second(s)"
 		if [ $( echo "$REPLAY_DELAY >= $REPLAY_CRITICAL_DELAY"|bc -l ) -gt 0 ]
 		then
-			echo "CRITICAL: last receive xlog file is not the last replay file ('$LAST_XLOG_RECEIVE' / '$LAST_XLOG_REPLAY') and replay delay is $REPLAY_DELAY second(s)"
+			echo "CRITICAL: last received LSN is not the last replayed ('$LAST_RECEIVED_LSN' / '$LAST_REPLAYED_LSN') and replay delay is $REPLAY_DELAY second(s)"
 			exit 2
 		fi
 		if [ $( echo "$REPLAY_DELAY >= $REPLAY_WARNING_DELAY"|bc -l ) -gt 0 ]
 		then
-			echo "WARNING: last receive xlog file is not the last replay file ('$LAST_XLOG_RECEIVE' / '$LAST_XLOG_REPLAY') and replay delay is $REPLAY_DELAY second(s)"
+			echo "WARNING: last received LSN is not the last replayed ('$LAST_RECEIVED_LSN' / '$LAST_REPLAYED_LSN') and replay delay is $REPLAY_DELAY second(s)"
 			exit 1
 		fi
 		debug "Replay delay is not worrying"
 	fi
-	debug "Last receive xlog file is the last replay file"
+	debug "Last received LSN is the last replayed LSN"
+
+	# The master last sent LSN is the last received (and synced) ?
+	if [ "$M_CUR_SENT_LSN" != "$LAST_RECEIVED_LSN" ]
+	then
+		echo "WARNING: master last sent LSN is not already received (and synced to disk) by slave. May be we have some network delay or load on slave"
+		echo "Master last sent LSN: $M_CUR_SENT_LSN"
+		echo "Slave last received (and synced to disk) LSN: $LAST_RECEIVED_LSN"
+		exit 1
+	fi

 	echo "OK: Hot-standby server is uptodate"
 	exit 0
@@ -397,12 +411,21 @@ else
 	fi
 	debug "Postgres is not in recovery mode"

+	# Retreive current lsn
+	CURRENT_LSN=$( psql_get "SELECT $pg_current_wal_lsn" )
+	if [ -z "$CURRENT_LSN" ]
+	then
+		echo "UNKNOWN: Fail to retreive current LSN (Log Sequence Number)"
+		exit 3
+	fi
+	debug "Current LSN: $CURRENT_LSN"
+
 	# Check standby client
-	STANDBY_CLIENTS=$( psql_get "SELECT application_name, client_addr, sent_location, write_location, state, sync_state, current_lag
+	STANDBY_CLIENTS=$( psql_get "SELECT application_name, client_addr, sent_lsn, write_lsn, state, sync_state, current_lag
 		FROM (
-			SELECT application_name, client_addr, sent_location, write_location, state, sync_state, current_lag
+			SELECT application_name, client_addr, sent_lsn, write_lsn, state, sync_state, current_lag
 			FROM (
-				SELECT application_name, client_addr, $sent_lsn AS sent_location, $write_lsn AS write_location, state, sync_state,
+				SELECT application_name, client_addr, $sent_lsn AS sent_lsn, $write_lsn AS write_lsn, state, sync_state,
 					$pg_wal_lsn_diff($pg_current_wal_lsn, $write_lsn) AS current_lag
 				FROM pg_stat_replication
 			) AS s2
@@ -416,20 +439,32 @@ else

 	STANDBY_CLIENTS_TXT=""
 	STANDBY_CLIENTS_COUNT=0
+	CURRENT_LSN_IS_LAST_SENT=1
 	for line in $STANDBY_CLIENTS
 	do
 		let STANDBY_CLIENTS_COUNT=STANDBY_CLIENTS_COUNT+1

 		NAME=$( echo $line|cut -d '|' -f 1 )
 		IP=$( echo $line|cut -d '|' -f 2 )
-		SENT_LOCATION=$( echo $line|cut -d '|' -f 3 )
-		WRITE_LOCATION=$( echo $line|cut -d '|' -f 4 )
+		SENT_LSN=$( echo $line|cut -d '|' -f 3 )
+		WRITED_LSN=$( echo $line|cut -d '|' -f 4 )
 		STATE=$( echo $line|cut -d '|' -f 5 )
 		SYNC_STATE=$( echo $line|cut -d '|' -f 6 )
 		LAG=$( echo $line|cut -d '|' -f 7 )
-		STANDBY_CLIENTS_TXT="$STANDBY_CLIENTS_TXT\n$NAME ($IP): $STATE/$SYNC_STATE (Location: sent='$SENT_LOCATION' / write='$WRITE_LOCATION', Lag: ${LAG}b)"
+		STANDBY_CLIENTS_TXT="$STANDBY_CLIENTS_TXT\n$NAME ($IP): $STATE/$SYNC_STATE (LSN: sent='$SENT_LSN' / writed='$WRITED_LSN', Lag: ${LAG}b)"
+		[ "$SENT_LSN" != "$CURRENT_LSN" ] && CURRENT_LSN_IS_LAST_SENT=0
 	done

-	echo -e "OK: $STANDBY_CLIENTS_COUNT stand-by client(s) connected\n$STANDBY_CLIENTS_TXT"
-	exit 0
+	if [ $CURRENT_LSN_IS_LAST_SENT -eq 1 ]
+	then
+		echo "OK: $STANDBY_CLIENTS_COUNT stand-by client(s) connected"
+		EXIT_CODE=0
+	else
+		echo "WARNING: current master LSN is not the last sent to stand-by client(s) connected. May be we have some load ?"
+		EXIT_CODE=1
+	fi
+
+	echo "Current master LSN: $CURRENT_LSN"
+	echo -e "$STANDBY_CLIENTS_TXT"
+	exit $EXIT_CODE
 fi