Improve LSN master/slave checks
This commit is contained in:
parent
d4cbdb3c79
commit
0443f56b1d
1 changed files with 73 additions and 38 deletions
|
@ -39,7 +39,7 @@ RECOVERY_CONF=""
|
|||
PG_DEFAULT_PORT=""
|
||||
PG_DEFAULT_APP_NAME=$( hostname )
|
||||
PG_DB=""
|
||||
CHECK_CUR_MASTER_XLOG=1
|
||||
CHECK_CUR_MASTER_LSN=1
|
||||
REPLAY_WARNING_DELAY=3
|
||||
REPLAY_CRITICAL_DELAY=5
|
||||
|
||||
|
@ -61,8 +61,8 @@ Usage: $0 [-d] [-h] [options]
|
|||
port if detected or use $DEFAULT_PG_PORT)
|
||||
-D dbname Specify DB name on Postgres master/slave to connect on (Default: PG_USER, must
|
||||
match with .pgpass one is used)
|
||||
-C 1/0 Enable or disable check if the current XLOG file of the master host is the same
|
||||
of the last replay XLOG file (Default: $CHECK_CUR_MASTER_XLOG)
|
||||
-C 1/0 Enable or disable check if the current LSN of the master host is the same
|
||||
of the last received LSN (Default: $CHECK_CUR_MASTER_LSN)
|
||||
-w replay_warn_delay Specify the replay warning delay in second (Default: $REPLAY_WARNING_DELAY)
|
||||
-c replay_crit_delay Specify the replay critical delay in second (Default: $REPLAY_CRITICAL_DELAY)
|
||||
-d Debug mode
|
||||
|
@ -102,7 +102,7 @@ do
|
|||
PG_DB=$OPTARG
|
||||
;;
|
||||
C)
|
||||
CHECK_CUR_MASTER_XLOG=$OPTARG
|
||||
CHECK_CUR_MASTER_LSN=$OPTARG
|
||||
;;
|
||||
w)
|
||||
REPLAY_WARNING_DELAY=$OPTARG
|
||||
|
@ -139,7 +139,7 @@ PG_MAIN = $PG_MAIN
|
|||
RECOVERY_CONF = $RECOVERY_CONF
|
||||
PG_DEFAULT_PORT = $PG_DEFAULT_PORT
|
||||
PG_DEFAULT_APP_NAME = $PG_DEFAULT_APP_NAME
|
||||
CHECK_CUR_MASTER_XLOG = $CHECK_CUR_MASTER_XLOG
|
||||
CHECK_CUR_MASTER_LSN = $CHECK_CUR_MASTER_LSN
|
||||
REPLAY_WARNING_DELAY = $REPLAY_WARNING_DELAY
|
||||
REPLAY_CRITICAL_DELAY = $REPLAY_CRITICAL_DELAY
|
||||
"
|
||||
|
@ -210,7 +210,7 @@ PG_MAIN = $PG_MAIN
|
|||
RECOVERY_CONF = $RECOVERY_CONF
|
||||
PG_DEFAULT_PORT = $PG_DEFAULT_PORT
|
||||
PG_DEFAULT_APP_NAME = $PG_DEFAULT_APP_NAME
|
||||
CHECK_CUR_MASTER_XLOG = $CHECK_CUR_MASTER_XLOG
|
||||
CHECK_CUR_MASTER_LSN = $CHECK_CUR_MASTER_LSN
|
||||
REPLAY_WARNING_DELAY = $REPLAY_WARNING_DELAY
|
||||
REPLAY_CRITICAL_DELAY = $REPLAY_CRITICAL_DELAY
|
||||
"
|
||||
|
@ -262,10 +262,11 @@ then
|
|||
fi
|
||||
debug "Postgres is in recovery mode"
|
||||
|
||||
LAST_XLOG_RECEIVE=$( psql_get "SELECT $pg_last_wal_receive_lsn" )
|
||||
debug "Last xlog file receive: $LAST_XLOG_RECEIVE"
|
||||
LAST_XLOG_REPLAY=$( psql_get "SELECT $pg_last_wal_replay_lsn" )
|
||||
debug "Last xlog file replay: $LAST_XLOG_REPLAY"
|
||||
# Get local current last received/replayed LSN
|
||||
LAST_RECEIVED_LSN=$( psql_get "SELECT $pg_last_wal_receive_lsn" )
|
||||
debug "Last received LSN: $LAST_RECEIVED_LSN"
|
||||
LAST_REPLAYED_LSN=$( psql_get "SELECT $pg_last_wal_replay_lsn" )
|
||||
debug "Last replayed LSN: $LAST_REPLAYED_LSN"
|
||||
|
||||
|
||||
# Get master connection information from recovery.conf file
|
||||
|
@ -318,14 +319,14 @@ then
|
|||
debug "Master application name: $M_APP_NAME"
|
||||
fi
|
||||
|
||||
# Get current state information from master
|
||||
M_CUR_REPL_STATE_INFO="$( psql_master_get "SELECT state, sync_state FROM pg_stat_replication WHERE application_name='$M_APP_NAME';" )"
|
||||
# Get current replication state information from master
|
||||
M_CUR_REPL_STATE_INFO="$( psql_master_get "SELECT state, sync_state, $sent_lsn AS sent_lsn, $write_lsn AS write_lsn FROM pg_stat_replication WHERE application_name='$M_APP_NAME';" )"
|
||||
if [ ! -n "$M_CUR_REPL_STATE_INFO" ]
|
||||
then
|
||||
echo "UNKNOWN: Can't retreive current replication state information from master server"
|
||||
exit 3
|
||||
fi
|
||||
debug "Master current replication state:\n\tstate|sync_state\n\t$M_CUR_REPL_STATE_INFO"
|
||||
debug "Master current replication state:\n\tstate|sync_state|sent_lsn|write_lsn\n\t$M_CUR_REPL_STATE_INFO"
|
||||
|
||||
M_CUR_STATE=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f1 )
|
||||
debug "Master current state: $M_CUR_STATE"
|
||||
|
@ -343,46 +344,59 @@ then
|
|||
exit 2
|
||||
fi
|
||||
|
||||
# Check current master XLOG file vs last replay XLOG file
|
||||
if [ "$CHECK_CUR_MASTER_XLOG" == "1" ]
|
||||
M_CUR_SENT_LSN=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f3 )
|
||||
M_CUR_WRITED_LSN=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f4 )
|
||||
debug "Master current last sent/writed LSN: '$M_CUR_SENT_LSN' / '$M_CUR_WRITED_LSN'"
|
||||
|
||||
# Check current master LSN vs last received LSN
|
||||
if [ "$CHECK_CUR_MASTER_LSN" == "1" ]
|
||||
then
|
||||
# Get current xlog file from master
|
||||
M_CUR_XLOG="$( psql_master_get 'SELECT pg_current_xlog_location()' )"
|
||||
if [ ! -n "$M_CUR_XLOG" ]
|
||||
# Get current LSN from master
|
||||
M_CUR_LSN="$( psql_master_get "SELECT $pg_current_wal_lsn" )"
|
||||
if [ ! -n "$M_CUR_LSN" ]
|
||||
then
|
||||
echo "UNKNOWN: Can't retreive current xlog from master server"
|
||||
echo "UNKNOWN: Can't retreive current LSN from master server"
|
||||
exit 3
|
||||
fi
|
||||
debug "Master current xlog: $M_CUR_XLOG"
|
||||
debug "Master current LSN: $M_CUR_LSN"
|
||||
|
||||
# Master current xlog is the last receive xlog ?
|
||||
if [ "$M_CUR_XLOG" != "$LAST_XLOG_RECEIVE" ]
|
||||
# Master current LSN is the last received LSN ?
|
||||
if [ "$M_CUR_LSN" != "$LAST_RECEIVED_LSN" ]
|
||||
then
|
||||
echo "CRITICAL: Master current xlog is not the last receive xlog"
|
||||
echo "CRITICAL: Master current LSN is not the last received LSN"
|
||||
exit 2
|
||||
fi
|
||||
debug "Master current xlog is the last receive xlog"
|
||||
debug "Master current LSN is the last received LSN"
|
||||
fi
|
||||
|
||||
# The last receive xlog is the last replay file ?
|
||||
if [ "$LAST_XLOG_RECEIVE" != "$LAST_XLOG_REPLAY" ]
|
||||
# Is the last received LSN also the last replayed LSN?
|
||||
if [ "$LAST_RECEIVED_LSN" != "$LAST_REPLAYED_LSN" ]
|
||||
then
|
||||
debug "/!\ The last receive xlog is NOT the last replay file ('$M_CUR_XLOG' / '$LAST_XLOG_RECEIVE')"
|
||||
# Report the two values actually compared above (received vs replayed);
# $M_CUR_LSN is only set when the master LSN check (-C 1) is enabled.
debug "/!\ The last received LSN is NOT the last replayed LSN ('$LAST_RECEIVED_LSN' / '$LAST_REPLAYED_LSN')"
|
||||
REPLAY_DELAY="$( psql_get 'SELECT EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp());' )"
|
||||
debug "Replay delay is $REPLAY_DELAY second(s)"
|
||||
if [ $( echo "$REPLAY_DELAY >= $REPLAY_CRITICAL_DELAY"|bc -l ) -gt 0 ]
|
||||
then
|
||||
echo "CRITICAL: last receive xlog file is not the last replay file ('$LAST_XLOG_RECEIVE' / '$LAST_XLOG_REPLAY') and replay delay is $REPLAY_DELAY second(s)"
|
||||
echo "CRITICAL: last received LSN is not the last replayed ('$LAST_RECEIVED_LSN' / '$LAST_REPLAYED_LSN') and replay delay is $REPLAY_DELAY second(s)"
|
||||
exit 2
|
||||
fi
|
||||
if [ $( echo "$REPLAY_DELAY >= $REPLAY_WARNING_DELAY"|bc -l ) -gt 0 ]
|
||||
then
|
||||
echo "WARNING: last receive xlog file is not the last replay file ('$LAST_XLOG_RECEIVE' / '$LAST_XLOG_REPLAY') and replay delay is $REPLAY_DELAY second(s)"
|
||||
# Same wording as the CRITICAL message: both sides of the comparison are LSNs, not files.
echo "WARNING: last received LSN is not the last replayed ('$LAST_RECEIVED_LSN' / '$LAST_REPLAYED_LSN') and replay delay is $REPLAY_DELAY second(s)"
|
||||
exit 1
|
||||
fi
|
||||
debug "Replay delay is not worrying"
|
||||
fi
|
||||
debug "Last receive xlog file is the last replay file"
|
||||
debug "Last received LSN is the last replayed file"
|
||||
|
||||
# Is the master's last sent LSN also the last one received (and synced to disk) by this standby?
|
||||
if [ "$M_CUR_SENT_LSN" != "$LAST_RECEIVED_LSN" ]
|
||||
then
|
||||
echo "WARNING: master last sent LSN is not already received (and synced to disk) by slave. May be we have some network delay or load on slave"
|
||||
echo "Master last sent LSN: $M_CUR_SENT_LSN"
|
||||
echo "Slave last received (and synced to disk) LSN: $LAST_RECEIVED_LSN"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "OK: Hot-standby server is uptodate"
|
||||
exit 0
|
||||
|
@ -397,12 +411,21 @@ else
|
|||
fi
|
||||
debug "Postgres is not in recovery mode"
|
||||
|
||||
# Retrieve current LSN
|
||||
CURRENT_LSN=$( psql_get "SELECT $pg_current_wal_lsn" )
|
||||
if [ -z "$CURRENT_LSN" ]
|
||||
then
|
||||
echo "UNKNOWN: Fail to retreive current LSN (Log Sequence Number)"
|
||||
exit 3
|
||||
fi
|
||||
debug "Current LSN: $CURRENT_LSN"
|
||||
|
||||
# Check standby client
|
||||
STANDBY_CLIENTS=$( psql_get "SELECT application_name, client_addr, sent_location, write_location, state, sync_state, current_lag
|
||||
STANDBY_CLIENTS=$( psql_get "SELECT application_name, client_addr, sent_lsn, write_lsn, state, sync_state, current_lag
|
||||
FROM (
|
||||
SELECT application_name, client_addr, sent_location, write_location, state, sync_state, current_lag
|
||||
SELECT application_name, client_addr, sent_lsn, write_lsn, state, sync_state, current_lag
|
||||
FROM (
|
||||
SELECT application_name, client_addr, $sent_lsn AS sent_location, $write_lsn AS write_location, state, sync_state,
|
||||
SELECT application_name, client_addr, $sent_lsn AS sent_lsn, $write_lsn AS write_lsn, state, sync_state,
|
||||
$pg_wal_lsn_diff($pg_current_wal_lsn, $write_lsn) AS current_lag
|
||||
FROM pg_stat_replication
|
||||
) AS s2
|
||||
|
@ -416,20 +439,32 @@ else
|
|||
|
||||
STANDBY_CLIENTS_TXT=""
|
||||
STANDBY_CLIENTS_COUNT=0
|
||||
CURRENT_LSN_IS_LAST_SENT=1
|
||||
for line in $STANDBY_CLIENTS
|
||||
do
|
||||
let STANDBY_CLIENTS_COUNT=STANDBY_CLIENTS_COUNT+1
|
||||
|
||||
NAME=$( echo $line|cut -d '|' -f 1 )
|
||||
IP=$( echo $line|cut -d '|' -f 2 )
|
||||
SENT_LOCATION=$( echo $line|cut -d '|' -f 3 )
|
||||
WRITE_LOCATION=$( echo $line|cut -d '|' -f 4 )
|
||||
SENT_LSN=$( echo $line|cut -d '|' -f 3 )
|
||||
WRITED_LSN=$( echo $line|cut -d '|' -f 4 )
|
||||
STATE=$( echo $line|cut -d '|' -f 5 )
|
||||
SYNC_STATE=$( echo $line|cut -d '|' -f 6 )
|
||||
LAG=$( echo $line|cut -d '|' -f 7 )
|
||||
STANDBY_CLIENTS_TXT="$STANDBY_CLIENTS_TXT\n$NAME ($IP): $STATE/$SYNC_STATE (Location: sent='$SENT_LOCATION' / write='$WRITE_LOCATION', Lag: ${LAG}b)"
|
||||
STANDBY_CLIENTS_TXT="$STANDBY_CLIENTS_TXT\n$NAME ($IP): $STATE/$SYNC_STATE (LSN: sent='$SENT_LSN' / writed='$WRITED_LSN', Lag: ${LAG}b)"
|
||||
[ "$SENT_LSN" != "$CURRENT_LSN" ] && CURRENT_LSN_IS_LAST_SENT=0
|
||||
done
|
||||
|
||||
echo -e "OK: $STANDBY_CLIENTS_COUNT stand-by client(s) connected\n$STANDBY_CLIENTS_TXT"
|
||||
exit 0
|
||||
if [ $CURRENT_LSN_IS_LAST_SENT -eq 1 ]
|
||||
then
|
||||
echo "OK: $STANDBY_CLIENTS_COUNT stand-by client(s) connected"
|
||||
EXIT_CODE=0
|
||||
else
|
||||
echo "WARNING: current master LSN is not the last sent to stand-by client(s) connected. May be we have some load ?"
|
||||
EXIT_CODE=1
|
||||
fi
|
||||
|
||||
echo "Current master LSN: $CURRENT_LSN"
|
||||
echo -e "$STANDBY_CLIENTS_TXT"
|
||||
exit $EXIT_CODE
|
||||
fi
|
||||
|
|
Loading…
Reference in a new issue