Make the check more adjustable to allow some delay between xlog files sent by master, received and replayed
This commit is contained in:
parent
327f382b30
commit
b092186b89
2 changed files with 111 additions and 27 deletions
37
README.md
37
README.md
|
@ -11,8 +11,13 @@ This script :
|
|||
- retrieve from Postgres the last _xlog_ file received and the last _xlog_ file replayed
|
||||
- check if Postgres recovery configuration file is NOT present (_CRITICAL_ raised if present)
|
||||
- retrieve master connection information from the Postgres recovery configuration file (_UNKNOWN_ raised on error). The default Postgres master TCP port is used if no port is specified.
|
||||
- retrieve the current _xlog_ file from the Postgres master server by connecting to it (_UNKNOWN_ raised on error).
|
||||
- check if the last received _xlog_ file is the last replayed _xlog_ file (_WARNING_ raised if not)
|
||||
- retrieve the current state and sync state of the host from the Postgres master server by connecting to it (_UNKNOWN_ raised on error).
|
||||
- check if the current state of the host is "streaming" (_CRITICAL_ raised if not)
|
||||
- check if the current sync state of the host is "sync" (_CRITICAL_ raised if not)
|
||||
- if the check of the current XLOG file of the master host is enabled :
|
||||
- retrieve the current _xlog_ file from the Postgres master server by connecting to it (_UNKNOWN_ raised on error).
|
||||
- check if the current master _xlog_ file is the last received _xlog_ file (_CRITICAL_ raised if not)
|
||||
- check if the last received _xlog_ file is the last replayed _xlog_ file: if not, compare the current delay since the last replayed transaction against the _replay_warn_delay_ and _replay_crit_delay_ thresholds and raise the corresponding error if one of them is exceeded
|
||||
- Return _OK_ state
|
||||
- if Postgres is not in recovery mode :
|
||||
- check if Postgres recovery configuration file is present (_CRITICAL_ raised if present)
|
||||
|
@ -33,17 +38,23 @@ Requirements
|
|||
Usage
|
||||
-----
|
||||
|
||||
Usage : ./check_pg_streaming_replication [-h] [-d] [options]
|
||||
-u pg_user Specify Postgres user (Default : postgres)
|
||||
-b psql_bin Specify psql binary path (Default : /usr/bin/psql)
|
||||
-m pg_main Specify Postgres main directory path
|
||||
(Default : /var/lib/postgresql/9.1/main)
|
||||
-r recovery_conf Specify Postgres recovery configuration file path
|
||||
(Default : /var/lib/postgresql/9.1/main/recovery.conf)
|
||||
-p pg_port Specify default Postgres master TCP port (Default : 5432)
|
||||
-D dbname Specify DB name on Postgres hosts to connect on (Default : postgres)
|
||||
-d Debug mode
|
||||
-h Show this message
|
||||
Usage : check_pg_streaming_replication [-d] [-h] [options]
|
||||
-u pg_user Specify Postgres user (Default : postgres)
|
||||
-b psql_bin Specify psql binary path (Default : /usr/bin/psql)
|
||||
-m pg_main Specify Postgres main directory path
|
||||
(By default, try to auto-detect it, on your system it :
|
||||
/var/lib/postgresql/9.6/main)
|
||||
-r recovery_conf Specify Postgres recovery configuration file path
|
||||
(Default : [PG_MAIN]/recovery.conf)
|
||||
-U pg_master_user Specify Postgres user to use on master (Default : user from recovery.conf file)
|
||||
-p pg_port Specify default Postgres master TCP port (Default : 5432)
|
||||
-D dbname Specify DB name on Postgres master/slave to connect on (Default : PG_USER)
|
||||
-C 1/0 Enable or disable check if the current XLOG file of the master host is the same
|
||||
of the last replay XLOG file (Default : 1)
|
||||
-w replay_warn_delay Specify the replay warning delay in second (Default : 3)
|
||||
-c replay_crit_delay Specify the replay critical delay in second (Default : 5)
|
||||
-d Debug mode
|
||||
-h Show this message
|
||||
|
||||
Copyright
|
||||
---------
|
||||
|
|
|
@ -35,7 +35,11 @@ fi
|
|||
RECOVERY_CONF_FILENAME=recovery.conf
|
||||
RECOVERY_CONF=""
|
||||
PG_DEFAULT_PORT=5432
|
||||
PG_DEFAULT_APP_NAME=$( hostname )
|
||||
PG_DB=""
|
||||
CHECK_CUR_MASTER_XLOG=1
|
||||
REPLAY_WARNING_DELAY=3
|
||||
REPLAY_CRITICAL_DELAY=5
|
||||
|
||||
DEBUG=0
|
||||
|
||||
|
@ -52,13 +56,17 @@ Usage : $0 [-d] [-h] [options]
|
|||
-U pg_master_user Specify Postgres user to use on master (Default : user from recovery.conf file)
|
||||
-p pg_port Specify default Postgres master TCP port (Default : $PG_DEFAULT_PORT)
|
||||
-D dbname Specify DB name on Postgres master/slave to connect on (Default : PG_USER)
|
||||
-C 1/0 Enable or disable check if the current XLOG file of the master host is the same
|
||||
of the last replay XLOG file (Default : $CHECK_CUR_MASTER_XLOG)
|
||||
-w replay_warn_delay Specify the replay warning delay in second (Default : $REPLAY_WARNING_DELAY)
|
||||
-c replay_crit_delay Specify the replay critical delay in second (Default : $REPLAY_CRITICAL_DELAY)
|
||||
-d Debug mode
|
||||
-h Show this message
|
||||
EOF
|
||||
exit 0
|
||||
}
|
||||
|
||||
while getopts "hu:b:m:r:U:p:D:d" OPTION
|
||||
while getopts "hu:b:m:r:U:p:D:C:w:c:d" OPTION
|
||||
do
|
||||
case $OPTION in
|
||||
u)
|
||||
|
@ -82,6 +90,15 @@ do
|
|||
D)
|
||||
PG_DB=$OPTARG
|
||||
;;
|
||||
C)
|
||||
CHECK_CUR_MASTER_XLOG=$OPTARG
|
||||
;;
|
||||
w)
|
||||
REPLAY_WARNING_DELAY=$OPTARG
|
||||
;;
|
||||
c)
|
||||
REPLAY_CRITICAL_DELAY=$OPTARG
|
||||
;;
|
||||
d)
|
||||
DEBUG=1
|
||||
;;
|
||||
|
@ -134,11 +151,17 @@ function debug() {
|
|||
}
|
||||
|
||||
debug "Running options :
|
||||
PG_DB = $PG_DB
|
||||
PG_USER = $PG_USER
|
||||
PSQL_BIN = $PSQL_BIN
|
||||
PG_MAIN = $PG_MAIN
|
||||
RECOVERY_CONF = $RECOVERY_CONF
|
||||
PG_DEFAULT_PORT = $PG_DEFAULT_PORT"
|
||||
PG_DEFAULT_PORT = $PG_DEFAULT_PORT
|
||||
PG_DEFAULT_APP_NAME = $PG_DEFAULT_APP_NAME
|
||||
CHECK_CUR_MASTER_XLOG = $CHECK_CUR_MASTER_XLOG
|
||||
REPLAY_WARNING_DELAY = $REPLAY_WARNING_DELAY
|
||||
REPLAY_CRITICAL_DELAY = $REPLAY_CRITICAL_DELAY
|
||||
"
|
||||
|
||||
# Postgres is running ?
|
||||
if [ $DEBUG -eq 0 ]
|
||||
|
@ -215,29 +238,79 @@ then
|
|||
debug "Master user : $M_USER"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Get current xlog file from master
|
||||
M_CUR_XLOG="$( psql_master_get 'SELECT pg_current_xlog_location()' )"
|
||||
if [ ! -n "$M_CUR_XLOG" ]
|
||||
|
||||
M_APP_NAME=$( echo "$MASTER_CONN_INFOS"| grep 'application_name=' | sed "s/^.*application_name=[ \'\"]*\([^ \'\"]\+\)[ \'\"]*.*$/\1/" )
|
||||
if [ ! -n "$M_APP_NAME" ]
|
||||
then
|
||||
echo "UNKNOWN : Can't retreive current xlog from master server"
|
||||
debug "Master application name not specified, use default : $PG_DEFAULT_APP_NAME"
|
||||
M_APP_NAME=$PG_DEFAULT_APP_NAME
|
||||
else
|
||||
debug "Master application name : $M_APP_NAME"
|
||||
fi
|
||||
|
||||
# Get current state/sync_state from master
|
||||
M_CUR_STATE_SYNC_STATE="$( psql_master_get "SELECT state,sync_state FROM pg_stat_replication WHERE application_name='$M_APP_NAME';" )"
|
||||
if [ ! -n "$M_CUR_STATE_SYNC_STATE" ]
|
||||
then
|
||||
echo "UNKNOWN : Can't retreive current state and sync state from master server"
|
||||
exit 3
|
||||
fi
|
||||
debug "Master current xlog : $M_CUR_XLOG"
|
||||
debug "Master current state / sync_state : $M_CUR_STATE_SYNC_STATE"
|
||||
|
||||
# Master current xlog is the last receive xlog ?
|
||||
if [ "$M_CUR_XLOG" != "$LAST_XLOG_RECEIVE" ]
|
||||
M_CUR_STATE=$( echo "$M_CUR_STATE_SYNC_STATE"|cut -d'|' -f1 )
|
||||
debug "Master current state : $M_CUR_STATE"
|
||||
if [ "$M_CUR_STATE" != "streaming" ]
|
||||
then
|
||||
echo "CRITICAL : Master current xlog is not the last receive xlog"
|
||||
echo "CRITICAL : this host is not in streaming state according to master host (current state = '$M_CUR_STATE')"
|
||||
exit 2
|
||||
fi
|
||||
debug "Master current xlog is the last receive xlog"
|
||||
|
||||
M_CUR_SYNC_STATE=$( echo "$M_CUR_STATE_SYNC_STATE"|cut -d'|' -f2 )
|
||||
debug "Master current sync state : $M_CUR_SYNC_STATE"
|
||||
if [ "$M_CUR_SYNC_STATE" != "sync" ]
|
||||
then
|
||||
echo "CRITICAL : this host is not synchronized according to master host (current sync state = '$M_CUR_SYNC_STATE')"
|
||||
exit 2
|
||||
fi
|
||||
|
||||
# Check current master XLOG file vs last replay XLOG file
|
||||
if [ "$CHECK_CUR_MASTER_XLOG" == "1" ]
|
||||
then
|
||||
# Get current xlog file from master
|
||||
M_CUR_XLOG="$( psql_master_get 'SELECT pg_current_xlog_location()' )"
|
||||
if [ ! -n "$M_CUR_XLOG" ]
|
||||
then
|
||||
echo "UNKNOWN : Can't retreive current xlog from master server"
|
||||
exit 3
|
||||
fi
|
||||
debug "Master current xlog : $M_CUR_XLOG"
|
||||
|
||||
# Master current xlog is the last receive xlog ?
|
||||
if [ "$M_CUR_XLOG" != "$LAST_XLOG_RECEIVE" ]
|
||||
then
|
||||
echo "CRITICAL : Master current xlog is not the last receive xlog"
|
||||
exit 2
|
||||
fi
|
||||
debug "Master current xlog is the last receive xlog"
|
||||
fi
|
||||
|
||||
# The last receive xlog is the last replay file ?
|
||||
if [ "$LAST_XLOG_RECEIVE" != "$LAST_XLOG_REPLAY" ]
|
||||
then
|
||||
echo "WARNING : last receive xlog file is not the last replay file"
|
||||
exit 1
|
||||
debug "/!\ The last receive xlog is NOT the last replay file ('$M_CUR_XLOG' / '$LAST_XLOG_RECEIVE')"
|
||||
REPLAY_DELAY="$( psql_get 'SELECT EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp());' )"
|
||||
debug "Replay delay is $REPLAY_DELAY second(s)"
|
||||
if [ $( echo "$REPLAY_DELAY >= $REPLAY_CRITICAL_DELAY"|bc -l ) -gt 0 ]
|
||||
then
|
||||
echo "CRITICAL : last receive xlog file is not the last replay file ('$LAST_XLOG_RECEIVE' / '$LAST_XLOG_REPLAY') and replay delay is $REPLAY_DELAY second(s)"
|
||||
exit 2
|
||||
fi
|
||||
if [ $( echo "$REPLAY_DELAY >= $REPLAY_WARNING_DELAY"|bc -l ) -gt 0 ]
|
||||
then
|
||||
echo "WARNING : last receive xlog file is not the last replay file ('$LAST_XLOG_RECEIVE' / '$LAST_XLOG_REPLAY') and replay delay is $REPLAY_DELAY second(s)"
|
||||
exit 1
|
||||
fi
|
||||
debug "Replay delay is not worrying"
|
||||
fi
|
||||
debug "Last receive xlog file is the last replay file"
|
||||
|
||||
|
|
Loading…
Reference in a new issue