Compare commits

..

No commits in common. "8d172e944c71a82ab704f80c6b48100475f17a85" and "8b03a8bc21647258257571380701fad9496779c1" have entirely different histories.

3 changed files with 328 additions and 357 deletions

View file

@@ -1,25 +0,0 @@
# Pre-commit hooks to run tests and ensure code is cleaned.
# See https://pre-commit.com for more information
---
repos:
- repo: https://github.com/codespell-project/codespell
rev: v2.2.2
hooks:
- id: codespell
args:
- --ignore-words-list=exten
- --skip="./.*,*.csv,*.json,*.ini,*.subject,*.txt,*.html,*.log,*.conf"
- --quiet-level=2
- --ignore-regex=.*codespell-ignore$
# - --write-changes # Uncomment to write changes
exclude_types: [csv, json]
- repo: https://github.com/adrienverge/yamllint
rev: v1.32.0
hooks:
- id: yamllint
ignore: .github/
- repo: https://github.com/pre-commit/mirrors-prettier
rev: v2.7.1
hooks:
- id: prettier
args: ["--print-width", "100"]

View file

@@ -7,14 +7,14 @@ This script :
- check if Postgres is running (_CRITICAL_ raise if not) - check if Postgres is running (_CRITICAL_ raise if not)
- check if Postgres is in recovery mode : - check if Postgres is in recovery mode :
- if Postgres is in recovery mode : - if Postgres is in recovery mode :
- retrieve from Postgres the last _xlog_ file receive and the _xlog_ file replay - retreive from Postgres the last _xlog_ file receive and the _xlog_ file replay
- check if Postgres recovery configuration file is NOT present (_CRITICAL_ raise if present) - check if Postgres recovery configuration file is NOT present (_CRITICAL_ raise if present)
- retrieve master connection information from Postgres recovery configuration file (_UNKNOWN_ raise on error). Default Postgres master TCP port will be used if port is not specify. - retreive master connection informations from Postgres recovery configuration file (_UNKNOWN_ raise on error). Default Postgres master TCP port will be used if port is not specify.
- retrieve the current state and sync state of the host from Postgres master server by making a connection on master server (_UNKNOWN_ raise on error). - retreive the current state and sync state of the host from Postgres master server by making a connection on master server (_UNKNOWN_ raise on error).
- check if the current state of the host is "streaming" (_CRITICAL_ raise if not) - check if the current state of the host is "streaming" (_CRITICAL_ raise if not)
- check if the current sync state of the host is "sync" (or the state specified using `-e` parameter, _CRITICAL_ raise if not) - check if the current sync state of the host is "sync" (or the state specified using `-e` parameter, _CRITICAL_ raise if not)
- if the check of the current XLOG file of the master host is enabled : - if the check of the current XLOG file of the master host is enabled :
- retrieve current _xlog_ file from Postgres master server by making a connection on master server (_UNKNOWN_ raise on error). - retreive current _xlog_ file from Postgres master server by making a connection on master server (_UNKNOWN_ raise on error).
- check if the current master _xlog_ file is the last received _xlog_ file (_CRITICAL_ raise if not) - check if the current master _xlog_ file is the last received _xlog_ file (_CRITICAL_ raise if not)
- check if the last received _xlog_ file is the last replay _xlog_ file : if not, check the current delay with the last replayed transaction against _replay_warn_delay_ and _replay_crit_delay_ thresholds and raise corresponding error if they are exceeded - check if the last received _xlog_ file is the last replay _xlog_ file : if not, check the current delay with the last replayed transaction against _replay_warn_delay_ and _replay_crit_delay_ thresholds and raise corresponding error if they are exceeded
- Return _OK_ state - Return _OK_ state
@@ -27,11 +27,11 @@ This script :
## Requirements ## Requirements
- Some CLI tools: `sudo`, `awk`, `sed`, `bc`, `psql` and `pg_lscluster` * Some CLI tools: `sudo`, `awk`, `sed`, `bc`, `psql` and `pg_lscluster`
- **On master node:** Slaves must be able to connect with user from `recovery.conf` / `postgresql.auto.conf` (or user specify using `-U`) to database with the same name (or another specified with `-D`) as `trust` (or using password specified in `~/.pgpass`). This user must have `SUPERUSER` privilege (need to get replication details). * **On master node:** Slaves must be able to connect with user from `recovery.conf` / `postgresql.auto.conf` (or user specify using `-U`) to database with the same name (or another specified with `-D`) as `trust` (or using password specified in `~/.pgpass`). This user must have `SUPERUSER` privilege (need to get replication details).
- **On standby node:** `PG_USER` must be able to connect locally on the database with the same name `(or another specified with -D)` as `trust` (or using password specified in `~/.pgpass`). * **On standby node:** `PG_USER` must be able to connect localy on the database with the same name `(or another specified with -D)` as `trust` (or using password specified in `~/.pgpass`).
## Installation ## Installation

View file

@@ -1,5 +1,4 @@
#!/bin/bash #!/bin/bash
# vim: tabstop=4 shiftwidth=4 softtabstop=4 expandtab
# #
# Nagios plugin to check Postgresql streamin replication state # Nagios plugin to check Postgresql streamin replication state
# #
@@ -15,7 +14,7 @@
# ~/.pgpass). This user must have SUPERUSER privilege (need to get replication # ~/.pgpass). This user must have SUPERUSER privilege (need to get replication
# details). # details).
# #
# On standby node: PG_USER must be able to connect locally on the database with the same name # On standby node: PG_USER must be able to connect localy on the database with the same name
# (or another specified with -D) as trust (or using password specified in # (or another specified with -D) as trust (or using password specified in
# ~/.pgpass). # ~/.pgpass).
# #
@@ -47,95 +46,95 @@ EXPECTED_SYNC_STATE=sync
DEBUG=0 DEBUG=0
function usage () { function usage () {
ERROR="$1" ERROR="$1"
[ -n "$ERROR" ] && echo -e "$ERROR\n" [ -n "$ERROR" ] && echo -e "$ERROR\n"
cat << EOF cat << EOF
Usage: $0 [-d] [-h] [options] Usage: $0 [-d] [-h] [options]
-u pg_user Specify local Postgres user (Default: try to auto-detect or use $DEFAULT_PG_USER) -u pg_user Specify local Postgres user (Default: try to auto-detect or use $DEFAULT_PG_USER)
-b psql_bin Specify psql binary path (Default: $PSQL_BIN) -b psql_bin Specify psql binary path (Default: $PSQL_BIN)
-B pg_lsclusters_bin Specify pg_lsclusters binary path (Default: $PG_LSCLUSTER_BIN) -B pg_lsclusters_bin Specify pg_lsclusters binary path (Default: $PG_LSCLUSTER_BIN)
-V pg_version Specify Postgres version (Default: try to auto-detect or use $DEFAULT_PG_VERSION) -V pg_version Specify Postgres version (Default: try to auto-detect or use $DEFAULT_PG_VERSION)
-m pg_main Specify Postgres main directory path (Default: try to auto-detect or use -m pg_main Specify Postgres main directory path (Default: try to auto-detect or use
$DEFAULT_PG_MAIN) $DEFAULT_PG_MAIN)
-r recovery_conf Specify Postgres recovery configuration file path -r recovery_conf Specify Postgres recovery configuration file path
( Default: [PG_MAIN]/recovery.conf on PG <= 11, [PG_MAIN]/postgresql.auto.conf on PG >= 12) (Default: [PG_MAIN]/recovery.conf on PG <= 11, [PG_MAIN]/postgresql.auto.conf on PG >= 12)
-U pg_master_user Specify Postgres user to use on master (Default: user from recovery.conf file) -U pg_master_user Specify Postgres user to use on master (Default: user from recovery.conf file)
-p pg_port Specify default Postgres master TCP port (Default: same as local PostgreSQL -p pg_port Specify default Postgres master TCP port (Default: same as local PostgreSQL
port if detected or use $DEFAULT_PG_PORT) port if detected or use $DEFAULT_PG_PORT)
-D dbname Specify DB name on Postgres master/slave to connect on (Default: PG_USER, must -D dbname Specify DB name on Postgres master/slave to connect on (Default: PG_USER, must
match with .pgpass one is used) match with .pgpass one is used)
-C 1/0 Enable or disable check if the current LSN of the master host is the same -C 1/0 Enable or disable check if the current LSN of the master host is the same
of the last received LSN (Default: $CHECK_CUR_MASTER_LSN) of the last received LSN (Default: $CHECK_CUR_MASTER_LSN)
-w replay_warn_delay Specify the replay warning delay in second (Default: $REPLAY_WARNING_DELAY) -w replay_warn_delay Specify the replay warning delay in second (Default: $REPLAY_WARNING_DELAY)
-c replay_crit_delay Specify the replay critical delay in second (Default: $REPLAY_CRITICAL_DELAY) -c replay_crit_delay Specify the replay critical delay in second (Default: $REPLAY_CRITICAL_DELAY)
-e expected_sync_state The expected replication state ('sync' or 'async', default: $EXPECTED_SYNC_STATE) -e expected_sync_state The expected replication state ('sync' or 'async', default: $EXPECTED_SYNC_STATE)
-d Debug mode -d Debug mode
-h Show this message -h Show this message
EOF EOF
[ -n "$ERROR" ] && exit 1 || exit 0 [ -n "$ERROR" ] && exit 1 || exit 0
} }
while getopts "hu:b:B:V:m:r:U:p:D:C:w:c:e:d" OPTION while getopts "hu:b:B:V:m:r:U:p:D:C:w:c:e:d" OPTION
do do
case $OPTION in case $OPTION in
u) u)
PG_USER=$OPTARG PG_USER=$OPTARG
;; ;;
b) b)
PSQL_BIN=$OPTARG PSQL_BIN=$OPTARG
;; ;;
B) B)
PG_LSCLUSTER_BIN=$OPTARG PG_LSCLUSTER_BIN=$OPTARG
;; ;;
V) V)
PG_VERSION=$OPTARG PG_VERSION=$OPTARG
;; ;;
m) m)
PG_MAIN=$OPTARG PG_MAIN=$OPTARG
;; ;;
r) r)
RECOVERY_CONF=$OPTARG RECOVERY_CONF=$OPTARG
;; ;;
U) U)
PG_MASTER_USER=$OPTARG PG_MASTER_USER=$OPTARG
;; ;;
p) p)
PG_DEFAULT_PORT=$OPTARG PG_DEFAULT_PORT=$OPTARG
;; ;;
D) D)
PG_DB=$OPTARG PG_DB=$OPTARG
;; ;;
C) C)
CHECK_CUR_MASTER_LSN=$OPTARG CHECK_CUR_MASTER_LSN=$OPTARG
;; ;;
w) w)
REPLAY_WARNING_DELAY=$OPTARG REPLAY_WARNING_DELAY=$OPTARG
;; ;;
c) c)
REPLAY_CRITICAL_DELAY=$OPTARG REPLAY_CRITICAL_DELAY=$OPTARG
;; ;;
e) e)
[ "$OPTARG" != "sync" -a "$OPTARG" != "async" ] && \ [ "$OPTARG" != "sync" -a "$OPTARG" != "async" ] && \
usage "Invalid expected replication state '$OPTARG'. Possible values: sync or async." usage "Invalid expected replication state '$OPTARG'. Possible values: sync or async."
EXPECTED_SYNC_STATE=$OPTARG EXPECTED_SYNC_STATE=$OPTARG
;; ;;
d) d)
DEBUG=1 DEBUG=1
;; ;;
h) h)
usage usage
;; ;;
\?) \?)
echo -n "Unknown option" echo -n "Unkown option"
usage usage
esac esac
done done
function debug() { function debug() {
if [ $DEBUG -eq 1 ] if [ $DEBUG -eq 1 ]
then then
>&2 echo -e "[DEBUG] $1" >&2 echo -e "[DEBUG] $1"
fi fi
} }
debug "Starting options (before handling auto-detection/default values): debug "Starting options (before handling auto-detection/default values):
@@ -156,19 +155,19 @@ REPLAY_CRITICAL_DELAY = $REPLAY_CRITICAL_DELAY
# Auto-detect PostgreSQL information using pg_lsclusters # Auto-detect PostgreSQL information using pg_lsclusters
if [ -x "$PG_LSCLUSTER_BIN" ] if [ -x "$PG_LSCLUSTER_BIN" ]
then then
PG_CLUSTER=$( $PG_LSCLUSTER_BIN -h 2>/dev/null|head -n1 ) PG_CLUSTER=$( $PG_LSCLUSTER_BIN -h 2>/dev/null|head -n1 )
if [ -n "$PG_CLUSTER" ] if [ -n "$PG_CLUSTER" ]
then then
debug "pg_lsclusters output:\n\t$PG_CLUSTER" debug "pg_lsclusters output:\n\t$PG_CLUSTER"
# Output example: # Output example:
# 9.6 main 5432 online,recovery postgres /var/lib/postgresql/9.6/main /var/log/postgresql/postgresql-9.6-main.log # 9.6 main 5432 online,recovery postgres /var/lib/postgresql/9.6/main /var/log/postgresql/postgresql-9.6-main.log
[ -z "$PG_VERSION" ] && PG_VERSION=$( echo "$PG_CLUSTER"|awk -F ' +' '{print $1}' ) [ -z "$PG_VERSION" ] && PG_VERSION=$( echo "$PG_CLUSTER"|awk -F ' +' '{print $1}' )
[ -z "$PG_DEFAULT_PORT" ] && PG_DEFAULT_PORT=$( echo "$PG_CLUSTER"|awk -F ' +' '{print $3}' ) [ -z "$PG_DEFAULT_PORT" ] && PG_DEFAULT_PORT=$( echo "$PG_CLUSTER"|awk -F ' +' '{print $3}' )
[ -z "$PG_USER" ] && PG_USER=$( echo "$PG_CLUSTER"|awk -F ' +' '{print $5}' ) [ -z "$PG_USER" ] && PG_USER=$( echo "$PG_CLUSTER"|awk -F ' +' '{print $5}' )
[ -z "$PG_MAIN" ] && PG_MAIN=$( echo "$PG_CLUSTER"|awk -F ' +' '{print $6}' ) [ -z "$PG_MAIN" ] && PG_MAIN=$( echo "$PG_CLUSTER"|awk -F ' +' '{print $6}' )
fi fi
else else
debug "pg_lsclusters not found ($PG_LSCLUSTER_BIN): parameters auto-detection disabled" debug "pg_lsclusters not found ($PG_LSCLUSTER_BIN): parameters auto-detection disabled"
fi fi
# If auto-detection failed, use default values # If auto-detection failed, use default values
@@ -190,8 +189,8 @@ id "$PG_USER" > /dev/null 2>&1
# Check RECOVERY_CONF # Check RECOVERY_CONF
if [ -z "$RECOVERY_CONF" ]; then if [ -z "$RECOVERY_CONF" ]; then
[ $PG_VERSION -le 11 ] && RECOVERY_CONF_FILENAME="recovery.conf" || RECOVERY_CONF_FILENAME="postgresql.auto.conf" [ $PG_VERSION -le 11 ] && RECOVERY_CONF_FILENAME="recovery.conf" || RECOVERY_CONF_FILENAME="postgresql.auto.conf"
RECOVERY_CONF="$PG_MAIN/$RECOVERY_CONF_FILENAME" RECOVERY_CONF="$PG_MAIN/$RECOVERY_CONF_FILENAME"
fi fi
# Check PG_DEFAULT_PORT # Check PG_DEFAULT_PORT
@@ -201,15 +200,15 @@ fi
[ -z "$PG_DB" ] && PG_DB="$PG_USER" [ -z "$PG_DB" ] && PG_DB="$PG_USER"
function psql_get () { function psql_get () {
sql="$1" sql="$1"
debug "Exec 'echo \"$sql\"|sudo -u $PG_USER $PSQL_BIN -d \"$PG_DB\" -w -t -P format=unaligned" debug "Exec 'echo \"$sql\"|sudo -u $PG_USER $PSQL_BIN -d \"$PG_DB\" -w -t -P format=unaligned"
echo "$sql"|sudo -u $PG_USER $PSQL_BIN -d "$PG_DB" -w -t -P format=unaligned echo "$sql"|sudo -u $PG_USER $PSQL_BIN -d "$PG_DB" -w -t -P format=unaligned
} }
function psql_master_get () { function psql_master_get () {
sql="$1" sql="$1"
debug "Exec 'echo \"$sql\"|sudo -u $PG_USER $PSQL_BIN -U $M_USER -h $M_HOST -w -p $M_PORT -d $PG_DB -t -P format=unaligned" debug "Exec 'echo \"$sql\"|sudo -u $PG_USER $PSQL_BIN -U $M_USER -h $M_HOST -w -p $M_PORT -d $PG_DB -t -P format=unaligned"
echo "$sql"|sudo -u $PG_USER $PSQL_BIN -U $M_USER -h $M_HOST -w -p $M_PORT -d $PG_DB -t -P format=unaligned echo "$sql"|sudo -u $PG_USER $PSQL_BIN -U $M_USER -h $M_HOST -w -p $M_PORT -d $PG_DB -t -P format=unaligned
} }
debug "Running options: debug "Running options:
@@ -230,32 +229,32 @@ REPLAY_CRITICAL_DELAY = $REPLAY_CRITICAL_DELAY
# Set some stuff to PostgreSQL version # Set some stuff to PostgreSQL version
if [ $( echo "$PG_VERSION < 10" |bc -l ) -eq 1 ] if [ $( echo "$PG_VERSION < 10" |bc -l ) -eq 1 ]
then then
pg_last_wal_receive_lsn='pg_last_xlog_receive_location()' pg_last_wal_receive_lsn='pg_last_xlog_receive_location()'
pg_last_wal_replay_lsn='pg_last_xlog_replay_location()' pg_last_wal_replay_lsn='pg_last_xlog_replay_location()'
pg_current_wal_lsn='pg_current_xlog_location()' pg_current_wal_lsn='pg_current_xlog_location()'
pg_wal_lsn_diff='pg_xlog_location_diff' pg_wal_lsn_diff='pg_xlog_location_diff'
sent_lsn='sent_location' sent_lsn='sent_location'
write_lsn='write_location' write_lsn='write_location'
else else
pg_last_wal_receive_lsn='pg_last_wal_receive_lsn()' pg_last_wal_receive_lsn='pg_last_wal_receive_lsn()'
pg_last_wal_replay_lsn='pg_last_wal_replay_lsn()' pg_last_wal_replay_lsn='pg_last_wal_replay_lsn()'
pg_current_wal_lsn='pg_current_wal_lsn()' pg_current_wal_lsn='pg_current_wal_lsn()'
pg_wal_lsn_diff='pg_wal_lsn_diff' pg_wal_lsn_diff='pg_wal_lsn_diff'
sent_lsn='sent_lsn' sent_lsn='sent_lsn'
write_lsn='write_lsn' write_lsn='write_lsn'
fi fi
# Postgres is running ? # Postgres is running ?
if [ $DEBUG -eq 0 ] if [ $DEBUG -eq 0 ]
then then
psql_get '\q' 2> /dev/null psql_get '\q' 2> /dev/null
else else
psql_get '\q' psql_get '\q'
fi fi
if [ $? -ne 0 ] if [ $? -ne 0 ]
then then
echo "CRITICAL: Postgres is not running !" echo "CRITICAL: Postgres is not running !"
exit 2 exit 2
fi fi
debug "Postgres is running" debug "Postgres is running"
@@ -264,236 +263,233 @@ RECOVERY_MODE=0
if [ -f $RECOVERY_CONF ] if [ -f $RECOVERY_CONF ]
then then
debug "File recovery.conf found. Hot-standby mode." debug "File recovery.conf found. Hot-standby mode."
# Check recovery mode # Check recovery mode
if [ $RECOVERY_MODE -ne 1 ] if [ $RECOVERY_MODE -ne 1 ]
then then
echo "CRITICAL: Not in recovery mode while recovery.conf file found !" echo "CRITICAL: Not in recovery mode while recovery.conf file found !"
exit 2 exit 2
fi fi
debug "Postgres is in recovery mode" debug "Postgres is in recovery mode"
# Get local current last received/replayed LSN # Get local current last received/replayed LSN
LAST_RECEIVED_LSN=$( psql_get "SELECT $pg_last_wal_receive_lsn" ) LAST_RECEIVED_LSN=$( psql_get "SELECT $pg_last_wal_receive_lsn" )
debug "Last received LSN: $LAST_RECEIVED_LSN" debug "Last received LSN: $LAST_RECEIVED_LSN"
LAST_REPLAYED_LSN=$( psql_get "SELECT $pg_last_wal_replay_lsn" ) LAST_REPLAYED_LSN=$( psql_get "SELECT $pg_last_wal_replay_lsn" )
debug "Last replayed LSN: $LAST_REPLAYED_LSN" debug "Last replayed LSN: $LAST_REPLAYED_LSN"
# Get master connection information from recovery.conf file # Get master connection informations from recovery.conf file
MASTER_CONN_INFOS=$( egrep '^ *primary_conninfo' $RECOVERY_CONF|sed "s/^ *primary_conninfo *= *\(.\+\) *$/\1/" ) MASTER_CONN_INFOS=$( egrep '^ *primary_conninfo' $RECOVERY_CONF|sed "s/^ *primary_conninfo *= *\(.\+\) *$/\1/" )
if [ ! -n "$MASTER_CONN_INFOS" ] if [ ! -n "$MASTER_CONN_INFOS" ]
then then
echo "UNKNOWN: Can't retrieve master connection information form recovery.conf file" echo "UNKNOWN: Can't retreive master connection informations form recovery.conf file"
exit 3 exit 3
fi fi
debug "Master connection information: $MASTER_CONN_INFOS" debug "Master connection informations: $MASTER_CONN_INFOS"
M_HOST=$( echo "$MASTER_CONN_INFOS"| grep 'host=' | sed 's/^.*host= *\([0-9a-zA-Z.-]\+\) *.*$/\1/' ) M_HOST=$( echo "$MASTER_CONN_INFOS"| grep 'host=' | sed 's/^.*host= *\([0-9a-zA-Z.-]\+\) *.*$/\1/' )
if [ ! -n "$M_HOST" ] if [ ! -n "$M_HOST" ]
then then
echo "UNKNOWN: Can't retrieve master host from recovery.conf file" echo "UNKNOWN: Can't retreive master host from recovery.conf file"
exit 3 exit 3
fi fi
debug "Master host: $M_HOST" debug "Master host: $M_HOST"
M_PORT=$( echo "$MASTER_CONN_INFOS"| grep 'port=' | sed 's/^.*port= *\([0-9a-zA-Z.-]\+\) *.*$/\1/' ) M_PORT=$( echo "$MASTER_CONN_INFOS"| grep 'port=' | sed 's/^.*port= *\([0-9a-zA-Z.-]\+\) *.*$/\1/' )
if [ ! -n "$M_PORT" ] if [ ! -n "$M_PORT" ]
then then
debug "Master port not specified, use default: $PG_DEFAULT_PORT" debug "Master port not specified, use default: $PG_DEFAULT_PORT"
M_PORT=$PG_DEFAULT_PORT M_PORT=$PG_DEFAULT_PORT
else else
debug "Master port: $M_PORT" debug "Master port: $M_PORT"
fi fi
if [ -n "$PG_MASTER_USER" ] if [ -n "$PG_MASTER_USER" ]
then then
debug "Master user provided by command-line, use it: $PG_MASTER_USER" debug "Master user provided by command-line, use it: $PG_MASTER_USER"
M_USER="$PG_MASTER_USER" M_USER="$PG_MASTER_USER"
else else
M_USER=$( echo "$MASTER_CONN_INFOS"| grep 'user=' | sed 's/^.*user= *\([0-9a-zA-Z.-]\+\) *.*$/\1/' ) M_USER=$( echo "$MASTER_CONN_INFOS"| grep 'user=' | sed 's/^.*user= *\([0-9a-zA-Z.-]\+\) *.*$/\1/' )
if [ ! -n "$M_USER" ] if [ ! -n "$M_USER" ]
then then
debug "Master user not specified, use default: $PG_USER" debug "Master user not specified, use default: $PG_USER"
M_USER=$PG_USER M_USER=$PG_USER
else else
debug "Master user: $M_USER" debug "Master user: $M_USER"
fi fi
fi fi
M_APP_NAME=$( echo "$MASTER_CONN_INFOS"| grep 'application_name=' | sed "s/^.*application_name=[ \'\"]*\([^ \'\"]\+\)[ \'\"]*.*$/\1/" ) M_APP_NAME=$( echo "$MASTER_CONN_INFOS"| grep 'application_name=' | sed "s/^.*application_name=[ \'\"]*\([^ \'\"]\+\)[ \'\"]*.*$/\1/" )
if [ ! -n "$M_APP_NAME" ] if [ ! -n "$M_APP_NAME" ]
then then
if [ $PG_VERSION -ge 12 ] if [ $PG_VERSION -ge 12 ]
then then
debug "Master application name not specified, use cluster_name if defined" debug "Master application name not specified, use cluster_name if defined"
CLUSTER_NAME=$( psql_get "SELECT current_setting('cluster_name')" ) CLUSTER_NAME=$( psql_get "SELECT current_setting('cluster_name')" )
debug "Cluster name: $CLUSTER_NAME" debug "Cluster name: $CLUSTER_NAME"
if [ -n "$CLUSTER_NAME" ] if [ -n "$CLUSTER_NAME" ]
then then
M_APP_NAME=$CLUSTER_NAME M_APP_NAME=$CLUSTER_NAME
else else
debug "Cluster name not defined, use default: $PG_DEFAULT_APP_NAME" debug "Cluster name not defined, use default: $PG_DEFAULT_APP_NAME"
M_APP_NAME=$PG_DEFAULT_APP_NAME M_APP_NAME=$PG_DEFAULT_APP_NAME
fi fi
else else
debug "Master application name not specified, use default: $PG_DEFAULT_APP_NAME" debug "Master application name not specified, use default: $PG_DEFAULT_APP_NAME"
M_APP_NAME=$PG_DEFAULT_APP_NAME M_APP_NAME=$PG_DEFAULT_APP_NAME
fi fi
else else
debug "Master application name: $M_APP_NAME" debug "Master application name: $M_APP_NAME"
fi fi
# Get current replication state information from master # Get current replication state information from master
M_CUR_REPL_STATE_INFO="$( psql_master_get "SELECT state, sync_state, $sent_lsn AS sent_lsn, $write_lsn AS write_lsn FROM pg_stat_replication WHERE application_name='$M_APP_NAME';" )" M_CUR_REPL_STATE_INFO="$( psql_master_get "SELECT state, sync_state, $sent_lsn AS sent_lsn, $write_lsn AS write_lsn FROM pg_stat_replication WHERE application_name='$M_APP_NAME';" )"
if [ ! -n "$M_CUR_REPL_STATE_INFO" ] if [ ! -n "$M_CUR_REPL_STATE_INFO" ]
then then
echo "UNKNOWN: Can't retrieve current replication state information from master server" echo "UNKNOWN: Can't retreive current replication state information from master server"
exit 3 exit 3
fi fi
debug "Master current replication state:\n\tstate|sync_state|sent_lsn|write_lsn\n\t$M_CUR_REPL_STATE_INFO" debug "Master current replication state:\n\tstate|sync_state|sent_lsn|write_lsn\n\t$M_CUR_REPL_STATE_INFO"
M_CUR_STATE=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f1 ) M_CUR_STATE=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f1 )
debug "Master current state: $M_CUR_STATE" debug "Master current state: $M_CUR_STATE"
if [ "$M_CUR_STATE" != "streaming" ] if [ "$M_CUR_STATE" != "streaming" ]
then then
echo "CRITICAL: this host is not in streaming state according to master host (current state = '$M_CUR_STATE')" echo "CRITICAL: this host is not in streaming state according to master host (current state = '$M_CUR_STATE')"
exit 2 exit 2
fi fi
M_CUR_SYNC_STATE=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f2 ) M_CUR_SYNC_STATE=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f2 )
debug "Master current sync state: $M_CUR_SYNC_STATE" debug "Master current sync state: $M_CUR_SYNC_STATE"
if [ "$M_CUR_SYNC_STATE" != "$EXPECTED_SYNC_STATE" ] if [ "$M_CUR_SYNC_STATE" != "$EXPECTED_SYNC_STATE" ]
then then
echo "CRITICAL: unexpected replication state '$M_CUR_SYNC_STATE' (expected state = '$EXPECTED_SYNC_STATE')" echo "CRITICAL: unexpected replication state '$M_CUR_SYNC_STATE' (expected state = '$EXPECTED_SYNC_STATE')"
exit 2 exit 2
fi fi
M_CUR_SENT_LSN=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f3 ) M_CUR_SENT_LSN=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f3 )
M_CUR_WRITED_LSN=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f4 ) M_CUR_WRITED_LSN=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f4 )
debug "Master current last sent/writed LSN: '$M_CUR_SENT_LSN' / '$M_CUR_WRITED_LSN'" debug "Master current last sent/writed LSN: '$M_CUR_SENT_LSN' / '$M_CUR_WRITED_LSN'"
# Check current master LSN vs last received LSN # Check current master LSN vs last received LSN
if [ "$CHECK_CUR_MASTER_LSN" == "1" ] if [ "$CHECK_CUR_MASTER_LSN" == "1" ]
then then
# Get current LSN from master # Get current LSN from master
M_CUR_LSN="$( psql_master_get "SELECT $pg_current_wal_lsn" )" M_CUR_LSN="$( psql_master_get "SELECT $pg_current_wal_lsn" )"
if [ ! -n "$M_CUR_LSN" ] if [ ! -n "$M_CUR_LSN" ]
then then
echo "UNKNOWN: Can't retrieve current LSN from master server" echo "UNKNOWN: Can't retreive current LSN from master server"
exit 3 exit 3
fi fi
debug "Master current LSN: $M_CUR_LSN" debug "Master current LSN: $M_CUR_LSN"
# Master current LSN is the last received LSN ? # Master current LSN is the last received LSN ?
if [ "$M_CUR_LSN" != "$LAST_RECEIVED_LSN" ] if [ "$M_CUR_LSN" != "$LAST_RECEIVED_LSN" ]
then then
echo "CRITICAL: Master current LSN is not the last received LSN" echo "CRITICAL: Master current LSN is not the last received LSN"
exit 2 exit 2
fi fi
debug "Master current LSN is the last received LSN" debug "Master current LSN is the last received LSN"
fi fi
# The last received LSN is the last replayed ? # The last received LSN is the last replayed ?
if [ "$LAST_RECEIVED_LSN" != "$LAST_REPLAYED_LSN" ] if [ "$LAST_RECEIVED_LSN" != "$LAST_REPLAYED_LSN" ]
then then
debug "/!\ The last received LSN is NOT the last replayed LSN ('$M_CUR_LSN' / '$LAST_REPLAYED_LSN')" debug "/!\ The last received LSN is NOT the last replayed LSN ('$M_CUR_LSN' / '$LAST_REPLAYED_LSN')"
REPLAY_DELAY="$( psql_get 'SELECT EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp());' )" REPLAY_DELAY="$( psql_get 'SELECT EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp());' )"
debug "Replay delay is $REPLAY_DELAY second(s)" debug "Replay delay is $REPLAY_DELAY second(s)"
if [ $( echo "$REPLAY_DELAY >= $REPLAY_CRITICAL_DELAY"|bc -l ) -gt 0 ] if [ $( echo "$REPLAY_DELAY >= $REPLAY_CRITICAL_DELAY"|bc -l ) -gt 0 ]
then then
echo "CRITICAL: last received LSN is not the last replayed ('$LAST_RECEIVED_LSN' / '$LAST_REPLAYED_LSN') and replay delay is $REPLAY_DELAY second(s)" echo "CRITICAL: last received LSN is not the last replayed ('$LAST_RECEIVED_LSN' / '$LAST_REPLAYED_LSN') and replay delay is $REPLAY_DELAY second(s)"
exit 2 exit 2
fi fi
if [ $( echo "$REPLAY_DELAY >= $REPLAY_WARNING_DELAY"|bc -l ) -gt 0 ] if [ $( echo "$REPLAY_DELAY >= $REPLAY_WARNING_DELAY"|bc -l ) -gt 0 ]
then then
echo "WARNING: last received LSN is not the last replay file ('$LAST_RECEIVED_LSN' / '$LAST_REPLAYED_LSN') and replay delay is $REPLAY_DELAY second(s)" echo "WARNING: last received LSN is not the last replay file ('$LAST_RECEIVED_LSN' / '$LAST_REPLAYED_LSN') and replay delay is $REPLAY_DELAY second(s)"
exit 1 exit 1
fi fi
debug "Replay delay is not worrying" debug "Replay delay is not worrying"
fi fi
debug "Last received LSN is the last replayed file" debug "Last received LSN is the last replayed file"
# The master last sent LSN is the last received (and synced) ? # The master last sent LSN is the last received (and synced) ?
if [ "$M_CUR_SENT_LSN" != "$LAST_RECEIVED_LSN" ] if [ "$M_CUR_SENT_LSN" != "$LAST_RECEIVED_LSN" ]
then then
echo "WARNING: master last sent LSN is not already received (and synced to disk) by slave. May be we have some network delay or load on slave" echo "WARNING: master last sent LSN is not already received (and synced to disk) by slave. May be we have some network delay or load on slave"
echo "Master last sent LSN: $M_CUR_SENT_LSN" echo "Master last sent LSN: $M_CUR_SENT_LSN"
echo "Slave last received (and synced to disk) LSN: $LAST_RECEIVED_LSN" echo "Slave last received (and synced to disk) LSN: $LAST_RECEIVED_LSN"
exit 1 exit 1
fi fi
echo "OK: Hot-standby server is up-to-date" echo "OK: Hot-standby server is uptodate"
echo "Replication state: $M_CUR_SYNC_STATE" exit 0
echo "Last sent/writed LSN: '$M_CUR_SENT_LSN' / '$M_CUR_WRITED_LSN'"
[ "$LAST_RECEIVED_LSN" != "$LAST_REPLAYED_LSN" ] && echo "Replay delay: ${REPLAY_DELAY}s"
exit 0
else else
debug "File recovery.conf not found. Master mode." debug "File recovery.conf not found. Master mode."
# Check recovery mode # Check recovery mode
if [ $RECOVERY_MODE -eq 1 ] if [ $RECOVERY_MODE -eq 1 ]
then then
echo "CRITICAL: In recovery mode while recovery.conf file not found !" echo "CRITICAL: In recovery mode while recovery.conf file not found !"
exit 2 exit 2
fi fi
debug "Postgres is not in recovery mode" debug "Postgres is not in recovery mode"
# Retrieve current lsn # Retreive current lsn
CURRENT_LSN=$( psql_get "SELECT $pg_current_wal_lsn" ) CURRENT_LSN=$( psql_get "SELECT $pg_current_wal_lsn" )
if [ -z "$CURRENT_LSN" ] if [ -z "$CURRENT_LSN" ]
then then
echo "UNKNOWN: Fail to retrieve current LSN (Log Sequence Number)" echo "UNKNOWN: Fail to retreive current LSN (Log Sequence Number)"
exit 3 exit 3
fi fi
debug "Current LSN: $CURRENT_LSN" debug "Current LSN: $CURRENT_LSN"
# Check standby client # Check standby client
STANDBY_CLIENTS=$( psql_get "SELECT application_name, client_addr, sent_lsn, write_lsn, state, sync_state, current_lag STANDBY_CLIENTS=$( psql_get "SELECT application_name, client_addr, sent_lsn, write_lsn, state, sync_state, current_lag
FROM ( FROM (
SELECT application_name, client_addr, sent_lsn, write_lsn, state, sync_state, current_lag SELECT application_name, client_addr, sent_lsn, write_lsn, state, sync_state, current_lag
FROM ( FROM (
SELECT application_name, client_addr, $sent_lsn AS sent_lsn, $write_lsn AS write_lsn, state, sync_state, SELECT application_name, client_addr, $sent_lsn AS sent_lsn, $write_lsn AS write_lsn, state, sync_state,
$pg_wal_lsn_diff($pg_current_wal_lsn, $write_lsn) AS current_lag $pg_wal_lsn_diff($pg_current_wal_lsn, $write_lsn) AS current_lag
FROM pg_stat_replication FROM pg_stat_replication
) AS s2 ) AS s2
) AS s1" ) ) AS s1" )
if [ ! -n "$STANDBY_CLIENTS" ] if [ ! -n "$STANDBY_CLIENTS" ]
then then
echo "WARNING: no stand-by client connected" echo "WARNING: no stand-by client connected"
exit 1 exit 1
fi fi
debug "Stand-by client(s):\n\t$( echo -e "$STANDBY_CLIENTS"|sed 's/\n/\n\t/' )" debug "Stand-by client(s):\n\t$( echo -e "$STANDBY_CLIENTS"|sed 's/\n/\n\t/' )"
STANDBY_CLIENTS_TXT="" STANDBY_CLIENTS_TXT=""
STANDBY_CLIENTS_COUNT=0 STANDBY_CLIENTS_COUNT=0
CURRENT_LSN_IS_LAST_SENT=1 CURRENT_LSN_IS_LAST_SENT=1
for line in $STANDBY_CLIENTS for line in $STANDBY_CLIENTS
do do
let STANDBY_CLIENTS_COUNT=STANDBY_CLIENTS_COUNT+1 let STANDBY_CLIENTS_COUNT=STANDBY_CLIENTS_COUNT+1
NAME=$( echo $line|cut -d '|' -f 1 ) NAME=$( echo $line|cut -d '|' -f 1 )
IP=$( echo $line|cut -d '|' -f 2 ) IP=$( echo $line|cut -d '|' -f 2 )
SENT_LSN=$( echo $line|cut -d '|' -f 3 ) SENT_LSN=$( echo $line|cut -d '|' -f 3 )
WRITED_LSN=$( echo $line|cut -d '|' -f 4 ) WRITED_LSN=$( echo $line|cut -d '|' -f 4 )
STATE=$( echo $line|cut -d '|' -f 5 ) STATE=$( echo $line|cut -d '|' -f 5 )
SYNC_STATE=$( echo $line|cut -d '|' -f 6 ) SYNC_STATE=$( echo $line|cut -d '|' -f 6 )
LAG=$( echo $line|cut -d '|' -f 7 ) LAG=$( echo $line|cut -d '|' -f 7 )
STANDBY_CLIENTS_TXT="$STANDBY_CLIENTS_TXT\n$NAME ($IP): $STATE/$SYNC_STATE (LSN: sent='$SENT_LSN' / writed='$WRITED_LSN', Lag: ${LAG}b)" STANDBY_CLIENTS_TXT="$STANDBY_CLIENTS_TXT\n$NAME ($IP): $STATE/$SYNC_STATE (LSN: sent='$SENT_LSN' / writed='$WRITED_LSN', Lag: ${LAG}b)"
[ "$SENT_LSN" != "$CURRENT_LSN" ] && CURRENT_LSN_IS_LAST_SENT=0 [ "$SENT_LSN" != "$CURRENT_LSN" ] && CURRENT_LSN_IS_LAST_SENT=0
done done
if [ $CURRENT_LSN_IS_LAST_SENT -eq 1 ] if [ $CURRENT_LSN_IS_LAST_SENT -eq 1 ]
then then
echo "OK: $STANDBY_CLIENTS_COUNT stand-by client(s) connected" echo "OK: $STANDBY_CLIENTS_COUNT stand-by client(s) connected"
EXIT_CODE=0 EXIT_CODE=0
else else
echo "WARNING: current master LSN is not the last sent to stand-by client(s) connected. May be we have some load ?" echo "WARNING: current master LSN is not the last sent to stand-by client(s) connected. May be we have some load ?"
EXIT_CODE=1 EXIT_CODE=1
fi fi
echo "Current master LSN: $CURRENT_LSN" echo "Current master LSN: $CURRENT_LSN"
echo -e "$STANDBY_CLIENTS_TXT" echo -e "$STANDBY_CLIENTS_TXT"
exit $EXIT_CODE exit $EXIT_CODE
fi fi