From 8d172e944c71a82ab704f80c6b48100475f17a85 Mon Sep 17 00:00:00 2001 From: Benjamin Renard Date: Mon, 3 Jun 2024 15:47:30 +0200 Subject: [PATCH] Code cleaning and introduce some pre-commit hooks --- .pre-commit-config.yaml | 25 ++ README.md | 14 +- check_pg_streaming_replication | 649 +++++++++++++++++---------------- 3 files changed, 357 insertions(+), 331 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..cb6d9c9 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,25 @@ +# Pre-commit hooks to run tests and ensure code is cleaned. +# See https://pre-commit.com for more information +--- +repos: + - repo: https://github.com/codespell-project/codespell + rev: v2.2.2 + hooks: + - id: codespell + args: + - --ignore-words-list=exten + - --skip="./.*,*.csv,*.json,*.ini,*.subject,*.txt,*.html,*.log,*.conf" + - --quiet-level=2 + - --ignore-regex=.*codespell-ignore$ + # - --write-changes # Uncomment to write changes + exclude_types: [csv, json] + - repo: https://github.com/adrienverge/yamllint + rev: v1.32.0 + hooks: + - id: yamllint + ignore: .github/ + - repo: https://github.com/pre-commit/mirrors-prettier + rev: v2.7.1 + hooks: + - id: prettier + args: ["--print-width", "100"] diff --git a/README.md b/README.md index 9675d9e..99b1885 100644 --- a/README.md +++ b/README.md @@ -7,14 +7,14 @@ This script : - check if Postgres is running (_CRITICAL_ raise if not) - check if Postgres is in recovery mode : - if Postgres is in recovery mode : - - retreive from Postgres the last _xlog_ file receive and the _xlog_ file replay + - retrieve from Postgres the last _xlog_ file receive and the _xlog_ file replay - check if Postgres recovery configuration file is NOT present (_CRITICAL_ raise if present) - - retreive master connection informations from Postgres recovery configuration file (_UNKNOWN_ raise on error). Default Postgres master TCP port will be used if port is not specify. - - retreive the current state and sync state of the host from Postgres master server by making a connection on master server (_UNKNOWN_ raise on error). + - retrieve master connection information from Postgres recovery configuration file (_UNKNOWN_ raise on error). Default Postgres master TCP port will be used if port is not specify. + - retrieve the current state and sync state of the host from Postgres master server by making a connection on master server (_UNKNOWN_ raise on error). - check if the current state of the host is "streaming" (_CRITICAL_ raise if not) - check if the current sync state of the host is "sync" (or the state specified using `-e` parameter, _CRITICAL_ raise if not) - if the check of the current XLOG file of the master host is enabled : - - retreive current _xlog_ file from Postgres master server by making a connection on master server (_UNKNOWN_ raise on error). + - retrieve current _xlog_ file from Postgres master server by making a connection on master server (_UNKNOWN_ raise on error). - check if the current master _xlog_ file is the last received _xlog_ file (_CRITICAL_ raise if not) - check if the last received _xlog_ file is the last replay _xlog_ file : if not, check the current delay with the last replayed transaction against _replay_warn_delay_ and _replay_crit_delay_ thresholds and raise corresponding error if they are exceeded - Return _OK_ state @@ -27,11 +27,11 @@ This script : ## Requirements -* Some CLI tools: `sudo`, `awk`, `sed`, `bc`, `psql` and `pg_lscluster` +- Some CLI tools: `sudo`, `awk`, `sed`, `bc`, `psql` and `pg_lscluster` -* **On master node:** Slaves must be able to connect with user from `recovery.conf` / `postgresql.auto.conf` (or user specify using `-U`) to database with the same name (or another specified with `-D`) as `trust` (or using password specified in `~/.pgpass`). This user must have `SUPERUSER` privilege (need to get replication details). +- **On master node:** Slaves must be able to connect with user from `recovery.conf` / `postgresql.auto.conf` (or user specify using `-U`) to database with the same name (or another specified with `-D`) as `trust` (or using password specified in `~/.pgpass`). This user must have `SUPERUSER` privilege (need to get replication details). -* **On standby node:** `PG_USER` must be able to connect localy on the database with the same name `(or another specified with -D)` as `trust` (or using password specified in `~/.pgpass`). +- **On standby node:** `PG_USER` must be able to connect locally on the database with the same name `(or another specified with -D)` as `trust` (or using password specified in `~/.pgpass`). ## Installation diff --git a/check_pg_streaming_replication b/check_pg_streaming_replication index 0b29aa5..157e9d6 100755 --- a/check_pg_streaming_replication +++ b/check_pg_streaming_replication @@ -1,4 +1,5 @@ #!/bin/bash +# vim: tabstop=4 shiftwidth=4 softtabstop=4 expandtab # # Nagios plugin to check Postgresql streamin replication state # @@ -14,7 +15,7 @@ # ~/.pgpass). This user must have SUPERUSER privilege (need to get replication # details). # -# On standby node: PG_USER must be able to connect localy on the database with the same name +# On standby node: PG_USER must be able to connect locally on the database with the same name # (or another specified with -D) as trust (or using password specified in # ~/.pgpass). # @@ -46,95 +47,95 @@ EXPECTED_SYNC_STATE=sync DEBUG=0 function usage () { - ERROR="$1" - [ -n "$ERROR" ] && echo -e "$ERROR\n" - cat << EOF + ERROR="$1" + [ -n "$ERROR" ] && echo -e "$ERROR\n" + cat << EOF Usage: $0 [-d] [-h] [options] - -u pg_user Specify local Postgres user (Default: try to auto-detect or use $DEFAULT_PG_USER) - -b psql_bin Specify psql binary path (Default: $PSQL_BIN) - -B pg_lsclusters_bin Specify pg_lsclusters binary path (Default: $PG_LSCLUSTER_BIN) - -V pg_version Specify Postgres version (Default: try to auto-detect or use $DEFAULT_PG_VERSION) - -m pg_main Specify Postgres main directory path (Default: try to auto-detect or use - $DEFAULT_PG_MAIN) - -r recovery_conf Specify Postgres recovery configuration file path - (Default: [PG_MAIN]/recovery.conf on PG <= 11, [PG_MAIN]/postgresql.auto.conf on PG >= 12) - -U pg_master_user Specify Postgres user to use on master (Default: user from recovery.conf file) - -p pg_port Specify default Postgres master TCP port (Default: same as local PostgreSQL - port if detected or use $DEFAULT_PG_PORT) - -D dbname Specify DB name on Postgres master/slave to connect on (Default: PG_USER, must - match with .pgpass one is used) - -C 1/0 Enable or disable check if the current LSN of the master host is the same - of the last received LSN (Default: $CHECK_CUR_MASTER_LSN) - -w replay_warn_delay Specify the replay warning delay in second (Default: $REPLAY_WARNING_DELAY) - -c replay_crit_delay Specify the replay critical delay in second (Default: $REPLAY_CRITICAL_DELAY) - -e expected_sync_state The expected replication state ('sync' or 'async', default: $EXPECTED_SYNC_STATE) - -d Debug mode - -h Show this message + -u pg_user Specify local Postgres user (Default: try to auto-detect or use $DEFAULT_PG_USER) + -b psql_bin Specify psql binary path (Default: $PSQL_BIN) + -B pg_lsclusters_bin Specify pg_lsclusters binary path (Default: $PG_LSCLUSTER_BIN) + -V pg_version Specify Postgres version (Default: try to auto-detect or use $DEFAULT_PG_VERSION) + -m pg_main Specify Postgres main directory path (Default: try to auto-detect or use + $DEFAULT_PG_MAIN) + -r recovery_conf Specify Postgres recovery configuration file path + ( Default: [PG_MAIN]/recovery.conf on PG <= 11, [PG_MAIN]/postgresql.auto.conf on PG >= 12) + -U pg_master_user Specify Postgres user to use on master (Default: user from recovery.conf file) + -p pg_port Specify default Postgres master TCP port (Default: same as local PostgreSQL + port if detected or use $DEFAULT_PG_PORT) + -D dbname Specify DB name on Postgres master/slave to connect on (Default: PG_USER, must + match with .pgpass one is used) + -C 1/0 Enable or disable check if the current LSN of the master host is the same + of the last received LSN (Default: $CHECK_CUR_MASTER_LSN) + -w replay_warn_delay Specify the replay warning delay in second (Default: $REPLAY_WARNING_DELAY) + -c replay_crit_delay Specify the replay critical delay in second (Default: $REPLAY_CRITICAL_DELAY) + -e expected_sync_state The expected replication state ('sync' or 'async', default: $EXPECTED_SYNC_STATE) + -d Debug mode + -h Show this message EOF - [ -n "$ERROR" ] && exit 1 || exit 0 + [ -n "$ERROR" ] && exit 1 || exit 0 } while getopts "hu:b:B:V:m:r:U:p:D:C:w:c:e:d" OPTION do - case $OPTION in - u) - PG_USER=$OPTARG - ;; - b) - PSQL_BIN=$OPTARG - ;; - B) - PG_LSCLUSTER_BIN=$OPTARG - ;; - V) - PG_VERSION=$OPTARG - ;; - m) - PG_MAIN=$OPTARG - ;; - r) - RECOVERY_CONF=$OPTARG - ;; - U) - PG_MASTER_USER=$OPTARG - ;; - p) - PG_DEFAULT_PORT=$OPTARG - ;; - D) - PG_DB=$OPTARG - ;; - C) - CHECK_CUR_MASTER_LSN=$OPTARG - ;; - w) - REPLAY_WARNING_DELAY=$OPTARG - ;; - c) - REPLAY_CRITICAL_DELAY=$OPTARG - ;; - e) - [ "$OPTARG" != "sync" -a "$OPTARG" != "async" ] && \ - usage "Invalid expected replication state '$OPTARG'. Possible values: sync or async." - EXPECTED_SYNC_STATE=$OPTARG - ;; - d) - DEBUG=1 - ;; - h) - usage - ;; - \?) - echo -n "Unkown option" - usage - esac + case $OPTION in + u) + PG_USER=$OPTARG + ;; + b) + PSQL_BIN=$OPTARG + ;; + B) + PG_LSCLUSTER_BIN=$OPTARG + ;; + V) + PG_VERSION=$OPTARG + ;; + m) + PG_MAIN=$OPTARG + ;; + r) + RECOVERY_CONF=$OPTARG + ;; + U) + PG_MASTER_USER=$OPTARG + ;; + p) + PG_DEFAULT_PORT=$OPTARG + ;; + D) + PG_DB=$OPTARG + ;; + C) + CHECK_CUR_MASTER_LSN=$OPTARG + ;; + w) + REPLAY_WARNING_DELAY=$OPTARG + ;; + c) + REPLAY_CRITICAL_DELAY=$OPTARG + ;; + e) + [ "$OPTARG" != "sync" -a "$OPTARG" != "async" ] && \ + usage "Invalid expected replication state '$OPTARG'. Possible values: sync or async." + EXPECTED_SYNC_STATE=$OPTARG + ;; + d) + DEBUG=1 + ;; + h) + usage + ;; + \?) + echo -n "Unknown option" + usage + esac done function debug() { - if [ $DEBUG -eq 1 ] - then - >&2 echo -e "[DEBUG] $1" - fi + if [ $DEBUG -eq 1 ] + then + >&2 echo -e "[DEBUG] $1" + fi } debug "Starting options (before handling auto-detection/default values): @@ -155,19 +156,19 @@ REPLAY_CRITICAL_DELAY = $REPLAY_CRITICAL_DELAY # Auto-detect PostgreSQL information using pg_lsclusters if [ -x "$PG_LSCLUSTER_BIN" ] then - PG_CLUSTER=$( $PG_LSCLUSTER_BIN -h 2>/dev/null|head -n1 ) - if [ -n "$PG_CLUSTER" ] - then - debug "pg_lsclusters output:\n\t$PG_CLUSTER" - # Output example: - # 9.6 main 5432 online,recovery postgres /var/lib/postgresql/9.6/main /var/log/postgresql/postgresql-9.6-main.log - [ -z "$PG_VERSION" ] && PG_VERSION=$( echo "$PG_CLUSTER"|awk -F ' +' '{print $1}' ) - [ -z "$PG_DEFAULT_PORT" ] && PG_DEFAULT_PORT=$( echo "$PG_CLUSTER"|awk -F ' +' '{print $3}' ) - [ -z "$PG_USER" ] && PG_USER=$( echo "$PG_CLUSTER"|awk -F ' +' '{print $5}' ) - [ -z "$PG_MAIN" ] && PG_MAIN=$( echo "$PG_CLUSTER"|awk -F ' +' '{print $6}' ) - fi + PG_CLUSTER=$( $PG_LSCLUSTER_BIN -h 2>/dev/null|head -n1 ) + if [ -n "$PG_CLUSTER" ] + then + debug "pg_lsclusters output:\n\t$PG_CLUSTER" + # Output example: + # 9.6 main 5432 online,recovery postgres /var/lib/postgresql/9.6/main /var/log/postgresql/postgresql-9.6-main.log + [ -z "$PG_VERSION" ] && PG_VERSION=$( echo "$PG_CLUSTER"|awk -F ' +' '{print $1}' ) + [ -z "$PG_DEFAULT_PORT" ] && PG_DEFAULT_PORT=$( echo "$PG_CLUSTER"|awk -F ' +' '{print $3}' ) + [ -z "$PG_USER" ] && PG_USER=$( echo "$PG_CLUSTER"|awk -F ' +' '{print $5}' ) + [ -z "$PG_MAIN" ] && PG_MAIN=$( echo "$PG_CLUSTER"|awk -F ' +' '{print $6}' ) + fi else - debug "pg_lsclusters not found ($PG_LSCLUSTER_BIN): parameters auto-detection disabled" + debug "pg_lsclusters not found ($PG_LSCLUSTER_BIN): parameters auto-detection disabled" fi # If auto-detection failed, use default values @@ -189,8 +190,8 @@ id "$PG_USER" > /dev/null 2>&1 # Check RECOVERY_CONF if [ -z "$RECOVERY_CONF" ]; then - [ $PG_VERSION -le 11 ] && RECOVERY_CONF_FILENAME="recovery.conf" || RECOVERY_CONF_FILENAME="postgresql.auto.conf" - RECOVERY_CONF="$PG_MAIN/$RECOVERY_CONF_FILENAME" + [ $PG_VERSION -le 11 ] && RECOVERY_CONF_FILENAME="recovery.conf" || RECOVERY_CONF_FILENAME="postgresql.auto.conf" + RECOVERY_CONF="$PG_MAIN/$RECOVERY_CONF_FILENAME" fi # Check PG_DEFAULT_PORT @@ -200,15 +201,15 @@ fi [ -z "$PG_DB" ] && PG_DB="$PG_USER" function psql_get () { - sql="$1" - debug "Exec 'echo \"$sql\"|sudo -u $PG_USER $PSQL_BIN -d \"$PG_DB\" -w -t -P format=unaligned" - echo "$sql"|sudo -u $PG_USER $PSQL_BIN -d "$PG_DB" -w -t -P format=unaligned + sql="$1" + debug "Exec 'echo \"$sql\"|sudo -u $PG_USER $PSQL_BIN -d \"$PG_DB\" -w -t -P format=unaligned" + echo "$sql"|sudo -u $PG_USER $PSQL_BIN -d "$PG_DB" -w -t -P format=unaligned } function psql_master_get () { - sql="$1" - debug "Exec 'echo \"$sql\"|sudo -u $PG_USER $PSQL_BIN -U $M_USER -h $M_HOST -w -p $M_PORT -d $PG_DB -t -P format=unaligned" - echo "$sql"|sudo -u $PG_USER $PSQL_BIN -U $M_USER -h $M_HOST -w -p $M_PORT -d $PG_DB -t -P format=unaligned + sql="$1" + debug "Exec 'echo \"$sql\"|sudo -u $PG_USER $PSQL_BIN -U $M_USER -h $M_HOST -w -p $M_PORT -d $PG_DB -t -P format=unaligned" + echo "$sql"|sudo -u $PG_USER $PSQL_BIN -U $M_USER -h $M_HOST -w -p $M_PORT -d $PG_DB -t -P format=unaligned } debug "Running options: @@ -229,32 +230,32 @@ REPLAY_CRITICAL_DELAY = $REPLAY_CRITICAL_DELAY # Set some stuff to PostgreSQL version if [ $( echo "$PG_VERSION < 10" |bc -l ) -eq 1 ] then - pg_last_wal_receive_lsn='pg_last_xlog_receive_location()' - pg_last_wal_replay_lsn='pg_last_xlog_replay_location()' - pg_current_wal_lsn='pg_current_xlog_location()' - pg_wal_lsn_diff='pg_xlog_location_diff' - sent_lsn='sent_location' - write_lsn='write_location' + pg_last_wal_receive_lsn='pg_last_xlog_receive_location()' + pg_last_wal_replay_lsn='pg_last_xlog_replay_location()' + pg_current_wal_lsn='pg_current_xlog_location()' + pg_wal_lsn_diff='pg_xlog_location_diff' + sent_lsn='sent_location' + write_lsn='write_location' else - pg_last_wal_receive_lsn='pg_last_wal_receive_lsn()' - pg_last_wal_replay_lsn='pg_last_wal_replay_lsn()' - pg_current_wal_lsn='pg_current_wal_lsn()' - pg_wal_lsn_diff='pg_wal_lsn_diff' - sent_lsn='sent_lsn' - write_lsn='write_lsn' + pg_last_wal_receive_lsn='pg_last_wal_receive_lsn()' + pg_last_wal_replay_lsn='pg_last_wal_replay_lsn()' + pg_current_wal_lsn='pg_current_wal_lsn()' + pg_wal_lsn_diff='pg_wal_lsn_diff' + sent_lsn='sent_lsn' + write_lsn='write_lsn' fi # Postgres is running ? if [ $DEBUG -eq 0 ] then - psql_get '\q' 2> /dev/null + psql_get '\q' 2> /dev/null else - psql_get '\q' + psql_get '\q' fi if [ $? -ne 0 ] then - echo "CRITICAL: Postgres is not running !" - exit 2 + echo "CRITICAL: Postgres is not running !" + exit 2 fi debug "Postgres is running" @@ -263,236 +264,236 @@ RECOVERY_MODE=0 if [ -f $RECOVERY_CONF ] then - debug "File recovery.conf found. Hot-standby mode." + debug "File recovery.conf found. Hot-standby mode." - # Check recovery mode - if [ $RECOVERY_MODE -ne 1 ] - then - echo "CRITICAL: Not in recovery mode while recovery.conf file found !" - exit 2 - fi - debug "Postgres is in recovery mode" + # Check recovery mode + if [ $RECOVERY_MODE -ne 1 ] + then + echo "CRITICAL: Not in recovery mode while recovery.conf file found !" + exit 2 + fi + debug "Postgres is in recovery mode" - # Get local current last received/replayed LSN - LAST_RECEIVED_LSN=$( psql_get "SELECT $pg_last_wal_receive_lsn" ) - debug "Last received LSN: $LAST_RECEIVED_LSN" - LAST_REPLAYED_LSN=$( psql_get "SELECT $pg_last_wal_replay_lsn" ) - debug "Last replayed LSN: $LAST_REPLAYED_LSN" + # Get local current last received/replayed LSN + LAST_RECEIVED_LSN=$( psql_get "SELECT $pg_last_wal_receive_lsn" ) + debug "Last received LSN: $LAST_RECEIVED_LSN" + LAST_REPLAYED_LSN=$( psql_get "SELECT $pg_last_wal_replay_lsn" ) + debug "Last replayed LSN: $LAST_REPLAYED_LSN" - # Get master connection informations from recovery.conf file - MASTER_CONN_INFOS=$( egrep '^ *primary_conninfo' $RECOVERY_CONF|sed "s/^ *primary_conninfo *= *\(.\+\) *$/\1/" ) - if [ ! -n "$MASTER_CONN_INFOS" ] - then - echo "UNKNOWN: Can't retreive master connection informations form recovery.conf file" - exit 3 - fi - debug "Master connection informations: $MASTER_CONN_INFOS" + # Get master connection information from recovery.conf file + MASTER_CONN_INFOS=$( egrep '^ *primary_conninfo' $RECOVERY_CONF|sed "s/^ *primary_conninfo *= *\(.\+\) *$/\1/" ) + if [ ! -n "$MASTER_CONN_INFOS" ] + then + echo "UNKNOWN: Can't retrieve master connection information form recovery.conf file" + exit 3 + fi + debug "Master connection information: $MASTER_CONN_INFOS" - M_HOST=$( echo "$MASTER_CONN_INFOS"| grep 'host=' | sed 's/^.*host= *\([0-9a-zA-Z.-]\+\) *.*$/\1/' ) - if [ ! -n "$M_HOST" ] - then - echo "UNKNOWN: Can't retreive master host from recovery.conf file" - exit 3 - fi - debug "Master host: $M_HOST" + M_HOST=$( echo "$MASTER_CONN_INFOS"| grep 'host=' | sed 's/^.*host= *\([0-9a-zA-Z.-]\+\) *.*$/\1/' ) + if [ ! -n "$M_HOST" ] + then + echo "UNKNOWN: Can't retrieve master host from recovery.conf file" + exit 3 + fi + debug "Master host: $M_HOST" - M_PORT=$( echo "$MASTER_CONN_INFOS"| grep 'port=' | sed 's/^.*port= *\([0-9a-zA-Z.-]\+\) *.*$/\1/' ) - if [ ! -n "$M_PORT" ] - then - debug "Master port not specified, use default: $PG_DEFAULT_PORT" - M_PORT=$PG_DEFAULT_PORT - else - debug "Master port: $M_PORT" - fi + M_PORT=$( echo "$MASTER_CONN_INFOS"| grep 'port=' | sed 's/^.*port= *\([0-9a-zA-Z.-]\+\) *.*$/\1/' ) + if [ ! -n "$M_PORT" ] + then + debug "Master port not specified, use default: $PG_DEFAULT_PORT" + M_PORT=$PG_DEFAULT_PORT + else + debug "Master port: $M_PORT" + fi - if [ -n "$PG_MASTER_USER" ] - then - debug "Master user provided by command-line, use it: $PG_MASTER_USER" - M_USER="$PG_MASTER_USER" - else - M_USER=$( echo "$MASTER_CONN_INFOS"| grep 'user=' | sed 's/^.*user= *\([0-9a-zA-Z.-]\+\) *.*$/\1/' ) - if [ ! -n "$M_USER" ] - then - debug "Master user not specified, use default: $PG_USER" - M_USER=$PG_USER - else - debug "Master user: $M_USER" - fi - fi + if [ -n "$PG_MASTER_USER" ] + then + debug "Master user provided by command-line, use it: $PG_MASTER_USER" + M_USER="$PG_MASTER_USER" + else + M_USER=$( echo "$MASTER_CONN_INFOS"| grep 'user=' | sed 's/^.*user= *\([0-9a-zA-Z.-]\+\) *.*$/\1/' ) + if [ ! -n "$M_USER" ] + then + debug "Master user not specified, use default: $PG_USER" + M_USER=$PG_USER + else + debug "Master user: $M_USER" + fi + fi - M_APP_NAME=$( echo "$MASTER_CONN_INFOS"| grep 'application_name=' | sed "s/^.*application_name=[ \'\"]*\([^ \'\"]\+\)[ \'\"]*.*$/\1/" ) - if [ ! -n "$M_APP_NAME" ] - then - if [ $PG_VERSION -ge 12 ] - then - debug "Master application name not specified, use cluster_name if defined" - CLUSTER_NAME=$( psql_get "SELECT current_setting('cluster_name')" ) - debug "Cluster name: $CLUSTER_NAME" - if [ -n "$CLUSTER_NAME" ] - then - M_APP_NAME=$CLUSTER_NAME - else - debug "Cluster name not defined, use default: $PG_DEFAULT_APP_NAME" - M_APP_NAME=$PG_DEFAULT_APP_NAME - fi - else - debug "Master application name not specified, use default: $PG_DEFAULT_APP_NAME" - M_APP_NAME=$PG_DEFAULT_APP_NAME - fi - else - debug "Master application name: $M_APP_NAME" - fi + M_APP_NAME=$( echo "$MASTER_CONN_INFOS"| grep 'application_name=' | sed "s/^.*application_name=[ \'\"]*\([^ \'\"]\+\)[ \'\"]*.*$/\1/" ) + if [ ! -n "$M_APP_NAME" ] + then + if [ $PG_VERSION -ge 12 ] + then + debug "Master application name not specified, use cluster_name if defined" + CLUSTER_NAME=$( psql_get "SELECT current_setting('cluster_name')" ) + debug "Cluster name: $CLUSTER_NAME" + if [ -n "$CLUSTER_NAME" ] + then + M_APP_NAME=$CLUSTER_NAME + else + debug "Cluster name not defined, use default: $PG_DEFAULT_APP_NAME" + M_APP_NAME=$PG_DEFAULT_APP_NAME + fi + else + debug "Master application name not specified, use default: $PG_DEFAULT_APP_NAME" + M_APP_NAME=$PG_DEFAULT_APP_NAME + fi + else + debug "Master application name: $M_APP_NAME" + fi - # Get current replication state information from master - M_CUR_REPL_STATE_INFO="$( psql_master_get "SELECT state, sync_state, $sent_lsn AS sent_lsn, $write_lsn AS write_lsn FROM pg_stat_replication WHERE application_name='$M_APP_NAME';" )" - if [ ! -n "$M_CUR_REPL_STATE_INFO" ] - then - echo "UNKNOWN: Can't retreive current replication state information from master server" - exit 3 - fi - debug "Master current replication state:\n\tstate|sync_state|sent_lsn|write_lsn\n\t$M_CUR_REPL_STATE_INFO" + # Get current replication state information from master + M_CUR_REPL_STATE_INFO="$( psql_master_get "SELECT state, sync_state, $sent_lsn AS sent_lsn, $write_lsn AS write_lsn FROM pg_stat_replication WHERE application_name='$M_APP_NAME';" )" + if [ ! -n "$M_CUR_REPL_STATE_INFO" ] + then + echo "UNKNOWN: Can't retrieve current replication state information from master server" + exit 3 + fi + debug "Master current replication state:\n\tstate|sync_state|sent_lsn|write_lsn\n\t$M_CUR_REPL_STATE_INFO" - M_CUR_STATE=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f1 ) - debug "Master current state: $M_CUR_STATE" - if [ "$M_CUR_STATE" != "streaming" ] - then - echo "CRITICAL: this host is not in streaming state according to master host (current state = '$M_CUR_STATE')" - exit 2 - fi + M_CUR_STATE=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f1 ) + debug "Master current state: $M_CUR_STATE" + if [ "$M_CUR_STATE" != "streaming" ] + then + echo "CRITICAL: this host is not in streaming state according to master host (current state = '$M_CUR_STATE')" + exit 2 + fi - M_CUR_SYNC_STATE=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f2 ) - debug "Master current sync state: $M_CUR_SYNC_STATE" - if [ "$M_CUR_SYNC_STATE" != "$EXPECTED_SYNC_STATE" ] - then - echo "CRITICAL: unexpected replication state '$M_CUR_SYNC_STATE' (expected state = '$EXPECTED_SYNC_STATE')" - exit 2 - fi + M_CUR_SYNC_STATE=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f2 ) + debug "Master current sync state: $M_CUR_SYNC_STATE" + if [ "$M_CUR_SYNC_STATE" != "$EXPECTED_SYNC_STATE" ] + then + echo "CRITICAL: unexpected replication state '$M_CUR_SYNC_STATE' (expected state = '$EXPECTED_SYNC_STATE')" + exit 2 + fi - M_CUR_SENT_LSN=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f3 ) - M_CUR_WRITED_LSN=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f4 ) - debug "Master current last sent/writed LSN: '$M_CUR_SENT_LSN' / '$M_CUR_WRITED_LSN'" + M_CUR_SENT_LSN=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f3 ) + M_CUR_WRITED_LSN=$( echo "$M_CUR_REPL_STATE_INFO"|cut -d'|' -f4 ) + debug "Master current last sent/writed LSN: '$M_CUR_SENT_LSN' / '$M_CUR_WRITED_LSN'" - # Check current master LSN vs last received LSN - if [ "$CHECK_CUR_MASTER_LSN" == "1" ] - then - # Get current LSN from master - M_CUR_LSN="$( psql_master_get "SELECT $pg_current_wal_lsn" )" - if [ ! -n "$M_CUR_LSN" ] - then - echo "UNKNOWN: Can't retreive current LSN from master server" - exit 3 - fi - debug "Master current LSN: $M_CUR_LSN" + # Check current master LSN vs last received LSN + if [ "$CHECK_CUR_MASTER_LSN" == "1" ] + then + # Get current LSN from master + M_CUR_LSN="$( psql_master_get "SELECT $pg_current_wal_lsn" )" + if [ ! -n "$M_CUR_LSN" ] + then + echo "UNKNOWN: Can't retrieve current LSN from master server" + exit 3 + fi + debug "Master current LSN: $M_CUR_LSN" - # Master current LSN is the last received LSN ? - if [ "$M_CUR_LSN" != "$LAST_RECEIVED_LSN" ] - then - echo "CRITICAL: Master current LSN is not the last received LSN" - exit 2 - fi - debug "Master current LSN is the last received LSN" - fi + # Master current LSN is the last received LSN ? + if [ "$M_CUR_LSN" != "$LAST_RECEIVED_LSN" ] + then + echo "CRITICAL: Master current LSN is not the last received LSN" + exit 2 + fi + debug "Master current LSN is the last received LSN" + fi - # The last received LSN is the last replayed ? - if [ "$LAST_RECEIVED_LSN" != "$LAST_REPLAYED_LSN" ] - then - debug "/!\ The last received LSN is NOT the last replayed LSN ('$M_CUR_LSN' / '$LAST_REPLAYED_LSN')" - REPLAY_DELAY="$( psql_get 'SELECT EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp());' )" - debug "Replay delay is $REPLAY_DELAY second(s)" - if [ $( echo "$REPLAY_DELAY >= $REPLAY_CRITICAL_DELAY"|bc -l ) -gt 0 ] - then - echo "CRITICAL: last received LSN is not the last replayed ('$LAST_RECEIVED_LSN' / '$LAST_REPLAYED_LSN') and replay delay is $REPLAY_DELAY second(s)" - exit 2 - fi - if [ $( echo "$REPLAY_DELAY >= $REPLAY_WARNING_DELAY"|bc -l ) -gt 0 ] - then - echo "WARNING: last received LSN is not the last replay file ('$LAST_RECEIVED_LSN' / '$LAST_REPLAYED_LSN') and replay delay is $REPLAY_DELAY second(s)" - exit 1 - fi - debug "Replay delay is not worrying" - fi - debug "Last received LSN is the last replayed file" + # The last received LSN is the last replayed ? + if [ "$LAST_RECEIVED_LSN" != "$LAST_REPLAYED_LSN" ] + then + debug "/!\ The last received LSN is NOT the last replayed LSN ('$M_CUR_LSN' / '$LAST_REPLAYED_LSN')" + REPLAY_DELAY="$( psql_get 'SELECT EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp());' )" + debug "Replay delay is $REPLAY_DELAY second(s)" + if [ $( echo "$REPLAY_DELAY >= $REPLAY_CRITICAL_DELAY"|bc -l ) -gt 0 ] + then + echo "CRITICAL: last received LSN is not the last replayed ('$LAST_RECEIVED_LSN' / '$LAST_REPLAYED_LSN') and replay delay is $REPLAY_DELAY second(s)" + exit 2 + fi + if [ $( echo "$REPLAY_DELAY >= $REPLAY_WARNING_DELAY"|bc -l ) -gt 0 ] + then + echo "WARNING: last received LSN is not the last replay file ('$LAST_RECEIVED_LSN' / '$LAST_REPLAYED_LSN') and replay delay is $REPLAY_DELAY second(s)" + exit 1 + fi + debug "Replay delay is not worrying" + fi + debug "Last received LSN is the last replayed file" - # The master last sent LSN is the last received (and synced) ? - if [ "$M_CUR_SENT_LSN" != "$LAST_RECEIVED_LSN" ] - then - echo "WARNING: master last sent LSN is not already received (and synced to disk) by slave. May be we have some network delay or load on slave" - echo "Master last sent LSN: $M_CUR_SENT_LSN" - echo "Slave last received (and synced to disk) LSN: $LAST_RECEIVED_LSN" - exit 1 - fi + # The master last sent LSN is the last received (and synced) ? + if [ "$M_CUR_SENT_LSN" != "$LAST_RECEIVED_LSN" ] + then + echo "WARNING: master last sent LSN is not already received (and synced to disk) by slave. May be we have some network delay or load on slave" + echo "Master last sent LSN: $M_CUR_SENT_LSN" + echo "Slave last received (and synced to disk) LSN: $LAST_RECEIVED_LSN" + exit 1 + fi - echo "OK: Hot-standby server is uptodate" - echo "Replication state: $M_CUR_SYNC_STATE" - echo "Last sent/writed LSN: '$M_CUR_SENT_LSN' / '$M_CUR_WRITED_LSN'" - [ "$LAST_RECEIVED_LSN" != "$LAST_REPLAYED_LSN" ] && echo "Replay delay: ${REPLAY_DELAY}s" - exit 0 + echo "OK: Hot-standby server is up-to-date" + echo "Replication state: $M_CUR_SYNC_STATE" + echo "Last sent/writed LSN: '$M_CUR_SENT_LSN' / '$M_CUR_WRITED_LSN'" + [ "$LAST_RECEIVED_LSN" != "$LAST_REPLAYED_LSN" ] && echo "Replay delay: ${REPLAY_DELAY}s" + exit 0 else - debug "File recovery.conf not found. Master mode." + debug "File recovery.conf not found. Master mode." - # Check recovery mode - if [ $RECOVERY_MODE -eq 1 ] - then - echo "CRITICAL: In recovery mode while recovery.conf file not found !" - exit 2 - fi - debug "Postgres is not in recovery mode" + # Check recovery mode + if [ $RECOVERY_MODE -eq 1 ] + then + echo "CRITICAL: In recovery mode while recovery.conf file not found !" + exit 2 + fi + debug "Postgres is not in recovery mode" - # Retreive current lsn - CURRENT_LSN=$( psql_get "SELECT $pg_current_wal_lsn" ) - if [ -z "$CURRENT_LSN" ] - then - echo "UNKNOWN: Fail to retreive current LSN (Log Sequence Number)" - exit 3 - fi - debug "Current LSN: $CURRENT_LSN" + # Retrieve current lsn + CURRENT_LSN=$( psql_get "SELECT $pg_current_wal_lsn" ) + if [ -z "$CURRENT_LSN" ] + then + echo "UNKNOWN: Fail to retrieve current LSN (Log Sequence Number)" + exit 3 + fi + debug "Current LSN: $CURRENT_LSN" - # Check standby client - STANDBY_CLIENTS=$( psql_get "SELECT application_name, client_addr, sent_lsn, write_lsn, state, sync_state, current_lag - FROM ( - SELECT application_name, client_addr, sent_lsn, write_lsn, state, sync_state, current_lag - FROM ( - SELECT application_name, client_addr, $sent_lsn AS sent_lsn, $write_lsn AS write_lsn, state, sync_state, - $pg_wal_lsn_diff($pg_current_wal_lsn, $write_lsn) AS current_lag - FROM pg_stat_replication - ) AS s2 - ) AS s1" ) - if [ ! -n "$STANDBY_CLIENTS" ] - then - echo "WARNING: no stand-by client connected" - exit 1 - fi - debug "Stand-by client(s):\n\t$( echo -e "$STANDBY_CLIENTS"|sed 's/\n/\n\t/' )" + # Check standby client + STANDBY_CLIENTS=$( psql_get "SELECT application_name, client_addr, sent_lsn, write_lsn, state, sync_state, current_lag + FROM ( + SELECT application_name, client_addr, sent_lsn, write_lsn, state, sync_state, current_lag + FROM ( + SELECT application_name, client_addr, $sent_lsn AS sent_lsn, $write_lsn AS write_lsn, state, sync_state, + $pg_wal_lsn_diff($pg_current_wal_lsn, $write_lsn) AS current_lag + FROM pg_stat_replication + ) AS s2 + ) AS s1" ) + if [ ! -n "$STANDBY_CLIENTS" ] + then + echo "WARNING: no stand-by client connected" + exit 1 + fi + debug "Stand-by client(s):\n\t$( echo -e "$STANDBY_CLIENTS"|sed 's/\n/\n\t/' )" - STANDBY_CLIENTS_TXT="" - STANDBY_CLIENTS_COUNT=0 - CURRENT_LSN_IS_LAST_SENT=1 - for line in $STANDBY_CLIENTS - do - let STANDBY_CLIENTS_COUNT=STANDBY_CLIENTS_COUNT+1 + STANDBY_CLIENTS_TXT="" + STANDBY_CLIENTS_COUNT=0 + CURRENT_LSN_IS_LAST_SENT=1 + for line in $STANDBY_CLIENTS + do + let STANDBY_CLIENTS_COUNT=STANDBY_CLIENTS_COUNT+1 - NAME=$( echo $line|cut -d '|' -f 1 ) - IP=$( echo $line|cut -d '|' -f 2 ) - SENT_LSN=$( echo $line|cut -d '|' -f 3 ) - WRITED_LSN=$( echo $line|cut -d '|' -f 4 ) - STATE=$( echo $line|cut -d '|' -f 5 ) - SYNC_STATE=$( echo $line|cut -d '|' -f 6 ) - LAG=$( echo $line|cut -d '|' -f 7 ) - STANDBY_CLIENTS_TXT="$STANDBY_CLIENTS_TXT\n$NAME ($IP): $STATE/$SYNC_STATE (LSN: sent='$SENT_LSN' / writed='$WRITED_LSN', Lag: ${LAG}b)" - [ "$SENT_LSN" != "$CURRENT_LSN" ] && CURRENT_LSN_IS_LAST_SENT=0 - done + NAME=$( echo $line|cut -d '|' -f 1 ) + IP=$( echo $line|cut -d '|' -f 2 ) + SENT_LSN=$( echo $line|cut -d '|' -f 3 ) + WRITED_LSN=$( echo $line|cut -d '|' -f 4 ) + STATE=$( echo $line|cut -d '|' -f 5 ) + SYNC_STATE=$( echo $line|cut -d '|' -f 6 ) + LAG=$( echo $line|cut -d '|' -f 7 ) + STANDBY_CLIENTS_TXT="$STANDBY_CLIENTS_TXT\n$NAME ($IP): $STATE/$SYNC_STATE (LSN: sent='$SENT_LSN' / writed='$WRITED_LSN', Lag: ${LAG}b)" + [ "$SENT_LSN" != "$CURRENT_LSN" ] && CURRENT_LSN_IS_LAST_SENT=0 + done - if [ $CURRENT_LSN_IS_LAST_SENT -eq 1 ] - then - echo "OK: $STANDBY_CLIENTS_COUNT stand-by client(s) connected" - EXIT_CODE=0 - else - echo "WARNING: current master LSN is not the last sent to stand-by client(s) connected. May be we have some load ?" - EXIT_CODE=1 - fi + if [ $CURRENT_LSN_IS_LAST_SENT -eq 1 ] + then + echo "OK: $STANDBY_CLIENTS_COUNT stand-by client(s) connected" + EXIT_CODE=0 + else + echo "WARNING: current master LSN is not the last sent to stand-by client(s) connected. May be we have some load ?" + EXIT_CODE=1 + fi - echo "Current master LSN: $CURRENT_LSN" - echo -e "$STANDBY_CLIENTS_TXT" - exit $EXIT_CODE + echo "Current master LSN: $CURRENT_LSN" + echo -e "$STANDBY_CLIENTS_TXT" + exit $EXIT_CODE fi