From bc078f83e8b079f03f99532f606e1ccab6f6ef45 Mon Sep 17 00:00:00 2001 From: Benjamin Renard Date: Tue, 16 Jul 2024 13:43:26 +0200 Subject: [PATCH] Code cleaning --- check_pg_streaming_replication | 186 +++++++++++++++++++++------------ 1 file changed, 121 insertions(+), 65 deletions(-) diff --git a/check_pg_streaming_replication b/check_pg_streaming_replication index 7f676c6..f4c6b7e 100755 --- a/check_pg_streaming_replication +++ b/check_pg_streaming_replication @@ -48,29 +48,37 @@ EXPECTED_MODE=auto DEBUG=0 function usage () { - ERROR="$1" + ERROR="$*" [[ -n "$ERROR" ]] && echo -e "$ERROR\n" cat << EOF Usage: $0 [-d] [-h] [options] - -u pg_user Specify local Postgres user (Default: try to auto-detect or use $DEFAULT_PG_USER) + -u pg_user Specify local Postgres user (Default: try to auto-detect or + use $DEFAULT_PG_USER) -b psql_bin Specify psql binary path (Default: $PSQL_BIN) -B pg_lsclusters_bin Specify pg_lsclusters binary path (Default: $PG_LSCLUSTER_BIN) - -V pg_version Specify Postgres version (Default: try to auto-detect or use $DEFAULT_PG_VERSION) - -m pg_main Specify Postgres main directory path (Default: try to auto-detect or use - $DEFAULT_PG_MAIN) + -V pg_version Specify Postgres version (Default: try to auto-detect or + use $DEFAULT_PG_VERSION) + -m pg_main Specify Postgres main directory path (Default: try to auto-detect or + use $DEFAULT_PG_MAIN) -r recovery_conf Specify Postgres recovery configuration file path - (Default: [PG_MAIN]/recovery.conf on PG <= 11, [PG_MAIN]/postgresql.auto.conf on PG >= 12) - -U pg_master_user Specify Postgres user to use on master (Default: user from recovery.conf file) - -p pg_port Specify default Postgres master TCP port (Default: same as local PostgreSQL - port if detected or use $DEFAULT_PG_PORT) - -D dbname Specify DB name on Postgres master/slave to connect on (Default: PG_USER, must - match with .pgpass one is used) - -C 1/0 Enable or disable check if the current LSN of the master host is the same - of the last received LSN (Default: $CHECK_CUR_MASTER_LSN) - -w replay_warn_delay Specify the replay warning delay in second (Default: $REPLAY_WARNING_DELAY) - -c replay_crit_delay Specify the replay critical delay in second (Default: $REPLAY_CRITICAL_DELAY) - -e expected_sync_state The expected replication state ('sync' or 'async', default: $EXPECTED_SYNC_STATE) - -E expected_mode The expected mode ('master', 'hot-standby' or 'auto', default: '$EXPECTED_MODE') + (Default: [PG_MAIN]/recovery.conf on PG <= 11, + [PG_MAIN]/postgresql.auto.conf on PG >= 12) + -U pg_master_user Specify Postgres user to use on master (Default: user from recovery.conf + file) + -p pg_port Specify default Postgres master TCP port (Default: same as local + PostgreSQL port if detected or use $DEFAULT_PG_PORT) + -D dbname Specify DB name on Postgres master/slave to connect on (Default: + PG_USER, must match with .pgpass one is used) + -C 1/0 Enable or disable check if the current LSN of the master host is the + same of the last received LSN (Default: $CHECK_CUR_MASTER_LSN) + -w replay_warn_delay Specify the replay warning delay in second + (Default: $REPLAY_WARNING_DELAY) + -c replay_crit_delay Specify the replay critical delay in second + (Default: $REPLAY_CRITICAL_DELAY) + -e expected_sync_state The expected replication state ('sync' or 'async', + default: $EXPECTED_SYNC_STATE) + -E expected_mode The expected mode ('master', 'hot-standby' or 'auto', + default: '$EXPECTED_MODE') -d Debug mode -h Show this message EOF @@ -117,12 +125,15 @@ while getopts "hu:b:B:V:m:r:U:p:D:C:w:c:e:E:d" OPTION; do ;; e) [[ "$OPTARG" != "sync" ]] && [[ "$OPTARG" != "async" ]] && \ - usage "Invalid expected replication state '$OPTARG'. Possible values: sync or async." + usage "Invalid expected replication state '$OPTARG'." \ + "Possible values: sync or async." EXPECTED_SYNC_STATE=$OPTARG ;; E) - [[ "$OPTARG" != "master" ]] && [[ "$OPTARG" != "hot-standby" ]] && [[ "$OPTARG" != "auto" ]] && \ - usage "Invalid expected mode '$OPTARG'. Possible values: master, hot-standby or auto." + [[ "$OPTARG" != "master" ]] && [[ "$OPTARG" != "hot-standby" ]] && \ + [[ "$OPTARG" != "auto" ]] && \ + usage "Invalid expected mode '$OPTARG'. Possible values: master, hot-standby" \ + "or auto." EXPECTED_MODE=$OPTARG ;; d) @@ -139,7 +150,7 @@ done function debug() { if [[ $DEBUG -eq 1 ]]; then - >&2 echo -e "[DEBUG] $1" + >&2 echo -e "[DEBUG] $*" fi } @@ -162,15 +173,19 @@ EXPECTED_MODE = $EXPECTED_MODE # Auto-detect PostgreSQL information using pg_lsclusters if [[ -x "$PG_LSCLUSTER_BIN" ]]; then - PG_CLUSTER=$( $PG_LSCLUSTER_BIN -h 2>/dev/null|head -n1 ) + PG_CLUSTER=$( $PG_LSCLUSTER_BIN -h 2>/dev/null | head -n1 ) if [[ -n "$PG_CLUSTER" ]]; then debug "pg_lsclusters output:\n\t$PG_CLUSTER" # Output example: - # 9.6 main 5432 online,recovery postgres /var/lib/postgresql/9.6/main /var/log/postgresql/postgresql-9.6-main.log - [[ -z "$PG_VERSION" ]] && PG_VERSION=$( echo "$PG_CLUSTER"|awk -F ' +' '{print $1}' ) - [[ -z "$PG_DEFAULT_PORT" ]] && PG_DEFAULT_PORT=$( echo "$PG_CLUSTER"|awk -F ' +' '{print $3}' ) - [[ -z "$PG_USER" ]] && PG_USER=$( echo "$PG_CLUSTER"|awk -F ' +' '{print $5}' ) - [[ -z "$PG_MAIN" ]] && PG_MAIN=$( echo "$PG_CLUSTER"|awk -F ' +' '{print $6}' ) + # 9.6 main 5432 online,recovery postgres /var/lib/postgresql/9.6/main \ + # /var/log/postgresql/postgresql-9.6-main.log + # 13 main 5432 online,recovery,pacemaker postgres /var/lib/postgresql/13/main \ + # /var/log/postgresql/postgresql-13-main.log + [[ -z "$PG_VERSION" ]] && PG_VERSION=$( awk -F ' +' '{print $1}' <<< "$PG_CLUSTER" ) + [[ -z "$PG_DEFAULT_PORT" ]] && \ + PG_DEFAULT_PORT=$( awk -F ' +' '{print $3}' <<< "$PG_CLUSTER" ) + [[ -z "$PG_USER" ]] && PG_USER=$( awk -F ' +' '{print $5}' <<< "$PG_CLUSTER" ) + [[ -z "$PG_MAIN" ]] && PG_MAIN=$( awk -F ' +' '{print $6}' <<< "$PG_CLUSTER" ) fi else debug "pg_lsclusters not found ($PG_LSCLUSTER_BIN): parameters auto-detection disabled" @@ -194,7 +209,11 @@ id "$PG_USER" > /dev/null 2>&1 || { echo "UNKNOWN: Invalid Postgres user ($PG_US # Check RECOVERY_CONF if [[ -z "$RECOVERY_CONF" ]]; then - [[ $PG_VERSION -le 11 ]] && RECOVERY_CONF_FILENAME="recovery.conf" || RECOVERY_CONF_FILENAME="postgresql.auto.conf" + if [[ $PG_VERSION -le 11 ]]; then + RECOVERY_CONF_FILENAME="recovery.conf" + else + RECOVERY_CONF_FILENAME="postgresql.auto.conf" + fi RECOVERY_CONF="$PG_MAIN/$RECOVERY_CONF_FILENAME" else RECOVERY_CONF_FILENAME=$( basename "$RECOVERY_CONF" ) @@ -208,15 +227,17 @@ fi [[ -z "$PG_DB" ]] && PG_DB="$PG_USER" function psql_get () { - sql="$1" - debug "Exec 'echo \"$sql\"|sudo -u $PG_USER $PSQL_BIN -d \"$PG_DB\" -w -t -P format=unaligned" + local sql="$*" + debug "Exec 'sudo -u $PG_USER $PSQL_BIN -d \"$PG_DB\" -w -t -P format=unaligned <<< \"$sql\"" sudo -u "$PG_USER" "$PSQL_BIN" -d "$PG_DB" -w -t -P format=unaligned <<< "$sql" } function psql_master_get () { - sql="$1" - debug "Exec 'echo \"$sql\"|sudo -u $PG_USER $PSQL_BIN -U $M_USER -h $M_HOST -w -p $M_PORT -d $PG_DB -t -P format=unaligned" - sudo -u "$PG_USER" "$PSQL_BIN" -U "$M_USER" -h "$M_HOST" -w -p "$M_PORT" -d "$PG_DB" -t -P format=unaligned <<< "$sql" + local sql="$*" + debug "Exec 'sudo -u $PG_USER $PSQL_BIN -U $M_USER -h $M_HOST -w -p $M_PORT -d $PG_DB -t" \ + "-P format=unaligned <<< \"$sql\"" + sudo -u "$PG_USER" "$PSQL_BIN" \ + -U "$M_USER" -h "$M_HOST" -w -p "$M_PORT" -d "$PG_DB" -t -P format=unaligned <<< "$sql" } debug "Running options: @@ -273,7 +294,8 @@ if [[ "$EXPECTED_MODE" == "auto" ]]; then if [[ $RECOVERY_MODE -eq 1 ]]; then debug "Postgres is in recovery mode. Hot-standby mode." EXPECTED_MODE="hot-standby" - elif [[ -f $RECOVERY_CONF ]] && [[ $( grep -cE '^\s*primary_conninfo' "$RECOVERY_CONF" ) -gt 0 ]]; then + elif [[ -f $RECOVERY_CONF ]] && \ + [[ $( grep -cE '^\s*primary_conninfo' "$RECOVERY_CONF" ) -gt 0 ]]; then debug "File $RECOVERY_CONF_FILENAME found and contain primary_conninfo. Hot-standby mode." EXPECTED_MODE="hot-standby" else @@ -301,19 +323,24 @@ if [[ "$EXPECTED_MODE" == "hot-standby" ]]; then # Get master connection information from primary_conninfo configuration parameter MASTER_CONN_INFOS=$( psql_get "SHOW primary_conninfo" ) if [[ -z "$MASTER_CONN_INFOS" ]]; then - echo "UNKNOWN: Can't retrieve master connection information from primary_conninfo configuration parameter" + echo "UNKNOWN: Can't retrieve master connection information from primary_conninfo" \ + "configuration parameter" exit 3 fi debug "Master connection information: $MASTER_CONN_INFOS" - M_HOST=$( grep 'host=' <<< "$MASTER_CONN_INFOS" | sed 's/^.*host= *\([0-9a-zA-Z.-]\+\) *.*$/\1/' ) + M_HOST=$( + grep 'host=' <<< "$MASTER_CONN_INFOS" | sed 's/^.*host= *\([0-9a-zA-Z.-]\+\) *.*$/\1/' + ) if [[ -z "$M_HOST" ]]; then echo "UNKNOWN: Can't retrieve master host from primary_conninfo configuration parameter" exit 3 fi debug "Master host: $M_HOST" - M_PORT=$( grep 'port=' <<< "$MASTER_CONN_INFOS" | sed 's/^.*port= *\([0-9a-zA-Z.-]\+\) *.*$/\1/' ) + M_PORT=$( + grep 'port=' <<< "$MASTER_CONN_INFOS" | sed 's/^.*port= *\([0-9a-zA-Z.-]\+\) *.*$/\1/' + ) if [[ -z "$M_PORT" ]]; then debug "Master port not specified, use default: $PG_DEFAULT_PORT" M_PORT=$PG_DEFAULT_PORT @@ -325,7 +352,9 @@ if [[ "$EXPECTED_MODE" == "hot-standby" ]]; then debug "Master user provided by command-line, use it: $PG_MASTER_USER" M_USER="$PG_MASTER_USER" else - M_USER=$( grep 'user=' <<< "$MASTER_CONN_INFOS" | sed 's/^.*user= *\([0-9a-zA-Z.-]\+\) *.*$/\1/' ) + M_USER=$( + grep 'user=' <<< "$MASTER_CONN_INFOS" | sed 's/^.*user= *\([0-9a-zA-Z.-]\+\) *.*$/\1/' + ) if [[ -z "$M_USER" ]]; then debug "Master user not specified, use default: $PG_USER" M_USER=$PG_USER @@ -334,7 +363,10 @@ if [[ "$EXPECTED_MODE" == "hot-standby" ]]; then fi fi - M_APP_NAME=$( grep 'application_name=' <<< "$MASTER_CONN_INFOS" | sed "s/^.*application_name=[ \'\"]*\([^ \'\"]\+\)[ \'\"]*.*$/\1/" ) + M_APP_NAME=$( + grep 'application_name=' <<< "$MASTER_CONN_INFOS" | + sed "s/^.*application_name=[ \'\"]*\([^ \'\"]\+\)[ \'\"]*.*$/\1/" + ) if [[ -z "$M_APP_NAME" ]]; then if [[ $PG_VERSION -ge 12 ]]; then debug "Master application name not specified, use cluster_name if defined" @@ -355,24 +387,31 @@ if [[ "$EXPECTED_MODE" == "hot-standby" ]]; then fi # Get current replication state information from master - M_CUR_REPL_STATE_INFO="$( psql_master_get "SELECT state, sync_state, $sent_lsn AS sent_lsn, $write_lsn AS write_lsn FROM pg_stat_replication WHERE application_name='$M_APP_NAME';" )" + M_CUR_REPL_STATE_INFO="$( + psql_master_get \ + "SELECT state, sync_state, $sent_lsn AS sent_lsn, $write_lsn AS write_lsn" \ + "FROM pg_stat_replication WHERE application_name='$M_APP_NAME';" + )" if [[ -z "$M_CUR_REPL_STATE_INFO" ]]; then echo "UNKNOWN: Can't retrieve current replication state information from master server" exit 3 fi - debug "Master current replication state:\n\tstate|sync_state|sent_lsn|write_lsn\n\t$M_CUR_REPL_STATE_INFO" + debug "Master current replication state:\n" \ + "\tstate|sync_state|sent_lsn|write_lsn\n\t$M_CUR_REPL_STATE_INFO" M_CUR_STATE=$( cut -d'|' -f1 <<< "$M_CUR_REPL_STATE_INFO" ) debug "Master current state: $M_CUR_STATE" if [[ "$M_CUR_STATE" != "streaming" ]]; then - echo "CRITICAL: this host is not in streaming state according to master host (current state = '$M_CUR_STATE')" + echo "CRITICAL: this host is not in streaming state according to master host" \ + "(current state = '$M_CUR_STATE')" exit 2 fi M_CUR_SYNC_STATE=$( cut -d'|' -f2 <<< "$M_CUR_REPL_STATE_INFO" ) debug "Master current sync state: $M_CUR_SYNC_STATE" if [[ "$M_CUR_SYNC_STATE" != "$EXPECTED_SYNC_STATE" ]]; then - echo "CRITICAL: unexpected replication state '$M_CUR_SYNC_STATE' (expected state = '$EXPECTED_SYNC_STATE')" + echo "CRITICAL: unexpected replication state '$M_CUR_SYNC_STATE'" \ + "(expected state = '$EXPECTED_SYNC_STATE')" exit 2 fi @@ -400,15 +439,22 @@ if [[ "$EXPECTED_MODE" == "hot-standby" ]]; then # The last received LSN is the last replayed ? if [[ "$LAST_RECEIVED_LSN" != "$LAST_REPLAYED_LSN" ]]; then - debug "/!\ The last received LSN is NOT the last replayed LSN ('$M_CUR_LSN' / '$LAST_REPLAYED_LSN')" - REPLAY_DELAY="$( psql_get 'SELECT EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp());' )" + debug "/!\ The last received LSN is NOT the last replayed LSN" \ + "('$M_CUR_LSN' / '$LAST_REPLAYED_LSN')" + REPLAY_DELAY="$( + psql_get 'SELECT EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp());' + )" debug "Replay delay is $REPLAY_DELAY second(s)" if [[ $( bc -l <<< "$REPLAY_DELAY >= $REPLAY_CRITICAL_DELAY" ) -gt 0 ]]; then - echo "CRITICAL: last received LSN is not the last replayed ('$LAST_RECEIVED_LSN' / '$LAST_REPLAYED_LSN') and replay delay is $REPLAY_DELAY second(s)" + echo "CRITICAL: last received LSN is not the last replayed" \ + "('$LAST_RECEIVED_LSN' / '$LAST_REPLAYED_LSN') and" \ + "replay delay is $REPLAY_DELAY second(s)" exit 2 fi if [[ $( bc -l <<< "$REPLAY_DELAY >= $REPLAY_WARNING_DELAY" ) -gt 0 ]]; then - echo "WARNING: last received LSN is not the last replay file ('$LAST_RECEIVED_LSN' / '$LAST_REPLAYED_LSN') and replay delay is $REPLAY_DELAY second(s)" + echo "WARNING: last received LSN is not the last replay file" \ + "('$LAST_RECEIVED_LSN' / '$LAST_REPLAYED_LSN') and" \ + "replay delay is $REPLAY_DELAY second(s)" exit 1 fi debug "Replay delay is not worrying" @@ -417,7 +463,8 @@ if [[ "$EXPECTED_MODE" == "hot-standby" ]]; then # The master last sent LSN is the last received (and synced) ? if [[ "$M_CUR_SENT_LSN" != "$LAST_RECEIVED_LSN" ]]; then - echo "WARNING: master last sent LSN is not already received (and synced to disk) by slave. May be we have some network delay or load on slave" + echo "WARNING: master last sent LSN is not already received (and synced to disk) by slave." + echo "May be we have some network delay or load on slave" echo "Master last sent LSN: $M_CUR_SENT_LSN" echo "Slave last received (and synced to disk) LSN: $LAST_RECEIVED_LSN" exit 1 @@ -445,27 +492,32 @@ elif [[ "$EXPECTED_MODE" == "master" ]]; then debug "Current LSN: $CURRENT_LSN" # Check standby client - STANDBY_CLIENTS=$( psql_get "SELECT application_name, client_addr, sent_lsn, write_lsn, state, sync_state, current_lag - FROM ( - SELECT application_name, client_addr, sent_lsn, write_lsn, state, sync_state, current_lag - FROM ( - SELECT application_name, client_addr, $sent_lsn AS sent_lsn, $write_lsn AS write_lsn, state, sync_state, - $pg_wal_lsn_diff($pg_current_wal_lsn, $write_lsn) AS current_lag - FROM pg_stat_replication - ) AS s2 - ) AS s1" ) + STANDBY_CLIENTS=$( + psql_get \ + "SELECT + application_name, client_addr, sent_lsn, write_lsn, state, sync_state, current_lag + FROM ( + SELECT + application_name, client_addr, sent_lsn, write_lsn, state, sync_state, + current_lag + FROM ( + SELECT + application_name, client_addr, $sent_lsn AS sent_lsn, + $write_lsn AS write_lsn, state, sync_state, + $pg_wal_lsn_diff($pg_current_wal_lsn, $write_lsn) AS current_lag + FROM pg_stat_replication + ) AS s2 + ) AS s1" + ) if [[ -z "$STANDBY_CLIENTS" ]]; then echo "WARNING: no stand-by client connected" exit 1 fi debug "Stand-by client(s):\n\t${STANDBY_CLIENTS//$'\n'/\\n\\t}" - STANDBY_CLIENTS_TXT="" - STANDBY_CLIENTS_COUNT=0 + STANDBY_CLIENTS_ROWS=() CURRENT_LSN_IS_LAST_SENT=1 for line in $STANDBY_CLIENTS; do - (( STANDBY_CLIENTS_COUNT+=1 )) - NAME=$( cut -d '|' -f 1 <<< "$line" ) IP=$( cut -d '|' -f 2 <<< "$line" ) SENT_LSN=$( cut -d '|' -f 3 <<< "$line" ) @@ -473,20 +525,24 @@ elif [[ "$EXPECTED_MODE" == "master" ]]; then STATE=$( cut -d '|' -f 5 <<< "$line" ) SYNC_STATE=$( cut -d '|' -f 6 <<< "$line" ) LAG=$( cut -d '|' -f 7 <<< "$line" ) - STANDBY_CLIENTS_TXT="$STANDBY_CLIENTS_TXT\n$NAME ($IP): $STATE/$SYNC_STATE (LSN: sent='$SENT_LSN' / writed='$WRITED_LSN', Lag: ${LAG}b)" + STANDBY_CLIENTS_ROW="$NAME ($IP): $STATE/$SYNC_STATE" + STANDBY_CLIENTS_ROW+=" (LSN: sent='$SENT_LSN' / writed='$WRITED_LSN', Lag: ${LAG}b" + STANDBY_CLIENTS_ROWS+=( "$STANDBY_CLIENTS_ROW" ) [[ "$SENT_LSN" != "$CURRENT_LSN" ]] && CURRENT_LSN_IS_LAST_SENT=0 done if [[ $CURRENT_LSN_IS_LAST_SENT -eq 1 ]]; then - echo "OK: $STANDBY_CLIENTS_COUNT stand-by client(s) connected" + echo "OK: ${#STANDBY_CLIENTS_ROWS[@]} stand-by client(s) connected" EXIT_CODE=0 else - echo "WARNING: current master LSN is not the last sent to stand-by client(s) connected. May be we have some load ?" + echo "WARNING: current master LSN is not the last sent to stand-by client(s) connected." \ + "May be we have some load ?" EXIT_CODE=1 fi echo "Current master LSN: $CURRENT_LSN" - echo -e "$STANDBY_CLIENTS_TXT" + IFS=$'\n' + echo "${STANDBY_CLIENTS_ROWS[*]}" exit $EXIT_CODE else echo "UNKNOWN - Invalid mode '$EXPECTED_MODE'"