From b7f731c6ac9aec900a50ca4dad79f363afcb6f5a Mon Sep 17 00:00:00 2001 From: James Campbell Date: Wed, 18 Jun 2025 03:03:34 -0400 Subject: [PATCH] Fix and reformat several queries --- sample-config/pgmon-metrics.yml | 205 +++++++++++++++++++++++++++++--- src/pgmon.py | 1 + tests/docker-compose.yml | 2 +- tests/run-tests.sh | 5 +- 4 files changed, 192 insertions(+), 21 deletions(-) diff --git a/sample-config/pgmon-metrics.yml b/sample-config/pgmon-metrics.yml index dcf6588..070e624 100644 --- a/sample-config/pgmon-metrics.yml +++ b/sample-config/pgmon-metrics.yml @@ -5,16 +5,36 @@ metrics: discover_dbs: type: set query: - 0: SELECT datname AS dbname FROM pg_database + 0: > + SELECT datname AS dbname + FROM pg_database discover_rep: type: set query: - 0: SELECT client_addr || '_' || regexp_replace(application_name, '[ ,]', '_', 'g') AS repid, client_addr, state FROM pg_stat_replication + 0: > + SELECT client_addr || '_' || regexp_replace(application_name, '[ ,]', '_', 'g') AS repid, + client_addr, + state + FROM pg_stat_replication discover_slots: type: set query: - 90400: SELECT slot_name, plugin, slot_type, database, false as temporary, active FROM pg_replication_slots - 100000: SELECT slot_name, plugin, slot_type, database, temporary, active FROM pg_replication_slots + 90400: > + SELECT slot_name, + plugin, + slot_type, + database, + false as temporary, + active + FROM pg_replication_slots + 100000: > + SELECT slot_name, + plugin, + slot_type, + database, + temporary, + active + FROM pg_replication_slots ## # cluster-wide metrics @@ -27,13 +47,43 @@ metrics: max_frozen_age: type: row query: - 0: SELECT max(age(datfrozenxid)) AS xid_age, 0 AS mxid_age FROM pg_database - 90600: SELECT max(age(datfrozenxid)) AS xid_age, max(mxid_age(datminmxid)) AS mxid_age FROM pg_database + 0: > + SELECT max(age(datfrozenxid)) AS xid_age, + 0 AS mxid_age + FROM pg_database + 90600: > + SELECT max(age(datfrozenxid)) AS xid_age, + max(mxid_age(datminmxid)) AS mxid_age + FROM pg_database bgwriter: type: row query: - 0: SELECT checkpoints_timed, checkpoints_req, checkpoint_write_time, checkpoint_sync_time, buffers_checkpoint, buffers_clean, maxwritten_clean, buffers_backend, buffers_backend_fsync, buffers_alloc FROM pg_stat_bgwriter; + 0: > + SELECT checkpoints_timed, + checkpoints_req, + checkpoint_write_time, + checkpoint_sync_time, + buffers_checkpoint, + buffers_clean, + maxwritten_clean, + buffers_backend, + buffers_backend_fsync, + buffers_alloc + FROM pg_stat_bgwriter + 170000: > + SELECT cp.num_timed AS checkpoints_timed, + cp.num_requested AS checkpoints_req, + cp.write_time AS checkpoint_write_time, + cp.sync_time AS checkpoint_sync_time, + cp.buffers_written AS buffers_checkpoint, + bg.buffers_clean AS buffers_clean, + bg.maxwritten_clean AS maxwritten_clean, + NULL AS buffers_backend, + NULL AS buffers_backend_fsync, + bg.buffers_alloc AS buffers_alloc + FROM pg_stat_bgwriter bg + CROSS JOIN pg_stat_checkpointer cp ## @@ -42,27 +92,89 @@ metrics: db_stats: type: row query: - 0: SELECT numbackends, xact_commit, xact_rollback, blks_read, blks_hit, tup_returned, tup_fetched, tup_inserted, tup_updated, tup_deleted, conflicts, temp_files, temp_bytes, deadlocks, 0 AS checksum_failures, blk_read_time, blk_write_time, 0 AS session_time, 0 AS active_time, 0 AS idle_in_transaction_time, 0 AS sessions, 0 AS sessions_abandoned, 0 AS sessions_fatal, 0 AS sessions_killed, extract('epoch' from stats_reset)::float AS stats_reset FROM pg_stat_database WHERE datname = %(dbname)s - 140000: SELECT numbackends, xact_commit, xact_rollback, blks_read, blks_hit, tup_returned, tup_fetched, tup_inserted, tup_updated, tup_deleted, conflicts, temp_files, temp_bytes, deadlocks, COALESCE(checksum_failures, 0) AS checksum_failures, blk_read_time, blk_write_time, session_time, active_time, idle_in_transaction_time, sessions, sessions_abandoned, sessions_fatal, sessions_killed, extract('epoch' from stats_reset)::float AS stats_reset FROM pg_stat_database WHERE datname = %(dbname)s + 0: > + SELECT numbackends, + xact_commit, + xact_rollback, + blks_read, + blks_hit, + tup_returned, + tup_fetched, + tup_inserted, + tup_updated, + tup_deleted, + conflicts, + temp_files, + temp_bytes, + deadlocks, + 0 AS checksum_failures, + blk_read_time, + blk_write_time, + 0 AS session_time, + 0 AS active_time, + 0 AS idle_in_transaction_time, + 0 AS sessions, + 0 AS sessions_abandoned, + 0 AS sessions_fatal, + 0 AS sessions_killed, + extract('epoch' from stats_reset)::float AS stats_reset + FROM pg_stat_database WHERE datname = %(dbname)s + 140000: > + SELECT numbackends, + xact_commit, + xact_rollback, + blks_read, + blks_hit, + tup_returned, + tup_fetched, + tup_inserted, + tup_updated, + tup_deleted, + conflicts, + temp_files, + temp_bytes, + deadlocks, + COALESCE(checksum_failures, + 0) AS checksum_failures, + blk_read_time, + blk_write_time, + session_time, + active_time, + idle_in_transaction_time, + sessions, + sessions_abandoned, + sessions_fatal, + sessions_killed, + extract('epoch' from stats_reset)::float AS stats_reset + FROM pg_stat_database WHERE datname = %(dbname)s test_args: dbname: postgres hit_ratios: type: row query: - 0: SELECT sum(heap_blks_read)::float / NULLIF(sum(heap_blks_read + heap_blks_hit), 0) AS avg_heap_hit_ratio, sum(idx_blks_hit)::float / NULLIF(sum(idx_blks_read + idx_blks_hit), 0) AS avg_idx_hit_ratio, sum(toast_blks_hit)::float / NULLIF(sum(toast_blks_read + toast_blks_hit), 0) AS avg_toast_hit_ratio, sum(tidx_blks_hit)::float / NULLIF(sum(tidx_blks_read + tidx_blks_hit), 0) AS avg_tidx_hit_ratio FROM pg_statio_all_tables + 0: > + SELECT sum(heap_blks_read)::float / NULLIF(sum(heap_blks_read + heap_blks_hit), 0) AS avg_heap_hit_ratio, + sum(idx_blks_hit)::float / NULLIF(sum(idx_blks_read + idx_blks_hit), 0) AS avg_idx_hit_ratio, + sum(toast_blks_hit)::float / NULLIF(sum(toast_blks_read + toast_blks_hit), 0) AS avg_toast_hit_ratio, + sum(tidx_blks_hit)::float / NULLIF(sum(tidx_blks_read + tidx_blks_hit), 0) AS avg_tidx_hit_ratio + FROM pg_statio_all_tables + test_args: + dbname: postgres activity: - type: row + type: set query: 0: > - SELECT count(*) as backend_count, - count(*) FILTER (WHERE state = 'idle') AS idle_count, - count(*) FILTER (WHERE state = 'idle in transaction') AS idle_tx_count, - COALESCE(EXTRACT(EPOCH FROM max(now() - state_change) FILTER (WHERE state = 'active'))::float, 0) AS max_active_time, - COALESCE(EXTRACT(EPOCH FROM max(now() - state_change) FILTER (WHERE state = 'idle in transaction'))::float, 0) AS max_idle_tx_time + SELECT state, + count(*) AS backend_count, + COALESCE(EXTRACT(EPOCH FROM max(now() - state_change))::float, 0) AS max_state_time FROM pg_stat_activity WHERE datname = %(dbname)s + GROUP BY state + test_args: + dbname: postgres + ## # Per-replication metrics @@ -70,7 +182,46 @@ metrics: rep_stats: type: row query: - 90400: SELECT pid, usename, EXTRACT(EPOCH FROM backend_start)::integer AS backend_start, state, sent_lsn, write_lsn, flush_lsn, replay_lsn, COALESCE(EXTRACT(EPOCH FROM write_lag), 0)::integer AS write_lag, COALESCE(EXTRACT(EPOCH FROM flush_lag), 0)::integer AS flush_lag, COALESCE(EXTRACT(EPOCH FROM replay_lag), 0)::integer AS replay_lag, sync_state FROM pg_stat_replication WHERE client_addr || '_' || regexp_replace(application_name, '[ ,]', '_', 'g') = %(repid)s + 90400: > + SELECT pid, usename, + EXTRACT(EPOCH FROM backend_start)::integer AS backend_start, + state, + pg_xlog_location_diff(pg_current_xlog_location(), sent_location) AS sent_lsn, + pg_xlog_location_diff(pg_current_xlog_location(), write_location) AS write_lsn, + pg_xlog_location_diff(pg_current_xlog_location(), flush_location) AS flush_lsn, + pg_xlog_location_diff(pg_current_xlog_location(), replay_location) AS replay_lsn, + NULL AS write_lag, + NULL AS flush_lag, + NULL AS replay_lag, + sync_state + FROM pg_stat_replication + WHERE client_addr || '_' || regexp_replace(application_name, '[ ,]', '_', 'g') = %(repid)s + 90600: > + SELECT pid, usename, + EXTRACT(EPOCH FROM backend_start)::integer AS backend_start, + state, + pg_xlog_lsn_diff(pg_current_xlog_lsn(), sent_lsn) AS sent_lsn, + pg_xlog_lsn_diff(pg_current_xlog_lsn(), write_lsn) AS write_lsn, + pg_xlog_lsn_diff(pg_current_xlog_lsn(), flush_lsn) AS flush_lsn, + pg_xlog_lsn_diff(pg_current_xlog_lsn(), replay_lsn) AS replay_lsn, + COALESCE(EXTRACT(EPOCH FROM write_lag), 0)::integer AS write_lag, + COALESCE(EXTRACT(EPOCH FROM flush_lag), 0)::integer AS flush_lag, + COALESCE(EXTRACT(EPOCH FROM replay_lag), 0)::integer AS replay_lag, + sync_state + FROM pg_stat_replication WHERE client_addr || '_' || regexp_replace(application_name, '[ ,]', '_', 'g') = %(repid)s + 100000: > + SELECT pid, usename, + EXTRACT(EPOCH FROM backend_start)::integer AS backend_start, + state, + pg_wal_lsn_diff(pg_current_wal_lsn(), sent_lsn) AS sent_lsn, + pg_wal_lsn_diff(pg_current_wal_lsn(), write_lsn) AS write_lsn, + pg_wal_lsn_diff(pg_current_wal_lsn(), flush_lsn) AS flush_lsn, + pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn) AS replay_lsn, + COALESCE(EXTRACT(EPOCH FROM write_lag), 0)::integer AS write_lag, + COALESCE(EXTRACT(EPOCH FROM flush_lag), 0)::integer AS flush_lag, + COALESCE(EXTRACT(EPOCH FROM replay_lag), 0)::integer AS replay_lag, + sync_state + FROM pg_stat_replication WHERE client_addr || '_' || regexp_replace(application_name, '[ ,]', '_', 'g') = %(repid)s test_args: repid: 127.0.0.1_test_rep @@ -81,8 +232,24 @@ metrics: slot_stats: type: row query: - 90400: SELECT active_pid, xmin, pg_xlog_location_diff(pg_current_xlog_location(), restart_lsn) AS restart_bytes, pg_xlog_location_diff(pg_current_xlog_location(), confirmed_flush_lsn) AS confirmed_flush_bytes FROM pg_replication_slots WHERE slot_name = %(slot)s - 100000: SELECT active_pid, xmin, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn) AS restart_bytes, pg_wal_lsn_diff(pg_current_wal_lsn(), confirmed_flush_lsn) AS confirmed_flush_bytes FROM pg_replication_slots WHERE slot_name = %(slot)s + 90400: > + SELECT NULL as active_pid, + xmin, + pg_xlog_location_diff(pg_current_xlog_location(), restart_lsn) AS restart_bytes, + NULL AS confirmed_flush_bytes + FROM pg_replication_slots WHERE slot_name = %(slot)s + 90600: > + SELECT active_pid, + xmin, + pg_xlog_location_diff(pg_current_xlog_location(), restart_lsn) AS restart_bytes, + pg_xlog_location_diff(pg_current_xlog_location(), confirmed_flush_lsn) AS confirmed_flush_bytes + FROM pg_replication_slots WHERE slot_name = %(slot)s + 100000: > + SELECT active_pid, + xmin, + pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn) AS restart_bytes, + pg_wal_lsn_diff(pg_current_wal_lsn(), confirmed_flush_lsn) AS confirmed_flush_bytes + FROM pg_replication_slots WHERE slot_name = %(slot)s test_args: slot: test_slot diff --git a/src/pgmon.py b/src/pgmon.py index 2ca313c..67057f5 100755 --- a/src/pgmon.py +++ b/src/pgmon.py @@ -655,6 +655,7 @@ def test_queries(): for name, metric in config["metrics"].items(): # If the metric has arguments to use while testing, grab those args = metric.get("test_args", {}) + print("Testing {} [{}]".format(name, ", ".join(["{}={}".format(key, value) for key, value in args.items()]))) # Run the query without the ability to retry. try: res = sample_metric(dbname, name, args, retry=False) diff --git a/tests/docker-compose.yml b/tests/docker-compose.yml index ae9b5f6..fe2a9c8 100644 --- a/tests/docker-compose.yml +++ b/tests/docker-compose.yml @@ -23,7 +23,7 @@ services: test: [ "CMD-SHELL", "pg_controldata /var/lib/postgresql/data/ | grep -q 'in production'" ] interval: 5s timeout: 2s - retries: 20 + retries: 40 command: > postgres -c ssl=on -c ssl_cert_file='/etc/ssl/certs/ssl-cert-snakeoil.pem' diff --git a/tests/run-tests.sh b/tests/run-tests.sh index 9bf472a..0f1e1a2 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -6,12 +6,15 @@ versions=( $@ ) # If we weren't given any versions, test them all if [ ${#versions[@]} -eq 0 ] then - versions=( 9.2 9.6 10 11 12 13 14 15 16 17 ) + versions=( 9.2 9.4 9.6 10 11 12 13 14 15 16 17 ) fi # Image tags to use declare -A images=() images["9.2"]='9.2' +images["9.3"]='9.3' +images["9.4"]='9.4' +images["9.5"]='9.5' images["9.6"]='9.6-bullseye' images["10"]='10-bullseye' images["11"]='11-bookworm'