From ecb616f6d923d4347a3304a9378ae6b28cf85359 Mon Sep 17 00:00:00 2001 From: James Campbell Date: Wed, 18 Jun 2025 03:23:43 -0400 Subject: [PATCH] Allow retry in query tests --- sample-config/pgmon-metrics.yml | 13 ------------- src/pgmon.py | 7 +++++-- 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/sample-config/pgmon-metrics.yml b/sample-config/pgmon-metrics.yml index 070e624..97c8209 100644 --- a/sample-config/pgmon-metrics.yml +++ b/sample-config/pgmon-metrics.yml @@ -196,19 +196,6 @@ metrics: sync_state FROM pg_stat_replication WHERE client_addr || '_' || regexp_replace(application_name, '[ ,]', '_', 'g') = %(repid)s - 90600: > - SELECT pid, usename, - EXTRACT(EPOCH FROM backend_start)::integer AS backend_start, - state, - pg_xlog_lsn_diff(pg_current_xlog_lsn(), sent_lsn) AS sent_lsn, - pg_xlog_lsn_diff(pg_current_xlog_lsn(), write_lsn) AS write_lsn, - pg_xlog_lsn_diff(pg_current_xlog_lsn(), flush_lsn) AS flush_lsn, - pg_xlog_lsn_diff(pg_current_xlog_lsn(), replay_lsn) AS replay_lsn, - COALESCE(EXTRACT(EPOCH FROM write_lag), 0)::integer AS write_lag, - COALESCE(EXTRACT(EPOCH FROM flush_lag), 0)::integer AS flush_lag, - COALESCE(EXTRACT(EPOCH FROM replay_lag), 0)::integer AS replay_lag, - sync_state - FROM pg_stat_replication WHERE client_addr || '_' || regexp_replace(application_name, '[ ,]', '_', 'g') = %(repid)s 100000: > SELECT pid, usename, EXTRACT(EPOCH FROM backend_start)::integer AS backend_start, diff --git a/src/pgmon.py b/src/pgmon.py index 67057f5..2a58d93 100755 --- a/src/pgmon.py +++ b/src/pgmon.py @@ -656,9 +656,12 @@ def test_queries(): # If the metric has arguments to use while testing, grab those args = metric.get("test_args", {}) print("Testing {} [{}]".format(name, ", ".join(["{}={}".format(key, value) for key, value in args.items()]))) - # Run the query without the ability to retry. + # Run the query with the ability to retry, mostly because the PostgreSQL + # docker image restarts during its initialization phase. If the health + # check passes during the first phase, the agent will start testing too + # early and will break when PostgreSQL restarts. try: - res = sample_metric(dbname, name, args, retry=False) + res = sample_metric(dbname, name, args, retry=True) except MetricVersionError: print("{} -> Unsupported for this version".format(name)) continue