diff --git a/sample-config/pgmon-metrics.yml b/sample-config/pgmon-metrics.yml index 070e624..97c8209 100644 --- a/sample-config/pgmon-metrics.yml +++ b/sample-config/pgmon-metrics.yml @@ -196,19 +196,6 @@ metrics: sync_state FROM pg_stat_replication WHERE client_addr || '_' || regexp_replace(application_name, '[ ,]', '_', 'g') = %(repid)s - 90600: > - SELECT pid, usename, - EXTRACT(EPOCH FROM backend_start)::integer AS backend_start, - state, - pg_xlog_lsn_diff(pg_current_xlog_lsn(), sent_lsn) AS sent_lsn, - pg_xlog_lsn_diff(pg_current_xlog_lsn(), write_lsn) AS write_lsn, - pg_xlog_lsn_diff(pg_current_xlog_lsn(), flush_lsn) AS flush_lsn, - pg_xlog_lsn_diff(pg_current_xlog_lsn(), replay_lsn) AS replay_lsn, - COALESCE(EXTRACT(EPOCH FROM write_lag), 0)::integer AS write_lag, - COALESCE(EXTRACT(EPOCH FROM flush_lag), 0)::integer AS flush_lag, - COALESCE(EXTRACT(EPOCH FROM replay_lag), 0)::integer AS replay_lag, - sync_state - FROM pg_stat_replication WHERE client_addr || '_' || regexp_replace(application_name, '[ ,]', '_', 'g') = %(repid)s 100000: > SELECT pid, usename, EXTRACT(EPOCH FROM backend_start)::integer AS backend_start, diff --git a/src/pgmon.py b/src/pgmon.py index 67057f5..2a58d93 100755 --- a/src/pgmon.py +++ b/src/pgmon.py @@ -656,9 +656,12 @@ def test_queries(): # If the metric has arguments to use while testing, grab those args = metric.get("test_args", {}) print("Testing {} [{}]".format(name, ", ".join(["{}={}".format(key, value) for key, value in args.items()]))) - # Run the query without the ability to retry. + # Run the query with the ability to retry, mostly because the PostgreSQL + # docker image restarts during its initialization phase. If the health + # check passes during the first phase, the agent will start testing too + # early and will break when PostgreSQL restarts. try: - res = sample_metric(dbname, name, args, retry=False) + res = sample_metric(dbname, name, args, retry=True) except MetricVersionError: print("{} -> Unsupported for this version".format(name)) continue