From ea397ef88968f4aac5f9b0af663b38c30306995f Mon Sep 17 00:00:00 2001 From: James Campbell Date: Wed, 18 Jun 2025 17:39:16 -0400 Subject: [PATCH] Improve docker-related query test failure avoidance --- src/pgmon.py | 27 ++++++++++++++++++--------- tests/run-tests.sh | 8 -------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/src/pgmon.py b/src/pgmon.py index 2a58d93..71d3491 100755 --- a/src/pgmon.py +++ b/src/pgmon.py @@ -656,15 +656,24 @@ def test_queries(): # If the metric has arguments to use while testing, grab those args = metric.get("test_args", {}) print("Testing {} [{}]".format(name, ", ".join(["{}={}".format(key, value) for key, value in args.items()]))) - # Run the query with the ability to retry, mostly because the PostgreSQL - # docker image restarts during its initialization phase. If the health - # check passes during the first phase, the agent will start testing too - # early and will break when PostgreSQL restarts. - try: - res = sample_metric(dbname, name, args, retry=True) - except MetricVersionError: - print("{} -> Unsupported for this version".format(name)) - continue + # When testing against a docker container, we may end up connecting + # before the service is truly up (it restarts during the initialization + # phase). To cope with this, we'll allow a few connection failures. + tries = 5 + while True: + # Run the query without the ability to retry + try: + res = sample_metric(dbname, name, args, retry=False) + break + except MetricVersionError: + res = "Unsupported for this version" + break + except psycopg2.OperationalError as e: + print("Error encountered, {} tries left: {}".format(tries, e)) + if tries <= 0: + raise + time.sleep(1) + tries -= 1 # Compare the result to the provided sample results # TODO print("{} -> {}".format(name, res)) diff --git a/tests/run-tests.sh b/tests/run-tests.sh index 897d49b..0f1e1a2 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -41,14 +41,6 @@ do # Specify the version we're testing against export PGTAG="${images["$version"]}" - # Start the db container first and wait a moment for it to initialize - # This isn't perfect, but if the health check catches PostgreSQL when it's - # first being initialized, the agent can fail to connect. - # A better solution would probably be to make the agent retry more. - docker compose up -d db - - sleep 2 - # Start the containers docker compose up --exit-code-from=agent agent rc=$?