Compare commits

No commits in common. "c0e153108310070dce4127a2887e88295f1c5d6d" and "8928bba337143911139d963cbca2b2aff3da50ce" have entirely different histories.

c0e1531083 ... 8928bba337
@@ -19,6 +19,6 @@ start_pre() {
 }
 
 command="/usr/bin/pgmon"
-command_args="-c '$CONFIG_FILE'"
+command_args="'$CONFIG_FILE'"
 command_background="true"
 command_user="${PGMON_USER}:${PGMON_GROUP}"
@@ -11,8 +11,7 @@ metrics:
   discover_slots:
     type: set
     query:
-      0: SELECT slot_name, plugin, slot_type, database, false as temporary, active FROM pg_replication_slots
-      100000: SELECT slot_name, plugin, slot_type, database, temporary, active FROM pg_replication_slots
+      0: SELECT slot_name, plugin, slot_type, database, temporary, active FROM pg_replication_slots
 
   # cluster-wide metrics
   version:
@@ -30,8 +29,6 @@ metrics:
     query:
       0: SELECT numbackends, xact_commit, xact_rollback, blks_read, blks_hit, tup_returned, tup_fetched, tup_inserted, tup_updated, tup_deleted, conflicts, temp_files, temp_bytes, deadlocks, blk_read_time, blk_write_time, extract('epoch' from stats_reset)::float FROM pg_stat_database WHERE datname = %(dbname)s
       140000: SELECT numbackends, xact_commit, xact_rollback, blks_read, blks_hit, tup_returned, tup_fetched, tup_inserted, tup_updated, tup_deleted, conflicts, temp_files, temp_bytes, deadlocks, COALESCE(checksum_failures, 0) AS checksum_failures, blk_read_time, blk_write_time, session_time, active_time, idle_in_transaction_time, sessions, sessions_abandoned, sessions_fatal, sessions_killed, extract('epoch' from stats_reset)::float FROM pg_stat_database WHERE datname = %(dbname)s
-    test_args:
-      dbname: postgres
 
   # Debugging
   ntables:
@@ -43,9 +40,7 @@ metrics:
   rep_stats:
     type: row
     query:
-      0: SELECT * FROM pg_stat_replication WHERE client_addr || '_' || regexp_replace(application_name, '[ ,]', '_', 'g') = '{repid}'
-    test_args:
-      repid: 127.0.0.1_test_rep
+      0: SELECT * FROM pg_stat_database WHERE client_addr || '_' || regexp_replace(application_name, '[ ,]', '_', 'g') = '{repid}'
 
   # Debugging
   sleep:
@@ -57,7 +52,4 @@ metrics:
   slot_stats:
     type: row
    query:
-      0: SELECT active_pid, xmin, pg_xlog_location_diff(pg_current_xlog_location(), restart_lsn) AS restart_bytes, pg_xlog_location_diff(pg_current_xlog_location(), confirmed_flush_lsn) AS confirmed_flush_bytes FROM pg_replication_slots WHERE slot_name = '{slot}'
-      100000: SELECT active_pid, xmin, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn) AS restart_bytes, pg_wal_lsn_diff(pg_current_wal_lsn(), confirmed_flush_lsn) AS confirmed_flush_bytes FROM pg_replication_slots WHERE slot_name = '{slot}'
-    test_args:
-      slot: test_slot
+      0: SELECT active_pid, xmin, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn) AS restart_bytes, pg_wal_lsn_diff(pg_current_wal_lsn(), confirmed_flush_lsn) AS confirmed_flush_bytes FROM pg_replication_slots WHERE slot_name = '{slot}'
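Note: the numeric keys under each metric's query map are PostgreSQL versions in server_version_num form (0, 100000, 140000, ...); the agent appears to pick the entry with the highest key that does not exceed the running server's version, the same rule the new tests/sql-tests.py applies to its expect blocks. A minimal sketch of that lookup in Python, under that assumption (the helper name resolve_query is illustrative, not the agent's actual get_query):

    def resolve_query(metric, version_num):
        # YAML keys may load as ints or strings; normalize to int for comparison.
        candidates = {int(v): q for v, q in metric["query"].items()}
        # Keep only versions that are <= the server we are talking to.
        usable = [v for v in candidates if v <= version_num]
        if not usable:
            raise KeyError("no query variant suitable for version {}".format(version_num))
        # The highest remaining key is the most specific match.
        return candidates[max(usable)]

    # e.g. resolve_query({"query": {0: "SELECT 1", 140000: "SELECT 2"}}, 130000) -> "SELECT 1"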
@@ -1,6 +1,3 @@
-# The address the agent binds to
-#address: 127.0.0.1
-
 # The port the agent listens on for requests
 #port: 5400
 
@@ -29,9 +26,6 @@
 # Default database to connect to when none is specified for a metric
 #dbname: 'postgres'
 
-# SSL connection mode
-#ssl_mode: require
-
 # Timeout for getting a connection slot from a pool
 #pool_slot_timeout: 5
 
149  src/pgmon.py
@@ -4,7 +4,6 @@ import yaml
 import json
 import time
 import os
-import sys
 
 import argparse
 import logging
@@ -75,18 +74,12 @@ class UnhappyDBError(Exception):
     pass
 
 
-class UnknownMetricError(Exception):
-    pass
-
-
 class MetricVersionError(Exception):
     pass
 
 
 # Default config settings
 default_config = {
-    # The address the agent binds to
-    "address": "127.0.0.1",
     # The port the agent listens on for requests
     "port": 5400,
     # Min PostgreSQL connection pool size (per database)
@@ -105,8 +98,6 @@ default_config = {
     "dbport": 5432,
     # Default database to connect to when none is specified for a metric
     "dbname": "postgres",
-    # SSL connection mode
-    "ssl_mode": "require",
     # Timeout for getting a connection slot from a pool
     "pool_slot_timeout": 5,
     # PostgreSQL connection timeout (seconds)
@@ -329,7 +320,6 @@ def get_pool(dbname):
     # lock
     if dbname not in connections:
         log.info("Creating connection pool for: {}".format(dbname))
-        # Actually create the connection pool
         connections[dbname] = ConnectionPool(
             dbname,
             int(config["min_pool_size"]),
@@ -339,7 +329,7 @@ def get_pool(dbname):
             port=config["dbport"],
             user=config["dbuser"],
             connect_timeout=int(config["connect_timeout"]),
-            sslmode=config["ssl_mode"],
+            sslmode="require",
         )
         # Clear the unhappy indicator if present
         unhappy_cooldown.pop(dbname, None)
@@ -387,16 +377,10 @@ def run_query_no_retry(pool, return_type, query, args):
         res = curs.fetchall()
 
         if return_type == "value":
-            if len(res) == 0:
-                return ""
             return str(list(res[0].values())[0])
         elif return_type == "row":
-            if len(res) == 0:
-                return "[]"
             return json.dumps(res[0])
         elif return_type == "column":
-            if len(res) == 0:
-                return "[]"
             return json.dumps([list(r.values())[0] for r in res])
        elif return_type == "set":
             return json.dumps(res)
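Note: the four return_type values serialize a result set differently: "value" returns the first column of the first row as a string, "row" the first row as a JSON object, "column" the first column of every row as a JSON array, and "set" all rows as a JSON array of objects. A standalone sketch of that mapping, using a plain list of dicts in place of cursor rows (illustrative only, not the module's own code path):

    import json

    def serialize(res, return_type):
        # res stands in for curs.fetchall(), e.g. [{"slot_name": "s1", "active": True}]
        if return_type == "value":
            return str(list(res[0].values())[0])
        elif return_type == "row":
            return json.dumps(res[0])
        elif return_type == "column":
            return json.dumps([list(r.values())[0] for r in res])
        elif return_type == "set":
            return json.dumps(res)

    # serialize([{"slot_name": "s1", "active": True}], "column") -> '["s1"]'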
@@ -404,7 +388,7 @@ def run_query_no_retry(pool, return_type, query, args):
         dbname = pool.name
         if dbname in unhappy_cooldown:
             raise UnhappyDBError()
-        elif conn.closed != 0:
+        elif conn.broken:
             raise DisconnectedError()
         else:
             raise
@@ -482,53 +466,6 @@ def get_cluster_version():
     return cluster_version
 
 
-def sample_metric(dbname, metric_name, args, retry=True):
-    """
-    Run the appropriate query for the named metric against the specified database
-    """
-    # Get the metric definition
-    try:
-        metric = config["metrics"][metric_name]
-    except KeyError:
-        raise UnknownMetricError("Unknown metric: {}".format(metric_name))
-
-    # Get the connection pool for the database, or create one if it doesn't
-    # already exist.
-    pool = get_pool(dbname)
-
-    # Identify the PostgreSQL version
-    version = get_cluster_version()
-
-    # Get the query version
-    query = get_query(metric, version)
-
-    # Execute the quert
-    if retry:
-        return run_query(pool, metric["type"], query, args)
-    else:
-        return run_query_no_retry(pool, metric["type"], query, args)
-
-
-def test_queries():
-    """
-    Run all of the metric queries against a database and check the results
-    """
-    # We just use the default db for tests
-    dbname = config["dbname"]
-    # Loop through all defined metrics.
-    for name, metric in config["metrics"].items():
-        # If the metric has arguments to use while testing, grab those
-        args = metric.get("test_args", {})
-        # Run the query without the ability to retry.
-        res = sample_metric(dbname, name, args, retry=False)
-        # Compare the result to the provided sample results
-        # TODO
-        print("{} -> {}".format(name, res))
-    # Return the number of errors
-    # TODO
-    return 0
-
-
 class SimpleHTTPRequestHandler(BaseHTTPRequestHandler):
     """
     This is our request handling server. It is responsible for listening for
@@ -557,10 +494,10 @@ class SimpleHTTPRequestHandler(BaseHTTPRequestHandler):
         """
         # Parse the URL
         parsed_path = urlparse(self.path)
-        metric_name = parsed_path.path.strip("/")
+        name = parsed_path.path.strip("/")
         parsed_query = parse_qs(parsed_path.query)
 
-        if metric_name == "agent_version":
+        if name == "agent_version":
             self._reply(200, VERSION)
             return
 
@@ -568,31 +505,60 @@
         # single values, just grab the first from each.
         args = {key: values[0] for key, values in parsed_query.items()}
 
+        # Get the metric definition
+        try:
+            metric = config["metrics"][name]
+        except KeyError:
+            log.error("Unknown metric: {}".format(name))
+            self._reply(404, "Unknown metric")
+            return
+
         # Get the dbname. If none was provided, use the default from the
         # config.
         dbname = args.get("dbname", config["dbname"])
 
-        # Sample the metric
+        # Get the connection pool for the database, or create one if it doesn't
+        # already exist.
         try:
-            self._reply(200, sample_metric(dbname, metric_name, args))
-            return
-        except UnknownMetricError as e:
-            log.error("Unknown metric: {}".format(metric_name))
-            self._reply(404, "Unknown metric")
-            return
-        except MetricVersionError as e:
-            log.error(
-                "Failed to find a version of {} for {}".format(metric_name, version)
-            )
-            self._reply(404, "Unsupported version")
-            return
-        except UnhappyDBError as e:
+            pool = get_pool(dbname)
+        except UnhappyDBError:
             log.info("Database {} is unhappy, please be patient".format(dbname))
             self._reply(503, "Database unavailable")
             return
+
+        # Identify the PostgreSQL version
+        try:
+            version = get_cluster_version()
+        except UnhappyDBError:
+            return
         except Exception as e:
-            log.error("Error running query: {}".format(e))
-            self._reply(500, "Unexpected error: {}".format(e))
+            if dbname in unhappy_cooldown:
+                log.info("Database {} is unhappy, please be patient".format(dbname))
+                self._reply(503, "Database unavailable")
+            else:
+                log.error("Failed to get PostgreSQL version: {}".format(e))
+                self._reply(500, "Error getting DB version")
+            return
+
+        # Get the query version
+        try:
+            query = get_query(metric, version)
+        except KeyError:
+            log.error("Failed to find a version of {} for {}".format(name, version))
+            self._reply(404, "Unsupported version")
+            return
+
+        # Execute the quert
+        try:
+            self._reply(200, run_query(pool, metric["type"], query, args))
+            return
+        except Exception as e:
+            if dbname in unhappy_cooldown:
+                log.info("Database {} is unhappy, please be patient".format(dbname))
+                self._reply(503, "Database unavailable")
+            else:
+                log.error("Error running query: {}".format(e))
+                self._reply(500, "Error running query")
             return
 
     def _reply(self, code, content):
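Note: after this rewrite, do_GET resolves the metric from the URL path and the target database from the dbname query parameter, replying 200 with the result, 404 for an unknown metric or unsupported version, 503 while the database is on the unhappy-cooldown list, and 500 otherwise. A hedged client-side example, assuming the agent is running locally on the default port 5400 with the bundled metrics file loaded:

    import requests

    # Built-in metric: the agent reports its own version
    print(requests.get("http://127.0.0.1:5400/agent_version").text)

    # A configured metric; dbname falls back to the config default when omitted
    r = requests.get("http://127.0.0.1:5400/version", params={"dbname": "postgres"})
    print(r.status_code, r.text)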
@@ -613,17 +579,12 @@ if __name__ == "__main__":
     )
 
     parser.add_argument(
-        "-c",
-        "--config_file",
+        "config_file",
         default="pgmon.yml",
         nargs="?",
         help="The config file to read (default: %(default)s)",
     )
 
-    parser.add_argument(
-        "-t", "--test", action="store_true", help="Run query tests and exit"
-    )
-
     args = parser.parse_args()
 
     # Set the config file path
@@ -632,16 +593,8 @@ if __name__ == "__main__":
     # Read the config file
     read_config(config_file)
 
-    # Run query tests and exit if test mode is enabled
-    if args.test:
-        errors = test_queries()
-        if errors > 0:
-            sys.exit(1)
-        else:
-            sys.exit(0)
-
     # Set up the http server to receive requests
-    server_address = (config["address"], config["port"])
+    server_address = ("127.0.0.1", config["port"])
     httpd = ThreadingHTTPServer(server_address, SimpleHTTPRequestHandler)
 
     # Set up the signal handler
@@ -7,7 +7,7 @@ After=network.target
 [Service]
 EnvironmentFile=/etc/pgmon/%i-service.conf
 User=${SERVICE_USER:-postgres}
-ExecStart=/usr/local/bin/pgmon -c /etc/pgmon/%i.yml
+ExecStart=/usr/local/bin/pgmon /etc/pgmon/%i.yml
 ExecReload=kill -HUP $MAINPID
 Restart=on-failure
 Type=exec
@@ -1,22 +0,0 @@
-FROM alpine:3.21
-
-RUN apk update && \
-    apk add py3-psycopg2 \
-        py3-yaml \
-        tini
-
-WORKDIR /app
-
-COPY src/pgmon.py /app/
-
-COPY sample-config/pgmon-metrics.yml /app/
-
-COPY tests/test-config.yml /app/
-
-COPY --chmod=0600 --chown=postgres:postgres tests/pgpass /root/.pgpass
-
-ENTRYPOINT ["tini", "--"]
-
-EXPOSE 5400
-
-CMD ["/app/pgmon.py", "-c", "/app/test-config.yml", "--test"]
@@ -1,32 +0,0 @@
----
-
-services:
-  agent:
-    image: pgmon
-    build:
-      context: ..
-      dockerfile: tests/Dockerfile
-    ports:
-      - :5400
-    depends_on:
-      db:
-        condition: service_healthy
-
-  db:
-    image: "postgres:${PGTAG:-17-bookworm}"
-    ports:
-      - :5432
-    environment:
-      POSTGRES_PASSWORD: secret
-    healthcheck:
-      #test: [ "CMD", "pg_isready", "-U", "postgres" ]
-      test: [ "CMD-SHELL", "pg_controldata /var/lib/postgresql/data/ | grep -q 'in production'" ]
-      interval: 5s
-      timeout: 2s
-      retries: 10
-    command: >
-      postgres -c ssl=on
-      -c ssl_cert_file='/etc/ssl/certs/ssl-cert-snakeoil.pem'
-      -c ssl_key_file='/etc/ssl/private/ssl-cert-snakeoil.key'
-      -c listen_addresses='*'
-
@@ -1 +0,0 @@
-db:5432:*:postgres:secret
@@ -1,62 +0,0 @@
-#!/bin/bash
-
-# Versions to test
-versions=( $@ )
-
-# If we weren't given any versions, test them all
-if [ ${#versions[@]} -eq 0 ]
-then
-    versions=( 9.2 9.6 10 11 12 13 14 15 16 17 )
-fi
-
-# Image tags to use
-declare -A images=()
-images["9.2"]='9.2'
-images["9.6"]='9.6-bullseye'
-images["10"]='10-bullseye'
-images["11"]='11-bookworm'
-images["12"]='12-bookworm'
-images["13"]='13-bookworm'
-images["14"]='14-bookworm'
-images["15"]='15-bookworm'
-images["16"]='16-bookworm'
-images["17"]='17-bookworm'
-
-declare -A results=()
-
-# Make sure everything's down to start with
-docker compose down
-
-# Make sure our agent container is up to date
-docker compose build agent
-
-for version in "${versions[@]}"
-do
-    echo
-    echo "Testing: PostgreSQL ${version}"
-
-    # Specify the version we're testing against
-    export PGTAG="${images["$version"]}"
-
-    # Start the containers
-    docker compose up --exit-code-from=agent agent
-    rc=$?
-
-    results["$version"]=$rc
-
-    # Destroy the containers
-    docker compose down
-done
-
-echo
-echo
-for v in "${versions[@]}"
-do
-    case "${results["$v"]}" in
-        0) msg="OK" ;;
-        1) msg="Query failure detected" ;;
-        18) msg="Docker image error: 18" ;;
-        *) msg="Unexpected error: ${results["$v"]}" ;;
-    esac
-    echo "$v -> $msg"
-done
94  tests/sql-tests.py (new file)
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+
+from testcontainers import PostgresContainer
+import requests
+import yaml
+import sys
+
+pg_versions = [9.2, 9.6, 10, 11, 12, 13, 14, 15, 16, 17]
+
+pgmon_port = 93849
+
+tests = {}
+
+container = None
+
+
+def std_version(version):
+    if version[0] == "9":
+        return int(f"{version[0]}0{version[1]}00")
+    else:
+        return int(f"{version}0000")
+
+
+def run_test(metric, params, status, check):
+    """
+    Validate the return code and restults of a query
+
+    params:
+      metric: The name of the metric to test
+      params: A dictionary of query parameters to use when testing
+      status: The expected status code
+      check: A regular expression to validate the results (or None)
+    """
+    result = requests.get(f"http://localhost:{pgmon_port}/{metric}", params=params)
+
+    if result.status_code != status:
+        print(
+            f"FAIL: {metric}[{params}] returned wrong status code: {result.status_code}"
+        )
+        return False
+
+    if re.match(check, result.text):
+        print(f"SUCCESS: {metric}[{params}]")
+        return True
+    else:
+        print(f"FAIL: {metric}[{params}] result is invalid, got:\n {result.text}")
+        return False
+
+
+def run_all_tests(version):
+    """
+    Run all defined tests against the current running instance
+
+    params:
+      version: The PostgreSQL version currently being tested (server_version_num format)
+    """
+    errors = 0
+
+    # Convert versions like 12 to 120000
+    version_num = std_version(version)
+
+    # Loop through all of the metrics to test.
+    for metric in tests.keys():
+        params = metric.get("params", {})
+        status = 200
+        check = ""
+
+        # Find the correct version of the status and check parameters (assuming there are any).
+        # If there are any check conditions, find the highest version that does not exceed the version we're currently testing against.
+        # To do this, we order the keys (versions) in reverse, so we start with the highest.
+        for v in reversed(sorted(metric.get("expect", {}).keys())):
+            # If we've reached a version <= the one we're testing use it.
+            if int(v) <= version_num:
+                status = metric["expect"][v]["status"]
+                check = metric["expect"][v]["check"]
+                break
+
+        if not run_test(metric, metrics[metric].get(params, {}), status, check):
+            errors += 1
+
+    return errors
+
+
+def start_test_db(version):
+    # container = PostgresContainer()
+    pass
+
+
+# Read the test script
+try:
+    with open("metric_tests.yml", "r") as f:
+        tests = yaml.safe_load(f)
+except yaml.parser.ParserError as e:
+    sys.exit("Failed to parse metrics_test.yml: {e}")
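Note: the new test driver reads a metric_tests.yml file that is not included in this diff. Judging from run_all_tests, each top-level entry names a metric and may carry params plus an expect map keyed by server_version_num with an expected status and a regex check. A hypothetical illustration of that structure as the Python dict yaml.safe_load would produce (the metric names come from the sample metrics file; the status and check values are made up):

    tests = {
        "version": {
            "params": {},
            "expect": {
                0: {"status": 200, "check": r"^\d+"},
            },
        },
        "slot_stats": {
            "params": {"slot": "test_slot"},
            "expect": {
                100000: {"status": 200, "check": r"^\{.*\}$"},
            },
        },
    }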
@@ -1,16 +0,0 @@
----
-
-# Bind to all interfaces so we can submit requests from outside the test container
-address: 0.0.0.0
-
-# We always just connect to the db container
-dbhost: db
-dbport: 5432
-dbuser: postgres
-
-# Allow some insecure SSL parameters for the 9.2 test
-ssl_ciphers: DEFAULT@SECLEVEL=1
-
-# Pull in the standard metrics
-include:
-  - pgmon-metrics.yml