Compare commits
No commits in common. "15097dcba41702683bc1158c85bd664bcec31047" and "030afafc207cd6b5e8187359b2086dde2ef38d74" have entirely different histories.
15097dcba4 ... 030afafc20
@@ -1,73 +0,0 @@
-# Copyright 2024 Gentoo Authors
-# Distributed under the terms of the GNU General Public License v2
-
-EAPI=8
-
-PYTHON_COMPAT=( python3_{6..13} )
-
-inherit python-r1
-
-DESCRIPTION="PostgreSQL monitoring bridge"
-HOMEPAGE="None"
-
-LICENSE="BSD"
-SLOT="0"
-KEYWORDS="amd64"
-
-SRC_URI="https://code2.shh-dot-com.org/james/${PN}/archive/v${PV}.tar.bz2 -> ${P}.tar.bz2"
-
-IUSE="-systemd"
-
-DEPEND="
-	${PYTHON_DEPS}
-	dev-python/psycopg:2
-	dev-python/pyyaml
-	app-admin/logrotate
-"
-RDEPEND="${DEPEND}"
-BDEPEND=""
-
-RESTRICT="fetch"
-
-#S="${WORKDIR}/${PN}"
-
-pkg_nofetch() {
-	einfo "Please download"
-	einfo " - ${P}.tar.bz2"
-	einfo "from ${HOMEPAGE} and place it in your DISTDIR directory."
-	einfo "The file should be owned by portage:portage."
-}
-
-src_compile() {
-	true
-}
-
-src_install() {
-	# Install init script
-	if ! use systemd ; then
-		newinitd "openrc/pgmon.initd" pgmon
-		newconfd "openrc/pgmon.confd" pgmon
-	fi
-
-	# Install systemd unit
-	if use systemd ; then
-		systemd_dounit "systemd/pgmon.service"
-	fi
-
-	# Install script
-	exeinto /usr/bin
-	newexe "src/pgmon.py" pgmon
-
-	# Install default config
-	diropts -o root -g root -m 0755
-	insinto /etc/pgmon
-	doins "sample-config/pgmon.yml"
-	doins "sample-config/pgmon-metrics.yml"
-
-	# Install logrotate config
-	insinto /etc/logrotate.d
-	newins "logrotate/pgmon.logrotate" pgmon
-
-	# Install man page
-	doman manpages/pgmon.1
-}
Makefile (16 lines changed)
@@ -1,10 +1,9 @@
 # Package details
 PACKAGE_NAME := pgmon
+VERSION := 1.0.1
 
 SCRIPT := src/$(PACKAGE_NAME).py
 
-VERSION := $(shell grep -m 1 '^VERSION = ' "$(SCRIPT)" | sed -ne 's/.*"\(.*\)".*/\1/p')
-
 # Where packages are built
 BUILD_DIR := build
 
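Note: the dropped $(shell ...) rule derived the package version from the VERSION = "..." line in src/pgmon.py, which the right-hand side replaces with a hardcoded value. A rough Python sketch of the same extraction, assuming it is run from the repository root (illustrative only, not code from this repository):

import re

# Find the first VERSION = "..." line and pull out the quoted value,
# the same thing the grep | sed pipeline did.
with open("src/pgmon.py") as f:
    for line in f:
        if line.startswith("VERSION = "):
            print(re.search(r'"(.*)"', line).group(1))  # e.g. 1.0.2
            break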
@@ -23,7 +22,7 @@ SUPPORTED := ubuntu-20.04 \
 # These targets are the main ones to use for most things.
 ##
 
-.PHONY: all clean tgz test query-tests install
+.PHONY: all clean tgz test install
 
 
 # Build all packages
@@ -65,10 +64,6 @@ clean:
 test:
 	cd src ; python3 -m unittest
 
-# Run query tests
-query-tests:
-	cd tests ; ./run-tests.sh
-
 # Install the script at the specified base directory
 install:
 	# Set up directories
@@ -97,7 +92,7 @@ install:
 
 
 # Run all of the install tests
-.PHONY: install-tests debian-%-install-test rockylinux-%-install-test ubuntu-%-install-test gentoo-install-test
+.PHONY: install-tests debian-%-install-test rockylinux-%-install-test ubuntu-%-install-test
 install-tests: $(foreach distro_release, $(SUPPORTED), $(distro_release)-install-test)
 
 
@@ -129,11 +124,6 @@ oraclelinux-%-install-test:
 		oraclelinux:7 \
 		bash -c 'yum makecache && yum install -y /output/$(PACKAGE_NAME)-$(VERSION)-1.el7.noarch.rpm'
 
-# Run a Gentoo install test
-gentoo-install-test:
-	# May impliment this in the future, but would require additional headaches to set up a repo
-	true
-
 ##
 # Container targets
 #
@@ -19,6 +19,6 @@ start_pre() {
 }
 
 command="/usr/bin/pgmon"
-command_args="-c '$CONFIG_FILE'"
+command_args="'$CONFIG_FILE'"
 command_background="true"
 command_user="${PGMON_USER}:${PGMON_GROUP}"
@@ -1,4 +0,0 @@
--r requirements.txt
-testcontainers[postgresql]
-pytest
-black
@@ -11,8 +11,7 @@ metrics:
   discover_slots:
     type: set
     query:
-      90400: SELECT slot_name, plugin, slot_type, database, false as temporary, active FROM pg_replication_slots
-      100000: SELECT slot_name, plugin, slot_type, database, temporary, active FROM pg_replication_slots
+      0: SELECT slot_name, plugin, slot_type, database, temporary, active FROM pg_replication_slots
 
   # cluster-wide metrics
   version:
@@ -20,9 +19,9 @@ metrics:
     query:
       0: SHOW server_version_num
   max_frozen_age:
-    type: row
+    type: value
     query:
-      0: SELECT max(age(datfrozenxid)), max(mxid_age(datminmxid)) FROM pg_database
+      0: SELECT max(age(datfrozenxid)) FROM pg_database
 
   # Per-database metrics
   db_stats:
@@ -30,8 +29,6 @@ metrics:
     query:
       0: SELECT numbackends, xact_commit, xact_rollback, blks_read, blks_hit, tup_returned, tup_fetched, tup_inserted, tup_updated, tup_deleted, conflicts, temp_files, temp_bytes, deadlocks, blk_read_time, blk_write_time, extract('epoch' from stats_reset)::float FROM pg_stat_database WHERE datname = %(dbname)s
       140000: SELECT numbackends, xact_commit, xact_rollback, blks_read, blks_hit, tup_returned, tup_fetched, tup_inserted, tup_updated, tup_deleted, conflicts, temp_files, temp_bytes, deadlocks, COALESCE(checksum_failures, 0) AS checksum_failures, blk_read_time, blk_write_time, session_time, active_time, idle_in_transaction_time, sessions, sessions_abandoned, sessions_fatal, sessions_killed, extract('epoch' from stats_reset)::float FROM pg_stat_database WHERE datname = %(dbname)s
-    test_args:
-      dbname: postgres
 
   # Debugging
   ntables:
@@ -43,9 +40,7 @@ metrics:
   rep_stats:
     type: row
     query:
-      90400: SELECT * FROM pg_stat_replication WHERE client_addr || '_' || regexp_replace(application_name, '[ ,]', '_', 'g') = '{repid}'
-    test_args:
-      repid: 127.0.0.1_test_rep
+      0: SELECT * FROM pg_stat_database WHERE client_addr || '_' || regexp_replace(application_name, '[ ,]', '_', 'g') = '{repid}'
 
   # Debugging
   sleep:
@@ -57,7 +52,4 @@ metrics:
   slot_stats:
     type: row
     query:
-      90400: SELECT active_pid, xmin, pg_xlog_location_diff(pg_current_xlog_location(), restart_lsn) AS restart_bytes, pg_xlog_location_diff(pg_current_xlog_location(), confirmed_flush_lsn) AS confirmed_flush_bytes FROM pg_replication_slots WHERE slot_name = '{slot}'
-      100000: SELECT active_pid, xmin, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn) AS restart_bytes, pg_wal_lsn_diff(pg_current_wal_lsn(), confirmed_flush_lsn) AS confirmed_flush_bytes FROM pg_replication_slots WHERE slot_name = '{slot}'
-    test_args:
-      slot: test_slot
+      0: SELECT active_pid, xmin, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn) AS restart_bytes, pg_wal_lsn_diff(pg_current_wal_lsn(), confirmed_flush_lsn) AS confirmed_flush_bytes FROM pg_replication_slots WHERE slot_name = '{slot}'
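Note: each metric's query map is keyed by a minimum server_version_num (90400 is 9.4, 100000 is 10, 140000 is 14), with 0 matching any version; pgmon.py resolves the query through get_query(metric, version) after get_cluster_version(). The body of get_query is not part of this diff, so the following is only a sketch of the likely lookup, assuming it picks the newest entry whose key the server satisfies and raises KeyError otherwise (matching the handler's "Unsupported version" path):

def pick_query(metric, server_version_num):
    # Keys are minimum server versions; take the newest one the server satisfies.
    candidates = [v for v in metric["query"] if int(v) <= server_version_num]
    if not candidates:
        raise KeyError("no query for version {}".format(server_version_num))
    return metric["query"][max(candidates, key=int)]

metric = {
    "type": "value",
    "query": {
        0: "SELECT max(age(datfrozenxid)) FROM pg_database",
        140000: "SELECT 'hypothetical 14+ variant'",
    },
}
print(pick_query(metric, 130000))  # falls back to the 0 entry
print(pick_query(metric, 150000))  # uses the 140000 entry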
@@ -1,6 +1,3 @@
-# The address the agent binds to
-#address: 127.0.0.1
-
 # The port the agent listens on for requests
 #port: 5400
 
@@ -29,9 +26,6 @@
 # Default database to connect to when none is specified for a metric
 #dbname: 'postgres'
 
-# SSL connection mode
-#ssl_mode: require
-
 # Timeout for getting a connection slot from a pool
 #pool_slot_timeout: 5
 
src/pgmon.py (149 lines changed)
@@ -4,7 +4,6 @@ import yaml
 import json
 import time
 import os
-import sys
 
 import argparse
 import logging
@@ -24,7 +23,7 @@ from http.server import BaseHTTPRequestHandler, HTTPServer
 from http.server import ThreadingHTTPServer
 from urllib.parse import urlparse, parse_qs
 
-VERSION = "1.0.2"
+VERSION = "1.0.1"
 
 # Configuration
 config = {}
@@ -75,18 +74,12 @@ class UnhappyDBError(Exception):
     pass
 
 
-class UnknownMetricError(Exception):
-    pass
-
-
 class MetricVersionError(Exception):
     pass
 
 
 # Default config settings
 default_config = {
-    # The address the agent binds to
-    "address": "127.0.0.1",
     # The port the agent listens on for requests
     "port": 5400,
     # Min PostgreSQL connection pool size (per database)
@@ -105,8 +98,6 @@ default_config = {
     "dbport": 5432,
     # Default database to connect to when none is specified for a metric
     "dbname": "postgres",
-    # SSL connection mode
-    "ssl_mode": "require",
    # Timeout for getting a connection slot from a pool
     "pool_slot_timeout": 5,
     # PostgreSQL connection timeout (seconds)
@@ -329,7 +320,6 @@ def get_pool(dbname):
     # lock
     if dbname not in connections:
         log.info("Creating connection pool for: {}".format(dbname))
-        # Actually create the connection pool
         connections[dbname] = ConnectionPool(
             dbname,
             int(config["min_pool_size"]),
@@ -339,7 +329,7 @@ def get_pool(dbname):
             port=config["dbport"],
             user=config["dbuser"],
             connect_timeout=int(config["connect_timeout"]),
-            sslmode=config["ssl_mode"],
+            sslmode="require",
         )
         # Clear the unhappy indicator if present
         unhappy_cooldown.pop(dbname, None)
@@ -387,16 +377,10 @@ def run_query_no_retry(pool, return_type, query, args):
             res = curs.fetchall()
 
             if return_type == "value":
-                if len(res) == 0:
-                    return ""
                 return str(list(res[0].values())[0])
             elif return_type == "row":
-                if len(res) == 0:
-                    return "[]"
                 return json.dumps(res[0])
             elif return_type == "column":
-                if len(res) == 0:
-                    return "[]"
                 return json.dumps([list(r.values())[0] for r in res])
             elif return_type == "set":
                 return json.dumps(res)
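Note: the len(res) == 0 guards exist only on the left-hand side. Without them, a metric whose query returns no rows (for example slot_stats for a slot that does not exist) makes res[0] raise IndexError instead of returning an empty value. A minimal illustration, not code from the repository:

res = []           # what fetchall() gives back when the query matches nothing
try:
    print(res[0])  # the unguarded code path raises IndexError here
except IndexError:
    print("[]")    # the guarded version short-circuits to "" or "[]" instead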
@@ -404,7 +388,7 @@ def run_query_no_retry(pool, return_type, query, args):
         dbname = pool.name
         if dbname in unhappy_cooldown:
             raise UnhappyDBError()
-        elif conn.closed != 0:
+        elif conn.broken:
             raise DisconnectedError()
         else:
             raise
@@ -482,53 +466,6 @@ def get_cluster_version():
     return cluster_version
 
 
-def sample_metric(dbname, metric_name, args, retry=True):
-    """
-    Run the appropriate query for the named metric against the specified database
-    """
-    # Get the metric definition
-    try:
-        metric = config["metrics"][metric_name]
-    except KeyError:
-        raise UnknownMetricError("Unknown metric: {}".format(metric_name))
-
-    # Get the connection pool for the database, or create one if it doesn't
-    # already exist.
-    pool = get_pool(dbname)
-
-    # Identify the PostgreSQL version
-    version = get_cluster_version()
-
-    # Get the query version
-    query = get_query(metric, version)
-
-    # Execute the quert
-    if retry:
-        return run_query(pool, metric["type"], query, args)
-    else:
-        return run_query_no_retry(pool, metric["type"], query, args)
-
-
-def test_queries():
-    """
-    Run all of the metric queries against a database and check the results
-    """
-    # We just use the default db for tests
-    dbname = config["dbname"]
-    # Loop through all defined metrics.
-    for name, metric in config["metrics"].items():
-        # If the metric has arguments to use while testing, grab those
-        args = metric.get("test_args", {})
-        # Run the query without the ability to retry.
-        res = sample_metric(dbname, name, args, retry=False)
-        # Compare the result to the provided sample results
-        # TODO
-        print("{} -> {}".format(name, res))
-    # Return the number of errors
-    # TODO
-    return 0
-
-
 class SimpleHTTPRequestHandler(BaseHTTPRequestHandler):
     """
     This is our request handling server. It is responsible for listening for
@@ -557,10 +494,10 @@ class SimpleHTTPRequestHandler(BaseHTTPRequestHandler):
         """
         # Parse the URL
         parsed_path = urlparse(self.path)
-        metric_name = parsed_path.path.strip("/")
+        name = parsed_path.path.strip("/")
         parsed_query = parse_qs(parsed_path.query)
 
-        if metric_name == "agent_version":
+        if name == "agent_version":
             self._reply(200, VERSION)
             return
 
@@ -568,31 +505,60 @@ class SimpleHTTPRequestHandler(BaseHTTPRequestHandler):
         # single values, just grab the first from each.
         args = {key: values[0] for key, values in parsed_query.items()}
 
+        # Get the metric definition
+        try:
+            metric = config["metrics"][name]
+        except KeyError:
+            log.error("Unknown metric: {}".format(name))
+            self._reply(404, "Unknown metric")
+            return
+
         # Get the dbname. If none was provided, use the default from the
         # config.
         dbname = args.get("dbname", config["dbname"])
 
-        # Sample the metric
+        # Get the connection pool for the database, or create one if it doesn't
+        # already exist.
         try:
-            self._reply(200, sample_metric(dbname, metric_name, args))
-            return
-        except UnknownMetricError as e:
-            log.error("Unknown metric: {}".format(metric_name))
-            self._reply(404, "Unknown metric")
-            return
-        except MetricVersionError as e:
-            log.error(
-                "Failed to find a version of {} for {}".format(metric_name, version)
-            )
-            self._reply(404, "Unsupported version")
-            return
-        except UnhappyDBError as e:
+            pool = get_pool(dbname)
+        except UnhappyDBError:
             log.info("Database {} is unhappy, please be patient".format(dbname))
             self._reply(503, "Database unavailable")
             return
 
-        except Exception as e:
-            log.error("Error running query: {}".format(e))
-            self._reply(500, "Unexpected error: {}".format(e))
+        # Identify the PostgreSQL version
+        try:
+            version = get_cluster_version()
+        except UnhappyDBError:
+            return
+        except Exception as e:
+            if dbname in unhappy_cooldown:
+                log.info("Database {} is unhappy, please be patient".format(dbname))
+                self._reply(503, "Database unavailable")
+            else:
+                log.error("Failed to get PostgreSQL version: {}".format(e))
+                self._reply(500, "Error getting DB version")
+            return
+
+        # Get the query version
+        try:
+            query = get_query(metric, version)
+        except KeyError:
+            log.error("Failed to find a version of {} for {}".format(name, version))
+            self._reply(404, "Unsupported version")
+            return
+
+        # Execute the quert
+        try:
+            self._reply(200, run_query(pool, metric["type"], query, args))
+            return
+        except Exception as e:
+            if dbname in unhappy_cooldown:
+                log.info("Database {} is unhappy, please be patient".format(dbname))
+                self._reply(503, "Database unavailable")
+            else:
+                log.error("Error running query: {}".format(e))
+                self._reply(500, "Error running query")
             return
 
     def _reply(self, code, content):
@@ -613,17 +579,12 @@ if __name__ == "__main__":
     )
 
     parser.add_argument(
-        "-c",
-        "--config_file",
+        "config_file",
         default="pgmon.yml",
         nargs="?",
         help="The config file to read (default: %(default)s)",
     )
 
-    parser.add_argument(
-        "-t", "--test", action="store_true", help="Run query tests and exit"
-    )
-
     args = parser.parse_args()
 
     # Set the config file path
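Note: on the left the config file is a -c/--config_file option, on the right an optional positional argument, which is why the systemd ExecStart and OpenRC command_args elsewhere in this comparison lose the -c flag. A small standalone sketch of the two parser shapes (illustrative only):

import argparse

# Right-hand form: optional positional argument with a default.
positional = argparse.ArgumentParser(prog="pgmon")
positional.add_argument("config_file", default="pgmon.yml", nargs="?")
print(positional.parse_args([]).config_file)                         # pgmon.yml
print(positional.parse_args(["/etc/pgmon/pgmon.yml"]).config_file)   # /etc/pgmon/pgmon.yml

# Left-hand form: the same value is passed as a flag instead.
flagged = argparse.ArgumentParser(prog="pgmon")
flagged.add_argument("-c", "--config_file", default="pgmon.yml")
print(flagged.parse_args(["-c", "/etc/pgmon/pgmon.yml"]).config_file)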
@@ -632,16 +593,8 @@ if __name__ == "__main__":
     # Read the config file
     read_config(config_file)
 
-    # Run query tests and exit if test mode is enabled
-    if args.test:
-        errors = test_queries()
-        if errors > 0:
-            sys.exit(1)
-        else:
-            sys.exit(0)
-
     # Set up the http server to receive requests
-    server_address = (config["address"], config["port"])
+    server_address = ("127.0.0.1", config["port"])
     httpd = ThreadingHTTPServer(server_address, SimpleHTTPRequestHandler)
 
     # Set up the signal handler
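Note: either way, the agent runs as a small HTTP server (port 5400 by default) whose URL path names a metric and whose query string supplies arguments such as dbname, with the reserved path agent_version returning the agent's own version. A hedged usage sketch, assuming an agent is already running on localhost with the sample metrics loaded:

from urllib.request import urlopen

base = "http://127.0.0.1:5400"  # default port from the config above

# The agent's own version (handled before any metric lookup).
print(urlopen(base + "/agent_version").read().decode())

# A metric defined in pgmon-metrics.yml; query-string arguments become the
# query parameters, e.g. dbname for the db_stats query.
print(urlopen(base + "/db_stats?dbname=postgres").read().decode())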
@@ -7,7 +7,7 @@ After=network.target
 [Service]
 EnvironmentFile=/etc/pgmon/%i-service.conf
 User=${SERVICE_USER:-postgres}
-ExecStart=/usr/local/bin/pgmon -c /etc/pgmon/%i.yml
+ExecStart=/usr/local/bin/pgmon /etc/pgmon/%i.yml
 ExecReload=kill -HUP $MAINPID
 Restart=on-failure
 Type=exec
@@ -1,22 +0,0 @@
-FROM alpine:3.21
-
-RUN apk update && \
-    apk add py3-psycopg2 \
-        py3-yaml \
-        tini
-
-WORKDIR /app
-
-COPY src/pgmon.py /app/
-
-COPY sample-config/pgmon-metrics.yml /app/
-
-COPY tests/test-config.yml /app/
-
-COPY --chmod=0600 --chown=postgres:postgres tests/pgpass /root/.pgpass
-
-ENTRYPOINT ["tini", "--"]
-
-EXPOSE 5400
-
-CMD ["/app/pgmon.py", "-c", "/app/test-config.yml", "--test"]
@@ -1,32 +0,0 @@
----
-
-services:
-  agent:
-    image: pgmon
-    build:
-      context: ..
-      dockerfile: tests/Dockerfile
-    ports:
-      - :5400
-    depends_on:
-      db:
-        condition: service_healthy
-
-  db:
-    image: "postgres:${PGTAG:-17-bookworm}"
-    ports:
-      - :5432
-    environment:
-      POSTGRES_PASSWORD: secret
-    healthcheck:
-      #test: [ "CMD", "pg_isready", "-U", "postgres" ]
-      test: [ "CMD-SHELL", "pg_controldata /var/lib/postgresql/data/ | grep -q 'in production'" ]
-      interval: 5s
-      timeout: 2s
-      retries: 20
-    command: >
-      postgres -c ssl=on
-      -c ssl_cert_file='/etc/ssl/certs/ssl-cert-snakeoil.pem'
-      -c ssl_key_file='/etc/ssl/private/ssl-cert-snakeoil.key'
-      -c listen_addresses='*'
-
@@ -1 +0,0 @@
-db:5432:*:postgres:secret
@@ -1,62 +0,0 @@
-#!/bin/bash
-
-# Versions to test
-versions=( $@ )
-
-# If we weren't given any versions, test them all
-if [ ${#versions[@]} -eq 0 ]
-then
-    versions=( 9.2 9.6 10 11 12 13 14 15 16 17 )
-fi
-
-# Image tags to use
-declare -A images=()
-images["9.2"]='9.2'
-images["9.6"]='9.6-bullseye'
-images["10"]='10-bullseye'
-images["11"]='11-bookworm'
-images["12"]='12-bookworm'
-images["13"]='13-bookworm'
-images["14"]='14-bookworm'
-images["15"]='15-bookworm'
-images["16"]='16-bookworm'
-images["17"]='17-bookworm'
-
-declare -A results=()
-
-# Make sure everything's down to start with
-docker compose down
-
-# Make sure our agent container is up to date
-docker compose build agent
-
-for version in "${versions[@]}"
-do
-    echo
-    echo "Testing: PostgreSQL ${version}"
-
-    # Specify the version we're testing against
-    export PGTAG="${images["$version"]}"
-
-    # Start the containers
-    docker compose up --exit-code-from=agent agent
-    rc=$?
-
-    results["$version"]=$rc
-
-    # Destroy the containers
-    docker compose down
-done
-
-echo
-echo
-for v in "${versions[@]}"
-do
-    case "${results["$v"]}" in
-        0) msg="OK" ;;
-        1) msg="Query failure detected" ;;
-        18) msg="Docker image error: 18" ;;
-        *) msg="Unexpected error: ${results["$v"]}" ;;
-    esac
-    echo "$v -> $msg"
-done
@@ -1,17 +0,0 @@
----
-
-# Bind to all interfaces so we can submit requests from outside the test container
-address: 0.0.0.0
-
-# We always just connect to the db container
-dbhost: db
-dbport: 5432
-dbuser: postgres
-
-# The SSL cipher parameters are too old in the 9.2 container, so we allow the tests
-# to be run without encryption
-ssl_mode: prefer
-
-# Pull in the standard metrics
-include:
-  - pgmon-metrics.yml