diff --git a/GENTOO/pgmon-1.0.2.ebuild b/GENTOO/pgmon-1.0.2.ebuild
new file mode 100644
index 0000000..9c52b5a
--- /dev/null
+++ b/GENTOO/pgmon-1.0.2.ebuild
@@ -0,0 +1,73 @@
+# Copyright 2024 Gentoo Authors
+# Distributed under the terms of the GNU General Public License v2
+
+EAPI=8
+
+PYTHON_COMPAT=( python3_{6..13} )
+
+inherit python-r1 systemd
+
+DESCRIPTION="PostgreSQL monitoring bridge"
+HOMEPAGE="https://code2.shh-dot-com.org/james/pgmon"
+
+LICENSE="BSD"
+SLOT="0"
+KEYWORDS="amd64"
+
+SRC_URI="https://code2.shh-dot-com.org/james/${PN}/archive/v${PV}.tar.bz2 -> ${P}.tar.bz2"
+
+IUSE="-systemd"
+
+DEPEND="
+	${PYTHON_DEPS}
+	dev-python/psycopg:2
+	dev-python/pyyaml
+	app-admin/logrotate
+	"
+RDEPEND="${DEPEND}"
+BDEPEND=""
+
+RESTRICT="fetch"
+
+#S="${WORKDIR}/${PN}"
+
+pkg_nofetch() {
+	einfo "Please download"
+	einfo " - ${P}.tar.bz2"
+	einfo "from ${HOMEPAGE} and place it in your DISTDIR directory."
+	einfo "The file should be owned by portage:portage."
+}
+
+src_compile() {
+	true
+}
+
+src_install() {
+	# Install init script
+	if ! use systemd ; then
+		newinitd "openrc/pgmon.initd" pgmon
+		newconfd "openrc/pgmon.confd" pgmon
+	fi
+
+	# Install systemd unit
+	if use systemd ; then
+		systemd_dounit "systemd/pgmon.service"
+	fi
+
+	# Install script
+	exeinto /usr/bin
+	newexe "src/pgmon.py" pgmon
+
+	# Install default config
+	diropts -o root -g root -m 0755
+	insinto /etc/pgmon
+	doins "sample-config/pgmon.yml"
+	doins "sample-config/pgmon-metrics.yml"
+
+	# Install logrotate config
+	insinto /etc/logrotate.d
+	newins "logrotate/pgmon.logrotate" pgmon
+
+	# Install man page
+	doman manpages/pgmon.1
+}
diff --git a/Makefile b/Makefile
index 8f98594..1c3de00 100644
--- a/Makefile
+++ b/Makefile
@@ -1,9 +1,10 @@
 # Package details
 PACKAGE_NAME := pgmon
-VERSION := 1.0.1
 
 SCRIPT := src/$(PACKAGE_NAME).py
 
+VERSION := $(shell grep -m 1 '^VERSION = ' "$(SCRIPT)" | sed -ne 's/.*"\(.*\)".*/\1/p')
+
 # Where packages are built
 BUILD_DIR := build
 
@@ -22,7 +23,7 @@ SUPPORTED := ubuntu-20.04 \
 # These targets are the main ones to use for most things.
 ##
 
-.PHONY: all clean tgz test install
+.PHONY: all clean tgz test query-tests install
 
 
 # Build all packages
@@ -64,6 +65,10 @@ clean:
 test:
 	cd src ; python3 -m unittest
 
+# Run query tests
+query-tests:
+	cd tests && ./run-tests.sh
+
 # Install the script at the specified base directory
 install:
 	# Set up directories
@@ -92,7 +97,7 @@ install:
 
 
 # Run all of the install tests
-.PHONY: install-tests debian-%-install-test rockylinux-%-install-test ubuntu-%-install-test
+.PHONY: install-tests debian-%-install-test rockylinux-%-install-test ubuntu-%-install-test gentoo-install-test
 install-tests: $(foreach distro_release, $(SUPPORTED), $(distro_release)-install-test)
 
 
@@ -124,6 +129,11 @@ oraclelinux-%-install-test:
 		oraclelinux:7 \
 		bash -c 'yum makecache && yum install -y /output/$(PACKAGE_NAME)-$(VERSION)-1.el7.noarch.rpm'
 
+# Run a Gentoo install test
+gentoo-install-test:
+	# May implement this in the future, but would require additional headaches to set up a repo
+	true
+
 ##
 # Container targets
 #
diff --git a/openrc/pgmon.initd b/openrc/pgmon.initd
index 8d8c96d..e6e3c81 100644
--- a/openrc/pgmon.initd
+++ b/openrc/pgmon.initd
@@ -19,6 +19,6 @@ start_pre() {
 }
 
 command="/usr/bin/pgmon"
-command_args="'$CONFIG_FILE'"
+command_args="-c '$CONFIG_FILE'"
 command_background="true"
 command_user="${PGMON_USER}:${PGMON_GROUP}"
diff --git a/requirements-dev.yml b/requirements-dev.yml
new file mode 100644
index 0000000..9680b1e
--- /dev/null
+++ b/requirements-dev.yml
@@ -0,0 +1,4 @@
+-r requirements.txt
+testcontainers[postgresql]
+pytest
+black
diff --git a/sample-config/pgmon-metrics.yml b/sample-config/pgmon-metrics.yml
index c72c786..60e428e 100644
--- a/sample-config/pgmon-metrics.yml
+++ b/sample-config/pgmon-metrics.yml
@@ -11,7 +11,8 @@ metrics:
   discover_slots:
     type: set
     query:
-      0: SELECT slot_name, plugin, slot_type, database, temporary, active FROM pg_replication_slots
+      90400: SELECT slot_name, plugin, slot_type, database, false as temporary, active FROM pg_replication_slots
+      100000: SELECT slot_name, plugin, slot_type, database, temporary, active FROM pg_replication_slots
 
   # cluster-wide metrics
   version:
@@ -19,9 +20,9 @@ metrics:
     query:
       0: SHOW server_version_num
   max_frozen_age:
-    type: value
+    type: row
     query:
-      0: SELECT max(age(datfrozenxid)) FROM pg_database
+      0: SELECT max(age(datfrozenxid)), max(mxid_age(datminmxid)) FROM pg_database
 
   # Per-database metrics
   db_stats:
@@ -29,6 +30,8 @@ metrics:
     query:
       0: SELECT numbackends, xact_commit, xact_rollback, blks_read, blks_hit, tup_returned, tup_fetched, tup_inserted, tup_updated, tup_deleted, conflicts, temp_files, temp_bytes, deadlocks, blk_read_time, blk_write_time, extract('epoch' from stats_reset)::float FROM pg_stat_database WHERE datname = %(dbname)s
       140000: SELECT numbackends, xact_commit, xact_rollback, blks_read, blks_hit, tup_returned, tup_fetched, tup_inserted, tup_updated, tup_deleted, conflicts, temp_files, temp_bytes, deadlocks, COALESCE(checksum_failures, 0) AS checksum_failures, blk_read_time, blk_write_time, session_time, active_time, idle_in_transaction_time, sessions, sessions_abandoned, sessions_fatal, sessions_killed, extract('epoch' from stats_reset)::float FROM pg_stat_database WHERE datname = %(dbname)s
+    test_args:
+      dbname: postgres
 
   # Debugging
   ntables:
@@ -40,7 +43,9 @@ metrics:
   rep_stats:
     type: row
     query:
-      0: SELECT * FROM pg_stat_database WHERE client_addr || '_' || regexp_replace(application_name, '[ ,]', '_', 'g') = '{repid}'
+      90400: SELECT * FROM pg_stat_replication WHERE client_addr || '_' || regexp_replace(application_name, '[ ,]', '_', 'g') = '{repid}'
+    test_args:
+      repid: 127.0.0.1_test_rep
 
   # Debugging
   sleep:
@@ -52,4 +57,7 @@ metrics:
   slot_stats:
     type: row
     query:
-      0: SELECT active_pid, xmin, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn) AS restart_bytes, pg_wal_lsn_diff(pg_current_wal_lsn(), confirmed_flush_lsn) AS confirmed_flush_bytes FROM pg_replication_slots WHERE slot_name = '{slot}'
+      90600: SELECT active_pid, xmin, pg_xlog_location_diff(pg_current_xlog_location(), restart_lsn) AS restart_bytes, pg_xlog_location_diff(pg_current_xlog_location(), confirmed_flush_lsn) AS confirmed_flush_bytes FROM pg_replication_slots WHERE slot_name = '{slot}'
+      100000: SELECT active_pid, xmin, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn) AS restart_bytes, pg_wal_lsn_diff(pg_current_wal_lsn(), confirmed_flush_lsn) AS confirmed_flush_bytes FROM pg_replication_slots WHERE slot_name = '{slot}'
+    test_args:
+      slot: test_slot
diff --git a/sample-config/pgmon.yml b/sample-config/pgmon.yml
index 879005d..bbe49dc 100644
--- a/sample-config/pgmon.yml
+++ b/sample-config/pgmon.yml
@@ -1,3 +1,6 @@
+# The address the agent binds to
+#address: 127.0.0.1
+
 # The port the agent listens on for requests
 #port: 5400
 
@@ -26,6 +29,9 @@
 # Default database to connect to when none is specified for a metric
 #dbname: 'postgres'
 
+# SSL connection mode
+#ssl_mode: require
+
 # Timeout for getting a connection slot from a pool
 #pool_slot_timeout: 5
 
diff --git a/src/pgmon.py b/src/pgmon.py
index 38c1f8b..6178827 100755
--- a/src/pgmon.py
+++ b/src/pgmon.py
@@ -4,6 +4,7 @@ import yaml
 import json
 import time
 import os
+import sys
 
 import argparse
 import logging
@@ -23,7 +24,7 @@ from http.server import BaseHTTPRequestHandler, HTTPServer
 from http.server import ThreadingHTTPServer
 from urllib.parse import urlparse, parse_qs
 
-VERSION = "1.0.1"
+VERSION = "1.0.2"
 
 # Configuration
 config = {}
@@ -74,12 +75,18 @@ class UnhappyDBError(Exception):
     pass
 
 
+class UnknownMetricError(Exception):
+    pass
+
+
 class MetricVersionError(Exception):
     pass
 
 
 # Default config settings
 default_config = {
+    # The address the agent binds to
+    "address": "127.0.0.1",
     # The port the agent listens on for requests
     "port": 5400,
     # Min PostgreSQL connection pool size (per database)
@@ -98,6 +105,8 @@ default_config = {
     "dbport": 5432,
     # Default database to connect to when none is specified for a metric
     "dbname": "postgres",
+    # SSL connection mode
+    "ssl_mode": "require",
     # Timeout for getting a connection slot from a pool
     "pool_slot_timeout": 5,
     # PostgreSQL connection timeout (seconds)
@@ -320,6 +329,7 @@ def get_pool(dbname):
             # lock
             if dbname not in connections:
                 log.info("Creating connection pool for: {}".format(dbname))
+                # Actually create the connection pool
                 connections[dbname] = ConnectionPool(
                     dbname,
                     int(config["min_pool_size"]),
@@ -329,7 +339,7 @@ def get_pool(dbname):
                     port=config["dbport"],
                     user=config["dbuser"],
                     connect_timeout=int(config["connect_timeout"]),
-                    sslmode="require",
+                    sslmode=config["ssl_mode"],
                 )
                 # Clear the unhappy indicator if present
                 unhappy_cooldown.pop(dbname, None)
@@ -377,10 +387,16 @@ def run_query_no_retry(pool, return_type, query, args):
                 res = curs.fetchall()
 
                 if return_type == "value":
+                    if len(res) == 0:
+                        return ""
                     return str(list(res[0].values())[0])
                 elif return_type == "row":
+                    if len(res) == 0:
+                        return "[]"
                     return json.dumps(res[0])
                 elif return_type == "column":
+                    if len(res) == 0:
+                        return "[]"
                     return json.dumps([list(r.values())[0] for r in res])
                 elif return_type == "set":
                     return json.dumps(res)
@@ -388,7 +404,7 @@ def run_query_no_retry(pool, return_type, query, args):
             dbname = pool.name
             if dbname in unhappy_cooldown:
                 raise UnhappyDBError()
-            elif conn.broken:
+            elif conn.closed != 0:
                 raise DisconnectedError()
             else:
                 raise
@@ -466,6 +482,72 @@ def get_cluster_version():
     return cluster_version
 
 
+def sample_metric(dbname, metric_name, args, retry=True):
+    """
+    Run the appropriate query for the named metric against the specified database
+
+    Params:
+        dbname:      Name of the database to sample
+        metric_name: Key of the metric in config["metrics"]
+        args:        Dictionary of arguments passed through to the query
+        retry:       Use run_query if True, run_query_no_retry otherwise
+
+    Returns whatever run_query/run_query_no_retry returns for the metric type.
+
+    Raises UnknownMetricError if the metric is not defined. Errors from
+    get_pool, get_cluster_version, get_query and the query runners propagate.
+    """
+    # Get the metric definition
+    try:
+        metric = config["metrics"][metric_name]
+    except KeyError:
+        raise UnknownMetricError("Unknown metric: {}".format(metric_name))
+
+    # Get the connection pool for the database, or create one if it doesn't
+    # already exist.
+    pool = get_pool(dbname)
+
+    # Identify the PostgreSQL version
+    version = get_cluster_version()
+
+    # Get the query version
+    query = get_query(metric, version)
+
+    # Execute the query
+    if retry:
+        return run_query(pool, metric["type"], query, args)
+    else:
+        return run_query_no_retry(pool, metric["type"], query, args)
+
+
+def test_queries():
+    """
+    Run all of the metric queries against a database and check the results
+
+    Returns the number of metrics whose query raised an error.
+    """
+    # We just use the default db for tests
+    dbname = config["dbname"]
+    # Keep going after a failure so one run reports every broken query
+    errors = 0
+    # Loop through all defined metrics.
+    for name, metric in config["metrics"].items():
+        # If the metric has arguments to use while testing, grab those
+        args = metric.get("test_args", {})
+        # Run the query without the ability to retry.
+        try:
+            res = sample_metric(dbname, name, args, retry=False)
+        except Exception as e:
+            print("{} -> FAILED: {}".format(name, e))
+            errors += 1
+            continue
+        # Compare the result to the provided sample results
+        # TODO
+        print("{} -> {}".format(name, res))
+    # Return the number of errors
+    return errors
+
+
 class SimpleHTTPRequestHandler(BaseHTTPRequestHandler):
     """
     This is our request handling server. It is responsible for listening for
@@ -494,10 +576,10 @@ class SimpleHTTPRequestHandler(BaseHTTPRequestHandler):
         """
         # Parse the URL
         parsed_path = urlparse(self.path)
-        name = parsed_path.path.strip("/")
+        metric_name = parsed_path.path.strip("/")
         parsed_query = parse_qs(parsed_path.query)
 
-        if name == "agent_version":
+        if metric_name == "agent_version":
             self._reply(200, VERSION)
             return
 
@@ -505,60 +587,31 @@ class SimpleHTTPRequestHandler(BaseHTTPRequestHandler):
         # single values, just grab the first from each.
         args = {key: values[0] for key, values in parsed_query.items()}
 
-        # Get the metric definition
-        try:
-            metric = config["metrics"][name]
-        except KeyError:
-            log.error("Unknown metric: {}".format(name))
-            self._reply(404, "Unknown metric")
-            return
-
         # Get the dbname. If none was provided, use the default from the
         # config.
         dbname = args.get("dbname", config["dbname"])
 
-        # Get the connection pool for the database, or create one if it doesn't
-        # already exist.
+        # Sample the metric
         try:
-            pool = get_pool(dbname)
-        except UnhappyDBError:
+            self._reply(200, sample_metric(dbname, metric_name, args))
+            return
+        except UnknownMetricError as e:
+            log.error("Unknown metric: {}".format(metric_name))
+            self._reply(404, "Unknown metric")
+            return
+        except MetricVersionError as e:
+            log.error(
+                "Failed to find a supported version of {}: {}".format(metric_name, e)
+            )
+            self._reply(404, "Unsupported version")
+            return
+        except UnhappyDBError as e:
             log.info("Database {} is unhappy, please be patient".format(dbname))
             self._reply(503, "Database unavailable")
             return
-
-        # Identify the PostgreSQL version
-        try:
-            version = get_cluster_version()
-        except UnhappyDBError:
-            return
         except Exception as e:
-            if dbname in unhappy_cooldown:
-                log.info("Database {} is unhappy, please be patient".format(dbname))
-                self._reply(503, "Database unavailable")
-            else:
-                log.error("Failed to get PostgreSQL version: {}".format(e))
-                self._reply(500, "Error getting DB version")
-            return
-
-        # Get the query version
-        try:
-            query = get_query(metric, version)
-        except KeyError:
-            log.error("Failed to find a version of {} for {}".format(name, version))
-            self._reply(404, "Unsupported version")
-            return
-
-        # Execute the quert
-        try:
-            self._reply(200, run_query(pool, metric["type"], query, args))
-            return
-        except Exception as e:
-            if dbname in unhappy_cooldown:
-                log.info("Database {} is unhappy, please be patient".format(dbname))
-                self._reply(503, "Database unavailable")
-            else:
-                log.error("Error running query: {}".format(e))
-                self._reply(500, "Error running query")
+            log.error("Error running query: {}".format(e))
+            self._reply(500, "Unexpected error: {}".format(e))
             return
 
     def _reply(self, code, content):
@@ -579,12 +632,17 @@ if __name__ == "__main__":
     )
 
     parser.add_argument(
-        "config_file",
+        "-c",
+        "--config_file",
         default="pgmon.yml",
         nargs="?",
         help="The config file to read (default: %(default)s)",
     )
 
+    parser.add_argument(
+        "-t", "--test", action="store_true", help="Run query tests and exit"
+    )
+
     args = parser.parse_args()
 
     # Set the config file path
@@ -593,8 +651,16 @@ if __name__ == "__main__":
     # Read the config file
     read_config(config_file)
 
+    # Run query tests and exit if test mode is enabled
+    if args.test:
+        errors = test_queries()
+        if errors > 0:
+            sys.exit(1)
+        else:
+            sys.exit(0)
+
     # Set up the http server to receive requests
-    server_address = ("127.0.0.1", config["port"])
+    server_address = (config["address"], config["port"])
     httpd = ThreadingHTTPServer(server_address, SimpleHTTPRequestHandler)
 
     # Set up the signal handler
diff --git a/systemd/pgmon@.service b/systemd/pgmon@.service
index 0dec16b..a17860a 100644
--- a/systemd/pgmon@.service
+++ b/systemd/pgmon@.service
@@ -7,7 +7,7 @@ After=network.target
 [Service]
 EnvironmentFile=/etc/pgmon/%i-service.conf
 User=${SERVICE_USER:-postgres}
-ExecStart=/usr/local/bin/pgmon /etc/pgmon/%i.yml
+ExecStart=/usr/local/bin/pgmon -c /etc/pgmon/%i.yml
 ExecReload=kill -HUP $MAINPID
 Restart=on-failure
 Type=exec
diff --git a/tests/Dockerfile b/tests/Dockerfile
new file mode 100644
index 0000000..1913313
--- /dev/null
+++ b/tests/Dockerfile
@@ -0,0 +1,22 @@
+FROM alpine:3.21
+
+RUN apk update && \
+    apk add py3-psycopg2 \
+            py3-yaml \
+            tini
+
+WORKDIR /app
+
+COPY src/pgmon.py /app/
+
+COPY sample-config/pgmon-metrics.yml /app/
+
+COPY tests/test-config.yml /app/
+
+COPY --chmod=0600 --chown=root:root tests/pgpass /root/.pgpass
+
+ENTRYPOINT ["tini", "--"]
+
+EXPOSE 5400
+
+CMD ["/app/pgmon.py", "-c", "/app/test-config.yml", "--test"]
diff --git a/tests/docker-compose.yml b/tests/docker-compose.yml
new file mode 100644
index 0000000..ae9b5f6
--- /dev/null
+++ b/tests/docker-compose.yml
@@ -0,0 +1,32 @@
+---
+
+services:
+  agent:
+    image: pgmon
+    build:
+      context: ..
+      dockerfile: tests/Dockerfile
+    ports:
+      - :5400
+    depends_on:
+      db:
+        condition: service_healthy
+
+  db:
+    image: "postgres:${PGTAG:-17-bookworm}"
+    ports:
+      - :5432
+    environment:
+      POSTGRES_PASSWORD: secret
+    healthcheck:
+      #test: [ "CMD", "pg_isready", "-U", "postgres" ]
+      test: [ "CMD-SHELL", "pg_controldata /var/lib/postgresql/data/ | grep -q 'in production'" ]
+      interval: 5s
+      timeout: 2s
+      retries: 20
+    command: >
+      postgres -c ssl=on
+               -c ssl_cert_file='/etc/ssl/certs/ssl-cert-snakeoil.pem'
+               -c ssl_key_file='/etc/ssl/private/ssl-cert-snakeoil.key'
+               -c listen_addresses='*'
+
diff --git a/tests/pgpass b/tests/pgpass
new file mode 100644
index 0000000..1066019
--- /dev/null
+++ b/tests/pgpass
@@ -0,0 +1 @@
+db:5432:*:postgres:secret
diff --git a/tests/run-tests.sh b/tests/run-tests.sh
new file mode 100755
index 0000000..9bf472a
--- /dev/null
+++ b/tests/run-tests.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+
+# Versions to test
+versions=( "$@" )
+
+# If we weren't given any versions, test them all
+if [ ${#versions[@]} -eq 0 ]
+then
+    versions=( 9.2 9.6 10 11 12 13 14 15 16 17 )
+fi
+
+# Image tags to use
+declare -A images=()
+images["9.2"]='9.2'
+images["9.6"]='9.6-bullseye'
+images["10"]='10-bullseye'
+images["11"]='11-bookworm'
+images["12"]='12-bookworm'
+images["13"]='13-bookworm'
+images["14"]='14-bookworm'
+images["15"]='15-bookworm'
+images["16"]='16-bookworm'
+images["17"]='17-bookworm'
+
+declare -A results=()
+
+# Overall exit status; set to 1 if any version fails
+status=0
+
+# Make sure everything's down to start with
+docker compose down
+
+# Make sure our agent container is up to date
+docker compose build agent
+
+for version in "${versions[@]}"
+do
+    echo
+    echo "Testing: PostgreSQL ${version}"
+
+    # Specify the version we're testing against
+    export PGTAG="${images["$version"]}"
+
+    # Start the containers
+    docker compose up --exit-code-from=agent agent
+    rc=$?
+
+    results["$version"]=$rc
+
+    # Destroy the containers
+    docker compose down
+done
+
+echo
+echo
+for v in "${versions[@]}"
+do
+    case "${results["$v"]}" in
+        0) msg="OK" ;;
+        1) msg="Query failure detected" ;;
+        18) msg="Docker image error: 18" ;;
+        *) msg="Unexpected error: ${results["$v"]}" ;;
+    esac
+    [ "${results["$v"]}" -eq 0 ] || status=1
+    echo "$v -> $msg"
+done
+
+# Let callers (e.g. make query-tests) see whether anything failed
+exit $status
diff --git a/tests/test-config.yml b/tests/test-config.yml
new file mode 100644
index 0000000..7a17cdb
--- /dev/null
+++ b/tests/test-config.yml
@@ -0,0 +1,17 @@
+---
+
+# Bind to all interfaces so we can submit requests from outside the test container
+address: 0.0.0.0
+
+# We always just connect to the db container
+dbhost: db
+dbport: 5432
+dbuser: postgres
+
+# The SSL cipher parameters are too old in the 9.2 container, so we allow the tests
+# to be run without encryption
+ssl_mode: prefer
+
+# Pull in the standard metrics
+include:
+  - pgmon-metrics.yml