#!/usr/bin/env python3
import yaml
import json
import time
import os
import sys
import argparse
import logging
from datetime import datetime, timedelta
import psycopg2
from psycopg2.extras import RealDictCursor
from psycopg2.pool import ThreadedConnectionPool
from contextlib import contextmanager
import signal
from threading import Lock
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from urllib.parse import urlparse, parse_qs
VERSION = "1.0.1"
# Configuration
config = {}
# Dictionary of current PostgreSQL connection pools
connections_lock = Lock()
connections = {}
# Dictionary of unhappy databases. Keys are database names; each value is
# the time the database was marked unhappy plus the cooldown setting, i.e.
# the earliest time we should try connecting to the database again.
unhappy_cooldown = {}
# Version information
cluster_version = None
cluster_version_next_check = None
cluster_version_lock = Lock()
# Running state (used to gracefully shut down)
running = True
# The http server object
httpd = None
# Where the config file lives
config_file = None
# Configure logging
log = logging.getLogger(__name__)
formatter = logging.Formatter(
"%(asctime)s - %(levelname)s - %(filename)s: %(funcName)s() line %(lineno)d: %(message)s"
)
console_log_handler = logging.StreamHandler()
console_log_handler.setFormatter(formatter)
log.addHandler(console_log_handler)
# Error types
class ConfigError(Exception):
pass
class DisconnectedError(Exception):
pass
class UnhappyDBError(Exception):
pass
class UnknownMetricError(Exception):
pass
class MetricVersionError(Exception):
pass
# Default config settings
default_config = {
    # The port the agent listens on for requests
    "port": 5400,
    # Min PostgreSQL connection pool size (per database)
    "min_pool_size": 0,
    # Max PostgreSQL connection pool size (per database)
    "max_pool_size": 4,
    # How long a connection can sit idle in the pool before it's removed (seconds)
    "max_idle_time": 30,
    # Log level for stderr logging
    "log_level": "error",
    # Database user to connect as
    "dbuser": "postgres",
    # Database host
    "dbhost": "/var/run/postgresql",
    # Database port
    "dbport": 5432,
    # Default database to connect to when none is specified for a metric
    "dbname": "postgres",
    # Timeout for getting a connection slot from a pool (seconds)
    "pool_slot_timeout": 5,
    # PostgreSQL connection timeout (seconds)
    # Note: it can actually be double this because of retries
    "connect_timeout": 5,
    # Time to wait before trying to reconnect again after a reconnect failure (seconds)
    "reconnect_cooldown": 30,
    # How often to check the version of PostgreSQL (seconds)
    "version_check_period": 300,
    # Metrics
    "metrics": {},
}
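
# A minimal example config file (hypothetical values; the keys mirror
# default_config above, and an "include" list can pull in more files):
#
#   port: 5400
#   dbname: postgres
#   metrics:
#     bgwriter:
#       type: row
#       query:
#         0: SELECT * FROM pg_stat_bgwriter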
def update_deep(d1, d2):
    """
    Recursively update a dict, adding keys to dictionaries and appending to
    lists. Note that this both modifies and returns the first dict.

    Params:
        d1: the dictionary to update
        d2: the dictionary to get new values from

    Returns:
        The new d1
    """
if not isinstance(d1, dict) or not isinstance(d2, dict):
raise TypeError("Both arguments to update_deep need to be dictionaries")
for k, v2 in d2.items():
if isinstance(v2, dict):
v1 = d1.get(k, {})
if not isinstance(v1, dict):
raise TypeError(
"Type mismatch between dictionaries: {} is not a dict".format(
type(v1).__name__
)
)
d1[k] = update_deep(v1, v2)
elif isinstance(v2, list):
v1 = d1.get(k, [])
if not isinstance(v1, list):
raise TypeError(
"Type mismatch between dictionaries: {} is not a list".format(
type(v1).__name__
)
)
d1[k] = v1 + v2
else:
d1[k] = v2
return d1
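
# For example, update_deep merges nested dicts and concatenates lists:
#   update_deep({"a": {"x": 1}, "l": [1]}, {"a": {"y": 2}, "l": [2]})
#   => {"a": {"x": 1, "y": 2}, "l": [1, 2]}
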
def read_config(path, included=False):
    """
    Read a config file.

    Params:
        path: path to the file to read
        included: is this file included by another file?
    """
    # Read config file
    log.info("Reading config file: {}".format(path))
    with open(path, "r") as f:
        try:
            cfg = yaml.safe_load(f)
        except yaml.YAMLError as e:
            raise ConfigError("Invalid config file: {}: {}".format(path, e))
    # Since we use it in a few places, get the base directory of the config file
config_base = os.path.dirname(path)
# Read any external queries and validate metric definitions
for name, metric in cfg.get("metrics", {}).items():
# Validate return types
try:
if metric["type"] not in ["value", "row", "column", "set"]:
raise ConfigError(
"Invalid return type: {} for metric {} in {}".format(
metric["type"], name, path
)
)
except KeyError:
raise ConfigError(
"No type specified for metric {} in {}".format(name, path)
)
# Ensure queries exist
query_dict = metric.get("query", {})
        if not isinstance(query_dict, dict):
raise ConfigError(
"Query definition should be a dictionary, got: {} for metric {} in {}".format(
query_dict, name, path
)
)
if len(query_dict) == 0:
raise ConfigError("Missing queries for metric {} in {}".format(name, path))
        # Read external SQL files and validate version keys
        for vers, query in metric["query"].items():
            try:
                int(vers)
            except (TypeError, ValueError):
                raise ConfigError(
                    "Invalid version: {} for metric {} in {}".format(vers, name, path)
                )

            if query.startswith("file:"):
                query_path = query[5:]
                if not query_path.startswith("/"):
                    query_path = os.path.join(config_base, query_path)
                with open(query_path, "r") as f:
                    metric["query"][vers] = f.read()

        # Normalize version keys to integers (YAML mappings can yield either
        # strings or ints) so they compare cleanly in get_query()
        metric["query"] = {int(k): q for k, q in metric["query"].items()}
    # Read any included config files
    for inc in cfg.get("include", []):
        # Prefix relative paths with the directory from the current config
        if not inc.startswith("/"):
            inc = os.path.join(config_base, inc)
        update_deep(cfg, read_config(inc, included=True))
    # Return the config we read if this is an include, otherwise merge it over
    # the defaults and set the final config
    if included:
        return cfg
    else:
        new_config = {}
        update_deep(new_config, default_config)
        update_deep(new_config, cfg)

        # Minor sanity checks
        if len(new_config["metrics"]) == 0:
            log.error("No metrics are defined")
            raise ConfigError("No metrics defined")
        # Validate the new log level before changing the config
        if new_config["log_level"].upper() not in [
            "DEBUG",
            "INFO",
            "WARNING",
            "ERROR",
            "CRITICAL",
        ]:
            raise ConfigError("Invalid log level: {}".format(new_config["log_level"]))

        global config
        config = new_config

        # Apply changes to log level
        log.setLevel(logging.getLevelName(config["log_level"].upper()))

def signal_handler(sig, frame):
    """
    Function for handling signals

    INT, TERM, QUIT => shut down
    HUP => reload the config
    """
    # Restore the original handler
    signal.signal(signal.SIGINT, signal.default_int_handler)

    # Signal everything to shut down
    if sig in [signal.SIGINT, signal.SIGTERM, signal.SIGQUIT]:
        log.info("Shutting down ...")
        global running
        running = False
        if httpd is not None:
            httpd.socket.close()

    # Signal a reload
    if sig == signal.SIGHUP:
        log.warning("Received config reload signal")
        read_config(config_file)
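
# For example, a running agent can be told to reload its config from a shell
# (the pgrep pattern is just an illustration):
#   kill -HUP "$(pgrep -f pgmon.py)"
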
class ConnectionPool(ThreadedConnectionPool):
    def __init__(self, dbname, minconn, maxconn, *args, **kwargs):
        # Make sure dbname isn't different in the kwargs
        kwargs["dbname"] = dbname
        super().__init__(minconn, maxconn, *args, **kwargs)
        self.name = dbname

    @contextmanager
    def connection(self, timeout=None):
        conn = None
        if timeout is None:
            timeout = float(config["pool_slot_timeout"])
        # Note: timedelta's first positional argument is days, so the timeout
        # has to be given explicitly in seconds
        timeout_time = datetime.now() + timedelta(seconds=timeout)
# We will continue to try to get a connection slot until we time out
while datetime.now() < timeout_time:
# See if we can get a connection slot
try:
conn = self.getconn()
try:
yield conn
finally:
self.putconn(conn)
return
except psycopg2.pool.PoolError:
# If we failed to get the connection slot, wait a bit and try again
time.sleep(0.1)
raise TimeoutError(
"Timed out waiting for an available connection to {}".format(self.name)
)
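
# Example use of the pool wrapper (a sketch; "postgres" is a sample dbname):
#   pool = get_pool("postgres")
#   with pool.connection(timeout=5.0) as conn:
#       with conn.cursor() as curs:
#           curs.execute("SELECT 1")
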
def get_pool(dbname):
    """
    Get a database connection pool, creating it if it doesn't already exist.
    """
# Check if the db is unhappy and wants to be left alone
if dbname in unhappy_cooldown:
if unhappy_cooldown[dbname] > datetime.now():
raise UnhappyDBError()
# Create a connection pool if it doesn't already exist
if dbname not in connections:
with connections_lock:
# Make sure nobody created the pool while we were waiting on the
# lock
if dbname not in connections:
log.info("Creating connection pool for: {}".format(dbname))
                connections[dbname] = ConnectionPool(
                    dbname,
                    int(config["min_pool_size"]),
                    int(config["max_pool_size"]),
                    application_name="pgmon",
                    host=config["dbhost"],
                    port=config["dbport"],
                    user=config["dbuser"],
                    connect_timeout=int(config["connect_timeout"]),
                    sslmode="require",
                )
# Clear the unhappy indicator if present
unhappy_cooldown.pop(dbname, None)
return connections[dbname]
def handle_connect_failure(pool):
    """
    Mark the database as being unhappy so we can leave it alone for a while
    """
    dbname = pool.name
    unhappy_cooldown[dbname] = datetime.now() + timedelta(
        seconds=int(config["reconnect_cooldown"])
    )

def get_query(metric, version):
    """
    Get the correct metric query for a given version of PostgreSQL.

    Params:
        metric: the metric definition
        version: the PostgreSQL version number, as given by server_version_num
    """
    # Select the correct query
for v in reversed(sorted(metric["query"].keys())):
if version >= v:
if len(metric["query"][v].strip()) == 0:
raise MetricVersionError(
"Metric no longer applies to PostgreSQL {}".format(version)
)
return metric["query"][v]
raise MetricVersionError("Missing metric query for PostgreSQL {}".format(version))
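
# For example, given query versions {0: "SELECT ...", 140000: "SELECT ..."},
# a PostgreSQL 15 server (150000) gets the 140000 query and a PostgreSQL 13
# server (130000) gets the 0 query; an empty query string for a version marks
# the metric as discontinued from that version onward.
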
def run_query_no_retry(pool, return_type, query, args):
    """
    Run the query with no explicit retry code
    """
    with pool.connection(float(config["pool_slot_timeout"])) as conn:
try:
with conn.cursor(cursor_factory=RealDictCursor) as curs:
curs.execute(query, args)
res = curs.fetchall()
if return_type == "value":
return str(list(res[0].values())[0])
elif return_type == "row":
return json.dumps(res[0])
elif return_type == "column":
return json.dumps([list(r.values())[0] for r in res])
elif return_type == "set":
return json.dumps(res)
        except Exception:
dbname = pool.name
if dbname in unhappy_cooldown:
raise UnhappyDBError()
elif conn.broken:
raise DisconnectedError()
else:
raise
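
# Return-type examples, assuming a single result row {"a": 1, "b": 2}:
#   value  -> "1"                  (first column of the first row)
#   row    -> '{"a": 1, "b": 2}'   (first row as a JSON object)
#   column -> '[1]'                (first column of every row)
#   set    -> '[{"a": 1, "b": 2}]' (all rows as JSON objects)
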
def run_query(pool, return_type, query, args):
"""
Run the query, and if we find upon the first attempt that the connection
had been closed, wait a second and try again. This is because psycopg
doesn't know if a connection closed (ie: PostgreSQL was restarted or the
backend was terminated) until you try to execute a query.
Note that the pool has its own retry mechanism as well, but it only applies
to new connections being made.
Also, this will not retry a query if the query itself failed, or if the
database connection could not be established.
"""
# If we get disconnected, I think the putconn command will close the dead
# connection. So we can just give it another shot.
try:
return run_query_no_retry(pool, return_type, query, args)
except DisconnectedError:
log.warning("Stale PostgreSQL connection found ... trying again")
# This sleep is an annoying hack to give the pool workers time to
# actually mark the connection, otherwise it can be given back in the
# next connection() call
# TODO: verify this is the case with psycopg2
time.sleep(1)
try:
return run_query_no_retry(pool, return_type, query, args)
        except Exception:
handle_connect_failure(pool)
raise UnhappyDBError()
def get_cluster_version():
"""
Get the PostgreSQL version if we don't already know it, or if it's been
too long sice the last time it was checked.
"""
global cluster_version
global cluster_version_next_check
# If we don't know the version or it's past the recheck time, get the
# version from the database. Only one thread needs to do this, so they all
# try to grab the lock, and then make sure nobody else beat them to it.
if (
cluster_version is None
or cluster_version_next_check is None
or cluster_version_next_check < datetime.now()
):
with cluster_version_lock:
# Only check if nobody already got the version before us
if (
cluster_version is None
or cluster_version_next_check is None
or cluster_version_next_check < datetime.now()
):
log.info("Checking PostgreSQL cluster version")
pool = get_pool(config["dbname"])
cluster_version = int(
run_query(pool, "value", "SHOW server_version_num", None)
)
cluster_version_next_check = datetime.now() + timedelta(
seconds=int(config["version_check_period"])
)
log.info("Got PostgreSQL cluster version: {}".format(cluster_version))
log.debug(
"Next PostgreSQL cluster version check will be after: {}".format(
cluster_version_next_check
)
)
2024-10-31 06:18:45 +00:00
return cluster_version
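
# Note: server_version_num is an integer such as 150004 for PostgreSQL 15.4,
# the same scale used for the version keys in metric query definitions.
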
def sample_metric(dbname, metric_name, args, retry=True):
"""
Run the appropriate query for the named metric against the specified database
"""
# Get the metric definition
try:
metric = config["metrics"][metric_name]
except KeyError:
raise UnknownMetricError("Unknown metric: {}".format(metric_name))
# Get the connection pool for the database, or create one if it doesn't
# already exist.
pool = get_pool(dbname)
# Identify the PostgreSQL version
version = get_cluster_version()
# Get the query version
query = get_query(metric, version)
    # Execute the query
if retry:
return run_query(pool, metric["type"], query, args)
else:
return run_query_no_retry(pool, metric["type"], query, args)
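
# For example, sampling a hypothetical "bgwriter" metric against the default
# database:
#   sample_metric(config["dbname"], "bgwriter", {})
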
def test_queries():
"""
Run all of the metric queries against a database and check the results
"""
# We just use the default db for tests
dbname = config["dbname"]
# Loop through all defined metrics.
for metric_name in config["metrics"].keys():
# Get the actual metric definition
        metric = config["metrics"][metric_name]
# If the metric has arguments to use while testing, grab those
args = metric.get("test_args", {})
# Run the query without the ability to retry.
res = sample_metric(dbname, metric_name, args, retry=False)
# Compare the result to the provided sample results
# TODO
# Return the number of errors
# TODO
return 0
class SimpleHTTPRequestHandler(BaseHTTPRequestHandler):
"""
This is our request handling server. It is responsible for listening for
requests, processing them, and responding.
"""
def log_request(self, code="-", size="-"):
"""
Override to suppress standard request logging
"""
pass
def do_GET(self):
"""
Handle a request. This is just a wrapper around the actual handler
code to keep things more readable.
"""
try:
self._handle_request()
except BrokenPipeError:
log.error("Client disconnected, exiting handler")
def _handle_request(self):
"""
Request handler
"""
# Parse the URL
parsed_path = urlparse(self.path)
metric_name = parsed_path.path.strip("/")
parsed_query = parse_qs(parsed_path.query)
if metric_name == "agent_version":
self._reply(200, VERSION)
return
# Note: parse_qs returns the values as a list. Since we always expect
# single values, just grab the first from each.
args = {key: values[0] for key, values in parsed_query.items()}
# Get the dbname. If none was provided, use the default from the
# config.
dbname = args.get("dbname", config["dbname"])
# Sample the metric
try:
self._reply(200, sample_metric(dbname, metric_name, args))
return
        except UnknownMetricError:
log.error("Unknown metric: {}".format(metric_name))
self._reply(404, "Unknown metric")
return
        except MetricVersionError:
            log.error(
                "Failed to find a version of {} for PostgreSQL {}".format(
                    metric_name, cluster_version
                )
            )
self._reply(404, "Unsupported version")
return
        except UnhappyDBError:
log.info("Database {} is unhappy, please be patient".format(dbname))
self._reply(503, "Database unavailable")
return
except Exception as e:
log.error("Error running query: {}".format(e))
self._reply(500, "Unexpected error: {}".format(e))
return
def _reply(self, code, content):
"""
Send a reply to the client
"""
self.send_response(code)
self.send_header("Content-type", "application/json")
self.end_headers()
self.wfile.write(bytes(content, "utf-8"))
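
# Example requests (assuming the default port and a configured metric named
# "bgwriter"):
#   curl 'http://127.0.0.1:5400/agent_version'
#   curl 'http://127.0.0.1:5400/bgwriter?dbname=postgres'
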
if __name__ == "__main__":
# Handle cli args
parser = argparse.ArgumentParser(
prog="pgmon", description="A PostgreSQL monitoring agent"
)
parser.add_argument(
"config_file",
default="pgmon.yml",
nargs="?",
help="The config file to read (default: %(default)s)",
)
    parser.add_argument("--test", action="store_true", help="Run query tests and exit")
args = parser.parse_args()
# Set the config file path
config_file = args.config_file
# Read the config file
read_config(config_file)
# Run query tests and exit if test mode is enabled
if args.test:
errors = test_queries()
if errors > 0:
sys.exit(1)
else:
sys.exit(0)
# Set up the http server to receive requests
server_address = ("127.0.0.1", config["port"])
httpd = ThreadingHTTPServer(server_address, SimpleHTTPRequestHandler)
    # Set up the signal handlers
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)
    signal.signal(signal.SIGHUP, signal_handler)
# Handle requests.
log.info("Listening on port {}...".format(config["port"]))
while running:
httpd.handle_request()
    # Clean up PostgreSQL connections
    # Note: psycopg2 pools provide closeall() (there is no close() method),
    # which closes every connection in the pool
    for pool in connections.values():
        pool.closeall()