2024-05-16 15:41:47 +00:00
|
|
|
#!/usr/bin/env python3
|
2025-09-23 05:12:49 +00:00
|
|
|
"""
|
|
|
|
|
pgmon is a monitoring intermediary that sits between a PostgreSQL cluster and a monitoring system
|
|
|
|
|
that is capable of parsing JSON responses over an HTTP connection.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
# pylint: disable=too-few-public-methods
|
2024-05-16 15:41:47 +00:00
|
|
|
|
2024-05-16 19:09:09 +00:00
|
|
|
import json
|
2024-10-24 16:23:19 +00:00
|
|
|
import time
|
2025-01-08 22:01:11 +00:00
|
|
|
import os
|
2025-05-18 16:52:32 +00:00
|
|
|
import sys
|
2025-09-23 05:12:49 +00:00
|
|
|
import signal
|
2024-10-24 16:23:19 +00:00
|
|
|
import argparse
|
|
|
|
|
import logging
|
2025-09-23 05:12:49 +00:00
|
|
|
import re
|
2026-02-04 05:36:54 +00:00
|
|
|
import stat
|
2026-03-16 03:54:27 +00:00
|
|
|
import errno
|
|
|
|
|
import requests
|
2024-05-18 18:09:43 +00:00
|
|
|
|
2025-09-23 05:12:49 +00:00
|
|
|
from decimal import Decimal
|
2024-10-31 05:03:43 +00:00
|
|
|
|
2025-09-23 05:12:49 +00:00
|
|
|
from urllib.parse import urlparse, parse_qs
|
2025-01-08 07:39:20 +00:00
|
|
|
|
|
|
|
|
from contextlib import contextmanager
|
2024-05-18 18:09:43 +00:00
|
|
|
|
2025-09-23 05:12:49 +00:00
|
|
|
from datetime import datetime, timedelta
|
2024-05-18 18:09:43 +00:00
|
|
|
|
2025-09-23 05:12:49 +00:00
|
|
|
from threading import Lock
|
|
|
|
|
|
|
|
|
|
import yaml
|
|
|
|
|
|
|
|
|
|
import psycopg2
|
|
|
|
|
from psycopg2.extras import RealDictCursor
|
|
|
|
|
from psycopg2.pool import ThreadedConnectionPool
|
2024-05-18 18:09:43 +00:00
|
|
|
|
2026-02-04 05:36:54 +00:00
|
|
|
from pwd import getpwnam
|
|
|
|
|
from grp import getgrnam
|
|
|
|
|
|
2026-03-16 03:54:27 +00:00
|
|
|
import llfuse
|
2025-06-01 06:44:01 +00:00
|
|
|
|
2025-07-03 05:47:06 +00:00
|
|
|
|
2025-10-18 05:00:30 +00:00
|
|
|
# Version string for this program (the "-rc1" suffix marks a release candidate)
VERSION = "1.1.0-rc1"
|
2024-10-29 05:36:04 +00:00
|
|
|
|
2024-05-18 18:09:43 +00:00
|
|
|
|
2025-09-23 05:12:49 +00:00
|
|
|
class Context:
    """
    The global context for connections, config, version, and IPC.

    This class is never instantiated; all state is kept in class attributes so
    it can be shared by every part of the program.
    """

    # The active configuration dictionary
    config = {}

    # PostgreSQL connection pools, plus the lock guarding pool creation
    connections_lock = Lock()
    connections = {}

    # Databases that are currently "unhappy". Each key is a database name and
    # each value is the time at which the database was found to be unhappy
    # plus the cooldown setting -- in other words, the earliest time at which
    # another connection attempt should be made.
    unhappy_cooldown = {}

    # Cached version of the monitored cluster, when to check it again, and
    # the lock serializing that check
    cluster_version = None
    cluster_version_next_check = None
    cluster_version_lock = Lock()

    # Cached latest available PostgreSQL version, when to check it again, the
    # lock serializing that check, and whether the release is still supported
    latest_version = None
    latest_version_next_check = None
    latest_version_lock = Lock()
    release_supported = None

    # Cleared to request a graceful shutdown
    running = True

    # Path of the config file
    config_file = None

    # Logger shared by the whole module
    log = logging.getLogger(__name__)

    @classmethod
    def init_logging(cls):
        """
        Attach a formatted stream handler to the shared logger.

        Because the Context class is never instantiated, this classmethod is
        the hook for customizing the log handler.
        """
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter(
                "%(asctime)s - %(levelname)s - %(filename)s: "
                "%(funcName)s() line %(lineno)d: %(message)s"
            )
        )
        cls.log.addHandler(handler)
|
2024-05-18 18:09:43 +00:00
|
|
|
|
2025-05-13 05:44:47 +00:00
|
|
|
|
2024-10-24 16:23:19 +00:00
|
|
|
# Error types
|
|
|
|
|
class ConfigError(Exception):
    """
    Raised for any problem with the configuration, such as an unparsable config file or an
    invalid metric definition.
    """
|
2025-05-13 05:44:47 +00:00
|
|
|
|
|
|
|
|
|
2024-10-24 16:23:19 +00:00
|
|
|
class DisconnectedError(Exception):
    """
    Raised when a database connection that used to be active turns out to have been closed.
    """
|
2025-05-13 05:44:47 +00:00
|
|
|
|
|
|
|
|
|
2024-10-31 05:03:43 +00:00
|
|
|
class UnhappyDBError(Exception):
    """
    Raised when a connection is requested for a database that is currently on the unhappy list.
    """
|
2025-05-13 05:44:47 +00:00
|
|
|
|
|
|
|
|
|
2025-05-18 16:52:32 +00:00
|
|
|
class UnknownMetricError(Exception):
    """
    Raised when a metric is requested that has not been defined.
    """
|
2025-05-18 16:52:32 +00:00
|
|
|
|
|
|
|
|
|
2025-01-08 22:01:11 +00:00
|
|
|
class MetricVersionError(Exception):
    """
    Raised when a requested metric has no query suitable for the version of PostgreSQL being
    monitored.
    """
|
2024-05-23 04:35:44 +00:00
|
|
|
|
2025-05-13 05:44:47 +00:00
|
|
|
|
2025-06-01 06:44:01 +00:00
|
|
|
class LatestVersionCheckError(Exception):
    """
    Raised when the latest version information could not be retrieved or could not be parsed.
    """
|
2025-06-01 06:44:01 +00:00
|
|
|
|
|
|
|
|
|
2025-10-09 03:53:45 +00:00
|
|
|
class InvalidDataError(Exception):
    """
    Raised when query results are not valid for the way they were asked to be returned.
    """
|
|
|
|
|
|
|
|
|
|
|
2024-10-24 16:23:19 +00:00
|
|
|
# Default config settings
|
2025-09-23 05:12:49 +00:00
|
|
|
DEFAULT_CONFIG = {
    # --- Agent identity and FUSE file system ---
    # Name of this agent instance (controls the FUSE directory name)
    "name": "pgmon",
    # Directory under which the FUSE file system lives
    "fuse_base": "/run/pgmon",
    # Permission bits for FUSE files (octal)
    "fuse_mode": 0o400,
    # User that owns FUSE files
    "fuse_owner": "zabbix",
    # Group that owns FUSE files
    "fuse_group": "zabbix",
    # User the agent runs as
    "agent_user": "pgmon",
    # Group the agent runs as
    "agent_group": "pgmon",
    # --- Connection pooling (per database) ---
    # Smallest number of pooled PostgreSQL connections
    "min_pool_size": 0,
    # Largest number of pooled PostgreSQL connections
    "max_pool_size": 4,
    # Seconds an idle connection may stay in the pool before being removed
    "max_idle_time": 30,
    # --- Logging ---
    # Level for stderr logging
    "log_level": "error",
    # --- Database connection ---
    # Role used to connect
    "dbuser": "postgres",
    # Host (or socket directory) to connect to
    "dbhost": "/var/run/postgresql",
    # Port to connect to
    "dbport": 5432,
    # Database used when a metric does not name one
    "dbname": "postgres",
    # SSL connection mode
    "ssl_mode": "require",
    # --- Timeouts and periods (seconds) ---
    # How long to wait for a free connection slot in a pool
    "pool_slot_timeout": 5,
    # PostgreSQL connection timeout
    # Note: retries mean the effective wait can be twice this value
    "connect_timeout": 5,
    # Delay before another reconnect attempt after a reconnect failure
    "reconnect_cooldown": 30,
    # Interval between checks of the PostgreSQL cluster version
    "version_check_period": 300,
    # Interval between checks of the latest supported PostgreSQL version
    "latest_version_check_period": 86400,
    # --- Metric definitions ---
    "metrics": {},
}
|
2024-05-23 04:35:44 +00:00
|
|
|
|
2025-05-13 05:44:47 +00:00
|
|
|
|
2024-10-29 06:51:57 +00:00
|
|
|
def update_deep(d1, d2):
    """
    Merge d2 into d1 recursively. Nested dictionaries are merged key by key,
    lists from d2 are appended to the matching list in d1, and every other
    value simply replaces whatever d1 had. The first dict is modified in place
    and is also the return value.

    Params:
        d1: the dictionary to update
        d2: the dictionary to get new values from

    Returns:
        The new d1

    Raises:
        TypeError: if either argument is not a dict, or if a dict/list in d2
                   lines up with a value of a different type in d1
    """
    if not (isinstance(d1, dict) and isinstance(d2, dict)):
        raise TypeError("Both arguments to update_deep need to be dictionaries")

    for key, incoming in d2.items():
        if isinstance(incoming, dict):
            # Merge into the existing sub-dict (or a fresh one if absent)
            current = d1.get(key, {})
            if not isinstance(current, dict):
                raise TypeError(
                    "Type mismatch between dictionaries: {} is not a dict".format(
                        type(current).__name__
                    )
                )
            d1[key] = update_deep(current, incoming)
            continue

        if isinstance(incoming, list):
            # Build a new list holding the old items followed by the new ones
            current = d1.get(key, [])
            if not isinstance(current, list):
                raise TypeError(
                    "Type mismatch between dictionaries: {} is not a list".format(
                        type(current).__name__
                    )
                )
            d1[key] = current + incoming
            continue

        # Scalars (and anything else) overwrite
        d1[key] = incoming

    return d1
|
|
|
|
|
|
2025-05-13 05:44:47 +00:00
|
|
|
|
2025-09-23 05:12:49 +00:00
|
|
|
def validate_metric(path, name, metric):
    """
    Validate a metric definition from a given file. If any query definitions come from external
    files, the metric dict will be updated with the actual query.

    Params:
        path:   path to the file which contains this definition
        name:   name of the metric
        metric: the dictionary containing the metric definition

    Raises:
        ConfigError: if the definition is not a dictionary, the return type is missing or
                     invalid, no queries are defined, a version key is not an integer, or a
                     query is not a string
        OSError:     if an external query file cannot be opened or read
    """
    # A metric with no body in the YAML parses to None (or some other non-dict)
    if not isinstance(metric, dict):
        raise ConfigError(
            "Metric definition should be a dictionary, got: {} for metric {} in {}".format(
                metric, name, path
            )
        )

    # Validate return types
    try:
        return_type = metric["type"]
    except KeyError as e:
        raise ConfigError(
            "No type specified for metric {} in {}".format(name, path)
        ) from e
    if return_type not in ["value", "row", "column", "set"]:
        raise ConfigError(
            "Invalid return type: {} for metric {} in {}".format(return_type, name, path)
        )

    # Ensure queries exist
    query_dict = metric.get("query", {})
    if not isinstance(query_dict, dict):
        raise ConfigError(
            "Query definition should be a dictionary, got: {} for metric {} in {}".format(
                query_dict, name, path
            )
        )
    if not query_dict:
        raise ConfigError("Missing queries for metric {} in {}".format(name, path))

    # Read external sql files and validate version keys
    config_base = os.path.dirname(path)
    for vers, query in query_dict.items():
        # Version keys only need to be convertible to an integer
        try:
            int(vers)
        except (TypeError, ValueError, OverflowError) as e:
            raise ConfigError(
                "Invalid version: {} for metric {} in {}".format(vers, name, path)
            ) from e

        # Reject non-string queries here rather than crashing on .startswith() below
        if not isinstance(query, str):
            raise ConfigError(
                "Query should be a string, got: {} for version {} of metric {} in {}".format(
                    query, vers, name, path
                )
            )

        # Read in the external query and update the definition in the metric dictionary
        if query.startswith("file:"):
            query_path = query[5:]
            # Relative paths are resolved against the directory of the config file
            if not query_path.startswith("/"):
                query_path = os.path.join(config_base, query_path)
            with open(query_path, "r", encoding="utf-8") as f:
                # Only the value of an existing key changes, so this is safe mid-iteration
                query_dict[vers] = f.read()
|
|
|
|
|
|
|
|
|
|
|
2025-05-13 05:44:47 +00:00
|
|
|
def read_config(path, included=False):
    """
    Read a config file, validate its metric definitions, and merge in any files it includes.

    For an included file the parsed (and merged) dictionary is simply returned. For the
    top-level file the result is layered over DEFAULT_CONFIG, sanity checked, stored in
    Context.config, and the log level is applied. Context.config is only replaced once all
    checks have passed.

    params:
        path: path to the file to read
        included: is this file included by another file?

    returns: the config read from the file (included=True) or the new Context.config

    raises:
        ConfigError: if the file is not valid YAML, is not a dictionary at the top level,
                     contains an invalid metric, defines no metrics, or has an invalid log level
        OSError:     if the file (or an included file) cannot be opened
    """

    # Read config file
    Context.log.info("Reading config file: %s", path)
    with open(path, "r", encoding="utf-8") as f:
        try:
            cfg = yaml.safe_load(f)
        except yaml.YAMLError as e:
            # YAMLError is the base class, so scanner errors etc. are covered as well as
            # parser errors
            raise ConfigError("Invalid config file: {}: {}".format(path, e)) from e

    # An empty file parses to None; treat it as an empty config
    if cfg is None:
        cfg = {}
    if not isinstance(cfg, dict):
        raise ConfigError(
            "Invalid config file: {}: top level should be a dictionary, got: {}".format(
                path, type(cfg).__name__
            )
        )

    # Read any external queries and validate metric definitions
    for name, metric in (cfg.get("metrics") or {}).items():
        validate_metric(path, name, metric)

    # Read any included config files
    config_base = os.path.dirname(path)
    for inc in list(cfg.get("include") or []):
        # Prefix relative paths with the directory from the current config
        if not inc.startswith("/"):
            inc = os.path.join(config_base, inc)
        update_deep(cfg, read_config(inc, included=True))

    # Return the config we read if this is an include, otherwise set the final
    # config
    if included:
        return cfg

    new_config = {}
    update_deep(new_config, DEFAULT_CONFIG)
    update_deep(new_config, cfg)

    # Minor sanity checks
    if not new_config["metrics"]:
        Context.log.error("No metrics are defined")
        raise ConfigError("No metrics defined")

    # Validate the new log level before changing the config
    level_name = str(new_config["log_level"]).upper()
    if level_name not in ("DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"):
        raise ConfigError("Invalid log level: {}".format(new_config["log_level"]))

    Context.config = new_config

    # Apply changes to log level
    Context.log.setLevel(logging.getLevelName(level_name))

    # Return the config (mostly to make pylint happy, but also in case the side effect is
    # later removed to make this more functional).
    return Context.config
|
2024-05-23 04:35:44 +00:00
|
|
|
|
2025-09-23 05:12:49 +00:00
|
|
|
|
|
|
|
|
def signal_handler(sig, frame):  # pylint: disable=unused-argument
    """
    Function for handling signals

    INT, TERM, QUIT => Shut down
    HUP, USR1       => Reload

    Params:
        sig:   the signal number that was received
        frame: the current stack frame (unused)
    """
    # Signal everything to shut down
    if sig in [signal.SIGINT, signal.SIGTERM, signal.SIGQUIT]:
        # Restore the default SIGINT handler, but only once a shutdown has been requested.
        # Doing this on every signal would mean a reload silently disables graceful SIGINT
        # handling.
        signal.signal(signal.SIGINT, signal.default_int_handler)

        Context.log.info("Shutting down ...")
        Context.running = False

    # Signal a reload
    if sig in [signal.SIGHUP, signal.SIGUSR1]:
        Context.log.warning("Received config reload signal")
        try:
            read_config(Context.config_file)
        except Exception as e:  # pylint: disable=broad-except
            # read_config only replaces Context.config after validation, so on failure the
            # previous config is still in place. Log and carry on rather than letting the
            # exception escape from the signal handler.
            Context.log.error("Config reload failed, keeping current config: %s", e)
|
2024-05-16 15:41:47 +00:00
|
|
|
|
2025-05-13 05:44:47 +00:00
|
|
|
|
2025-01-08 07:39:20 +00:00
|
|
|
class ConnectionPool(ThreadedConnectionPool):
    """
    Threaded connection pool that has a context manager.
    """

    def __init__(self, dbname, minconn, maxconn, *args, **kwargs):
        """
        Create a pool for one database.

        Params:
            dbname:  name of the database; also stored as the pool's name
            minconn: minimum pool size (passed to ThreadedConnectionPool)
            maxconn: maximum pool size (passed to ThreadedConnectionPool)
            args, kwargs: passed through to ThreadedConnectionPool
        """
        # Make sure dbname isn't different in the kwargs
        kwargs["dbname"] = dbname

        super().__init__(minconn, maxconn, *args, **kwargs)
        self.name = dbname

    @contextmanager
    def connection(self, timeout):
        """
        Connection context manager for our connection pool. This will attempt to retrieve a
        connection until the timeout is reached. At least one attempt is always made. The
        connection is handed back to the pool when the context exits.

        Params:
            timeout: how long (in seconds) to keep trying to get a connection before giving up

        Raises:
            TimeoutError: if no connection slot became available before the timeout
        """
        # timedelta's first positional argument is days, so seconds must be named
        deadline = datetime.now() + timedelta(seconds=timeout)

        # Keep trying to get a connection slot until we time out
        conn = None
        while conn is None:
            try:
                conn = self.getconn()
            except psycopg2.pool.PoolError as e:
                if datetime.now() >= deadline:
                    raise TimeoutError(
                        "Timed out waiting for an available connection to {}".format(self.name)
                    ) from e
                # If we failed to get the connection slot, wait a bit and try again
                time.sleep(0.1)

        # Yield outside of the retry loop so an exception raised by the caller's block
        # (even a PoolError) propagates instead of causing a second yield
        try:
            yield conn
        finally:
            self.putconn(conn)
|
|
|
|
|
|
2024-05-23 04:35:44 +00:00
|
|
|
|
2024-10-24 16:23:19 +00:00
|
|
|
def get_pool(dbname):
    """
    Get a database connection pool, creating it first if it does not exist yet.

    Params:
        dbname: the name of the database for which a connection pool should be returned.

    Returns:
        The ConnectionPool for the database

    Raises:
        UnhappyDBError: if the database is on the unhappy list and its cooldown has not expired
    """
    # Check if the db is unhappy and wants to be left alone. A single .get() is used so that
    # another thread clearing the entry between a membership test and a lookup cannot cause a
    # KeyError.
    retry_after = Context.unhappy_cooldown.get(dbname)
    if retry_after is not None and retry_after > datetime.now():
        raise UnhappyDBError()

    # Create a connection pool if it doesn't already exist
    if dbname not in Context.connections:
        with Context.connections_lock:
            # Make sure nobody created the pool while we were waiting on the
            # lock
            if dbname not in Context.connections:
                Context.log.info("Creating connection pool for: %s", dbname)
                # Actually create the connection pool
                Context.connections[dbname] = ConnectionPool(
                    dbname,
                    int(Context.config["min_pool_size"]),
                    int(Context.config["max_pool_size"]),
                    application_name="pgmon",
                    host=Context.config["dbhost"],
                    port=Context.config["dbport"],
                    user=Context.config["dbuser"],
                    connect_timeout=int(Context.config["connect_timeout"]),
                    sslmode=Context.config["ssl_mode"],
                )
                # Clear the unhappy indicator if present
                Context.unhappy_cooldown.pop(dbname, None)

    return Context.connections[dbname]
|
2024-10-24 16:23:19 +00:00
|
|
|
|
2025-05-13 05:44:47 +00:00
|
|
|
|
2024-10-31 05:03:43 +00:00
|
|
|
def handle_connect_failure(pool):
    """
    Put the pool's database on the unhappy list so it is left alone until the
    configured reconnect cooldown has passed.

    Params:
        pool: the connection pool whose database failed
    """
    retry_after = datetime.now() + timedelta(
        seconds=int(Context.config["reconnect_cooldown"])
    )
    Context.unhappy_cooldown[pool.name] = retry_after
|
|
|
|
|
|
2024-10-24 16:23:19 +00:00
|
|
|
|
|
|
|
|
def get_query(metric, version):
    """
    Get the correct metric query for a given version of PostgreSQL.

    The query chosen is the one with the highest version key that does not exceed the given
    version. Version keys are compared numerically, so keys given as strings (which config
    validation accepts as long as they convert to an integer) behave the same as integers.

    params:
        metric:  The metric definition
        version: The PostgreSQL version number, as given by server_version_num

    returns: the query text

    raises:
        MetricVersionError: if the selected query is blank (the metric no longer applies to
                            this version) or if no query covers this version
    """
    queries = metric["query"]

    # Walk the version keys from newest to oldest and take the first that applies
    for v in sorted(queries, key=int, reverse=True):
        if version >= int(v):
            query = queries[v]
            if not query.strip():
                raise MetricVersionError(
                    "Metric no longer applies to PostgreSQL {}".format(version)
                )
            return query

    raise MetricVersionError("Missing metric query for PostgreSQL {}".format(version))
|
2024-05-23 04:35:44 +00:00
|
|
|
|
2024-05-16 19:09:09 +00:00
|
|
|
|
2025-07-03 05:47:06 +00:00
|
|
|
def json_encode_special(obj):
    """
    Fallback encoder for values the standard JSON package cannot serialize by
    itself. Decimal values are converted to float; anything else is rejected.

    Params:
        obj: the object to encode

    Raises:
        TypeError: if the object is not a Decimal
    """
    if not isinstance(obj, Decimal):
        raise TypeError("Cannot serialize object of {}".format(type(obj)))
    return float(obj)
|
2025-07-03 05:47:06 +00:00
|
|
|
|
|
|
|
|
|
2025-10-09 03:53:45 +00:00
|
|
|
def json_encode_result(return_type, res):
    """
    Encode query results as a string in the shape requested.

    params:
        return_type: the expected structure to return. One of:
            value:  the first column of the first row, as a plain string
                    (an empty string when there are no rows)
            row:    the first row as a JSON object ({} when there are no rows)
            column: the first column of every row as a JSON array
            set:    every row as a JSON array of objects
        res: the query results (a list of dict-like rows)

    returns: a json string form of the results

    raises:
        ConfigError: when an invalid return_type is given
        InvalidDataError: when the query results don't match the return type
    """
    try:
        # A single value is returned as-is rather than being JSON encoded
        if return_type == "value":
            if not res:
                return ""
            first_row = res[0]
            return str(list(first_row.values())[0])

        # Everything else is JSON; work out what needs to be serialized
        if return_type == "row":
            payload = res[0] if res else {}
        elif return_type == "column":
            payload = [list(row.values())[0] for row in res]
        elif return_type == "set":
            payload = res
        else:
            # None of the known types matched
            raise ConfigError("Invalid query return type: {}".format(return_type))

        return json.dumps(payload, default=json_encode_special)
    except IndexError as e:
        raise InvalidDataError(e) from e
|
|
|
|
|
|
|
|
|
|
|
2024-10-31 05:03:43 +00:00
|
|
|
def run_query_no_retry(pool, return_type, query, args):
    """
    Run a query once, without any explicit retry logic, and return the encoded result.

    Params:
        pool:        the connection pool to take a connection from
        return_type: result shape, passed on to json_encode_result
        query:       the SQL to execute
        args:        the query parameters

    Raises:
        UnhappyDBError:    if the query failed and the database is on the unhappy list
        DisconnectedError: if the query failed and the connection is closed
        Otherwise the original exception is re-raised unchanged.
    """
    # NOTE(review): DEFAULT_CONFIG also defines pool_slot_timeout, described as the timeout for
    # getting a connection slot from a pool, but connect_timeout is what is used here --
    # confirm which one is intended.
    slot_timeout = float(Context.config["connect_timeout"])

    with pool.connection(slot_timeout) as conn:
        try:
            with conn.cursor(cursor_factory=RealDictCursor) as curs:
                curs.execute(query, args)
                rows = curs.fetchall()

                return json_encode_result(return_type, rows)
        except Exception as e:
            # Classify the failure: known-unhappy database first, then a dead connection,
            # and finally anything else is passed through untouched
            if pool.name in Context.unhappy_cooldown:
                raise UnhappyDBError() from e
            if conn.closed != 0:
                raise DisconnectedError() from e
            raise
|
2024-10-24 16:23:19 +00:00
|
|
|
|
2025-05-13 05:44:47 +00:00
|
|
|
|
2024-10-24 16:23:19 +00:00
|
|
|
def run_query(pool, return_type, query, args):
    """
    Run a query, retrying once if the connection turns out to have been closed.

    psycopg doesn't know that a connection has gone away (ie: PostgreSQL was
    restarted or the backend was terminated) until a query is executed on it,
    so the first attempt may hit a stale connection. In that case we wait a
    second and try again.

    The pool has a retry mechanism of its own, but that only covers new
    connections being made.

    A query is not retried if the query itself failed, or if the database
    connection could not be established.

    Raises:
        UnhappyDBError: if the second attempt fails for any reason; the
                        database is also put on the unhappy list
    """

    def _attempt():
        return run_query_no_retry(pool, return_type, query, args)

    # If we were disconnected, returning the connection to the pool is
    # believed to close the dead connection, so a second attempt should get a
    # working one.
    try:
        return _attempt()
    except DisconnectedError:
        Context.log.warning("Stale PostgreSQL connection found ... trying again")
        # Annoying hack: pause so the pool workers have time to actually mark
        # the connection, otherwise the same one can be handed back by the
        # next connection() call
        # TODO: verify this is the case with psycopg2
        time.sleep(1)
        try:
            return _attempt()
        except Exception as e:
            handle_connect_failure(pool)
            raise UnhappyDBError() from e
|
2024-10-24 16:23:19 +00:00
|
|
|
|
2025-05-13 05:44:47 +00:00
|
|
|
|
2024-10-31 06:18:45 +00:00
|
|
|
def get_cluster_version():
    """
    Return the PostgreSQL version of the monitored cluster, querying the
    database for it when it is not yet known or when it has been too long
    since the last time it was checked.
    """

    def _check_is_due():
        # True when no version is cached, no recheck is scheduled, or the
        # scheduled recheck time has passed
        return (
            Context.cluster_version is None
            or Context.cluster_version_next_check is None
            or Context.cluster_version_next_check < datetime.now()
        )

    # Fast path: the cached value is still good
    if not _check_is_due():
        return Context.cluster_version

    # Only one thread needs to refresh the version. Everyone takes the lock
    # in turn and then re-checks whether somebody else already did the work.
    with Context.cluster_version_lock:
        if _check_is_due():
            Context.log.info("Checking PostgreSQL cluster version")
            pool = get_pool(Context.config["dbname"])
            version_text = run_query(pool, "value", "SHOW server_version_num", None)
            Context.cluster_version = int(version_text)
            Context.cluster_version_next_check = datetime.now() + timedelta(
                seconds=int(Context.config["version_check_period"])
            )
            Context.log.info(
                "Got PostgreSQL cluster version: %s", Context.cluster_version
            )
            Context.log.debug(
                "Next PostgreSQL cluster version check will be after: %s",
                Context.cluster_version_next_check,
            )

    return Context.cluster_version
|
2024-10-31 06:18:45 +00:00
|
|
|
|
2025-06-02 07:41:15 +00:00
|
|
|
|
2025-06-02 07:39:52 +00:00
|
|
|
def version_num_to_release(version_num):
    """
    Extract the release from a version_num.

    Before PostgreSQL 10 the release is made up of the first two components
    of the version; from 10 onward it is just the first component. So:
        90603 => 9.6
        130010 => 13

    Params:
        version_num: the version in server_version_num form
    """
    major, remainder = divmod(version_num, 10000)
    if major >= 10:
        return major

    # Pre-10: the hundreds part of the remainder is the second component
    return major + (remainder // 100 / 10)
|
2025-06-02 07:39:52 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_version_rss(raw_rss, release):
    """
    Parse the raw RSS from the versions.rss feed to extract the latest version of
    PostgreSQL that's available for the cluster being monitored.

    This sets these Context variables:
        latest_version
        release_supported

    It is expected that the caller already holds the latest_version_lock lock.

    params:
        raw_rss: The raw rss text from versions.rss
        release: The PostgreSQL release we care about (ex: 9.2, 14)

    raises:
        LatestVersionCheckError: if the release is not found in the feed, or the version
                                 listed for it cannot be parsed
    """

    # Regular expressions for parsing the RSS document. The release is escaped so that the
    # "." in releases such as 9.6 only matches a literal dot.
    version_line = re.compile(
        r".*?([0-9][0-9.]+) is the latest release in the {} series.*".format(
            re.escape(str(release))
        )
    )
    unsupported_line = re.compile(r"^This version is unsupported")

    # Loop through the RSS until we find the current release
    release_found = False
    for line in raw_rss.splitlines():
        m = version_line.match(line)
        if m:
            # Note that we found the version we were looking for
            release_found = True

            # Convert the version to version_num format
            version = m.group(1)
            try:
                parts = [int(part) for part in version.split(".")]
                if parts[0] < 10:
                    # Pre-10 versions have three components: major.minor.patch
                    latest = int("{}{:02}{:02}".format(parts[0], parts[1], parts[2]))
                else:
                    # From 10 onward there are two components: major.minor
                    latest = int("{}00{:02}".format(parts[0], parts[1]))
            except (ValueError, IndexError) as e:
                # Too few components or a non-numeric component
                raise LatestVersionCheckError(
                    "Unable to parse version: {}".format(version)
                ) from e
            Context.latest_version = latest
        elif release_found:
            # The next line after the version tells if the version is supported
            Context.release_supported = unsupported_line.match(line) is None
            break

    # Make sure we actually found it
    if not release_found:
        raise LatestVersionCheckError("Current release ({}) not found".format(release))

    Context.log.info(
        "Got latest PostgreSQL version: %s supported=%s",
        Context.latest_version,
        Context.release_supported,
    )
    Context.log.debug(
        "Next latest PostgreSQL version check will be after: %s",
        Context.latest_version_next_check,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_latest_version():
    """
    Get the latest supported version of the major PostgreSQL release running on the server being
    monitored.

    The result is cached in Context.latest_version. The PostgreSQL versions RSS feed is only
    fetched when no version is known yet or Context.latest_version_next_check has passed.

    Returns:
        Context.latest_version, as set by parse_version_rss().

    Raises:
        LatestVersionCheckError: if the feed cannot be fetched (network failure, timeout or a
            non-200 response), or if parse_version_rss() cannot find the running release.
    """

    def refresh_needed():
        # True when nothing is cached yet or the cached value is past its recheck time
        return (
            Context.latest_version is None
            or Context.latest_version_next_check is None
            or Context.latest_version_next_check < datetime.now()
        )

    # Only one thread needs to refresh the value, so they all try to grab the lock and then
    # re-check to make sure nobody else beat them to it.
    if refresh_needed():
        # Note: we get the cluster version here before grabbing the latest_version_lock
        # lock so it's not held while trying to talk with the DB.
        release = version_num_to_release(get_cluster_version())

        with Context.latest_version_lock:
            # Only check if nobody already got the version before us
            if refresh_needed():
                Context.log.info("Checking latest PostgreSQL version")

                # The next check time is pushed out before the request is made, so a failed
                # fetch is not retried until the period has elapsed (unless no version has
                # ever been retrieved).
                Context.latest_version_next_check = datetime.now() + timedelta(
                    seconds=int(Context.config["latest_version_check_period"])
                )

                # Grab the RSS feed. Transport level failures are converted so that callers
                # only have to deal with LatestVersionCheckError.
                try:
                    raw_rss = requests.get(
                        "https://www.postgresql.org/versions.rss", timeout=30
                    )
                except requests.RequestException as e:
                    raise LatestVersionCheckError(
                        "request failed: {}".format(e)
                    ) from e

                if raw_rss.status_code != 200:
                    raise LatestVersionCheckError("code={}".format(raw_rss.status_code))

                # Parse the RSS body and set Context variables
                parse_version_rss(raw_rss.text, release)

    return Context.latest_version
|
2025-06-01 06:44:01 +00:00
|
|
|
|
|
|
|
|
|
2025-05-18 16:52:32 +00:00
|
|
|
def sample_metric(dbname, metric_name, args, retry=True):
    """
    Run the appropriate query for the named metric against the specified database

    Params:
        dbname:      name of the database to run the query against
        metric_name: key of the metric in the "metrics" section of the config
        args:        arguments handed to the query runner
        retry:       use run_query() when true, run_query_no_retry() otherwise

    Returns:
        Whatever the selected query runner returns.

    Raises:
        UnknownMetricError: if metric_name is not defined in the config
    """
    # Look up the definition of the requested metric
    try:
        definition = Context.config["metrics"][metric_name]
    except KeyError as e:
        raise UnknownMetricError("Unknown metric: {}".format(metric_name)) from e

    # Fetch (or lazily create) the connection pool for the database
    db_pool = get_pool(dbname)

    # Pick the query variant that matches the PostgreSQL version of the cluster
    cluster_version = get_cluster_version()
    sql = get_query(definition, cluster_version)

    # Run the query with the requested retry behaviour
    runner = run_query if retry else run_query_no_retry
    return runner(db_pool, definition["type"], sql, args)
|
2025-05-18 16:52:32 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_queries():
    """
    Run all of the metric queries against a database and check the results

    Every metric defined in the config is sampled once against the default database, using the
    metric's "test_args" when it has any, and the outcome is printed.

    Returns:
        The number of errors. Result checking is not implemented yet, so this is always 0.
    """
    # Tests always run against the default database
    default_db = Context.config["dbname"]

    for name, metric in Context.config["metrics"].items():
        # Use the metric's test arguments when it defines any
        args = metric.get("test_args", {})
        arg_summary = ", ".join(
            "{}={}".format(key, value) for key, value in args.items()
        )
        print("Testing {} [{}]".format(name, arg_summary))

        # When testing against a docker container, we may end up connecting
        # before the service is truly up (it restarts during the initialization
        # phase). To cope with this, a handful of connection failures are tolerated:
        # the countdown runs 5..0 and the failure at 0 is re-raised.
        for remaining in range(5, -1, -1):
            try:
                # The query itself runs without the ability to retry
                res = sample_metric(default_db, name, args, retry=False)
            except MetricVersionError:
                res = "Unsupported for this version"
            except psycopg2.OperationalError as e:
                print("Error encountered, {} tries left: {}".format(remaining, e))
                if remaining <= 0:
                    raise
                time.sleep(1)
                continue
            break

        # Compare the result to the provided sample results
        # TODO
        print("{} -> {}".format(name, res))

    # Return the number of errors
    # TODO
    return 0
|
|
|
|
|
|
|
|
|
|
|
2026-03-16 03:54:27 +00:00
|
|
|
class PGMonFuse(llfuse.Operations):
    """
    This is our FUSE filesystem for requests from Zabbix. It is responsible for listening for
    requests, processing them, and responding.

    The filesystem is a single flat directory. A file is named after a built-in item or a
    configured metric, optionally followed by arguments: "name:key=value,key=value". The
    result is produced when the file is opened and is then served by read().

    Bad requests are reported by raising llfuse.FUSEError with an errno.
    NOTE(review): per the llfuse documentation, any other exception escaping a request handler
    is treated as fatal for the main loop, which is why the handlers below convert malformed
    input into FUSEError or into an "ERROR: ..." result instead of letting it propagate.
    """

    def __init__(self):
        super().__init__()
        # Items answered by pgmon itself rather than by a configured metric query
        self.builtin = ["agent_version", "latest_version_info", "sleep"]
        self.update_files()

        # State for every inode handed out by lookup(), keyed by inode number
        self.inodes = {}
        # State for every open file handle, keyed by file handle number
        self.file_handles = {}

    @staticmethod
    def gen_attrs(inode, is_dir):
        """
        Build the attributes reported for an inode.

        Params:
            inode:  the inode number to report
            is_dir: True for the directory itself, False for a regular file

        Returns:
            An llfuse.EntryAttributes with mode, owner and group taken from the "fuse_mode",
            "fuse_owner" and "fuse_group" config settings.
        """
        attrs = llfuse.EntryAttributes()
        mode = Context.config["fuse_mode"]
        if is_dir:
            # Directories additionally get the owner execute (search) bit
            attrs.st_mode = mode | stat.S_IXUSR | stat.S_IFDIR
        else:
            attrs.st_mode = mode | stat.S_IFREG
        attrs.st_nlink = 1
        # NOTE(review): fixed placeholder size; the real content is only produced in open().
        # Confirm that results longer than this are still delivered in full (the direct_io
        # mount option is currently commented out in main()).
        attrs.st_size = 1024
        attrs.st_uid = getpwnam(Context.config["fuse_owner"]).pw_uid
        attrs.st_gid = getgrnam(Context.config["fuse_group"]).gr_gid
        attrs.st_ino = inode
        return attrs

    def update_files(self):
        """
        Rebuild the directory listing: the built-in items followed by the sorted metric names.
        """
        self.files = self.builtin + sorted(Context.config["metrics"])

    @staticmethod
    def path_to_inode(path):
        """
        Provide a consistent inode for a given file

        The inode is the hash() of the name, truncated to 64 bits.
        """
        return hash(path) & 0xFFFFFFFFFFFFFFFF

    @staticmethod
    def _parse_args(arg_str):
        """
        Turn "key=value,key=value" into a dict. Empty items are ignored.

        Raises:
            llfuse.FUSEError(EINVAL): if an item has no "=" separator
        """
        args = {}
        for arg in arg_str.split(","):
            if arg == "":
                continue
            print(f"Splitting: {arg}")
            key, sep, value = arg.partition("=")
            if not sep:
                # A bare word is not a valid argument; reject the request instead of letting
                # an unpacking error escape the handler.
                raise llfuse.FUSEError(errno.EINVAL)
            args[key] = value
        return args

    def lookup(self, inode_p, name, ctx=None):
        """
        Resolve a file name in the top level directory.

        The first lookup of a name parses it into a base name and arguments and records that
        under the inode. Every successful lookup increases the inode's lookup count, which
        forget() later decreases.

        Params:
            inode_p: inode of the parent directory
            name:    the file name (bytes)
            ctx:     request context (unused)

        Returns:
            The attributes of the file.

        Raises:
            llfuse.FUSEError(ENOENT): if the parent is not the root directory, the name is not
                valid UTF-8, or the base name is neither a built-in item nor a metric
            llfuse.FUSEError(EINVAL): if the arguments are malformed
        """
        print(f"Lookup called with: inode_p={inode_p} name={name}")

        # We only have a single level, so if the parent inode is anything else
        # then it doesn't exist
        if inode_p != llfuse.ROOT_INODE:
            raise llfuse.FUSEError(errno.ENOENT)

        # Generate the inode for this file (ie: query + args)
        inode = self.path_to_inode(name)

        # Get the file info if we already have it, create it otherwise
        file = self.inodes.get(inode)
        if file is None:
            try:
                decoded = name.decode("utf-8")
            except UnicodeDecodeError as e:
                raise llfuse.FUSEError(errno.ENOENT) from e

            # Separate out the name components
            base_name, _, arg_str = decoded.partition(":")

            # Make sure it's a valid request
            if base_name not in self.files:
                raise llfuse.FUSEError(errno.ENOENT)

            file = {
                "name": base_name,
                "args": self._parse_args(arg_str),
                "error": None,
                "attrs": None,
                "count": 0,
            }
            self.inodes[inode] = file

        if not file["attrs"]:
            # Fuse shouldn't be calling this for the base directory, so it
            # should be safe to assume we're dealing with a file.
            file["attrs"] = self.gen_attrs(inode, False)

        # Increase the lookup count
        file["count"] += 1

        return file["attrs"]

    def getattr(self, inode, ctx=None):
        """
        Return the attributes of the root directory or of a previously looked up file.

        Raises:
            llfuse.FUSEError(ENOENT): if the inode has not been looked up
        """
        print(f"getattr called with: inode={inode}")

        if inode == llfuse.ROOT_INODE:
            # If we're looking at the directory itself
            return self.gen_attrs(inode, True)

        try:
            return self.inodes[inode]["attrs"]
        except KeyError as e:
            print(f"Getattr called without a lookup first for: inode={inode}")
            raise llfuse.FUSEError(errno.ENOENT) from e

    def opendir(self, inode, ctx):
        """
        Open the top level directory. The inode doubles as the directory handle.

        Note: We only allow one directory level

        Raises:
            llfuse.FUSEError(ENOENT): for anything other than the root directory
        """
        print(f"opendir called with: inode={inode}")

        if inode != llfuse.ROOT_INODE:
            raise llfuse.FUSEError(errno.ENOENT)
        return inode

    def readdir(self, fh, off):
        """
        List the directory starting at entry number off.

        Yields:
            (name, attributes, offset of the next entry) for each listed file.

        Raises:
            llfuse.FUSEError(EBADF): if fh is not the handle returned by opendir()
        """
        print(f"readdir called with: fh={fh} off={off}")

        # Something has gone wrong if this was called for anything other than the top level
        # directory. This is an explicit check because an assert is skipped under -O.
        if fh != llfuse.ROOT_INODE:
            raise llfuse.FUSEError(errno.EBADF)

        for i, entry in enumerate(self.files[off:]):
            name = entry.encode("utf-8")
            print(f" yielding: {name}")
            # The attributes are generated directly instead of going through lookup():
            # listing a directory must not increase lookup counts, since there is no matching
            # forget() to bring them back down.
            yield (
                name,
                self.gen_attrs(self.path_to_inode(name), False),
                i + off + 1,
            )

    def new_file_handle(self, inode=None):
        """
        Pick an unused file handle (int) and create an entry in file_handles for it

        returns: the file handle number
        """
        # Find the first unused number
        new_fh = 0
        while new_fh in self.file_handles:
            new_fh += 1

        # Create the empty entry for the handle
        self.file_handles[new_fh] = {
            "inode": inode,
            "data": None,
        }

        return new_fh

    @staticmethod
    def _generate_data(metric_name, metric_args):
        """
        Produce the content (bytes) for a file.

        Failures are reported in-band as an "ERROR: ..." message so that the reader gets an
        answer instead of an I/O error.
        """
        if metric_name == "agent_version":
            return VERSION.encode("utf-8")

        if metric_name == "sleep":
            seconds = metric_args.get("seconds", "10")
            try:
                duration = int(seconds)
            except ValueError:
                duration = -1
            if duration < 0:
                Context.log.error("Invalid sleep duration: %s", seconds)
                return "ERROR: Invalid sleep duration".encode("utf-8")
            # Sleep with the llfuse lock released
            with llfuse.lock_released:
                time.sleep(duration)
            return seconds.encode("utf-8")

        if metric_name == "latest_version_info":
            try:
                get_latest_version()
                return json.dumps(
                    {
                        "latest": Context.latest_version,
                        "supported": 1 if Context.release_supported else 0,
                    }
                ).encode("utf-8")
            # Deliberately broad: besides LatestVersionCheckError, the check talks to the
            # database and the network, and none of those failures may escape the handler.
            except Exception as e:  # pylint: disable=broad-exception-caught
                Context.log.error(
                    "Failed to retrieve latest version information: %s", e
                )
                return "ERROR: Failed to retrieve latest version info".encode("utf-8")

        # Get the dbname. If none was provided, use the default from the
        # config.
        dbname = metric_args.get("dbname", Context.config["dbname"])

        # Sample the metric
        try:
            return sample_metric(dbname, metric_name, metric_args).encode("utf-8")
        except UnknownMetricError:
            Context.log.error("Unknown metric: %s", metric_name)
            return "ERROR: Unknown metric".encode("utf-8")
        except MetricVersionError:
            Context.log.error("Failed to find an query version for %s", metric_name)
            return "ERROR: Unsupported version".encode("utf-8")
        except UnhappyDBError:
            Context.log.info("Database %s is unhappy, please be patient", dbname)
            return "ERROR: Database unavailable".encode("utf-8")
        except Exception as e:  # pylint: disable=broad-exception-caught
            Context.log.error("Error running query: %s", e)
            return "ERROR: Unexpected error: {}".format(e).encode("utf-8")

    def open(self, inode, flags, ctx):
        """
        Open a file: produce its content and attach it to a new file handle.

        Params:
            inode: inode of the file, as handed out by lookup()
            flags: open flags (unused)
            ctx:   request context (unused)

        Returns:
            The new file handle number.

        Raises:
            llfuse.FUSEError(ENOENT): if the inode is not known
        """
        print(f"open called with: inode={inode} flags={flags}")

        # Get the data associated with the file name and parameters
        try:
            data = self.inodes[inode]
        except KeyError as e:
            raise llfuse.FUSEError(errno.ENOENT) from e

        # Populate the data before allocating the handle, so that nothing is left behind in
        # file_handles if producing it fails.
        content = self._generate_data(data["name"], data["args"])

        fh = self.new_file_handle(inode)
        self.file_handles[fh]["data"] = content
        return fh

    def read(self, fh, off, size):
        """
        Return up to size bytes of the content of an open file, starting at offset off.

        Raises:
            llfuse.FUSEError(EBADF): if the file handle is not open
        """
        print(f"read called with: fh={fh} off={off} size={size}")

        try:
            file = self.file_handles[fh]
        except KeyError as e:
            raise llfuse.FUSEError(errno.EBADF) from e

        return file["data"][off : off + size]

    def forget(self, inode_list):
        """
        Decrease lookup counts and drop inodes whose count has reached zero.

        Params:
            inode_list: iterable of (inode, nlookup) pairs

        Note: Per the documentation, this function must not raise an exception!
        """
        print(f"forget called with: inode_list={inode_list}")

        for inode, nlookup in inode_list:
            try:
                entry = self.inodes[inode]
                entry["count"] -= nlookup
                if entry["count"] <= 0:
                    del self.inodes[inode]
            except Exception as e:  # pylint: disable=broad-exception-caught
                Context.log.error("Error forgetting inode %s: %s", inode, e)

    def release(self, fh):
        """
        Close a file handle and discard its content. Unknown handles are ignored.
        """
        print(f"release called with: fh={fh}")

        self.file_handles.pop(fh, None)
|
2025-05-13 05:44:47 +00:00
|
|
|
|
2024-05-16 15:41:47 +00:00
|
|
|
|
2025-09-23 05:12:49 +00:00
|
|
|
def main():
    """
    Main application routine

    Parses the command line, reads the config file, and then either runs the query tests
    (-t/--test) or mounts the FUSE filesystem and serves requests until llfuse.main() returns.

    Returns:
        -1 if the FUSE filesystem could not be initialized, otherwise None. In test mode the
        function does not return; it leaves through sys.exit() with status 1 if test_queries()
        reported errors and 0 otherwise.
    """

    # Initialize the logging framework
    Context.init_logging()

    # Handle cli args
    parser = argparse.ArgumentParser(
        prog="pgmon", description="A PostgreSQL monitoring agent"
    )

    parser.add_argument(
        "-c",
        "--config_file",
        default="pgmon.yml",
        nargs="?",
        help="The config file to read (default: %(default)s)",
    )

    parser.add_argument(
        "-t", "--test", action="store_true", help="Run query tests and exit"
    )

    args = parser.parse_args()

    # Set the config file path
    Context.config_file = args.config_file

    # Read the config file
    read_config(Context.config_file)

    # Run query tests and exit if test mode is enabled
    if args.test:
        sys.exit(1 if test_queries() > 0 else 0)

    # Set up the signal handler
    for signum in (signal.SIGINT, signal.SIGHUP, signal.SIGUSR1):
        signal.signal(signum, signal_handler)

    # Ensure the mount point exists
    mount_point = Context.config["fuse_base"]
    if not os.path.exists(mount_point):
        os.makedirs(mount_point)

    # Create the FUSE filesystem
    pgmon_fuse = PGMonFuse()
    fuse_options = set(llfuse.default_options)
    fuse_options.add("fsname=pgmon")
    # Not enabled at the moment:
    #   fuse_options.add('direct_io')
    #   if options.debug_fuse:
    #       fuse_options.add('allow_others')
    try:
        llfuse.init(pgmon_fuse, mount_point, fuse_options)
    except Exception as e:  # pylint: disable=broad-exception-caught
        # Report why the mount failed instead of exiting silently.
        # NOTE(review): llfuse.close() is deliberately not called here. init() did not
        # complete, so there should be no session to tear down; confirm against the llfuse
        # version in use.
        Context.log.error(
            "Failed to initialize FUSE filesystem on %s: %s", mount_point, e
        )
        return -1

    # Serve requests. The filesystem is closed whether the main loop returns normally or
    # raises; in the latter case the exception still propagates to the caller.
    try:
        llfuse.main(workers=1)
    finally:
        llfuse.close()

    # Clean up PostgreSQL connections
    # TODO: Improve this ... not sure it actually closes all the connections cleanly
    for pool in Context.connections.values():
        pool.closeall()

    logging.shutdown()

    print("Good bye.")
    return None
|
2025-09-23 05:12:49 +00:00
|
|
|
|
2026-03-16 03:54:27 +00:00
|
|
|
|
2025-09-23 05:12:49 +00:00
|
|
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status, so that a failed start-up
    # (main() returning -1) is not reported as success. A return value of None exits with 0.
    sys.exit(main())
|