Compare commits

..

9 Commits

4 changed files with 190 additions and 41 deletions

View File

@@ -8,14 +8,21 @@ metrics:
       0: >
         SELECT datname AS dbname
         FROM pg_database
+  # Note: If the user lacks sufficient privileges, these fields will be NULL.
+  # The WHERE clause is intended to prevent Zabbix from discovering a
+  # connection it cannot monitor. Ideally this would generate an error
+  # instead.
   discover_rep:
     type: set
     query:
       0: >
-        SELECT client_addr || '_' || regexp_replace(application_name, '[ ,]', '_', 'g') AS repid,
+        SELECT host(client_addr) || '_' || regexp_replace(application_name, '[ ,]', '_', 'g') AS repid,
                client_addr,
                state
         FROM pg_stat_replication
+        WHERE state IS NOT NULL
   discover_slots:
     type: set
     query:
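The switch to host() in the repid expression is worth a note: casting an inet value to text keeps any non-trivial netmask suffix (e.g. /24), while host() always returns just the address, so the generated repid stays a clean address_name string however client_addr was recorded. A quick psql illustration of the difference:

    -- host() strips the netmask; a plain text cast keeps it
    SELECT '192.168.0.1/24'::inet::text AS as_text,  -- '192.168.0.1/24'
           host('192.168.0.1/24'::inet) AS as_host;  -- '192.168.0.1'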
@@ -36,6 +43,7 @@ metrics:
                active
         FROM pg_replication_slots
 ##
 # cluster-wide metrics
 ##
@@ -85,29 +93,29 @@ metrics:
         FROM pg_stat_bgwriter bg
         CROSS JOIN pg_stat_checkpointer cp
   io_per_backend:
     type: set
     query:
       160000: >
         SELECT backend_type,
-               COALESCE(SUM(reads * op_bytes), 0)::bigint AS reads,
-               COALESCE(SUM(read_time), 0)::bigint AS read_time,
-               COALESCE(SUM(writes * op_bytes), 0)::bigint AS writes,
-               COALESCE(SUM(write_time), 0)::bigint AS write_time,
-               COALESCE(SUM(writebacks * op_bytes), 0)::bigint AS writebacks,
-               COALESCE(SUM(writeback_time), 0)::bigint AS writeback_time,
-               COALESCE(SUM(extends * op_bytes), 0)::bigint AS extends,
-               COALESCE(SUM(extend_time), 0)::bigint AS extend_time,
-               COALESCE(SUM(op_bytes), 0)::bigint AS op_bytes,
-               COALESCE(SUM(hits), 0)::bigint AS hits,
-               COALESCE(SUM(evictions), 0)::bigint AS evictions,
-               COALESCE(SUM(reuses), 0)::bigint AS reuses,
-               COALESCE(SUM(fsyncs), 0)::bigint AS fsyncs,
-               COALESCE(SUM(fsync_time), 0)::bigint AS fsync_time
+               COALESCE(SUM(reads * op_bytes), 0) AS reads,
+               COALESCE(SUM(read_time), 0) AS read_time,
+               COALESCE(SUM(writes * op_bytes), 0) AS writes,
+               COALESCE(SUM(write_time), 0) AS write_time,
+               COALESCE(SUM(writebacks * op_bytes), 0) AS writebacks,
+               COALESCE(SUM(writeback_time), 0) AS writeback_time,
+               COALESCE(SUM(extends * op_bytes), 0) AS extends,
+               COALESCE(SUM(extend_time), 0) AS extend_time,
+               COALESCE(SUM(op_bytes), 0) AS op_bytes,
+               COALESCE(SUM(hits), 0) AS hits,
+               COALESCE(SUM(evictions), 0) AS evictions,
+               COALESCE(SUM(reuses), 0) AS reuses,
+               COALESCE(SUM(fsyncs), 0) AS fsyncs,
+               COALESCE(SUM(fsync_time), 0) AS fsync_time
         FROM pg_stat_io
         GROUP BY backend_type
 ##
 # Per-database metrics
 ##
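Dropping the ::bigint casts is what lets Decimal values reach the JSON layer: in PostgreSQL, SUM() over a bigint column is promoted to numeric, which Python drivers hand back as decimal.Decimal. This pairs with the json_encode_special handler added to pgmon further down. Easy to confirm in psql:

    -- SUM() over a bigint column yields numeric, not bigint
    SELECT pg_typeof(SUM(writes)) FROM pg_stat_io;  -- numeric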
@@ -139,7 +147,7 @@ metrics:
                NULL AS sessions_abandoned,
                NULL AS sessions_fatal,
                NULL AS sessions_killed,
-               extract('epoch' from stats_reset)::float AS stats_reset
+               extract('epoch' from stats_reset) AS stats_reset
         FROM pg_stat_database WHERE datname = %(dbname)s
       140000: >
         SELECT numbackends,
@@ -166,7 +174,7 @@ metrics:
                sessions_abandoned,
                sessions_fatal,
                sessions_killed,
-               extract('epoch' from stats_reset)::float AS stats_reset
+               extract('epoch' from stats_reset) AS stats_reset
         FROM pg_stat_database WHERE datname = %(dbname)s
     test_args:
       dbname: postgres
@@ -189,13 +197,40 @@ metrics:
       0: >
         SELECT state,
                count(*) AS backend_count,
-               COALESCE(EXTRACT(EPOCH FROM max(now() - state_change))::float, 0) AS max_state_time
+               COALESCE(EXTRACT(EPOCH FROM max(now() - state_change)), 0) AS max_state_time
         FROM pg_stat_activity
         WHERE datname = %(dbname)s
         GROUP BY state
     test_args:
       dbname: postgres
+  sequence_usage:
+    type: value
+    query:
+      # 9.2 lacks lateral joins, the pg_sequence_last_value function, and the pg_sequences view
+      # 0: >
+      #   SELECT COALESCE(MAX(pg_sequence_last_value(c.oid)::float / (pg_sequence_parameters(oid)).maximum_value), 0) AS max_usage
+      #   FROM pg_class c
+      #   WHERE c.relkind = 'S'
+      # 9.3 - 9.6 lacks the pg_sequence_last_value function, and pg_sequences view
+      # 90300: >
+      #   SELECT COALESCE(MAX(pg_sequence_last_value(c.oid)::float / s.maximum_value), 0) AS max_usage
+      #   FROM pg_class c
+      #   CROSS JOIN LATERAL pg_sequence_parameters(c.oid) AS s
+      #   WHERE c.relkind = 'S'
+      100000: SELECT COALESCE(MAX(last_value::float / max_value), 0) AS max_usage FROM pg_sequences;
+    test_args:
+      dbname: postgres
+  sequence_visibility:
+    type: row
+    query:
+      100000: >
+        SELECT COUNT(*) FILTER (WHERE has_sequence_privilege(c.oid, 'SELECT,USAGE')) AS visible_sequences,
+               COUNT(*) AS total_sequences
+        FROM pg_class AS c
+        WHERE relkind = 'S';
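To sanity-check the 10+ usage query by hand, one can create a scratch sequence (demo_seq below is purely illustrative) and watch max_usage move:

    -- illustrative only: demo_seq is a throwaway sequence
    CREATE SEQUENCE demo_seq MAXVALUE 1000;
    SELECT nextval('demo_seq');  -- last_value becomes 1
    SELECT COALESCE(MAX(last_value::float / max_value), 0) AS max_usage
    FROM pg_sequences;           -- 0.001 if this is the only sequence
    DROP SEQUENCE demo_seq;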
 ##
 # Per-replication metrics
@@ -205,7 +240,7 @@ metrics:
     query:
       90400: >
         SELECT pid, usename,
-               EXTRACT(EPOCH FROM backend_start)::integer AS backend_start,
+               EXTRACT(EPOCH FROM backend_start) AS backend_start,
                state,
                pg_xlog_location_diff(pg_current_xlog_location(), sent_location) AS sent_lsn,
                pg_xlog_location_diff(pg_current_xlog_location(), write_location) AS write_lsn,
@@ -216,20 +251,21 @@ metrics:
                NULL AS replay_lag,
                sync_state
         FROM pg_stat_replication
-        WHERE client_addr || '_' || regexp_replace(application_name, '[ ,]', '_', 'g') = %(repid)s
+        WHERE host(client_addr) || '_' || regexp_replace(application_name, '[ ,]', '_', 'g') = %(repid)s
       100000: >
         SELECT pid, usename,
-               EXTRACT(EPOCH FROM backend_start)::integer AS backend_start,
+               EXTRACT(EPOCH FROM backend_start) AS backend_start,
                state,
                pg_wal_lsn_diff(pg_current_wal_lsn(), sent_lsn) AS sent_lsn,
                pg_wal_lsn_diff(pg_current_wal_lsn(), write_lsn) AS write_lsn,
                pg_wal_lsn_diff(pg_current_wal_lsn(), flush_lsn) AS flush_lsn,
                pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn) AS replay_lsn,
-               COALESCE(EXTRACT(EPOCH FROM write_lag), 0)::integer AS write_lag,
-               COALESCE(EXTRACT(EPOCH FROM flush_lag), 0)::integer AS flush_lag,
-               COALESCE(EXTRACT(EPOCH FROM replay_lag), 0)::integer AS replay_lag,
+               COALESCE(EXTRACT(EPOCH FROM write_lag), 0) AS write_lag,
+               COALESCE(EXTRACT(EPOCH FROM flush_lag), 0) AS flush_lag,
+               COALESCE(EXTRACT(EPOCH FROM replay_lag), 0) AS replay_lag,
                sync_state
-        FROM pg_stat_replication WHERE client_addr || '_' || regexp_replace(application_name, '[ ,]', '_', 'g') = %(repid)s
+        FROM pg_stat_replication
+        WHERE host(client_addr) || '_' || regexp_replace(application_name, '[ ,]', '_', 'g') = %(repid)s
     test_args:
       repid: 127.0.0.1_test_rep
@@ -261,6 +297,7 @@ metrics:
     test_args:
       slot: test_slot
 ##
 # Debugging
 ##

View File

@@ -27,6 +27,8 @@ from urllib.parse import urlparse, parse_qs
 import requests
 import re
+from decimal import Decimal
 VERSION = "1.0.3"
 # Configuration
@@ -391,6 +393,16 @@ def get_query(metric, version):
     raise MetricVersionError("Missing metric query for PostgreSQL {}".format(version))
+def json_encode_special(obj):
+    """
+    Encoder function to handle types the standard JSON package doesn't know what
+    to do with
+    """
+    if isinstance(obj, Decimal):
+        return float(obj)
+    raise TypeError(f'Cannot serialize object of {type(obj)}')
 def run_query_no_retry(pool, return_type, query, args):
     """
     Run the query with no explicit retry code
@@ -408,13 +420,13 @@ def run_query_no_retry(pool, return_type, query, args):
         elif return_type == "row":
             if len(res) == 0:
                 return "[]"
-            return json.dumps(res[0])
+            return json.dumps(res[0], default=json_encode_special)
         elif return_type == "column":
             if len(res) == 0:
                 return "[]"
-            return json.dumps([list(r.values())[0] for r in res])
+            return json.dumps([list(r.values())[0] for r in res], default=json_encode_special)
         elif return_type == "set":
-            return json.dumps(res)
+            return json.dumps(res, default=json_encode_special)
     except:
         dbname = pool.name
         if dbname in unhappy_cooldown:
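The default= hook matters because drivers such as psycopg return PostgreSQL numeric values (including the now un-cast SUM() results above) as decimal.Decimal, which json.dumps rejects out of the box. A minimal sketch of the behavior:

    from decimal import Decimal
    import json

    row = {"write_lag": Decimal("0.25")}  # what the driver hands back
    # json.dumps(row)                     # TypeError without a handler
    json.dumps(row, default=json_encode_special)  # '{"write_lag": 0.25}'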

View File

@@ -5,6 +5,9 @@ import tempfile
 import logging
+from decimal import Decimal
+import json
 import pgmon
 # Silence most logging output
@@ -789,3 +792,20 @@ metrics:
         # Make sure we can pull the RSS file (we assume the 9.6 series won't be getting
         # any more updates)
         self.assertEqual(pgmon.get_latest_version(), 90624)
+    def test_json_encode_special(self):
+        # Confirm that we're getting the right type
+        self.assertFalse(isinstance(Decimal('0.5'), float))
+        self.assertTrue(isinstance(pgmon.json_encode_special(Decimal('0.5')), float))
+        # Make sure we get sane values
+        self.assertEqual(pgmon.json_encode_special(Decimal('0.5')), 0.5)
+        self.assertEqual(pgmon.json_encode_special(Decimal('12')), 12.0)
+        # Make sure we can still fail for other types
+        self.assertRaises(
+            TypeError, pgmon.json_encode_special, object
+        )
+        # Make sure we can actually serialize a Decimal
+        self.assertEqual(json.dumps(Decimal('2.5'), default=pgmon.json_encode_special), '2.5')

View File

@@ -785,6 +785,76 @@ zabbix_export:
                 value: PostgreSQL
               - tag: Database
                 value: '{#DBNAME}'
+          - uuid: 5960120dd01c4926b0fc1fbe9c011507
+            name: 'Database max sequence usage in {#DBNAME}'
+            type: HTTP_AGENT
+            key: 'pgmon_db_max_sequence[{#DBNAME}]'
+            delay: 5m
+            value_type: FLOAT
+            units: '%'
+            description: 'The percent of the currently configured value range for the most utilized sequence.'
+            url: 'http://localhost:{$AGENT_PORT}/sequence_usage'
+            query_fields:
+              - name: dbname
+                value: '{#DBNAME}'
+            tags:
+              - tag: Application
+                value: PostgreSQL
+              - tag: Database
+                value: '{#DBNAME}'
+          - uuid: 48b9cc80ac4d4aee9e9f3a5d6f7d4a95
+            name: 'Total number of sequences on {#DBNAME}'
+            type: DEPENDENT
+            key: 'pgmon_db_sequences[total,{#DBNAME}]'
+            delay: '0'
+            description: 'Total number of sequences in the database.'
+            preprocessing:
+              - type: JSONPATH
+                parameters:
+                  - $.total_sequences
+            master_item:
+              key: 'pgmon_db_sequence_visibility[{#DBNAME}]'
+            tags:
+              - tag: Application
+                value: PostgreSQL
+              - tag: Database
+                value: '{#DBNAME}'
+          - uuid: 6521a9bab2ac47bf85429832d289bbac
+            name: 'Visible sequences on {#DBNAME}'
+            type: DEPENDENT
+            key: 'pgmon_db_sequences[visible,{#DBNAME}]'
+            delay: '0'
+            description: 'Number of sequences in the database for which Zabbix can see stats.'
+            preprocessing:
+              - type: JSONPATH
+                parameters:
+                  - $.visible_sequences
+            master_item:
+              key: 'pgmon_db_sequence_visibility[{#DBNAME}]'
+            tags:
+              - tag: Application
+                value: PostgreSQL
+              - tag: Database
+                value: '{#DBNAME}'
+          - uuid: 00f2da3eb99940839410a6ecd5df153f
+            name: 'Database sequence visibility in {#DBNAME}'
+            type: HTTP_AGENT
+            key: 'pgmon_db_sequence_visibility[{#DBNAME}]'
+            delay: 30m
+            value_type: TEXT
+            trends: '0'
+            description: 'Statistics about the number of sequences that exist and the number Zabbix can actually see stats for.'
+            url: 'http://localhost:{$AGENT_PORT}/sequence_visibility'
+            query_fields:
+              - name: dbname
+                value: '{#DBNAME}'
+            tags:
+              - tag: Application
+                value: PostgreSQL
+              - tag: Database
+                value: '{#DBNAME}'
+              - tag: Type
+                value: Raw
           - uuid: 492b3cac15f348c2b85f97b69c114d1b
             name: 'Database Stats for {#DBNAME}'
             type: HTTP_AGENT
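Since sequence_visibility is a row metric, the master item stores a single JSON object that the two dependent items split via JSONPATH. With illustrative numbers, the payload looks like:

    {"visible_sequences": 10, "total_sequences": 12}

Here $.visible_sequences yields 10 and $.total_sequences yields 12; the trigger added in the next hunk fires whenever the two disagree.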
@@ -803,6 +873,17 @@ zabbix_export:
                 value: '{#DBNAME}'
               - tag: Type
                 value: Raw
+          trigger_prototypes:
+            - uuid: d29d0fd9d9d34b5ebd649592b0829ce5
+              expression: 'last(/PostgreSQL by pgmon/pgmon_db_sequences[total,{#DBNAME}]) <> last(/PostgreSQL by pgmon/pgmon_db_sequences[visible,{#DBNAME}])'
+              name: 'Sequences not visible to Zabbix on {#DBNAME}'
+              priority: WARNING
+              description: 'There are sequences for which Zabbix cannot see usage statistics'
+              tags:
+                - tag: Application
+                  value: PostgreSQL
+                - tag: Component
+                  value: Sequence
          graph_prototypes:
            - uuid: 1f7de43b77714f819e61c31273712b70
              name: 'DML Totals for {#DBNAME}'
@@ -1500,6 +1581,7 @@ zabbix_export:
             delay: '0'
             history: 90d
             value_type: FLOAT
+            units: s
             description: 'Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written and flushed it (but not yet applied it). This can be used to gauge the delay that synchronous_commit level on incurred while committing if this server was configured as a synchronous standby.'
             preprocessing:
               - type: JSONPATH
@@ -1515,13 +1597,12 @@ zabbix_export:
               - tag: Database
                 value: '{#DBNAME}'
           - uuid: 624f8f085a3642c9a10a03361c17763d
-            name: 'Last flush LSN for {#REPID}'
+            name: 'Last flush LSN lag for {#REPID}'
             type: DEPENDENT
             key: 'pgmon_rep[flush_lsn,repid={#REPID}]'
             delay: '0'
             history: 90d
-            value_type: TEXT
-            trends: '0'
+            units: B
             description: 'Last write-ahead log location flushed to disk by this standby server'
             preprocessing:
               - type: JSONPATH
@@ -1543,6 +1624,7 @@ zabbix_export:
             delay: '0'
             history: 90d
             value_type: FLOAT
+            units: s
             description: 'Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written, flushed and applied it. This can be used to gauge the delay that synchronous_commit level remote_apply incurred while committing if this server was configured as a synchronous standby.'
             preprocessing:
               - type: JSONPATH
@@ -1558,13 +1640,12 @@ zabbix_export:
               - tag: Database
                 value: '{#DBNAME}'
           - uuid: fe1bed51845d4694bae8f53deed4846d
-            name: 'Last replay LSN for {#REPID}'
+            name: 'Last replay LSN lag for {#REPID}'
             type: DEPENDENT
             key: 'pgmon_rep[replay_lsn,repid={#REPID}]'
             delay: '0'
             history: 90d
-            value_type: TEXT
-            trends: '0'
+            units: B
             description: 'Last write-ahead log location replayed into the database on this standby server'
             preprocessing:
               - type: JSONPATH
@@ -1580,13 +1661,12 @@ zabbix_export:
               - tag: Database
                 value: '{#DBNAME}'
           - uuid: 68c179d0e33f45f9bf82d2d4125763f0
-            name: 'Last sent LSN for {#REPID}'
+            name: 'Last sent LSN lag for {#REPID}'
             type: DEPENDENT
             key: 'pgmon_rep[sent_lsn,repid={#REPID}]'
             delay: '0'
             history: 90d
-            value_type: TEXT
-            trends: '0'
+            units: B
             description: 'Last write-ahead log location sent on this connection'
             preprocessing:
               - type: JSONPATH
@@ -1636,6 +1716,7 @@ zabbix_export:
             delay: '0'
             history: 90d
             value_type: FLOAT
+            units: s
             description: 'Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written it (but not yet flushed it or applied it). This can be used to gauge the delay that synchronous_commit level remote_write incurred while committing if this server was configured as a synchronous standby.'
             preprocessing:
               - type: JSONPATH
@@ -1651,13 +1732,12 @@ zabbix_export:
               - tag: Database
                 value: '{#DBNAME}'
           - uuid: 57fb03cf63af4b0a91d8e36d6ff64d30
-            name: 'Last write LSN for {#REPID}'
+            name: 'Last write LSN lag for {#REPID}'
             type: DEPENDENT
             key: 'pgmon_rep[write_lsn,repid={#REPID}]'
             delay: '0'
             history: 90d
-            value_type: TEXT
-            trends: '0'
+            units: B
             description: 'Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written it (but not yet flushed it or applied it). This can be used to gauge the delay that synchronous_commit level remote_write incurred while committing if this server was configured as a synchronous standby.'
             preprocessing:
               - type: JSONPATH