Compare commits


No commits in common. "b7f731c6ac9aec900a50ca4dad79f363afcb6f5a" and "9225e745b02538ff5ebd41daf81352ec2fb1a634" have entirely different histories.

4 changed files with 28 additions and 230 deletions

View File

@@ -1,44 +1,20 @@
 metrics:
-  ##
   # Discovery metrics
-  ##
   discover_dbs:
     type: set
     query:
-      0: >
-        SELECT datname AS dbname
-        FROM pg_database
+      0: SELECT datname AS dbname FROM pg_database
   discover_rep:
     type: set
     query:
-      0: >
-        SELECT client_addr || '_' || regexp_replace(application_name, '[ ,]', '_', 'g') AS repid,
-               client_addr,
-               state
-        FROM pg_stat_replication
+      0: SELECT client_addr || '_' || regexp_replace(application_name, '[ ,]', '_', 'g') AS repid, client_addr, state FROM pg_stat_replication
   discover_slots:
     type: set
     query:
-      90400: >
-        SELECT slot_name,
-               plugin,
-               slot_type,
-               database,
-               false as temporary,
-               active
-        FROM pg_replication_slots
+      90400: SELECT slot_name, plugin, slot_type, database, false as temporary, active FROM pg_replication_slots
-      100000: >
-        SELECT slot_name,
-               plugin,
-               slot_type,
-               database,
-               temporary,
-               active
-        FROM pg_replication_slots
+      100000: SELECT slot_name, plugin, slot_type, database, temporary, active FROM pg_replication_slots
-  ##
   # cluster-wide metrics
-  ##
   version:
     type: value
     query:
@@ -47,216 +23,42 @@ metrics:
   max_frozen_age:
     type: row
     query:
-      0: >
-        SELECT max(age(datfrozenxid)) AS xid_age,
-               0 AS mxid_age
-        FROM pg_database
+      0: SELECT max(age(datfrozenxid)) AS xid_age, 0 AS mxid_age FROM pg_database
-      90600: >
-        SELECT max(age(datfrozenxid)) AS xid_age,
-               max(mxid_age(datminmxid)) AS mxid_age
-        FROM pg_database
+      90600: SELECT max(age(datfrozenxid)) AS xid_age, max(mxid_age(datminmxid)) AS mxid_age FROM pg_database
-  bgwriter:
-    type: row
-    query:
-      0: >
-        SELECT checkpoints_timed,
-               checkpoints_req,
-               checkpoint_write_time,
-               checkpoint_sync_time,
-               buffers_checkpoint,
-               buffers_clean,
-               maxwritten_clean,
-               buffers_backend,
-               buffers_backend_fsync,
-               buffers_alloc
-        FROM pg_stat_bgwriter
-      170000: >
-        SELECT cp.num_timed AS checkpoints_timed,
-               cp.num_requested AS checkpoints_req,
-               cp.write_time AS checkpoint_write_time,
-               cp.sync_time AS checkpoint_sync_time,
-               cp.buffers_written AS buffers_checkpoint,
-               bg.buffers_clean AS buffers_clean,
-               bg.maxwritten_clean AS maxwritten_clean,
-               NULL AS buffers_backend,
-               NULL AS buffers_backend_fsync,
-               bg.buffers_alloc AS buffers_alloc
-        FROM pg_stat_bgwriter bg
-        CROSS JOIN pg_stat_checkpointer cp
-  ##
   # Per-database metrics
-  ##
   db_stats:
     type: row
     query:
-      0: >
-        SELECT numbackends,
-               xact_commit,
-               xact_rollback,
-               blks_read,
-               blks_hit,
-               tup_returned,
-               tup_fetched,
-               tup_inserted,
-               tup_updated,
-               tup_deleted,
-               conflicts,
-               temp_files,
-               temp_bytes,
-               deadlocks,
-               0 AS checksum_failures,
-               blk_read_time,
-               blk_write_time,
-               0 AS session_time,
-               0 AS active_time,
-               0 AS idle_in_transaction_time,
-               0 AS sessions,
-               0 AS sessions_abandoned,
-               0 AS sessions_fatal,
-               0 AS sessions_killed,
-               extract('epoch' from stats_reset)::float AS stats_reset
-        FROM pg_stat_database WHERE datname = %(dbname)s
+      0: SELECT numbackends, xact_commit, xact_rollback, blks_read, blks_hit, tup_returned, tup_fetched, tup_inserted, tup_updated, tup_deleted, conflicts, temp_files, temp_bytes, deadlocks, blk_read_time, blk_write_time, extract('epoch' from stats_reset)::float FROM pg_stat_database WHERE datname = %(dbname)s
-      140000: >
-        SELECT numbackends,
-               xact_commit,
-               xact_rollback,
-               blks_read,
-               blks_hit,
-               tup_returned,
-               tup_fetched,
-               tup_inserted,
-               tup_updated,
-               tup_deleted,
-               conflicts,
-               temp_files,
-               temp_bytes,
-               deadlocks,
-               COALESCE(checksum_failures,
-               0) AS checksum_failures,
-               blk_read_time,
-               blk_write_time,
-               session_time,
-               active_time,
-               idle_in_transaction_time,
-               sessions,
-               sessions_abandoned,
-               sessions_fatal,
-               sessions_killed,
-               extract('epoch' from stats_reset)::float AS stats_reset
-        FROM pg_stat_database WHERE datname = %(dbname)s
+      140000: SELECT numbackends, xact_commit, xact_rollback, blks_read, blks_hit, tup_returned, tup_fetched, tup_inserted, tup_updated, tup_deleted, conflicts, temp_files, temp_bytes, deadlocks, COALESCE(checksum_failures, 0) AS checksum_failures, blk_read_time, blk_write_time, session_time, active_time, idle_in_transaction_time, sessions, sessions_abandoned, sessions_fatal, sessions_killed, extract('epoch' from stats_reset)::float FROM pg_stat_database WHERE datname = %(dbname)s
     test_args:
       dbname: postgres
   hit_ratios:
     type: row
     query:
-      0: >
-        SELECT sum(heap_blks_read)::float / NULLIF(sum(heap_blks_read + heap_blks_hit), 0) AS avg_heap_hit_ratio,
-               sum(idx_blks_hit)::float / NULLIF(sum(idx_blks_read + idx_blks_hit), 0) AS avg_idx_hit_ratio,
-               sum(toast_blks_hit)::float / NULLIF(sum(toast_blks_read + toast_blks_hit), 0) AS avg_toast_hit_ratio,
-               sum(tidx_blks_hit)::float / NULLIF(sum(tidx_blks_read + tidx_blks_hit), 0) AS avg_tidx_hit_ratio
-        FROM pg_statio_all_tables
+      0: SELECT sum(heap_blks_read)::float / NULLIF(sum(heap_blks_read + heap_blks_hit), 0) AS avg_heap_hit_ratio, sum(idx_blks_hit)::float / NULLIF(sum(idx_blks_read + idx_blks_hit), 0) AS avg_idx_hit_ratio, sum(toast_blks_hit)::float / NULLIF(sum(toast_blks_read + toast_blks_hit), 0) AS avg_toast_hit_ratio, sum(tidx_blks_hit)::float / NULLIF(sum(tidx_blks_read + tidx_blks_hit), 0) AS avg_tidx_hit_ratio FROM pg_statio_all_tables
-    test_args:
-      dbname: postgres
-  activity:
-    type: set
-    query:
-      0: >
-        SELECT state,
-               count(*) AS backend_count,
-               COALESCE(EXTRACT(EPOCH FROM max(now() - state_change))::float, 0) AS max_state_time
-        FROM pg_stat_activity
-        WHERE datname = %(dbname)s
-        GROUP BY state
-    test_args:
-      dbname: postgres
-  ##
-  # Per-replication metrics
-  ##
-  rep_stats:
-    type: row
-    query:
-      90400: >
-        SELECT pid, usename,
-               EXTRACT(EPOCH FROM backend_start)::integer AS backend_start,
-               state,
-               pg_xlog_location_diff(pg_current_xlog_location(), sent_location) AS sent_lsn,
-               pg_xlog_location_diff(pg_current_xlog_location(), write_location) AS write_lsn,
-               pg_xlog_location_diff(pg_current_xlog_location(), flush_location) AS flush_lsn,
-               pg_xlog_location_diff(pg_current_xlog_location(), replay_location) AS replay_lsn,
-               NULL AS write_lag,
-               NULL AS flush_lag,
-               NULL AS replay_lag,
-               sync_state
-        FROM pg_stat_replication
-        WHERE client_addr || '_' || regexp_replace(application_name, '[ ,]', '_', 'g') = %(repid)s
-      90600: >
-        SELECT pid, usename,
-               EXTRACT(EPOCH FROM backend_start)::integer AS backend_start,
-               state,
-               pg_xlog_lsn_diff(pg_current_xlog_lsn(), sent_lsn) AS sent_lsn,
-               pg_xlog_lsn_diff(pg_current_xlog_lsn(), write_lsn) AS write_lsn,
-               pg_xlog_lsn_diff(pg_current_xlog_lsn(), flush_lsn) AS flush_lsn,
-               pg_xlog_lsn_diff(pg_current_xlog_lsn(), replay_lsn) AS replay_lsn,
-               COALESCE(EXTRACT(EPOCH FROM write_lag), 0)::integer AS write_lag,
-               COALESCE(EXTRACT(EPOCH FROM flush_lag), 0)::integer AS flush_lag,
-               COALESCE(EXTRACT(EPOCH FROM replay_lag), 0)::integer AS replay_lag,
-               sync_state
-        FROM pg_stat_replication WHERE client_addr || '_' || regexp_replace(application_name, '[ ,]', '_', 'g') = %(repid)s
-      100000: >
-        SELECT pid, usename,
-               EXTRACT(EPOCH FROM backend_start)::integer AS backend_start,
-               state,
-               pg_wal_lsn_diff(pg_current_wal_lsn(), sent_lsn) AS sent_lsn,
-               pg_wal_lsn_diff(pg_current_wal_lsn(), write_lsn) AS write_lsn,
-               pg_wal_lsn_diff(pg_current_wal_lsn(), flush_lsn) AS flush_lsn,
-               pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn) AS replay_lsn,
-               COALESCE(EXTRACT(EPOCH FROM write_lag), 0)::integer AS write_lag,
-               COALESCE(EXTRACT(EPOCH FROM flush_lag), 0)::integer AS flush_lag,
-               COALESCE(EXTRACT(EPOCH FROM replay_lag), 0)::integer AS replay_lag,
-               sync_state
-        FROM pg_stat_replication WHERE client_addr || '_' || regexp_replace(application_name, '[ ,]', '_', 'g') = %(repid)s
-    test_args:
-      repid: 127.0.0.1_test_rep
-  ##
-  # Per-slot metrics
-  ##
-  slot_stats:
-    type: row
-    query:
-      90400: >
-        SELECT NULL as active_pid,
-               xmin,
-               pg_xlog_location_diff(pg_current_xlog_location(), restart_lsn) AS restart_bytes,
-               NULL AS confirmed_flush_bytes
-        FROM pg_replication_slots WHERE slot_name = %(slot)s
-      90600: >
-        SELECT active_pid,
-               xmin,
-               pg_xlog_location_diff(pg_current_xlog_location(), restart_lsn) AS restart_bytes,
-               pg_xlog_location_diff(pg_current_xlog_location(), confirmed_flush_lsn) AS confirmed_flush_bytes
-        FROM pg_replication_slots WHERE slot_name = %(slot)s
-      100000: >
-        SELECT active_pid,
-               xmin,
-               pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn) AS restart_bytes,
-               pg_wal_lsn_diff(pg_current_wal_lsn(), confirmed_flush_lsn) AS confirmed_flush_bytes
-        FROM pg_replication_slots WHERE slot_name = %(slot)s
-    test_args:
-      slot: test_slot
-  ##
   # Debugging
-  ##
   ntables:
     type: value
     query:
       0: SELECT count(*) AS ntables FROM pg_stat_user_tables
+  # Per-replication metrics
+  rep_stats:
+    type: row
+    query:
+      90400: SELECT pid, usename, EXTRACT(EPOCH FROM backend_start)::integer AS backend_start, state, sent_lsn, write_lsn, flush_lsn, replay_lsn, COALESCE(EXTRACT(EPOCH FROM write_lag), 0)::integer AS write_lag, COALESCE(EXTRACT(EPOCH FROM flush_lag), 0)::integer AS flush_lag, COALESCE(EXTRACT(EPOCH FROM replay_lag), 0)::integer AS replay_lag, sync_state FROM pg_stat_replication WHERE client_addr || '_' || regexp_replace(application_name, '[ ,]', '_', 'g') = %(repid)s
+    test_args:
+      repid: 127.0.0.1_test_rep
+  # Per-slot metrics
+  slot_stats:
+    type: row
+    query:
+      90400: SELECT active_pid, xmin, pg_xlog_location_diff(pg_current_xlog_location(), restart_lsn) AS restart_bytes, pg_xlog_location_diff(pg_current_xlog_location(), confirmed_flush_lsn) AS confirmed_flush_bytes FROM pg_replication_slots WHERE slot_name = %(slot)s
+      100000: SELECT active_pid, xmin, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn) AS restart_bytes, pg_wal_lsn_diff(pg_current_wal_lsn(), confirmed_flush_lsn) AS confirmed_flush_bytes FROM pg_replication_slots WHERE slot_name = %(slot)s
+    test_args:
+      slot: test_slot
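
The integer keys in these query maps (0, 90400, 90600, 100000, 140000) appear to follow PostgreSQL's server_version_num convention, each key marking the minimum server version a variant supports. A minimal sketch of how a collector might pick the right variant at runtime, assuming the YAML above is parsed with PyYAML; pick_query and the metrics.yml path are illustrative, not part of this change:

import yaml

def pick_query(metric, server_version_num):
    # Keys are minimum server_version_num values (e.g. 100000 = PostgreSQL 10);
    # use the highest key the connected server satisfies.
    eligible = [key for key in metric["query"] if int(key) <= server_version_num]
    if not eligible:
        raise ValueError("no query variant for this server version")
    return metric["query"][max(eligible)]

with open("metrics.yml") as f:  # file name is an assumption
    metrics = yaml.safe_load(f)["metrics"]

# A PostgreSQL 17 server (server_version_num 170000) gets the 100000 variant.
print(pick_query(metrics["discover_slots"], 170000))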

View File

@@ -655,7 +655,6 @@ def test_queries():
     for name, metric in config["metrics"].items():
         # If the metric has arguments to use while testing, grab those
         args = metric.get("test_args", {})
-        print("Testing {} [{}]".format(name, ", ".join(["{}={}".format(key, value) for key, value in args.items()])))
         # Run the query without the ability to retry.
         try:
             res = sample_metric(dbname, name, args, retry=False)
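
For orientation, the loop above feeds each metric's test_args (dbname, repid, slot in the YAML) into the query as named parameters in the %(name)s style psycopg2 uses. A rough sketch of the sampling step under that assumption; only config["metrics"], test_args and the sample_metric call shape come from the diff, the rest is illustrative:

def sample_metric_sketch(conn, metric, args, server_version_num):
    # Pick the newest query variant the server supports (same rule as above).
    query = metric["query"]
    key = max(k for k in query if int(k) <= server_version_num)
    with conn.cursor() as cur:
        # e.g. "... WHERE datname = %(dbname)s" filled from args {"dbname": "postgres"}
        cur.execute(query[key], args)
        return cur.fetchall()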

View File

@@ -23,7 +23,7 @@ services:
       test: [ "CMD-SHELL", "pg_controldata /var/lib/postgresql/data/ | grep -q 'in production'" ]
       interval: 5s
       timeout: 2s
-      retries: 40
+      retries: 20
     command: >
       postgres -c ssl=on
      -c ssl_cert_file='/etc/ssl/certs/ssl-cert-snakeoil.pem'

View File

@@ -6,15 +6,12 @@ versions=( $@ )
 # If we weren't given any versions, test them all
 if [ ${#versions[@]} -eq 0 ]
 then
-    versions=( 9.2 9.4 9.6 10 11 12 13 14 15 16 17 )
+    versions=( 9.2 9.6 10 11 12 13 14 15 16 17 )
 fi
 
 # Image tags to use
 declare -A images=()
 images["9.2"]='9.2'
-images["9.3"]='9.3'
-images["9.4"]='9.4'
-images["9.5"]='9.5'
 images["9.6"]='9.6-bullseye'
 images["10"]='10-bullseye'
 images["11"]='11-bookworm'