Leave failed dbs alone for a bit
This commit is contained in:
parent
237ee9d2a2
commit
1e9f8e4b1d
71
pgmon.py
71
pgmon.py
@ -7,6 +7,8 @@ import time
|
|||||||
import argparse
|
import argparse
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
import psycopg
|
import psycopg
|
||||||
from psycopg_pool import ConnectionPool
|
from psycopg_pool import ConnectionPool
|
||||||
|
|
||||||
@ -26,6 +28,11 @@ config = {}
|
|||||||
connections_lock = Lock()
|
connections_lock = Lock()
|
||||||
connections = {}
|
connections = {}
|
||||||
|
|
||||||
|
# Dictionary of unhappy databases. Keys are database names, value is the time
|
||||||
|
# the database was determined to be unhappy plus the cooldown setting. So,
|
||||||
|
# basically it's the time when we should try to connect to the database again.
|
||||||
|
unhappy = {}
|
||||||
|
|
||||||
# Running state (used to gracefully shut down)
|
# Running state (used to gracefully shut down)
|
||||||
running = True
|
running = True
|
||||||
|
|
||||||
@ -47,6 +54,8 @@ class ConfigError(Exception):
|
|||||||
pass
|
pass
|
||||||
class DisconnectedError(Exception):
|
class DisconnectedError(Exception):
|
||||||
pass
|
pass
|
||||||
|
class UnhappyDBError(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
# Default config settings
|
# Default config settings
|
||||||
default_config = {
|
default_config = {
|
||||||
@ -65,6 +74,12 @@ default_config = {
|
|||||||
# Default database to connect to when none is specified for a metric
|
# Default database to connect to when none is specified for a metric
|
||||||
'dbname': 'postgres',
|
'dbname': 'postgres',
|
||||||
|
|
||||||
|
# PostgreSQL connection timeout (seconds)
|
||||||
|
'connect_timeout': 5,
|
||||||
|
|
||||||
|
# Time to wait before trying to reconnect again after a reconnect failure
|
||||||
|
'reconnect_cooldown': 30,
|
||||||
|
|
||||||
# Metrics
|
# Metrics
|
||||||
'metrics': {}
|
'metrics': {}
|
||||||
}
|
}
|
||||||
@ -162,6 +177,11 @@ def get_pool(dbname):
|
|||||||
"""
|
"""
|
||||||
Get a database connection pool.
|
Get a database connection pool.
|
||||||
"""
|
"""
|
||||||
|
# Check if the db is unhappy and wants to be left alone
|
||||||
|
if dbname in unhappy:
|
||||||
|
if unhappy[dbname] > datetime.now():
|
||||||
|
raise UnhappyDBError()
|
||||||
|
|
||||||
if dbname not in connections:
|
if dbname not in connections:
|
||||||
with connections_lock:
|
with connections_lock:
|
||||||
# Make sure nobody created the pool while we were waiting on the
|
# Make sure nobody created the pool while we were waiting on the
|
||||||
@ -169,10 +189,22 @@ def get_pool(dbname):
|
|||||||
if dbname not in connections:
|
if dbname not in connections:
|
||||||
log.info(f"Creating connection pool for: {dbname}")
|
log.info(f"Creating connection pool for: {dbname}")
|
||||||
connections[dbname] = ConnectionPool(
|
connections[dbname] = ConnectionPool(
|
||||||
min_size=0, max_size=config['pool_size'],
|
name=dbname,
|
||||||
conninfo=f"dbname={dbname} {config['connstr']}")
|
min_size=0, max_size=int(config['pool_size']),
|
||||||
|
conninfo=f"dbname={dbname} application_name=pgmon {config['connstr']}",
|
||||||
|
reconnect_timeout=float(config['connect_timeout']),
|
||||||
|
reconnect_failed=handle_connect_failure)
|
||||||
|
# Clear the unhappy indicator if present
|
||||||
|
unhappy.pop(dbname, None)
|
||||||
return connections[dbname]
|
return connections[dbname]
|
||||||
|
|
||||||
|
def handle_connect_failure(pool):
|
||||||
|
"""
|
||||||
|
Mark the database as being unhappy so we can leave it alone for a while
|
||||||
|
"""
|
||||||
|
dbname = pool.name
|
||||||
|
unhappy[dbname] = datetime.now() + timedelta(seconds=int(config['reconnect_cooldown']))
|
||||||
|
|
||||||
|
|
||||||
def get_query(metric, version):
|
def get_query(metric, version):
|
||||||
"""
|
"""
|
||||||
@ -190,8 +222,8 @@ def get_query(metric, version):
|
|||||||
raise Exception('Missing metric query')
|
raise Exception('Missing metric query')
|
||||||
|
|
||||||
|
|
||||||
def run_query_no_reconnect(pool, return_type, query, args):
|
def run_query_no_retry(pool, return_type, query, args):
|
||||||
with pool.connection() as conn:
|
with pool.connection(timeout=float(config['connect_timeout'])) as conn:
|
||||||
try:
|
try:
|
||||||
with conn.cursor(row_factory=psycopg.rows.dict_row) as curs:
|
with conn.cursor(row_factory=psycopg.rows.dict_row) as curs:
|
||||||
curs.execute(query, args)
|
curs.execute(query, args)
|
||||||
@ -206,7 +238,10 @@ def run_query_no_reconnect(pool, return_type, query, args):
|
|||||||
elif return_type == 'set':
|
elif return_type == 'set':
|
||||||
return json.dumps(res)
|
return json.dumps(res)
|
||||||
except:
|
except:
|
||||||
if conn.broken:
|
dbname = pool.name
|
||||||
|
if dbname in unhappy:
|
||||||
|
raise UnhappyDBError()
|
||||||
|
elif conn.broken:
|
||||||
raise DisconnectedError()
|
raise DisconnectedError()
|
||||||
else:
|
else:
|
||||||
raise
|
raise
|
||||||
@ -215,11 +250,14 @@ def run_query(pool, return_type, query, args):
|
|||||||
# If we get disconnected, I think the putconn command will close the dead
|
# If we get disconnected, I think the putconn command will close the dead
|
||||||
# connection. So we can just give it another shot.
|
# connection. So we can just give it another shot.
|
||||||
try:
|
try:
|
||||||
return run_query_no_reconnect(pool, return_type, query, args)
|
return run_query_no_retry(pool, return_type, query, args)
|
||||||
except DisconnectedError:
|
except DisconnectedError:
|
||||||
log.warning("Stale PostgreSQL connection found ... trying again")
|
log.warning("Stale PostgreSQL connection found ... trying again")
|
||||||
|
# This sleep is an annoyinh hack to give the pool workers time to
|
||||||
|
# actually mark the connection, otherwise it can be given back in the
|
||||||
|
# next connection() call
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
return run_query_no_reconnect(pool, return_type, query, args)
|
return run_query_no_retry(pool, return_type, query, args)
|
||||||
|
|
||||||
class SimpleHTTPRequestHandler(BaseHTTPRequestHandler):
|
class SimpleHTTPRequestHandler(BaseHTTPRequestHandler):
|
||||||
def log_request(self, code='-', size='-'):
|
def log_request(self, code='-', size='-'):
|
||||||
@ -252,7 +290,12 @@ class SimpleHTTPRequestHandler(BaseHTTPRequestHandler):
|
|||||||
dbname = args.get('dbname', config['dbname'])
|
dbname = args.get('dbname', config['dbname'])
|
||||||
|
|
||||||
# Get the connection pool for the database
|
# Get the connection pool for the database
|
||||||
|
try:
|
||||||
pool = get_pool(dbname)
|
pool = get_pool(dbname)
|
||||||
|
except UnhappyDBError:
|
||||||
|
log.info(f"Database {dbname} is unhappy, please be patient")
|
||||||
|
self._reply(503, 'Database unavailable')
|
||||||
|
return
|
||||||
|
|
||||||
# Identify the PostgreSQL version
|
# Identify the PostgreSQL version
|
||||||
try:
|
try:
|
||||||
@ -260,7 +303,13 @@ class SimpleHTTPRequestHandler(BaseHTTPRequestHandler):
|
|||||||
except KeyError:
|
except KeyError:
|
||||||
try:
|
try:
|
||||||
version = int(run_query(pool, 'value', 'SHOW server_version_num', None))
|
version = int(run_query(pool, 'value', 'SHOW server_version_num', None))
|
||||||
|
except UnhappyDBError:
|
||||||
|
return
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
if dbname in unhappy:
|
||||||
|
log.info(f"Database {dbname} is unhappy, please be patient")
|
||||||
|
self._reply(503, 'Database unavailable')
|
||||||
|
else:
|
||||||
log.error(f"Failed to get PostgreSQL version: {e}")
|
log.error(f"Failed to get PostgreSQL version: {e}")
|
||||||
self._reply(500, 'Error getting DB version')
|
self._reply(500, 'Error getting DB version')
|
||||||
return
|
return
|
||||||
@ -268,7 +317,7 @@ class SimpleHTTPRequestHandler(BaseHTTPRequestHandler):
|
|||||||
# Get the query version
|
# Get the query version
|
||||||
try:
|
try:
|
||||||
query = get_query(metric, version)
|
query = get_query(metric, version)
|
||||||
except:
|
except KeyError:
|
||||||
log.error(f"Failed to find a version of {name} for {version}")
|
log.error(f"Failed to find a version of {name} for {version}")
|
||||||
self._reply(404, 'Unsupported version')
|
self._reply(404, 'Unsupported version')
|
||||||
return
|
return
|
||||||
@ -276,9 +325,15 @@ class SimpleHTTPRequestHandler(BaseHTTPRequestHandler):
|
|||||||
# Execute the quert
|
# Execute the quert
|
||||||
try:
|
try:
|
||||||
self._reply(200, run_query(pool, metric['type'], query, args))
|
self._reply(200, run_query(pool, metric['type'], query, args))
|
||||||
|
return
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
if dbname in unhappy:
|
||||||
|
log.info(f"Database {dbname} is unhappy, please be patient")
|
||||||
|
self._reply(503, 'Database unavailable')
|
||||||
|
else:
|
||||||
log.error(f"Error running query: {e}")
|
log.error(f"Error running query: {e}")
|
||||||
self._reply(500, "Error running query")
|
self._reply(500, "Error running query")
|
||||||
|
return
|
||||||
|
|
||||||
def _reply(self, code, content):
|
def _reply(self, code, content):
|
||||||
self.send_response(code)
|
self.send_response(code)
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user