Add rollback ratio alert tuning parameters
All checks were successful
PostgreSQL Compatibility Matrix / PG14 smoke (push) Successful in 7s
PostgreSQL Compatibility Matrix / PG15 smoke (push) Successful in 7s
PostgreSQL Compatibility Matrix / PG16 smoke (push) Successful in 7s
PostgreSQL Compatibility Matrix / PG17 smoke (push) Successful in 8s
PostgreSQL Compatibility Matrix / PG18 smoke (push) Successful in 9s

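Introduced three new settings to fine-tune the Rollback Ratio alert and reduce false positives on low-traffic databases: ALERT_ROLLBACK_RATIO_WINDOW_MINUTES (default 15), ALERT_ROLLBACK_RATIO_MIN_TOTAL_TRANSACTIONS (default 100; previously hard-coded to 20) and ALERT_ROLLBACK_RATIO_MIN_ROLLBACKS (default 10). The ratio is now evaluated only when the window contains at least the configured number of total transactions and at least the configured number of rollbacks, which makes the alert thresholds more reliable. The rollback count for the window is also exposed as its own informational rule. Updated .env.example and the rule descriptions for better configuration clarity.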
This commit is contained in:
2026-02-12 15:49:44 +01:00
parent ec05163a04
commit 7599b3742d
3 changed files with 41 additions and 11 deletions

View File

@@ -45,6 +45,10 @@ POLL_INTERVAL_SECONDS=30
# Active Connection Ratio alert is only evaluated when total sessions
# are at least this number (reduces false positives on low-traffic DBs).
ALERT_ACTIVE_CONNECTION_RATIO_MIN_TOTAL_CONNECTIONS=5
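# Rollback Ratio tuning to reduce false positives on low-traffic DBs.
# The ratio is computed over the last WINDOW_MINUTES and is only evaluated
# when that window contains at least MIN_TOTAL_TRANSACTIONS transactions
# (commits + rollbacks) and at least MIN_ROLLBACKS rollbacks.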
ALERT_ROLLBACK_RATIO_WINDOW_MINUTES=15
ALERT_ROLLBACK_RATIO_MIN_TOTAL_TRANSACTIONS=100
ALERT_ROLLBACK_RATIO_MIN_ROLLBACKS=10
# Initial admin bootstrap user (created on first startup if not present).
INIT_ADMIN_EMAIL=admin@example.com
INIT_ADMIN_PASSWORD=ChangeMe123!

View File

@@ -26,6 +26,9 @@ class Settings(BaseSettings):
cors_origins: str = "http://localhost:5173"
poll_interval_seconds: int = 30
alert_active_connection_ratio_min_total_connections: int = 5
alert_rollback_ratio_window_minutes: int = 15
alert_rollback_ratio_min_total_transactions: int = 100
alert_rollback_ratio_min_rollbacks: int = 10
init_admin_email: str = "admin@example.com"
init_admin_password: str = "ChangeMe123!"

View File

@@ -152,17 +152,20 @@ async def _metric_delta(db: AsyncSession, target_id: int, metric_name: str, minu
async def _rollback_ratio_recent(
db: AsyncSession, target_id: int, minutes: int, min_total_transactions: int
) -> tuple[float | None, float]:
db: AsyncSession, target_id: int, minutes: int, min_total_transactions: int, min_rollbacks: int
) -> tuple[float | None, float, float]:
commit_delta = await _metric_delta(db, target_id, "xact_commit", minutes=minutes)
rollback_delta = await _metric_delta(db, target_id, "xact_rollback", minutes=minutes)
if commit_delta is None or rollback_delta is None:
return None, 0.0
return None, 0.0, 0.0
tx_total = commit_delta + rollback_delta
if tx_total < float(min_total_transactions):
# Too little traffic in window, ratio would be noisy and misleading.
return None, tx_total
return (rollback_delta / tx_total) if tx_total > 0 else None, tx_total
return None, tx_total, rollback_delta
if rollback_delta < float(min_rollbacks):
# Ignore tiny rollback counts even if ratio appears high on low absolute numbers.
return None, tx_total, rollback_delta
return (rollback_delta / tx_total) if tx_total > 0 else None, tx_total, rollback_delta
async def _latest_query_snapshot_max(db: AsyncSession, target_id: int, column_name: str) -> float | None:
@@ -275,8 +278,13 @@ async def _build_standard_rules(db: AsyncSession, target: Target) -> tuple[list[
):
active_ratio = active_connections / total_connections
rollback_ratio_15m, tx_total_15m = await _rollback_ratio_recent(
db, target.id, minutes=15, min_total_transactions=20
rollback_ratio_window = settings.alert_rollback_ratio_window_minutes
rollback_ratio_val, tx_total_window, rollback_count_window = await _rollback_ratio_recent(
db,
target.id,
minutes=rollback_ratio_window,
min_total_transactions=settings.alert_rollback_ratio_min_total_transactions,
min_rollbacks=settings.alert_rollback_ratio_min_rollbacks,
)
rules.extend(
@@ -324,9 +332,13 @@ async def _build_standard_rules(db: AsyncSession, target: Target) -> tuple[list[
_RuleInput(
key="rollback_ratio",
name="Rollback Ratio",
description="Fraction of rolled back transactions in the last 15 minutes (evaluated only when at least 20 transactions occurred).",
description=(
f"Fraction of rolled back transactions in the last {rollback_ratio_window} minutes "
f"(evaluated only when at least {settings.alert_rollback_ratio_min_total_transactions} "
f"transactions and {settings.alert_rollback_ratio_min_rollbacks} rollbacks occurred)."
),
category="transactions",
value=rollback_ratio_15m,
value=rollback_ratio_val,
warning_threshold=0.10,
alert_threshold=0.25,
),
@@ -364,9 +376,20 @@ async def _build_standard_rules(db: AsyncSession, target: Target) -> tuple[list[
_RuleInput(
key="rollback_tx_volume_15m",
name="Rollback Ratio Evaluation Volume",
description="Total transactions in the last 15 minutes used for rollback-ratio evaluation.",
description=f"Total transactions in the last {rollback_ratio_window} minutes used for rollback-ratio evaluation.",
category="transactions",
value=tx_total_15m,
value=tx_total_window,
warning_threshold=None,
alert_threshold=None,
)
)
rules.append(
_RuleInput(
key="rollback_count_window",
name="Rollback Count (Window)",
description=f"Rollback count in the last {rollback_ratio_window} minutes used for rollback-ratio evaluation.",
category="transactions",
value=rollback_count_window,
warning_threshold=None,
alert_threshold=None,
)