Refactor rollback ratio calculation and thresholds.
Some checks failed
PostgreSQL Compatibility Matrix / PG14 smoke (push) Failing after 3m17s
PostgreSQL Compatibility Matrix / PG15 smoke (push) Failing after 1m17s
PostgreSQL Compatibility Matrix / PG17 smoke (push) Has been cancelled
PostgreSQL Compatibility Matrix / PG18 smoke (push) Has been cancelled
PostgreSQL Compatibility Matrix / PG16 smoke (push) Has been cancelled

Introduced a new helper function `_rollback_ratio_recent` to calculate the rollback ratio over the last 15 minutes, ensuring meaningful evaluation only when a minimum transaction threshold is met. Adjusted warning and alert thresholds for rollback ratio and added a contextual metric for transaction volume in the past 15 minutes.
This commit is contained in:
2026-02-12 14:23:53 +01:00
parent 9eb94545a1
commit a0ba4e1314

View File

@@ -151,6 +151,20 @@ async def _metric_delta(db: AsyncSession, target_id: int, metric_name: str, minu
return max(0.0, float(latest) - float(oldest))
async def _rollback_ratio_recent(
    db: AsyncSession, target_id: int, minutes: int, min_total_transactions: int
) -> tuple[float | None, float]:
    """Return the rollback share of transactions over a recent time window.

    The commit and rollback counters are each reduced to a delta over the
    last ``minutes`` minutes via ``_metric_delta``; the ratio is
    ``rollbacks / (commits + rollbacks)``.

    Args:
        db: Session passed through to ``_metric_delta``.
        target_id: Identifier of the monitored target.
        minutes: Size of the look-back window, in minutes.
        min_total_transactions: Minimum number of transactions that must
            have occurred in the window for the ratio to be reported.

    Returns:
        A ``(ratio, total)`` pair. ``ratio`` is ``None`` when either delta
        is unavailable, when the window saw fewer than
        ``min_total_transactions`` transactions, or when the total is not
        positive. ``total`` is the transaction count in the window, or
        ``0.0`` when a delta is unavailable.
    """
    committed = await _metric_delta(db, target_id, "xact_commit", minutes=minutes)
    rolled_back = await _metric_delta(db, target_id, "xact_rollback", minutes=minutes)

    # Without both counters there is nothing to evaluate.
    if committed is None:
        return None, 0.0
    if rolled_back is None:
        return None, 0.0

    window_total = committed + rolled_back

    # A ratio computed from a handful of transactions is noisy and
    # misleading, so report only the volume in that case.
    if window_total < float(min_total_transactions):
        return None, window_total

    # Guard the division: a non-positive threshold can let a zero total
    # reach this point.
    if window_total > 0:
        return rolled_back / window_total, window_total
    return None, window_total
async def _latest_query_snapshot_max(db: AsyncSession, target_id: int, column_name: str) -> float | None:
latest_ts = await db.scalar(select(func.max(QueryStat.ts)).where(QueryStat.target_id == target_id))
if latest_ts is None:
@@ -248,8 +262,6 @@ async def _build_standard_rules(db: AsyncSession, target: Target) -> tuple[list[
total_connections = await _latest_metric_value(db, target.id, "connections_total")
cache_hit_ratio = await _latest_metric_value(db, target.id, "cache_hit_ratio")
lock_count = await _latest_metric_value(db, target.id, "locks_total")
xact_commit = await _latest_metric_value(db, target.id, "xact_commit")
xact_rollback = await _latest_metric_value(db, target.id, "xact_rollback")
checkpoints_req_delta = await _metric_delta(db, target.id, "checkpoints_req", minutes=15)
deadlocks_delta = await _metric_delta(db, target.id, "deadlocks", minutes=60)
slowest_query_mean = await _latest_query_snapshot_max(db, target.id, "mean_time")
@@ -259,11 +271,9 @@ async def _build_standard_rules(db: AsyncSession, target: Target) -> tuple[list[
if active_connections is not None and total_connections and total_connections > 0:
active_ratio = active_connections / total_connections
rollback_ratio = None
if xact_commit is not None and xact_rollback is not None:
tx_total = xact_commit + xact_rollback
if tx_total > 0:
rollback_ratio = xact_rollback / tx_total
rollback_ratio_15m, tx_total_15m = await _rollback_ratio_recent(
db, target.id, minutes=15, min_total_transactions=20
)
rules.extend(
[
@@ -307,11 +317,11 @@ async def _build_standard_rules(db: AsyncSession, target: Target) -> tuple[list[
_RuleInput(
key="rollback_ratio",
name="Rollback Ratio",
description="Fraction of rolled back transactions over all transactions.",
description="Fraction of rolled back transactions in the last 15 minutes (evaluated only when at least 20 transactions occurred).",
category="transactions",
value=rollback_ratio,
warning_threshold=0.05,
alert_threshold=0.15,
value=rollback_ratio_15m,
warning_threshold=0.10,
alert_threshold=0.25,
),
_RuleInput(
key="deadlocks_60m",
@@ -342,6 +352,18 @@ async def _build_standard_rules(db: AsyncSession, target: Target) -> tuple[list[
),
]
)
# Expose transaction volume as contextual metric for UI/debugging.
rules.append(
_RuleInput(
key="rollback_tx_volume_15m",
name="Rollback Ratio Evaluation Volume",
description="Total transactions in the last 15 minutes used for rollback-ratio evaluation.",
category="transactions",
value=tx_total_15m,
warning_threshold=None,
alert_threshold=None,
)
)
return rules, forced_items