From a0ba4e13148193a1451a4224a90191b742db3e2f Mon Sep 17 00:00:00 2001 From: nessi Date: Thu, 12 Feb 2026 14:23:53 +0100 Subject: [PATCH] Refactor rollback ratio calculation and thresholds. Introduced a new helper function `_rollback_ratio_recent` to calculate the rollback ratio over the last 15 minutes, ensuring meaningful evaluation only when a minimum transaction threshold is met. Adjusted warning and alert thresholds for rollback ratio and added a contextual metric for transaction volume in the past 15 minutes. --- backend/app/services/alerts.py | 44 +++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/backend/app/services/alerts.py b/backend/app/services/alerts.py index 4cf9588..d3b7bcc 100644 --- a/backend/app/services/alerts.py +++ b/backend/app/services/alerts.py @@ -151,6 +151,20 @@ async def _metric_delta(db: AsyncSession, target_id: int, metric_name: str, minu return max(0.0, float(latest) - float(oldest)) +async def _rollback_ratio_recent( + db: AsyncSession, target_id: int, minutes: int, min_total_transactions: int +) -> tuple[float | None, float]: + commit_delta = await _metric_delta(db, target_id, "xact_commit", minutes=minutes) + rollback_delta = await _metric_delta(db, target_id, "xact_rollback", minutes=minutes) + if commit_delta is None or rollback_delta is None: + return None, 0.0 + tx_total = commit_delta + rollback_delta + if tx_total < float(min_total_transactions): + # Too little traffic in window, ratio would be noisy and misleading. + return None, tx_total + return (rollback_delta / tx_total) if tx_total > 0 else None, tx_total + + async def _latest_query_snapshot_max(db: AsyncSession, target_id: int, column_name: str) -> float | None: latest_ts = await db.scalar(select(func.max(QueryStat.ts)).where(QueryStat.target_id == target_id)) if latest_ts is None: @@ -248,8 +262,6 @@ async def _build_standard_rules(db: AsyncSession, target: Target) -> tuple[list[ total_connections = await _latest_metric_value(db, target.id, "connections_total") cache_hit_ratio = await _latest_metric_value(db, target.id, "cache_hit_ratio") lock_count = await _latest_metric_value(db, target.id, "locks_total") - xact_commit = await _latest_metric_value(db, target.id, "xact_commit") - xact_rollback = await _latest_metric_value(db, target.id, "xact_rollback") checkpoints_req_delta = await _metric_delta(db, target.id, "checkpoints_req", minutes=15) deadlocks_delta = await _metric_delta(db, target.id, "deadlocks", minutes=60) slowest_query_mean = await _latest_query_snapshot_max(db, target.id, "mean_time") @@ -259,11 +271,9 @@ async def _build_standard_rules(db: AsyncSession, target: Target) -> tuple[list[ if active_connections is not None and total_connections and total_connections > 0: active_ratio = active_connections / total_connections - rollback_ratio = None - if xact_commit is not None and xact_rollback is not None: - tx_total = xact_commit + xact_rollback - if tx_total > 0: - rollback_ratio = xact_rollback / tx_total + rollback_ratio_15m, tx_total_15m = await _rollback_ratio_recent( + db, target.id, minutes=15, min_total_transactions=20 + ) rules.extend( [ @@ -307,11 +317,11 @@ async def _build_standard_rules(db: AsyncSession, target: Target) -> tuple[list[ _RuleInput( key="rollback_ratio", name="Rollback Ratio", - description="Fraction of rolled back transactions over all transactions.", + description="Fraction of rolled back transactions in the last 15 minutes (evaluated only when at least 20 transactions occurred).", category="transactions", - value=rollback_ratio, - warning_threshold=0.05, - alert_threshold=0.15, + value=rollback_ratio_15m, + warning_threshold=0.10, + alert_threshold=0.25, ), _RuleInput( key="deadlocks_60m", @@ -342,6 +352,18 @@ async def _build_standard_rules(db: AsyncSession, target: Target) -> tuple[list[ ), ] ) + # Expose transaction volume as contextual metric for UI/debugging. + rules.append( + _RuleInput( + key="rollback_tx_volume_15m", + name="Rollback Ratio Evaluation Volume", + description="Total transactions in the last 15 minutes used for rollback-ratio evaluation.", + category="transactions", + value=tx_total_15m, + warning_threshold=None, + alert_threshold=None, + ) + ) return rules, forced_items