Refactor rollback ratio calculation and thresholds.
Some checks failed
PostgreSQL Compatibility Matrix / PG14 smoke (push) Failing after 3m17s
PostgreSQL Compatibility Matrix / PG15 smoke (push) Failing after 1m17s
PostgreSQL Compatibility Matrix / PG17 smoke (push) Has been cancelled
PostgreSQL Compatibility Matrix / PG18 smoke (push) Has been cancelled
PostgreSQL Compatibility Matrix / PG16 smoke (push) Has been cancelled
Some checks failed
PostgreSQL Compatibility Matrix / PG14 smoke (push) Failing after 3m17s
PostgreSQL Compatibility Matrix / PG15 smoke (push) Failing after 1m17s
PostgreSQL Compatibility Matrix / PG17 smoke (push) Has been cancelled
PostgreSQL Compatibility Matrix / PG18 smoke (push) Has been cancelled
PostgreSQL Compatibility Matrix / PG16 smoke (push) Has been cancelled
Introduced a new helper function `_rollback_ratio_recent` to calculate the rollback ratio over the last 15 minutes, ensuring meaningful evaluation only when a minimum transaction threshold is met. Adjusted warning and alert thresholds for rollback ratio and added a contextual metric for transaction volume in the past 15 minutes.
This commit is contained in:
@@ -151,6 +151,20 @@ async def _metric_delta(db: AsyncSession, target_id: int, metric_name: str, minu
|
|||||||
return max(0.0, float(latest) - float(oldest))
|
return max(0.0, float(latest) - float(oldest))
|
||||||
|
|
||||||
|
|
||||||
|
async def _rollback_ratio_recent(
    db: AsyncSession, target_id: int, minutes: int, min_total_transactions: int
) -> tuple[float | None, float]:
    """Return the rollback ratio and transaction volume for a recent window.

    Fetches the ``xact_commit`` and ``xact_rollback`` deltas for the last
    ``minutes`` minutes via ``_metric_delta`` and combines them.

    Args:
        db: Session handed through to ``_metric_delta``.
        target_id: Identifier of the monitored target.
        minutes: Length of the look-back window, in minutes.
        min_total_transactions: Minimum number of transactions (commits plus
            rollbacks) the window must contain for a ratio to be reported.

    Returns:
        A ``(ratio, total)`` pair. ``ratio`` is ``rollbacks / total``, or
        ``None`` when either delta is unavailable, when ``total`` is below
        ``min_total_transactions``, or when ``total`` is not positive.
        ``total`` is the number of transactions in the window, reported as
        ``0.0`` when either delta is unavailable.
    """
    # Both deltas are always fetched before either one is inspected.
    commits = await _metric_delta(db, target_id, "xact_commit", minutes=minutes)
    rollbacks = await _metric_delta(db, target_id, "xact_rollback", minutes=minutes)

    if commits is None or rollbacks is None:
        return None, 0.0

    window_total = commits + rollbacks

    # With too little traffic in the window the ratio would be noisy and
    # misleading, so only the volume is reported.
    too_quiet = window_total < float(min_total_transactions)
    if too_quiet or not window_total > 0:
        return None, window_total

    return rollbacks / window_total, window_total
|
||||||
|
|
||||||
|
|
||||||
async def _latest_query_snapshot_max(db: AsyncSession, target_id: int, column_name: str) -> float | None:
|
async def _latest_query_snapshot_max(db: AsyncSession, target_id: int, column_name: str) -> float | None:
|
||||||
latest_ts = await db.scalar(select(func.max(QueryStat.ts)).where(QueryStat.target_id == target_id))
|
latest_ts = await db.scalar(select(func.max(QueryStat.ts)).where(QueryStat.target_id == target_id))
|
||||||
if latest_ts is None:
|
if latest_ts is None:
|
||||||
@@ -248,8 +262,6 @@ async def _build_standard_rules(db: AsyncSession, target: Target) -> tuple[list[
|
|||||||
total_connections = await _latest_metric_value(db, target.id, "connections_total")
|
total_connections = await _latest_metric_value(db, target.id, "connections_total")
|
||||||
cache_hit_ratio = await _latest_metric_value(db, target.id, "cache_hit_ratio")
|
cache_hit_ratio = await _latest_metric_value(db, target.id, "cache_hit_ratio")
|
||||||
lock_count = await _latest_metric_value(db, target.id, "locks_total")
|
lock_count = await _latest_metric_value(db, target.id, "locks_total")
|
||||||
xact_commit = await _latest_metric_value(db, target.id, "xact_commit")
|
|
||||||
xact_rollback = await _latest_metric_value(db, target.id, "xact_rollback")
|
|
||||||
checkpoints_req_delta = await _metric_delta(db, target.id, "checkpoints_req", minutes=15)
|
checkpoints_req_delta = await _metric_delta(db, target.id, "checkpoints_req", minutes=15)
|
||||||
deadlocks_delta = await _metric_delta(db, target.id, "deadlocks", minutes=60)
|
deadlocks_delta = await _metric_delta(db, target.id, "deadlocks", minutes=60)
|
||||||
slowest_query_mean = await _latest_query_snapshot_max(db, target.id, "mean_time")
|
slowest_query_mean = await _latest_query_snapshot_max(db, target.id, "mean_time")
|
||||||
@@ -259,11 +271,9 @@ async def _build_standard_rules(db: AsyncSession, target: Target) -> tuple[list[
|
|||||||
if active_connections is not None and total_connections and total_connections > 0:
|
if active_connections is not None and total_connections and total_connections > 0:
|
||||||
active_ratio = active_connections / total_connections
|
active_ratio = active_connections / total_connections
|
||||||
|
|
||||||
rollback_ratio = None
|
rollback_ratio_15m, tx_total_15m = await _rollback_ratio_recent(
|
||||||
if xact_commit is not None and xact_rollback is not None:
|
db, target.id, minutes=15, min_total_transactions=20
|
||||||
tx_total = xact_commit + xact_rollback
|
)
|
||||||
if tx_total > 0:
|
|
||||||
rollback_ratio = xact_rollback / tx_total
|
|
||||||
|
|
||||||
rules.extend(
|
rules.extend(
|
||||||
[
|
[
|
||||||
@@ -307,11 +317,11 @@ async def _build_standard_rules(db: AsyncSession, target: Target) -> tuple[list[
|
|||||||
_RuleInput(
|
_RuleInput(
|
||||||
key="rollback_ratio",
|
key="rollback_ratio",
|
||||||
name="Rollback Ratio",
|
name="Rollback Ratio",
|
||||||
description="Fraction of rolled back transactions over all transactions.",
|
description="Fraction of rolled back transactions in the last 15 minutes (evaluated only when at least 20 transactions occurred).",
|
||||||
category="transactions",
|
category="transactions",
|
||||||
value=rollback_ratio,
|
value=rollback_ratio_15m,
|
||||||
warning_threshold=0.05,
|
warning_threshold=0.10,
|
||||||
alert_threshold=0.15,
|
alert_threshold=0.25,
|
||||||
),
|
),
|
||||||
_RuleInput(
|
_RuleInput(
|
||||||
key="deadlocks_60m",
|
key="deadlocks_60m",
|
||||||
@@ -342,6 +352,18 @@ async def _build_standard_rules(db: AsyncSession, target: Target) -> tuple[list[
|
|||||||
),
|
),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
# Expose transaction volume as contextual metric for UI/debugging.
|
||||||
|
rules.append(
|
||||||
|
_RuleInput(
|
||||||
|
key="rollback_tx_volume_15m",
|
||||||
|
name="Rollback Ratio Evaluation Volume",
|
||||||
|
description="Total transactions in the last 15 minutes used for rollback-ratio evaluation.",
|
||||||
|
category="transactions",
|
||||||
|
value=tx_total_15m,
|
||||||
|
warning_threshold=None,
|
||||||
|
alert_threshold=None,
|
||||||
|
)
|
||||||
|
)
|
||||||
return rules, forced_items
|
return rules, forced_items
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user