diff --git a/.env.example b/.env.example index 495ae60..802baf1 100644 --- a/.env.example +++ b/.env.example @@ -45,6 +45,10 @@ POLL_INTERVAL_SECONDS=30 # Active Connection Ratio alert is only evaluated when total sessions # are at least this number (reduces false positives on low-traffic DBs). ALERT_ACTIVE_CONNECTION_RATIO_MIN_TOTAL_CONNECTIONS=5 +# Rollback Ratio alert is evaluated over this window (minutes), and only when at least this many transactions and rollbacks occurred in it (reduces false positives on low-traffic DBs). +ALERT_ROLLBACK_RATIO_WINDOW_MINUTES=15 +ALERT_ROLLBACK_RATIO_MIN_TOTAL_TRANSACTIONS=100 +ALERT_ROLLBACK_RATIO_MIN_ROLLBACKS=10 # Initial admin bootstrap user (created on first startup if not present). INIT_ADMIN_EMAIL=admin@example.com INIT_ADMIN_PASSWORD=ChangeMe123! diff --git a/backend/app/core/config.py b/backend/app/core/config.py index 3222c3e..21cc3ec 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -26,6 +26,9 @@ class Settings(BaseSettings): cors_origins: str = "http://localhost:5173" poll_interval_seconds: int = 30 alert_active_connection_ratio_min_total_connections: int = 5 + alert_rollback_ratio_window_minutes: int = 15 + alert_rollback_ratio_min_total_transactions: int = 100 + alert_rollback_ratio_min_rollbacks: int = 10 init_admin_email: str = "admin@example.com" init_admin_password: str = "ChangeMe123!" 
diff --git a/backend/app/services/alerts.py b/backend/app/services/alerts.py index ef6a8d5..3f9327c 100644 --- a/backend/app/services/alerts.py +++ b/backend/app/services/alerts.py @@ -152,17 +152,20 @@ async def _metric_delta(db: AsyncSession, target_id: int, metric_name: str, minu async def _rollback_ratio_recent( - db: AsyncSession, target_id: int, minutes: int, min_total_transactions: int -) -> tuple[float | None, float]: + db: AsyncSession, target_id: int, minutes: int, min_total_transactions: int, min_rollbacks: int +) -> tuple[float | None, float, float]: commit_delta = await _metric_delta(db, target_id, "xact_commit", minutes=minutes) rollback_delta = await _metric_delta(db, target_id, "xact_rollback", minutes=minutes) if commit_delta is None or rollback_delta is None: - return None, 0.0 + return None, 0.0, 0.0 tx_total = commit_delta + rollback_delta if tx_total < float(min_total_transactions): # Too little traffic in window, ratio would be noisy and misleading. - return None, tx_total - return (rollback_delta / tx_total) if tx_total > 0 else None, tx_total + return None, tx_total, rollback_delta + if rollback_delta < float(min_rollbacks): + # Too few rollbacks in window: a high ratio built on a handful of rollbacks is noise, so report no ratio. 
+ return None, tx_total, rollback_delta + return (rollback_delta / tx_total) if tx_total > 0 else None, tx_total, rollback_delta async def _latest_query_snapshot_max(db: AsyncSession, target_id: int, column_name: str) -> float | None: @@ -275,8 +278,13 @@ async def _build_standard_rules(db: AsyncSession, target: Target) -> tuple[list[ ): active_ratio = active_connections / total_connections - rollback_ratio_15m, tx_total_15m = await _rollback_ratio_recent( - db, target.id, minutes=15, min_total_transactions=20 + rollback_ratio_window = settings.alert_rollback_ratio_window_minutes + rollback_ratio_val, tx_total_window, rollback_count_window = await _rollback_ratio_recent( + db, + target.id, + minutes=rollback_ratio_window, + min_total_transactions=settings.alert_rollback_ratio_min_total_transactions, + min_rollbacks=settings.alert_rollback_ratio_min_rollbacks, ) rules.extend( @@ -324,9 +332,13 @@ async def _build_standard_rules(db: AsyncSession, target: Target) -> tuple[list[ _RuleInput( key="rollback_ratio", name="Rollback Ratio", - description="Fraction of rolled back transactions in the last 15 minutes (evaluated only when at least 20 transactions occurred).", + description=( + f"Fraction of rolled back transactions in the last {rollback_ratio_window} minutes " + f"(evaluated only when at least {settings.alert_rollback_ratio_min_total_transactions} " + f"transactions and {settings.alert_rollback_ratio_min_rollbacks} rollbacks occurred)." 
+ ), category="transactions", - value=rollback_ratio_15m, + value=rollback_ratio_val, warning_threshold=0.10, alert_threshold=0.25, ), @@ -364,9 +376,20 @@ async def _build_standard_rules(db: AsyncSession, target: Target) -> tuple[list[ _RuleInput( key="rollback_tx_volume_15m", name="Rollback Ratio Evaluation Volume", - description="Total transactions in the last 15 minutes used for rollback-ratio evaluation.", + description=f"Total transactions in the last {rollback_ratio_window} minutes used for rollback-ratio evaluation.", category="transactions", - value=tx_total_15m, + value=tx_total_window, + warning_threshold=None, + alert_threshold=None, + ) + ) + rules.append( + _RuleInput( + key="rollback_count_window", + name="Rollback Count (Window)", + description=f"Rollback count in the last {rollback_ratio_window} minutes used for rollback-ratio evaluation.", + category="transactions", + value=rollback_count_window, warning_threshold=None, alert_threshold=None, )