From 918bb132efc7cd2342b0f003c440d7a40ccf3db0 Mon Sep 17 00:00:00 2001 From: nessi Date: Thu, 12 Feb 2026 15:41:11 +0100 Subject: [PATCH] Add logging for failure recovery and throttled error reporting. Introduced `_failure_state` to track consecutive failures per target and `_failure_log_interval_seconds` to control logging frequency. Added logs for recovery when a target recovers and throttled error logs to reduce noise for recurring errors. --- backend/app/services/collector.py | 44 ++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/backend/app/services/collector.py b/backend/app/services/collector.py index e20bf22..51bbe3c 100644 --- a/backend/app/services/collector.py +++ b/backend/app/services/collector.py @@ -13,6 +13,8 @@ import asyncpg logger = logging.getLogger(__name__) settings = get_settings() +_failure_state: dict[int, dict[str, object]] = {} +_failure_log_interval_seconds = 300 def build_target_dsn(target: Target) -> str: @@ -182,8 +184,48 @@ async def collect_once() -> None: for target in targets: try: await collect_target(target) + prev = _failure_state.pop(target.id, None) + if prev: + logger.info( + "collector_target_recovered target=%s after_failures=%s last_error=%s", + target.id, + prev.get("count", 0), + prev.get("error"), + ) except (OSError, SQLAlchemyError, asyncpg.PostgresError, Exception) as exc: - logger.exception("collector_error target=%s err=%s", target.id, exc) + now = datetime.now(timezone.utc) + current_error = str(exc) + state = _failure_state.get(target.id) + if state is None: + _failure_state[target.id] = { + "count": 1, + "last_log_at": now, + "error": current_error, + } + logger.exception("collector_error target=%s err=%s", target.id, exc) + continue + + count = int(state.get("count", 0)) + 1 + last_log_at = state.get("last_log_at") + last_logged_error = str(state.get("error", "")) + should_log = False + if current_error != last_logged_error: + should_log = True + elif isinstance(last_log_at, datetime): + should_log = (now - last_log_at).total_seconds() >= _failure_log_interval_seconds + else: + should_log = True + + state["count"] = count + if should_log: + state["last_log_at"] = now + state["error"] = current_error + logger.error( + "collector_error_throttled target=%s err=%s consecutive_failures=%s", + target.id, + current_error, + count, + ) async def collector_loop(stop_event: asyncio.Event) -> None: