diff --git a/.env.example b/.env.example index 4d6446c..1ffd1a5 100644 --- a/.env.example +++ b/.env.example @@ -17,6 +17,7 @@ JWT_ACCESS_TOKEN_MINUTES=15 JWT_REFRESH_TOKEN_MINUTES=10080 # Generate with: python -c "from cryptography.fernet import Fernet; print(Fernet.generate_key().decode())" ENCRYPTION_KEY=REPLACE_WITH_FERNET_KEY +# Dev: set to * to allow all origins (credentials disabled automatically) CORS_ORIGINS=http://localhost:5173,http://localhost:8080 POLL_INTERVAL_SECONDS=30 INIT_ADMIN_EMAIL=admin@example.com diff --git a/backend/app/api/routes/targets.py b/backend/app/api/routes/targets.py index ed2a89c..a3dfb8c 100644 --- a/backend/app/api/routes/targets.py +++ b/backend/app/api/routes/targets.py @@ -7,10 +7,12 @@ from app.core.db import get_db from app.core.deps import get_current_user, require_roles from app.models.models import Metric, QueryStat, Target, User from app.schemas.metric import MetricOut, QueryStatOut +from app.schemas.overview import DatabaseOverviewOut from app.schemas.target import TargetCreate, TargetOut, TargetUpdate from app.services.audit import write_audit_log from app.services.collector import build_target_dsn from app.services.crypto import encrypt_secret +from app.services.overview_service import get_target_overview router = APIRouter() @@ -179,3 +181,12 @@ async def get_top_queries(target_id: int, user: User = Depends(get_current_user) ) for r in rows ] + + +@router.get("/{target_id}/overview", response_model=DatabaseOverviewOut) +async def get_overview(target_id: int, user: User = Depends(get_current_user), db: AsyncSession = Depends(get_db)) -> DatabaseOverviewOut: + _ = user + target = await db.scalar(select(Target).where(Target.id == target_id)) + if not target: + raise HTTPException(status_code=404, detail="Target not found") + return await get_target_overview(target) diff --git a/backend/app/main.py b/backend/app/main.py index 82e53d9..d1420ed 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -49,10 +49,11 @@ 
async def lifespan(_: FastAPI): app = FastAPI(title=settings.app_name, lifespan=lifespan) +use_wildcard_cors = settings.cors_origins.strip() == "*" app.add_middleware( CORSMiddleware, - allow_origins=settings.cors_origins_list, - allow_credentials=True, + allow_origins=["*"] if use_wildcard_cors else settings.cors_origins_list, + allow_credentials=False if use_wildcard_cors else True, allow_methods=["*"], allow_headers=["*"], ) diff --git a/backend/app/schemas/overview.py b/backend/app/schemas/overview.py new file mode 100644 index 0000000..61b147d --- /dev/null +++ b/backend/app/schemas/overview.py @@ -0,0 +1,79 @@ +from datetime import datetime +from pydantic import BaseModel + + +class DbSizeItem(BaseModel): + name: str + size_bytes: int + + +class TableSizeItem(BaseModel): + schema: str + table: str + size_bytes: int + + +class ReplicationClientItem(BaseModel): + application_name: str | None = None + client_addr: str | None = None + state: str | None = None + sync_state: str | None = None + write_lag_seconds: float | None = None + flush_lag_seconds: float | None = None + replay_lag_seconds: float | None = None + replay_lag_bytes: float | None = None + + +class DiskSpaceInfo(BaseModel): + source: str + status: str + free_bytes: int | None = None + message: str | None = None + + +class InstanceOverview(BaseModel): + server_version: str | None = None + role: str + postmaster_start_time: datetime | None = None + uptime_seconds: float | None = None + current_database: str | None = None + port: int | None = None + + +class StorageOverview(BaseModel): + current_database_size_bytes: int | None = None + all_databases: list[DbSizeItem] + largest_tables: list[TableSizeItem] + wal_directory_size_bytes: int | None = None + disk_space: DiskSpaceInfo + + +class ReplicationOverview(BaseModel): + mode: str + replay_lag_seconds: float | None = None + replication_slots_count: int | None = None + active_replication_clients: int | None = None + clients: list[ReplicationClientItem] 
+ + +class PerformanceOverview(BaseModel): + xact_commit: int | None = None + xact_rollback: int | None = None + deadlocks: int | None = None + temp_files: int | None = None + temp_bytes: int | None = None + blk_read_time: float | None = None + blk_write_time: float | None = None + checkpoints_timed: int | None = None + checkpoints_req: int | None = None + autovacuum_workers: int | None = None + + +class DatabaseOverviewOut(BaseModel): + collected_at: datetime + cache_ttl_seconds: int + instance: InstanceOverview + storage: StorageOverview + replication: ReplicationOverview + performance: PerformanceOverview + partial_failures: list[str] diff --git a/backend/app/services/infra_probe.py b/backend/app/services/infra_probe.py new file mode 100644 index 0000000..a5910b0 --- /dev/null +++ b/backend/app/services/infra_probe.py @@ -0,0 +1,29 @@ +from dataclasses import dataclass + + +@dataclass +class DiskSpaceProbeResult: + source: str + status: str + free_bytes: int | None = None + message: str | None = None + + +class DiskSpaceProvider: + async def get_free_bytes(self, target_host: str) -> DiskSpaceProbeResult: + raise NotImplementedError + + +class NullDiskSpaceProvider(DiskSpaceProvider): + async def get_free_bytes(self, target_host: str) -> DiskSpaceProbeResult: + return DiskSpaceProbeResult( + source="none", + status="unavailable", + free_bytes=None, + message=f"No infra probe configured for host {target_host}. Add SSH/Agent provider later.", + ) + + +def get_disk_space_provider() -> DiskSpaceProvider: + # Extension point: replace by SSH/Agent backed implementation. 
+ return NullDiskSpaceProvider() diff --git a/backend/app/services/overview_collector.py b/backend/app/services/overview_collector.py new file mode 100644 index 0000000..eb82123 --- /dev/null +++ b/backend/app/services/overview_collector.py @@ -0,0 +1,218 @@ +from datetime import datetime, timezone +import asyncpg +from app.schemas.overview import ( + DatabaseOverviewOut, + DbSizeItem, + DiskSpaceInfo, + InstanceOverview, + PerformanceOverview, + ReplicationClientItem, + ReplicationOverview, + StorageOverview, + TableSizeItem, +) +from app.services.infra_probe import DiskSpaceProvider + + +def _interval_seconds(value) -> float | None: + if value is None: + return None + try: + return float(value.total_seconds()) + except Exception: + return None + + +async def _safe_fetchval(conn: asyncpg.Connection, query: str, errors: list[str], label: str): + try: + return await conn.fetchval(query) + except Exception as exc: + errors.append(f"{label}: {exc}") + return None + + +async def _safe_fetchrow(conn: asyncpg.Connection, query: str, errors: list[str], label: str): + try: + return await conn.fetchrow(query) + except Exception as exc: + errors.append(f"{label}: {exc}") + return None + + +async def _safe_fetch(conn: asyncpg.Connection, query: str, errors: list[str], label: str): + try: + return await conn.fetch(query) + except Exception as exc: + errors.append(f"{label}: {exc}") + return [] + + +async def collect_overview( + conn: asyncpg.Connection, disk_provider: DiskSpaceProvider, target_host: str, cache_ttl_seconds: int +) -> DatabaseOverviewOut: + errors: list[str] = [] + + version = await _safe_fetchval(conn, "SHOW server_version", errors, "show_server_version") + in_recovery = await _safe_fetchval(conn, "SELECT pg_is_in_recovery()", errors, "pg_is_in_recovery") + postmaster_start = await _safe_fetchval(conn, "SELECT pg_postmaster_start_time()", errors, "postmaster_start") + current_db = await _safe_fetchval(conn, "SELECT current_database()", errors, 
"current_database") + port = await _safe_fetchval(conn, "SELECT inet_server_port()", errors, "inet_server_port") + + uptime_seconds = None + if postmaster_start: + uptime_seconds = (datetime.now(timezone.utc) - postmaster_start).total_seconds() + + db_size = await _safe_fetchval( + conn, "SELECT pg_database_size(current_database())", errors, "pg_database_size_current" + ) + all_db_rows = await _safe_fetch( + conn, + """ + SELECT datname, pg_database_size(datname) AS size_bytes + FROM pg_database + ORDER BY size_bytes DESC + """, + errors, + "pg_database_size_all", + ) + largest_tables_rows = await _safe_fetch( + conn, + """ + SELECT schemaname, relname, pg_total_relation_size(schemaname || '.' || relname) AS size_bytes + FROM pg_stat_user_tables + ORDER BY size_bytes DESC + LIMIT 5 + """, + errors, + "largest_tables", + ) + wal_dir_size = await _safe_fetchval( + conn, + """ + SELECT COALESCE(sum(size), 0) + FROM pg_ls_waldir() + """, + errors, + "wal_directory_size", + ) + + stats_db = await _safe_fetchrow( + conn, + """ + SELECT xact_commit, xact_rollback, deadlocks, temp_files, temp_bytes, blk_read_time, blk_write_time + FROM pg_stat_database + WHERE datname = current_database() + """, + errors, + "pg_stat_database_perf", + ) + bgwriter = await _safe_fetchrow( + conn, + "SELECT checkpoints_timed, checkpoints_req FROM pg_stat_bgwriter", + errors, + "pg_stat_bgwriter", + ) + autovac_workers = await _safe_fetchval( + conn, + """ + SELECT count(*) + FROM pg_stat_activity + WHERE query ILIKE 'autovacuum:%' + """, + errors, + "autovacuum_activity", + ) + + clients: list[ReplicationClientItem] = [] + replay_lag_seconds = None + slots_count = None + active_clients = None + role = "standby" if in_recovery else "primary" + if in_recovery: + replay_lag_seconds = await _safe_fetchval( + conn, + """ + SELECT EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp())) + """, + errors, + "standby_replay_lag", + ) + else: + slots_count = await _safe_fetchval(conn, "SELECT 
count(*) FROM pg_replication_slots", errors, "replication_slots") + repl_rows = await _safe_fetch( + conn, + """ + SELECT application_name, client_addr::text, state, sync_state, + EXTRACT(EPOCH FROM write_lag) AS write_lag_seconds, + EXTRACT(EPOCH FROM flush_lag) AS flush_lag_seconds, + EXTRACT(EPOCH FROM replay_lag) AS replay_lag_seconds, + pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn) AS replay_lag_bytes + FROM pg_stat_replication + """, + errors, + "pg_stat_replication", + ) + active_clients = len(repl_rows) + clients = [ + ReplicationClientItem( + application_name=row.get("application_name"), + client_addr=row.get("client_addr"), + state=row.get("state"), + sync_state=row.get("sync_state"), + write_lag_seconds=row.get("write_lag_seconds"), + flush_lag_seconds=row.get("flush_lag_seconds"), + replay_lag_seconds=row.get("replay_lag_seconds"), + replay_lag_bytes=float(row.get("replay_lag_bytes")) if row.get("replay_lag_bytes") is not None else None, + ) + for row in repl_rows + ] + + disk = await disk_provider.get_free_bytes(target_host) + + return DatabaseOverviewOut( + collected_at=datetime.now(timezone.utc), + cache_ttl_seconds=cache_ttl_seconds, + instance=InstanceOverview( + server_version=version, + role=role, + postmaster_start_time=postmaster_start, + uptime_seconds=uptime_seconds, + current_database=current_db, + port=int(port) if port is not None else None, + ), + storage=StorageOverview( + current_database_size_bytes=int(db_size) if db_size is not None else None, + all_databases=[DbSizeItem(name=r["datname"], size_bytes=int(r["size_bytes"])) for r in all_db_rows], + largest_tables=[ + TableSizeItem(schema=r["schemaname"], table=r["relname"], size_bytes=int(r["size_bytes"])) + for r in largest_tables_rows + ], + wal_directory_size_bytes=int(wal_dir_size) if wal_dir_size is not None else None, + disk_space=DiskSpaceInfo( + source=disk.source, + status=disk.status, + free_bytes=disk.free_bytes, + message=disk.message, + ), + ), + 
replication=ReplicationOverview( + mode=role, + replay_lag_seconds=float(replay_lag_seconds) if replay_lag_seconds is not None else None, + replication_slots_count=int(slots_count) if slots_count is not None else None, + active_replication_clients=active_clients, + clients=clients, + ), + performance=PerformanceOverview( + xact_commit=stats_db["xact_commit"] if stats_db else None, + xact_rollback=stats_db["xact_rollback"] if stats_db else None, + deadlocks=stats_db["deadlocks"] if stats_db else None, + temp_files=stats_db["temp_files"] if stats_db else None, + temp_bytes=stats_db["temp_bytes"] if stats_db else None, + blk_read_time=stats_db["blk_read_time"] if stats_db else None, + blk_write_time=stats_db["blk_write_time"] if stats_db else None, + checkpoints_timed=bgwriter["checkpoints_timed"] if bgwriter else None, + checkpoints_req=bgwriter["checkpoints_req"] if bgwriter else None, + autovacuum_workers=int(autovac_workers) if autovac_workers is not None else None, + ), + partial_failures=errors, + ) diff --git a/backend/app/services/overview_service.py b/backend/app/services/overview_service.py new file mode 100644 index 0000000..851dcf5 --- /dev/null +++ b/backend/app/services/overview_service.py @@ -0,0 +1,46 @@ +import asyncio +import time +import asyncpg +from app.models.models import Target +from app.schemas.overview import DatabaseOverviewOut +from app.services.collector import build_target_dsn +from app.services.infra_probe import get_disk_space_provider +from app.services.overview_collector import collect_overview + +_CACHE_TTL_SECONDS = 20 +_cache: dict[int, tuple[float, DatabaseOverviewOut]] = {} +_locks: dict[int, asyncio.Lock] = {} + + +def _get_lock(target_id: int) -> asyncio.Lock: + if target_id not in _locks: + _locks[target_id] = asyncio.Lock() + return _locks[target_id] + + +async def get_target_overview(target: Target) -> DatabaseOverviewOut: + now = time.time() + cached = _cache.get(target.id) + if cached and (now - cached[0]) < 
_CACHE_TTL_SECONDS: + return cached[1] + + lock = _get_lock(target.id) + async with lock: + cached = _cache.get(target.id) + now = time.time() + if cached and (now - cached[0]) < _CACHE_TTL_SECONDS: + return cached[1] + + conn = await asyncpg.connect(dsn=build_target_dsn(target)) + try: + payload = await collect_overview( + conn=conn, + disk_provider=get_disk_space_provider(), + target_host=target.host, + cache_ttl_seconds=_CACHE_TTL_SECONDS, + ) + finally: + await conn.close() + + _cache[target.id] = (time.time(), payload) + return payload diff --git a/frontend/src/pages/TargetDetailPage.jsx b/frontend/src/pages/TargetDetailPage.jsx index fa1f0c9..7fbf984 100644 --- a/frontend/src/pages/TargetDetailPage.jsx +++ b/frontend/src/pages/TargetDetailPage.jsx @@ -17,6 +17,25 @@ function toQueryRange(range) { return { from: from.toISOString(), to: to.toISOString() }; } +function formatBytes(value) { + if (value === null || value === undefined) return "-"; + const units = ["B", "KB", "MB", "GB", "TB"]; + let n = Number(value); + let i = 0; + while (n >= 1024 && i < units.length - 1) { + n /= 1024; + i += 1; + } + return `${n.toFixed(i >= 2 ? 
2 : 0)} ${units[i]}`; +} + +function formatSeconds(value) { + if (value === null || value === undefined) return "-"; + if (value < 60) return `${value.toFixed(1)}s`; + if (value < 3600) return `${(value / 60).toFixed(1)}m`; + return `${(value / 3600).toFixed(1)}h`; +} + async function loadMetric(targetId, metric, range, tokens, refresh) { const { from, to } = toQueryRange(range); return apiFetch( @@ -34,6 +53,7 @@ export function TargetDetailPage() { const [series, setSeries] = useState({}); const [locks, setLocks] = useState([]); const [activity, setActivity] = useState([]); + const [overview, setOverview] = useState(null); const [error, setError] = useState(""); const [loading, setLoading] = useState(true); @@ -42,17 +62,19 @@ export function TargetDetailPage() { (async () => { setLoading(true); try { - const [connections, xacts, cache, locksTable, activityTable] = await Promise.all([ + const [connections, xacts, cache, locksTable, activityTable, overviewData] = await Promise.all([ loadMetric(id, "connections_total", range, tokens, refresh), loadMetric(id, "xacts_total", range, tokens, refresh), loadMetric(id, "cache_hit_ratio", range, tokens, refresh), apiFetch(`/targets/${id}/locks`, {}, tokens, refresh), apiFetch(`/targets/${id}/activity`, {}, tokens, refresh), + apiFetch(`/targets/${id}/overview`, {}, tokens, refresh), ]); if (!active) return; setSeries({ connections, xacts, cache }); setLocks(locksTable); setActivity(activityTable); + setOverview(overviewData); setError(""); } catch (e) { if (active) setError(String(e.message || e)); @@ -79,9 +101,116 @@ export function TargetDetailPage() { if (loading) return
Lade Target Detail...
; if (error) return
{error}
; + const role = overview?.instance?.role || "-"; + const isPrimary = role === "primary"; + const isStandby = role === "standby"; + return (

Target Detail #{id}

+ {overview && ( +
+

Database Overview

+
+
PostgreSQL Version{overview.instance.server_version || "-"}
+
+ Role + {role} +
+
+ Uptime{formatSeconds(overview.instance.uptime_seconds)} +
+
Database{overview.instance.current_database || "-"}
+
Port{overview.instance.port ?? "-"}
+
+ Current DB Size{formatBytes(overview.storage.current_database_size_bytes)} +
+
+ WAL Size{formatBytes(overview.storage.wal_directory_size_bytes)} +
+
+ Free Disk{formatBytes(overview.storage.disk_space.free_bytes)} +
+
+ Replay Lag + 5 ? "lag-bad" : ""}> + {formatSeconds(overview.replication.replay_lag_seconds)} + +
+
Replication Slots{overview.replication.replication_slots_count ?? "-"}
+
Repl Clients{overview.replication.active_replication_clients ?? "-"}
+
Autovacuum Workers{overview.performance.autovacuum_workers ?? "-"}
+
+ +
+
+

All database sizes

+ + + + {(overview.storage.all_databases || []).map((d) => ( + + ))} + +
DatabaseSize
{d.name}{formatBytes(d.size_bytes)}
+
+
+

Largest Tables (Top 5)

+ + + + {(overview.storage.largest_tables || []).map((t, i) => ( + + ))} + +
TableSize
{t.schema}.{t.table}{formatBytes(t.size_bytes)}
+
+
+ +
+
+

Replication Clients

+ + + + {(overview.replication.clients || []).map((c, i) => ( + + + + + + ))} + +
ClientStateLag
{c.client_addr || c.application_name || "-"}{c.state || "-"} 5 ? "lag-bad" : ""}> + {c.replay_lag_seconds != null ? formatSeconds(c.replay_lag_seconds) : formatBytes(c.replay_lag_bytes)} +
+
+
+

Core Performance Metrics

+
+
xact_commit{overview.performance.xact_commit ?? "-"}
+
xact_rollback{overview.performance.xact_rollback ?? "-"}
+
deadlocks{overview.performance.deadlocks ?? "-"}
+
temp_files{overview.performance.temp_files ?? "-"}
+
temp_bytes{formatBytes(overview.performance.temp_bytes)}
+
blk_read_time{overview.performance.blk_read_time ?? "-"} ms
+
blk_write_time{overview.performance.blk_write_time ?? "-"} ms
+
checkpoints_timed{overview.performance.checkpoints_timed ?? "-"}
+
checkpoints_req{overview.performance.checkpoints_req ?? "-"}
+
+
+
+ + {overview.partial_failures?.length > 0 && ( +
+ Partial data warnings: +
    + {overview.partial_failures.map((w, i) =>
  • {w}
  • )} +
+
+ )} +
+ )}
{Object.keys(ranges).map((r) => ( +
+ + setForm({ ...form, name: e.target.value })} required /> + Eindeutiger Anzeigename im Dashboard. +
+
+ + setForm({ ...form, host: e.target.value })} required /> + Wichtig: Muss vom Backend-Container aus erreichbar sein. +
+
+ + setForm({ ...form, port: Number(e.target.value) })} type="number" required /> + Standard PostgreSQL Port ist 5432 (oder gemappter Host-Port). +
+
+ + setForm({ ...form, dbname: e.target.value })} required /> + Name der Datenbank, die überwacht werden soll. +
+
+ + setForm({ ...form, username: e.target.value })} required /> + DB User mit Leserechten auf Stats-Views. +
+
+ + setForm({ ...form, password: e.target.value })} required /> + Wird verschlüsselt in der Core-DB gespeichert. +
+
+ + + + Bei Fehler "rejected SSL upgrade" auf disable stellen. + +
+
+ + +
)} + {canManage && ( +
+ Troubleshooting +

+ Connection refused: Host/Port falsch oder DB nicht erreichbar. +

+

+ rejected SSL upgrade: SSL Mode auf disable setzen. +

+

+ localhost im Target zeigt aus Backend-Container-Sicht auf den Container selbst. +

+
+ )}
{loading ? (

Lade Targets...

diff --git a/frontend/src/styles.css b/frontend/src/styles.css index add6f40..e93b982 100644 --- a/frontend/src/styles.css +++ b/frontend/src/styles.css @@ -96,6 +96,26 @@ button { padding: 10px; } +.field { + display: grid; + gap: 6px; +} + +.field label { + font-size: 13px; + color: #b9c6dc; +} + +.field small { + font-size: 12px; + color: #8fa0bf; +} + +.tips p { + margin: 8px 0; + color: #bfd0ea; +} + button { cursor: pointer; } @@ -147,6 +167,53 @@ td { text-overflow: ellipsis; } +.overview-kv span, +.overview-metrics span { + display: block; + font-size: 12px; + color: #97a7c8; +} + +.overview-kv strong, +.overview-metrics strong { + font-size: 15px; +} + +.overview-metrics { + display: grid; + grid-template-columns: repeat(2, minmax(0, 1fr)); + gap: 10px; +} + +.pill { + text-transform: uppercase; + letter-spacing: 0.04em; +} + +.pill.primary { + color: #4ade80; +} + +.pill.standby { + color: #f59e0b; +} + +.lag-bad { + color: #ef4444; +} + +.partial-warning { + margin-top: 12px; + border: 1px solid #7f1d1d; + border-radius: 10px; + padding: 10px; + color: #fecaca; +} + +.partial-warning ul { + margin: 8px 0 0 18px; +} + @media (max-width: 980px) { .shell { grid-template-columns: 1fr;