feat: add public status page with component health monitoring and system metrics visualization
Add statuspage package with service, handler, and types for exposing platform health. Implement GET /api/v1/status endpoint returning operational status, component health (API, database, gateway runtime), and control plane summary counts. Add Service.Snapshot method querying database connectivity, user/device/gateway/service/policy counts, connected device count via handshake timestamps, and gateway runtime telemetry.
This commit is contained in:
155
backend/internal/statuspage/service.go
Normal file
155
backend/internal/statuspage/service.go
Normal file
@@ -0,0 +1,155 @@
|
||||
package statuspage
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/jackc/pgx/v5/pgxpool"
|
||||
)
|
||||
|
||||
type Service struct {
|
||||
db *pgxpool.Pool
|
||||
}
|
||||
|
||||
func NewService(db *pgxpool.Pool) *Service {
|
||||
return &Service{db: db}
|
||||
}
|
||||
|
||||
func (s *Service) Snapshot(ctx context.Context) PublicStatusResponse {
|
||||
now := time.Now().UTC()
|
||||
resp := PublicStatusResponse{
|
||||
Status: "operational",
|
||||
GeneratedAt: now.Format(time.RFC3339),
|
||||
Components: map[string]ComponentStatus{
|
||||
"api": {
|
||||
Status: "operational",
|
||||
Message: "Public API is responding.",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
if err := s.db.Ping(ctx); err != nil {
|
||||
resp.Status = "degraded"
|
||||
resp.Components["database"] = ComponentStatus{
|
||||
Status: "degraded",
|
||||
Message: "Database ping failed.",
|
||||
}
|
||||
return resp
|
||||
}
|
||||
|
||||
resp.Components["database"] = ComponentStatus{
|
||||
Status: "operational",
|
||||
Message: "Database connectivity is healthy.",
|
||||
}
|
||||
|
||||
if err := s.loadSummary(ctx, &resp.Summary); err != nil {
|
||||
resp.Status = "degraded"
|
||||
resp.Components["database"] = ComponentStatus{
|
||||
Status: "degraded",
|
||||
Message: "Database query failed.",
|
||||
}
|
||||
return resp
|
||||
}
|
||||
|
||||
lastGatewayRuntime, err := s.latestGatewayRuntime(ctx)
|
||||
if err != nil {
|
||||
resp.Status = "degraded"
|
||||
resp.Components["gateway"] = ComponentStatus{
|
||||
Status: "degraded",
|
||||
Message: "Gateway runtime state could not be read.",
|
||||
}
|
||||
return resp
|
||||
}
|
||||
|
||||
resp.Components["gateway"] = componentFromGatewayRuntime(now, lastGatewayRuntime, resp.Summary.ActiveGateways)
|
||||
if resp.Components["gateway"].Status != "operational" {
|
||||
resp.Status = "degraded"
|
||||
}
|
||||
|
||||
return resp
|
||||
}
|
||||
|
||||
func (s *Service) loadSummary(ctx context.Context, summary *Summary) error {
|
||||
return s.db.QueryRow(ctx, `
|
||||
select
|
||||
(select count(*) from users where deleted_at is null),
|
||||
(select count(*) from devices where deleted_at is null),
|
||||
(
|
||||
select count(*)
|
||||
from devices d
|
||||
join wireguard_peers wp on wp.device_id = d.id and wp.deleted_at is null
|
||||
where d.deleted_at is null
|
||||
and d.status = 'active'
|
||||
and wp.latest_handshake_at is not null
|
||||
and to_timestamp(wp.latest_handshake_at) >= now() - interval '3 minutes'
|
||||
),
|
||||
(select count(*) from gateways where deleted_at is null),
|
||||
(select count(*) from gateways where deleted_at is null and is_active = true),
|
||||
(select count(*) from services where deleted_at is null and is_active = true),
|
||||
(select count(*) from policies where deleted_at is null and is_active = true)
|
||||
`).Scan(
|
||||
&summary.Users,
|
||||
&summary.Devices,
|
||||
&summary.ConnectedDevices,
|
||||
&summary.Gateways,
|
||||
&summary.ActiveGateways,
|
||||
&summary.Services,
|
||||
&summary.Policies,
|
||||
)
|
||||
}
|
||||
|
||||
func (s *Service) latestGatewayRuntime(ctx context.Context) (*time.Time, error) {
|
||||
var updatedAt *time.Time
|
||||
err := s.db.QueryRow(ctx, `
|
||||
select max(updated_at)
|
||||
from settings
|
||||
where category = 'gateway_runtime'
|
||||
`).Scan(&updatedAt)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return updatedAt, nil
|
||||
}
|
||||
|
||||
func componentFromGatewayRuntime(now time.Time, lastRuntime *time.Time, activeGateways int) ComponentStatus {
|
||||
if activeGateways == 0 {
|
||||
return ComponentStatus{
|
||||
Status: "degraded",
|
||||
Message: "No active gateway is configured.",
|
||||
}
|
||||
}
|
||||
if lastRuntime == nil {
|
||||
return ComponentStatus{
|
||||
Status: "degraded",
|
||||
Message: "No gateway telemetry has been received yet.",
|
||||
}
|
||||
}
|
||||
|
||||
age := now.Sub(lastRuntime.UTC())
|
||||
if age <= 90*time.Second {
|
||||
return ComponentStatus{
|
||||
Status: "operational",
|
||||
Message: fmt.Sprintf("Last gateway telemetry %s ago.", humanizeAge(age)),
|
||||
}
|
||||
}
|
||||
|
||||
return ComponentStatus{
|
||||
Status: "degraded",
|
||||
Message: fmt.Sprintf("Gateway telemetry is stale (%s ago).", humanizeAge(age)),
|
||||
}
|
||||
}
|
||||
|
||||
// humanizeAge renders age as a short single-unit label: whole seconds
// ("42s") below one minute, whole minutes ("5m") below one hour, and whole
// hours ("3h") otherwise. Values are truncated, not rounded. Anything that
// would print as less than one second, including zero and negative
// durations, is reported as "1s".
func humanizeAge(age time.Duration) string {
	switch {
	case age >= time.Hour:
		return fmt.Sprintf("%dh", int(age.Hours()))
	case age >= time.Minute:
		return fmt.Sprintf("%dm", int(age.Minutes()))
	}

	// Below one minute: clamp so the label never reads "0s" or goes negative.
	wholeSeconds := int(age.Seconds())
	if wholeSeconds < 1 {
		wholeSeconds = 1
	}
	return fmt.Sprintf("%ds", wholeSeconds)
}
|
||||
Reference in New Issue
Block a user