Add statuspage package with service, handler, and types for exposing platform health. Implement GET /api/v1/status endpoint returning operational status, component health (API, database, gateway runtime), and control plane summary counts. Add Service.Snapshot method querying database connectivity, user/device/gateway/service/policy counts, connected device count via handshake timestamps, and the latest gateway runtime telemetry timestamp.
156 lines
3.8 KiB
Go
156 lines
3.8 KiB
Go
package statuspage
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"time"
|
|
|
|
"github.com/jackc/pgx/v5/pgxpool"
|
|
)
|
|
|
|
type Service struct {
|
|
db *pgxpool.Pool
|
|
}
|
|
|
|
func NewService(db *pgxpool.Pool) *Service {
|
|
return &Service{db: db}
|
|
}
|
|
|
|
func (s *Service) Snapshot(ctx context.Context) PublicStatusResponse {
|
|
now := time.Now().UTC()
|
|
resp := PublicStatusResponse{
|
|
Status: "operational",
|
|
GeneratedAt: now.Format(time.RFC3339),
|
|
Components: map[string]ComponentStatus{
|
|
"api": {
|
|
Status: "operational",
|
|
Message: "Public API is responding.",
|
|
},
|
|
},
|
|
}
|
|
|
|
if err := s.db.Ping(ctx); err != nil {
|
|
resp.Status = "degraded"
|
|
resp.Components["database"] = ComponentStatus{
|
|
Status: "degraded",
|
|
Message: "Database ping failed.",
|
|
}
|
|
return resp
|
|
}
|
|
|
|
resp.Components["database"] = ComponentStatus{
|
|
Status: "operational",
|
|
Message: "Database connectivity is healthy.",
|
|
}
|
|
|
|
if err := s.loadSummary(ctx, &resp.Summary); err != nil {
|
|
resp.Status = "degraded"
|
|
resp.Components["database"] = ComponentStatus{
|
|
Status: "degraded",
|
|
Message: "Database query failed.",
|
|
}
|
|
return resp
|
|
}
|
|
|
|
lastGatewayRuntime, err := s.latestGatewayRuntime(ctx)
|
|
if err != nil {
|
|
resp.Status = "degraded"
|
|
resp.Components["gateway"] = ComponentStatus{
|
|
Status: "degraded",
|
|
Message: "Gateway runtime state could not be read.",
|
|
}
|
|
return resp
|
|
}
|
|
|
|
resp.Components["gateway"] = componentFromGatewayRuntime(now, lastGatewayRuntime, resp.Summary.ActiveGateways)
|
|
if resp.Components["gateway"].Status != "operational" {
|
|
resp.Status = "degraded"
|
|
}
|
|
|
|
return resp
|
|
}
|
|
|
|
func (s *Service) loadSummary(ctx context.Context, summary *Summary) error {
|
|
return s.db.QueryRow(ctx, `
|
|
select
|
|
(select count(*) from users where deleted_at is null),
|
|
(select count(*) from devices where deleted_at is null),
|
|
(
|
|
select count(*)
|
|
from devices d
|
|
join wireguard_peers wp on wp.device_id = d.id and wp.deleted_at is null
|
|
where d.deleted_at is null
|
|
and d.status = 'active'
|
|
and wp.latest_handshake_at is not null
|
|
and to_timestamp(wp.latest_handshake_at) >= now() - interval '3 minutes'
|
|
),
|
|
(select count(*) from gateways where deleted_at is null),
|
|
(select count(*) from gateways where deleted_at is null and is_active = true),
|
|
(select count(*) from services where deleted_at is null and is_active = true),
|
|
(select count(*) from policies where deleted_at is null and is_active = true)
|
|
`).Scan(
|
|
&summary.Users,
|
|
&summary.Devices,
|
|
&summary.ConnectedDevices,
|
|
&summary.Gateways,
|
|
&summary.ActiveGateways,
|
|
&summary.Services,
|
|
&summary.Policies,
|
|
)
|
|
}
|
|
|
|
func (s *Service) latestGatewayRuntime(ctx context.Context) (*time.Time, error) {
|
|
var updatedAt *time.Time
|
|
err := s.db.QueryRow(ctx, `
|
|
select max(updated_at)
|
|
from settings
|
|
where category = 'gateway_runtime'
|
|
`).Scan(&updatedAt)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return updatedAt, nil
|
|
}
|
|
|
|
func componentFromGatewayRuntime(now time.Time, lastRuntime *time.Time, activeGateways int) ComponentStatus {
|
|
if activeGateways == 0 {
|
|
return ComponentStatus{
|
|
Status: "degraded",
|
|
Message: "No active gateway is configured.",
|
|
}
|
|
}
|
|
if lastRuntime == nil {
|
|
return ComponentStatus{
|
|
Status: "degraded",
|
|
Message: "No gateway telemetry has been received yet.",
|
|
}
|
|
}
|
|
|
|
age := now.Sub(lastRuntime.UTC())
|
|
if age <= 90*time.Second {
|
|
return ComponentStatus{
|
|
Status: "operational",
|
|
Message: fmt.Sprintf("Last gateway telemetry %s ago.", humanizeAge(age)),
|
|
}
|
|
}
|
|
|
|
return ComponentStatus{
|
|
Status: "degraded",
|
|
Message: fmt.Sprintf("Gateway telemetry is stale (%s ago).", humanizeAge(age)),
|
|
}
|
|
}
|
|
|
|
// humanizeAge renders age as a single whole unit: seconds ("42s") below one
// minute, minutes ("5m") below one hour, and hours ("3h") otherwise.
// Fractions are truncated, and anything under one second (including zero
// and negative durations) is reported as "1s".
func humanizeAge(age time.Duration) string {
	switch {
	case age < time.Minute:
		secs := int(age.Seconds())
		if secs < 1 {
			// Never print "0s" or a negative age.
			secs = 1
		}
		return fmt.Sprintf("%ds", secs)
	case age < time.Hour:
		mins := int(age.Minutes())
		return fmt.Sprintf("%dm", mins)
	default:
		hours := int(age.Hours())
		return fmt.Sprintf("%dh", hours)
	}
}
|