feat: add public status page with component health monitoring and system metrics visualization

Add statuspage package with service, handler, and types for exposing platform health. Implement GET /api/v1/status endpoint returning operational status, component health (API, database, gateway runtime), and control plane summary counts.

Add Service.Snapshot method querying database connectivity, user/device/gateway/service/policy counts, connected device count via handshake timestamps, and gateway runtime telemetry.
This commit is contained in:
2026-03-24 18:25:55 +01:00
parent 9aa4a13fd5
commit ff7eff8242
9 changed files with 541 additions and 1 deletions

View File

@@ -0,0 +1,24 @@
package statuspage
import (
"net/http"
"nexavpn/backend/internal/apiutil"
)
// Handler exposes the status page over HTTP. It holds the Service whose
// Snapshot method produces the payload returned by PublicStatus.
type Handler struct {
// service builds the status snapshot served by this handler.
service *Service
}
// NewHandler returns a Handler that serves snapshots produced by service.
func NewHandler(service *Service) *Handler {
	handler := new(Handler)
	handler.service = service
	return handler
}
// PublicStatus takes a snapshot from the service using the request context
// and hands it to apiutil.JSON. The HTTP status is 200 OK while the overall
// snapshot status is "operational" and 503 Service Unavailable for any other
// status value.
func (h *Handler) PublicStatus(w http.ResponseWriter, r *http.Request) {
	report := h.service.Snapshot(r.Context())
	if report.Status == "operational" {
		apiutil.JSON(w, http.StatusOK, report)
		return
	}
	apiutil.JSON(w, http.StatusServiceUnavailable, report)
}

View File

@@ -0,0 +1,155 @@
package statuspage
import (
"context"
"fmt"
"time"
"github.com/jackc/pgx/v5/pgxpool"
)
// Service assembles status snapshots from the application database.
type Service struct {
// db is the connection pool used for the ping and for the summary and
// gateway runtime queries.
db *pgxpool.Pool
}
// NewService returns a Service that reads its data through db.
func NewService(db *pgxpool.Pool) *Service {
	svc := new(Service)
	svc.db = db
	return svc
}
// Snapshot builds the public status report for the current moment.
//
// The "api" component is always reported as operational. The remaining checks
// run in order and the first failure short-circuits the rest:
//
//  1. a database ping;
//  2. the summary counts query (a failure is reported on the "database"
//     component);
//  3. the lookup of the latest gateway runtime timestamp.
//
// When all three succeed, the "gateway" component is derived from the
// telemetry age and the number of active gateways. The overall status is
// "degraded" whenever any component is not operational. Snapshot never
// returns an error; failures are reflected in the report itself.
func (s *Service) Snapshot(ctx context.Context) PublicStatusResponse {
	generated := time.Now().UTC()
	components := map[string]ComponentStatus{
		"api": {
			Status:  "operational",
			Message: "Public API is responding.",
		},
	}
	report := PublicStatusResponse{
		Status:      "operational",
		GeneratedAt: generated.Format(time.RFC3339),
		Components:  components,
	}

	// degrade flags the whole report and the named component as degraded and
	// yields the report in its current state, ready to be returned.
	degrade := func(component, message string) PublicStatusResponse {
		report.Status = "degraded"
		components[component] = ComponentStatus{Status: "degraded", Message: message}
		return report
	}

	if pingErr := s.db.Ping(ctx); pingErr != nil {
		return degrade("database", "Database ping failed.")
	}
	components["database"] = ComponentStatus{
		Status:  "operational",
		Message: "Database connectivity is healthy.",
	}

	if queryErr := s.loadSummary(ctx, &report.Summary); queryErr != nil {
		return degrade("database", "Database query failed.")
	}

	lastSeen, runtimeErr := s.latestGatewayRuntime(ctx)
	if runtimeErr != nil {
		return degrade("gateway", "Gateway runtime state could not be read.")
	}

	gateway := componentFromGatewayRuntime(generated, lastSeen, report.Summary.ActiveGateways)
	components["gateway"] = gateway
	if gateway.Status != "operational" {
		report.Status = "degraded"
	}
	return report
}
// loadSummary fills summary with control plane counts using a single query
// made of scalar sub-selects. All counts exclude soft-deleted rows
// (deleted_at is null):
//
//   - users, devices and gateways: every non-deleted row;
//   - connected devices: rows of active devices joined to a non-deleted
//     WireGuard peer whose latest handshake falls within the last 3 minutes
//     (the count is over joined device/peer rows);
//   - active gateways, services and policies: rows with is_active = true.
//
// NOTE(review): latest_handshake_at is passed to to_timestamp, so it is
// presumably stored as a Unix epoch number - confirm against the schema.
//
// The error from scanning the row is returned unchanged.
func (s *Service) loadSummary(ctx context.Context, summary *Summary) error {
	const countsQuery = `
select
(select count(*) from users where deleted_at is null),
(select count(*) from devices where deleted_at is null),
(
select count(*)
from devices d
join wireguard_peers wp on wp.device_id = d.id and wp.deleted_at is null
where d.deleted_at is null
and d.status = 'active'
and wp.latest_handshake_at is not null
and to_timestamp(wp.latest_handshake_at) >= now() - interval '3 minutes'
),
(select count(*) from gateways where deleted_at is null),
(select count(*) from gateways where deleted_at is null and is_active = true),
(select count(*) from services where deleted_at is null and is_active = true),
(select count(*) from policies where deleted_at is null and is_active = true)
`
	// Destinations are listed in the same order as the selected columns.
	row := s.db.QueryRow(ctx, countsQuery)
	return row.Scan(
		&summary.Users,
		&summary.Devices,
		&summary.ConnectedDevices,
		&summary.Gateways,
		&summary.ActiveGateways,
		&summary.Services,
		&summary.Policies,
	)
}
// latestGatewayRuntime returns the most recent updated_at among settings rows
// in the 'gateway_runtime' category. The result is scanned into a pointer, so
// a SQL NULL from max() yields a nil time with a nil error. Any scan error is
// returned unchanged together with a nil time.
func (s *Service) latestGatewayRuntime(ctx context.Context) (*time.Time, error) {
	const runtimeQuery = `
select max(updated_at)
from settings
where category = 'gateway_runtime'
`
	var latest *time.Time
	if err := s.db.QueryRow(ctx, runtimeQuery).Scan(&latest); err != nil {
		return nil, err
	}
	return latest, nil
}
// componentFromGatewayRuntime derives the "gateway" component status.
//
// The component is degraded when there are no active gateways, when no
// telemetry timestamp exists (lastRuntime is nil), or when the telemetry is
// older than 90 seconds relative to now. Otherwise it is operational. The
// message for the last two outcomes includes the humanized telemetry age.
func componentFromGatewayRuntime(now time.Time, lastRuntime *time.Time, activeGateways int) ComponentStatus {
	// Telemetry at most this old still counts as fresh.
	const freshness = 90 * time.Second

	switch {
	case activeGateways == 0:
		return ComponentStatus{
			Status:  "degraded",
			Message: "No active gateway is configured.",
		}
	case lastRuntime == nil:
		return ComponentStatus{
			Status:  "degraded",
			Message: "No gateway telemetry has been received yet.",
		}
	}

	elapsed := now.Sub(lastRuntime.UTC())
	pretty := humanizeAge(elapsed)
	if elapsed > freshness {
		return ComponentStatus{
			Status:  "degraded",
			Message: fmt.Sprintf("Gateway telemetry is stale (%s ago).", pretty),
		}
	}
	return ComponentStatus{
		Status:  "operational",
		Message: fmt.Sprintf("Last gateway telemetry %s ago.", pretty),
	}
}
// humanizeAge renders age as a whole number followed by a single unit
// suffix: "s" below one minute, "m" below one hour, and "h" otherwise.
// Fractions are truncated. Ages under one second, including negative ones,
// are reported as "1s".
func humanizeAge(age time.Duration) string {
	var (
		amount int
		unit   string
	)
	switch {
	case age >= time.Hour:
		amount, unit = int(age.Hours()), "h"
	case age >= time.Minute:
		amount, unit = int(age.Minutes()), "m"
	default:
		amount, unit = int(age.Seconds()), "s"
		if amount < 1 {
			// Never print "0s" or a negative number of seconds.
			amount = 1
		}
	}
	return fmt.Sprintf("%d%s", amount, unit)
}

View File

@@ -0,0 +1,23 @@
package statuspage
// ComponentStatus is the health of one platform component as serialized in
// the status response.
type ComponentStatus struct {
// Status is the component state; this package sets it to "operational" or
// "degraded".
Status string `json:"status"`
// Message is a human-readable explanation; omitted from JSON when empty.
Message string `json:"message,omitempty"`
}
// Summary holds the control plane counts included in the status response.
// The fields are filled by Service.loadSummary and stay zero when that query
// is not reached.
type Summary struct {
// Users is the number of non-deleted users.
Users int `json:"users"`
// Devices is the number of non-deleted devices.
Devices int `json:"devices"`
// ConnectedDevices counts active devices with a WireGuard peer handshake
// within the last 3 minutes.
ConnectedDevices int `json:"connected_devices"`
// Gateways is the number of non-deleted gateways.
Gateways int `json:"gateways"`
// ActiveGateways is the number of non-deleted gateways with is_active set.
ActiveGateways int `json:"active_gateways"`
// Services is the number of non-deleted services with is_active set.
Services int `json:"services"`
// Policies is the number of non-deleted policies with is_active set.
Policies int `json:"policies"`
}
// PublicStatusResponse is the JSON payload produced by Service.Snapshot.
type PublicStatusResponse struct {
// Status is the overall state: "operational", or "degraded" when any check
// fails or the gateway component is not operational.
Status string `json:"status"`
// GeneratedAt is the snapshot time in UTC, formatted as RFC 3339.
GeneratedAt string `json:"generated_at"`
// Components maps a component key ("api", "database", "gateway") to its
// status. Keys for checks that were not reached are absent.
Components map[string]ComponentStatus `json:"components"`
// Summary carries the control plane counts.
Summary Summary `json:"summary"`
}