kedge/internal/health/health.go
Tyler King 6058e62348 Initial commit: Kedge network automation platform
Go-based network automation with YANG models, gRPC, Ansible,
Terraform, and Kubernetes integration.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-26 12:09:30 -05:00

135 lines
3.3 KiB
Go

package health
import (
	"context"
	"errors"
	"fmt"
	"net/http"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
	"go.uber.org/zap"

	"github.com/guildhouse-co/kedge/internal/config"
)
// PeerCounter reports the current number of mesh peers.
// It is implemented by the mesh manager. The readiness handler in this file
// calls PeerCount and stores the result in the mesh peer-count gauge.
type PeerCounter interface {
// PeerCount returns the number of peers as a plain int.
PeerCount() int
}
// VLANCounter reports the current number of VLAN interfaces.
// It is implemented by the VLAN manager. The readiness handler in this file
// calls VLANCount and stores the result in the VLAN interface-count gauge.
type VLANCounter interface {
// VLANCount returns the number of VLAN interfaces as a plain int.
VLANCount() int
}
// Package-level Prometheus collectors. All of them share the "kedge"
// namespace and are registered with the default registry in init.
var (
// meshPeerGauge holds the number of active WireGuard mesh peers.
// It is refreshed from PeerCounter.PeerCount by the /readyz handler.
meshPeerGauge = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: "kedge",
Subsystem: "mesh",
Name: "peer_count",
Help: "Number of active WireGuard mesh peers",
})
// vlanInterfaceGauge holds the number of managed VLAN interfaces.
// It is refreshed from VLANCounter.VLANCount by the /readyz handler.
vlanInterfaceGauge = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: "kedge",
Subsystem: "vlan",
Name: "interface_count",
Help: "Number of managed VLAN interfaces",
})
// sessionTransitCounter counts SessionTransitArtifacts submitted to
// Quartermaster. It is incremented through RecordSessionTransit.
sessionTransitCounter = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: "kedge",
Subsystem: "quartermaster",
Name: "session_transits_total",
Help: "Total SessionTransitArtifacts submitted to Quartermaster",
})
// networkMutationCounter counts NetworkMutationArtifacts submitted to
// Quartermaster. It is incremented through RecordNetworkMutation.
networkMutationCounter = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: "kedge",
Subsystem: "quartermaster",
Name: "network_mutations_total",
Help: "Total NetworkMutationArtifacts submitted to Quartermaster",
})
// tunnelStatusGauge is a per-peer gauge (label "peer") where 1 means the
// WireGuard tunnel is up and 0 means it is down.
// NOTE(review): nothing in the code visible here sets this gauge;
// presumably it is updated elsewhere in the package — confirm.
tunnelStatusGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "kedge",
Subsystem: "mesh",
Name: "tunnel_up",
Help: "WireGuard tunnel status (1=up, 0=down)",
}, []string{"peer"})
)
// init registers every collector declared in this package with the default
// Prometheus registry. MustRegister panics if any registration fails, so a
// duplicate or invalid metric definition surfaces at program start.
func init() {
	collectors := []prometheus.Collector{
		meshPeerGauge,
		vlanInterfaceGauge,
		sessionTransitCounter,
		networkMutationCounter,
		tunnelStatusGauge,
	}
	prometheus.MustRegister(collectors...)
}
// Server serves health checks and Prometheus metrics over HTTP.
// Build one with NewServer and start it with Run.
type Server struct {
// cfg supplies the server settings; Run reads ListenAddr from it.
cfg config.HealthConfig
// peers is queried by the readiness handler to refresh the mesh peer
// gauge. A nil value skips that update.
peers PeerCounter
// vlans is queried by the readiness handler to refresh the VLAN interface
// gauge. A nil value skips that update.
vlans VLANCounter
// log receives the server's log messages.
log *zap.SugaredLogger
}
// NewServer builds a Server from the given configuration, counters and
// logger. It only stores its arguments; nothing is listened on until Run is
// called. peers and vlans may be nil, in which case the readiness handler
// leaves the corresponding gauge untouched.
func NewServer(cfg config.HealthConfig, peers PeerCounter, vlans VLANCounter, log *zap.SugaredLogger) *Server {
	srv := new(Server)
	srv.cfg = cfg
	srv.peers = peers
	srv.vlans = vlans
	srv.log = log
	return srv
}
// Run starts the HTTP server for health checks and metrics and blocks until
// it stops.
//
// Routes served:
//   - /healthz: liveness probe, always answers 200 "ok"
//   - /readyz:  readiness probe, also refreshes the peer and VLAN gauges
//   - /metrics: Prometheus exposition of the default registry
//
// The server listens on s.cfg.ListenAddr. When ctx is cancelled the server is
// closed and Run returns nil. Any other listen or serve failure (for example
// the address already being in use) is returned wrapped.
func (s *Server) Run(ctx context.Context) error {
	mux := http.NewServeMux()
	mux.HandleFunc("/healthz", s.handleHealthz)
	mux.HandleFunc("/readyz", s.handleReadyz)
	mux.Handle("/metrics", promhttp.Handler())

	srv := &http.Server{
		Addr:    s.cfg.ListenAddr,
		Handler: mux,
		// Bound how long a client may take to send request headers so that
		// slow or stalled connections cannot pin server goroutines forever.
		ReadHeaderTimeout: 10 * time.Second,
	}

	// stopped is closed when Run returns. Without it the watcher goroutine
	// would stay blocked on ctx.Done() after an early ListenAndServe failure
	// and leak until the parent context eventually ends.
	stopped := make(chan struct{})
	defer close(stopped)

	go func() {
		select {
		case <-ctx.Done():
			if err := srv.Close(); err != nil {
				s.log.Warnf("health server close: %v", err)
			}
		case <-stopped:
		}
	}()

	s.log.Infof("health server listening on %s", s.cfg.ListenAddr)

	// ListenAndServe always returns a non-nil error; ErrServerClosed is the
	// expected result of the Close call above and is not a failure.
	if err := srv.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) {
		return fmt.Errorf("health server error: %w", err)
	}
	return nil
}
// handleHealthz is the liveness probe. It does not inspect any state: every
// request is answered with status 200 and the body "ok" followed by a newline.
func (s *Server) handleHealthz(w http.ResponseWriter, _ *http.Request) {
	w.WriteHeader(http.StatusOK)
	fmt.Fprint(w, "ok\n")
}
// handleReadyz is the readiness probe. Before replying it refreshes the mesh
// peer and VLAN interface gauges from whichever counters were supplied to the
// server, so those gauges are only as fresh as the last /readyz request. The
// reply is always status 200 with the body "ok" followed by a newline.
func (s *Server) handleReadyz(w http.ResponseWriter, _ *http.Request) {
	// A counter that was not provided leaves its gauge at the previous value.
	if pc := s.peers; pc != nil {
		peerTotal := pc.PeerCount()
		meshPeerGauge.Set(float64(peerTotal))
	}
	if vc := s.vlans; vc != nil {
		vlanTotal := vc.VLANCount()
		vlanInterfaceGauge.Set(float64(vlanTotal))
	}

	w.WriteHeader(http.StatusOK)
	fmt.Fprint(w, "ok\n")
}
// RecordSessionTransit adds one to the counter of SessionTransitArtifacts
// submitted to Quartermaster. Call it once per submitted artifact.
func RecordSessionTransit() {
	sessionTransitCounter.Add(1)
}
// RecordNetworkMutation adds one to the counter of NetworkMutationArtifacts
// submitted to Quartermaster. Call it once per submitted artifact.
func RecordNetworkMutation() {
	networkMutationCounter.Add(1)
}