// Go-based network automation with YANG models, gRPC, Ansible, Terraform, and Kubernetes integration.
// Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
// Listing metadata: 135 lines, 3.3 KiB, Go.

package health
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"net/http"
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"github.com/prometheus/client_golang/prometheus/promhttp"
|
|
"go.uber.org/zap"
|
|
|
|
"github.com/guildhouse-co/kedge/internal/config"
|
|
)
|
|
|
|
// PeerCounter reports how many mesh peers currently exist. The health server
// reads it to populate the mesh peer gauge; the mesh manager is the intended
// implementation.
type PeerCounter interface {
	// PeerCount returns the current number of peers.
	PeerCount() int
}
|
|
|
|
// VLANCounter reports how many VLAN interfaces are currently managed. The
// health server reads it to populate the VLAN interface gauge; the VLAN
// manager is the intended implementation.
type VLANCounter interface {
	// VLANCount returns the current number of VLAN interfaces.
	VLANCount() int
}
|
|
|
|
var (
|
|
meshPeerGauge = prometheus.NewGauge(prometheus.GaugeOpts{
|
|
Namespace: "kedge",
|
|
Subsystem: "mesh",
|
|
Name: "peer_count",
|
|
Help: "Number of active WireGuard mesh peers",
|
|
})
|
|
|
|
vlanInterfaceGauge = prometheus.NewGauge(prometheus.GaugeOpts{
|
|
Namespace: "kedge",
|
|
Subsystem: "vlan",
|
|
Name: "interface_count",
|
|
Help: "Number of managed VLAN interfaces",
|
|
})
|
|
|
|
sessionTransitCounter = prometheus.NewCounter(prometheus.CounterOpts{
|
|
Namespace: "kedge",
|
|
Subsystem: "quartermaster",
|
|
Name: "session_transits_total",
|
|
Help: "Total SessionTransitArtifacts submitted to Quartermaster",
|
|
})
|
|
|
|
networkMutationCounter = prometheus.NewCounter(prometheus.CounterOpts{
|
|
Namespace: "kedge",
|
|
Subsystem: "quartermaster",
|
|
Name: "network_mutations_total",
|
|
Help: "Total NetworkMutationArtifacts submitted to Quartermaster",
|
|
})
|
|
|
|
tunnelStatusGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
|
Namespace: "kedge",
|
|
Subsystem: "mesh",
|
|
Name: "tunnel_up",
|
|
Help: "WireGuard tunnel status (1=up, 0=down)",
|
|
}, []string{"peer"})
|
|
)
|
|
|
|
func init() {
|
|
prometheus.MustRegister(
|
|
meshPeerGauge,
|
|
vlanInterfaceGauge,
|
|
sessionTransitCounter,
|
|
networkMutationCounter,
|
|
tunnelStatusGauge,
|
|
)
|
|
}
|
|
|
|
// Server serves health checks and Prometheus metrics.
|
|
type Server struct {
|
|
cfg config.HealthConfig
|
|
peers PeerCounter
|
|
vlans VLANCounter
|
|
log *zap.SugaredLogger
|
|
}
|
|
|
|
// NewServer creates a new health and metrics server.
|
|
func NewServer(cfg config.HealthConfig, peers PeerCounter, vlans VLANCounter, log *zap.SugaredLogger) *Server {
|
|
return &Server{cfg: cfg, peers: peers, vlans: vlans, log: log}
|
|
}
|
|
|
|
// Run starts the HTTP server for health checks and metrics.
|
|
func (s *Server) Run(ctx context.Context) error {
|
|
mux := http.NewServeMux()
|
|
mux.HandleFunc("/healthz", s.handleHealthz)
|
|
mux.HandleFunc("/readyz", s.handleReadyz)
|
|
mux.Handle("/metrics", promhttp.Handler())
|
|
|
|
srv := &http.Server{
|
|
Addr: s.cfg.ListenAddr,
|
|
Handler: mux,
|
|
}
|
|
|
|
go func() {
|
|
<-ctx.Done()
|
|
srv.Close()
|
|
}()
|
|
|
|
s.log.Infof("health server listening on %s", s.cfg.ListenAddr)
|
|
if err := srv.ListenAndServe(); err != http.ErrServerClosed {
|
|
return fmt.Errorf("health server error: %w", err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (s *Server) handleHealthz(w http.ResponseWriter, r *http.Request) {
|
|
w.WriteHeader(http.StatusOK)
|
|
fmt.Fprintln(w, "ok")
|
|
}
|
|
|
|
func (s *Server) handleReadyz(w http.ResponseWriter, r *http.Request) {
|
|
// Update metrics.
|
|
if s.peers != nil {
|
|
meshPeerGauge.Set(float64(s.peers.PeerCount()))
|
|
}
|
|
if s.vlans != nil {
|
|
vlanInterfaceGauge.Set(float64(s.vlans.VLANCount()))
|
|
}
|
|
|
|
w.WriteHeader(http.StatusOK)
|
|
fmt.Fprintln(w, "ok")
|
|
}
|
|
|
|
// RecordSessionTransit increments the session transit counter.
|
|
func RecordSessionTransit() {
|
|
sessionTransitCounter.Inc()
|
|
}
|
|
|
|
// RecordNetworkMutation increments the network mutation counter.
|
|
func RecordNetworkMutation() {
|
|
networkMutationCounter.Inc()
|
|
}
|