From 6058e62348a10c3cc23ade4fd12afb9768786df91c65cb9a5ce4728ea6bbeedf Mon Sep 17 00:00:00 2001 From: Tyler King Date: Thu, 26 Feb 2026 12:09:30 -0500 Subject: [PATCH] Initial commit: Kedge network automation platform Go-based network automation with YANG models, gRPC, Ansible, Terraform, and Kubernetes integration. Co-Authored-By: Claude Opus 4.6 --- .claude/settings.json | 8 + .gitignore | 63 +++ CLAUDE.md | 206 +++++++++ Containerfile.build | 33 ++ Containerfile.dev | 27 ++ Makefile | 76 ++++ README.md | 55 +++ ansible/inventory/group_vars/all.yml | 17 + ansible/inventory/group_vars/cloud_anchor.yml | 10 + ansible/inventory/group_vars/homelab.yml | 18 + ansible/inventory/hosts.yml | 21 + ansible/playbooks/bootstrap-cloud.yml | 15 + ansible/playbooks/bootstrap-homelab.yml | 14 + ansible/playbooks/failover-test.yml | 31 ++ ansible/playbooks/mesh-health.yml | 30 ++ ansible/playbooks/underlay-audit.yml | 31 ++ .../roles/base-hardening/defaults/main.yml | 2 + .../roles/base-hardening/handlers/main.yml | 5 + ansible/roles/base-hardening/tasks/main.yml | 59 +++ ansible/roles/headscale/defaults/main.yml | 2 + ansible/roles/headscale/tasks/main.yml | 43 ++ ansible/roles/k3s-bootstrap/defaults/main.yml | 2 + ansible/roles/k3s-bootstrap/tasks/main.yml | 37 ++ ansible/roles/monitoring/defaults/main.yml | 2 + ansible/roles/monitoring/tasks/main.yml | 30 ++ .../wireguard-bootstrap/defaults/main.yml | 3 + .../roles/wireguard-bootstrap/tasks/main.yml | 43 ++ .../wireguard-bootstrap/templates/wg0.conf.j2 | 12 + ansible/site.yml | 33 ++ buf.gen.yaml | 10 + buf.yaml | 9 + compose.yaml | 42 ++ docs/adding-a-site.md | 10 + docs/architecture.md | 5 + docs/cni-plugin.md | 12 + docs/failover-runbook.md | 16 + docs/hfl-kedge-integration.md | 419 ++++++++++++++++++ docs/mesh-topology.md | 12 + docs/overlay-mode.md | 9 + docs/quartermaster-integration.md | 13 + docs/shellstream-boundary.md | 11 + docs/underlay-mode.md | 16 + docs/yang-compiler.md | 17 + go.mod | 19 + internal/cni/bridge.go 
| 37 ++ internal/cni/config.go | 36 ++ internal/cni/netns.go | 126 ++++++ internal/cni/plugin.go | 93 ++++ internal/cni/policy.go | 52 +++ internal/cni/routes.go | 84 ++++ internal/cni/tunnel.go | 42 ++ internal/config/config.go | 154 +++++++ internal/headscale/client.go | 68 +++ internal/health/health.go | 135 ++++++ internal/mesh/manager.go | 153 +++++++ internal/mesh/peer.go | 21 + internal/quartermaster/client.go | 102 +++++ internal/quartermaster/network_mutation.go | 53 +++ internal/quartermaster/session_transit.go | 60 +++ internal/shellstream/capability.go | 94 ++++ internal/shellstream/handshake.go | 151 +++++++ internal/shellstream/sat.go | 44 ++ internal/topology/state.go | 48 ++ internal/topology/store.go | 76 ++++ internal/underlay/dispatch.go | 69 +++ internal/underlay/watcher.go | 56 +++ internal/vlan/manager.go | 99 +++++ k8s/configmap-mesh.yaml | 34 ++ k8s/configmap-underlay.yaml | 33 ++ k8s/daemonset.yaml | 115 +++++ k8s/network-attachment.yaml | 21 + k8s/rbac.yaml | 45 ++ .../grafana/dashboards/device-mutations.json | 18 + .../grafana/dashboards/drift-detection.json | 20 + .../grafana/dashboards/mesh-health.json | 18 + .../grafana/dashboards/session-transits.json | 18 + monitoring/health-checks/checks.yml | 25 ++ monitoring/prometheus/prometheus.yml | 16 + proto/quartermaster/v1/governance.proto | 119 +++++ proto/quartermaster/v1/notary.proto | 48 ++ proto/quartermaster/v1/registry.proto | 58 +++ scripts/bootstrap.sh | 29 ++ scripts/failover-test.sh | 26 ++ scripts/generate-wireguard-keys.sh | 19 + scripts/validate-yang.sh | 13 + terraform/environments/homelab/main.tf | 20 + .../environments/homelab/terraform.tfvars | 7 + terraform/environments/production/main.tf | 24 + .../environments/production/terraform.tfvars | 8 + terraform/modules/cloud-anchor/main.tf | 59 +++ terraform/modules/cloud-anchor/outputs.tf | 9 + terraform/modules/cloud-anchor/variables.tf | 39 ++ terraform/modules/dns/main.tf | 17 + terraform/modules/dns/outputs.tf | 4 + 
terraform/modules/dns/variables.tf | 15 + terraform/modules/wireguard-topology/main.tf | 38 ++ .../modules/wireguard-topology/outputs.tf | 4 + .../modules/wireguard-topology/variables.tf | 15 + yang/compiler/__init__.py | 0 yang/compiler/compile.py | 129 ++++++ yang/compiler/to_fortios.py | 97 ++++ yang/compiler/to_unifi.py | 96 ++++ yang/compiler/to_vyos.py | 98 ++++ yang/models/sovereign-sdwan.yang | 184 ++++++++ yang/requirements.txt | 4 + yang/site-config/cloud-anchor.xml | 29 ++ yang/site-config/homelab.xml | 101 +++++ yang/tests/__init__.py | 0 yang/tests/test_compiler.py | 79 ++++ 109 files changed, 5152 insertions(+) create mode 100644 .claude/settings.json create mode 100644 .gitignore create mode 100644 CLAUDE.md create mode 100644 Containerfile.build create mode 100644 Containerfile.dev create mode 100644 Makefile create mode 100644 README.md create mode 100644 ansible/inventory/group_vars/all.yml create mode 100644 ansible/inventory/group_vars/cloud_anchor.yml create mode 100644 ansible/inventory/group_vars/homelab.yml create mode 100644 ansible/inventory/hosts.yml create mode 100644 ansible/playbooks/bootstrap-cloud.yml create mode 100644 ansible/playbooks/bootstrap-homelab.yml create mode 100644 ansible/playbooks/failover-test.yml create mode 100644 ansible/playbooks/mesh-health.yml create mode 100644 ansible/playbooks/underlay-audit.yml create mode 100644 ansible/roles/base-hardening/defaults/main.yml create mode 100644 ansible/roles/base-hardening/handlers/main.yml create mode 100644 ansible/roles/base-hardening/tasks/main.yml create mode 100644 ansible/roles/headscale/defaults/main.yml create mode 100644 ansible/roles/headscale/tasks/main.yml create mode 100644 ansible/roles/k3s-bootstrap/defaults/main.yml create mode 100644 ansible/roles/k3s-bootstrap/tasks/main.yml create mode 100644 ansible/roles/monitoring/defaults/main.yml create mode 100644 ansible/roles/monitoring/tasks/main.yml create mode 100644 
ansible/roles/wireguard-bootstrap/defaults/main.yml create mode 100644 ansible/roles/wireguard-bootstrap/tasks/main.yml create mode 100644 ansible/roles/wireguard-bootstrap/templates/wg0.conf.j2 create mode 100644 ansible/site.yml create mode 100644 buf.gen.yaml create mode 100644 buf.yaml create mode 100644 compose.yaml create mode 100644 docs/adding-a-site.md create mode 100644 docs/architecture.md create mode 100644 docs/cni-plugin.md create mode 100644 docs/failover-runbook.md create mode 100644 docs/hfl-kedge-integration.md create mode 100644 docs/mesh-topology.md create mode 100644 docs/overlay-mode.md create mode 100644 docs/quartermaster-integration.md create mode 100644 docs/shellstream-boundary.md create mode 100644 docs/underlay-mode.md create mode 100644 docs/yang-compiler.md create mode 100644 go.mod create mode 100644 internal/cni/bridge.go create mode 100644 internal/cni/config.go create mode 100644 internal/cni/netns.go create mode 100644 internal/cni/plugin.go create mode 100644 internal/cni/policy.go create mode 100644 internal/cni/routes.go create mode 100644 internal/cni/tunnel.go create mode 100644 internal/config/config.go create mode 100644 internal/headscale/client.go create mode 100644 internal/health/health.go create mode 100644 internal/mesh/manager.go create mode 100644 internal/mesh/peer.go create mode 100644 internal/quartermaster/client.go create mode 100644 internal/quartermaster/network_mutation.go create mode 100644 internal/quartermaster/session_transit.go create mode 100644 internal/shellstream/capability.go create mode 100644 internal/shellstream/handshake.go create mode 100644 internal/shellstream/sat.go create mode 100644 internal/topology/state.go create mode 100644 internal/topology/store.go create mode 100644 internal/underlay/dispatch.go create mode 100644 internal/underlay/watcher.go create mode 100644 internal/vlan/manager.go create mode 100644 k8s/configmap-mesh.yaml create mode 100644 k8s/configmap-underlay.yaml create 
mode 100644 k8s/daemonset.yaml create mode 100644 k8s/network-attachment.yaml create mode 100644 k8s/rbac.yaml create mode 100644 monitoring/grafana/dashboards/device-mutations.json create mode 100644 monitoring/grafana/dashboards/drift-detection.json create mode 100644 monitoring/grafana/dashboards/mesh-health.json create mode 100644 monitoring/grafana/dashboards/session-transits.json create mode 100644 monitoring/health-checks/checks.yml create mode 100644 monitoring/prometheus/prometheus.yml create mode 100644 proto/quartermaster/v1/governance.proto create mode 100644 proto/quartermaster/v1/notary.proto create mode 100644 proto/quartermaster/v1/registry.proto create mode 100755 scripts/bootstrap.sh create mode 100755 scripts/failover-test.sh create mode 100755 scripts/generate-wireguard-keys.sh create mode 100755 scripts/validate-yang.sh create mode 100644 terraform/environments/homelab/main.tf create mode 100644 terraform/environments/homelab/terraform.tfvars create mode 100644 terraform/environments/production/main.tf create mode 100644 terraform/environments/production/terraform.tfvars create mode 100644 terraform/modules/cloud-anchor/main.tf create mode 100644 terraform/modules/cloud-anchor/outputs.tf create mode 100644 terraform/modules/cloud-anchor/variables.tf create mode 100644 terraform/modules/dns/main.tf create mode 100644 terraform/modules/dns/outputs.tf create mode 100644 terraform/modules/dns/variables.tf create mode 100644 terraform/modules/wireguard-topology/main.tf create mode 100644 terraform/modules/wireguard-topology/outputs.tf create mode 100644 terraform/modules/wireguard-topology/variables.tf create mode 100644 yang/compiler/__init__.py create mode 100644 yang/compiler/compile.py create mode 100644 yang/compiler/to_fortios.py create mode 100644 yang/compiler/to_unifi.py create mode 100644 yang/compiler/to_vyos.py create mode 100644 yang/models/sovereign-sdwan.yang create mode 100644 yang/requirements.txt create mode 100644 
yang/site-config/cloud-anchor.xml create mode 100644 yang/site-config/homelab.xml create mode 100644 yang/tests/__init__.py create mode 100644 yang/tests/test_compiler.py diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 0000000..ee3014f --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,8 @@ +{ + "permissions": { + "allow": [ + "Read(//home/tking/projects/guildhouse/**)", + "Read(//home/tking/projects/guildhouse/services/guildhouse-proto/**)" + ] + } +} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e176fe1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,63 @@ +# Binaries +bin/ +kedge-cni +kedge-daemon + +# Go +*.exe +*.exe~ +*.dll +*.so +*.dylib +*.test +*.out +vendor/ + +# Proto generated code +api/quartermaster/v1/*.pb.go + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.egg-info/ +dist/ +build/ +.eggs/ +*.egg +.venv/ +venv/ +yang/.venv/ + +# Secrets — NEVER commit +*.key +*.pem +vault_pass* +.env +.env.* +credentials.json +kubeconfig +*.kubeconfig + +# Terraform +terraform/.terraform/ +terraform/**/.terraform/ +*.tfstate +*.tfstate.* +*.tfplan +.terraform.lock.hcl + +# Ansible +*.retry + +# IDE +.idea/ +.vscode/ +*.swp +*.swo +*~ +.DS_Store + +# Coverage +coverage.out +coverage.html diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..da9107e --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,206 @@ +# CLAUDE.md — Kedge + +> See the full architecture specification in the initial project prompt. This file captures conventions and development workflow. + +## Quick Reference + +- **Language**: Go (CNI plugin + DaemonSet), Python (YANG compiler) +- **Module**: `github.com/guildhouse-co/kedge` +- **Binaries**: `cmd/kedge-cni/` (CNI plugin), `cmd/kedge-daemon/` (DaemonSet) +- **Build**: `make container-build` (production), `make dev && make build` (development) + +## Conventions + +- Go: Standard project layout. `cmd/` for entrypoints, `internal/` for non-exported. 
+- Explicit error returns, no panics. +- CNI plugin follows CNI spec exactly. +- Python: Type hints required. `pyang` for YANG validation. +- YAML: 2-space indent. +- Secrets: Never in repo. `.gitignore` excludes `*.key`, `*.pem`, `vault_pass*`. +- Testing: Go `_test.go` files. Python `pytest`. Ansible `molecule/`. +- Container engine: podman (dev), k3s (prod). + +--- + +## Development Workflow + +### Container-Based Development (podman-compose) + +```bash +# Build the dev container +make dev-build + +# Start dev environment +make dev + +# Open a shell in the dev container +make dev-exec + +# Build binaries (in container) +make build + +# Run tests (in container) +make test + +# Run linter (in container) +make lint + +# Generate proto code (in container) +make proto-gen + +# Resolve Go dependencies (in container) +make mod-tidy + +# Tear down dev environment +make dev-down +``` + +### Production Container Images + +```bash +# Build production images (CNI + DaemonSet) +make container-build +``` + +### YANG Compiler + +```bash +# Validate YANG models +make yang-validate + +# Run Python compiler tests +make python-test +``` + +Important: The dev environment uses podman-compose. Go, protoc, buf, golangci-lint, and wireguard-tools are all available inside the dev container. No local Go installation is required. + +--- + +## Future: Dynamic Posture Framework + +The Guildhouse ecosystem supports a dynamic governance posture system with five levels (5 = normal operations, 1 = lockdown). Each level has associated accords that define allowed capabilities, modes, and operational scope. Kedge enforces posture at the network layer. + +**Kedge's role is network enforcement and mesh management during posture transitions.** + +### DaemonSet Posture Behavior + +The Kedge DaemonSet reads the current posture level from the same `posture_level` BPF map that the shell eBPF programs use (or receives it via notification from the Bascule governance daemon). 
On posture change, the DaemonSet adjusts: + +**Overlay mode (WireGuard mesh):** + +| Level | Mesh Behavior | +|-------|---------------| +| 5 | Full mesh. All tunnels active. Standard keepalive. | +| 4 | Full mesh. Increased health check frequency (detect degradation faster). | +| 3 | Full mesh maintained, but new Shellstream sessions require fresh SAT re-verification at handshake. DaemonSet rejects cached SAT validations. | +| 2 | Mesh maintained for management traffic only. Non-critical Shellstream sessions terminated. DaemonSet sends DRAIN signal to active non-essential sessions. | +| 1 | All remote overlay tunnels torn down. Cloud anchor disconnected. Only local loopback Shellstream permitted. The appliance is network-isolated except for physical console. | + +**Underlay mode (physical infrastructure programming):** + +| Level | Underlay Behavior | +|-------|-------------------| +| 5 | Normal operations. All authorized tiers can trigger infrastructure mutations via Bascule SDK dispatch. NetworkMutationArtifacts notarized per accord. | +| 4 | Normal operations. YANG drift detection frequency increased. | +| 3 | Underlay mutations restricted to master + site_owner tiers. Lower-tier SATs denied underlay mode at Shellstream handshake. Mutation proposals still accepted from all tiers but execution restricted. | +| 2 | Underlay mutations restricted to site_owner only. YANG compiler frozen — no new compilations. Only pre-approved emergency changes permitted. | +| 1 | All underlay operations frozen. No infrastructure mutations. Device configs frozen in place. VLAN interfaces maintained for monitoring but no changes dispatched. | + +**Shellstream handshake behavior:** + +The DaemonSet's Shellstream handshake termination adjusts per level: + +``` +Level 5: Standard 3-way handshake. SAT validated from cache. Full capability grant. + +Level 4: Standard handshake. SAT validated from cache. Full capability grant.
+ Additional telemetry: log all new sessions (not just sampled). + +Level 3: Strict handshake. SAT re-verified against SPIRE/Vigil (no cache). + Capability grant narrowed per posture accord (PROPOSE revoked for + lower tiers). Handshake latency increases (~50-100ms for re-verification) + but security posture tightens. + +Level 2: Restricted handshake. SAT re-verified. Only critical session types + permitted (management, incident response, health monitoring). + Non-critical session types rejected at ATTEST-VERIFY step with + posture-level denial reason. Existing non-critical sessions receive + DRAIN signal. + +Level 1: No remote handshakes. All inbound ATTEST-INIT rejected. + All outbound sessions blocked (eBPF shell also enforces this). + Only local Shellstream (loopback) permitted for console management. +``` + +**CNI plugin behavior:** + +The CNI plugin's route programming adjusts on posture transitions: + +| Level | Route Changes | +|-------|---------------| +| 5 | Full routes per `shell_state.allowed_targets` and `allowed_mode`. | +| 4 | Same as 5. No route changes. | +| 3 | Remove underlay routes for pods whose `shell_state` shows tiers below master. Overlay routes maintained. | +| 2 | Remove all non-critical routes. Only routes to management endpoints (Quartermaster, SPIRE, Prometheus) and incident response targets retained. | +| 1 | Remove all overlay routes (tunnels down). Remove all underlay routes except monitoring-only (read-only SNMP/health endpoints). | + +Route changes on posture transition are triggered by the DaemonSet (which watches `posture_level` changes) notifying the CNI plugin's route manager. The CNI plugin does NOT poll the posture level — it receives route update commands from the DaemonSet. 
+ +### Quartermaster Notarization Fidelity + +Kedge's DaemonSet adjusts `SessionTransitArtifact` and `NetworkMutationArtifact` submission behavior per level: + +| Level | Notarization | +|-------|-------------| +| 5 | Sampled — 10% of routine session transits. All mutations. | +| 4 | Increased — 50% of session transits. All mutations. | +| 3 | Full — 100% of session transits. 100% of mutations. | +| 2 | Full + real-time — 100% with immediate anchor flushing (no batching). | +| 1 | Forensic — 100% with full payload hashing. Anchor flush on every record. Session metadata includes full Shellstream frame headers. | + +At level 1, Kedge's DaemonSet also emits a `PostureForensicSnapshot` to Quartermaster containing the current mesh topology, all active tunnel states, all programmed routes, and all pending session state. This is the network-layer forensic evidence. + +### VPP Governance Plugin Coordination (Appliance Deployments) + +On Guildhouse SD-WAN appliance deployments where VPP handles the data plane, the Kedge DaemonSet coordinates posture changes with the VPP governance plugin: + +``` +Posture transition signal received + │ + ├─ DaemonSet updates its own behavior (overlay/underlay/handshake) + │ + ├─ DaemonSet notifies VPP governance plugin via shared memory + │ governance_state.posture_level = new_level + │ + │ VPP plugin adjusts on next poll cycle (~microseconds): + │ Level 5: standard flow rules + │ Level 4: deep packet classification enabled + │ Level 3: non-essential inter-zone traffic deprioritized + │ Level 2: emergency zone policy (critical flows only) + │ Level 1: allowlist-only (everything not pre-approved dropped) + │ + └─ Both DaemonSet and VPP plugin emit posture change telemetry + to their respective ring buffers → Quartermaster +``` + +The DaemonSet and VPP plugin must reach the new posture level within the same transition window. 
The DaemonSet updates VPP shared memory BEFORE adjusting its own Shellstream handshake behavior — this ensures the data plane tightens before (or simultaneously with) the session layer. + +### Mesh-Wide Posture Visibility + +In a multi-cluster deployment, each cluster's Kedge DaemonSet maintains its own posture level. Cross-cluster posture is communicated via Shellstream: + +- Each DaemonSet includes the local posture level in Shellstream handshake metadata +- When a cloud anchor DaemonSet receives an ATTEST-INIT from a homelab DaemonSet at level 3, the cloud anchor knows to apply level 3 session restrictions even if the cloud anchor itself is at level 5 +- The more restrictive posture wins at any boundary crossing: if the source is at level 3 and the destination is at level 5, the session is governed by level 3 rules +- This is enforced per-session, not per-cluster. A cluster at level 5 can host sessions from a level 3 peer — it just applies level 3 restrictions to those specific sessions + +--- + +## Working Notes + +- **Posture transition latency budget**: Kedge's network-level posture transition should complete within 5 seconds for the overlay path (tunnel adjustments, Shellstream session draining) and 10 seconds for the underlay path (route removal, VLAN interface adjustments). The VPP plugin path should complete within 1ms (shared memory update + next poll cycle). These targets ensure that a level 5→3 escalation restricts network access before an attacker can exploit the window. +- **Graceful session draining on escalation**: When escalating from level 5 to level 3, existing Shellstream sessions from lower-tier operators should not be abruptly terminated. The DaemonSet sends a DRAIN signal, allowing in-flight operations to complete (with a timeout), then tears down the session. Only escalation to level 2 or level 1 terminates non-critical sessions abruptly.
+- **Posture-aware `SessionTransitArtifact`**: The `SessionTransitArtifact` should include a `posture_level_at_transit` field recording the posture level at the time of the session boundary crossing. This allows Quartermaster queries like "show all session transits that occurred while the cluster was at posture level 3" — critical for post-incident forensics. +- **De-escalation route restoration**: When de-escalating (e.g., level 3 → level 4 → level 5), the DaemonSet must restore routes that were removed during escalation. This requires the DaemonSet to maintain a "pre-escalation route table" snapshot so it can cleanly restore. The snapshot is taken at escalation time and stored locally (not in the BPF map — too large). +- **Level 1 cloud anchor behavior**: At level 1, the homelab cluster tears down overlay tunnels to the cloud anchor. The cloud anchor's Kedge DaemonSet should detect this as a posture-driven disconnection (not a failure) based on the DRAIN signal preceding the tunnel teardown. The cloud anchor should NOT attempt reconnection until it receives a de-escalation signal (via an out-of-band channel — physical console, SMS API, or pre-arranged re-attestation ceremony). +- **Testing posture transitions**: Add `posture-transition-test.yml` to Ansible playbooks. It should simulate escalation from 5→1 and back, verifying at each level: correct routes programmed, correct Shellstream handshake behavior, correct notarization fidelity, correct VPP plugin state (if applicable). This is a critical system-level integration test. diff --git a/Containerfile.build b/Containerfile.build new file mode 100644 index 0000000..ca17662 --- /dev/null +++ b/Containerfile.build @@ -0,0 +1,33 @@ +FROM docker.io/library/golang:1.23-bookworm AS builder + +WORKDIR /src +COPY go.mod go.sum ./ +RUN go mod download + +COPY . .
+RUN CGO_ENABLED=0 go build -o /bin/kedge-cni ./cmd/kedge-cni +RUN CGO_ENABLED=0 go build -o /bin/kedge-daemon ./cmd/kedge-daemon + +# --- CNI plugin image (installed into host /opt/cni/bin/) --- +FROM scratch AS cni +COPY --from=builder /bin/kedge-cni /kedge-cni + +# --- DaemonSet image --- +FROM docker.io/library/debian:bookworm-slim AS daemon +RUN apt-get update && apt-get install -y --no-install-recommends \ + wireguard-tools \ + iproute2 \ + iptables \ + python3 \ + python3-pip \ + python3-venv \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +COPY --from=builder /bin/kedge-daemon /usr/local/bin/kedge-daemon +COPY yang/ /opt/kedge/yang/ + +RUN python3 -m venv /opt/kedge/venv \ + && /opt/kedge/venv/bin/pip install --no-cache-dir -r /opt/kedge/yang/requirements.txt + +ENTRYPOINT ["kedge-daemon"] diff --git a/Containerfile.dev b/Containerfile.dev new file mode 100644 index 0000000..6d2dfb3 --- /dev/null +++ b/Containerfile.dev @@ -0,0 +1,27 @@ +FROM docker.io/library/golang:1.23-bookworm + +# Install system dependencies for CNI development. +RUN apt-get update && apt-get install -y --no-install-recommends \ + wireguard-tools \ + iproute2 \ + iptables \ + python3 \ + python3-pip \ + python3-venv \ + protobuf-compiler \ + ca-certificates \ + curl \ + git \ + jq \ + && rm -rf /var/lib/apt/lists/* + +# Install Go tools. +RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@latest \ + && go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest \ + && go install github.com/golangci/golangci-lint/cmd/golangci-lint@latest + +# Install buf for proto management. 
+RUN curl -sSL "https://github.com/bufbuild/buf/releases/latest/download/buf-$(uname -s)-$(uname -m)" \ + -o /usr/local/bin/buf && chmod +x /usr/local/bin/buf + +WORKDIR /src diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..37f895b --- /dev/null +++ b/Makefile @@ -0,0 +1,76 @@ +.PHONY: all build build-cni build-daemon test lint proto-gen yang-validate \ + container-build dev dev-build dev-exec clean + +CONTAINER_ENGINE ?= podman +COMPOSE ?= $(CONTAINER_ENGINE)-compose +DEV_CONTAINER := kedge-dev + +CNI_BINARY := bin/kedge-cni +DAEMON_BINARY := bin/kedge-daemon + +# ---------- Container-based builds (default) ---------- + +all: container-build + +container-build: + $(CONTAINER_ENGINE) build --target cni -t kedge-cni -f Containerfile.build . + $(CONTAINER_ENGINE) build --target daemon -t kedge-daemon -f Containerfile.build . + +# ---------- Dev environment (podman-compose) ---------- + +dev-build: + $(COMPOSE) build + +dev: + $(COMPOSE) up -d dev + +dev-exec: + $(COMPOSE) exec dev bash + +dev-down: + $(COMPOSE) down + +# ---------- In-container Go commands (via dev container) ---------- + +build: build-cni build-daemon + +build-cni: + $(COMPOSE) exec dev go build -o $(CNI_BINARY) ./cmd/kedge-cni + +build-daemon: + $(COMPOSE) exec dev go build -o $(DAEMON_BINARY) ./cmd/kedge-daemon + +test: + $(COMPOSE) exec dev go test ./... + +lint: + $(COMPOSE) exec dev golangci-lint run ./... + +proto-gen: + $(COMPOSE) exec dev buf generate + +mod-tidy: + $(COMPOSE) exec dev go mod tidy + +# ---------- YANG / Python ---------- + +yang-validate: + $(COMPOSE) exec yang-compiler pyang --strict models/sovereign-sdwan.yang + +python-test: + $(COMPOSE) exec yang-compiler python -m pytest tests/ + +# ---------- Local (if Go is installed) ---------- + +build-local: + go build -o $(CNI_BINARY) ./cmd/kedge-cni + go build -o $(DAEMON_BINARY) ./cmd/kedge-daemon + +test-local: + go test ./... 
+ +# ---------- Clean ---------- + +clean: + rm -rf bin/ + $(COMPOSE) down -v 2>/dev/null || true diff --git a/README.md b/README.md new file mode 100644 index 0000000..ebe2df9 --- /dev/null +++ b/README.md @@ -0,0 +1,55 @@ +# Kedge + +Dual-mode Kubernetes CNI plugin and node-level DaemonSet for attested connectivity between clusters and managed infrastructure. Part of the [Guildhouse](https://github.com/guildhouse-co/guildhouse) ecosystem. + +## Modes + +- **Overlay**: Tunnels over networks Kedge doesn't control (WireGuard/VXLAN). For cloud anchors, remote sites, MSP-managed customer environments. +- **Underlay**: Programs the actual physical network fabric via vendor SDK dispatch through Bascule. For sites where you own the iron. + +Both modes coexist at the same site. The capability token in a Shellstream handshake determines whether a session gets tunnel access (overlay) or infrastructure mutation authority (underlay). + +## Components + +| Component | Language | Description | +|-----------|----------|-------------| +| CNI Plugin | Go | Multus secondary network attachment (`net1`), route programming | +| DaemonSet | Go | WireGuard mesh, Shellstream termination, QM notarization, VLAN management | +| YANG Compiler | Python | Device-agnostic policy → vendor-specific config (FortiOS, VyOS, UniFi) | + +## Build + +```bash +make build # Build both binaries +make test # Run Go tests +make lint # golangci-lint +make proto-gen # Generate gRPC client code from protos +make yang-validate # Validate YANG models with pyang +make python-test # Run YANG compiler tests +``` + +## Project Structure + +``` +cmd/ # Binary entry points (kedge-cni, kedge-daemon) +internal/ # Go internal packages + cni/ # CNI plugin logic + mesh/ # WireGuard tunnel lifecycle (overlay) + vlan/ # VLAN interface management (underlay) + shellstream/ # Handshake termination, SAT validation + quartermaster/ # QM gRPC client, artifact types + underlay/ # YANG watch, compilation trigger + health/ # 
Prometheus metrics + headscale/ # Peer discovery + topology/ # Shared topology state + config/ # Configuration types +yang/ # YANG models, site configs, Python compiler +k8s/ # Kubernetes manifests +ansible/ # Bootstrap provisioning roles +terraform/ # Cloud anchor, mesh topology IaC +monitoring/ # Prometheus, Grafana dashboards +``` + +## Current Status + +**Phase 1** (active): Two-cluster connectivity (homelab k3s + cloud anchor k3s). Overlay: WireGuard mesh, Shellstream handshake, SessionTransitArtifact. Underlay: VLAN bridges, YANG model, FortiOS + VyOS compiler targets, NetworkMutationArtifact. diff --git a/ansible/inventory/group_vars/all.yml b/ansible/inventory/group_vars/all.yml new file mode 100644 index 0000000..96a6f92 --- /dev/null +++ b/ansible/inventory/group_vars/all.yml @@ -0,0 +1,17 @@ +--- +# Variables shared across all hosts. + +kedge_version: "0.1.0" +kedge_wg_port: 51820 +kedge_shellstream_port: 8443 +kedge_metrics_port: 9090 + +# SPIRE +spire_trust_domain: "guildhouse.local" +spire_socket_path: "/run/spire/sockets/agent.sock" + +# Quartermaster +qm_endpoint: "quartermaster.guildhouse.svc:50051" + +# k3s +k3s_version: "v1.31.4+k3s1" diff --git a/ansible/inventory/group_vars/cloud_anchor.yml b/ansible/inventory/group_vars/cloud_anchor.yml new file mode 100644 index 0000000..0dfc0ac --- /dev/null +++ b/ansible/inventory/group_vars/cloud_anchor.yml @@ -0,0 +1,10 @@ +--- +# Cloud anchor specific variables. + +kedge_mode: overlay +kedge_overlay_enabled: true +kedge_underlay_enabled: false + +# Headscale runs on the cloud anchor. +headscale_enabled: true +headscale_listen_addr: "0.0.0.0:8080" diff --git a/ansible/inventory/group_vars/homelab.yml b/ansible/inventory/group_vars/homelab.yml new file mode 100644 index 0000000..65f4434 --- /dev/null +++ b/ansible/inventory/group_vars/homelab.yml @@ -0,0 +1,18 @@ +--- +# Homelab specific variables. + +kedge_mode: both +kedge_overlay_enabled: true +kedge_underlay_enabled: true + +# Managed devices. 
+kedge_devices: + - name: fortigate.transit.local + type: fortios + address: 172.16.0.2 + - name: vyos.transit.local + type: vyos + address: 172.16.0.3 + - name: udr7.local + type: unifi + address: 192.168.1.1 diff --git a/ansible/inventory/hosts.yml b/ansible/inventory/hosts.yml new file mode 100644 index 0000000..bfed470 --- /dev/null +++ b/ansible/inventory/hosts.yml @@ -0,0 +1,21 @@ +--- +all: + children: + cloud_anchor: + hosts: + anchor01: + ansible_host: anchor.guildhouse.example.com + ansible_user: deploy + kedge_mode: overlay + kedge_cluster_id: cloud-anchor + homelab: + hosts: + homelab01: + ansible_host: 10.0.1.10 + ansible_user: tking + kedge_mode: both + kedge_cluster_id: homelab + kedge_vlans: + - {id: 100, parent: eth0, subnet: "172.16.0.0/24"} + - {id: 10, parent: eth0, subnet: "10.0.1.0/24"} + - {id: 50, parent: eth0, subnet: "192.168.50.0/24"} diff --git a/ansible/playbooks/bootstrap-cloud.yml b/ansible/playbooks/bootstrap-cloud.yml new file mode 100644 index 0000000..723fc70 --- /dev/null +++ b/ansible/playbooks/bootstrap-cloud.yml @@ -0,0 +1,15 @@ +--- +# Bootstrap a cloud anchor cluster (overlay mode only). +- name: Bootstrap cloud anchor + hosts: cloud_anchor + become: true + roles: + - base-hardening + - wireguard-bootstrap + - headscale + - k3s-bootstrap + - monitoring + vars: + kedge_mode: overlay + kedge_overlay_enabled: true + kedge_underlay_enabled: false diff --git a/ansible/playbooks/bootstrap-homelab.yml b/ansible/playbooks/bootstrap-homelab.yml new file mode 100644 index 0000000..458de6e --- /dev/null +++ b/ansible/playbooks/bootstrap-homelab.yml @@ -0,0 +1,14 @@ +--- +# Bootstrap a homelab cluster (overlay + underlay mode). 
+- name: Bootstrap homelab + hosts: homelab + become: true + roles: + - base-hardening + - wireguard-bootstrap + - k3s-bootstrap + - monitoring + vars: + kedge_mode: both + kedge_overlay_enabled: true + kedge_underlay_enabled: true diff --git a/ansible/playbooks/failover-test.yml b/ansible/playbooks/failover-test.yml new file mode 100644 index 0000000..80c54f6 --- /dev/null +++ b/ansible/playbooks/failover-test.yml @@ -0,0 +1,31 @@ +--- +# Simulate primary WAN failure and verify OOB path survivability. +- name: WAN failover test + hosts: homelab + become: true + tasks: + - name: Record current WireGuard peer status + ansible.builtin.command: + cmd: wg show wg0 + register: wg_before + changed_when: false + + - name: Display pre-failover state + ansible.builtin.debug: + var: wg_before.stdout_lines + + # NOTE: Actual failover simulation (iptables rules to block primary WAN) + # is too destructive for automated runs. This playbook verifies the + # monitoring and detection components are working. + + - name: Check Kedge DaemonSet dead peer detection + ansible.builtin.uri: + url: "http://localhost:{{ kedge_metrics_port }}/metrics" + return_content: true + register: metrics + + - name: Verify mesh peer metrics exist + ansible.builtin.assert: + that: + - "'kedge_mesh_peer_count' in metrics.content" + fail_msg: "Mesh peer metrics not found — DaemonSet may not be running" diff --git a/ansible/playbooks/mesh-health.yml b/ansible/playbooks/mesh-health.yml new file mode 100644 index 0000000..0daf210 --- /dev/null +++ b/ansible/playbooks/mesh-health.yml @@ -0,0 +1,30 @@ +--- +# Verify overlay mesh connectivity between all clusters. 
+- name: Mesh health check + hosts: all + become: true + tasks: + - name: Check WireGuard interface + ansible.builtin.command: + cmd: wg show wg0 + register: wg_status + changed_when: false + + - name: Display WireGuard status + ansible.builtin.debug: + var: wg_status.stdout_lines + + - name: Ping overlay peers + ansible.builtin.command: + cmd: "ping -c 3 -W 2 {{ item }}" + loop: "{{ wg_peers | default([]) | map(attribute='allowed_ips') | flatten | map('regex_replace', '/.*', '') | list }}" + register: ping_results + changed_when: false + failed_when: false + + - name: Check Kedge DaemonSet health + ansible.builtin.uri: + url: "http://localhost:{{ kedge_metrics_port }}/healthz" + return_content: true + register: health + failed_when: health.status != 200 diff --git a/ansible/playbooks/underlay-audit.yml b/ansible/playbooks/underlay-audit.yml new file mode 100644 index 0000000..3b5a5a4 --- /dev/null +++ b/ansible/playbooks/underlay-audit.yml @@ -0,0 +1,31 @@ +--- +# Audit underlay device configurations against YANG desired state. +# Detects config drift for the insurance observability story. +- name: Underlay configuration audit + hosts: homelab + become: true + tasks: + - name: Validate YANG model + ansible.builtin.command: + cmd: pyang --strict /opt/kedge/yang/models/sovereign-sdwan.yang + register: yang_validation + changed_when: false + + - name: Run YANG compiler for current site + ansible.builtin.command: + cmd: > + python3 /opt/kedge/yang/compiler/compile.py + --site-config /opt/kedge/yang/site-config/homelab.xml + --output-format json + register: compiled_config + changed_when: false + + - name: Display compiled desired state + ansible.builtin.debug: + var: compiled_config.stdout | from_json + + # TODO: Fetch actual device configs via Bascule SDK and compare + # against compiled desired state. Report drift. + - name: Report audit status + ansible.builtin.debug: + msg: "Underlay audit complete. Drift detection requires Bascule SDK integration." 
diff --git a/ansible/roles/base-hardening/defaults/main.yml b/ansible/roles/base-hardening/defaults/main.yml new file mode 100644 index 0000000..74f0375 --- /dev/null +++ b/ansible/roles/base-hardening/defaults/main.yml @@ -0,0 +1,2 @@ +--- +base_hardening_ssh_port: 22 diff --git a/ansible/roles/base-hardening/handlers/main.yml b/ansible/roles/base-hardening/handlers/main.yml new file mode 100644 index 0000000..124691f --- /dev/null +++ b/ansible/roles/base-hardening/handlers/main.yml @@ -0,0 +1,5 @@ +--- +- name: restart sshd + ansible.builtin.service: + name: sshd + state: restarted diff --git a/ansible/roles/base-hardening/tasks/main.yml b/ansible/roles/base-hardening/tasks/main.yml new file mode 100644 index 0000000..d363c61 --- /dev/null +++ b/ansible/roles/base-hardening/tasks/main.yml @@ -0,0 +1,59 @@ +--- +- name: Update apt cache + ansible.builtin.apt: + update_cache: true + cache_valid_time: 3600 + when: ansible_os_family == "Debian" + +- name: Install base packages + ansible.builtin.package: + name: + - ufw + - fail2ban + - unattended-upgrades + - wireguard-tools + - jq + - curl + state: present + +- name: Configure SSH hardening + ansible.builtin.lineinfile: + path: /etc/ssh/sshd_config + regexp: "{{ item.regexp }}" + line: "{{ item.line }}" + loop: + - {regexp: '^#?PermitRootLogin', line: 'PermitRootLogin no'} + - {regexp: '^#?PasswordAuthentication', line: 'PasswordAuthentication no'} + - {regexp: '^#?X11Forwarding', line: 'X11Forwarding no'} + notify: restart sshd + +- name: Enable UFW with default deny + community.general.ufw: + state: enabled + default: deny + direction: incoming + +- name: Allow SSH + community.general.ufw: + rule: allow + port: "22" + proto: tcp + +- name: Allow WireGuard + community.general.ufw: + rule: allow + port: "{{ kedge_wg_port }}" + proto: udp + +- name: Allow Kedge metrics + community.general.ufw: + rule: allow + port: "{{ kedge_metrics_port }}" + proto: tcp + src: "10.0.0.0/8" + +- name: Enable fail2ban + 
ansible.builtin.service: + name: fail2ban + enabled: true + state: started diff --git a/ansible/roles/headscale/defaults/main.yml b/ansible/roles/headscale/defaults/main.yml new file mode 100644 index 0000000..91bebf0 --- /dev/null +++ b/ansible/roles/headscale/defaults/main.yml @@ -0,0 +1,2 @@ +--- +headscale_version: "0.23.0" diff --git a/ansible/roles/headscale/tasks/main.yml b/ansible/roles/headscale/tasks/main.yml new file mode 100644 index 0000000..4e3f8ea --- /dev/null +++ b/ansible/roles/headscale/tasks/main.yml @@ -0,0 +1,43 @@ +--- +- name: Download Headscale binary + ansible.builtin.get_url: + url: "https://github.com/juanfont/headscale/releases/download/v{{ headscale_version }}/headscale_{{ headscale_version }}_linux_amd64" + dest: /usr/local/bin/headscale + mode: "0755" + +- name: Create Headscale config directory + ansible.builtin.file: + path: /etc/headscale + state: directory + mode: "0755" + +- name: Create Headscale data directory + ansible.builtin.file: + path: /var/lib/headscale + state: directory + mode: "0750" + +- name: Deploy Headscale systemd unit + ansible.builtin.copy: + content: | + [Unit] + Description=Headscale mesh coordinator + After=network.target + + [Service] + ExecStart=/usr/local/bin/headscale serve + Restart=always + RestartSec=5 + + [Install] + WantedBy=multi-user.target + dest: /etc/systemd/system/headscale.service + mode: "0644" + notify: restart headscale + +- name: Enable and start Headscale + ansible.builtin.service: + name: headscale + enabled: true + state: started + daemon_reload: true diff --git a/ansible/roles/k3s-bootstrap/defaults/main.yml b/ansible/roles/k3s-bootstrap/defaults/main.yml new file mode 100644 index 0000000..c42e6fe --- /dev/null +++ b/ansible/roles/k3s-bootstrap/defaults/main.yml @@ -0,0 +1,2 @@ +--- +k3s_version: "v1.31.4+k3s1" diff --git a/ansible/roles/k3s-bootstrap/tasks/main.yml b/ansible/roles/k3s-bootstrap/tasks/main.yml new file mode 100644 index 0000000..5a296ed --- /dev/null +++ 
b/ansible/roles/k3s-bootstrap/tasks/main.yml @@ -0,0 +1,37 @@ +--- +- name: Download k3s installer + ansible.builtin.get_url: + url: https://get.k3s.io + dest: /tmp/k3s-install.sh + mode: "0755" + +- name: Install k3s + ansible.builtin.command: + cmd: /tmp/k3s-install.sh + creates: /usr/local/bin/k3s + environment: + INSTALL_K3S_VERSION: "{{ k3s_version }}" + K3S_KUBECONFIG_MODE: "644" + +- name: Wait for k3s to be ready + ansible.builtin.command: + cmd: k3s kubectl get nodes + register: k3s_nodes + retries: 10 + delay: 5 + until: k3s_nodes.rc == 0 + +- name: Install Multus CNI + ansible.builtin.command: + cmd: k3s kubectl apply -f https://raw.githubusercontent.com/k8snetworkplumbingwg/multus-cni/master/deployments/multus-daemonset-thick.yml + changed_when: true + +- name: Create kedge namespace + ansible.builtin.shell: + cmd: k3s kubectl create namespace kedge --dry-run=client -o yaml | k3s kubectl apply -f - + changed_when: true + +- name: Deploy Kedge RBAC + ansible.builtin.command: + cmd: k3s kubectl apply -f /opt/kedge/k8s/rbac.yaml + changed_when: true diff --git a/ansible/roles/monitoring/defaults/main.yml b/ansible/roles/monitoring/defaults/main.yml new file mode 100644 index 0000000..3c1839b --- /dev/null +++ b/ansible/roles/monitoring/defaults/main.yml @@ -0,0 +1,2 @@ +--- +kedge_metrics_port: 9090 diff --git a/ansible/roles/monitoring/tasks/main.yml b/ansible/roles/monitoring/tasks/main.yml new file mode 100644 index 0000000..21198ae --- /dev/null +++ b/ansible/roles/monitoring/tasks/main.yml @@ -0,0 +1,30 @@ +--- +- name: Install node_exporter + ansible.builtin.package: + name: prometheus-node-exporter + state: present + when: ansible_os_family == "Debian" + +- name: Enable node_exporter + ansible.builtin.service: + name: prometheus-node-exporter + enabled: true + state: started + +- name: Create monitoring directory + ansible.builtin.file: + path: /etc/kedge/monitoring + state: directory + mode: "0755" + +- name: Deploy Prometheus scrape config for
Kedge + ansible.builtin.copy: + content: | + - job_name: kedge-daemon + static_configs: + - targets: ['localhost:{{ kedge_metrics_port }}'] + labels: + cluster: '{{ kedge_cluster_id }}' + node: '{{ inventory_hostname }}' + dest: /etc/kedge/monitoring/prometheus-kedge.yml + mode: "0644" diff --git a/ansible/roles/wireguard-bootstrap/defaults/main.yml b/ansible/roles/wireguard-bootstrap/defaults/main.yml new file mode 100644 index 0000000..6cb00ef --- /dev/null +++ b/ansible/roles/wireguard-bootstrap/defaults/main.yml @@ -0,0 +1,3 @@ +--- +wg_address: "10.100.0.1/24" +wg_peers: [] diff --git a/ansible/roles/wireguard-bootstrap/tasks/main.yml b/ansible/roles/wireguard-bootstrap/tasks/main.yml new file mode 100644 index 0000000..db0c698 --- /dev/null +++ b/ansible/roles/wireguard-bootstrap/tasks/main.yml @@ -0,0 +1,43 @@ +--- +- name: Ensure WireGuard is installed + ansible.builtin.package: + name: wireguard-tools + state: present + +- name: Create /etc/wireguard directory + ansible.builtin.file: + path: /etc/wireguard + state: directory + mode: "0700" + +- name: Generate WireGuard private key + ansible.builtin.command: + cmd: wg genkey + creates: /etc/wireguard/private.key + register: wg_privkey + +- name: Write private key + ansible.builtin.copy: + content: "{{ wg_privkey.stdout }}" + dest: /etc/wireguard/private.key + mode: "0600" + when: wg_privkey.changed + +- name: Derive public key + ansible.builtin.shell: + cmd: cat /etc/wireguard/private.key | wg pubkey + register: wg_pubkey + changed_when: false + +- name: Template WireGuard config + ansible.builtin.template: + src: wg0.conf.j2 + dest: /etc/wireguard/wg0.conf + mode: "0600" + notify: restart wireguard + +- name: Enable and start WireGuard + ansible.builtin.service: + name: "wg-quick@wg0" + enabled: true + state: started diff --git a/ansible/roles/wireguard-bootstrap/templates/wg0.conf.j2 b/ansible/roles/wireguard-bootstrap/templates/wg0.conf.j2 new file mode 100644 index 0000000..0fad8ca --- /dev/null +++ 
b/ansible/roles/wireguard-bootstrap/templates/wg0.conf.j2 @@ -0,0 +1,12 @@ +[Interface] +PrivateKey = {{ lookup('file', '/etc/wireguard/private.key') }} +ListenPort = {{ kedge_wg_port }} +Address = {{ wg_address | default('10.100.0.1/24') }} + +{% for peer in wg_peers | default([]) %} +[Peer] +PublicKey = {{ peer.public_key }} +Endpoint = {{ peer.endpoint }} +AllowedIPs = {{ peer.allowed_ips | join(', ') }} +PersistentKeepalive = 25 +{% endfor %} diff --git a/ansible/site.yml b/ansible/site.yml new file mode 100644 index 0000000..3117a7c --- /dev/null +++ b/ansible/site.yml @@ -0,0 +1,33 @@ +--- +# Kedge site bootstrap playbook. +# Provisions infrastructure for Kedge deployment at a new site. + +- name: Apply base hardening to all nodes + hosts: all + become: true + roles: + - base-hardening + +- name: Bootstrap WireGuard mesh + hosts: all + become: true + roles: + - wireguard-bootstrap + +- name: Deploy Headscale coordinator + hosts: cloud_anchor + become: true + roles: + - headscale + +- name: Bootstrap k3s clusters + hosts: all + become: true + roles: + - k3s-bootstrap + +- name: Deploy monitoring stack + hosts: all + become: true + roles: + - monitoring diff --git a/buf.gen.yaml b/buf.gen.yaml new file mode 100644 index 0000000..6de19ad --- /dev/null +++ b/buf.gen.yaml @@ -0,0 +1,10 @@ +version: v2 +plugins: + - remote: buf.build/protocolbuffers/go + out: api + opt: + - paths=source_relative + - remote: buf.build/grpc/go + out: api + opt: + - paths=source_relative diff --git a/buf.yaml b/buf.yaml new file mode 100644 index 0000000..f0bb51a --- /dev/null +++ b/buf.yaml @@ -0,0 +1,9 @@ +version: v2 +modules: + - path: proto +lint: + use: + - STANDARD +breaking: + use: + - FILE diff --git a/compose.yaml b/compose.yaml new file mode 100644 index 0000000..88cb0ff --- /dev/null +++ b/compose.yaml @@ -0,0 +1,42 @@ +# Kedge development environment — podman-compose / docker-compose +# Usage: podman-compose up -d + +services: + # Go build + dev container with hot reload 
support. + dev: + build: + context: . + dockerfile: Containerfile.dev + volumes: + - .:/src:z + - go-mod-cache:/go/pkg/mod + - go-build-cache:/root/.cache/go-build + working_dir: /src + command: sleep infinity + network_mode: host + privileged: true + cap_add: + - NET_ADMIN + - SYS_ADMIN + + # YANG compiler dev container (Python). + yang-compiler: + image: docker.io/library/python:3.12-slim + volumes: + - ./yang:/src/yang:z + working_dir: /src/yang + command: > + bash -c "pip install -r requirements.txt && sleep infinity" + + # Local Quartermaster mock for development. + # Uses the real QM image if available, otherwise a gRPC echo server. + quartermaster: + image: docker.io/library/golang:1.23-bookworm + command: > + bash -c "echo 'Quartermaster mock — replace with real QM image when available' && sleep infinity" + ports: + - "50051:50051" + +volumes: + go-mod-cache: + go-build-cache: diff --git a/docs/adding-a-site.md b/docs/adding-a-site.md new file mode 100644 index 0000000..229d05f --- /dev/null +++ b/docs/adding-a-site.md @@ -0,0 +1,10 @@ +# Adding a Site + +## Steps +1. Provision infrastructure (Terraform `cloud-anchor` module or manual) +2. Run `ansible-playbook ansible/playbooks/bootstrap-<site>.yml` +3. Generate WireGuard keys (`scripts/generate-wireguard-keys.sh`) +4. Add peer to mesh ConfigMap (`k8s/configmap-mesh.yaml`) +5. If underlay: create site YANG instance data (`yang/site-config/<site>.xml`) +6. Deploy Kedge DaemonSet +7. Verify with `ansible-playbook ansible/playbooks/mesh-health.yml` diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 0000000..65e29af --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,5 @@ +# Architecture + +Kedge is a dual-mode Kubernetes CNI plugin and DaemonSet providing attested connectivity between clusters and managed infrastructure. + +See [CLAUDE.md](../CLAUDE.md) for the complete architecture specification.
diff --git a/docs/cni-plugin.md b/docs/cni-plugin.md new file mode 100644 index 0000000..3793882 --- /dev/null +++ b/docs/cni-plugin.md @@ -0,0 +1,12 @@ +# CNI Plugin + +The Kedge CNI plugin is a Multus secondary network plugin that attaches a `net1` interface to pods requiring infrastructure access. + +## Operation +- Invoked by Multus on pod ADD/DEL/CHECK +- Creates veth pair, moves one end into pod network namespace +- Programs routes for both overlay (WireGuard) and underlay (VLAN bridge) subnets +- Applies SVID-based network policy + +## Configuration +Configured via Multus `NetworkAttachmentDefinition` — see `k8s/network-attachment.yaml`. diff --git a/docs/failover-runbook.md b/docs/failover-runbook.md new file mode 100644 index 0000000..2f1b23e --- /dev/null +++ b/docs/failover-runbook.md @@ -0,0 +1,16 @@ +# Failover Runbook + +## Primary WAN Failure +1. Dead peer detection triggers on WireGuard handshake timeout +2. If secondary circuit configured, mesh manager fails over +3. Overlay sessions are re-established over secondary path +4. Underlay operations pause — device management requires network path to device + +## Verification +```bash +ansible-playbook ansible/playbooks/failover-test.yml +ansible-playbook ansible/playbooks/mesh-health.yml +``` + +## Recovery +When primary WAN recovers, mesh manager detects restored handshake and fails back. diff --git a/docs/hfl-kedge-integration.md b/docs/hfl-kedge-integration.md new file mode 100644 index 0000000..8d053e8 --- /dev/null +++ b/docs/hfl-kedge-integration.md @@ -0,0 +1,419 @@ +# HFL–Kedge Network Integration + +How Host Function Layer WASM module Shellstream sessions interact with Kedge's CNI plugin and DaemonSet. + +> **Cross-references:** +> Substrate [`docs/shell-primitive.md`](../../substrate/docs/shell-primitive.md) — BPF map structures, eBPF hooks, shell lifecycle. +> Substrate [`docs/hfl-spec.md`](../../substrate/docs/hfl-spec.md) — WASM module grant attenuation, host function categories. 
+> Kedge [`internal/cni/plugin.go`](../internal/cni/plugin.go) — CNI `CmdAdd` flow. +> Kedge [`internal/cni/policy.go`](../internal/cni/policy.go) — SVID-scoped policy evaluation. +> Kedge [`internal/quartermaster/session_transit.go`](../internal/quartermaster/session_transit.go) — Session transit artifact type. + +--- + +## 1. Overview + +Kedge is the network data plane for the Guildhouse infrastructure. It operates as both a Kubernetes CNI plugin (creating the `net1` secondary interface via Multus) and a DaemonSet managing WireGuard tunnels, VLAN interfaces, and Shellstream session termination. The Host Function Layer (HFL) is a WASM-based runtime embedded in Bascule that exposes structured host functions — Quartermaster queries, telemetry reads, infrastructure state, YANG operations — to application workloads running inside attested shells. + +When an HFL WASM module calls a host function that requires network access, the resulting Shellstream session traverses Kedge's network stack. The HFL does not interact with Kedge directly — it opens Shellstream connections that transit Kedge's `net1` interface like any other pod traffic. What makes HFL sessions distinct is the **three-layer enforcement model** that governs them: + +1. **Kernel layer (eBPF shell):** The `shell_map` BPF map restricts which destinations and modes the pod's cgroup can reach. Kedge's CNI plugin reads `allowed_mode` from this map at interface creation time. The eBPF `CGROUP_SOCK_ADDR` hook enforces `allowed_targets` at connection time. These checks happen in the kernel — no userspace daemon is in the hot path. + +2. **Application layer (HFL grants):** Before a host function opens any Shellstream session, the HFL runtime validates the WASM module's `ModuleGrant` against the requested operation. If the grant does not cover the operation (wrong capabilities, wrong registry, wrong mode), the request is rejected before it reaches the network. + +3. 
**Cryptographic layer (SAT attenuation):** The per-request `RequestToken` carried in the Shellstream frame header is validated independently by the receiving service (Quartermaster, Bascule, Prometheus). Each layer operates without trusting the others. + +These three layers compose but do not depend on each other. A compromised HFL runtime cannot bypass the eBPF shell. A compromised eBPF program cannot forge SAT tokens. This is defense in depth through independent enforcement points, not redundant checks in a single trust domain. + +--- + +## 2. Shared BPF Map Architecture + +Kedge and the shell primitive share state through `shell_map`, a `BPF_MAP_TYPE_HASH` map keyed by `cgroup_id`. There is no API intermediary, no sidecar, no IPC channel. Both the eBPF enforcement programs and Kedge's CNI plugin read from the same map. + +### Map definition + +```c +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, __u64); /* cgroup_id */ + __type(value, struct shell_state); + __uint(max_entries, 1024); +} shell_map SEC(".maps"); +``` + +### `shell_state` layout (declared fields sum to 212 bytes; 216 bytes with natural alignment, since `accord_id` is padded to an 8-byte boundary) + +```c +struct shell_state { + __u8 sat_hash[32]; /* SHA-256 of the bound SAT */ + __u32 capabilities; /* Capability bitmask */ + __u32 allowed_mode; /* Overlay/underlay mode bitmask */ + __u32 num_allowed_targets; /* Count of populated entries */ + struct shell_target allowed_targets[16]; /* Destination whitelist */ + __u64 accord_id; /* Hash of the governing accord */ + __u64 session_counter; /* Monotonic counter for correlation IDs */ + __u64 created_at; /* Nanosecond timestamp (CLOCK_MONOTONIC) */ + __u64 last_updated; /* Nanosecond timestamp (CLOCK_MONOTONIC) */ + __u32 flags; /* Shell state flags */ + __u32 _padding; /* Alignment to 8-byte boundary */ +}; +``` + +```c +struct shell_target { + __be32 addr; /* IPv4 destination (network byte order) */ + __be16 port; /* Destination port (0 = any port) */ + __u8 protocol; /* IPPROTO_TCP (6), IPPROTO_UDP (17), 0 = any */ + __u8 flags; /* Per-target
flags */ +}; +``` + +### Fields relevant to Kedge + +| Field | Kedge usage | +|-------|-------------| +| `allowed_mode` | CNI plugin reads this at `CmdAdd` time to decide which route types to program on `net1`. `OVERLAY` (0x1) → WireGuard routes. `UNDERLAY` (0x2) → VLAN bridge routes. Both bits → both route sets. | +| `allowed_targets` | eBPF `CGROUP_SOCK_ADDR` hook validates outbound connections against this whitelist at connect time. Kedge does not enforce targets — the kernel does. | +| `capabilities` | `READ` (0x1), `PROPOSE` (0x2), `MUTATE` (0x4), `ADMIN` (0x8). Kedge uses this when building `SessionTransitArtifact` records. The DaemonSet's health endpoint exposes capability distribution as Prometheus metrics. | +| `flags` | `ACTIVE` (0x1), `FROZEN` (0x2), `DRAINING` (0x4). DaemonSet watches flag transitions. When a shell enters `FROZEN`, the DaemonSet stops accepting new Shellstream handshakes for that session. When `DRAINING`, it tears down associated tunnel state. | +| `sat_hash` | Included in every `SessionTransitArtifact` for Quartermaster governance chain linkage. | +| `accord_id` | Used by the DaemonSet to look up the local accord policy that governs capability grants for incoming Shellstream sessions. | + +### Access pattern + +The BPF map is pinned at `/sys/fs/bpf/shell_map`. Kedge accesses it from Go via `cilium/ebpf` map operations: + +- **CNI plugin** (`CmdAdd`): Looks up the pod's `cgroup_id`, reads `allowed_mode` and `flags`. This is a single `MapLookupElem` call — O(1), no syscall overhead beyond the BPF command. If no entry exists (shell is `UNBOUND`), the CNI plugin installs no routes on `net1`, effectively isolating the pod. +- **DaemonSet**: Subscribes to `shell_map` changes via the `shell_events` ring buffer (see `ShellProgrammer.subscribe_events()` in shell-primitive.md §7.1). Reacts to `bind_shell`, `freeze_shell`, `drain_shell`, and `destroy_shell` events. + +--- + +## 3. 
Routing HFL Shellstream Sessions Through Kedge + +An HFL-originated Shellstream session follows the same packet path as any other pod egress through `net1`. The difference is that three independent checks have already passed before the first SYN reaches the wire. This section traces the four common HFL session types. + +### 3.1 Overlay: HFL → Quartermaster (cross-cluster query) + +A WASM module calls `query-records` on the `session-transit` registry hosted by a remote Quartermaster instance. + +``` +WASM module + │ host.quartermaster.query_records("session-transit", filter, limit) + ▼ +HFL Runtime + │ 1. Validate ModuleGrant: capabilities includes READ, + │ registries includes "session-transit", + │ allowed_mode includes OVERLAY + │ 2. Construct RequestToken (single_use=true, expires=30s) + │ 3. Open TCP connection to Quartermaster endpoint + ▼ +eBPF CGROUP_SOCK_ADDR hook + │ 4. Look up cgroup_id in shell_map + │ 5. Check flags: ACTIVE (not FROZEN/DRAINING) + │ 6. Check destination against allowed_targets[0..N] + │ 7. Permit or deny at kernel level + ▼ +net1 interface (Kedge CNI) + │ 8. Packet hits overlay route → forwarded to wg0 + ▼ +WireGuard tunnel (wg0) + │ 9. Encrypted transit to remote cluster + ▼ +Remote Kedge DaemonSet (Shellstream listener) + │ 10. 3-way handshake: ATTEST-INIT / ATTEST-VERIFY / ATTEST-CONFIRM + │ 11. Validate SAT, evaluate capability against local accord + │ 12. Record SessionTransitArtifact + ▼ +Quartermaster + │ 13. Validate RequestToken independently + │ 14. Execute query, return results +``` + +Steps 1–2 are HFL enforcement (application layer). Steps 4–7 are eBPF enforcement (kernel layer). Step 8 depends on Kedge having programmed overlay routes at `CmdAdd` time — which only happens if `shell_state.allowed_mode & OVERLAY != 0`. Steps 10–13 are cryptographic enforcement (SAT + RequestToken). + +### 3.2 Underlay: HFL → Bascule SDK dispatch (device read) + +A WASM module calls `get-device-state` for a FortiGate on the local VLAN. 
+ +``` +WASM module + │ host.infrastructure.get_device_state("fortigate.transit.local") + ▼ +HFL Runtime + │ 1. Validate ModuleGrant: capabilities includes READ, + │ allowed_mode includes UNDERLAY + │ 2. Construct RequestToken + │ 3. Open Shellstream to Bascule (may be local or cross-cluster) + ▼ +eBPF CGROUP_SOCK_ADDR hook + │ 4. Validate against shell_map (same as §3.1) + ▼ +net1 interface (Kedge CNI) + │ 5. Packet hits underlay route → forwarded via VLAN bridge + ▼ +VLAN bridge (br-mgmt / tagged interface) + │ 6. Frame reaches managed device on infrastructure VLAN + ▼ +Bascule SDK dispatch + │ 7. Validate RequestToken + │ 8. Execute read-only SDK call (e.g., fortiosapi.get) + │ 9. Return device state to WASM module +``` + +The underlay path differs at step 5: traffic exits through the VLAN bridge rather than the WireGuard tunnel. The CNI plugin only programs underlay routes if `shell_state.allowed_mode & UNDERLAY != 0`. + +### 3.3 Local: HFL → YANG compiler (no Shellstream) + +YANG host functions (`validate-instance`, `compile-dry-run`, `diff-state`) call the Python YANG compiler within the same container. No Shellstream session is opened and no traffic traverses `net1`. The HFL runtime invokes the compiler as a local subprocess. Kedge is not involved. + +This is worth documenting because it is the exception: HFL host functions do not always produce network traffic. The YANG category is purely local computation. + +### 3.4 Local: HFL → Prometheus (telemetry query) + +Telemetry queries (`query-prometheus`, `query-timescale`) may target a Prometheus instance on the same cluster. When the destination is local (same node or same cluster overlay), the path is: + +``` +WASM module → HFL Runtime → eBPF check → net1 → overlay route → wg0 → local Prometheus +``` + +Even local-cluster queries transit `net1` and the WireGuard tunnel because Kedge treats all overlay traffic uniformly. 
There is no "local bypass" optimization — this simplifies the enforcement model by ensuring every session, regardless of locality, passes through the same BPF map checks. + +--- + +## 4. CNI Plugin: `shell_state`-Aware Route Programming + +The CNI plugin's `CmdAdd` handler programs routes on the pod's `net1` interface based on `shell_state.allowed_mode`. This is the critical integration point: Kedge decides at pod creation time what network paths are available. + +### Current `CmdAdd` flow (from `internal/cni/plugin.go`) + +``` +ParseNetConf → loadMeshTopology → createVethPair → attachOverlayRoutes + → attachUnderlayRoutes → programPodRoutes → applySVIDPolicy +``` + +### Extended flow with `shell_state` integration + +``` +ParseNetConf → loadMeshTopology → createVethPair + → lookupShellState(cgroup_id) + → if shell_state is UNBOUND: return (no routes, pod is isolated) + → if flags & ACTIVE == 0: return (shell not active, pod is isolated) + → if allowed_mode & OVERLAY: attachOverlayRoutes + → if allowed_mode & UNDERLAY: attachUnderlayRoutes + → programPodRoutes (only for modes granted above) + → applySVIDPolicy (SVID check is orthogonal to shell_state) +``` + +### `lookupShellState` pseudocode + +```go +func lookupShellState(cgroupID uint64) (*ShellState, error) { + m, err := ebpf.LoadPinnedMap("/sys/fs/bpf/shell_map", nil) + if err != nil { + return nil, fmt.Errorf("shell_map not available: %w", err) + } + defer m.Close() + + var state ShellState + if err := m.Lookup(cgroupID, &state); err != nil { + // No entry = UNBOUND. Fail closed: no routes. + return nil, nil + } + return &state, nil +} +``` + +### Route programming rules + +| `allowed_mode` | Overlay routes | Underlay routes | Effect | +|-----------------|---------------|-----------------|--------| +| `0x0` (none) | No | No | Pod is network-isolated on `net1`. | +| `0x1` (OVERLAY) | Yes | No | Pod can reach remote clusters via WireGuard. Cannot reach infrastructure VLANs. 
| +| `0x2` (UNDERLAY) | No | Yes | Pod can reach infrastructure devices on local VLANs. Cannot reach remote clusters. | +| `0x3` (both) | Yes | Yes | Full access. Typical for admin-tier shells. | + +### Fail-closed behavior + +If `shell_map` is not pinned (e.g., the shell primitive is not yet loaded), the CNI plugin treats this as equivalent to `UNBOUND` — no routes are programmed. This is intentional: a pod without an active shell binding has no business reaching Kedge-managed networks. + +If `shell_map` exists but contains no entry for the pod's `cgroup_id`, the same fail-closed behavior applies. The CNI plugin does not fall back to permissive routing. + +--- + +## 5. DaemonSet: `shell_state` Watch and Lifecycle Coordination + +The DaemonSet reacts to shell lifecycle transitions by adjusting tunnel state, Shellstream session handling, and telemetry. + +### Event subscription + +The DaemonSet calls `ShellProgrammer.subscribe_events()` (implemented via the `shell_events` BPF ring buffer) to receive shell state transitions. In Go, this is accessed through the `cilium/ebpf` ring buffer reader: + +```go +reader, err := ringbuf.NewReader(shellEventsMap) +for { + record, err := reader.Read() + event := parseShellEvent(record.RawSample) + switch event.Type { + case EventBindShell: handleBind(event) + case EventFreezeShell: handleFreeze(event) + case EventDrainShell: handleDrain(event) + case EventDestroyShell: handleDestroy(event) + } +} +``` + +### Lifecycle event handling + +| Event | DaemonSet action | +|-------|-----------------| +| `bind_shell` | Register the shell's `cgroup_id` in local session tracking. If the shell has `allowed_mode & OVERLAY`, ensure the associated WireGuard peers are configured. Log the binding with `sat_hash` and `accord_id`. | +| `freeze_shell` | Stop accepting new Shellstream handshakes for sessions associated with this `cgroup_id`. Existing established sessions continue (the shell is being rotated, not destroyed). 
Increment `kedge_shell_freeze_total` counter. | +| `drain_shell` | Stop accepting new handshakes AND begin tearing down established Shellstream sessions for this `cgroup_id`. The mesh manager marks associated peers as draining. Increment `kedge_shell_drain_total` counter. | +| `destroy_shell` | Remove all local state for this `cgroup_id`. Clean up any tunnel state that was exclusively serving this shell. Emit final telemetry. Increment `kedge_shell_destroy_total` counter. | + +### SAT rotation coordination + +When Bascule rotates a SAT, it freezes the shell, updates `sat_hash`, then unfreezes. The DaemonSet sees two events: `freeze_shell` followed by an `update_shell` (which carries the new `sat_hash`), then an implicit unfreeze (the `FROZEN` flag is cleared). During the freeze window, no new Shellstream handshakes are accepted, but existing sessions continue. This prevents a window where the old SAT is invalid but the new one hasn't propagated — the freeze ensures atomicity from the network's perspective. + +--- + +## 6. Quartermaster Records for HFL Sessions + +HFL-originated sessions produce the same `SessionTransitArtifact` and `NetworkMutationArtifact` records as any other Shellstream session. However, HFL sessions carry additional provenance that should be captured for auditability. + +### Extended `SessionTransitArtifact` + +The existing artifact (from `internal/quartermaster/session_transit.go`) is extended with optional HFL-specific fields: + +```go +type SessionTransitArtifact struct { + // ... existing fields (session_id, sat_hash, source/dest cluster, etc.) + + // HFL provenance (optional — zero values when session is not HFL-originated). 
+ HFLModuleName string `json:"hfl_module_name,omitempty"` + HFLFunctionName string `json:"hfl_function_name,omitempty"` + HFLGrantHash []byte `json:"hfl_grant_hash,omitempty"` + HFLRequestTokenHash []byte `json:"hfl_request_token_hash,omitempty"` +} +``` + +| Field | Source | Purpose | +|-------|--------|---------| +| `hfl_module_name` | Shellstream frame header, set by HFL runtime | Identifies which WASM module initiated the session (e.g., `"telemetry-query"`). | +| `hfl_function_name` | Shellstream frame header, set by HFL runtime | Identifies which host function was called (e.g., `"query_flow_records"`). | +| `hfl_grant_hash` | SHA-256 of the `ModuleGrant` | Links the session back to the specific grant that authorized it. Enables audit queries like "show all sessions authorized by grant X". | +| `hfl_request_token_hash` | SHA-256 of the `RequestToken` | Links the session to the single-use per-request token. Since tokens are single-use, this provides a 1:1 mapping from token to session. | + +These fields are populated by the Kedge DaemonSet's Shellstream listener when the `ATTEST-INIT` message includes HFL provenance headers. Non-HFL sessions (direct Shellstream connections) leave these fields empty. + +### `NetworkMutationArtifact` — no HFL extension needed + +Network mutations are dispatched through Bascule SDK, not through HFL host functions. The HFL can _propose_ mutations (via `submit-proposal` with `PROPOSE` capability), but the actual execution goes through Bascule's ceremony workflow. By the time Kedge records a `NetworkMutationArtifact`, the mutation has already been approved through a ceremony and dispatched by Bascule — the HFL origin is captured in the ceremony's proposal chain, not in the mutation artifact itself. + +### Canonical serialization + +The `CanonicalBytes()` method on `SessionTransitArtifact` must include the HFL fields when present. The RFC 8785 (JCS) canonical form ensures deterministic hashing regardless of field ordering. 
Fields with zero values (`omitempty`) are excluded from the canonical form to maintain backward compatibility with non-HFL session records. + +--- + +## 7. Mode Authorization: End-to-End Flow + +This section traces the complete authorization chain for an HFL-originated overlay session, showing how the three enforcement layers compose. + +### Scenario + +A WASM module `"proposal-builder"` calls `compile-dry-run` (local, no network — see §3.3), then calls `submit-proposal` to Quartermaster (overlay), then an operator approves the ceremony and Bascule dispatches the mutation (underlay). We trace the overlay leg (proposal submission) end to end. + +### Step 1: SAT issuance (before Kedge) + +The Vigil ceremony issues a SAT with: +- `capabilities`: `READ | PROPOSE` (0x3) +- `allowed_mode`: `OVERLAY` (0x1) — this module does not need underlay access +- `scope`: registries `["network-mutation", "session-transit"]` + +Bascule calls `bind_shell(cgroup_id, shell_state)` with these values. The `shell_map` entry is created. + +### Step 2: Grant attenuation (HFL) + +The HFL runtime derives a `ModuleGrant` for `"proposal-builder"`: + +``` +ModuleGrant { + module_hash: sha256("proposal-builder.wasm"), + parent_sat_hash: shell_state.sat_hash, + capabilities: PROPOSE (0x2), // attenuated from READ|PROPOSE + registries: ["network-mutation"], // attenuated from full list + allowed_mode: OVERLAY (0x1), // same as SAT (cannot widen) + expires_at: min(sat.expires, now + 1h), +} +``` + +Attenuation is strictly narrowing: `grant.capabilities & sat.capabilities == grant.capabilities`. The grant cannot add `MUTATE` or `UNDERLAY` that the SAT does not carry. 
+ +### Step 3: Request token (HFL, per-call) + +When the module calls `submit-proposal`, the HFL constructs: + +``` +RequestToken { + grant_hash: sha256(ModuleGrant), + operation: "submit-proposal", + parameters_hash: sha256(serialized_proposal), + single_use: true, + expires_at: now + 30s, +} +``` + +### Step 4: eBPF enforcement (kernel) + +The `connect()` syscall triggers the `CGROUP_SOCK_ADDR` hook: + +1. Look up `cgroup_id` in `shell_map` → finds the `shell_state`. +2. Check `flags`: `ACTIVE` is set, `FROZEN` is not → proceed. +3. Check destination IP/port against `allowed_targets[0..num_allowed_targets]` → match found. +4. Permit the connection. + +If the destination were on an underlay VLAN (and `allowed_mode` only has `OVERLAY`), this check would still pass at the eBPF level — `allowed_targets` is an explicit whitelist, not mode-based. The mode enforcement happens at the routing level (step 5). + +### Step 5: Kedge route enforcement (CNI) + +The packet exits the pod via `net1`. Because the CNI plugin read `allowed_mode = OVERLAY` at `CmdAdd` time, only overlay (WireGuard) routes are programmed. The destination Quartermaster endpoint resolves to an overlay route through `wg0`. If the destination were on an underlay VLAN, there would be no route — the packet would be dropped by the kernel's routing table. This is Kedge's enforcement: not a firewall rule, but the absence of a route. + +### Step 6: WireGuard transit (Kedge mesh) + +The packet traverses the WireGuard tunnel to the remote cluster. Kedge's mesh manager maintains the peer configuration and monitors tunnel health. + +### Step 7: Remote Shellstream handshake (Kedge DaemonSet) + +The remote Kedge DaemonSet receives the connection on its Shellstream listener: + +1. Read `ATTEST-INIT` (includes SAT token, HFL provenance headers). +2. Validate SAT against local SPIRE trust bundle. +3. Evaluate capability request against local accord policy (see `internal/shellstream/capability.go`). +4. 
Send `ATTEST-VERIFY` with granted capabilities. +5. Read `ATTEST-CONFIRM`. +6. Record `SessionTransitArtifact` with HFL fields populated. + +### Step 8: Quartermaster validation (independent) + +Quartermaster receives the `submit-proposal` RPC with the `RequestToken` in metadata: + +1. Verify `RequestToken.signature` against the HFL runtime's signing key. +2. Check `single_use` — mark token as consumed. +3. Check `expires_at` — reject if expired. +4. Verify `grant_hash` chains back to a valid SAT. +5. Check that the grant's `capabilities` include `PROPOSE`. +6. Accept the proposal for ceremony processing. + +### Enforcement summary + +| Layer | What it checks | Where it runs | Failure mode | +|-------|---------------|---------------|-------------| +| HFL grant | Module has appropriate capability and registry scope | Userspace (HFL runtime, same pod) | Request rejected before network | +| eBPF shell | Destination is in `allowed_targets`, shell is `ACTIVE` | Kernel (`CGROUP_SOCK_ADDR` hook) | `connect()` returns `EACCES` | +| Kedge routes | `allowed_mode` permits the network path type | Kernel (routing table, programmed by CNI) | Packet dropped (no route to host) | +| Shellstream handshake | SAT is valid, capabilities match local accord | Userspace (remote Kedge DaemonSet) | Handshake rejected at `ATTEST-VERIFY` | +| Service token | `RequestToken` is valid, single-use, unexpired | Userspace (Quartermaster/Bascule) | RPC rejected with `PermissionDenied` | + +Five independent checks, three distinct trust domains (pod kernel, pod userspace, remote service), two cryptographic validations (SAT, RequestToken). No single point of compromise grants unauthorized access. diff --git a/docs/mesh-topology.md b/docs/mesh-topology.md new file mode 100644 index 0000000..5436587 --- /dev/null +++ b/docs/mesh-topology.md @@ -0,0 +1,12 @@ +# Mesh Topology + +The WireGuard mesh connects clusters over overlay tunnels. 
Topology state is shared between the DaemonSet (writer) and CNI plugin (reader) via `/etc/kedge/mesh.json`. + +## Peer Discovery +- Static peers from ConfigMap (`k8s/configmap-mesh.yaml`) +- Dynamic discovery via Headscale (Phase 3) + +## Health Monitoring +- Periodic peer handshake checks +- Dead peer detection with configurable timeout +- Prometheus metrics: `kedge_mesh_peer_count`, `kedge_mesh_tunnel_up` diff --git a/docs/overlay-mode.md b/docs/overlay-mode.md new file mode 100644 index 0000000..00dcc85 --- /dev/null +++ b/docs/overlay-mode.md @@ -0,0 +1,9 @@ +# Overlay Mode + +WireGuard-based tunneling over networks Kedge doesn't control. Used for cloud anchors, remote sites, and MSP-managed customer environments. + +## Components +- WireGuard mesh manager (`internal/mesh/`) +- Headscale peer discovery (`internal/headscale/`) +- Shellstream handshake termination (`internal/shellstream/`) +- SessionTransitArtifact notarization (`internal/quartermaster/`) diff --git a/docs/quartermaster-integration.md b/docs/quartermaster-integration.md new file mode 100644 index 0000000..d802a77 --- /dev/null +++ b/docs/quartermaster-integration.md @@ -0,0 +1,13 @@ +# Quartermaster Integration + +Kedge DaemonSet is a Quartermaster client, submitting notarization records for all governed operations. + +## Artifact Types +- **SessionTransitArtifact**: Recorded when a Shellstream session crosses the inter-cluster boundary (overlay) +- **NetworkMutationArtifact**: Recorded when physical infrastructure is mutated via Bascule SDK (underlay) + +## Protocol +gRPC client connecting to `GovernanceService` (intent lifecycle) and `QuartermasterNotary` (merkle anchoring). + +## Serialization +RFC 8785 (JCS) canonical JSON for deterministic hashing and merkle tree inclusion. 
diff --git a/docs/shellstream-boundary.md b/docs/shellstream-boundary.md new file mode 100644 index 0000000..09a1bbf --- /dev/null +++ b/docs/shellstream-boundary.md @@ -0,0 +1,11 @@ +# Shellstream Boundary + +Shellstream is the attested stream protocol for inter-cluster communication. Kedge DaemonSet terminates Shellstream handshakes at the cluster boundary. + +## 3-Way Handshake +1. **ATTEST-INIT**: Remote peer sends SAT token with capability request +2. **ATTEST-VERIFY**: Local Kedge validates SAT, evaluates capabilities against accord policy, sends attenuated grant +3. **ATTEST-CONFIRM**: Session established with granted capabilities + +## Mode Selection +The capability token determines overlay vs. underlay access, governed by MSP trust tier mappings in the local accord policy. diff --git a/docs/underlay-mode.md b/docs/underlay-mode.md new file mode 100644 index 0000000..47e6c85 --- /dev/null +++ b/docs/underlay-mode.md @@ -0,0 +1,16 @@ +# Underlay Mode + +Programs the actual physical network fabric via vendor SDK dispatch through Bascule. Used for sites where you own the infrastructure. + +## Components +- VLAN interface manager (`internal/vlan/`) +- YANG watcher and compilation trigger (`internal/underlay/`) +- YANG compiler (`yang/compiler/`) +- NetworkMutationArtifact notarization (`internal/quartermaster/`) + +## YANG Pipeline +1. YANG instance data changes (ConfigMap/CRD) +2. Watcher triggers compilation +3. Compiler produces vendor-specific payloads (FortiOS, VyOS, UniFi) +4. Payloads dispatched via Bascule SDK +5. Each mutation notarized via Quartermaster diff --git a/docs/yang-compiler.md b/docs/yang-compiler.md new file mode 100644 index 0000000..53f0b19 --- /dev/null +++ b/docs/yang-compiler.md @@ -0,0 +1,17 @@ +# YANG Compiler + +Transforms device-agnostic YANG site configuration into vendor-specific payloads. + +## Pipeline +1. `sovereign-sdwan.yang` defines the schema +2. Per-site XML instance data (e.g., `homelab.xml`) +3. 
Compiler validates and produces vendor payloads +4. Vendor targets: FortiOS REST, VyOS NETCONF, UniFi API + +## Usage +```bash +python3 -m compiler.compile --site-config site-config/homelab.xml --output-format json +``` + +## Adding a Vendor +Create a new `to_<vendor>.py` module in `yang/compiler/` implementing `compile_zones()`. diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..4e7da99 --- /dev/null +++ b/go.mod @@ -0,0 +1,19 @@ +module github.com/guildhouse-co/kedge + +go 1.23 + +require ( + github.com/containernetworking/cni v1.2.3 + github.com/containernetworking/plugins v1.6.2 + github.com/prometheus/client_golang v1.20.5 + github.com/spiffe/go-spiffe/v2 v2.4.0 + github.com/vishvananda/netlink v1.3.0 + github.com/vishvananda/netns v0.0.5 + go.uber.org/zap v1.27.0 + golang.zx2c4.com/wireguard/wgctrl v0.0.0-20230429144221-925a1e7659e6 + google.golang.org/grpc v1.69.4 + google.golang.org/protobuf v1.36.3 + k8s.io/api v0.32.1 + k8s.io/apimachinery v0.32.1 + k8s.io/client-go v0.32.1 +) diff --git a/internal/cni/bridge.go b/internal/cni/bridge.go new file mode 100644 index 0000000..065da59 --- /dev/null +++ b/internal/cni/bridge.go @@ -0,0 +1,37 @@ +package cni + +import ( + "fmt" + + "github.com/vishvananda/netlink" + + "github.com/guildhouse-co/kedge/internal/topology" +) + +// attachUnderlayRoutes connects the host-side veth to VLAN bridge interfaces +// for underlay mode routing. +func attachUnderlayRoutes(hostVeth string, routes []SubnetRoute, topo *topology.MeshTopology) error { + if len(routes) == 0 { + return nil + } + + hostLink, err := netlink.LinkByName(hostVeth) + if err != nil { + return fmt.Errorf("host veth %s not found: %w", hostVeth, err) + } + + for _, route := range routes { + bridgeName := route.Via // e.g., "vlan100" + bridge, err := netlink.LinkByName(bridgeName) + if err != nil { + return fmt.Errorf("bridge %s not found for route %s: %w", bridgeName, route.Dst, err) + } + + // Attach host veth to the VLAN bridge. 
+ if err := netlink.LinkSetMaster(hostLink, bridge); err != nil { + return fmt.Errorf("failed to attach %s to bridge %s: %w", hostVeth, bridgeName, err) + } + } + + return nil +} diff --git a/internal/cni/config.go b/internal/cni/config.go new file mode 100644 index 0000000..65e9f6b --- /dev/null +++ b/internal/cni/config.go @@ -0,0 +1,36 @@ +package cni + +import ( + "encoding/json" + + "github.com/containernetworking/cni/pkg/types" +) + +// NetConf is the CNI network configuration parsed from the NetworkAttachmentDefinition. +type NetConf struct { + types.NetConf + + // MeshConfig is the path to the mesh topology file (e.g., /etc/kedge/mesh.json). + MeshConfig string `json:"meshConfig"` + + // UnderlayRoutes are subnets reachable via VLAN bridge interfaces (underlay mode). + UnderlayRoutes []SubnetRoute `json:"underlayRoutes,omitempty"` + + // OverlayRoutes are subnets reachable via WireGuard/VXLAN tunnels (overlay mode). + OverlayRoutes []SubnetRoute `json:"overlayRoutes,omitempty"` +} + +// SubnetRoute maps a destination CIDR to a network interface. +type SubnetRoute struct { + Dst string `json:"dst"` // e.g., "172.16.0.0/24" + Via string `json:"via"` // e.g., "vlan100" or "wg0" +} + +// ParseNetConf parses the CNI network configuration from raw bytes. +func ParseNetConf(data []byte) (*NetConf, error) { + conf := &NetConf{} + if err := json.Unmarshal(data, conf); err != nil { + return nil, err + } + return conf, nil +} diff --git a/internal/cni/netns.go b/internal/cni/netns.go new file mode 100644 index 0000000..e6dacf6 --- /dev/null +++ b/internal/cni/netns.go @@ -0,0 +1,126 @@ +package cni + +import ( + "fmt" + "net" + "runtime" + + "github.com/vishvananda/netlink" + "github.com/vishvananda/netns" +) + +// createVethPair creates a veth pair with one end in the host namespace +// and the other moved into the pod's network namespace. +// Returns (hostVethName, podVethName, error). 
+func createVethPair(podNetns string, ifName string) (string, string, error) { + hostVethName := "kedge_h_" + ifName[:min(5, len(ifName))] + + veth := &netlink.Veth{ + LinkAttrs: netlink.LinkAttrs{Name: hostVethName}, + PeerName: ifName, + } + + if err := netlink.LinkAdd(veth); err != nil { + return "", "", fmt.Errorf("failed to create veth pair: %w", err) + } + + // Get a handle to the pod's network namespace. + podNS, err := netns.GetFromPath(podNetns) + if err != nil { + netlink.LinkDel(veth) + return "", "", fmt.Errorf("failed to get pod netns: %w", err) + } + defer podNS.Close() + + // Move the peer end into the pod namespace. + peer, err := netlink.LinkByName(ifName) + if err != nil { + netlink.LinkDel(veth) + return "", "", fmt.Errorf("failed to find peer veth: %w", err) + } + + if err := netlink.LinkSetNsFd(peer, int(podNS)); err != nil { + netlink.LinkDel(veth) + return "", "", fmt.Errorf("failed to move peer to pod netns: %w", err) + } + + // Bring up the host-side veth. + hostVeth, err := netlink.LinkByName(hostVethName) + if err != nil { + return "", "", fmt.Errorf("failed to find host veth: %w", err) + } + if err := netlink.LinkSetUp(hostVeth); err != nil { + return "", "", fmt.Errorf("failed to bring up host veth: %w", err) + } + + // Bring up the pod-side veth inside the namespace. + if err := inNamespace(podNS, func() error { + link, err := netlink.LinkByName(ifName) + if err != nil { + return err + } + return netlink.LinkSetUp(link) + }); err != nil { + return "", "", fmt.Errorf("failed to bring up pod veth: %w", err) + } + + return hostVethName, ifName, nil +} + +// teardownInterface removes the named interface from the pod's network namespace. +func teardownInterface(podNetns string, ifName string) error { + ns, err := netns.GetFromPath(podNetns) + if err != nil { + // Namespace may already be gone during cleanup. 
+ return nil + } + defer ns.Close() + + return inNamespace(ns, func() error { + link, err := netlink.LinkByName(ifName) + if err != nil { + return nil // Already gone. + } + return netlink.LinkDel(link) + }) +} + +// verifyInterface checks that the named interface exists in the pod's network namespace. +func verifyInterface(podNetns string, ifName string) error { + ns, err := netns.GetFromPath(podNetns) + if err != nil { + return fmt.Errorf("failed to get pod netns: %w", err) + } + defer ns.Close() + + return inNamespace(ns, func() error { + link, err := netlink.LinkByName(ifName) + if err != nil { + return fmt.Errorf("interface %s not found: %w", ifName, err) + } + if link.Attrs().Flags&net.FlagUp == 0 { + return fmt.Errorf("interface %s is down", ifName) + } + return nil + }) +} + +// inNamespace executes fn inside the given network namespace, restoring +// the original namespace afterward. +func inNamespace(ns netns.NsHandle, fn func() error) error { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + origNS, err := netns.Get() + if err != nil { + return fmt.Errorf("failed to get current netns: %w", err) + } + defer origNS.Close() + + if err := netns.Set(ns); err != nil { + return fmt.Errorf("failed to set netns: %w", err) + } + defer netns.Set(origNS) //nolint:errcheck + + return fn() +} diff --git a/internal/cni/plugin.go b/internal/cni/plugin.go new file mode 100644 index 0000000..99ff41f --- /dev/null +++ b/internal/cni/plugin.go @@ -0,0 +1,93 @@ +package cni + +import ( + "fmt" + + "github.com/containernetworking/cni/pkg/skel" + "github.com/containernetworking/cni/pkg/types" + current "github.com/containernetworking/cni/pkg/types/100" +) + +// CmdAdd is called by the container runtime when a pod is created. +// It creates the net1 interface in the pod's network namespace and programs +// routes for both overlay (WireGuard) and underlay (VLAN bridge) subnets. 
+func CmdAdd(args *skel.CmdArgs) error { + conf, err := ParseNetConf(args.StdinData) + if err != nil { + return fmt.Errorf("failed to parse config: %w", err) + } + + // Load mesh topology for current node state. + topo, err := loadMeshTopology(conf.MeshConfig) + if err != nil { + return fmt.Errorf("failed to load mesh topology: %w", err) + } + + // Create veth pair and move one end into the pod's network namespace. + hostVeth, podVeth, err := createVethPair(args.Netns, args.IfName) + if err != nil { + return fmt.Errorf("failed to create veth pair: %w", err) + } + + // Attach host-side veth to appropriate backend interfaces. + if err := attachOverlayRoutes(hostVeth, conf.OverlayRoutes, topo); err != nil { + return fmt.Errorf("failed to attach overlay routes: %w", err) + } + if err := attachUnderlayRoutes(hostVeth, conf.UnderlayRoutes, topo); err != nil { + return fmt.Errorf("failed to attach underlay routes: %w", err) + } + + // Program routes inside the pod's network namespace. + if err := programPodRoutes(args.Netns, podVeth, conf.OverlayRoutes, conf.UnderlayRoutes); err != nil { + return fmt.Errorf("failed to program pod routes: %w", err) + } + + // Apply SVID-based network policy. + if err := applySVIDPolicy(args, conf, topo); err != nil { + return fmt.Errorf("failed to apply SVID policy: %w", err) + } + + // Build and return CNI result. + result := &current.Result{ + CNIVersion: conf.CNIVersion, + Interfaces: []*current.Interface{ + {Name: hostVeth, Sandbox: ""}, + {Name: podVeth, Sandbox: args.Netns}, + }, + } + return types.PrintResult(result, conf.CNIVersion) +} + +// CmdDel is called when a pod is deleted. +// Tears down the net1 interface and cleans up routes and bridge/tunnel attachments. 
+func CmdDel(args *skel.CmdArgs) error { + conf, err := ParseNetConf(args.StdinData) + if err != nil { + return fmt.Errorf("failed to parse config: %w", err) + } + + if err := teardownInterface(args.Netns, args.IfName); err != nil { + return fmt.Errorf("failed to teardown interface: %w", err) + } + + _ = conf // Routes are cleaned up with the interface + return nil +} + +// CmdCheck verifies the net1 interface exists and routes are correct. +func CmdCheck(args *skel.CmdArgs) error { + conf, err := ParseNetConf(args.StdinData) + if err != nil { + return fmt.Errorf("failed to parse config: %w", err) + } + + if err := verifyInterface(args.Netns, args.IfName); err != nil { + return fmt.Errorf("interface check failed: %w", err) + } + + if err := verifyRoutes(args.Netns, conf.OverlayRoutes, conf.UnderlayRoutes); err != nil { + return fmt.Errorf("route check failed: %w", err) + } + + return nil +} diff --git a/internal/cni/policy.go b/internal/cni/policy.go new file mode 100644 index 0000000..bd9a81b --- /dev/null +++ b/internal/cni/policy.go @@ -0,0 +1,52 @@ +package cni + +import ( + "fmt" + + "github.com/containernetworking/cni/pkg/skel" + + "github.com/guildhouse-co/kedge/internal/topology" +) + +// applySVIDPolicy enforces SVID-scoped network policy on the pod's net1 interface. +// Restricts which subnets are reachable and which mode (overlay/underlay) is authorized +// based on the pod's SPIFFE identity. +func applySVIDPolicy(args *skel.CmdArgs, conf *NetConf, topo *topology.MeshTopology) error { + // Phase 1: permissive — all Bascule runtime pods get full access. + // TODO: Fetch pod's SPIFFE SVID from SPIRE agent unix socket. + // TODO: Evaluate SVID against accord-defined subnet and mode policies. + // TODO: Program iptables/nftables rules scoped to the pod's veth pair. + + _ = args + _ = conf + _ = topo + + return nil +} + +// PolicyRule defines a subnet access rule scoped to a SPIFFE identity. 
+type PolicyRule struct { + SVID string // SPIFFE Verifiable Identity Document URI + AllowedDst []string // Allowed destination CIDRs + Mode string // "overlay", "underlay", or "both" + Operations []string // "read", "mutate", "admin" +} + +// EvaluatePolicy checks whether the given SVID is authorized for the requested +// subnets and mode per the local accord policy. +func EvaluatePolicy(svid string, rules []PolicyRule, requestedDst string, mode string) error { + for _, rule := range rules { + if rule.SVID != svid { + continue + } + if rule.Mode != "both" && rule.Mode != mode { + continue + } + for _, allowed := range rule.AllowedDst { + if allowed == requestedDst { + return nil + } + } + } + return fmt.Errorf("SVID %s not authorized for %s in mode %s", svid, requestedDst, mode) +} diff --git a/internal/cni/routes.go b/internal/cni/routes.go new file mode 100644 index 0000000..9e7fabc --- /dev/null +++ b/internal/cni/routes.go @@ -0,0 +1,84 @@ +package cni + +import ( + "fmt" + "net" + + "github.com/vishvananda/netlink" + "github.com/vishvananda/netns" + + "github.com/guildhouse-co/kedge/internal/topology" +) + +// programPodRoutes programs routes inside the pod's network namespace for +// both overlay and underlay destinations. +func programPodRoutes(podNetnsPath string, ifName string, overlay, underlay []SubnetRoute) error { + ns, err := netns.GetFromPath(podNetnsPath) + if err != nil { + return fmt.Errorf("failed to get pod netns: %w", err) + } + defer ns.Close() + + return inNamespace(ns, func() error { + link, err := netlink.LinkByName(ifName) + if err != nil { + return fmt.Errorf("interface %s not found: %w", ifName, err) + } + + allRoutes := append(overlay, underlay...) 
+ for _, sr := range allRoutes { + _, dst, err := net.ParseCIDR(sr.Dst) + if err != nil { + return fmt.Errorf("invalid CIDR %s: %w", sr.Dst, err) + } + + route := &netlink.Route{ + LinkIndex: link.Attrs().Index, + Dst: dst, + } + if err := netlink.RouteAdd(route); err != nil { + return fmt.Errorf("failed to add route %s: %w", sr.Dst, err) + } + } + return nil + }) +} + +// verifyRoutes checks that expected routes exist in the pod's network namespace. +func verifyRoutes(podNetnsPath string, overlay, underlay []SubnetRoute) error { + ns, err := netns.GetFromPath(podNetnsPath) + if err != nil { + return fmt.Errorf("failed to get pod netns: %w", err) + } + defer ns.Close() + + return inNamespace(ns, func() error { + routes, err := netlink.RouteList(nil, netlink.FAMILY_V4) + if err != nil { + return fmt.Errorf("failed to list routes: %w", err) + } + + routeMap := make(map[string]bool) + for _, r := range routes { + if r.Dst != nil { + routeMap[r.Dst.String()] = true + } + } + + allExpected := append(overlay, underlay...) + for _, sr := range allExpected { + if !routeMap[sr.Dst] { + return fmt.Errorf("missing route: %s", sr.Dst) + } + } + return nil + }) +} + +// loadMeshTopology reads the mesh topology file written by the DaemonSet. +func loadMeshTopology(meshConfigPath string) (*topology.MeshTopology, error) { + if meshConfigPath == "" { + meshConfigPath = "/etc/kedge/mesh.json" + } + return topology.LoadFromFile(meshConfigPath) +} diff --git a/internal/cni/tunnel.go b/internal/cni/tunnel.go new file mode 100644 index 0000000..ff25a14 --- /dev/null +++ b/internal/cni/tunnel.go @@ -0,0 +1,42 @@ +package cni + +import ( + "fmt" + "net" + + "github.com/vishvananda/netlink" + + "github.com/guildhouse-co/kedge/internal/topology" +) + +// attachOverlayRoutes programs routes on the host to direct overlay-destined +// traffic from the veth through the WireGuard tunnel interface. 
+func attachOverlayRoutes(hostVeth string, routes []SubnetRoute, topo *topology.MeshTopology) error { + if len(routes) == 0 { + return nil + } + + for _, route := range routes { + tunnelIfName := route.Via // e.g., "wg0" + tunnelLink, err := netlink.LinkByName(tunnelIfName) + if err != nil { + return fmt.Errorf("tunnel interface %s not found for route %s: %w", tunnelIfName, route.Dst, err) + } + + _, dst, err := net.ParseCIDR(route.Dst) + if err != nil { + return fmt.Errorf("invalid overlay CIDR %s: %w", route.Dst, err) + } + + // Add route on host: dst → tunnel interface. + r := &netlink.Route{ + LinkIndex: tunnelLink.Attrs().Index, + Dst: dst, + } + if err := netlink.RouteReplace(r); err != nil { + return fmt.Errorf("failed to add overlay route %s via %s: %w", route.Dst, tunnelIfName, err) + } + } + + return nil +} diff --git a/internal/config/config.go b/internal/config/config.go new file mode 100644 index 0000000..55a2535 --- /dev/null +++ b/internal/config/config.go @@ -0,0 +1,154 @@ +package config + +import ( + "fmt" + "os" + "time" +) + +// Config is the top-level configuration for the Kedge DaemonSet. +type Config struct { + NodeID string `json:"node_id"` + ClusterID string `json:"cluster_id"` + Mesh MeshConfig `json:"mesh"` + VLAN VLANConfig `json:"vlan"` + Shellstream ShellstreamConfig `json:"shellstream"` + Quartermaster QuartermasterConfig `json:"quartermaster"` + Underlay UnderlayConfig `json:"underlay"` + Health HealthConfig `json:"health"` + Headscale HeadscaleConfig `json:"headscale"` +} + +// MeshConfig configures the WireGuard overlay mesh. 
+type MeshConfig struct { + Enabled bool `json:"enabled"` + InterfaceName string `json:"interface_name"` + ListenPort int `json:"listen_port"` + PrivateKeyPath string `json:"private_key_path"` + HealthInterval time.Duration `json:"health_interval"` + DeadPeerTimeout time.Duration `json:"dead_peer_timeout"` + InitialPeers []PeerEntry `json:"initial_peers"` +} + +// PeerEntry defines a WireGuard peer from configuration. +type PeerEntry struct { + PublicKey string `json:"public_key"` + Endpoint string `json:"endpoint"` + AllowedIPs []string `json:"allowed_ips"` +} + +// VLANConfig configures underlay VLAN interfaces. +type VLANConfig struct { + Enabled bool `json:"enabled"` + VLANs []VLANEntry `json:"vlans"` +} + +// VLANEntry defines a VLAN to be managed on the node. +type VLANEntry struct { + ID int `json:"id"` + ParentInterface string `json:"parent_interface"` + Subnet string `json:"subnet"` + Description string `json:"description"` +} + +// ShellstreamConfig configures the Shellstream handshake listener. +type ShellstreamConfig struct { + ListenAddr string `json:"listen_addr"` + ClusterID string `json:"cluster_id"` + TrustBundlePath string `json:"trust_bundle_path"` + AccordPolicyPath string `json:"accord_policy_path"` +} + +// QuartermasterConfig configures the Quartermaster gRPC client. +type QuartermasterConfig struct { + Endpoint string `json:"endpoint"` + UseTLS bool `json:"use_tls"` + CertPath string `json:"cert_path,omitempty"` + KeyPath string `json:"key_path,omitempty"` + CAPath string `json:"ca_path,omitempty"` +} + +// UnderlayConfig configures the YANG watcher and compilation trigger. +type UnderlayConfig struct { + Enabled bool `json:"enabled"` + Namespace string `json:"namespace"` + ConfigMapName string `json:"configmap_name"` + CompilerPath string `json:"compiler_path"` +} + +// HealthConfig configures the health and metrics server. 
+type HealthConfig struct { + ListenAddr string `json:"listen_addr"` +} + +// HeadscaleConfig configures Headscale peer discovery. +type HeadscaleConfig struct { + Enabled bool `json:"enabled"` + Endpoint string `json:"endpoint"` + APIKey string `json:"api_key"` + PollInterval time.Duration `json:"poll_interval"` +} + +// Load builds the DaemonSet configuration from environment variables and built-in +// defaults; the config file at /etc/kedge/config.json is not read yet. +func Load() (*Config, error) { + cfg := &Config{ + NodeID: getEnv("KEDGE_NODE_ID", ""), + ClusterID: getEnv("KEDGE_CLUSTER_ID", ""), + Mesh: MeshConfig{ + Enabled: getEnv("KEDGE_OVERLAY_ENABLED", "true") == "true", + InterfaceName: getEnv("KEDGE_WG_INTERFACE", "wg0"), + ListenPort: 51820, + PrivateKeyPath: getEnv("KEDGE_WG_PRIVATE_KEY", "/etc/kedge/wg-private.key"), + HealthInterval: 30 * time.Second, + DeadPeerTimeout: 5 * time.Minute, + }, + VLAN: VLANConfig{ + Enabled: getEnv("KEDGE_UNDERLAY_ENABLED", "false") == "true", + }, + Shellstream: ShellstreamConfig{ + ListenAddr: getEnv("KEDGE_SHELLSTREAM_ADDR", ":8443"), + ClusterID: getEnv("KEDGE_CLUSTER_ID", ""), + TrustBundlePath: getEnv("KEDGE_TRUST_BUNDLE", "/run/spire/bundle/bundle.pem"), + AccordPolicyPath: getEnv("KEDGE_ACCORD_POLICY", "/etc/kedge/accord-policy.json"), + }, + Quartermaster: QuartermasterConfig{ + Endpoint: getEnv("KEDGE_QM_ENDPOINT", "quartermaster.guildhouse.svc:50051"), + }, + Underlay: UnderlayConfig{ + Enabled: getEnv("KEDGE_UNDERLAY_ENABLED", "false") == "true", + Namespace: getEnv("KEDGE_UNDERLAY_NAMESPACE", "kedge"), + ConfigMapName: getEnv("KEDGE_UNDERLAY_CONFIGMAP", "kedge-underlay"), + CompilerPath: getEnv("KEDGE_YANG_COMPILER", "/opt/kedge/yang/compiler/compile.py"), + }, + Health: HealthConfig{ + ListenAddr: getEnv("KEDGE_HEALTH_ADDR", ":9090"), + }, + Headscale: HeadscaleConfig{ + Enabled: getEnv("KEDGE_HEADSCALE_ENABLED", "false") == "true", + Endpoint: getEnv("KEDGE_HEADSCALE_ENDPOINT", ""), + PollInterval: 60 * time.Second, + }, + } + + if 
cfg.NodeID == "" { + hostname, err := os.Hostname() + if err != nil { + return nil, fmt.Errorf("KEDGE_NODE_ID not set and hostname unavailable: %w", err) + } + cfg.NodeID = hostname + } + + if cfg.ClusterID == "" { + return nil, fmt.Errorf("KEDGE_CLUSTER_ID is required") + } + + return cfg, nil +} + +func getEnv(key, fallback string) string { + if v := os.Getenv(key); v != "" { + return v + } + return fallback +} diff --git a/internal/headscale/client.go b/internal/headscale/client.go new file mode 100644 index 0000000..7ce4243 --- /dev/null +++ b/internal/headscale/client.go @@ -0,0 +1,68 @@ +package headscale + +import ( + "context" + "fmt" + "time" + + "go.uber.org/zap" + + "github.com/guildhouse-co/kedge/internal/config" +) + +// PeerUpdater is implemented by the mesh manager to receive peer updates. +type PeerUpdater interface { + AddPeer(pubKey string, endpoint string, allowedIPs []string) error +} + +// Client connects to Headscale for mesh peer discovery. +type Client struct { + cfg config.HeadscaleConfig + peers PeerUpdater + log *zap.SugaredLogger +} + +// NewClient creates a new Headscale peer discovery client. +func NewClient(cfg config.HeadscaleConfig, peers PeerUpdater, log *zap.SugaredLogger) *Client { + return &Client{cfg: cfg, peers: peers, log: log} +} + +// Run starts the Headscale peer discovery loop. +func (c *Client) Run(ctx context.Context) error { + if !c.cfg.Enabled { + c.log.Info("headscale peer discovery disabled") + <-ctx.Done() + return nil + } + + c.log.Infof("starting headscale peer discovery (endpoint: %s, interval: %s)", + c.cfg.Endpoint, c.cfg.PollInterval) + + ticker := time.NewTicker(c.cfg.PollInterval) + defer ticker.Stop() + + // Initial discovery. 
+ if err := c.discoverPeers(ctx); err != nil { + c.log.Warnw("initial peer discovery failed", "error", err) + } + + for { + select { + case <-ctx.Done(): + return nil + case <-ticker.C: + if err := c.discoverPeers(ctx); err != nil { + c.log.Warnw("peer discovery failed", "error", err) + } + } + } +} + +func (c *Client) discoverPeers(ctx context.Context) error { + // TODO: Call Headscale gRPC API to list nodes in the mesh namespace. + // For each node, extract WireGuard public key, endpoint, and allowed IPs, + // then call c.peers.AddPeer() to update the mesh manager. + + _ = ctx + return fmt.Errorf("headscale peer discovery not yet implemented") +} diff --git a/internal/health/health.go b/internal/health/health.go new file mode 100644 index 0000000..0d609af --- /dev/null +++ b/internal/health/health.go @@ -0,0 +1,135 @@ +package health + +import ( + "context" + "fmt" + "net/http" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + "go.uber.org/zap" + + "github.com/guildhouse-co/kedge/internal/config" +) + +// PeerCounter is implemented by the mesh manager. +type PeerCounter interface { + PeerCount() int +} + +// VLANCounter is implemented by the VLAN manager. 
+type VLANCounter interface {
+	// VLANCount reports the value published on the VLAN interface gauge.
+	VLANCount() int
+}
+
+// Process-wide Prometheus collectors. They are registered with the default
+// registry in init below.
+var (
+	// meshPeerGauge is set from PeerCounter.PeerCount.
+	meshPeerGauge = prometheus.NewGauge(prometheus.GaugeOpts{
+		Namespace: "kedge",
+		Subsystem: "mesh",
+		Name:      "peer_count",
+		Help:      "Number of active WireGuard mesh peers",
+	})
+
+	// vlanInterfaceGauge is set from VLANCounter.VLANCount.
+	vlanInterfaceGauge = prometheus.NewGauge(prometheus.GaugeOpts{
+		Namespace: "kedge",
+		Subsystem: "vlan",
+		Name:      "interface_count",
+		Help:      "Number of managed VLAN interfaces",
+	})
+
+	// sessionTransitCounter is incremented by RecordSessionTransit.
+	sessionTransitCounter = prometheus.NewCounter(prometheus.CounterOpts{
+		Namespace: "kedge",
+		Subsystem: "quartermaster",
+		Name:      "session_transits_total",
+		Help:      "Total SessionTransitArtifacts submitted to Quartermaster",
+	})
+
+	// networkMutationCounter is incremented by RecordNetworkMutation.
+	networkMutationCounter = prometheus.NewCounter(prometheus.CounterOpts{
+		Namespace: "kedge",
+		Subsystem: "quartermaster",
+		Name:      "network_mutations_total",
+		Help:      "Total NetworkMutationArtifacts submitted to Quartermaster",
+	})
+
+	// tunnelStatusGauge is labelled per peer. NOTE(review): it is registered
+	// here but nothing in this file sets it yet.
+	tunnelStatusGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: "kedge",
+		Subsystem: "mesh",
+		Name:      "tunnel_up",
+		Help:      "WireGuard tunnel status (1=up, 0=down)",
+	}, []string{"peer"})
+)
+
+// init registers every collector above; MustRegister panics if registration
+// fails (for example on a duplicate registration).
+func init() {
+	prometheus.MustRegister(
+		meshPeerGauge,
+		vlanInterfaceGauge,
+		sessionTransitCounter,
+		networkMutationCounter,
+		tunnelStatusGauge,
+	)
+}
+
+// Server serves health checks and Prometheus metrics.
+type Server struct {
+	cfg   config.HealthConfig
+	peers PeerCounter // may be nil; the peer gauge is then left untouched
+	vlans VLANCounter // may be nil; the VLAN gauge is then left untouched
+	log   *zap.SugaredLogger
+}
+
+// NewServer creates a new health and metrics server.
+func NewServer(cfg config.HealthConfig, peers PeerCounter, vlans VLANCounter, log *zap.SugaredLogger) *Server {
+	return &Server{cfg: cfg, peers: peers, vlans: vlans, log: log}
+}
+
+// Run starts the HTTP server for health checks and metrics.
+func (s *Server) Run(ctx context.Context) error { + mux := http.NewServeMux() + mux.HandleFunc("/healthz", s.handleHealthz) + mux.HandleFunc("/readyz", s.handleReadyz) + mux.Handle("/metrics", promhttp.Handler()) + + srv := &http.Server{ + Addr: s.cfg.ListenAddr, + Handler: mux, + } + + go func() { + <-ctx.Done() + srv.Close() + }() + + s.log.Infof("health server listening on %s", s.cfg.ListenAddr) + if err := srv.ListenAndServe(); err != http.ErrServerClosed { + return fmt.Errorf("health server error: %w", err) + } + return nil +} + +func (s *Server) handleHealthz(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + fmt.Fprintln(w, "ok") +} + +func (s *Server) handleReadyz(w http.ResponseWriter, r *http.Request) { + // Update metrics. + if s.peers != nil { + meshPeerGauge.Set(float64(s.peers.PeerCount())) + } + if s.vlans != nil { + vlanInterfaceGauge.Set(float64(s.vlans.VLANCount())) + } + + w.WriteHeader(http.StatusOK) + fmt.Fprintln(w, "ok") +} + +// RecordSessionTransit increments the session transit counter. +func RecordSessionTransit() { + sessionTransitCounter.Inc() +} + +// RecordNetworkMutation increments the network mutation counter. +func RecordNetworkMutation() { + networkMutationCounter.Inc() +} diff --git a/internal/mesh/manager.go b/internal/mesh/manager.go new file mode 100644 index 0000000..2e4c5b4 --- /dev/null +++ b/internal/mesh/manager.go @@ -0,0 +1,153 @@ +package mesh + +import ( + "context" + "fmt" + "sync" + "time" + + "go.uber.org/zap" + "golang.zx2c4.com/wireguard/wgctrl" + "golang.zx2c4.com/wireguard/wgctrl/wgtypes" + + "github.com/guildhouse-co/kedge/internal/config" + "github.com/guildhouse-co/kedge/internal/quartermaster" +) + +// Manager manages WireGuard tunnel lifecycle for overlay mode. 
+type Manager struct { + cfg config.MeshConfig + qm *quartermaster.Client + log *zap.SugaredLogger + client *wgctrl.Client + + mu sync.RWMutex + peers map[string]*Peer // keyed by public key +} + +// NewManager creates a new WireGuard mesh manager. +func NewManager(cfg config.MeshConfig, qm *quartermaster.Client, log *zap.SugaredLogger) *Manager { + return &Manager{ + cfg: cfg, + qm: qm, + log: log, + peers: make(map[string]*Peer), + } +} + +// Run starts the mesh manager loop. It initializes the WireGuard interface, +// configures initial peers, and monitors tunnel health. +func (m *Manager) Run(ctx context.Context) error { + if !m.cfg.Enabled { + m.log.Info("overlay mode disabled, mesh manager idle") + <-ctx.Done() + return nil + } + + var err error + m.client, err = wgctrl.New() + if err != nil { + return fmt.Errorf("failed to create wireguard client: %w", err) + } + defer m.client.Close() + + // Ensure WireGuard interface exists. + if err := m.ensureInterface(); err != nil { + return fmt.Errorf("failed to ensure wg interface: %w", err) + } + + // Configure initial peers from mesh config. + if err := m.configureInitialPeers(); err != nil { + return fmt.Errorf("failed to configure initial peers: %w", err) + } + + // Health monitoring loop. + ticker := time.NewTicker(m.cfg.HealthInterval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + m.log.Info("mesh manager shutting down") + return nil + case <-ticker.C: + m.checkPeerHealth() + } + } +} + +// PeerCount returns the number of active peers. +func (m *Manager) PeerCount() int { + m.mu.RLock() + defer m.mu.RUnlock() + return len(m.peers) +} + +// AddPeer adds or updates a WireGuard peer. 
+// The public key must be a valid WireGuard key; an invalid key is rejected
+// with an error and the peer table is left unchanged. Adding a key that is
+// already known replaces the stored entry and marks it active.
+func (m *Manager) AddPeer(pubKey string, endpoint string, allowedIPs []string) error {
+	// Validate before taking the lock so bad input never blocks readers.
+	key, err := wgtypes.ParseKey(pubKey)
+	if err != nil {
+		return fmt.Errorf("invalid public key: %w", err)
+	}
+
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	m.peers[pubKey] = &Peer{
+		PublicKey:  pubKey,
+		Endpoint:   endpoint,
+		AllowedIPs: allowedIPs,
+		State:      PeerStateActive,
+		LastSeen:   time.Now(),
+	}
+
+	_ = key // TODO: Apply to wgctrl config.
+	m.log.Infow("peer added", "pubkey", shortKey(pubKey), "endpoint", endpoint)
+	return nil
+}
+
+// shortKey abbreviates a public key for log output. Keys of eight characters
+// or fewer are returned unchanged, so malformed input can never cause an
+// out-of-range slice.
+func shortKey(pubKey string) string {
+	const prefixLen = 8
+	if len(pubKey) <= prefixLen {
+		return pubKey
+	}
+	return pubKey[:prefixLen] + "..."
+}
+
+// ensureInterface is a placeholder that only logs the interface name.
+func (m *Manager) ensureInterface() error {
+	// TODO: Create WireGuard interface if it doesn't exist.
+	// Use netlink to create the interface, then wgctrl to configure it.
+	m.log.Infof("ensuring wireguard interface %s", m.cfg.InterfaceName)
+	return nil
+}
+
+// configureInitialPeers adds every peer from the mesh config. A peer that
+// fails to add is logged and skipped; the function itself always returns nil.
+func (m *Manager) configureInitialPeers() error {
+	for _, p := range m.cfg.InitialPeers {
+		if err := m.AddPeer(p.PublicKey, p.Endpoint, p.AllowedIPs); err != nil {
+			// The key may be malformed here, so it must not be sliced blindly.
+			m.log.Warnw("failed to add initial peer", "pubkey", shortKey(p.PublicKey), "error", err)
+		}
+	}
+	return nil
+}
+
+// checkPeerHealth refreshes LastSeen/State of known peers from the WireGuard
+// device and marks peers dead once they have been silent for longer than the
+// dead-peer timeout. Peers reported by the device but unknown to the manager
+// are ignored.
+func (m *Manager) checkPeerHealth() {
+	// Query the device before locking so readers are not blocked on it.
+	device, err := m.client.Device(m.cfg.InterfaceName)
+	if err != nil {
+		m.log.Warnw("failed to get wg device", "error", err)
+		return
+	}
+
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	for _, wgPeer := range device.Peers {
+		pubKey := wgPeer.PublicKey.String()
+		peer, ok := m.peers[pubKey]
+		if !ok {
+			continue
+		}
+		wasDead := peer.State == PeerStateDead
+
+		if !wgPeer.LastHandshakeTime.IsZero() {
+			peer.LastSeen = wgPeer.LastHandshakeTime
+			peer.State = PeerStateActive
+		}
+
+		if time.Since(peer.LastSeen) > m.cfg.DeadPeerTimeout {
+			peer.State = PeerStateDead
+			// Log the transition only, not every tick the peer stays dead.
+			if !wasDead {
+				m.log.Warnw("peer dead", "pubkey", shortKey(pubKey), "last_seen", peer.LastSeen)
+			}
+		}
+	}
+}
diff --git a/internal/mesh/peer.go b/internal/mesh/peer.go
new file mode 100644
index 0000000..f37a926
--- /dev/null
+++ b/internal/mesh/peer.go
@@ -0,0 +1,21 @@
+package mesh
+
+import "time"
+
+// PeerState represents the connectivity state of a WireGuard peer.
+type PeerState string
+
+const (
+	PeerStateActive  PeerState = "active"
+	PeerStateDead    PeerState = "dead"
+	PeerStatePending PeerState = "pending"
+)
+
+// Peer represents a WireGuard mesh peer.
+type Peer struct {
+	PublicKey  string
+	Endpoint   string
+	AllowedIPs []string
+	State      PeerState
+	LastSeen   time.Time
+}
diff --git a/internal/quartermaster/client.go b/internal/quartermaster/client.go
new file mode 100644
index 0000000..a2c0fb3
--- /dev/null
+++ b/internal/quartermaster/client.go
@@ -0,0 +1,102 @@
+package quartermaster
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"go.uber.org/zap"
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/credentials/insecure"
+
+	"github.com/guildhouse-co/kedge/internal/config"
+)
+
+// Client is a gRPC client for the Quartermaster governance and notary services.
+type Client struct {
+	cfg  config.QuartermasterConfig
+	conn *grpc.ClientConn
+	log  *zap.SugaredLogger
+
+	// TODO: Add typed gRPC stub clients when proto generation is wired up.
+	// governance quartermasterv1.GovernanceServiceClient
+	// notary     quartermasterv1.QuartermasterNotaryClient
+}
+
+// NewClient creates a new Quartermaster gRPC client.
+//
+// grpc.NewClient performs no I/O: the connection is dialed lazily on first
+// use, so a nil error here means the target was accepted, not that
+// Quartermaster is reachable.
+func NewClient(cfg config.QuartermasterConfig) (*Client, error) {
+	log := zap.L().Sugar().Named("quartermaster")
+
+	// TODO: Use mTLS with SPIFFE SVID for production.
+	opts := []grpc.DialOption{
+		grpc.WithTransportCredentials(insecure.NewCredentials()),
+	}
+
+	conn, err := grpc.NewClient(cfg.Endpoint, opts...)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create quartermaster client for %s: %w", cfg.Endpoint, err)
+	}
+
+	log.Infof("quartermaster client created for %s (connection is established on first use)", cfg.Endpoint)
+
+	return &Client{
+		cfg:  cfg,
+		conn: conn,
+		log:  log,
+	}, nil
+}
+
+// Close closes the gRPC connection. It is a no-op when no connection exists.
+func (c *Client) Close() error {
+	if c.conn != nil {
+		return c.conn.Close()
+	}
+	return nil
+}
+
+// SubmitSessionTransit submits a SessionTransitArtifact to Quartermaster for notarization.
+// This records overlay session crossings between clusters. +func (c *Client) SubmitSessionTransit(ctx context.Context, artifact *SessionTransitArtifact) error { + artifact.Timestamp = time.Now().UTC() + + // Serialize to canonical JSON (RFC 8785) for hashing. + canonical, err := artifact.CanonicalBytes() + if err != nil { + return fmt.Errorf("failed to serialize session transit artifact: %w", err) + } + + c.log.Infow("submitting session transit", + "session_id", artifact.SessionID, + "source", artifact.SourceCluster, + "dest", artifact.DestCluster, + "mode", artifact.GrantedMode, + "bytes", len(canonical), + ) + + // TODO: Call governance.CreateIntent + notary.CreateAnchor via gRPC stubs. + // For Phase 1, the anchor buffer flush is async — operations are not blocked. + _ = canonical + return nil +} + +// SubmitNetworkMutation submits a NetworkMutationArtifact to Quartermaster for notarization. +// This records underlay infrastructure config changes. +func (c *Client) SubmitNetworkMutation(ctx context.Context, artifact *NetworkMutationArtifact) error { + artifact.Timestamp = time.Now().UTC() + + canonical, err := artifact.CanonicalBytes() + if err != nil { + return fmt.Errorf("failed to serialize network mutation artifact: %w", err) + } + + c.log.Infow("submitting network mutation", + "mutation_id", artifact.MutationID, + "device", artifact.Device, + "operation", artifact.Operation, + "bytes", len(canonical), + ) + + // TODO: Call governance.CreateIntent + notary.CreateAnchor via gRPC stubs. + _ = canonical + return nil +} diff --git a/internal/quartermaster/network_mutation.go b/internal/quartermaster/network_mutation.go new file mode 100644 index 0000000..8d15a1d --- /dev/null +++ b/internal/quartermaster/network_mutation.go @@ -0,0 +1,53 @@ +package quartermaster + +import ( + "encoding/json" + "time" +) + +// NetworkMutationArtifact is recorded when Kedge (via Bascule SDK dispatch) +// mutates physical infrastructure (underlay mode). 
+type NetworkMutationArtifact struct {
+	MutationID       string    `json:"mutation_id"`
+	SATHash          []byte    `json:"sat_hash"`
+	SessionID        string    `json:"session_id"`         // Links to SessionTransitArtifact
+	Device           string    `json:"device"`             // e.g., "fortigate.transit.local"
+	DeviceType       string    `json:"device_type"`        // e.g., "fortios"
+	Operation        string    `json:"operation"`          // e.g., "vlan_create", "zone_policy_set"
+	YANGSourceHash   []byte    `json:"yang_source_hash"`   // Hash of YANG instance data
+	ConfigBeforeHash []byte    `json:"config_before_hash"` // Device config before mutation
+	ConfigAfterHash  []byte    `json:"config_after_hash"`  // Device config after mutation
+	SDKMethod        string    `json:"sdk_method"`         // e.g., "fortiosapi.set('firewall','policy',...)"
+	Timestamp        time.Time `json:"timestamp"`
+}
+
+// ArtifactID returns the unique identifier for this artifact (the MutationID).
+func (a *NetworkMutationArtifact) ArtifactID() string {
+	return a.MutationID
+}
+
+// RegistryType returns the registry type discriminator.
+func (a *NetworkMutationArtifact) RegistryType() string {
+	return "network-mutation"
+}
+
+// CanonicalBytes returns the RFC 8785 (JCS) canonical JSON serialization
+// for deterministic hashing and merkle anchoring.
+//
+// The artifact is flattened into a map and encoded with json.Marshal, which
+// emits map keys in sorted order and []byte values as base64 strings. The
+// timestamp is rendered in UTC using RFC 3339 with nanoseconds.
+// NOTE(review): this is json.Marshal output, not a full JCS implementation
+// (see the TODO below).
+func (a *NetworkMutationArtifact) CanonicalBytes() ([]byte, error) {
+	canonical := map[string]any{
+		"mutation_id":        a.MutationID,
+		"sat_hash":           a.SATHash,
+		"session_id":         a.SessionID,
+		"device":             a.Device,
+		"device_type":        a.DeviceType,
+		"operation":          a.Operation,
+		"yang_source_hash":   a.YANGSourceHash,
+		"config_before_hash": a.ConfigBeforeHash,
+		"config_after_hash":  a.ConfigAfterHash,
+		"sdk_method":         a.SDKMethod,
+		"timestamp":          a.Timestamp.UTC().Format(time.RFC3339Nano),
+	}
+
+	// TODO: Use a proper JCS (RFC 8785) library for canonical serialization.
+	return json.Marshal(canonical)
+}
diff --git a/internal/quartermaster/session_transit.go b/internal/quartermaster/session_transit.go
new file mode 100644
index 0000000..043dbd1
--- /dev/null
+++ b/internal/quartermaster/session_transit.go
@@ -0,0 +1,60 @@
+package quartermaster
+
+import (
+	"encoding/json"
+	"sort"
+	"time"
+)
+
+// SessionTransitArtifact is recorded when a Shellstream session crosses
+// the inter-cluster boundary (overlay mode).
+type SessionTransitArtifact struct {
+	SessionID           string    `json:"session_id"`
+	SATHash             []byte    `json:"sat_hash"`
+	SourceCluster       string    `json:"source_cluster"`
+	DestCluster         string    `json:"dest_cluster"`
+	TargetDevice        string    `json:"target_device"`
+	TargetSubnet        string    `json:"target_subnet"`
+	GrantedMode         string    `json:"granted_mode"` // "overlay" | "underlay" | "both"
+	GrantedOperations   []string  `json:"granted_operations"`
+	OperationType       string    `json:"operation_type"`
+	Timestamp           time.Time `json:"timestamp"`
+	CapabilityTokenHash []byte    `json:"capability_token_hash"`
+}
+
+// ArtifactID returns the unique identifier for this artifact (the SessionID).
+func (a *SessionTransitArtifact) ArtifactID() string {
+	return a.SessionID
+}
+
+// RegistryType returns the registry type discriminator.
+func (a *SessionTransitArtifact) RegistryType() string {
+	return "session-transit"
+}
+
+// CanonicalBytes returns the RFC 8785 (JCS) canonical JSON serialization
+// for deterministic hashing and merkle anchoring.
+//
+// GrantedOperations is copied and sorted first, so the caller's slice is not
+// reordered and the output does not depend on the order operations were
+// granted in. The timestamp is rendered in UTC using RFC 3339 with
+// nanoseconds.
+func (a *SessionTransitArtifact) CanonicalBytes() ([]byte, error) {
+	// Sort granted operations for deterministic output.
+	ops := make([]string, len(a.GrantedOperations))
+	copy(ops, a.GrantedOperations)
+	sort.Strings(ops)
+
+	canonical := map[string]any{
+		"session_id":            a.SessionID,
+		"sat_hash":              a.SATHash,
+		"source_cluster":        a.SourceCluster,
+		"dest_cluster":          a.DestCluster,
+		"target_device":         a.TargetDevice,
+		"target_subnet":         a.TargetSubnet,
+		"granted_mode":          a.GrantedMode,
+		"granted_operations":    ops,
+		"operation_type":        a.OperationType,
+		"timestamp":             a.Timestamp.UTC().Format(time.RFC3339Nano),
+		"capability_token_hash": a.CapabilityTokenHash,
+	}
+
+	// TODO: Use a proper JCS (RFC 8785) library for canonical serialization.
+	// For now, json.Marshal with sorted keys provides deterministic output.
+	return json.Marshal(canonical)
+}
diff --git a/internal/shellstream/capability.go b/internal/shellstream/capability.go
new file mode 100644
index 0000000..f2419e9
--- /dev/null
+++ b/internal/shellstream/capability.go
@@ -0,0 +1,94 @@
+package shellstream
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+)
+
+// AccordPolicy defines the local node's policy for granting capabilities
+// to incoming Shellstream sessions, derived from Guildhouse accords.
+type AccordPolicy struct {
+	// TrustTiers is keyed by trust tier name.
+	TrustTiers map[string]TierPolicy `json:"trust_tiers"`
+}
+
+// TierPolicy maps a Guildhouse trust tier to allowed Kedge modes and operations.
+type TierPolicy struct {
+	Mode       []string `json:"mode"`       // ["overlay"], ["overlay", "underlay"]
+	Operations []string `json:"operations"` // ["read"], ["read", "mutate", "admin"]
+	Subnets    []string `json:"subnets"`    // ["172.16.0.0/24", "10.0.1.0/24"]
+}
+
+// EvaluateCapability evaluates a capability request against the local accord policy.
+// Returns the attenuated grant (requested capabilities intersected with policy).
+//
+// The policy file is re-read on every call. An error is returned when the
+// policy cannot be loaded, the tier is missing from it, the requested mode is
+// not permitted, or none of the requested operations are permitted.
+// NOTE(review): the tier's Subnets list and the request's Targets are not
+// consulted here yet.
+func EvaluateCapability(req CapabilityRequest, policyPath string) (*CapabilityGrant, error) {
+	policy, err := loadAccordPolicy(policyPath)
+	if err != nil {
+		return nil, fmt.Errorf("failed to load accord policy: %w", err)
+	}
+
+	// TODO: Determine the remote peer's trust tier from its SAT token.
+	// For Phase 1, assume "journeyman" tier.
+	tier := "journeyman"
+
+	tierPolicy, ok := policy.TrustTiers[tier]
+	if !ok {
+		return nil, fmt.Errorf("unknown trust tier: %s", tier)
+	}
+
+	// Attenuate: grant the intersection of requested and allowed.
+	grantedMode := attenuateMode(req.Mode, tierPolicy.Mode)
+	if grantedMode == "" {
+		return nil, fmt.Errorf("requested mode %s not allowed for tier %s", req.Mode, tier)
+	}
+
+	grantedOps := attenuateOperations(req.Operations, tierPolicy.Operations)
+	if len(grantedOps) == 0 {
+		return nil, fmt.Errorf("no operations allowed for tier %s", tier)
+	}
+
+	return &CapabilityGrant{
+		Mode:       grantedMode,
+		Operations: grantedOps,
+	}, nil
+}
+
+// loadAccordPolicy reads and JSON-decodes the accord policy file at path.
+func loadAccordPolicy(path string) (*AccordPolicy, error) {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return nil, err
+	}
+	var policy AccordPolicy
+	if err := json.Unmarshal(data, &policy); err != nil {
+		return nil, err
+	}
+	return &policy, nil
+}
+
+// attenuateMode returns the mode to grant for a request, or "" when nothing
+// may be granted.
+//
+//   - A mode listed verbatim in allowed is granted as requested.
+//   - "both" in allowed covers "overlay", "underlay" and "both" only; it no
+//     longer echoes back an arbitrary requested string.
+//   - A "both" request is granted in full when overlay and underlay are both
+//     allowed (listed separately or via "both"), and otherwise downgraded to
+//     the single allowed mode.
+func attenuateMode(requested string, allowed []string) string {
+	var overlay, underlay bool
+	for _, m := range allowed {
+		if m == requested {
+			return requested
+		}
+		switch m {
+		case "overlay":
+			overlay = true
+		case "underlay":
+			underlay = true
+		case "both":
+			overlay, underlay = true, true
+		}
+	}
+
+	switch requested {
+	case "overlay":
+		if overlay {
+			return requested
+		}
+	case "underlay":
+		if underlay {
+			return requested
+		}
+	case "both":
+		// If "both" was requested but only one mode allowed, grant the allowed one.
+		switch {
+		case overlay && underlay:
+			return "both"
+		case overlay:
+			return "overlay"
+		case underlay:
+			return "underlay"
+		}
+	}
+	return ""
+}
+
+// attenuateOperations returns the requested operations that are also allowed,
+// in request order. The result is nil when nothing overlaps.
+func attenuateOperations(requested, allowed []string) []string {
+	allowedSet := make(map[string]bool, len(allowed))
+	for _, op := range allowed {
+		allowedSet[op] = true
+	}
+
+	var granted []string
+	for _, op := range requested {
+		if allowedSet[op] {
+			granted = append(granted, op)
+		}
+	}
+	return granted
+}
diff --git a/internal/shellstream/handshake.go b/internal/shellstream/handshake.go
new file mode 100644
index 0000000..50855c5
--- /dev/null
+++ b/internal/shellstream/handshake.go
@@ -0,0 +1,151 @@
+package shellstream
+
+import (
+	"context"
+	"fmt"
+	"net"
+
+	"go.uber.org/zap"
+
+	"github.com/guildhouse-co/kedge/internal/config"
+	"github.com/guildhouse-co/kedge/internal/quartermaster"
+)
+
+// Listener accepts incoming Shellstream connections and performs the
+// 3-way attestation handshake (ATTEST-INIT → ATTEST-VERIFY → ATTEST-CONFIRM).
+type Listener struct {
+	cfg config.ShellstreamConfig
+	qm  *quartermaster.Client
+	log *zap.SugaredLogger
+}
+
+// NewListener creates a new Shellstream handshake listener.
+func NewListener(cfg config.ShellstreamConfig, qm *quartermaster.Client, log *zap.SugaredLogger) *Listener {
+	return &Listener{cfg: cfg, qm: qm, log: log}
+}
+
+// Run starts listening for incoming Shellstream connections.
+// It blocks until ctx is cancelled (returning nil) and handles each accepted
+// connection on its own goroutine. An error is returned only when the listen
+// socket cannot be opened; accept errors are logged and the loop continues.
+func (l *Listener) Run(ctx context.Context) error {
+	listener, err := net.Listen("tcp", l.cfg.ListenAddr)
+	if err != nil {
+		return fmt.Errorf("failed to listen on %s: %w", l.cfg.ListenAddr, err)
+	}
+	defer listener.Close()
+
+	l.log.Infof("shellstream listener started on %s", l.cfg.ListenAddr)
+
+	// Closing the listener is what unblocks Accept on shutdown.
+	go func() {
+		<-ctx.Done()
+		listener.Close()
+	}()
+
+	for {
+		conn, err := listener.Accept()
+		if err != nil {
+			select {
+			case <-ctx.Done():
+				return nil
+			default:
+				l.log.Warnw("accept error", "error", err)
+				continue
+			}
+		}
+
+		go l.handleConnection(ctx, conn)
+	}
+}
+
+// handleConnection runs the three-phase attestation handshake on conn and,
+// on success, records a session transit artifact. Any failure is logged and
+// ends the handshake; the connection is always closed on return.
+func (l *Listener) handleConnection(ctx context.Context, conn net.Conn) {
+	defer conn.Close()
+
+	// Phase 1: Receive ATTEST-INIT from remote.
+	initMsg, err := readAttestInit(conn)
+	if err != nil {
+		l.log.Warnw("failed to read ATTEST-INIT", "remote", conn.RemoteAddr(), "error", err)
+		return
+	}
+
+	// Validate the SAT token.
+	if err := ValidateSAT(initMsg.SATToken, l.cfg.TrustBundlePath); err != nil {
+		l.log.Warnw("SAT validation failed", "remote", conn.RemoteAddr(), "error", err)
+		return
+	}
+
+	// Evaluate capability request against local accord policy.
+	grant, err := EvaluateCapability(initMsg.Capabilities, l.cfg.AccordPolicyPath)
+	if err != nil {
+		l.log.Warnw("capability evaluation failed", "remote", conn.RemoteAddr(), "error", err)
+		return
+	}
+
+	// Phase 2: Send ATTEST-VERIFY with granted capabilities.
+	if err := writeAttestVerify(conn, grant); err != nil {
+		l.log.Warnw("failed to write ATTEST-VERIFY", "error", err)
+		return
+	}
+
+	// Phase 3: Receive ATTEST-CONFIRM.
+	if err := readAttestConfirm(conn); err != nil {
+		l.log.Warnw("failed to read ATTEST-CONFIRM", "error", err)
+		return
+	}
+
+	// Record session transit artifact. TargetSubnet is carried over from
+	// ATTEST-INIT so the notarized record is not missing it.
+	artifact := quartermaster.SessionTransitArtifact{
+		SourceCluster:     initMsg.SourceCluster,
+		DestCluster:       l.cfg.ClusterID,
+		TargetDevice:      initMsg.TargetDevice,
+		TargetSubnet:      initMsg.TargetSubnet,
+		GrantedMode:       grant.Mode,
+		GrantedOperations: grant.Operations,
+	}
+	// Notarization is best-effort: a failure is logged, the session stands.
+	if err := l.qm.SubmitSessionTransit(ctx, &artifact); err != nil {
+		l.log.Warnw("failed to submit session transit", "error", err)
+	}
+
+	l.log.Infow("session established",
+		"remote", conn.RemoteAddr(),
+		"mode", grant.Mode,
+		"target", initMsg.TargetDevice,
+	)
+}
+
+// AttestInitMsg represents the ATTEST-INIT message from the remote peer.
+type AttestInitMsg struct {
+	SATToken      []byte
+	SourceCluster string
+	TargetDevice  string
+	TargetSubnet  string
+	Capabilities  CapabilityRequest
+}
+
+// CapabilityRequest describes what mode and operations the remote peer is requesting.
+type CapabilityRequest struct {
+	Mode       string   // "overlay", "underlay", or "both"
+	Targets    []string // Target device addresses
+	Operations []string // "read", "mutate"
+}
+
+// CapabilityGrant describes what was actually granted after accord evaluation.
+type CapabilityGrant struct {
+	Mode       string
+	Operations []string
+}
+
+// readAttestInit is a placeholder that always fails until the wire protocol
+// is implemented.
+func readAttestInit(conn net.Conn) (*AttestInitMsg, error) {
+	// TODO: Implement Shellstream wire protocol parsing.
+	_ = conn
+	return nil, fmt.Errorf("not yet implemented")
+}
+
+// writeAttestVerify is a placeholder that always fails until the wire
+// protocol is implemented.
+func writeAttestVerify(conn net.Conn, grant *CapabilityGrant) error {
+	// TODO: Implement Shellstream wire protocol serialization.
+	_ = conn
+	_ = grant
+	return fmt.Errorf("not yet implemented")
+}
+
+// readAttestConfirm is a placeholder that always fails until the wire
+// protocol is implemented.
+func readAttestConfirm(conn net.Conn) error {
+	// TODO: Implement Shellstream wire protocol parsing.
+	_ = conn
+	return fmt.Errorf("not yet implemented")
+}
diff --git a/internal/shellstream/sat.go b/internal/shellstream/sat.go
new file mode 100644
index 0000000..5b82dae
--- /dev/null
+++ b/internal/shellstream/sat.go
@@ -0,0 +1,44 @@
+package shellstream
+
+import (
+	"context"
+	"crypto/x509"
+	"fmt"
+	"time"
+
+	"github.com/spiffe/go-spiffe/v2/svid/x509svid"
+	"github.com/spiffe/go-spiffe/v2/workloadapi"
+)
+
+// satVerifyTimeout bounds how long a single SAT validation may wait for the
+// SPIFFE Workload API, so an unavailable agent cannot hang a handshake
+// goroutine indefinitely.
+const satVerifyTimeout = 10 * time.Second
+
+// ValidateSAT validates a Substrate Attestation Token against the SPIRE/Vigil trust bundle.
+// Phase 1: Simplified to SPIFFE SVID verification only.
+// Future: Full SAT validation with TPM attestation and capability semantics via Vigil.
+//
+// The token is expected to be a DER-encoded X.509 certificate chain (leaf
+// first). The chain is verified against the trust bundle that the local
+// Workload API serves for the leaf's trust domain; a token that merely parses
+// is not accepted.
+// NOTE(review): trustBundlePath is currently unused because bundles come from
+// the Workload API.
+func ValidateSAT(token []byte, trustBundlePath string) error {
+	if len(token) == 0 {
+		return fmt.Errorf("empty SAT token")
+	}
+
+	// Only certificates are parsed: a remote peer presents its chain, never
+	// its private key.
+	certs, err := x509.ParseCertificates(token)
+	if err != nil {
+		return fmt.Errorf("failed to parse SVID from SAT: %w", err)
+	}
+	if len(certs) == 0 {
+		return fmt.Errorf("failed to parse SVID from SAT: no certificates found")
+	}
+
+	// Fetch bundles from the local Workload API, with a deadline.
+	ctx, cancel := context.WithTimeout(context.Background(), satVerifyTimeout)
+	defer cancel()
+
+	source, err := workloadapi.NewX509Source(ctx)
+	if err != nil {
+		return fmt.Errorf("failed to create X509Source: %w", err)
+	}
+	defer source.Close()
+
+	// Verify extracts the SPIFFE ID from the leaf and validates the chain
+	// against the bundle for that ID's trust domain.
+	if _, _, err := x509svid.Verify(certs, source); err != nil {
+		return fmt.Errorf("SAT SVID verification failed: %w", err)
+	}
+
+	return nil
+}
diff --git a/internal/topology/state.go b/internal/topology/state.go
new file mode 100644
index 0000000..b5d20d9
--- /dev/null
+++ b/internal/topology/state.go
@@ -0,0 +1,48 @@
+package topology
+
+// MeshTopology represents the current mesh state shared between
+// the CNI plugin (reader) and DaemonSet (writer).
+type MeshTopology struct {
+	// NodeID identifies this node in the mesh.
+	NodeID string `json:"node_id"`
+
+	// ClusterID identifies the cluster this node belongs to.
+	ClusterID string `json:"cluster_id"`
+
+	// Mode indicates the active modes: "overlay", "underlay", or "both".
+	Mode string `json:"mode"`
+
+	// Peers are the WireGuard mesh peers (overlay mode).
+	Peers []PeerInfo `json:"peers,omitempty"`
+
+	// OverlaySubnets are subnets reachable via WireGuard tunnels.
+	OverlaySubnets []SubnetRoute `json:"overlay_subnets,omitempty"`
+
+	// UnderlaySubnets are subnets reachable via VLAN bridges.
+	UnderlaySubnets []SubnetRoute `json:"underlay_subnets,omitempty"`
+
+	// WireGuard interface configuration (overlay).
+	// NOTE(review): encoding/json never treats a struct value as empty, so
+	// omitempty has no effect here and the "wireguard" key is always emitted.
+	WireGuard WireGuardConfig `json:"wireguard,omitempty"`
+}
+
+// PeerInfo describes a WireGuard mesh peer.
+type PeerInfo struct {
+	PublicKey  string   `json:"public_key"`
+	Endpoint   string   `json:"endpoint"`
+	AllowedIPs []string `json:"allowed_ips"`
+	ClusterID  string   `json:"cluster_id,omitempty"`
+}
+
+// SubnetRoute maps a destination CIDR to an interface.
+type SubnetRoute struct {
+	Dst  string `json:"dst"`  // e.g., "172.16.0.0/24"
+	Via  string `json:"via"`  // e.g., "vlan100" or "wg0"
+	Mode string `json:"mode"` // "overlay" or "underlay"
+}
+
+// WireGuardConfig holds the local node's WireGuard configuration.
+type WireGuardConfig struct {
+	InterfaceName  string `json:"interface_name"` // e.g., "wg0"
+	ListenPort     int    `json:"listen_port"`
+	PrivateKeyPath string `json:"private_key_path"`
+}
diff --git a/internal/topology/store.go b/internal/topology/store.go
new file mode 100644
index 0000000..4bcd1ac
--- /dev/null
+++ b/internal/topology/store.go
@@ -0,0 +1,76 @@
+package topology
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"sync"
+)
+
+// Store provides thread-safe access to the mesh topology state.
+// The DaemonSet writes topology updates, and the CNI plugin reads them.
+// State is persisted to a JSON file on disk.
+type Store struct {
+	mu       sync.RWMutex
+	filePath string
+	topo     *MeshTopology
+}
+
+// NewStore creates a topology store backed by the given file path.
+// The store starts with an empty topology; nothing is read from disk.
+func NewStore(filePath string) *Store {
+	return &Store{
+		filePath: filePath,
+		topo:     &MeshTopology{},
+	}
+}
+
+// LoadFromFile reads a MeshTopology from a JSON file.
+// Used by the CNI plugin to read the DaemonSet-written state.
+func LoadFromFile(path string) (*MeshTopology, error) {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read topology file %s: %w", path, err)
+	}
+
+	var topo MeshTopology
+	if err := json.Unmarshal(data, &topo); err != nil {
+		return nil, fmt.Errorf("failed to parse topology file: %w", err)
+	}
+
+	return &topo, nil
+}
+
+// Get returns a copy of the current topology.
+//
+// The slices are copied as well, so the caller may keep or modify the result
+// without racing against, or leaking changes into, later Update calls.
+func (s *Store) Get() MeshTopology {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	snapshot := *s.topo
+	snapshot.OverlaySubnets = append([]SubnetRoute(nil), s.topo.OverlaySubnets...)
+	snapshot.UnderlaySubnets = append([]SubnetRoute(nil), s.topo.UnderlaySubnets...)
+	if s.topo.Peers != nil {
+		snapshot.Peers = make([]PeerInfo, len(s.topo.Peers))
+		for i, p := range s.topo.Peers {
+			p.AllowedIPs = append([]string(nil), p.AllowedIPs...)
+			snapshot.Peers[i] = p
+		}
+	}
+	return snapshot
+}
+
+// Update atomically updates the topology and persists to disk.
+//
+// fn runs under the store's write lock and mutates the topology in place.
+// If persisting fails the error is returned, but the in-memory change made
+// by fn is kept.
+func (s *Store) Update(fn func(*MeshTopology)) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	fn(s.topo)
+
+	return s.persist()
+}
+
+// persist writes the topology as indented JSON. Callers must hold s.mu.
+func (s *Store) persist() error {
+	data, err := json.MarshalIndent(s.topo, "", "  ")
+	if err != nil {
+		return fmt.Errorf("failed to marshal topology: %w", err)
+	}
+
+	// Atomic write: write to temp file, then rename.
+	tmpPath := s.filePath + ".tmp"
+	if err := os.WriteFile(tmpPath, data, 0644); err != nil {
+		os.Remove(tmpPath) // best effort: do not leave a partial temp file
+		return fmt.Errorf("failed to write topology file: %w", err)
+	}
+	if err := os.Rename(tmpPath, s.filePath); err != nil {
+		os.Remove(tmpPath) // best effort: do not leave a stale temp file
+		return fmt.Errorf("failed to rename topology file: %w", err)
+	}
+
+	return nil
+}
diff --git a/internal/underlay/dispatch.go b/internal/underlay/dispatch.go
new file mode 100644
index 0000000..5865f27
--- /dev/null
+++ b/internal/underlay/dispatch.go
@@ -0,0 +1,69 @@
+package underlay
+
+import (
+	"context"
+	"crypto/sha256"
+	"fmt"
+	"io"
+	"os"
+	"os/exec"
+
+	"go.uber.org/zap"
+
+	"github.com/guildhouse-co/kedge/internal/quartermaster"
+)
+
+// Dispatcher triggers YANG compilation and coordinates SDK dispatch through Bascule.
+type Dispatcher struct {
+	qm  *quartermaster.Client
+	log *zap.SugaredLogger
+
+	// CompilerPath is the path to the YANG compiler script.
+	CompilerPath string
+}
+
+// NewDispatcher creates a new underlay dispatch coordinator.
+// CompilerPath starts at the built-in default and may be overridden by the
+// caller after construction.
+func NewDispatcher(qm *quartermaster.Client, log *zap.SugaredLogger) *Dispatcher {
+	return &Dispatcher{
+		qm:           qm,
+		log:          log,
+		CompilerPath: "/opt/kedge/yang/compiler/compile.py",
+	}
+}
+
+// CompileAndDispatch runs the YANG compiler for the given site config and
+// dispatches the resulting vendor-specific payloads via Bascule SDK.
+//
+// Currently only the hashing and compilation steps are implemented; the
+// compiler output is logged by size and then discarded. An error is returned
+// when the site config cannot be hashed or the compiler exits unsuccessfully.
+func (d *Dispatcher) CompileAndDispatch(ctx context.Context, siteConfigPath string, sessionID string) error {
+	// Hash the YANG source for the mutation artifact.
+	yangHash, err := hashFile(siteConfigPath)
+	if err != nil {
+		return fmt.Errorf("failed to hash YANG source: %w", err)
+	}
+
+	// Run the YANG compiler as a subprocess. Arguments are passed as a
+	// list, so the path is never interpreted by a shell.
+	cmd := exec.CommandContext(ctx, "python3", d.CompilerPath,
+		"--site-config", siteConfigPath,
+		"--output-format", "json",
+	)
+	output, err := cmd.Output()
+	if err != nil {
+		return fmt.Errorf("YANG compilation failed: %w", err)
+	}
+
+	d.log.Infow("YANG compilation complete", "site_config", siteConfigPath, "output_bytes", len(output))
+
+	// TODO: Parse compiler output into vendor-specific payloads.
+	// TODO: For each device mutation, dispatch SDK call through Bascule,
+	// capture before/after config hashes, and submit NetworkMutationArtifact.
+
+	_ = yangHash
+	_ = sessionID
+	return nil
+}
+
+// hashFile returns the SHA-256 digest of the file at path. The file is read
+// directly and streamed through the hash, rather than spawning an external
+// process and buffering the whole file.
+func hashFile(path string) ([]byte, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	h := sha256.New()
+	if _, err := io.Copy(h, f); err != nil {
+		return nil, err
+	}
+	return h.Sum(nil), nil
+}
diff --git a/internal/underlay/watcher.go b/internal/underlay/watcher.go
new file mode 100644
index 0000000..b92bc64
--- /dev/null
+++ b/internal/underlay/watcher.go
@@ -0,0 +1,56 @@
+package underlay
+
+import (
+	"context"
+	"fmt"
+
+	"go.uber.org/zap"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/rest"
+
+	"github.com/guildhouse-co/kedge/internal/config"
+	"github.com/guildhouse-co/kedge/internal/quartermaster"
+)
+
+// Watcher watches for YANG instance data changes in ConfigMaps and triggers
+// compilation and SDK dispatch for underlay device configuration.
+type Watcher struct {
+	cfg       config.UnderlayConfig
+	qm        *quartermaster.Client
+	log       *zap.SugaredLogger
+	clientset kubernetes.Interface // built in Run from the in-cluster config
+}
+
+// NewWatcher creates a new underlay YANG watcher.
+func NewWatcher(cfg config.UnderlayConfig, qm *quartermaster.Client, log *zap.SugaredLogger) *Watcher {
+	return &Watcher{cfg: cfg, qm: qm, log: log}
+}
+
+// Run starts watching for YANG instance data changes.
+//
+// When underlay mode is disabled it only blocks until ctx is cancelled.
+// Otherwise it builds the in-cluster Kubernetes client (returning an error if
+// that fails) and then blocks until ctx is cancelled; the actual ConfigMap
+// watch is not implemented yet.
+func (w *Watcher) Run(ctx context.Context) error {
+	if !w.cfg.Enabled {
+		w.log.Info("underlay mode disabled, watcher idle")
+		<-ctx.Done()
+		return nil
+	}
+
+	// Build in-cluster K8s client.
+	k8sConfig, err := rest.InClusterConfig()
+	if err != nil {
+		return fmt.Errorf("failed to get in-cluster config: %w", err)
+	}
+	w.clientset, err = kubernetes.NewForConfig(k8sConfig)
+	if err != nil {
+		return fmt.Errorf("failed to create k8s client: %w", err)
+	}
+
+	w.log.Infof("watching ConfigMap %s/%s for YANG instance data changes",
+		w.cfg.Namespace, w.cfg.ConfigMapName)
+
+	// TODO: Set up informer to watch ConfigMap for changes.
+	// On change: trigger YANG compilation, then dispatch SDK calls via Bascule.
+	// Each mutation is notarized via Quartermaster.
+
+	<-ctx.Done()
+	return nil
+}
diff --git a/internal/vlan/manager.go b/internal/vlan/manager.go
new file mode 100644
index 0000000..4c63106
--- /dev/null
+++ b/internal/vlan/manager.go
@@ -0,0 +1,99 @@
+package vlan
+
+import (
+	"context"
+	"fmt"
+
+	"github.com/vishvananda/netlink"
+	"go.uber.org/zap"
+
+	"github.com/guildhouse-co/kedge/internal/config"
+)
+
+// Manager manages VLAN interfaces on the node for underlay mode.
+type Manager struct {
+	cfg config.VLANConfig
+	log *zap.SugaredLogger
+}
+
+// NewManager creates a new VLAN interface manager.
+func NewManager(cfg config.VLANConfig, log *zap.SugaredLogger) *Manager {
+	return &Manager{cfg: cfg, log: log}
+}
+
+// Run starts the VLAN manager. It ensures all configured VLAN interfaces
+// and bridges exist on the node.
+//
+// A VLAN that cannot be set up is logged and skipped rather than aborting
+// the rest. Run then blocks until ctx is cancelled and returns nil.
+func (m *Manager) Run(ctx context.Context) error {
+	if !m.cfg.Enabled {
+		m.log.Info("underlay mode disabled, vlan manager idle")
+		<-ctx.Done()
+		return nil
+	}
+
+	// Ensure all configured VLANs exist.
+	for _, v := range m.cfg.VLANs {
+		if err := m.ensureVLAN(v); err != nil {
+			m.log.Warnw("failed to ensure VLAN", "vlan_id", v.ID, "error", err)
+		}
+	}
+
+	m.log.Infof("vlan manager started, managing %d VLANs", len(m.cfg.VLANs))
+	<-ctx.Done()
+	return nil
+}
+
+// VLANCount returns the number of managed VLAN interfaces.
+func (m *Manager) VLANCount() int { + return len(m.cfg.VLANs) +} + +// ensureVLAN creates a VLAN interface and bridge if they don't already exist. +func (m *Manager) ensureVLAN(v config.VLANEntry) error { + bridgeName := fmt.Sprintf("vlan%d", v.ID) + + // Check if bridge already exists. + if _, err := netlink.LinkByName(bridgeName); err == nil { + m.log.Debugw("bridge already exists", "name", bridgeName) + return nil + } + + // Get the parent interface. + parent, err := netlink.LinkByName(v.ParentInterface) + if err != nil { + return fmt.Errorf("parent interface %s not found: %w", v.ParentInterface, err) + } + + // Create VLAN sub-interface. + vlanLink := &netlink.Vlan{ + LinkAttrs: netlink.LinkAttrs{ + Name: fmt.Sprintf("%s.%d", v.ParentInterface, v.ID), + ParentIndex: parent.Attrs().Index, + }, + VlanId: v.ID, + } + if err := netlink.LinkAdd(vlanLink); err != nil { + return fmt.Errorf("failed to create VLAN %d: %w", v.ID, err) + } + if err := netlink.LinkSetUp(vlanLink); err != nil { + return fmt.Errorf("failed to bring up VLAN %d: %w", v.ID, err) + } + + // Create bridge for the VLAN. + bridge := &netlink.Bridge{ + LinkAttrs: netlink.LinkAttrs{Name: bridgeName}, + } + if err := netlink.LinkAdd(bridge); err != nil { + return fmt.Errorf("failed to create bridge %s: %w", bridgeName, err) + } + if err := netlink.LinkSetUp(bridge); err != nil { + return fmt.Errorf("failed to bring up bridge %s: %w", bridgeName, err) + } + + // Add VLAN interface to bridge. 
+ if err := netlink.LinkSetMaster(vlanLink, bridge); err != nil { + return fmt.Errorf("failed to add VLAN to bridge: %w", err) + } + + m.log.Infow("VLAN created", "id", v.ID, "bridge", bridgeName, "parent", v.ParentInterface) + return nil +} diff --git a/k8s/configmap-mesh.yaml b/k8s/configmap-mesh.yaml new file mode 100644 index 0000000..8f4ed0a --- /dev/null +++ b/k8s/configmap-mesh.yaml @@ -0,0 +1,34 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: kedge-mesh + namespace: kedge +data: + cluster_id: "homelab" + mesh.json: | + { + "node_id": "", + "cluster_id": "homelab", + "mode": "both", + "peers": [ + { + "public_key": "PLACEHOLDER_CLOUD_PUBKEY", + "endpoint": "anchor.guildhouse.example.com:51820", + "allowed_ips": ["10.100.0.0/24"], + "cluster_id": "cloud-anchor" + } + ], + "overlay_subnets": [ + {"dst": "10.100.0.0/24", "via": "wg0", "mode": "overlay"} + ], + "underlay_subnets": [ + {"dst": "172.16.0.0/24", "via": "vlan100", "mode": "underlay"}, + {"dst": "10.0.1.0/24", "via": "vlan10", "mode": "underlay"}, + {"dst": "192.168.50.0/24", "via": "vlan50", "mode": "underlay"} + ], + "wireguard": { + "interface_name": "wg0", + "listen_port": 51820, + "private_key_path": "/etc/kedge/wg-private.key" + } + } diff --git a/k8s/configmap-underlay.yaml b/k8s/configmap-underlay.yaml new file mode 100644 index 0000000..320416b --- /dev/null +++ b/k8s/configmap-underlay.yaml @@ -0,0 +1,33 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: kedge-underlay + namespace: kedge +data: + device-inventory.json: | + { + "devices": [ + { + "name": "fortigate.transit.local", + "type": "fortios", + "address": "172.16.0.2", + "sdk": "fortiosapi", + "managed_zones": ["transit", "tyler-lab", "dmz"] + }, + { + "name": "vyos.transit.local", + "type": "vyos", + "address": "172.16.0.3", + "sdk": "ncclient", + "managed_zones": ["transit"] + }, + { + "name": "udr7.local", + "type": "unifi", + "address": "192.168.1.1", + "sdk": "unifi-api", + "managed_zones": ["roommate", "shared"] + } 
+ ] + } + yang-site-config: "homelab" diff --git a/k8s/daemonset.yaml b/k8s/daemonset.yaml new file mode 100644 index 0000000..c6f7f12 --- /dev/null +++ b/k8s/daemonset.yaml @@ -0,0 +1,115 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: kedge-daemon + namespace: kedge + labels: + app: kedge + component: daemon +spec: + selector: + matchLabels: + app: kedge + component: daemon + template: + metadata: + labels: + app: kedge + component: daemon + spec: + serviceAccountName: kedge-daemon + hostNetwork: true + hostPID: false + tolerations: + - operator: Exists + containers: + - name: kedge-daemon + image: kedge-daemon:latest + imagePullPolicy: IfNotPresent + securityContext: + privileged: true + capabilities: + add: + - NET_ADMIN + - SYS_ADMIN + ports: + - name: shellstream + containerPort: 8443 + protocol: TCP + - name: metrics + containerPort: 9090 + protocol: TCP + env: + - name: KEDGE_NODE_ID + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: KEDGE_CLUSTER_ID + valueFrom: + configMapKeyRef: + name: kedge-mesh + key: cluster_id + - name: KEDGE_OVERLAY_ENABLED + value: "true" + - name: KEDGE_UNDERLAY_ENABLED + value: "false" + - name: KEDGE_QM_ENDPOINT + value: "quartermaster.guildhouse.svc:50051" + volumeMounts: + - name: kedge-config + mountPath: /etc/kedge + readOnly: true + - name: kedge-run + mountPath: /var/run/kedge + - name: cni-bin + mountPath: /opt/cni/bin + - name: cni-conf + mountPath: /etc/cni/net.d + - name: spire-bundle + mountPath: /run/spire/bundle + readOnly: true + livenessProbe: + httpGet: + path: /healthz + port: metrics + initialDelaySeconds: 10 + periodSeconds: 30 + readinessProbe: + httpGet: + path: /readyz + port: metrics + initialDelaySeconds: 5 + periodSeconds: 10 + initContainers: + - name: install-cni + image: kedge-cni:latest + imagePullPolicy: IfNotPresent + command: ["cp", "/kedge-cni", "/opt/cni/bin/kedge-cni"] + volumeMounts: + - name: cni-bin + mountPath: /opt/cni/bin + volumes: + - name: kedge-config + 
projected: + sources: + - configMap: + name: kedge-mesh + - configMap: + name: kedge-underlay + optional: true + - name: kedge-run + hostPath: + path: /var/run/kedge + type: DirectoryOrCreate + - name: cni-bin + hostPath: + path: /opt/cni/bin + type: DirectoryOrCreate + - name: cni-conf + hostPath: + path: /etc/cni/net.d + type: Directory + - name: spire-bundle + configMap: + name: spire-bundle + optional: true diff --git a/k8s/network-attachment.yaml b/k8s/network-attachment.yaml new file mode 100644 index 0000000..f12d055 --- /dev/null +++ b/k8s/network-attachment.yaml @@ -0,0 +1,21 @@ +apiVersion: k8s.cni.cncf.io/v1 +kind: NetworkAttachmentDefinition +metadata: + name: infra-mgmt + namespace: bascule +spec: + config: | + { + "cniVersion": "1.0.0", + "name": "kedge-infra", + "type": "kedge-cni", + "meshConfig": "/etc/kedge/mesh.json", + "underlayRoutes": [ + {"dst": "172.16.0.0/24", "via": "vlan100"}, + {"dst": "10.0.1.0/24", "via": "vlan10"}, + {"dst": "192.168.50.0/24", "via": "vlan50"} + ], + "overlayRoutes": [ + {"dst": "10.100.0.0/24", "via": "wg0"} + ] + } diff --git a/k8s/rbac.yaml b/k8s/rbac.yaml new file mode 100644 index 0000000..6e74665 --- /dev/null +++ b/k8s/rbac.yaml @@ -0,0 +1,45 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: kedge +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kedge-daemon + namespace: kedge +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kedge-daemon +rules: + # Watch ConfigMaps for mesh topology and YANG instance data. + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["get", "list", "watch"] + # Read NetworkAttachmentDefinitions. + - apiGroups: ["k8s.cni.cncf.io"] + resources: ["network-attachment-definitions"] + verbs: ["get", "list", "watch"] + # List pods for SVID mapping. + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list"] + # Read nodes for topology awareness. 
+ - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: kedge-daemon +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kedge-daemon +subjects: + - kind: ServiceAccount + name: kedge-daemon + namespace: kedge diff --git a/monitoring/grafana/dashboards/device-mutations.json b/monitoring/grafana/dashboards/device-mutations.json new file mode 100644 index 0000000..5a345ed --- /dev/null +++ b/monitoring/grafana/dashboards/device-mutations.json @@ -0,0 +1,18 @@ +{ + "dashboard": { + "title": "Kedge Device Mutations", + "uid": "kedge-device-mutations", + "panels": [ + { + "title": "Network Mutations (rate)", + "type": "graph", + "targets": [{"expr": "rate(kedge_quartermaster_network_mutations_total[5m])"}] + }, + { + "title": "Total Network Mutations", + "type": "stat", + "targets": [{"expr": "kedge_quartermaster_network_mutations_total"}] + } + ] + } +} diff --git a/monitoring/grafana/dashboards/drift-detection.json b/monitoring/grafana/dashboards/drift-detection.json new file mode 100644 index 0000000..c80ef46 --- /dev/null +++ b/monitoring/grafana/dashboards/drift-detection.json @@ -0,0 +1,20 @@ +{ + "dashboard": { + "title": "Kedge Drift Detection", + "uid": "kedge-drift-detection", + "description": "Compares YANG desired state to actual device config. 
Populated by underlay-audit playbook.", + "panels": [ + { + "title": "Managed VLAN Interfaces", + "type": "stat", + "targets": [{"expr": "kedge_vlan_interface_count"}] + }, + { + "title": "Drift Events", + "type": "graph", + "description": "Placeholder — requires drift detection metrics from audit pipeline", + "targets": [] + } + ] + } +} diff --git a/monitoring/grafana/dashboards/mesh-health.json b/monitoring/grafana/dashboards/mesh-health.json new file mode 100644 index 0000000..6de8ce6 --- /dev/null +++ b/monitoring/grafana/dashboards/mesh-health.json @@ -0,0 +1,18 @@ +{ + "dashboard": { + "title": "Kedge Mesh Health", + "uid": "kedge-mesh-health", + "panels": [ + { + "title": "Active WireGuard Peers", + "type": "stat", + "targets": [{"expr": "kedge_mesh_peer_count"}] + }, + { + "title": "Tunnel Status", + "type": "table", + "targets": [{"expr": "kedge_mesh_tunnel_up"}] + } + ] + } +} diff --git a/monitoring/grafana/dashboards/session-transits.json b/monitoring/grafana/dashboards/session-transits.json new file mode 100644 index 0000000..9b478a0 --- /dev/null +++ b/monitoring/grafana/dashboards/session-transits.json @@ -0,0 +1,18 @@ +{ + "dashboard": { + "title": "Kedge Session Transits", + "uid": "kedge-session-transits", + "panels": [ + { + "title": "Session Transits (rate)", + "type": "graph", + "targets": [{"expr": "rate(kedge_quartermaster_session_transits_total[5m])"}] + }, + { + "title": "Total Session Transits", + "type": "stat", + "targets": [{"expr": "kedge_quartermaster_session_transits_total"}] + } + ] + } +} diff --git a/monitoring/health-checks/checks.yml b/monitoring/health-checks/checks.yml new file mode 100644 index 0000000..9ff1bec --- /dev/null +++ b/monitoring/health-checks/checks.yml @@ -0,0 +1,25 @@ +--- +# Health check definitions for Kedge components. 
+ +checks: + - name: kedge-daemon-health + endpoint: http://localhost:9090/healthz + interval: 30s + timeout: 5s + expected_status: 200 + + - name: kedge-daemon-ready + endpoint: http://localhost:9090/readyz + interval: 30s + timeout: 5s + expected_status: 200 + + - name: wireguard-interface + command: wg show wg0 + interval: 60s + expect_exit_code: 0 + + - name: quartermaster-connectivity + command: grpcurl -plaintext quartermaster.guildhouse.svc:50051 list + interval: 120s + expect_exit_code: 0 diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml new file mode 100644 index 0000000..b2b2795 --- /dev/null +++ b/monitoring/prometheus/prometheus.yml @@ -0,0 +1,16 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + - job_name: kedge-daemon + static_configs: + - targets: ['localhost:9090'] + labels: + component: daemon + + - job_name: node-exporter + static_configs: + - targets: ['localhost:9100'] + labels: + component: node diff --git a/proto/quartermaster/v1/governance.proto b/proto/quartermaster/v1/governance.proto new file mode 100644 index 0000000..16ff7d3 --- /dev/null +++ b/proto/quartermaster/v1/governance.proto @@ -0,0 +1,119 @@ +syntax = "proto3"; + +package quartermaster.v1; + +import "google/protobuf/timestamp.proto"; + +// Governance service for intent lifecycle and SAT issuance. +service GovernanceService { + // Create a MutationIntent — called by application at user-request time. + rpc CreateIntent(CreateIntentRequest) returns (CreateIntentResponse); + + // Redeem a MutationIntent — called by worker at execution time. + rpc RedeemIntent(RedeemIntentRequest) returns (RedeemIntentResponse); + + // Revoke a MutationIntent — called to cancel pending authorization. + rpc RevokeIntent(RevokeIntentRequest) returns (RevokeIntentResponse); + + // Query intents for a tenant (admin/audit use). 
+ rpc ListIntents(ListIntentsRequest) returns (ListIntentsResponse); +} + +message CreateIntentRequest { + string registry_type = 1; + string verb = 2; + string artifact_scope = 3; + string tenant_id = 4; + + // Identity claim — one of these should be set. + oneof identity_claim { + string oidc_token = 5; + ExternalEventClaim external_event = 6; + } + + uint32 ttl_seconds = 7; + uint32 max_redemptions = 8; + string idempotency_key = 9; +} + +message ExternalEventClaim { + string source = 1; + string event_id = 2; + string event_type = 3; + string verification = 4; +} + +message CreateIntentResponse { + string intent_id = 1; + google.protobuf.Timestamp expires_at = 2; + bytes intent_hash = 3; + string error = 4; + bool denied = 5; + string denial_reason = 6; + // If a governance ceremony is required, this field contains the + // ceremony ID. The intent status is "ceremony_pending" and cannot + // be redeemed until the ceremony resolves. + string ceremony_id = 7; +} + +message RedeemIntentRequest { + string intent_id = 1; +} + +message RedeemIntentResponse { + bool success = 1; + SatToken sat = 2; + int32 remaining_redemptions = 3; + string status = 4; + string error = 5; +} + +message SatToken { + bytes sat_hash = 1; + string bearer_svid = 2; + repeated SatScopeMsg scopes = 3; + google.protobuf.Timestamp issued_at = 4; + google.protobuf.Timestamp expires_at = 5; + bytes signature = 6; + bytes sat_bytes = 7; +} + +message SatScopeMsg { + string registry_type = 1; + repeated string verbs = 2; + string resource_pattern = 3; +} + +message RevokeIntentRequest { + string intent_id = 1; +} + +message RevokeIntentResponse { + bool success = 1; + string error = 2; +} + +message ListIntentsRequest { + string tenant_id = 1; + string status_filter = 2; + int32 limit = 3; +} + +message ListIntentsResponse { + repeated IntentSummary intents = 1; +} + +message IntentSummary { + string intent_id = 1; + string registry_type = 2; + string verb = 3; + string artifact_scope = 4; + 
string tenant_id = 5; + string claim_type = 6; + string claim_subject = 7; + string status = 8; + int32 max_redemptions = 9; + int32 redeemed_count = 10; + google.protobuf.Timestamp authorized_at = 11; + google.protobuf.Timestamp expires_at = 12; +} diff --git a/proto/quartermaster/v1/notary.proto b/proto/quartermaster/v1/notary.proto new file mode 100644 index 0000000..cbabee1 --- /dev/null +++ b/proto/quartermaster/v1/notary.proto @@ -0,0 +1,48 @@ +syntax = "proto3"; +package quartermaster.v1; + +import "google/protobuf/timestamp.proto"; + +service QuartermasterNotary { + rpc CreateAnchor (CreateAnchorRequest) returns (CreateAnchorResponse); + rpc GetLatestAnchor (GetLatestAnchorRequest) returns (GetLatestAnchorResponse); + rpc VerifyInclusion (VerifyInclusionRequest) returns (VerifyInclusionResponse); +} + +message CreateAnchorRequest { + string cluster_id = 1; + repeated bytes leaves = 2; + int64 etcd_revision = 3; // 0 means not set +} + +message CreateAnchorResponse { + string anchor_id = 1; + bytes merkle_root = 2; + bytes previous_root = 3; + int32 leaf_count = 4; + google.protobuf.Timestamp time = 5; +} + +message GetLatestAnchorRequest { + string cluster_id = 1; +} + +message GetLatestAnchorResponse { + string anchor_id = 1; + string cluster_id = 2; + bytes merkle_root = 3; + bytes previous_root = 4; + int64 etcd_revision = 5; + int32 leaf_count = 6; + google.protobuf.Timestamp time = 7; +} + +message VerifyInclusionRequest { + string anchor_id = 1; + bytes leaf = 2; + repeated bytes proof = 3; +} + +message VerifyInclusionResponse { + bool valid = 1; +} diff --git a/proto/quartermaster/v1/registry.proto b/proto/quartermaster/v1/registry.proto new file mode 100644 index 0000000..54c2cf1 --- /dev/null +++ b/proto/quartermaster/v1/registry.proto @@ -0,0 +1,58 @@ +syntax = "proto3"; +package quartermaster.v1; + +import "google/protobuf/timestamp.proto"; +import "google/protobuf/struct.proto"; + +service QuartermasterRegistry { + rpc RegisterCluster 
(RegisterClusterRequest) returns (RegisterClusterResponse); + rpc GetCluster (GetClusterRequest) returns (GetClusterResponse); + rpc ListClusters (ListClustersRequest) returns (ListClustersResponse); + rpc UpdateClusterMetadata (UpdateClusterMetadataRequest) returns (UpdateClusterMetadataResponse); +} + +message RegisterClusterRequest { + string cluster_name = 1; + string trust_domain = 2; + string role = 3; // "authority", "delegate", "peer" + string provisioned_by = 4; // optional cluster_id +} + +message RegisterClusterResponse { + string cluster_id = 1; + string cluster_name = 2; + string trust_domain = 3; + string role = 4; + google.protobuf.Timestamp provisioned_at = 5; +} + +message GetClusterRequest { + string cluster_id = 1; +} + +message GetClusterResponse { + string cluster_id = 1; + string cluster_name = 2; + string trust_domain = 3; + string role = 4; + string provisioned_by = 5; + google.protobuf.Timestamp provisioned_at = 6; + string rmm_endpoint = 7; + google.protobuf.Struct metadata = 8; +} + +message ListClustersRequest {} + +message ListClustersResponse { + repeated GetClusterResponse clusters = 1; +} + +message UpdateClusterMetadataRequest { + string cluster_id = 1; + google.protobuf.Struct metadata = 2; +} + +message UpdateClusterMetadataResponse { + string cluster_id = 1; + google.protobuf.Struct metadata = 2; +} diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh new file mode 100755 index 0000000..a4cfd88 --- /dev/null +++ b/scripts/bootstrap.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Bootstrap a new Kedge site. +# Usage: ./bootstrap.sh + +SITE="${1:?Usage: $0 }" + +echo "=== Kedge Bootstrap: ${SITE} ===" + +case "${SITE}" in + cloud) + echo "Bootstrapping cloud anchor (overlay only)..." + cd "$(dirname "$0")/../ansible" + ansible-playbook -i inventory/hosts.yml playbooks/bootstrap-cloud.yml + ;; + homelab) + echo "Bootstrapping homelab (overlay + underlay)..." 
#!/usr/bin/env bash
set -euo pipefail

# Generate WireGuard keypair for a new site.
# Usage: ./generate-wireguard-keys.sh [output-dir]
#
# Writes wg-private.key (mode 600) and wg-public.key (mode 644) into the
# output directory (default: /etc/kedge). Existing key files are overwritten.

OUTPUT_DIR="${1:-/etc/kedge}"

echo "Generating WireGuard keypair in ${OUTPUT_DIR}..."

mkdir -p "${OUTPUT_DIR}"

# Restrict permissions *before* the private key is written. Without this the
# file is created with the caller's default umask (typically world-readable)
# and only tightened by the chmod below, leaving a window in which other
# local users could read the key.
umask 077

wg genkey | tee "${OUTPUT_DIR}/wg-private.key" | wg pubkey > "${OUTPUT_DIR}/wg-public.key"

# Explicit modes: also corrects permissions on pre-existing files, and makes
# the public key readable again after the restrictive umask above.
chmod 600 "${OUTPUT_DIR}/wg-private.key"
chmod 644 "${OUTPUT_DIR}/wg-public.key"

echo "Private key: ${OUTPUT_DIR}/wg-private.key"
echo "Public key:  $(cat "${OUTPUT_DIR}/wg-public.key")"
+ +homelab_wg_pubkey = "PLACEHOLDER" +homelab_wan_ip = "PLACEHOLDER" +cloud_anchor_wg_pubkey = "PLACEHOLDER" +cloud_anchor_ip = "PLACEHOLDER" diff --git a/terraform/environments/production/main.tf b/terraform/environments/production/main.tf new file mode 100644 index 0000000..1e6f0f6 --- /dev/null +++ b/terraform/environments/production/main.tf @@ -0,0 +1,24 @@ +module "cloud_anchor" { + source = "../../modules/cloud-anchor" + + server_name = "kedge-anchor-prod" + server_type = "cpx11" + location = "ash" + ssh_keys = var.ssh_keys + admin_ips = var.admin_ips +} + +module "dns" { + source = "../../modules/dns" + + zone_id = var.cloudflare_zone_id + anchor_hostname = "anchor" + anchor_ip = module.cloud_anchor.server_ip +} + +module "wireguard_topology" { + source = "../../modules/wireguard-topology" + + sites = var.sites + output_dir = "${path.module}/output" +} diff --git a/terraform/environments/production/terraform.tfvars b/terraform/environments/production/terraform.tfvars new file mode 100644 index 0000000..ac34ea2 --- /dev/null +++ b/terraform/environments/production/terraform.tfvars @@ -0,0 +1,8 @@ +# Production environment configuration. +# Populate with actual values — never commit secrets. 
+ +ssh_keys = ["PLACEHOLDER"] +admin_ips = ["PLACEHOLDER/32"] +cloudflare_zone_id = "PLACEHOLDER" + +sites = [] diff --git a/terraform/modules/cloud-anchor/main.tf b/terraform/modules/cloud-anchor/main.tf new file mode 100644 index 0000000..7bb6327 --- /dev/null +++ b/terraform/modules/cloud-anchor/main.tf @@ -0,0 +1,59 @@ +terraform { + required_providers { + hcloud = { + source = "hetznercloud/hcloud" + version = "~> 1.45" + } + } +} + +resource "hcloud_server" "anchor" { + name = var.server_name + server_type = var.server_type + image = var.image + location = var.location + + ssh_keys = var.ssh_keys + + labels = { + role = "cloud-anchor" + cluster = "cloud-anchor" + kedge = "overlay-only" + } + + user_data = <<-EOF + #!/bin/bash + apt-get update + apt-get install -y wireguard-tools + EOF +} + +resource "hcloud_firewall" "anchor" { + name = "${var.server_name}-fw" + + rule { + direction = "in" + protocol = "tcp" + port = "22" + source_ips = var.admin_ips + } + + rule { + direction = "in" + protocol = "udp" + port = var.wireguard_port + source_ips = ["0.0.0.0/0", "::/0"] + } + + rule { + direction = "in" + protocol = "tcp" + port = "6443" + source_ips = ["10.100.0.0/24"] + } +} + +resource "hcloud_firewall_attachment" "anchor" { + firewall_id = hcloud_firewall.anchor.id + server_ids = [hcloud_server.anchor.id] +} diff --git a/terraform/modules/cloud-anchor/outputs.tf b/terraform/modules/cloud-anchor/outputs.tf new file mode 100644 index 0000000..05c9b87 --- /dev/null +++ b/terraform/modules/cloud-anchor/outputs.tf @@ -0,0 +1,9 @@ +output "server_ip" { + description = "Public IPv4 address of the cloud anchor" + value = hcloud_server.anchor.ipv4_address +} + +output "server_id" { + description = "Hetzner server ID" + value = hcloud_server.anchor.id +} diff --git a/terraform/modules/cloud-anchor/variables.tf b/terraform/modules/cloud-anchor/variables.tf new file mode 100644 index 0000000..99e4ea0 --- /dev/null +++ b/terraform/modules/cloud-anchor/variables.tf @@ -0,0 
+1,39 @@ +variable "server_name" { + description = "Name of the cloud anchor VPS" + type = string + default = "kedge-anchor-01" +} + +variable "server_type" { + description = "Hetzner server type (2 vCPU minimum)" + type = string + default = "cpx11" +} + +variable "image" { + description = "OS image" + type = string + default = "ubuntu-24.04" +} + +variable "location" { + description = "Hetzner datacenter location" + type = string + default = "ash" +} + +variable "ssh_keys" { + description = "SSH key names for access" + type = list(string) +} + +variable "admin_ips" { + description = "Admin IP CIDRs for SSH access" + type = list(string) +} + +variable "wireguard_port" { + description = "WireGuard listen port" + type = string + default = "51820" +} diff --git a/terraform/modules/dns/main.tf b/terraform/modules/dns/main.tf new file mode 100644 index 0000000..082b6fb --- /dev/null +++ b/terraform/modules/dns/main.tf @@ -0,0 +1,17 @@ +terraform { + required_providers { + cloudflare = { + source = "cloudflare/cloudflare" + version = "~> 4.0" + } + } +} + +resource "cloudflare_record" "anchor" { + zone_id = var.zone_id + name = var.anchor_hostname + content = var.anchor_ip + type = "A" + ttl = 300 + proxied = false +} diff --git a/terraform/modules/dns/outputs.tf b/terraform/modules/dns/outputs.tf new file mode 100644 index 0000000..90b14cb --- /dev/null +++ b/terraform/modules/dns/outputs.tf @@ -0,0 +1,4 @@ +output "fqdn" { + description = "Fully qualified domain name of the anchor" + value = cloudflare_record.anchor.hostname +} diff --git a/terraform/modules/dns/variables.tf b/terraform/modules/dns/variables.tf new file mode 100644 index 0000000..0c93105 --- /dev/null +++ b/terraform/modules/dns/variables.tf @@ -0,0 +1,15 @@ +variable "zone_id" { + description = "Cloudflare zone ID" + type = string +} + +variable "anchor_hostname" { + description = "DNS hostname for the cloud anchor" + type = string + default = "anchor" +} + +variable "anchor_ip" { + description = 
"Public IP of the cloud anchor" + type = string +} diff --git a/terraform/modules/wireguard-topology/main.tf b/terraform/modules/wireguard-topology/main.tf new file mode 100644 index 0000000..67aa5b7 --- /dev/null +++ b/terraform/modules/wireguard-topology/main.tf @@ -0,0 +1,38 @@ +terraform { + required_providers { + local = { + source = "hashicorp/local" + version = "~> 2.5" + } + } +} + +# Declarative WireGuard mesh peer relationships. +# Generates peer configuration for each node in the mesh. + +locals { + peer_pairs = flatten([ + for i, site_a in var.sites : [ + for j, site_b in var.sites : { + from = site_a + to = site_b + } if i < j + ] + ]) +} + +resource "local_file" "peer_config" { + for_each = { for site in var.sites : site.name => site } + + filename = "${var.output_dir}/${each.key}-peers.json" + content = jsonencode({ + site_id = each.value.name + peers = [ + for site in var.sites : { + public_key = site.public_key + endpoint = site.endpoint + allowed_ips = site.allowed_ips + } if site.name != each.key + ] + }) +} diff --git a/terraform/modules/wireguard-topology/outputs.tf b/terraform/modules/wireguard-topology/outputs.tf new file mode 100644 index 0000000..d2b879d --- /dev/null +++ b/terraform/modules/wireguard-topology/outputs.tf @@ -0,0 +1,4 @@ +output "peer_config_files" { + description = "Map of site name to peer config file path" + value = { for k, v in local_file.peer_config : k => v.filename } +} diff --git a/terraform/modules/wireguard-topology/variables.tf b/terraform/modules/wireguard-topology/variables.tf new file mode 100644 index 0000000..fd2729d --- /dev/null +++ b/terraform/modules/wireguard-topology/variables.tf @@ -0,0 +1,15 @@ +variable "sites" { + description = "List of sites in the WireGuard mesh" + type = list(object({ + name = string + public_key = string + endpoint = string + allowed_ips = list(string) + })) +} + +variable "output_dir" { + description = "Directory to write peer configuration files" + type = string + default = 
"./output" +} diff --git a/yang/compiler/__init__.py b/yang/compiler/__init__.py new file mode 100644 index 0000000..473a0f4 diff --git a/yang/compiler/compile.py b/yang/compiler/compile.py new file mode 100644 index 0000000..e7044ca --- /dev/null +++ b/yang/compiler/compile.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +""" +YANG compiler entry point. + +Loads the sovereign-sdwan YANG schema, validates site instance data, +and dispatches to vendor-specific compiler targets to produce +device configuration payloads. +""" + +import argparse +import json +import sys +from pathlib import Path +from typing import Any + +from lxml import etree + +from . import to_fortios, to_unifi, to_vyos + +YANG_MODELS_DIR = Path(__file__).parent.parent / "models" +NAMESPACE = "urn:sovereign:sdwan" + + +def parse_site_config(path: str) -> etree._Element: + """Parse and return the root element of a site config XML file.""" + tree = etree.parse(path) + return tree.getroot() + + +def extract_zones(root: etree._Element) -> list[dict[str, Any]]: + """Extract zone-policy entries from the site config.""" + ns = {"s": NAMESPACE} + zones = [] + + for zone_elem in root.findall(".//s:zone-policy/s:zone", ns): + zone: dict[str, Any] = { + "name": zone_elem.findtext("s:name", default="", namespaces=ns), + "subnet": zone_elem.findtext("s:subnet", default="", namespaces=ns), + "vlan_id": int(zone_elem.findtext("s:vlan-id", default="0", namespaces=ns)), + "owner_device": zone_elem.findtext("s:owner-device", default="", namespaces=ns), + "policies": [], + } + + for policy_elem in zone_elem.findall("s:policy", ns): + policy = { + "dst_zone": policy_elem.findtext("s:dst-zone", default="", namespaces=ns), + "action": policy_elem.findtext("s:action", default="deny", namespaces=ns), + "services": [ + svc.text for svc in policy_elem.findall("s:services", ns) if svc.text + ], + } + zone["policies"].append(policy) + + zones.append(zone) + + return zones + + +def extract_circuits(root: etree._Element) -> 
list[dict[str, Any]]: + """Extract WAN circuit entries from the site config.""" + ns = {"s": NAMESPACE} + circuits = [] + + for circuit_elem in root.findall(".//s:wan-circuits/s:circuit", ns): + circuit: dict[str, Any] = { + "name": circuit_elem.findtext("s:name", default="", namespaces=ns), + "type": circuit_elem.findtext("s:type", default="", namespaces=ns), + "interface": circuit_elem.findtext("s:interface-name", default="", namespaces=ns), + } + + sla_elem = circuit_elem.find("s:sla", ns) + if sla_elem is not None: + circuit["sla"] = { + "latency_ms": int(sla_elem.findtext("s:latency-target-ms", default="0", namespaces=ns)), + "jitter_ms": int(sla_elem.findtext("s:jitter-target-ms", default="0", namespaces=ns)), + "loss_pct": float(sla_elem.findtext("s:loss-target-pct", default="0", namespaces=ns)), + } + + circuits.append(circuit) + + return circuits + + +def compile_site(site_config_path: str, output_format: str = "json") -> dict[str, Any]: + """ + Compile a site configuration into vendor-specific payloads. + + Returns a dict mapping device type to its compiled payloads. + """ + root = parse_site_config(site_config_path) + zones = extract_zones(root) + circuits = extract_circuits(root) + + result: dict[str, Any] = {"site_config": site_config_path, "devices": {}} + + # Group zones by owner device and compile per-device payloads. 
def compile_zones(zones: list[dict[str, Any]]) -> dict[str, Any]:
    """
    Compile zone definitions into FortiOS REST API payloads.

    Args:
        zones: Zone dicts carrying ``name``, ``subnet``, ``vlan_id`` and an
            optional ``policies`` list (each policy has ``dst_zone``,
            ``action`` and optionally ``services``).

    Returns:
        A dict with ``target``/``api_version`` metadata plus ``zones``,
        ``interfaces`` and ``policies`` lists; every list entry holds the
        ``path``, ``method`` and ``body`` of one request.

    Raises:
        ValueError: If a zone's ``subnet`` is not a valid IPv4 prefix.
    """
    result: dict[str, Any] = {
        "target": "fortios",
        "api_version": "v2",
        "zones": [],
        "interfaces": [],
        "policies": [],
    }

    for zone in zones:
        # The zone object and the VLAN interface refer to the same name.
        vlan_iface = f"vlan{zone['vlan_id']}"

        # FortiOS zone object.
        result["zones"].append({
            "path": "system/zone",
            "method": "POST",
            "body": {
                "name": zone["name"],
                "interface": [{"interface-name": vlan_iface}],
            },
        })

        # FortiOS VLAN interface.
        result["interfaces"].append({
            "path": "system/interface",
            "method": "POST",
            "body": {
                "name": vlan_iface,
                "type": "vlan",
                "vlanid": zone["vlan_id"],
                "ip": _subnet_to_ip_mask(zone["subnet"]),
                "allowaccess": "ping",
                "role": "lan",
            },
        })

        # FortiOS firewall policies for inter-zone traffic.
        for policy in zone.get("policies", []):
            services = policy.get("services")
            result["policies"].append({
                "path": "firewall/policy",
                "method": "POST",
                "body": {
                    "srcintf": [{"name": zone["name"]}],
                    "dstintf": [{"name": policy["dst_zone"]}],
                    "srcaddr": [{"name": "all"}],
                    "dstaddr": [{"name": "all"}],
                    "action": _map_action(policy["action"]),
                    "schedule": "always",
                    "logtraffic": "all",
                    # NOTE(review): a policy without services compiles to
                    # service ALL, even for allow-restricted - confirm that
                    # this is the intended meaning.
                    "service": (
                        [{"name": svc.upper()} for svc in services]
                        if services
                        else [{"name": "ALL"}]
                    ),
                },
            })

    return result


def _map_action(yang_action: str) -> str:
    """Map YANG action enum to FortiOS action string.

    Unknown actions map to ``"deny"`` so an unexpected value never opens
    traffic.
    """
    mapping = {
        "allow-stateful": "accept",
        "allow-restricted": "accept",
        "deny": "deny",
    }
    return mapping.get(yang_action, "deny")


def _subnet_to_ip_mask(subnet: str) -> str:
    """Convert CIDR notation to FortiOS ``"<ip> <netmask>"`` format.

    The interface IP is the first usable host of the network. Host bits in
    *subnet* are ignored (``strict=False``).

    Raises:
        ValueError: If *subnet* is not a valid IPv4 prefix.
    """
    import ipaddress

    network = ipaddress.IPv4Network(subnet, strict=False)
    # hosts() is a generator for ordinary networks but returns a plain list
    # for a /32 (and yields nothing there on older interpreters), so a bare
    # next() crashes on a single-address prefix. iter() plus a default covers
    # every prefix length; the fallback is the network's only address.
    first_host = next(iter(network.hosts()), network.network_address)
    return f"{first_host} {network.netmask}"
+ +Transforms YANG zone-policy data into UniFi Controller API payloads +for network and firewall rule configuration. +""" + +from typing import Any + + +def compile_zones(zones: list[dict[str, Any]]) -> dict[str, Any]: + """ + Compile zone definitions into UniFi Controller API payloads. + + Returns a dict with 'networks' and 'firewall_rules' keys, + each containing UniFi API request bodies. + """ + result: dict[str, Any] = { + "target": "unifi", + "api_version": "v1", + "networks": [], + "firewall_rules": [], + } + + for zone in zones: + import ipaddress + + network = ipaddress.IPv4Network(zone["subnet"], strict=False) + + # UniFi network (VLAN) configuration. + result["networks"].append({ + "endpoint": "/api/s/default/rest/networkconf", + "method": "POST", + "body": { + "name": zone["name"], + "purpose": "corporate", + "vlan_enabled": True, + "vlan": zone["vlan_id"], + "ip_subnet": zone["subnet"], + "networkgroup": "LAN", + "dhcpd_enabled": False, + "domain_name": f"{zone['name']}.kedge.local", + }, + }) + + # UniFi firewall rules for inter-zone traffic. + for i, policy in enumerate(zone.get("policies", [])): + rule: dict[str, Any] = { + "endpoint": "/api/s/default/rest/firewallrule", + "method": "POST", + "body": { + "name": f"{zone['name']}-to-{policy['dst_zone']}", + "enabled": True, + "ruleset": "LAN_IN", + "rule_index": 2000 + (zone["vlan_id"] * 10) + i, + "action": _map_action(policy["action"]), + "protocol": "all", + "src_networkconf_type": "NETv4", + "src_address": zone["subnet"], + }, + } + + if policy.get("services"): + # Map service names to port numbers for UniFi. 
+ ports = [_service_to_port(svc) for svc in policy["services"]] + ports = [p for p in ports if p] + if ports: + rule["body"]["dst_port"] = ",".join(ports) + rule["body"]["protocol"] = "tcp_udp" + + result["firewall_rules"].append(rule) + + return result + + +def _map_action(yang_action: str) -> str: + """Map YANG action enum to UniFi firewall action.""" + mapping = { + "allow-stateful": "accept", + "allow-restricted": "accept", + "deny": "drop", + } + return mapping.get(yang_action, "drop") + + +def _service_to_port(service: str) -> str | None: + """Map common service names to port numbers.""" + port_map = { + "ssh": "22", + "http": "80", + "https": "443", + "dns": "53", + "ntp": "123", + "snmp": "161", + } + return port_map.get(service.lower()) diff --git a/yang/compiler/to_vyos.py b/yang/compiler/to_vyos.py new file mode 100644 index 0000000..3a5e5ed --- /dev/null +++ b/yang/compiler/to_vyos.py @@ -0,0 +1,98 @@ +""" +VyOS NETCONF XML generator. + +Transforms YANG zone-policy data into VyOS NETCONF edit-config XML +payloads for interface, firewall zone, and static route configuration. +""" + +from typing import Any + +from lxml import etree + + +def compile_zones(zones: list[dict[str, Any]]) -> dict[str, Any]: + """ + Compile zone definitions into VyOS NETCONF XML payloads. + + Returns a dict with 'edit_configs' containing XML strings + for NETCONF edit-config operations. + """ + result: dict[str, Any] = { + "target": "vyos", + "protocol": "netconf", + "edit_configs": [], + } + + for zone in zones: + # VyOS interface configuration. + iface_xml = _build_interface_config(zone) + result["edit_configs"].append({ + "description": f"Configure VLAN {zone['vlan_id']} interface for zone {zone['name']}", + "xml": etree.tostring(iface_xml, pretty_print=True).decode(), + }) + + # VyOS zone-policy firewall rules. 
def _build_interface_config(zone: dict[str, Any]) -> etree._Element:
    """Build VyOS NETCONF XML for a VLAN interface.

    The interface address is the first usable host of the zone subnet with
    the subnet's prefix length (the same convention the FortiOS target uses).
    The zone subnet is a network prefix, which is not a usable interface
    address on its own.

    Args:
        zone: Zone dict carrying ``name``, ``subnet`` and ``vlan_id``.

    Returns:
        A ``config`` element holding the ethernet/vif definition.

    Raises:
        ValueError: If ``zone["subnet"]`` is not a valid IPv4 prefix.
    """
    import ipaddress

    network = ipaddress.IPv4Network(zone["subnet"], strict=False)
    # iter() + default: hosts() is not an iterator for a /32.
    gateway = next(iter(network.hosts()), network.network_address)

    config = etree.Element("config")
    interfaces = etree.SubElement(config, "interfaces")
    ethernet = etree.SubElement(interfaces, "ethernet")

    # VyOS VLAN sub-interface: ethX vif <vlan-id>.
    tagnode = etree.SubElement(ethernet, "tagnode")
    tagnode.text = "eth0"  # Parent interface - configurable per-site.

    vif = etree.SubElement(ethernet, "vif")
    vif_tagnode = etree.SubElement(vif, "tagnode")
    vif_tagnode.text = str(zone["vlan_id"])

    address = etree.SubElement(vif, "address")
    address.text = f"{gateway}/{network.prefixlen}"

    description = etree.SubElement(vif, "description")
    description.text = f"Kedge managed: {zone['name']}"

    return config


def _build_firewall_rule(src_zone: str, policy: dict[str, Any]) -> etree._Element:
    """Build VyOS NETCONF XML for a zone-policy firewall rule.

    VyOS attaches a rule-set as ``zone <destination> from <source>``, so the
    policy's destination zone is the ``zone`` entry and *src_zone* goes under
    ``from``. The rule-set keeps the ``<src>-to-<dst>`` name.

    Args:
        src_zone: Name of the zone the traffic originates from.
        policy: Policy dict carrying ``dst_zone`` and ``action``.

    Returns:
        A ``config`` element holding the zone-policy definition.
    """
    dst_zone = policy["dst_zone"]

    config = etree.Element("config")
    zone_policy = etree.SubElement(config, "zone-policy")
    zone = etree.SubElement(zone_policy, "zone")

    # The zone being protected is the traffic's destination.
    name = etree.SubElement(zone, "name")
    name.text = dst_zone

    from_elem = etree.SubElement(zone, "from")
    from_zone = etree.SubElement(from_elem, "zone")
    from_zone.text = src_zone

    firewall = etree.SubElement(from_elem, "firewall")
    fw_name = etree.SubElement(firewall, "name")
    fw_name.text = f"{src_zone}-to-{dst_zone}"

    action = etree.SubElement(firewall, "default-action")
    action.text = _map_action(policy["action"])

    return config


def _map_action(yang_action: str) -> str:
    """Map YANG action enum to VyOS firewall action.

    Unknown actions map to ``"drop"`` so an unexpected value never opens
    traffic.
    """
    mapping = {
        "allow-stateful": "accept",
        "allow-restricted": "accept",
        "deny": "drop",
    }
    return mapping.get(yang_action, "drop")
inet:ipv4-prefix; + description "IPv4 subnet for this zone."; + } + + leaf vlan-id { + type uint16 { + range "1..4094"; + } + description "VLAN ID associated with this zone."; + } + + leaf owner-device { + type string; + description + "Device responsible for this zone's gateway + (e.g., fortigate.transit.local)."; + } + + list policy { + key "dst-zone"; + description "Inter-zone policy rules."; + + leaf dst-zone { + type string; + description "Destination zone name."; + } + + leaf action { + type enumeration { + enum allow-stateful { + description "Allow stateful traffic."; + } + enum allow-restricted { + description "Allow restricted set of services."; + } + enum deny { + description "Deny all traffic."; + } + } + description "Policy action for traffic to the destination zone."; + } + + leaf-list services { + type string; + description + "Allowed services when action is allow-restricted + (e.g., ssh, https, dns)."; + } + } + } + } + + container wan-circuits { + description "WAN circuit definitions for SD-WAN path selection."; + + list circuit { + key "name"; + description "A WAN circuit available at this site."; + + leaf name { + type string; + description "Circuit name (e.g., fios-primary, lte-backup)."; + } + + leaf type { + type enumeration { + enum primary { + description "Primary WAN circuit."; + } + enum secondary { + description "Secondary/backup WAN circuit."; + } + enum oob { + description "Out-of-band management circuit."; + } + } + description "Circuit role in the WAN topology."; + } + + leaf interface-name { + type string; + description "Network interface name for this circuit."; + } + + container sla { + description "SLA targets for path selection."; + + leaf latency-target-ms { + type uint32; + description "Maximum acceptable latency in milliseconds."; + } + + leaf jitter-target-ms { + type uint32; + description "Maximum acceptable jitter in milliseconds."; + } + + leaf loss-target-pct { + type decimal64 { + fraction-digits 2; + } + description 
"Maximum acceptable packet loss percentage."; + } + } + } + } +} diff --git a/yang/requirements.txt b/yang/requirements.txt new file mode 100644 index 0000000..695e05e --- /dev/null +++ b/yang/requirements.txt @@ -0,0 +1,4 @@ +pyang>=2.6.0 +ncclient>=0.6.15 +lxml>=5.0.0 +pytest>=8.0.0 diff --git a/yang/site-config/cloud-anchor.xml b/yang/site-config/cloud-anchor.xml new file mode 100644 index 0000000..b3759bf --- /dev/null +++ b/yang/site-config/cloud-anchor.xml @@ -0,0 +1,29 @@ + + + + + cloud-anchor + guildhouse.local + + + PLACEHOLDER_CLOUD_PUBKEY + + + + + + + + + vps-primary + primary + eth0 + + 50 + 10 + 0.50 + + + + + diff --git a/yang/site-config/homelab.xml b/yang/site-config/homelab.xml new file mode 100644 index 0000000..6c48541 --- /dev/null +++ b/yang/site-config/homelab.xml @@ -0,0 +1,101 @@ + + + + + homelab + guildhouse.local + + + PLACEHOLDER_HOMELAB_PUBKEY + + cloud-anchor-1 + anchor.guildhouse.example.com:51820 + 10 + + + + + + + transit + 172.16.0.0/24 + 100 + fortigate.transit.local + + tyler-lab + allow-stateful + + + dmz + allow-restricted + https + dns + + + + + tyler-lab + 10.0.1.0/24 + 10 + fortigate.transit.local + + transit + allow-stateful + + + shared + allow-restricted + https + ssh + + + + + roommate + 192.168.1.0/24 + 20 + udr7.local + + shared + allow-stateful + + + tyler-lab + deny + + + + + shared + 192.168.30.0/24 + 30 + udr7.local + + + + dmz + 192.168.50.0/24 + 50 + fortigate.transit.local + + transit + allow-restricted + https + + + + + + + fios-primary + primary + eth0 + + 20 + 5 + 0.10 + + + + + diff --git a/yang/tests/__init__.py b/yang/tests/__init__.py new file mode 100644 index 0000000..473a0f4 diff --git a/yang/tests/test_compiler.py b/yang/tests/test_compiler.py new file mode 100644 index 0000000..bbf994f --- /dev/null +++ b/yang/tests/test_compiler.py @@ -0,0 +1,79 @@ +"""Tests for the YANG site config compiler.""" + +from pathlib import Path + +from compiler.compile import compile_site, extract_circuits, 
FIXTURES_DIR = Path(__file__).parent.parent / "site-config"


def _load_root(fixture_name: str):
    """Parse a fixture from FIXTURES_DIR and return its root element."""
    return parse_site_config(str(FIXTURES_DIR / fixture_name))


class TestParseHomelab:
    """Parsing checks against the homelab site configuration."""

    def test_parse_homelab_zones(self) -> None:
        zones = extract_zones(_load_root("homelab.xml"))

        assert len(zones) == 5

        found = {zone["name"] for zone in zones}
        for expected in ("transit", "tyler-lab", "roommate", "shared", "dmz"):
            assert expected in found

    def test_transit_zone_properties(self) -> None:
        zones = extract_zones(_load_root("homelab.xml"))

        # Raises StopIteration (failing the test) if the zone is missing.
        transit = next(zone for zone in zones if zone["name"] == "transit")
        assert transit["subnet"] == "172.16.0.0/24"
        assert transit["vlan_id"] == 100
        assert transit["owner_device"] == "fortigate.transit.local"
        assert len(transit["policies"]) == 2

    def test_parse_circuits(self) -> None:
        circuits = extract_circuits(_load_root("homelab.xml"))

        assert len(circuits) == 1
        primary = circuits[0]
        assert primary["name"] == "fios-primary"
        assert primary["type"] == "primary"
        assert primary["sla"]["latency_ms"] == 20


class TestParseCloudAnchor:
    """Parsing checks against the cloud anchor site configuration."""

    def test_cloud_anchor_no_zones(self) -> None:
        zones = extract_zones(_load_root("cloud-anchor.xml"))
        assert len(zones) == 0

    def test_cloud_anchor_has_circuit(self) -> None:
        circuits = extract_circuits(_load_root("cloud-anchor.xml"))
        assert len(circuits) == 1
        assert circuits[0]["name"] == "vps-primary"


class TestCompileSite:
    """End-to-end checks of the site compilation pipeline."""

    def test_compile_homelab(self) -> None:
        compiled = compile_site(str(FIXTURES_DIR / "homelab.xml"))

        assert "devices" in compiled
        # At least one FortiGate device must have compiled zones.
        assert any("fortigate" in name.lower() for name in compiled["devices"])

    def test_compile_cloud_anchor(self) -> None:
        compiled = compile_site(str(FIXTURES_DIR / "cloud-anchor.xml"))

        # Cloud anchor has no underlay devices, only its WAN circuit.
        assert len(compiled["devices"]) == 0
        assert len(compiled.get("circuits", [])) == 1