guildhouse-spire-plugins/pkg/governance/governance.go
Tyler J King 83b1264ebc governance: lazy connect + exponential reconnect backoff
NewClient no longer returns an error when Quartermaster is unreachable.
grpc.DialContext without WithBlock is already non-blocking; the prior
10s timeout context was effectively a no-op. Removing it and adding
explicit ConnectParams (BaseDelay 1s, Multiplier 1.5, Jitter 0.2,
MaxDelay 30s, MinConnectTimeout 20s) makes the intended behavior
explicit: the gRPC ClientConn retries connection in the background
with exponential backoff, and RPCs return Unavailable until QM is up.

The governance-notifier and substrate-keymanager plugins already log
RPC errors via handleEvent and continue without aborting the SPIRE
operation, so no call-site changes are needed. This unblocks SPIRE
bootstrap when Quartermaster hasn't been deployed yet, breaking the
SPIRE <-> QM circular deployment dependency.

Added watchConnState helper that logs once per transition so operators
see at SPIRE startup whether QM is reachable: a single WARN-style line
when the connection is not yet Ready, and an INFO line when it becomes
Ready. conn.Connect() is called eagerly so those logs fire at plugin
load rather than waiting for the first RPC.

Deferred:
- Add a unit test for NewClient succeeding with an unreachable address
  (existing TestNewClientAcceptsTLSConfig is a pre-existing failure
  using placeholder cert paths; unrelated to this change).

Signed-off-by: Tyler J King <tking@guildhouse.dev>
2026-04-22 11:53:36 -04:00

411 lines
14 KiB
Go

// Package governance provides a gRPC client for the Guildhouse GovernanceService
// and CeremonyService, used by SPIRE plugins to participate in governed mutations.
package governance
import (
"context"
"crypto/sha256"
"crypto/tls"
"crypto/x509"
"encoding/hex"
"encoding/json"
"fmt"
"log"
"os"
"time"
pb "github.com/guildhouse-cooperative/guildhouse-spire-plugins/gen/quartermaster/v1"
"google.golang.org/grpc"
"google.golang.org/grpc/backoff"
"google.golang.org/grpc/connectivity"
"google.golang.org/grpc/credentials"
"google.golang.org/grpc/credentials/insecure"
)
// Config is the set of knobs NewClient consumes to build a governance client.
type Config struct {
	// gRPC address of the GovernanceService. NewClient rejects an empty value.
	GovernanceAddr string

	// gRPC address of the CeremonyService. NewClient does not read this field.
	CeremonyAddr string

	// gRPC address of the NotaryService. When empty, NewClient uses
	// GovernanceAddr for the notary as well.
	NotaryAddr string

	// TLS configuration — REQUIRED for production.
	// Uses SPIFFE-aware mTLS: the plugin's own SVID authenticates
	// to Quartermaster services. TLS is only enabled when all three
	// paths below are non-empty.

	// Path to the X.509 SVID certificate.
	TLSCertPath string
	// Path to the SVID private key.
	TLSKeyPath string
	// Path to the trust bundle (PEM-encoded CA certificates).
	TLSCAPath string
	// When true, NewClient fails unless all three TLS paths are set.
	TLSRequired bool
}
// IntentResult is what CreateIntent hands back: a field-for-field copy of
// the GovernanceService response.
type IntentResult struct {
	// IntentID is the intent identifier from the response.
	IntentID string
	// CeremonyID is non-empty if a ceremony is required.
	CeremonyID string
	// Denied is the Denied flag from the response.
	Denied bool
	// Error is the error text from the response.
	Error string
}
// RedeemResult is what RedeemIntent hands back.
type RedeemResult struct {
	// Success is the Success flag from the response.
	Success bool
	// SatHash is the SAT hash from the response, if a SAT was returned.
	SatHash []byte
	// SatBytes holds the raw SAT bytes for downstream verification.
	SatBytes []byte
	// ExpiresAt is the SAT expiry — consumers MUST check before use.
	// It stays at the zero time when the response carried no expiry.
	ExpiresAt time.Time
	// Status is the status text from the response.
	Status string
	// Error is the error text from the response.
	Error string
}

// IsExpired reports whether the SAT's expiry lies in the past.
// A zero ExpiresAt is treated as "no expiry recorded" and is never expired.
func (r *RedeemResult) IsExpired() bool {
	if r.ExpiresAt.IsZero() {
		return false
	}
	return time.Now().After(r.ExpiresAt)
}
// CredentialEvent is a credential lifecycle event destined for merkle
// anchoring. Its CredentialFingerprint ties the merkle leaf to one specific
// credential, which prevents proof replay across certificates (S-03).
type CredentialEvent struct {
	// EventType is one of "issue", "rotate", "revoke".
	EventType string
	// IntentID is the governance intent UUID.
	IntentID string
	// CredentialFingerprint is the hex-encoded SHA-256 of the certificate's
	// public key bytes.
	CredentialFingerprint string

	SpiffeID string
	TenantID string

	// CertSerialNumber is only added to the notarized envelope when non-zero.
	CertSerialNumber uint64

	IssuedAt  time.Time
	ExpiresAt time.Time
}
// CredentialVerification is the input to VerifyCredentialGovernance: the
// data needed to check a credential's governance provenance against the
// NotaryService.
type CredentialVerification struct {
	// IntentID comes from the governance-intent extension.
	IntentID string
	// CertificatePublicKey is the raw public key bytes from the certificate.
	CertificatePublicKey []byte
}
// VerificationResult is the outcome of a credential governance verification.
type VerificationResult struct {
	// Governed is true if the credential has valid governance provenance.
	Governed bool
	// AnchorID is the merkle anchor ID.
	AnchorID string
	// FingerprintMatch is true if the merkle leaf's fingerprint matches
	// the certificate.
	FingerprintMatch bool
	// Error carries an explanatory message when verification could not
	// establish provenance.
	Error string
}
// Client bundles the gRPC connections and generated service stubs used to
// talk to the GovernanceService and the NotaryService. Build one with
// NewClient and release it with Close.
type Client struct {
	// config is the Config that NewClient was called with.
	config Config

	// Underlying connections for the governance and notary stubs.
	govConn, notaryConn *grpc.ClientConn

	// Generated stubs bound to the connections above.
	govClient    pb.GovernanceServiceClient
	notaryClient pb.QuartermasterNotaryClient
}
// NewClient creates a governance client with gRPC connections.
//
// The returned client is lazy-connect: grpc.DialContext is non-blocking, so
// NewClient succeeds even when Quartermaster is unreachable. Connection
// attempts happen in the background with the exponential backoff configured
// in buildDialOptions. RPCs return codes.Unavailable until QM comes up; the
// caller is expected to log-and-continue rather than crash the SPIRE plugin.
//
// When cfg.NotaryAddr is empty or equal to cfg.GovernanceAddr, the notary
// stub reuses the governance ClientConn instead of dialing the same address
// a second time. A separate connection (and a separate state watcher) is
// only created when the notary lives at a different address.
//
// NewClient returns an error when GovernanceAddr is empty, when TLSRequired
// is set without all three TLS paths, when the dial options cannot be built,
// or when grpc.DialContext rejects an address.
func NewClient(cfg Config) (*Client, error) {
	if cfg.GovernanceAddr == "" {
		return nil, fmt.Errorf("governance: governance address is required")
	}
	if cfg.TLSRequired {
		if cfg.TLSCertPath == "" || cfg.TLSKeyPath == "" || cfg.TLSCAPath == "" {
			return nil, fmt.Errorf("governance: TLS is required but cert/key/ca paths are not configured")
		}
	}

	dialOpts, err := buildDialOptions(cfg)
	if err != nil {
		return nil, fmt.Errorf("governance: build dial options: %w", err)
	}

	// Non-blocking dial: returns a ClientConn in IDLE state. Connection
	// attempts are driven in the background by grpc-go's reconnect loop,
	// using the backoff config set in buildDialOptions.
	govConn, err := grpc.DialContext(context.Background(), cfg.GovernanceAddr, dialOpts...)
	if err != nil {
		return nil, fmt.Errorf("governance: dial %s: %w", cfg.GovernanceAddr, err)
	}

	// NotaryService: defaults to the governance address if not set separately.
	notaryAddr := cfg.NotaryAddr
	if notaryAddr == "" {
		notaryAddr = cfg.GovernanceAddr
	}

	// Same address and same dial options: share the governance connection
	// rather than opening a duplicate one.
	notaryConn := govConn
	if notaryAddr != cfg.GovernanceAddr {
		notaryConn, err = grpc.DialContext(context.Background(), notaryAddr, dialOpts...)
		if err != nil {
			// Best-effort cleanup; the dial error is what the caller needs.
			_ = govConn.Close()
			return nil, fmt.Errorf("governance: dial notary %s: %w", notaryAddr, err)
		}
	}

	c := &Client{
		config:       cfg,
		govConn:      govConn,
		notaryConn:   notaryConn,
		govClient:    pb.NewGovernanceServiceClient(govConn),
		notaryClient: pb.NewQuartermasterNotaryClient(notaryConn),
	}

	// Trigger background connection attempts immediately so state transitions
	// log from plugin startup rather than waiting for first RPC.
	govConn.Connect()
	// Log connection-state transitions so operators see when QM becomes
	// reachable. The watcher goroutine exits once the conn is closed.
	go watchConnState("governance", cfg.GovernanceAddr, govConn)

	if notaryConn != govConn {
		notaryConn.Connect()
		go watchConnState("notary", notaryAddr, notaryConn)
	}
	return c, nil
}

// Close shuts down all gRPC connections and returns the first error
// encountered, if any. A connection shared between the governance and
// notary stubs is closed exactly once.
func (c *Client) Close() error {
	var firstErr error
	if c.govConn != nil {
		firstErr = c.govConn.Close()
	}
	// Skip the notary conn when it aliases the governance conn: closing an
	// already-closed ClientConn would report a spurious error.
	if c.notaryConn != nil && c.notaryConn != c.govConn {
		if err := c.notaryConn.Close(); err != nil && firstErr == nil {
			firstErr = err
		}
	}
	return firstErr
}
// CreateIntent asks the GovernanceService to create a MutationIntent for a
// credential operation.
//
// The four string arguments are forwarded unchanged in the request. On RPC
// failure the error is wrapped and no result is returned; otherwise the
// response fields are copied into an IntentResult.
func (c *Client) CreateIntent(ctx context.Context, registryType, verb, artifactScope, tenantID string) (*IntentResult, error) {
	req := &pb.CreateIntentRequest{
		RegistryType:  registryType,
		Verb:          verb,
		ArtifactScope: artifactScope,
		TenantId:      tenantID,
	}

	resp, err := c.govClient.CreateIntent(ctx, req)
	if err != nil {
		return nil, fmt.Errorf("governance: CreateIntent RPC: %w", err)
	}

	out := new(IntentResult)
	out.IntentID = resp.IntentId
	out.CeremonyID = resp.CeremonyId
	out.Denied = resp.Denied
	out.Error = resp.Error
	return out, nil
}
// RedeemIntent redeems the MutationIntent identified by intentID in order
// to obtain a SAT.
//
// On RPC failure the error is wrapped and no result is returned. Otherwise
// Success, Status and Error are copied from the response; the SAT fields
// (SatHash, SatBytes, ExpiresAt) are only filled in when the response
// carries a SAT, and ExpiresAt only when that SAT carries an expiry.
func (c *Client) RedeemIntent(ctx context.Context, intentID string) (*RedeemResult, error) {
	req := &pb.RedeemIntentRequest{IntentId: intentID}

	resp, err := c.govClient.RedeemIntent(ctx, req)
	if err != nil {
		return nil, fmt.Errorf("governance: RedeemIntent RPC: %w", err)
	}

	out := &RedeemResult{
		Success: resp.Success,
		Status:  resp.Status,
		Error:   resp.Error,
	}

	sat := resp.Sat
	if sat == nil {
		return out, nil
	}

	out.SatHash = sat.SatHash
	out.SatBytes = sat.SatBytes
	if expiry := sat.ExpiresAt; expiry != nil {
		out.ExpiresAt = expiry.AsTime()
	}
	return out, nil
}
// CreateCeremony is a placeholder for creating a governance ceremony.
//
// It currently ignores all of its arguments and always returns an empty
// ceremony ID together with an error: the CeremonyService is not yet
// defined in proto. Until it is, the ceremony_id carried on the
// GovernanceService intent response serves as a proxy.
func (c *Client) CreateCeremony(ctx context.Context, ceremonyType, intentID string, requiredApprovals uint32) (string, error) {
	errNotGenerated := fmt.Errorf("governance: CreateCeremony requires CeremonyService proto (not yet generated)")
	return "", errNotGenerated
}
// SubmitMerkleLeaf sends a single merkle leaf to the NotaryService via
// CreateAnchor and returns the resulting anchor ID.
//
// clusterID is passed through as the request's cluster ID and leaf is sent
// as the only entry in the request's leaf list. On RPC failure the error is
// wrapped and the returned ID is empty.
func (c *Client) SubmitMerkleLeaf(ctx context.Context, clusterID string, leaf []byte) (string, error) {
	req := &pb.CreateAnchorRequest{
		ClusterId: clusterID,
		Leaves:    [][]byte{leaf},
	}

	resp, err := c.notaryClient.CreateAnchor(ctx, req)
	if err != nil {
		return "", fmt.Errorf("governance: SubmitMerkleLeaf RPC: %w", err)
	}

	anchorID := resp.AnchorId
	return anchorID, nil
}
// NotarizeCredentialEvent sends a credential lifecycle event to the
// governance plane for merkle anchoring.
//
// CredentialFingerprint, IntentID and EventType are mandatory; the first
// one found empty (checked in that order) produces an error and no RPC is
// made. The fingerprint binds the merkle leaf to the specific certificate
// (S-03 fix).
//
// The leaf is the SHA-256 of the domain prefix "guildhouse.credential.v1:"
// followed by the JSON-encoded envelope, and is submitted through
// SubmitMerkleLeaf with event.TenantID as the cluster ID. IssuedAt and
// ExpiresAt are not part of the envelope.
func (c *Client) NotarizeCredentialEvent(ctx context.Context, event CredentialEvent) error {
	switch {
	case event.CredentialFingerprint == "":
		return fmt.Errorf("governance: credential_fingerprint is required for notarization")
	case event.IntentID == "":
		return fmt.Errorf("governance: intent_id is required for notarization")
	case event.EventType == "":
		return fmt.Errorf("governance: event_type is required for notarization")
	}

	// MutationEnvelope payload. json.Marshal writes map keys in sorted
	// order, which keeps the encoding deterministic.
	// NOTE(review): this was described as JCS canonicalization; json.Marshal
	// additionally HTML-escapes '<', '>' and '&', so confirm the two agree
	// for every value that can appear here.
	payload := make(map[string]interface{}, 6)
	payload["credential_fingerprint"] = event.CredentialFingerprint
	payload["event_type"] = event.EventType
	payload["intent_id"] = event.IntentID
	payload["spiffe_id"] = event.SpiffeID
	payload["tenant_id"] = event.TenantID
	if event.CertSerialNumber != 0 {
		// The serial is optional and left out of the envelope when zero.
		payload["cert_serial_number"] = event.CertSerialNumber
	}

	encoded, err := json.Marshal(payload)
	if err != nil {
		return fmt.Errorf("governance: marshal envelope: %w", err)
	}

	// Domain-separated SHA-256: "guildhouse.credential.v1:" prefix.
	preimage := append([]byte("guildhouse.credential.v1:"), encoded...)
	digest := sha256.Sum256(preimage)

	if _, err := c.SubmitMerkleLeaf(ctx, event.TenantID, digest[:]); err != nil {
		return err
	}
	return nil
}
// VerifyCredentialGovernance asks the NotaryService whether a merkle leaf
// derived from the credential's intent ID and public-key fingerprint is
// included in the merkle tree.
//
// Outcomes:
//   - v.IntentID empty: returns a result with Governed false and Error
//     "no governance intent", and a nil error. No RPC is made.
//   - v.CertificatePublicKey empty, envelope encoding failure, or
//     VerifyInclusion RPC failure: returns a nil result and an error.
//   - otherwise: Governed and FingerprintMatch both mirror the notary's
//     Valid flag. AnchorID is not populated.
//
// NOTE(review): the envelope hashed here contains only
// credential_fingerprint and intent_id, whereas NotarizeCredentialEvent
// also hashes event_type, spiffe_id, tenant_id and (when non-zero)
// cert_serial_number. The two leaves therefore differ for the same
// credential; confirm how the NotaryService is expected to reconcile them.
func (c *Client) VerifyCredentialGovernance(ctx context.Context, v CredentialVerification) (*VerificationResult, error) {
	if v.IntentID == "" {
		return &VerificationResult{Governed: false, Error: "no governance intent"}, nil
	}
	if len(v.CertificatePublicKey) == 0 {
		return nil, fmt.Errorf("governance: certificate public key is required for verification")
	}

	// Compute fingerprint of the certificate's public key.
	certHash := sha256.Sum256(v.CertificatePublicKey)
	expectedFingerprint := hex.EncodeToString(certHash[:])

	// Build the verification envelope (see the NOTE above about how it
	// relates to the envelope used at notarization time).
	envelope := map[string]interface{}{
		"credential_fingerprint": expectedFingerprint,
		"intent_id":              v.IntentID,
	}
	envelopeBytes, err := json.Marshal(envelope)
	if err != nil {
		// Surface the failure instead of hashing an empty payload.
		return nil, fmt.Errorf("governance: marshal envelope: %w", err)
	}

	// Domain-separated SHA-256: "guildhouse.credential.v1:" prefix.
	h := sha256.New()
	h.Write([]byte("guildhouse.credential.v1:"))
	h.Write(envelopeBytes)
	leaf := h.Sum(nil)

	// Verify inclusion in the merkle tree via NotaryService.
	resp, err := c.notaryClient.VerifyInclusion(ctx, &pb.VerifyInclusionRequest{
		Leaf: leaf,
	})
	if err != nil {
		return nil, fmt.Errorf("governance: VerifyInclusion RPC: %w", err)
	}

	return &VerificationResult{
		Governed:         resp.Valid,
		FingerprintMatch: resp.Valid, // If inclusion is valid, the fingerprint matched.
	}, nil
}
// buildDialOptions creates gRPC dial options from the config (mTLS or insecure).
//
// All returned option sets include an exponential reconnect-backoff
// (base 1s, multiplier 1.5, jitter 0.2, max 30s, min connect timeout 20s)
// so that when Quartermaster is temporarily unreachable, grpc-go retries
// connection attempts in the background without blocking plugin startup.
//
// Transport selection:
//   - All three TLS paths set: mTLS with the loaded key pair, the CA bundle
//     as root pool, and TLS 1.3 as the minimum version.
//   - Otherwise, with cfg.TLSRequired: an error.
//   - Otherwise: insecure (plaintext) credentials. If only some of the TLS
//     paths are set, a warning is logged first, because a partially filled
//     TLS configuration is almost certainly a mistake and would otherwise
//     downgrade to plaintext without any signal.
//
// An error is also returned when the key pair or CA file cannot be loaded,
// or when the CA file contains no usable PEM certificate.
func buildDialOptions(cfg Config) ([]grpc.DialOption, error) {
	connectParams := grpc.WithConnectParams(grpc.ConnectParams{
		Backoff: backoff.Config{
			BaseDelay:  1 * time.Second,
			Multiplier: 1.5,
			Jitter:     0.2,
			MaxDelay:   30 * time.Second,
		},
		MinConnectTimeout: 20 * time.Second,
	})

	haveCert := cfg.TLSCertPath != ""
	haveKey := cfg.TLSKeyPath != ""
	haveCA := cfg.TLSCAPath != ""

	if !haveCert || !haveKey || !haveCA {
		if cfg.TLSRequired {
			return nil, fmt.Errorf("TLS is required but no certificates configured")
		}
		if haveCert || haveKey || haveCA {
			// Some, but not all, TLS paths were supplied: make the
			// plaintext fallback visible to operators.
			log.Printf("governance: TLS configuration is incomplete (cert, key and CA paths must all be set); connecting WITHOUT TLS")
		}
		return []grpc.DialOption{
			grpc.WithTransportCredentials(insecure.NewCredentials()),
			connectParams,
		}, nil
	}

	cert, err := tls.LoadX509KeyPair(cfg.TLSCertPath, cfg.TLSKeyPath)
	if err != nil {
		return nil, fmt.Errorf("load TLS keypair: %w", err)
	}
	caCert, err := os.ReadFile(cfg.TLSCAPath)
	if err != nil {
		return nil, fmt.Errorf("read CA cert: %w", err)
	}
	caPool := x509.NewCertPool()
	if !caPool.AppendCertsFromPEM(caCert) {
		return nil, fmt.Errorf("failed to append CA certificate")
	}

	tlsCfg := &tls.Config{
		Certificates: []tls.Certificate{cert},
		RootCAs:      caPool,
		MinVersion:   tls.VersionTLS13,
	}
	return []grpc.DialOption{
		grpc.WithTransportCredentials(credentials.NewTLS(tlsCfg)),
		connectParams,
	}, nil
}
// watchConnState reports gRPC connectivity transitions for conn through the
// standard logger. name and addr only label the log lines.
//
// It blocks until the ClientConn is closed (WaitForStateChange returns
// false once the conn reaches SHUTDOWN), so it is meant to run in its own
// goroutine. Every observation of Ready is logged; Idle, Connecting and
// TransientFailure are logged as "cannot reach" once, and that one-shot
// warning is re-armed each time Ready is observed.
func watchConnState(name, addr string, conn *grpc.ClientConn) {
	ctx := context.Background()
	warned := false

	for cur := conn.GetState(); ; cur = conn.GetState() {
		switch cur {
		case connectivity.Idle, connectivity.Connecting, connectivity.TransientFailure:
			if !warned {
				log.Printf("governance: %s client cannot reach %s yet; retrying in background",
					name, addr)
				warned = true
			}
		case connectivity.Ready:
			log.Printf("governance: %s client connected to %s", name, addr)
			warned = false
		}

		// Block until the state moves away from cur; false means the conn
		// was shut down and there is nothing left to watch.
		if !conn.WaitForStateChange(ctx, cur) {
			return
		}
	}
}