governance: lazy connect + exponential reconnect backoff
NewClient no longer returns an error when Quartermaster is unreachable. grpc.DialContext without WithBlock is already non-blocking; the prior 10s timeout context was effectively a no-op. Removing it and adding explicit ConnectParams (BaseDelay 1s, Multiplier 1.5, Jitter 0.2, MaxDelay 30s, MinConnectTimeout 20s) makes the intended behavior explicit: the gRPC ClientConn retries connection in the background with exponential backoff, and RPCs return Unavailable until QM is up. The governance-notifier and substrate-keymanager plugins already log RPC errors via handleEvent and continue without aborting the SPIRE operation, so no call-site changes are needed. This unblocks SPIRE bootstrap when Quartermaster hasn't been deployed yet, breaking the SPIRE <-> QM circular deployment dependency. Added watchConnState helper that logs once per transition so operators see at SPIRE startup whether QM is reachable: a single WARN-style line when the connection is not yet Ready, and an INFO line when it becomes Ready. conn.Connect() is called eagerly so those logs fire at plugin load rather than waiting for the first RPC. Deferred: - Add a unit test for NewClient succeeding with an unreachable address (existing TestNewClientAcceptsTLSConfig is a pre-existing failure using placeholder cert paths; unrelated to this change). Signed-off-by: Tyler J King <tking@guildhouse.dev>
This commit is contained in:
parent
f0268305ae
commit
83b1264ebc
1 changed file with 77 additions and 10 deletions
|
|
@ -10,11 +10,14 @@ import (
|
|||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
pb "github.com/guildhouse-cooperative/guildhouse-spire-plugins/gen/quartermaster/v1"
|
||||
"google.golang.org/grpc"
|
||||
"google.golang.org/grpc/backoff"
|
||||
"google.golang.org/grpc/connectivity"
|
||||
"google.golang.org/grpc/credentials"
|
||||
"google.golang.org/grpc/credentials/insecure"
|
||||
)
|
||||
|
|
@ -101,6 +104,12 @@ type Client struct {
|
|||
}
|
||||
|
||||
// NewClient creates a governance client with gRPC connections.
|
||||
//
|
||||
// The returned client is lazy-connect: grpc.DialContext is non-blocking, so
|
||||
// NewClient succeeds even when Quartermaster is unreachable. Connection
|
||||
// attempts happen in the background with the exponential backoff configured
|
||||
// in buildDialOptions. RPCs return codes.Unavailable until QM comes up; the
|
||||
// caller is expected to log-and-continue rather than crash the SPIRE plugin.
|
||||
func NewClient(cfg Config) (*Client, error) {
|
||||
if cfg.GovernanceAddr == "" {
|
||||
return nil, fmt.Errorf("governance: governance address is required")
|
||||
|
|
@ -116,13 +125,12 @@ func NewClient(cfg Config) (*Client, error) {
|
|||
return nil, fmt.Errorf("governance: build dial options: %w", err)
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
// Connect to GovernanceService.
|
||||
govConn, err := grpc.DialContext(ctx, cfg.GovernanceAddr, dialOpts...)
|
||||
// Non-blocking dial: returns without waiting for a connection. Connection
|
||||
// attempts are driven in the background by grpc-go's reconnect loop,
|
||||
// using the backoff config set in buildDialOptions.
|
||||
govConn, err := grpc.DialContext(context.Background(), cfg.GovernanceAddr, dialOpts...)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("governance: connect to %s: %w", cfg.GovernanceAddr, err)
|
||||
return nil, fmt.Errorf("governance: dial %s: %w", cfg.GovernanceAddr, err)
|
||||
}
|
||||
|
||||
c := &Client{
|
||||
|
|
@ -131,19 +139,34 @@ func NewClient(cfg Config) (*Client, error) {
|
|||
govClient: pb.NewGovernanceServiceClient(govConn),
|
||||
}
|
||||
|
||||
// Connect to NotaryService if address provided.
|
||||
// NotaryService: defaults to the governance address if not set separately.
|
||||
notaryAddr := cfg.NotaryAddr
|
||||
if notaryAddr == "" {
|
||||
notaryAddr = cfg.GovernanceAddr // Same host by default.
|
||||
notaryAddr = cfg.GovernanceAddr
|
||||
}
|
||||
notaryConn, err := grpc.DialContext(ctx, notaryAddr, dialOpts...)
|
||||
notaryConn, err := grpc.DialContext(context.Background(), notaryAddr, dialOpts...)
|
||||
if err != nil {
|
||||
govConn.Close()
|
||||
return nil, fmt.Errorf("governance: connect to notary %s: %w", notaryAddr, err)
|
||||
return nil, fmt.Errorf("governance: dial notary %s: %w", notaryAddr, err)
|
||||
}
|
||||
c.notaryConn = notaryConn
|
||||
c.notaryClient = pb.NewQuartermasterNotaryClient(notaryConn)
|
||||
|
||||
// Trigger background connection attempts immediately so state transitions
|
||||
// log from plugin startup rather than waiting for first RPC.
|
||||
govConn.Connect()
|
||||
if notaryConn != govConn {
|
||||
notaryConn.Connect()
|
||||
}
|
||||
|
||||
// Log connection-state transitions so operators see when QM becomes
|
||||
// reachable. One WARN-style line on first unreachable observation,
|
||||
// one INFO on Ready.
|
||||
go watchConnState("governance", cfg.GovernanceAddr, govConn)
|
||||
if notaryConn != govConn {
|
||||
go watchConnState("notary", notaryAddr, notaryConn)
|
||||
}
|
||||
|
||||
return c, nil
|
||||
}
|
||||
|
||||
|
|
@ -310,7 +333,22 @@ func (c *Client) VerifyCredentialGovernance(ctx context.Context, v CredentialVer
|
|||
}
|
||||
|
||||
// buildDialOptions creates gRPC dial options from the config (mTLS or insecure).
|
||||
//
|
||||
// All returned option sets include an exponential reconnect-backoff so that
|
||||
// when Quartermaster is temporarily unreachable, grpc-go retries connection
|
||||
// attempts in the background without blocking plugin startup or spawning
|
||||
// per-RPC retry goroutines.
|
||||
func buildDialOptions(cfg Config) ([]grpc.DialOption, error) {
|
||||
connectParams := grpc.WithConnectParams(grpc.ConnectParams{
|
||||
Backoff: backoff.Config{
|
||||
BaseDelay: 1 * time.Second,
|
||||
Multiplier: 1.5,
|
||||
Jitter: 0.2,
|
||||
MaxDelay: 30 * time.Second,
|
||||
},
|
||||
MinConnectTimeout: 20 * time.Second,
|
||||
})
|
||||
|
||||
if cfg.TLSCertPath != "" && cfg.TLSKeyPath != "" && cfg.TLSCAPath != "" {
|
||||
cert, err := tls.LoadX509KeyPair(cfg.TLSCertPath, cfg.TLSKeyPath)
|
||||
if err != nil {
|
||||
|
|
@ -331,6 +369,7 @@ func buildDialOptions(cfg Config) ([]grpc.DialOption, error) {
|
|||
}
|
||||
return []grpc.DialOption{
|
||||
grpc.WithTransportCredentials(credentials.NewTLS(tlsCfg)),
|
||||
connectParams,
|
||||
}, nil
|
||||
}
|
||||
|
||||
|
|
@ -340,5 +379,33 @@ func buildDialOptions(cfg Config) ([]grpc.DialOption, error) {
|
|||
|
||||
return []grpc.DialOption{
|
||||
grpc.WithTransportCredentials(insecure.NewCredentials()),
|
||||
connectParams,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// watchConnState logs gRPC connection-state transitions for a ClientConn.
|
||||
// Runs until the ClientConn is closed (WaitForStateChange returns false
|
||||
// when the conn reaches SHUTDOWN). Logs WARN-style once when the conn is
|
||||
// not yet reachable and INFO when it becomes Ready, so operators see at
|
||||
// startup whether Quartermaster is responding.
|
||||
func watchConnState(name, addr string, conn *grpc.ClientConn) {
|
||||
loggedUnreachable := false
|
||||
state := conn.GetState()
|
||||
for {
|
||||
switch state {
|
||||
case connectivity.Ready:
|
||||
log.Printf("governance: %s client connected to %s", name, addr)
|
||||
loggedUnreachable = false
|
||||
case connectivity.TransientFailure, connectivity.Idle, connectivity.Connecting:
|
||||
if !loggedUnreachable {
|
||||
log.Printf("governance: %s client cannot reach %s yet; retrying in background",
|
||||
name, addr)
|
||||
loggedUnreachable = true
|
||||
}
|
||||
}
|
||||
if !conn.WaitForStateChange(context.Background(), state) {
|
||||
return // conn shut down
|
||||
}
|
||||
state = conn.GetState()
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue