diff --git a/pkg/governance/governance.go b/pkg/governance/governance.go index 776c8f2..a917a86 100644 --- a/pkg/governance/governance.go +++ b/pkg/governance/governance.go @@ -10,11 +10,14 @@ import ( "encoding/hex" "encoding/json" "fmt" + "log" "os" "time" pb "github.com/guildhouse-cooperative/guildhouse-spire-plugins/gen/quartermaster/v1" "google.golang.org/grpc" + "google.golang.org/grpc/backoff" + "google.golang.org/grpc/connectivity" "google.golang.org/grpc/credentials" "google.golang.org/grpc/credentials/insecure" ) @@ -101,6 +104,12 @@ type Client struct { } // NewClient creates a governance client with gRPC connections. +// +// The returned client is lazy-connect: grpc.DialContext is non-blocking, so +// NewClient succeeds even when Quartermaster is unreachable. Connection +// attempts happen in the background with the exponential backoff configured +// in buildDialOptions. RPCs return codes.Unavailable until QM comes up; the +// caller is expected to log-and-continue rather than crash the SPIRE plugin. func NewClient(cfg Config) (*Client, error) { if cfg.GovernanceAddr == "" { return nil, fmt.Errorf("governance: governance address is required") @@ -116,13 +125,12 @@ func NewClient(cfg Config) (*Client, error) { return nil, fmt.Errorf("governance: build dial options: %w", err) } - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - // Connect to GovernanceService. - govConn, err := grpc.DialContext(ctx, cfg.GovernanceAddr, dialOpts...) + // Non-blocking dial: returns a ClientConn in IDLE state. Connection + // attempts are driven in the background by grpc-go's reconnect loop, + // using the backoff config set in buildDialOptions. + govConn, err := grpc.DialContext(context.Background(), cfg.GovernanceAddr, dialOpts...) if err != nil { - return nil, fmt.Errorf("governance: connect to %s: %w", cfg.GovernanceAddr, err) + return nil, fmt.Errorf("governance: dial %s: %w", cfg.GovernanceAddr, err) } c := &Client{ @@ -131,19 +139,34 @@ func NewClient(cfg Config) (*Client, error) { govClient: pb.NewGovernanceServiceClient(govConn), } - // Connect to NotaryService if address provided. + // NotaryService: defaults to the governance address if not set separately. notaryAddr := cfg.NotaryAddr if notaryAddr == "" { - notaryAddr = cfg.GovernanceAddr // Same host by default. + notaryAddr = cfg.GovernanceAddr } - notaryConn, err := grpc.DialContext(ctx, notaryAddr, dialOpts...) + notaryConn, err := grpc.DialContext(context.Background(), notaryAddr, dialOpts...) if err != nil { govConn.Close() - return nil, fmt.Errorf("governance: connect to notary %s: %w", notaryAddr, err) + return nil, fmt.Errorf("governance: dial notary %s: %w", notaryAddr, err) } c.notaryConn = notaryConn c.notaryClient = pb.NewQuartermasterNotaryClient(notaryConn) + // Trigger background connection attempts immediately so state transitions + // log from plugin startup rather than waiting for first RPC. + govConn.Connect() + if notaryConn != govConn { + notaryConn.Connect() + } + + // Log connection-state transitions so operators see when QM becomes + // reachable. One WARN-style line on first unreachable observation, + // one INFO on Ready. + go watchConnState("governance", cfg.GovernanceAddr, govConn) + if notaryConn != govConn { + go watchConnState("notary", notaryAddr, notaryConn) + } + return c, nil } @@ -310,7 +333,22 @@ func (c *Client) VerifyCredentialGovernance(ctx context.Context, v CredentialVer } // buildDialOptions creates gRPC dial options from the config (mTLS or insecure). +// +// All returned option sets include an exponential reconnect-backoff so that +// when Quartermaster is temporarily unreachable, grpc-go retries connection +// attempts in the background without blocking plugin startup or spawning +// per-RPC retry goroutines. func buildDialOptions(cfg Config) ([]grpc.DialOption, error) { + connectParams := grpc.WithConnectParams(grpc.ConnectParams{ + Backoff: backoff.Config{ + BaseDelay: 1 * time.Second, + Multiplier: 1.5, + Jitter: 0.2, + MaxDelay: 30 * time.Second, + }, + MinConnectTimeout: 20 * time.Second, + }) + if cfg.TLSCertPath != "" && cfg.TLSKeyPath != "" && cfg.TLSCAPath != "" { cert, err := tls.LoadX509KeyPair(cfg.TLSCertPath, cfg.TLSKeyPath) if err != nil { @@ -331,6 +369,7 @@ func buildDialOptions(cfg Config) ([]grpc.DialOption, error) { } return []grpc.DialOption{ grpc.WithTransportCredentials(credentials.NewTLS(tlsCfg)), + connectParams, }, nil } @@ -340,5 +379,33 @@ func buildDialOptions(cfg Config) ([]grpc.DialOption, error) { return []grpc.DialOption{ grpc.WithTransportCredentials(insecure.NewCredentials()), + connectParams, }, nil } + +// watchConnState logs gRPC connection-state transitions for a ClientConn. +// Runs until the ClientConn is closed (WaitForStateChange returns false +// when the conn reaches SHUTDOWN). Logs WARN-style once when the conn is +// not yet reachable and INFO when it becomes Ready, so operators see at +// startup whether Quartermaster is responding. +func watchConnState(name, addr string, conn *grpc.ClientConn) { + loggedUnreachable := false + state := conn.GetState() + for { + switch state { + case connectivity.Ready: + log.Printf("governance: %s client connected to %s", name, addr) + loggedUnreachable = false + case connectivity.TransientFailure, connectivity.Idle, connectivity.Connecting: + if !loggedUnreachable { + log.Printf("governance: %s client cannot reach %s yet; retrying in background", + name, addr) + loggedUnreachable = true + } + } + if !conn.WaitForStateChange(context.Background(), state) { + return // conn shut down + } + state = conn.GetState() + } +}