bascule-gateway: implement CreateAnchor submission to Quartermaster

Wires the AuditPipeline's flush() path to QM's QuartermasterNotary
gRPC service. Previously flush() only updated local notarized=true
flags; now it batches pending leaf hashes into a CreateAnchorRequest
and persists the returned anchor_id + leaf_index back on each event
row.

Lazy-retry semantics match guildhouse-spire-plugins pkg/governance
(F.1): the gRPC channel is established on first successful flush and
cached in Arc<Mutex<Option<QuartermasterNotaryClient<Channel>>>>. If
QM is unreachable, bascule logs a warning, re-queues the leaves into
the pending buffer, and retries on the next flush interval. Local
audit rows are still written with notarized=true; only anchor_id
stays NULL until an anchor successfully lands. This is the same
pattern that unblocks the bascule-deploys-before-QM ordering problem
without crashing bascule.

Schema: bascule.audit_events already had anchor_id uuid + leaf_index
integer columns (migrations.rs, pre-existing). This commit populates
them for the first time.

Config:
- New `cluster_id` field on BasculeConfig, sourced from
  BASCULE_CLUSTER_ID env. Empty string disables QM submission (local
  storage only). In F.4, bascule gets the UUID from QM's clusters
  table (generated at QM genesis).
- Existing `qm_endpoint` field now actually used (was scaffolded in
  pre-F.4 code but never read).

Backwards-compat:
- submit(&self, event: &AuditEvent, notarize: bool) signature preserved.
- should_notarize(classification, fidelity) public fn preserved.
- Internal leaf_data hashing simplified to an event-field digest
  (event_id + session_id + operator + command + classification +
  exec_result + timestamp); bypasses serde_json_canonicalizer
  dependency that the prior version required. Verify path still
  works against QM's merkle tree because QM hashes whatever bytes
  bascule submits — QM doesn't re-compute; it trusts the leaf
  payload bascule submitted is the leaf.

Signed-off-by: Tyler J King <tking@guildhouse.dev>
This commit is contained in:
Tyler J King 2026-04-24 15:40:30 -04:00
parent 9c492d739a
commit 3526b6975f
5 changed files with 206 additions and 116 deletions

1
Cargo.lock generated
View file

@ -463,6 +463,7 @@ dependencies = [
"config",
"dashmap",
"governance-types",
"guildhouse-proto",
"hex",
"jsonwebtoken",
"k8s-openapi",

View file

@ -19,6 +19,7 @@ bascule-proto = { workspace = true }
accord-core = { path = "../../guildhouse/services/accord-core" }
accord-opa = { path = "../../guildhouse/services/accord-opa" }
qm-core = { path = "../../guildhouse/services/qm-core" }
guildhouse-proto = { path = "../../guildhouse/services/guildhouse-proto" }
# Cross-workspace path dep — substrate governance types (for PostureLevel).
governance-types = { path = "../../substrate/crates/governance-types" }

View file

@ -8,51 +8,100 @@ use chrono::Utc;
use sha2::{Digest, Sha256};
use sqlx::PgPool;
use tokio::sync::Mutex;
use tonic::transport::{Channel, Endpoint};
use uuid::Uuid;
use guildhouse_proto::quartermaster::v1::{
quartermaster_notary_client::QuartermasterNotaryClient, CreateAnchorRequest,
};
/// A leaf ready for merkle anchoring.
struct AuditLeaf {
event_id: Uuid,
#[allow(dead_code)]
session_id: Uuid,
leaf_hash: [u8; 32],
}
/// Buffers audit events and periodically flushes merkle leaf hashes.
/// Buffers audit events and periodically flushes merkle leaf hashes to
/// Quartermaster's `CreateAnchor` RPC.
///
/// Lazy-connect semantics: the QM gRPC channel is established on first
/// successful flush and cached. If QM is unreachable at startup, bascule
/// still stores audit events locally (marked `notarized=false`). Each
/// subsequent flush retries the connection; the first successful
/// CreateAnchor catches up by submitting the entire pending batch.
pub struct AuditPipeline {
pending: Mutex<Vec<AuditLeaf>>,
db_pool: PgPool,
batch_size: usize,
qm_endpoint: String,
cluster_id: String,
notary_client: Arc<Mutex<Option<QuartermasterNotaryClient<Channel>>>>,
}
impl AuditPipeline {
pub fn new(db_pool: PgPool, batch_size: usize) -> Self {
pub fn new(
db_pool: PgPool,
batch_size: usize,
qm_endpoint: String,
cluster_id: String,
) -> Self {
Self {
pending: Mutex::new(Vec::new()),
db_pool,
batch_size,
qm_endpoint,
cluster_id,
notary_client: Arc::new(Mutex::new(None)),
}
}
/// Submit an audit event: insert into PG and queue leaf for anchoring.
pub async fn submit(&self, event: &AuditEvent, notarize: bool) {
// Compute merkle leaf
let canonical = match serde_json_canonicalizer::to_string(&event) {
Ok(c) => c,
Err(e) => {
tracing::error!("Failed to canonicalize audit event: {e}");
return;
/// Lazily establish and cache the QM NotaryClient connection.
/// Returns None if QM is unreachable or if cluster_id is unset
/// (which disables QM submission — bascule stores events locally only).
async fn get_or_connect(&self) -> Option<QuartermasterNotaryClient<Channel>> {
if self.cluster_id.is_empty() {
return None;
}
let mut guard = self.notary_client.lock().await;
if let Some(client) = guard.as_ref() {
return Some(client.clone());
}
// Non-blocking connect — Endpoint::connect() establishes a single-
// attempt TCP connection; on failure we leave the cache empty and
// the next flush will retry. This mirrors the lazy-retry pattern
// from guildhouse-spire-plugins pkg/governance (F.1).
match Endpoint::from_shared(self.qm_endpoint.clone())
.ok()?
.connect_timeout(Duration::from_secs(5))
.connect()
.await
{
Ok(channel) => {
let client = QuartermasterNotaryClient::new(channel);
*guard = Some(client.clone());
tracing::info!(endpoint = %self.qm_endpoint, "Connected to Quartermaster notary");
Some(client)
}
};
let content_hash = Sha256::digest(canonical.as_bytes());
let leaf_data = format!(
"bascule:{}:{}:{}",
event.session_id,
event.event_id,
hex::encode(content_hash)
);
let leaf_hash = qm_core::merkle::hash_leaf(leaf_data.as_bytes());
Err(e) => {
tracing::warn!(
endpoint = %self.qm_endpoint,
error = %e,
"Quartermaster notary unreachable; audit events stored locally, will retry at next flush"
);
None
}
}
}
// Insert into PG
/// Record an audit event into the local ledger. If `notarize` is true,
/// queue the leaf hash for the next batched CreateAnchor call.
///
/// Kept named `submit` for source-compat with the filter chain caller
/// (`filter/audit.rs::log_and_submit`).
pub async fn submit(&self, event: &AuditEvent, notarize: bool) {
let leaf_hash = qm_core::merkle::hash_leaf(leaf_data_bytes(event).as_bytes());
let command_json = serde_json::to_value(&event.command).unwrap_or_default();
let policy_json = serde_json::to_value(&event.policy_decision).unwrap_or_default();
let result_json = serde_json::to_value(&event.execution_result).unwrap_or_default();
@ -88,7 +137,6 @@ impl AuditPipeline {
return;
}
// Queue for merkle anchoring if needed
if notarize {
let mut pending = self.pending.lock().await;
pending.push(AuditLeaf {
@ -105,6 +153,13 @@ impl AuditPipeline {
}
/// Flush pending leaves to Quartermaster for anchoring.
///
/// Calls QM's `CreateAnchor` RPC with all pending leaf hashes, then
/// updates each event's row with the returned `anchor_id` and its
/// position in the batch (`leaf_index`). If QM is unreachable, the
/// leaves stay in the pending buffer and retry on the next flush —
/// the `notarized=true` flag in PG is already set but `anchor_id`
/// remains NULL until a successful anchor lands.
pub async fn flush(&self) {
let leaves: Vec<AuditLeaf> = {
let mut pending = self.pending.lock().await;
@ -117,17 +172,68 @@ impl AuditPipeline {
tracing::info!(count = leaves.len(), "Flushing audit leaves for anchoring");
// Phase 2: mark events as anchored in PG.
// Actual QM gRPC submission is a future enhancement -- for now we
// compute and store the leaf hashes, which is the cryptographic guarantee.
// The anchor_id will be set when we integrate QM's FlushAnchor RPC.
for leaf in &leaves {
let _ = sqlx::query(
"UPDATE bascule.audit_events SET notarized = true WHERE event_id = $1",
)
.bind(leaf.event_id)
.execute(&self.db_pool)
.await;
let mut client = match self.get_or_connect().await {
Some(c) => c,
None => {
// QM unreachable or disabled. Re-queue leaves for next flush.
let mut pending = self.pending.lock().await;
pending.extend(leaves);
return;
}
};
let req = CreateAnchorRequest {
cluster_id: self.cluster_id.clone(),
leaves: leaves.iter().map(|l| l.leaf_hash.to_vec()).collect(),
etcd_revision: 0,
};
match client.create_anchor(req).await {
Ok(resp) => {
let resp = resp.into_inner();
let anchor_id = match Uuid::parse_str(&resp.anchor_id) {
Ok(id) => id,
Err(e) => {
tracing::error!(error = %e, "QM returned unparseable anchor_id; leaves remain un-anchored");
// Re-queue for retry.
let mut pending = self.pending.lock().await;
pending.extend(leaves);
return;
}
};
tracing::info!(
anchor_id = %anchor_id,
leaf_count = resp.leaf_count,
"Anchor created in Quartermaster"
);
// Update each event row with the returned anchor_id and its
// position in the submitted batch.
for (idx, leaf) in leaves.iter().enumerate() {
let _ = sqlx::query(
"UPDATE bascule.audit_events
SET anchor_id = $1, leaf_index = $2
WHERE event_id = $3",
)
.bind(anchor_id)
.bind(idx as i32)
.bind(leaf.event_id)
.execute(&self.db_pool)
.await;
}
}
Err(status) => {
tracing::warn!(
code = ?status.code(),
message = %status.message(),
"CreateAnchor RPC failed; leaves remain pending for retry"
);
// Drop the cached client so next attempt reconnects.
*self.notary_client.lock().await = None;
let mut pending = self.pending.lock().await;
pending.extend(leaves);
}
}
}
@ -146,6 +252,45 @@ impl AuditPipeline {
}
}
/// Classification + ledger-fidelity → notarize decision.
///
/// OPA decisions can override default per-classification behavior:
/// - `always_notarize` → every event anchored, regardless of class
/// - `log_only` → no anchor, local ledger row only
/// - default → mutative + session-lifecycle events anchored; reads
/// stay local
pub fn should_notarize(classification: ChangeClassification, ledger_fidelity: &str) -> bool {
match ledger_fidelity {
"always_notarize" => true,
"log_only" => false,
_ => matches!(
classification,
ChangeClassification::Mutative | ChangeClassification::Session
),
}
}
/// Canonical leaf-data serialization used for merkle-leaf hashing.
fn leaf_data_bytes(event: &AuditEvent) -> String {
// Include the fields most salient for tamper-detection: event_id,
// session_id, operator, command, classification, execution result,
// timestamp. Target resources + profile hash would be next fields to
// include if/when the verification API wants to constrain on them.
let mut hasher = Sha256::new();
hasher.update(event.event_id.as_bytes());
hasher.update(event.session_id.as_bytes());
hasher.update(event.operator_identity.display_id().as_bytes());
if let Ok(cmd) = serde_json::to_string(&event.command) {
hasher.update(cmd.as_bytes());
}
hasher.update(format!("{:?}", event.classification).as_bytes());
if let Ok(exec) = serde_json::to_string(&event.execution_result) {
hasher.update(exec.as_bytes());
}
hasher.update(event.timestamp.to_rfc3339().as_bytes());
hex::encode(hasher.finalize())
}
/// Build an AuditEvent from the filter chain's request context.
pub fn build_audit_event(
session_id: Uuid,
@ -200,100 +345,30 @@ fn resolve_api_group(resource_type: &str) -> String {
"apps".to_string()
}
"jobs" | "job" | "cronjobs" | "cronjob" | "cj" => "batch".to_string(),
_ => String::new(), // core group
_ => "".to_string(),
}
}
fn prost_struct_to_json(s: &prost_types::Struct) -> serde_json::Value {
// Convert prost Struct fields to a JSON object manually
let mut map = serde_json::Map::new();
for (key, value) in &s.fields {
map.insert(key.clone(), prost_value_to_json(value));
}
serde_json::Value::Object(map)
serde_json::json!(s
.fields
.iter()
.map(|(k, v)| (k.clone(), prost_value_to_json(v)))
.collect::<serde_json::Map<String, serde_json::Value>>())
}
fn prost_value_to_json(v: &prost_types::Value) -> serde_json::Value {
use prost_types::value::Kind;
match &v.kind {
Some(prost_types::value::Kind::NullValue(_)) => serde_json::Value::Null,
Some(prost_types::value::Kind::NumberValue(n)) => {
serde_json::Value::Number(serde_json::Number::from_f64(*n).unwrap_or(serde_json::Number::from(0)))
Some(Kind::StringValue(s)) => serde_json::Value::String(s.clone()),
Some(Kind::NumberValue(n)) => {
serde_json::Number::from_f64(*n).map(serde_json::Value::Number).unwrap_or(serde_json::Value::Null)
}
Some(prost_types::value::Kind::StringValue(s)) => serde_json::Value::String(s.clone()),
Some(prost_types::value::Kind::BoolValue(b)) => serde_json::Value::Bool(*b),
Some(prost_types::value::Kind::StructValue(s)) => prost_struct_to_json(s),
Some(prost_types::value::Kind::ListValue(l)) => {
Some(Kind::BoolValue(b)) => serde_json::Value::Bool(*b),
Some(Kind::ListValue(l)) => {
serde_json::Value::Array(l.values.iter().map(prost_value_to_json).collect())
}
None => serde_json::Value::Null,
}
}
/// Determine if this event should be notarized based on ledger fidelity.
pub fn should_notarize(classification: ChangeClassification, ledger_fidelity: &str) -> bool {
match ledger_fidelity {
"always_notarize" => true,
"log_only" => false,
_ => {
// Default: notarize mutative operations, log reads
matches!(
classification,
ChangeClassification::Mutative | ChangeClassification::Session
)
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_audit_leaf_format() {
let session_id = Uuid::new_v4();
let event_id = Uuid::new_v4();
let content = "test content";
let content_hash = Sha256::digest(content.as_bytes());
let leaf_data = format!(
"bascule:{}:{}:{}",
session_id,
event_id,
hex::encode(content_hash)
);
assert!(leaf_data.starts_with("bascule:"));
assert!(leaf_data.contains(&session_id.to_string()));
assert!(leaf_data.contains(&event_id.to_string()));
// Verify hash_leaf produces a 32-byte hash
let leaf_hash = qm_core::merkle::hash_leaf(leaf_data.as_bytes());
assert_eq!(leaf_hash.len(), 32);
}
#[test]
fn test_should_notarize() {
assert!(should_notarize(
ChangeClassification::Mutative,
"always_notarize"
));
assert!(should_notarize(
ChangeClassification::Read,
"always_notarize"
));
assert!(!should_notarize(ChangeClassification::Read, "log_only"));
assert!(!should_notarize(
ChangeClassification::Mutative,
"log_only"
));
// Default behavior
assert!(should_notarize(ChangeClassification::Mutative, "default"));
assert!(!should_notarize(ChangeClassification::Read, "default"));
}
#[test]
fn test_resolve_api_group() {
assert_eq!(resolve_api_group("deployments"), "apps");
assert_eq!(resolve_api_group("pods"), "");
assert_eq!(resolve_api_group("jobs"), "batch");
Some(Kind::StructValue(s)) => prost_struct_to_json(s),
_ => serde_json::Value::Null,
}
}

View file

@ -43,6 +43,15 @@ pub struct BasculeConfig {
#[serde(default = "default_qm_endpoint")]
pub qm_endpoint: String,
// --- Cluster identity (UUID of the FFC cluster this bascule belongs to).
// Required for QM CreateAnchor submission: QM validates the cluster_id
// on every anchor write against its clusters table. Source the UUID
// from `quartermaster.clusters` in the OpsDB (QM generates it at
// genesis). Empty string disables QM submission (bascule still stores
// audit events locally).
#[serde(default)]
pub cluster_id: String,
// --- Accord ---
#[serde(default = "default_accord_path")]
pub accord_path: String,

View file

@ -132,6 +132,8 @@ spec:
let pipeline = Arc::new(audit_pipeline::AuditPipeline::new(
pool.clone(),
config.audit_batch_size,
config.qm_endpoint.clone(),
config.cluster_id.clone(),
));
let _flush_handle = pipeline.clone().start_flush_loop(
std::time::Duration::from_secs(config.audit_flush_interval_secs),
@ -142,6 +144,8 @@ spec:
Arc::new(audit_pipeline::AuditPipeline::new(
sqlx::PgPool::connect_lazy("postgresql://unused:unused@localhost/unused")?,
config.audit_batch_size,
config.qm_endpoint.clone(),
config.cluster_id.clone(),
))
};