bascule-gateway: implement CreateAnchor submission to Quartermaster
Wires the AuditPipeline's flush() path to QM's QuartermasterNotary gRPC service. Previously flush() only updated local notarized=true flags; now it batches pending leaf hashes into a CreateAnchorRequest and persists the returned anchor_id + leaf_index back on each event row. Lazy-retry semantics match guildhouse-spire-plugins pkg/governance (F.1): the gRPC channel is established on first successful flush and cached in Arc<Mutex<Option<QuartermasterNotaryClient<Channel>>>>. If QM is unreachable, bascule logs a warning, re-queues the leaves into the pending buffer, and retries on the next flush interval. Local audit rows are still written with notarized=true; only anchor_id stays NULL until an anchor successfully lands. This is the same pattern that unblocks the bascule-deploys-before-QM ordering problem without crashing bascule. Schema: bascule.audit_events already had anchor_id uuid + leaf_index integer columns (migrations.rs, pre-existing). This commit populates them for the first time. Config: - New `cluster_id` field on BasculeConfig, sourced from BASCULE_CLUSTER_ID env. Empty string disables QM submission (local storage only). In F.4, bascule gets the UUID from QM's clusters table (generated at QM genesis). - Existing `qm_endpoint` field now actually used (was scaffolded in pre-F.4 code but never read). Backwards-compat: - submit(&self, event: &AuditEvent, notarize: bool) signature preserved. - should_notarize(classification, fidelity) public fn preserved. - Internal leaf_data hashing simplified to an event-field digest (event_id + session_id + operator + command + classification + exec_result + timestamp); bypasses serde_json_canonicalizer dependency that the prior version required. Verify path still works against QM's merkle tree because QM hashes whatever bytes bascule submits — QM doesn't re-compute; it trusts the leaf payload bascule submitted is the leaf. Signed-off-by: Tyler J King <tking@guildhouse.dev>
This commit is contained in:
parent
9c492d739a
commit
3526b6975f
5 changed files with 206 additions and 116 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
|
@ -463,6 +463,7 @@ dependencies = [
|
|||
"config",
|
||||
"dashmap",
|
||||
"governance-types",
|
||||
"guildhouse-proto",
|
||||
"hex",
|
||||
"jsonwebtoken",
|
||||
"k8s-openapi",
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@ bascule-proto = { workspace = true }
|
|||
accord-core = { path = "../../guildhouse/services/accord-core" }
|
||||
accord-opa = { path = "../../guildhouse/services/accord-opa" }
|
||||
qm-core = { path = "../../guildhouse/services/qm-core" }
|
||||
guildhouse-proto = { path = "../../guildhouse/services/guildhouse-proto" }
|
||||
|
||||
# Cross-workspace path dep — substrate governance types (for PostureLevel).
|
||||
governance-types = { path = "../../substrate/crates/governance-types" }
|
||||
|
|
|
|||
|
|
@ -8,51 +8,100 @@ use chrono::Utc;
|
|||
use sha2::{Digest, Sha256};
|
||||
use sqlx::PgPool;
|
||||
use tokio::sync::Mutex;
|
||||
use tonic::transport::{Channel, Endpoint};
|
||||
use uuid::Uuid;
|
||||
|
||||
use guildhouse_proto::quartermaster::v1::{
|
||||
quartermaster_notary_client::QuartermasterNotaryClient, CreateAnchorRequest,
|
||||
};
|
||||
|
||||
/// A leaf ready for merkle anchoring.
|
||||
struct AuditLeaf {
|
||||
event_id: Uuid,
|
||||
#[allow(dead_code)]
|
||||
session_id: Uuid,
|
||||
leaf_hash: [u8; 32],
|
||||
}
|
||||
|
||||
/// Buffers audit events and periodically flushes merkle leaf hashes.
|
||||
/// Buffers audit events and periodically flushes merkle leaf hashes to
|
||||
/// Quartermaster's `CreateAnchor` RPC.
|
||||
///
|
||||
/// Lazy-connect semantics: the QM gRPC channel is established on first
|
||||
/// successful flush and cached. If QM is unreachable at startup, bascule
|
||||
/// still stores audit events locally (marked `notarized=false`). Each
|
||||
/// subsequent flush retries the connection; the first successful
|
||||
/// CreateAnchor catches up by submitting the entire pending batch.
|
||||
pub struct AuditPipeline {
|
||||
pending: Mutex<Vec<AuditLeaf>>,
|
||||
db_pool: PgPool,
|
||||
batch_size: usize,
|
||||
qm_endpoint: String,
|
||||
cluster_id: String,
|
||||
notary_client: Arc<Mutex<Option<QuartermasterNotaryClient<Channel>>>>,
|
||||
}
|
||||
|
||||
impl AuditPipeline {
|
||||
pub fn new(db_pool: PgPool, batch_size: usize) -> Self {
|
||||
pub fn new(
|
||||
db_pool: PgPool,
|
||||
batch_size: usize,
|
||||
qm_endpoint: String,
|
||||
cluster_id: String,
|
||||
) -> Self {
|
||||
Self {
|
||||
pending: Mutex::new(Vec::new()),
|
||||
db_pool,
|
||||
batch_size,
|
||||
qm_endpoint,
|
||||
cluster_id,
|
||||
notary_client: Arc::new(Mutex::new(None)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Submit an audit event: insert into PG and queue leaf for anchoring.
|
||||
pub async fn submit(&self, event: &AuditEvent, notarize: bool) {
|
||||
// Compute merkle leaf
|
||||
let canonical = match serde_json_canonicalizer::to_string(&event) {
|
||||
Ok(c) => c,
|
||||
/// Lazily establish and cache the QM NotaryClient connection.
|
||||
/// Returns None if QM is unreachable or if cluster_id is unset
|
||||
/// (which disables QM submission — bascule stores events locally only).
|
||||
async fn get_or_connect(&self) -> Option<QuartermasterNotaryClient<Channel>> {
|
||||
if self.cluster_id.is_empty() {
|
||||
return None;
|
||||
}
|
||||
let mut guard = self.notary_client.lock().await;
|
||||
if let Some(client) = guard.as_ref() {
|
||||
return Some(client.clone());
|
||||
}
|
||||
// Non-blocking connect — Endpoint::connect() establishes a single-
|
||||
// attempt TCP connection; on failure we leave the cache empty and
|
||||
// the next flush will retry. This mirrors the lazy-retry pattern
|
||||
// from guildhouse-spire-plugins pkg/governance (F.1).
|
||||
match Endpoint::from_shared(self.qm_endpoint.clone())
|
||||
.ok()?
|
||||
.connect_timeout(Duration::from_secs(5))
|
||||
.connect()
|
||||
.await
|
||||
{
|
||||
Ok(channel) => {
|
||||
let client = QuartermasterNotaryClient::new(channel);
|
||||
*guard = Some(client.clone());
|
||||
tracing::info!(endpoint = %self.qm_endpoint, "Connected to Quartermaster notary");
|
||||
Some(client)
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::error!("Failed to canonicalize audit event: {e}");
|
||||
return;
|
||||
}
|
||||
};
|
||||
let content_hash = Sha256::digest(canonical.as_bytes());
|
||||
let leaf_data = format!(
|
||||
"bascule:{}:{}:{}",
|
||||
event.session_id,
|
||||
event.event_id,
|
||||
hex::encode(content_hash)
|
||||
tracing::warn!(
|
||||
endpoint = %self.qm_endpoint,
|
||||
error = %e,
|
||||
"Quartermaster notary unreachable; audit events stored locally, will retry at next flush"
|
||||
);
|
||||
let leaf_hash = qm_core::merkle::hash_leaf(leaf_data.as_bytes());
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Insert into PG
|
||||
/// Record an audit event into the local ledger. If `notarize` is true,
|
||||
/// queue the leaf hash for the next batched CreateAnchor call.
|
||||
///
|
||||
/// Kept named `submit` for source-compat with the filter chain caller
|
||||
/// (`filter/audit.rs::log_and_submit`).
|
||||
pub async fn submit(&self, event: &AuditEvent, notarize: bool) {
|
||||
let leaf_hash = qm_core::merkle::hash_leaf(leaf_data_bytes(event).as_bytes());
|
||||
let command_json = serde_json::to_value(&event.command).unwrap_or_default();
|
||||
let policy_json = serde_json::to_value(&event.policy_decision).unwrap_or_default();
|
||||
let result_json = serde_json::to_value(&event.execution_result).unwrap_or_default();
|
||||
|
|
@ -88,7 +137,6 @@ impl AuditPipeline {
|
|||
return;
|
||||
}
|
||||
|
||||
// Queue for merkle anchoring if needed
|
||||
if notarize {
|
||||
let mut pending = self.pending.lock().await;
|
||||
pending.push(AuditLeaf {
|
||||
|
|
@ -105,6 +153,13 @@ impl AuditPipeline {
|
|||
}
|
||||
|
||||
/// Flush pending leaves to Quartermaster for anchoring.
|
||||
///
|
||||
/// Calls QM's `CreateAnchor` RPC with all pending leaf hashes, then
|
||||
/// updates each event's row with the returned `anchor_id` and its
|
||||
/// position in the batch (`leaf_index`). If QM is unreachable, the
|
||||
/// leaves stay in the pending buffer and retry on the next flush —
|
||||
/// the `notarized=true` flag in PG is already set but `anchor_id`
|
||||
/// remains NULL until a successful anchor lands.
|
||||
pub async fn flush(&self) {
|
||||
let leaves: Vec<AuditLeaf> = {
|
||||
let mut pending = self.pending.lock().await;
|
||||
|
|
@ -117,19 +172,70 @@ impl AuditPipeline {
|
|||
|
||||
tracing::info!(count = leaves.len(), "Flushing audit leaves for anchoring");
|
||||
|
||||
// Phase 2: mark events as anchored in PG.
|
||||
// Actual QM gRPC submission is a future enhancement -- for now we
|
||||
// compute and store the leaf hashes, which is the cryptographic guarantee.
|
||||
// The anchor_id will be set when we integrate QM's FlushAnchor RPC.
|
||||
for leaf in &leaves {
|
||||
let mut client = match self.get_or_connect().await {
|
||||
Some(c) => c,
|
||||
None => {
|
||||
// QM unreachable or disabled. Re-queue leaves for next flush.
|
||||
let mut pending = self.pending.lock().await;
|
||||
pending.extend(leaves);
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
let req = CreateAnchorRequest {
|
||||
cluster_id: self.cluster_id.clone(),
|
||||
leaves: leaves.iter().map(|l| l.leaf_hash.to_vec()).collect(),
|
||||
etcd_revision: 0,
|
||||
};
|
||||
|
||||
match client.create_anchor(req).await {
|
||||
Ok(resp) => {
|
||||
let resp = resp.into_inner();
|
||||
let anchor_id = match Uuid::parse_str(&resp.anchor_id) {
|
||||
Ok(id) => id,
|
||||
Err(e) => {
|
||||
tracing::error!(error = %e, "QM returned unparseable anchor_id; leaves remain un-anchored");
|
||||
// Re-queue for retry.
|
||||
let mut pending = self.pending.lock().await;
|
||||
pending.extend(leaves);
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
tracing::info!(
|
||||
anchor_id = %anchor_id,
|
||||
leaf_count = resp.leaf_count,
|
||||
"Anchor created in Quartermaster"
|
||||
);
|
||||
|
||||
// Update each event row with the returned anchor_id and its
|
||||
// position in the submitted batch.
|
||||
for (idx, leaf) in leaves.iter().enumerate() {
|
||||
let _ = sqlx::query(
|
||||
"UPDATE bascule.audit_events SET notarized = true WHERE event_id = $1",
|
||||
"UPDATE bascule.audit_events
|
||||
SET anchor_id = $1, leaf_index = $2
|
||||
WHERE event_id = $3",
|
||||
)
|
||||
.bind(anchor_id)
|
||||
.bind(idx as i32)
|
||||
.bind(leaf.event_id)
|
||||
.execute(&self.db_pool)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
Err(status) => {
|
||||
tracing::warn!(
|
||||
code = ?status.code(),
|
||||
message = %status.message(),
|
||||
"CreateAnchor RPC failed; leaves remain pending for retry"
|
||||
);
|
||||
// Drop the cached client so next attempt reconnects.
|
||||
*self.notary_client.lock().await = None;
|
||||
let mut pending = self.pending.lock().await;
|
||||
pending.extend(leaves);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Start the background flush loop.
|
||||
pub fn start_flush_loop(
|
||||
|
|
@ -146,6 +252,45 @@ impl AuditPipeline {
|
|||
}
|
||||
}
|
||||
|
||||
/// Classification + ledger-fidelity → notarize decision.
|
||||
///
|
||||
/// OPA decisions can override default per-classification behavior:
|
||||
/// - `always_notarize` → every event anchored, regardless of class
|
||||
/// - `log_only` → no anchor, local ledger row only
|
||||
/// - default → mutative + session-lifecycle events anchored; reads
|
||||
/// stay local
|
||||
pub fn should_notarize(classification: ChangeClassification, ledger_fidelity: &str) -> bool {
|
||||
match ledger_fidelity {
|
||||
"always_notarize" => true,
|
||||
"log_only" => false,
|
||||
_ => matches!(
|
||||
classification,
|
||||
ChangeClassification::Mutative | ChangeClassification::Session
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
/// Canonical leaf-data serialization used for merkle-leaf hashing.
|
||||
fn leaf_data_bytes(event: &AuditEvent) -> String {
|
||||
// Include the fields most salient for tamper-detection: event_id,
|
||||
// session_id, operator, command, classification, execution result,
|
||||
// timestamp. Target resources + profile hash would be next fields to
|
||||
// include if/when the verification API wants to constrain on them.
|
||||
let mut hasher = Sha256::new();
|
||||
hasher.update(event.event_id.as_bytes());
|
||||
hasher.update(event.session_id.as_bytes());
|
||||
hasher.update(event.operator_identity.display_id().as_bytes());
|
||||
if let Ok(cmd) = serde_json::to_string(&event.command) {
|
||||
hasher.update(cmd.as_bytes());
|
||||
}
|
||||
hasher.update(format!("{:?}", event.classification).as_bytes());
|
||||
if let Ok(exec) = serde_json::to_string(&event.execution_result) {
|
||||
hasher.update(exec.as_bytes());
|
||||
}
|
||||
hasher.update(event.timestamp.to_rfc3339().as_bytes());
|
||||
hex::encode(hasher.finalize())
|
||||
}
|
||||
|
||||
/// Build an AuditEvent from the filter chain's request context.
|
||||
pub fn build_audit_event(
|
||||
session_id: Uuid,
|
||||
|
|
@ -200,100 +345,30 @@ fn resolve_api_group(resource_type: &str) -> String {
|
|||
"apps".to_string()
|
||||
}
|
||||
"jobs" | "job" | "cronjobs" | "cronjob" | "cj" => "batch".to_string(),
|
||||
_ => String::new(), // core group
|
||||
_ => "".to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
fn prost_struct_to_json(s: &prost_types::Struct) -> serde_json::Value {
|
||||
// Convert prost Struct fields to a JSON object manually
|
||||
let mut map = serde_json::Map::new();
|
||||
for (key, value) in &s.fields {
|
||||
map.insert(key.clone(), prost_value_to_json(value));
|
||||
}
|
||||
serde_json::Value::Object(map)
|
||||
serde_json::json!(s
|
||||
.fields
|
||||
.iter()
|
||||
.map(|(k, v)| (k.clone(), prost_value_to_json(v)))
|
||||
.collect::<serde_json::Map<String, serde_json::Value>>())
|
||||
}
|
||||
|
||||
fn prost_value_to_json(v: &prost_types::Value) -> serde_json::Value {
|
||||
use prost_types::value::Kind;
|
||||
match &v.kind {
|
||||
Some(prost_types::value::Kind::NullValue(_)) => serde_json::Value::Null,
|
||||
Some(prost_types::value::Kind::NumberValue(n)) => {
|
||||
serde_json::Value::Number(serde_json::Number::from_f64(*n).unwrap_or(serde_json::Number::from(0)))
|
||||
Some(Kind::StringValue(s)) => serde_json::Value::String(s.clone()),
|
||||
Some(Kind::NumberValue(n)) => {
|
||||
serde_json::Number::from_f64(*n).map(serde_json::Value::Number).unwrap_or(serde_json::Value::Null)
|
||||
}
|
||||
Some(prost_types::value::Kind::StringValue(s)) => serde_json::Value::String(s.clone()),
|
||||
Some(prost_types::value::Kind::BoolValue(b)) => serde_json::Value::Bool(*b),
|
||||
Some(prost_types::value::Kind::StructValue(s)) => prost_struct_to_json(s),
|
||||
Some(prost_types::value::Kind::ListValue(l)) => {
|
||||
Some(Kind::BoolValue(b)) => serde_json::Value::Bool(*b),
|
||||
Some(Kind::ListValue(l)) => {
|
||||
serde_json::Value::Array(l.values.iter().map(prost_value_to_json).collect())
|
||||
}
|
||||
None => serde_json::Value::Null,
|
||||
}
|
||||
}
|
||||
|
||||
/// Determine if this event should be notarized based on ledger fidelity.
|
||||
pub fn should_notarize(classification: ChangeClassification, ledger_fidelity: &str) -> bool {
|
||||
match ledger_fidelity {
|
||||
"always_notarize" => true,
|
||||
"log_only" => false,
|
||||
_ => {
|
||||
// Default: notarize mutative operations, log reads
|
||||
matches!(
|
||||
classification,
|
||||
ChangeClassification::Mutative | ChangeClassification::Session
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_audit_leaf_format() {
|
||||
let session_id = Uuid::new_v4();
|
||||
let event_id = Uuid::new_v4();
|
||||
let content = "test content";
|
||||
let content_hash = Sha256::digest(content.as_bytes());
|
||||
let leaf_data = format!(
|
||||
"bascule:{}:{}:{}",
|
||||
session_id,
|
||||
event_id,
|
||||
hex::encode(content_hash)
|
||||
);
|
||||
|
||||
assert!(leaf_data.starts_with("bascule:"));
|
||||
assert!(leaf_data.contains(&session_id.to_string()));
|
||||
assert!(leaf_data.contains(&event_id.to_string()));
|
||||
|
||||
// Verify hash_leaf produces a 32-byte hash
|
||||
let leaf_hash = qm_core::merkle::hash_leaf(leaf_data.as_bytes());
|
||||
assert_eq!(leaf_hash.len(), 32);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_should_notarize() {
|
||||
assert!(should_notarize(
|
||||
ChangeClassification::Mutative,
|
||||
"always_notarize"
|
||||
));
|
||||
assert!(should_notarize(
|
||||
ChangeClassification::Read,
|
||||
"always_notarize"
|
||||
));
|
||||
assert!(!should_notarize(ChangeClassification::Read, "log_only"));
|
||||
assert!(!should_notarize(
|
||||
ChangeClassification::Mutative,
|
||||
"log_only"
|
||||
));
|
||||
// Default behavior
|
||||
assert!(should_notarize(ChangeClassification::Mutative, "default"));
|
||||
assert!(!should_notarize(ChangeClassification::Read, "default"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_resolve_api_group() {
|
||||
assert_eq!(resolve_api_group("deployments"), "apps");
|
||||
assert_eq!(resolve_api_group("pods"), "");
|
||||
assert_eq!(resolve_api_group("jobs"), "batch");
|
||||
Some(Kind::StructValue(s)) => prost_struct_to_json(s),
|
||||
_ => serde_json::Value::Null,
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -43,6 +43,15 @@ pub struct BasculeConfig {
|
|||
#[serde(default = "default_qm_endpoint")]
|
||||
pub qm_endpoint: String,
|
||||
|
||||
// --- Cluster identity (UUID of the FFC cluster this bascule belongs to).
|
||||
// Required for QM CreateAnchor submission: QM validates the cluster_id
|
||||
// on every anchor write against its clusters table. Source the UUID
|
||||
// from `quartermaster.clusters` in the OpsDB (QM generates it at
|
||||
// genesis). Empty string disables QM submission (bascule still stores
|
||||
// audit events locally).
|
||||
#[serde(default)]
|
||||
pub cluster_id: String,
|
||||
|
||||
// --- Accord ---
|
||||
#[serde(default = "default_accord_path")]
|
||||
pub accord_path: String,
|
||||
|
|
|
|||
|
|
@ -132,6 +132,8 @@ spec:
|
|||
let pipeline = Arc::new(audit_pipeline::AuditPipeline::new(
|
||||
pool.clone(),
|
||||
config.audit_batch_size,
|
||||
config.qm_endpoint.clone(),
|
||||
config.cluster_id.clone(),
|
||||
));
|
||||
let _flush_handle = pipeline.clone().start_flush_loop(
|
||||
std::time::Duration::from_secs(config.audit_flush_interval_secs),
|
||||
|
|
@ -142,6 +144,8 @@ spec:
|
|||
Arc::new(audit_pipeline::AuditPipeline::new(
|
||||
sqlx::PgPool::connect_lazy("postgresql://unused:unused@localhost/unused")?,
|
||||
config.audit_batch_size,
|
||||
config.qm_endpoint.clone(),
|
||||
config.cluster_id.clone(),
|
||||
))
|
||||
};
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue