// vectorpin 0.1.0
//
// Verifiable integrity for AI embedding stores.
// Documentation
// Copyright 2025 Jascha Wanger / Tarnover, LLC
// SPDX-License-Identifier: Apache-2.0

//! Pin attestation format and canonicalization.
//!
//! See `docs/spec.md` in the repo root for the protocol specification.
//! In summary: a [`Pin`] is a JSON object with a header (the signed
//! payload) plus a key id and signature. The header canonicalizes to a
//! deterministic byte sequence — sorted keys, no whitespace — that
//! both Python and Rust implementations agree on byte-for-byte.

use std::collections::BTreeMap;

use base64::Engine;
use serde::{Deserialize, Serialize};

/// Wire-format version this crate speaks.
///
/// Written into the `v` field of every [`PinHeader`]; parsing a pin whose
/// `v` differs from this value fails with
/// [`AttestationError::UnsupportedVersion`].
pub const PROTOCOL_VERSION: u32 = 1;

/// The signed portion of a [`Pin`].
///
/// Two pins are considered equivalent iff their headers canonicalize to
/// identical bytes. Optional fields (`model_hash`, `extra`) are omitted
/// from the canonical form when unset, never written as `null`.
///
/// Note that the derived `Serialize` impl emits fields in declaration
/// order; the bytes that are actually signed come from
/// [`PinHeader::canonicalize`], which sorts keys lexicographically.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct PinHeader {
    /// Protocol version. Must equal [`PROTOCOL_VERSION`].
    pub v: u32,
    /// Embedding model identifier.
    pub model: String,
    /// Optional content hash of the model weights. Skipped entirely during
    /// serialization when `None`, and defaults to `None` when absent on input.
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub model_hash: Option<String>,
    /// SHA-256 of the source text (UTF-8 NFC).
    pub source_hash: String,
    /// SHA-256 of the embedding vector under the declared dtype.
    pub vec_hash: String,
    /// `"f32"` or `"f64"`. Stored as a free-form string; this type does not
    /// validate the value.
    pub vec_dtype: String,
    /// Embedding dimensionality.
    pub vec_dim: u32,
    /// RFC 3339 / ISO 8601 timestamp in UTC, e.g. `"2026-05-05T12:00:00Z"`.
    /// Stored as a string; this type does not parse or validate it.
    pub ts: String,
    /// Producer-defined string-to-string metadata, signed alongside the
    /// rest of the header. Omitted from the canonical form when empty.
    /// A `BTreeMap` is used so iteration is always in sorted key order.
    #[serde(skip_serializing_if = "BTreeMap::is_empty", default)]
    pub extra: BTreeMap<String, String>,
}

impl PinHeader {
    /// Stable byte representation for signing/verifying.
    ///
    /// JSON with sorted keys and no whitespace. `BTreeMap` gives us
    /// sorted `extra` for free; field order is fixed by hand below to
    /// match the Python reference.
    pub fn canonicalize(&self) -> Vec<u8> {
        // Manually build a sorted JSON object — this is the contract the
        // Python implementation also follows. Using `serde_json::to_vec`
        // on the struct directly would emit fields in declaration order,
        // not lexicographic order, which would break compatibility.
        let mut entries: Vec<(&'static str, serde_json::Value)> = Vec::new();
        entries.push(("v", serde_json::Value::Number(self.v.into())));
        entries.push(("model", serde_json::Value::String(self.model.clone())));
        if let Some(h) = &self.model_hash {
            entries.push(("model_hash", serde_json::Value::String(h.clone())));
        }
        entries.push((
            "source_hash",
            serde_json::Value::String(self.source_hash.clone()),
        ));
        entries.push(("vec_hash", serde_json::Value::String(self.vec_hash.clone())));
        entries.push((
            "vec_dtype",
            serde_json::Value::String(self.vec_dtype.clone()),
        ));
        entries.push(("vec_dim", serde_json::Value::Number(self.vec_dim.into())));
        entries.push(("ts", serde_json::Value::String(self.ts.clone())));
        if !self.extra.is_empty() {
            // BTreeMap iterates in sorted key order; preserve that as a
            // serde_json::Map (which is also sorted in our build via the
            // preserve_order feature being OFF — the default).
            let mut m = serde_json::Map::new();
            for (k, val) in &self.extra {
                m.insert(k.clone(), serde_json::Value::String(val.clone()));
            }
            entries.push(("extra", serde_json::Value::Object(m)));
        }

        // Sort the top-level entries lexicographically. This is the rule
        // the Python implementation follows via `sort_keys=True`.
        entries.sort_by(|a, b| a.0.cmp(b.0));
        let mut map = serde_json::Map::with_capacity(entries.len());
        for (k, v) in entries {
            map.insert(k.to_string(), v);
        }
        serde_json::to_vec(&serde_json::Value::Object(map))
            .expect("JSON serialization of well-formed map cannot fail")
    }
}

/// A signed VectorPin attestation. Serialize with [`Pin::to_json`] and
/// store alongside the embedding in vector store metadata.
///
/// On the wire the header fields, `kid` and `sig` all live in one flat
/// JSON object; this struct separates the signed part (`header`) from the
/// key id and signature that accompany it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Pin {
    /// The signed payload.
    pub header: PinHeader,
    /// Identifier of the signing key — verifiers route to a public key by `kid`.
    pub kid: String,
    /// Raw Ed25519 signature bytes (64 bytes). Encoded as unpadded URL-safe
    /// base64 in JSON. The length is not checked by this type or by
    /// [`Pin::from_value`]; callers verifying a pin must validate it.
    pub sig: Vec<u8>,
}

/// Encode `data` with the URL-safe base64 alphabet and no `=` padding,
/// the same form the Python encoder emits for signatures.
pub(crate) fn b64url_encode(data: &[u8]) -> String {
    let engine = &base64::engine::general_purpose::URL_SAFE_NO_PAD;
    engine.encode(data)
}

/// Inverse of [`b64url_encode`]: decode unpadded URL-safe base64.
///
/// The no-pad engine is used, so input is expected without trailing `=`.
/// NOTE(review): padded input is presumably rejected by this engine rather
/// than tolerated — confirm against the `base64` crate version in use.
///
/// # Errors
///
/// Returns [`AttestationError::Base64`] when `s` is not valid for this
/// encoding.
pub(crate) fn b64url_decode(s: &str) -> Result<Vec<u8>, AttestationError> {
    let engine = &base64::engine::general_purpose::URL_SAFE_NO_PAD;
    match engine.decode(s.as_bytes()) {
        Ok(bytes) => Ok(bytes),
        Err(err) => Err(AttestationError::Base64(err)),
    }
}

/// Errors produced when parsing or serializing pins.
///
/// The `Json` and `Base64` variants carry `#[from]` conversions, so the
/// underlying library errors propagate through `?` without manual mapping.
#[derive(Debug, thiserror::Error)]
pub enum AttestationError {
    /// Pin uses a protocol version this crate does not support.
    #[error("unsupported pin version: got {got}, expected {expected}")]
    UnsupportedVersion {
        /// Version number found in the pin.
        got: u32,
        /// Version number this build supports.
        expected: u32,
    },
    /// JSON parsing failure.
    #[error("malformed pin JSON: {0}")]
    Json(#[from] serde_json::Error),
    /// Base64 decode failure (signature or related field).
    #[error("malformed base64: {0}")]
    Base64(#[from] base64::DecodeError),
    /// A required field was missing from the pin JSON. The parser in this
    /// file also reports this when a field is present but has the wrong
    /// JSON type, and uses the name `"(root)"` when the top-level value is
    /// not an object.
    #[error("missing required field: {0}")]
    MissingField(&'static str),
}

impl Pin {
    /// Encode the pin as compact JSON suitable for vector DB metadata.
    ///
    /// Output is sorted-key, whitespace-free JSON exactly matching what
    /// the Python implementation emits, so `to_json()` is deterministic
    /// across implementations.
    pub fn to_json(&self) -> String {
        let mut entries: Vec<(&str, serde_json::Value)> = Vec::new();
        entries.push(("v", serde_json::Value::Number(self.header.v.into())));
        entries.push((
            "model",
            serde_json::Value::String(self.header.model.clone()),
        ));
        if let Some(h) = &self.header.model_hash {
            entries.push(("model_hash", serde_json::Value::String(h.clone())));
        }
        entries.push((
            "source_hash",
            serde_json::Value::String(self.header.source_hash.clone()),
        ));
        entries.push((
            "vec_hash",
            serde_json::Value::String(self.header.vec_hash.clone()),
        ));
        entries.push((
            "vec_dtype",
            serde_json::Value::String(self.header.vec_dtype.clone()),
        ));
        entries.push((
            "vec_dim",
            serde_json::Value::Number(self.header.vec_dim.into()),
        ));
        entries.push(("ts", serde_json::Value::String(self.header.ts.clone())));
        if !self.header.extra.is_empty() {
            let mut m = serde_json::Map::new();
            for (k, val) in &self.header.extra {
                m.insert(k.clone(), serde_json::Value::String(val.clone()));
            }
            entries.push(("extra", serde_json::Value::Object(m)));
        }
        entries.push(("kid", serde_json::Value::String(self.kid.clone())));
        entries.push(("sig", serde_json::Value::String(b64url_encode(&self.sig))));
        entries.sort_by(|a, b| a.0.cmp(b.0));
        let mut map = serde_json::Map::with_capacity(entries.len());
        for (k, v) in entries {
            map.insert(k.to_string(), v);
        }
        serde_json::to_string(&serde_json::Value::Object(map))
            .expect("JSON serialization of well-formed map cannot fail")
    }

    /// Parse a pin from its compact JSON wire form.
    pub fn from_json(s: &str) -> Result<Self, AttestationError> {
        let value: serde_json::Value = serde_json::from_str(s)?;
        Self::from_value(value)
    }

    /// Parse a pin from a parsed `serde_json::Value`.
    pub fn from_value(value: serde_json::Value) -> Result<Self, AttestationError> {
        let obj = value
            .as_object()
            .ok_or(AttestationError::MissingField("(root)"))?;

        let v = obj
            .get("v")
            .and_then(|x| x.as_u64())
            .ok_or(AttestationError::MissingField("v"))? as u32;
        if v != PROTOCOL_VERSION {
            return Err(AttestationError::UnsupportedVersion {
                got: v,
                expected: PROTOCOL_VERSION,
            });
        }

        fn s_field(
            obj: &serde_json::Map<String, serde_json::Value>,
            name: &'static str,
        ) -> Result<String, AttestationError> {
            obj.get(name)
                .and_then(|x| x.as_str())
                .map(str::to_owned)
                .ok_or(AttestationError::MissingField(name))
        }

        let header = PinHeader {
            v,
            model: s_field(obj, "model")?,
            model_hash: obj
                .get("model_hash")
                .and_then(|x| x.as_str())
                .map(String::from),
            source_hash: s_field(obj, "source_hash")?,
            vec_hash: s_field(obj, "vec_hash")?,
            vec_dtype: s_field(obj, "vec_dtype")?,
            vec_dim: obj
                .get("vec_dim")
                .and_then(|x| x.as_u64())
                .ok_or(AttestationError::MissingField("vec_dim"))? as u32,
            ts: s_field(obj, "ts")?,
            extra: obj
                .get("extra")
                .and_then(|x| x.as_object())
                .map(|m| {
                    m.iter()
                        .filter_map(|(k, v)| v.as_str().map(|s| (k.clone(), s.to_owned())))
                        .collect()
                })
                .unwrap_or_default(),
        };

        let kid = s_field(obj, "kid")?;
        let sig = b64url_decode(s_field(obj, "sig")?.as_str())?;

        Ok(Pin { header, kid, sig })
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `sha256:`-prefixed digest string of 64 repeated characters.
    fn fake_digest(fill: &str) -> String {
        format!("sha256:{}", fill.repeat(64))
    }

    /// Minimal valid header with neither optional field populated.
    fn header() -> PinHeader {
        PinHeader {
            v: PROTOCOL_VERSION,
            model: String::from("test-model"),
            model_hash: None,
            source_hash: fake_digest("0"),
            vec_hash: fake_digest("1"),
            vec_dtype: String::from("f32"),
            vec_dim: 3072,
            ts: String::from("2026-05-05T12:00:00Z"),
            extra: BTreeMap::new(),
        }
    }

    /// A pin around [`header`] with a placeholder 64-byte signature.
    fn sample_pin() -> Pin {
        Pin {
            header: header(),
            kid: String::from("k"),
            sig: vec![1u8; 64],
        }
    }

    #[test]
    fn canonicalize_is_deterministic() {
        let subject = header();
        let first = subject.canonicalize();
        let second = subject.canonicalize();
        assert_eq!(first, second);
    }

    #[test]
    fn canonicalize_omits_optional_when_unset() {
        let bytes = header().canonicalize();
        let raw = String::from_utf8(bytes).unwrap();
        for absent in ["model_hash", "extra"] {
            assert!(!raw.contains(absent));
        }
    }

    #[test]
    fn pin_round_trip_via_json() {
        let pin = sample_pin();
        let wire = pin.to_json();
        let restored = Pin::from_json(&wire).unwrap();
        assert_eq!(pin, restored);
    }

    #[test]
    fn pin_rejects_unsupported_version() {
        let bad = serde_json::json!({
            "v": 99,
            "model": "x",
            "source_hash": fake_digest("0"),
            "vec_hash": fake_digest("1"),
            "vec_dtype": "f32",
            "vec_dim": 1,
            "ts": "2026-05-05T12:00:00Z",
            "kid": "k",
            "sig": "AA",
        });
        let outcome = Pin::from_value(bad);
        assert!(matches!(
            outcome.unwrap_err(),
            AttestationError::UnsupportedVersion { .. }
        ));
    }

    #[test]
    fn pin_to_json_is_compact() {
        let j = sample_pin().to_json();
        // Pretty-printers put a space after separators; compact output must not.
        for spaced in [": ", ", "] {
            assert!(!j.contains(spaced));
        }
    }
}