pithy-core 0.0.2

UltraCoS® symbolic token compression — 17-rule encoder for LLM prompts. PolyForm Noncommercial.
Documentation
//! Pithy interface contracts — FROZEN at Phase 0.
//!
//! This module defines the trait and type contracts that every other
//! crate in the workspace depends on. Semver commitment: any breaking
//! change to these types or traits requires a major-version bump and
//! coordinated release across all consumers.
//!
//! Copyright (c) 2026 Mikko Parkkola. All rights reserved.
//! Licensed under PolyForm Noncommercial 1.0 + Pithy Attribution Rider.

use serde::{Deserialize, Serialize};

/// Output format selected for a payload: one of the four compression formats
/// defined by the Pithy specification, or [`Format::Prose`], the uncompressed
/// fallback/baseline (five variants in total).
///
/// No serde attributes are applied, so serde's default enum representation is
/// used (each unit variant is written as its variant name).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Format {
    /// Symbolic notation (arrow chains, unicode glyphs, parenthetical state).
    Symbolic,
    /// Just-in-time progressive vocabulary binding: bind on first re-use.
    JitProgressive,
    /// Fragment prose: articles dropped, grammar compressed.
    FragmentProse,
    /// Structured delimiters: colon-key / pipe-separated fields.
    StructuredDelim,
    /// No compression applied. Fallback and baseline.
    Prose,
}

/// The upstream LLM whose tokenizer rules the measurement.
///
/// This type is `Clone` but not `Copy` (several variants carry a `String`),
/// so an API that takes `Model` by value consumes the caller's value.
///
/// No serde attributes are applied, so serde's default externally-tagged
/// representation is used: unit variants are written as their bare name
/// (e.g. `"ClaudeOpus47"`), and payload variants as a single-key map whose
/// key is the variant name (e.g. `{"Llama3Custom": "..."}`).
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Model {
    // Anthropic
    ClaudeOpus47,
    ClaudeSonnet47,
    ClaudeHaiku47,
    // OpenAI
    Gpt5,
    Gpt4o,
    Gpt4,
    // Google
    Gemini25Ultra,
    Gemini25Pro,
    // Open weights
    /// Custom Llama 3 model. The payload identifies the specific build; the
    /// unit test in this file uses `"meta-llama/Llama-3-70b"`.
    Llama3Custom(String),
    /// Custom Qwen 3 model. Presumably the payload identifies the specific
    /// build in the same way as `Llama3Custom` — TODO confirm.
    Qwen3Custom(String),
    // xAI
    Grok4,
    /// Catch-all for registered tokenizers not yet in the enum.
    /// NOTE(review): the payload is presumably the tokenizer registry key —
    /// confirm against the measurer implementation.
    Registered(String),
}

/// Reason the encoder fell back to a less aggressive format.
///
/// Carried on the `fallback` field of `Compressed` and accepted as the
/// `reason` argument of `Encoder::fallback`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum FallbackReason {
    /// Selector judged compression would be net-negative.
    Uncompressible,
    /// Tokenizer not registered for the target model.
    TokenizerMissing,
    /// Downstream quality gate failed (semantic fidelity below threshold).
    QualityDegraded,
    /// Content exceeds max input size for the selected format.
    OversizedInput,
    /// Encoder panic or unrecoverable error.
    EncoderFault,
    /// Input contains markdown/structured markers (headings, list items,
    /// fenced code blocks, or multiple paragraphs). Symbolic compression
    /// collapses whitespace unconditionally via `MULTI_WS`, which
    /// destroys paragraph breaks and newline-bearing structure.
    /// This fallback preserves structure at the cost of some savings.
    StructuredContent,
}

/// Output of a single compression operation.
///
/// Returned by every `Encoder` method that produces output, including on
/// fallback; `format` reports what was actually used.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Compressed {
    /// Compressed text ready to send to an upstream LLM.
    pub content: String,
    /// Which format was selected.
    pub format: Format,
    /// Real tokenizer count of the original prose input.
    pub baseline_tokens: u32,
    /// Real tokenizer count of the compressed output.
    pub compressed_tokens: u32,
    /// Model whose tokenizer was used for the measurement.
    pub model: Model,
    /// Blake3 hash of the original prose input (32 bytes hex-encoded).
    pub content_hash: String,
    /// Fallback reason if format is Prose and baseline != compressed semantically.
    ///
    /// NOTE(review): `FallbackReason` is documented as covering any fall back
    /// "to a less aggressive format", which is broader than `Format::Prose`.
    /// Confirm whether this field is also set when the encoder falls back to
    /// a non-`Prose` format.
    pub fallback: Option<FallbackReason>,
}

/// A pass-through encoder interface. Implementors MUST preserve semantic fidelity
/// of the compressed output at or above the Phase-0 kill-switch threshold (≥0.90
/// cosine similarity under the target LLM's response).
///
/// None of the methods returns a `Result`: degraded outcomes are reported
/// through the returned `Compressed` (its `format` and `fallback` fields).
/// Every method takes `model` by value; `Model` is not `Copy`, so callers
/// that need the value afterwards must clone it.
pub trait Encoder: Send + Sync {
    /// Compress `input` for the target `model`. Must run in <5ms at p95.
    /// Returns `Compressed` even on fallback; format field reports what was used.
    fn compress(&self, input: &str, model: Model) -> Compressed;

    /// Pick the best format for `input` without actually encoding. Used by
    /// A/B samplers and planners. Must match [`compress`](Self::compress)'s
    /// selection.
    fn select_format(&self, input: &str, model: Model) -> Format;

    /// Explicit fallback request — used when downstream quality monitor flags
    /// a previously-compressed payload.
    fn fallback(&self, input: &str, model: Model, reason: FallbackReason) -> Compressed;
}

/// Count real tokens for the target model. No estimation, no synthetic fallback.
///
/// Unlike `Encoder`, both methods borrow the `Model` rather than consuming it.
pub trait Measurer: Send + Sync {
    /// Returns the actual token count for `text` under `model`'s tokenizer.
    /// Implementations MUST return `Err` for unregistered models rather than
    /// silently approximating — this is a DoR gate (no hardcoded ratios).
    ///
    /// # Errors
    /// Returns a [`TokenizerError`]; `TokenizerError::NotRegistered` is the
    /// variant intended for the unregistered-model case described above.
    fn tokenize(&self, text: &str, model: &Model) -> Result<u32, TokenizerError>;

    /// Returns true iff the tokenizer for `model` is registered and callable.
    fn supported(&self, model: &Model) -> bool;
}

/// Error from the measurer path. Kept narrow so downstream code can fall back.
///
/// `Display` text comes from the `#[error(...)]` attributes below.
#[derive(Debug, thiserror::Error)]
pub enum TokenizerError {
    /// No tokenizer is registered for the given model. The model is rendered
    /// with its `Debug` representation in the message.
    #[error("tokenizer for model {0:?} is not registered")]
    NotRegistered(Model),
    /// I/O failure on the tokenizer path; the payload is the detail text
    /// interpolated into the message.
    #[error("tokenizer I/O failure: {0}")]
    Io(String),
    /// Error reported by the underlying tokenizer library; the payload is the
    /// detail text interpolated into the message.
    #[error("tokenizer library error: {0}")]
    Library(String),
}

/// A single signed measurement record. Wire format frozen at v0.1.
///
/// Every field added after v0.1 is an `Option` marked `#[serde(default)]`,
/// so records written by older producers (which omit the field entirely)
/// still deserialize, with the missing field read as `None`.
///
/// # Changelog
/// - v0.0.2: Added `dialect` and `rules_applied` as backward-compatible
///   `#[serde(default)]` optional fields. Both default to `None` for v0.1
///   readers. Phase-2 Agent D's encoder populates them.
/// - v0.0.3: Added `rule_fire_counts` and `quality` as backward-compatible
///   `#[serde(default)]` optional fields. Both are `None` for records from
///   v0.0.2-and-earlier producers.
/// - v0.0.4: Added `bytes_saved_by_rule` as backward-compatible
///   `#[serde(default)]` optional field. `None` for v0.0.3-and-earlier
///   readers. Wired by Step 9 of `docs/PLAN_2026-04-24.md` — schema
///   lands now so future encoder instrumentation (per-rule byte-delta
///   callbacks) can populate without a second migration. Until then
///   the field is always `None` at produce time. Consumers must handle
///   `None` gracefully (treat as "no attribution available").
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Record {
    /// ULID request identifier.
    pub request_id: String,
    /// ISO-8601 timestamp with millisecond precision.
    pub ts: String,
    /// Tenant id. Required for multi-tenant reporting.
    pub tenant_id: String,
    /// Model the request was routed to.
    pub model: Model,
    /// Route classification (agent-to-agent, agent-to-human, etc).
    pub route: String,
    /// Input-stream measurement (baseline vs compressed).
    pub input: StreamDelta,
    /// Output-stream measurement.
    pub output: StreamDelta,
    /// Thinking-stream measurement (when provider exposes it).
    /// Not marked `#[serde(default)]`, unlike the optional fields added later.
    pub thinking: Option<StreamDelta>,
    /// Cost translation at capture time.
    pub cost: CostEntry,
    /// Cryptographic integrity fields.
    pub integrity: Integrity,
    /// Pithy dialect family used for compression (e.g. `"ultracos-symbolic-v1"`).
    /// Added in v0.0.2. Populated by Phase-2 encoder; `None` for v0.1 records.
    #[serde(default)]
    pub dialect: Option<String>,
    /// Ordered list of compression rules applied during encoding.
    /// Added in v0.0.2. Populated by Phase-2 encoder; `None` for v0.1 records.
    #[serde(default)]
    pub rules_applied: Option<Vec<String>>,
    /// Per-rule firing counts (alphabetical by rule name).
    /// Added in v0.0.3. Populated by `EncoderTrace::as_pairs` so the
    /// post-hoc analyzer can attribute savings and quality regressions
    /// to specific rules without re-running the encoder. `None` when
    /// the encoder did not run (Prose fallback) or the producer is on
    /// the v0.0.2 wire schema.
    #[serde(default)]
    pub rule_fire_counts: Option<Vec<(String, u32)>>,
    /// Optional shadow-arm quality measurement. Set when the proxy
    /// ran the same prompt through both the compressed and the prose
    /// arm at sample rate, so the closed-loop tuner can correlate
    /// compression with response quality. Added in v0.0.3.
    #[serde(default)]
    pub quality: Option<QualityScore>,
    /// Per-rule byte-savings attribution, in UTF-8 bytes of input payload.
    /// Added in v0.0.4 as the substrate for PLAN Step 9. Aggregate
    /// `delta_tokens` + `rule_fire_counts` tells us WHICH rules fired
    /// and HOW OFTEN, but not WHICH BYTES each rule removed from the
    /// payload. Step 8+ rule-vs-rule tuning needs the latter.
    ///
    /// Populated by the encoder once per-rule byte-delta callbacks are
    /// wired (separate follow-up commit). Until then, this field is
    /// `None` at produce time and consumers MUST treat `None` as "no
    /// attribution available", not as "zero savings".
    ///
    /// Units are **bytes of input before vs after the rule ran**, not
    /// tokens. The bandit converts to token-equivalents against the
    /// target-model tokenizer at aggregation time, so this field stays
    /// tokenizer-agnostic and doesn't drift when tokenizer choices
    /// change (see B8 DeepSeek-V4 addition).
    #[serde(default)]
    pub bytes_saved_by_rule: Option<Vec<(String, u64)>>,
}

/// Optional quality measurement attached to a `Record` when the
/// proxy ran a shadow comparison between the compressed and the
/// prose arm at sample rate.
///
/// Recorded ONCE at sample time -- never re-derived from response
/// text -- so the audit ledger remains the single source of truth.
///
/// Derives `PartialEq` but not `Eq`, because it holds `f32` fields.
/// The documented `[0.0, 1.0]` ranges are a convention; nothing in this
/// type enforces them.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct QualityScore {
    /// Cosine similarity in `[0.0, 1.0]` of compressed-arm vs
    /// prose-arm response embeddings. Pin the embedder via
    /// `embedder_id` so values stay comparable across releases.
    pub cosine: f32,
    /// Categorical judge verdict: `"yes"`, `"partial"`, `"no"`, or
    /// `"indeterminate"`. From the cheap comparator model.
    pub judge_verdict: String,
    /// Identifier of the model used for `judge_verdict`; pinned for
    /// reproducibility (e.g. `"claude-haiku-4.5"`).
    pub judge_model: String,
    /// Identifier of the embedder used for `cosine`; pinned (e.g.
    /// `"sentence-transformers/all-MiniLM-L6-v2@v2.7.0"`).
    pub embedder_id: String,
    /// Sample rate in `[0.0, 1.0]` this record was drawn at; lets
    /// the aggregator weight per-record contributions when computing
    /// fleet-level quality.
    pub sample_rate: f32,
}

/// Per-stream delta between uncompressed and compressed.
///
/// NOTE(review): the sign and ratio conventions documented on
/// `delta_tokens` and `compressed_pct` are inferred from the fixture in this
/// file's unit test (100 → 80 tokens recorded as `delta_tokens = -20`,
/// `compressed_pct = 0.8`). Nothing in this type computes or validates
/// them — confirm against the producer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StreamDelta {
    /// Token count of the stream without compression.
    pub baseline_tokens: u32,
    /// Token count of the stream with compression applied.
    pub compressed_tokens: u32,
    /// Signed token difference. Appears to be
    /// `compressed_tokens - baseline_tokens`, so a negative value means
    /// tokens were saved.
    pub delta_tokens: i32,
    /// Appears to be the ratio `compressed_tokens / baseline_tokens`
    /// expressed as a fraction (`0.8`, not `80.0`) despite the `_pct` suffix.
    pub compressed_pct: f32,
    /// Format recorded for this stream, if any.
    pub format: Option<Format>,
}

/// Cost translation for a single record.
///
/// All amounts are in cents, stored as `f64`, so fractional cents are
/// representable.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CostEntry {
    /// Currency of the amounts below. The unit-test fixture in this file uses
    /// `"USD"`; presumably an ISO-4217 code — TODO confirm.
    pub currency: String,
    /// Cost, in cents, of the request at its baseline (uncompressed) size.
    pub baseline_cost_cents: f64,
    /// Cost, in cents, actually incurred.
    pub actual_cost_cents: f64,
    /// Savings in cents. In the unit-test fixture this equals
    /// `baseline_cost_cents - actual_cost_cents` (1.0 − 0.8 = 0.2); the
    /// relationship is not enforced by this type.
    pub savings_cents: f64,
    /// Opaque snapshot id pointing at the versioned pricing table in use.
    pub pricing_snapshot_id: String,
}

/// Cryptographic fields of a signed record.
/// Phase 1 must populate BOTH ed25519 and ML-DSA signatures (Day-1 PQC).
///
/// Both signature fields are plain `String`s rather than `Option`s, so the
/// schema has no way to represent a record with only one signature.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Integrity {
    /// Blake3 hash of the prose input (hex).
    pub prompt_hash: String,
    /// Blake3 hash of the model response (hex).
    pub response_hash: String,
    /// Classical ed25519 signature over the canonicalized record (hex).
    pub ed25519_signature: String,
    /// Post-quantum ML-DSA (FIPS 204) signature (hex). Required.
    pub mldsa_signature: String,
    /// Identifier of the signing key pair.
    pub signing_key_id: String,
}

/// Log of signed measurement records. Append-only; tampering detectable via merkle.
///
/// Both methods take `&self`, and implementors must be `Send + Sync`.
pub trait MeasurementLog: Send + Sync {
    /// Append a record to the log. Returns the record id on success.
    ///
    /// Takes ownership of `r`.
    ///
    /// # Errors
    /// Returns a [`LogError`] when the record cannot be appended.
    fn record(&self, r: Record) -> Result<RecordId, LogError>;

    /// Retrieve a record by id for spot-check audit.
    ///
    /// NOTE(review): `Ok(None)` presumably means no record is stored under
    /// `id`, as distinct from a lookup failure (`Err`) — confirm with
    /// implementors.
    fn get(&self, id: &RecordId) -> Result<Option<Record>, LogError>;
}

/// Opaque id for a stored record.
///
/// Newtype over `String`. It is returned by `MeasurementLog::record` and
/// accepted by `MeasurementLog::get`. The inner field is public, so the
/// "opaque" contract is a convention rather than something the type enforces.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct RecordId(pub String);

/// Error type for the measurement log.
///
/// Each variant carries a detail string that is interpolated into the
/// `Display` message defined by its `#[error(...)]` attribute.
#[derive(Debug, thiserror::Error)]
pub enum LogError {
    /// Reading from or writing to the log failed.
    #[error("log I/O failure: {0}")]
    Io(String),
    /// Signing the record failed.
    #[error("signing failure: {0}")]
    Signing(String),
    /// Canonicalizing the record failed.
    #[error("canonicalization failure: {0}")]
    Canonicalization(String),
}

/// Dual-signing facade. Implementations sign with BOTH ed25519 and ML-DSA.
pub trait Signer: Send + Sync {
    /// Return (ed25519_sig_hex, mldsa_sig_hex) over `canonical_bytes`.
    ///
    /// The tuple order matters: element 0 is the ed25519 signature and
    /// element 1 the ML-DSA signature. These correspond to the
    /// `ed25519_signature` and `mldsa_signature` fields of `Integrity`.
    ///
    /// # Errors
    /// Returns a [`SignerError`] when a signature cannot be produced.
    fn dual_sign(&self, canonical_bytes: &[u8]) -> Result<(String, String), SignerError>;

    /// Return the signing-key-pair identifier recorded on every output.
    fn key_id(&self) -> &str;
}

/// Error type from the signer.
///
/// `Display` text comes from the `#[error(...)]` attributes below.
#[derive(Debug, thiserror::Error)]
pub enum SignerError {
    /// The ed25519 signing step failed; the payload is the detail text.
    #[error("ed25519 error: {0}")]
    Ed25519(String),
    /// The ML-DSA signing step failed; the payload is the detail text.
    #[error("ML-DSA error: {0}")]
    MlDsa(String),
    /// No signing key is loaded.
    #[error("key not loaded")]
    KeyMissing,
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A `Format` value must come back unchanged from a JSON
    /// serialize → deserialize cycle.
    #[test]
    fn format_round_trip_json() {
        let original = Format::JitProgressive;
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded = serde_json::from_str::<Format>(&encoded).unwrap();
        assert_eq!(original, decoded);
    }

    /// A payload-carrying `Model` variant serializes with its variant name
    /// present in the JSON output.
    #[test]
    fn model_custom_llama_supported() {
        let llama = Model::Llama3Custom(String::from("meta-llama/Llama-3-70b"));
        let encoded = serde_json::to_string(&llama).unwrap();
        assert!(encoded.contains("Llama3Custom"));
    }

    /// B9/Step-9 substrate: a v0.0.3 wire-format record (one with no
    /// `bytes_saved_by_rule` key at all) has to deserialize cleanly against
    /// the v0.0.4 schema — that is the whole point of `#[serde(default)]`
    /// on the field. A populated field must then survive a round trip.
    #[test]
    fn record_bytes_saved_by_rule_field_is_backward_compatible() {
        // Legacy payload: only the fields that carry no `#[serde(default)]`.
        let legacy_json = r#"{
            "request_id": "01HY0000000000000000000000",
            "ts": "2026-04-24T00:00:00.000Z",
            "tenant_id": "t",
            "model": "ClaudeOpus47",
            "route": "a2a",
            "input":  {"baseline_tokens": 100, "compressed_tokens": 80, "delta_tokens": -20, "compressed_pct": 0.8, "format": null},
            "output": {"baseline_tokens": 50,  "compressed_tokens": 50, "delta_tokens":  0, "compressed_pct": 1.0, "format": null},
            "thinking": null,
            "cost": {"currency": "USD", "baseline_cost_cents": 1.0, "actual_cost_cents": 0.8, "savings_cents": 0.2, "pricing_snapshot_id": "p1"},
            "integrity": {"prompt_hash": "a", "response_hash": "b", "ed25519_signature": "c", "mldsa_signature": "d", "signing_key_id": "k"}
        }"#;
        let legacy = serde_json::from_str::<Record>(legacy_json)
            .expect("v0.0.3 record must parse under v0.0.4 schema");
        assert!(
            legacy.bytes_saved_by_rule.is_none(),
            "missing field must deserialize as None, not surface an error"
        );

        // Populate the new field on top of the legacy record and round-trip it.
        let attribution: Vec<(String, u64)> = vec![
            ("json_minified".to_string(), 42),
            ("term_substitutions".to_string(), 17),
        ];
        let populated = Record {
            bytes_saved_by_rule: Some(attribution.clone()),
            ..legacy
        };

        let wire = serde_json::to_string(&populated).unwrap();
        assert!(wire.contains("\"bytes_saved_by_rule\""));

        let reparsed = serde_json::from_str::<Record>(&wire).unwrap();
        assert_eq!(reparsed.bytes_saved_by_rule, Some(attribution));
    }
}