pithy-core 0.0.2

UltraCoS® symbolic token compression — 17-rule encoder for LLM prompts. PolyForm Noncommercial.
Documentation
//! Dialect dispatcher - picks grammar rule set per tokenizer generation.
//!
//! Empirical basis: F-Gram per-rule per-tokenizer sweep (commit `692d3e6`).
//! See `docs/DIALECT_FAMILY.md` for the full specification, crossover data,
//! and kill-switches that gate future dialect additions.
//!
//! Copyright (c) 2026 Mikko Parkkola. Licensed under PolyForm Noncommercial 1.0.

use serde::{Deserialize, Serialize};

use crate::interfaces::Model;

/// The set of shipped Pithy dialects. Frozen at v0.1.
///
/// A dialect names a fixed rule set: [`rules_for`] expands a dialect into its
/// ordered rules, and [`pick_dialect`] chooses the dialect for a given model.
///
/// - `Base`: four universally-compressive rules. Ships unconditionally.
/// - `LegacyBpe`: `Base` + `jit_binding` (only positive on legacy-generation BPEs
///   per F-Gram). Modern BPEs get `Base` only because `jit_binding` is negative
///   or within-noise on them and adds ceremony overhead.
///
/// The serde derives carry no rename attributes, so in self-describing formats
/// such as JSON each variant is written under its Rust identifier, which is
/// the same text [`Dialect::name`] returns.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Dialect {
    /// Four universal rules: arrow_chain, state_parens, subject_elision, typed_slots.
    Base,
    /// Base plus `jit_binding`. Only for legacy-generation tokenizers.
    LegacyBpe,
}

impl Dialect {
    /// Canonical string name as it appears in measurement records.
    #[must_use]
    pub const fn name(&self) -> &'static str {
        match self {
            Self::Base => "Base",
            Self::LegacyBpe => "LegacyBpe",
        }
    }
}

/// The grammar rules that can be emitted by an encoder.
///
/// Rule identifiers match F-Gram `research/grammar_test_vectors.json`.
///
/// Variants are declared in the same order [`rules_for`] lists them for
/// `Dialect::LegacyBpe`; the first four make up `Dialect::Base`.
///
/// NOTE(review): the serde derives carry no rename attributes, so in
/// self-describing formats such as JSON a rule is written under its variant
/// identifier (e.g. `ArrowChain`), whereas [`Rule::name`] returns the
/// snake_case form (`arrow_chain`). Confirm which spelling measurement records
/// are expected to carry.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Rule {
    /// Arrow chains replacing if/then/and/so conjunctions.
    ArrowChain,
    /// `(ok)/(~)/(x)/(?)` replacing adjectival state clauses.
    StateParens,
    /// `subj: a -> b -> c` dropping repeated subjects.
    SubjectElision,
    /// `name:type?|*+` borrowed from BNF/YAML for type/alt/opt/rep.
    TypedSlots,
    /// `Long(Short)` then reuse `Short`. Only in `LegacyBpe`.
    JitBinding,
}

impl Rule {
    /// Canonical string name as it appears in measurement records.
    #[must_use]
    pub const fn name(&self) -> &'static str {
        match self {
            Self::ArrowChain => "arrow_chain",
            Self::StateParens => "state_parens",
            Self::SubjectElision => "subject_elision",
            Self::TypedSlots => "typed_slots",
            Self::JitBinding => "jit_binding",
        }
    }
}

/// Select the [`Dialect`] to encode with for `model`, keyed on tokenizer
/// generation (per F-Gram).
///
/// # Determinism
/// The result depends only on which `Model` variant is passed; payloads of
/// the `*Custom(_)` and `Registered(_)` variants are ignored. Identical input
/// therefore always produces identical output, which reproducible measurement
/// and signature replay by `ultracos-verify` rely on.
///
/// # Safe defaults
/// `Model::Registered(_)` - any model outside the static registry - maps to
/// `Dialect::Base`. `jit_binding` is never emitted speculatively: it is
/// negative on at least one modern BPE (Gemini-2.5-pro at -5.6%), so the
/// universal rule set is the conservative choice.
#[must_use]
pub fn pick_dialect(model: &Model) -> Dialect {
    // Deliberately no `_` arm: adding a `Model` variant must fail to compile
    // until it has been classified into one of the two groups below.
    match model {
        // Legacy BPE generation: `jit_binding` empirically positive (+16.7% .. +21.6%).
        Model::Gpt4 | Model::Gpt4o | Model::Llama3Custom(_) | Model::Qwen3Custom(_) => {
            Dialect::LegacyBpe
        }

        // Modern BPE generation (`jit_binding` near-zero or negative), plus the
        // conservative fallback for models not in the static registry.
        Model::ClaudeOpus47
        | Model::ClaudeSonnet47
        | Model::ClaudeHaiku47
        | Model::Gpt5
        | Model::Gemini25Ultra
        | Model::Gemini25Pro
        | Model::Grok4
        | Model::Registered(_) => Dialect::Base,
    }
}

/// Expand a dialect into the ordered slice of rules it enables.
///
/// `Dialect::Base` yields the four universal rules; `Dialect::LegacyBpe`
/// yields those same four, in the same order, followed by `Rule::JitBinding`.
///
/// Order matches the canonical enumeration in `docs/DIALECT_FAMILY.md` section 4.
/// Downstream encoders MUST preserve this order when recording `rules_applied`
/// so the measurement record sorts deterministically.
#[must_use]
pub fn rules_for(dialect: Dialect) -> &'static [Rule] {
    // Single source of truth for the canonical order. `Base` is a prefix of
    // this table, so the two dialects cannot drift apart in ordering.
    static CANONICAL_ORDER: [Rule; 5] = [
        Rule::ArrowChain,
        Rule::StateParens,
        Rule::SubjectElision,
        Rule::TypedSlots,
        Rule::JitBinding,
    ];
    // Number of leading entries that form the universal (`Base`) rule set.
    const UNIVERSAL_COUNT: usize = 4;

    match dialect {
        Dialect::Base => &CANONICAL_ORDER[..UNIVERSAL_COUNT],
        Dialect::LegacyBpe => &CANONICAL_ORDER[..],
    }
}

#[cfg(test)]
mod tests {
    //! Unit tests pinning the dialect/rule tables and the model-to-dialect
    //! mapping. Every arm of `pick_dialect` and both serde-derived enums are
    //! exercised so a silent change to either shows up as a failure here.

    use super::*;

    #[test]
    fn base_has_four_rules_never_including_jit() {
        let rules = rules_for(Dialect::Base);
        assert_eq!(rules.len(), 4);
        assert!(!rules.contains(&Rule::JitBinding));
    }

    #[test]
    fn legacy_bpe_has_five_rules_including_jit() {
        let rules = rules_for(Dialect::LegacyBpe);
        assert_eq!(rules.len(), 5);
        assert!(rules.contains(&Rule::JitBinding));
    }

    #[test]
    fn modern_claude_maps_to_base() {
        assert_eq!(pick_dialect(&Model::ClaudeOpus47), Dialect::Base);
        assert_eq!(pick_dialect(&Model::ClaudeSonnet47), Dialect::Base);
        assert_eq!(pick_dialect(&Model::ClaudeHaiku47), Dialect::Base);
    }

    #[test]
    fn modern_gemini_maps_to_base() {
        assert_eq!(pick_dialect(&Model::Gemini25Pro), Dialect::Base);
        assert_eq!(pick_dialect(&Model::Gemini25Ultra), Dialect::Base);
    }

    #[test]
    fn legacy_openai_maps_to_legacy_bpe() {
        assert_eq!(pick_dialect(&Model::Gpt4), Dialect::LegacyBpe);
        assert_eq!(pick_dialect(&Model::Gpt4o), Dialect::LegacyBpe);
    }

    #[test]
    fn open_weights_map_to_legacy_bpe() {
        assert_eq!(
            pick_dialect(&Model::Llama3Custom("meta/70b".into())),
            Dialect::LegacyBpe,
        );
        assert_eq!(
            pick_dialect(&Model::Qwen3Custom("qwen3-8b".into())),
            Dialect::LegacyBpe,
        );
    }

    #[test]
    fn registered_models_default_to_base_never_legacy() {
        let m = Model::Registered("some-new-provider-2027".into());
        assert_eq!(pick_dialect(&m), Dialect::Base);
    }

    #[test]
    fn gpt5_modern_generation_is_base() {
        assert_eq!(pick_dialect(&Model::Gpt5), Dialect::Base);
    }

    // Previously the only `pick_dialect` arm with no test coverage.
    #[test]
    fn grok4_modern_generation_is_base() {
        assert_eq!(pick_dialect(&Model::Grok4), Dialect::Base);
    }

    #[test]
    fn dialect_names_match_spec() {
        assert_eq!(Dialect::Base.name(), "Base");
        assert_eq!(Dialect::LegacyBpe.name(), "LegacyBpe");
    }

    #[test]
    fn rule_names_match_f_gram_identifiers() {
        assert_eq!(Rule::ArrowChain.name(), "arrow_chain");
        assert_eq!(Rule::StateParens.name(), "state_parens");
        assert_eq!(Rule::SubjectElision.name(), "subject_elision");
        assert_eq!(Rule::TypedSlots.name(), "typed_slots");
        assert_eq!(Rule::JitBinding.name(), "jit_binding");
    }

    #[test]
    fn rules_for_is_deterministic_and_order_preserving() {
        let a = rules_for(Dialect::LegacyBpe);
        let b = rules_for(Dialect::LegacyBpe);
        assert_eq!(a, b);
        // Legacy must start with Base order, then append jit_binding.
        assert_eq!(&a[..4], rules_for(Dialect::Base));
        assert_eq!(a[4], Rule::JitBinding);
    }

    #[test]
    fn dialect_serde_round_trips() {
        for d in [Dialect::Base, Dialect::LegacyBpe] {
            let s = serde_json::to_string(&d).unwrap();
            let back: Dialect = serde_json::from_str(&s).unwrap();
            assert_eq!(d, back);
        }
    }

    // `Rule` derives the same serde traits as `Dialect`; walking the full
    // LegacyBpe list covers every variant.
    #[test]
    fn rule_serde_round_trips() {
        for r in rules_for(Dialect::LegacyBpe) {
            let s = serde_json::to_string(r).unwrap();
            let back: Rule = serde_json::from_str(&s).unwrap();
            assert_eq!(*r, back);
        }
    }
}