Skip to main content

pithy_core/
dialect.rs

//! Dialect dispatcher - picks the grammar rule set per tokenizer generation.
//!
//! Empirical basis: F-Gram per-rule per-tokenizer sweep (commit `692d3e6`).
//! See `docs/DIALECT_FAMILY.md` for the full specification, crossover data,
//! and kill-switches that gate future dialect additions.
//!
//! Copyright (c) 2026 Mikko Parkkola. Licensed under PolyForm Noncommercial 1.0.
8
9use serde::{Deserialize, Serialize};
10
11use crate::interfaces::Model;
12
13/// The set of shipped Pithy dialects. Frozen at v0.1.
14///
15/// - `Base`: four universally-compressive rules. Ships unconditionally.
16/// - `LegacyBpe`: `Base` + `jit_binding` (only positive on legacy-generation BPEs
17///   per F-Gram). Modern BPEs get `Base` only because `jit_binding` is negative
18///   or within-noise on them and adds ceremony overhead.
19#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
20pub enum Dialect {
21    /// Four universal rules: arrow_chain, state_parens, subject_elision, typed_slots.
22    Base,
23    /// Base plus `jit_binding`. Only for legacy-generation tokenizers.
24    LegacyBpe,
25}
26
27impl Dialect {
28    /// Canonical string name as it appears in measurement records.
29    #[must_use]
30    pub const fn name(&self) -> &'static str {
31        match self {
32            Self::Base => "Base",
33            Self::LegacyBpe => "LegacyBpe",
34        }
35    }
36}
37
38/// The grammar rules that can be emitted by an encoder.
39///
40/// Rule identifiers match F-Gram `research/grammar_test_vectors.json`.
41#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
42pub enum Rule {
43    /// Arrow chains replacing if/then/and/so conjunctions.
44    ArrowChain,
45    /// `(ok)/(~)/(x)/(?)` replacing adjectival state clauses.
46    StateParens,
47    /// `subj: a -> b -> c` dropping repeated subjects.
48    SubjectElision,
49    /// `name:type?|*+` borrowed from BNF/YAML for type/alt/opt/rep.
50    TypedSlots,
51    /// `Long(Short)` then reuse `Short`. Only in `LegacyBpe`.
52    JitBinding,
53}
54
55impl Rule {
56    /// Canonical string name as it appears in measurement records.
57    #[must_use]
58    pub const fn name(&self) -> &'static str {
59        match self {
60            Self::ArrowChain => "arrow_chain",
61            Self::StateParens => "state_parens",
62            Self::SubjectElision => "subject_elision",
63            Self::TypedSlots => "typed_slots",
64            Self::JitBinding => "jit_binding",
65        }
66    }
67}
68
69/// Pick the dialect for a given model, keyed on tokenizer generation (per F-Gram).
70///
71/// # Determinism
72/// Pure function. Same input yields same output. Required for reproducible
73/// measurement and signature replay by `ultracos-verify`.
74///
75/// # Safe defaults
76/// Unregistered or unrecognized models fall back to `Dialect::Base` - never
77/// emit `jit_binding` speculatively. `jit_binding` is negative on at least one
78/// modern BPE (Gemini-2.5-pro at -5.6%); the safe default is the universal set.
79#[must_use]
80pub fn pick_dialect(model: &Model) -> Dialect {
81    match model {
82        // Legacy BPE generation: `jit_binding` empirically positive (+16.7% .. +21.6%).
83        Model::Gpt4 | Model::Gpt4o => Dialect::LegacyBpe,
84        Model::Llama3Custom(_) | Model::Qwen3Custom(_) => Dialect::LegacyBpe,
85
86        // Modern BPE generation: `jit_binding` near-zero or negative. Base only.
87        Model::ClaudeOpus47 | Model::ClaudeSonnet47 | Model::ClaudeHaiku47 => Dialect::Base,
88        Model::Gpt5 => Dialect::Base,
89        Model::Gemini25Ultra | Model::Gemini25Pro => Dialect::Base,
90        Model::Grok4 => Dialect::Base,
91
92        // Fallback: conservative Base for any model not in the static registry.
93        Model::Registered(_) => Dialect::Base,
94    }
95}
96
97/// Return the ordered list of rules enabled for a given dialect.
98///
99/// Order matches the canonical enumeration in `docs/DIALECT_FAMILY.md` section 4.
100/// Downstream encoders MUST preserve this order when recording `rules_applied`
101/// so the measurement record sorts deterministically.
102#[must_use]
103pub fn rules_for(dialect: Dialect) -> &'static [Rule] {
104    const BASE: &[Rule] = &[
105        Rule::ArrowChain,
106        Rule::StateParens,
107        Rule::SubjectElision,
108        Rule::TypedSlots,
109    ];
110    const LEGACY_BPE: &[Rule] = &[
111        Rule::ArrowChain,
112        Rule::StateParens,
113        Rule::SubjectElision,
114        Rule::TypedSlots,
115        Rule::JitBinding,
116    ];
117    match dialect {
118        Dialect::Base => BASE,
119        Dialect::LegacyBpe => LEGACY_BPE,
120    }
121}
122
#[cfg(test)]
mod tests {
    //! Unit tests for dialect selection, rule ordering, canonical names and
    //! serde round-tripping.

    use super::*;

    #[test]
    fn base_has_four_rules_never_including_jit() {
        let rules = rules_for(Dialect::Base);
        assert_eq!(rules.len(), 4);
        assert!(!rules.contains(&Rule::JitBinding));
    }

    #[test]
    fn legacy_bpe_has_five_rules_including_jit() {
        let rules = rules_for(Dialect::LegacyBpe);
        assert_eq!(rules.len(), 5);
        assert!(rules.contains(&Rule::JitBinding));
    }

    #[test]
    fn modern_claude_maps_to_base() {
        assert_eq!(pick_dialect(&Model::ClaudeOpus47), Dialect::Base);
        assert_eq!(pick_dialect(&Model::ClaudeSonnet47), Dialect::Base);
        assert_eq!(pick_dialect(&Model::ClaudeHaiku47), Dialect::Base);
    }

    #[test]
    fn modern_gemini_maps_to_base() {
        assert_eq!(pick_dialect(&Model::Gemini25Pro), Dialect::Base);
        assert_eq!(pick_dialect(&Model::Gemini25Ultra), Dialect::Base);
    }

    #[test]
    fn legacy_openai_maps_to_legacy_bpe() {
        assert_eq!(pick_dialect(&Model::Gpt4), Dialect::LegacyBpe);
        assert_eq!(pick_dialect(&Model::Gpt4o), Dialect::LegacyBpe);
    }

    #[test]
    fn open_weights_map_to_legacy_bpe() {
        assert_eq!(
            pick_dialect(&Model::Llama3Custom("meta/70b".into())),
            Dialect::LegacyBpe,
        );
        assert_eq!(
            pick_dialect(&Model::Qwen3Custom("qwen3-8b".into())),
            Dialect::LegacyBpe,
        );
    }

    #[test]
    fn registered_models_default_to_base_never_legacy() {
        let m = Model::Registered("some-new-provider-2027".into());
        assert_eq!(pick_dialect(&m), Dialect::Base);
    }

    #[test]
    fn gpt5_modern_generation_is_base() {
        assert_eq!(pick_dialect(&Model::Gpt5), Dialect::Base);
    }

    // `Grok4` was the only `pick_dialect` arm without a test pinning it.
    #[test]
    fn grok4_modern_generation_is_base() {
        assert_eq!(pick_dialect(&Model::Grok4), Dialect::Base);
    }

    #[test]
    fn dialect_names_match_spec() {
        assert_eq!(Dialect::Base.name(), "Base");
        assert_eq!(Dialect::LegacyBpe.name(), "LegacyBpe");
    }

    #[test]
    fn rule_names_match_f_gram_identifiers() {
        assert_eq!(Rule::ArrowChain.name(), "arrow_chain");
        assert_eq!(Rule::StateParens.name(), "state_parens");
        assert_eq!(Rule::SubjectElision.name(), "subject_elision");
        assert_eq!(Rule::TypedSlots.name(), "typed_slots");
        assert_eq!(Rule::JitBinding.name(), "jit_binding");
    }

    #[test]
    fn rules_for_is_deterministic_and_order_preserving() {
        let a = rules_for(Dialect::LegacyBpe);
        let b = rules_for(Dialect::LegacyBpe);
        assert_eq!(a, b);
        // Legacy must start with Base order, then append jit_binding.
        assert_eq!(&a[..4], rules_for(Dialect::Base));
        assert_eq!(a[4], Rule::JitBinding);
    }

    #[test]
    fn dialect_serde_round_trips() {
        for d in [Dialect::Base, Dialect::LegacyBpe] {
            let s = serde_json::to_string(&d).unwrap();
            let back: Dialect = serde_json::from_str(&s).unwrap();
            assert_eq!(d, back);
        }
    }

    // `Rule` derives `Serialize`/`Deserialize` as well; the `LegacyBpe` list
    // contains all five variants, so iterating it round-trips every rule.
    #[test]
    fn rule_serde_round_trips() {
        for &rule in rules_for(Dialect::LegacyBpe) {
            let encoded = serde_json::to_string(&rule).unwrap();
            let decoded: Rule = serde_json::from_str(&encoded).unwrap();
            assert_eq!(rule, decoded);
        }
    }
}