Skip to main content

lean_ctx/core/
tokenizer_translation_driver.rs

1use crate::core::profiles::TranslationConfig;
2
/// Concrete symbol ruleset that is applied to tool output.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TranslationRulesetV1 {
    /// Leave the text exactly as it is (no symbol substitution).
    Legacy,
    /// Rewrite the Unicode glyphs listed in `ASCII_SYMBOL_RULES` to ASCII.
    Ascii,
}
8
/// Outcome of ruleset selection: which ruleset was chosen and why.
#[derive(Debug, Clone)]
pub struct TranslationSelectionV1 {
    /// The ruleset that will be applied.
    pub ruleset: TranslationRulesetV1,
    /// Short machine-readable code for the decision. `select_ruleset` emits
    /// `disabled`, `legacy`, `ascii`, `auto_openai_gpt`, `auto_unknown` and
    /// `unknown_ruleset`.
    pub reason_code: String,
    /// Human-readable explanation matching `reason_code`.
    pub reason: String,
    /// Trimmed, non-empty model key the decision was made for, if one was given.
    pub model_key: Option<String>,
}
16
/// Result of running `translate_tool_output` on one piece of text.
#[derive(Debug, Clone)]
pub struct TranslationApplyResultV1 {
    /// The (possibly rewritten) text.
    pub output: String,
    /// The ruleset selection that produced `output`.
    pub selection: TranslationSelectionV1,
    /// `true` when `output` differs from the input text.
    pub changed: bool,
    /// `true` when translation was skipped because the input parsed as a JSON
    /// object or array.
    pub skipped_json: bool,
}
24
25pub fn translate_tool_output(text: &str, cfg: &TranslationConfig) -> TranslationApplyResultV1 {
26    let model_key = active_model_key_from_env();
27    let selection = select_ruleset(cfg, model_key.as_deref());
28
29    if selection.ruleset == TranslationRulesetV1::Legacy {
30        return TranslationApplyResultV1 {
31            output: text.to_string(),
32            selection,
33            changed: false,
34            skipped_json: false,
35        };
36    }
37
38    if looks_like_json(text) {
39        return TranslationApplyResultV1 {
40            output: text.to_string(),
41            selection,
42            changed: false,
43            skipped_json: true,
44        };
45    }
46
47    let out = translate_text(text, selection.ruleset);
48    TranslationApplyResultV1 {
49        changed: out != text,
50        output: out,
51        selection,
52        skipped_json: false,
53    }
54}
55
56pub fn translate_text(text: &str, ruleset: TranslationRulesetV1) -> String {
57    match ruleset {
58        TranslationRulesetV1::Legacy => text.to_string(),
59        TranslationRulesetV1::Ascii => translate_ascii(text),
60    }
61}
62
/// Canonicalizes a ruleset name: surrounding whitespace is trimmed, the text
/// is lowercased, and every `_` or space becomes `-`.
fn normalize_ruleset(s: &str) -> String {
    let lowered = s.trim().to_lowercase();
    lowered
        .chars()
        .map(|ch| if matches!(ch, '_' | ' ') { '-' } else { ch })
        .collect()
}
66
/// Reads the active model key from the environment.
///
/// `LEAN_CTX_MODEL` is consulted first, then `LCTX_MODEL`. A variable that is
/// unset, not valid Unicode, or blank after trimming is skipped, so an empty
/// `LEAN_CTX_MODEL` no longer hides a usable `LCTX_MODEL`.
///
/// Returns the first usable value, trimmed, lowercased, and with `_` / spaces
/// replaced by `-`; `None` when neither variable yields a value.
fn active_model_key_from_env() -> Option<String> {
    // Ordered by priority: the first non-blank value wins.
    const MODEL_ENV_VARS: [&str; 2] = ["LEAN_CTX_MODEL", "LCTX_MODEL"];

    MODEL_ENV_VARS.iter().find_map(|name| {
        let raw = std::env::var(name).ok()?;
        let key = raw.trim();
        (!key.is_empty()).then(|| key.to_lowercase().replace(['_', ' '], "-"))
    })
}
77
/// Coarse model family inferred from a model key by `infer_model_family`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ModelFamilyV1 {
    /// Key contains `gpt` or `openai`.
    OpenAiGpt,
    /// Key contains `claude`.
    AnthropicClaude,
    /// Key contains `gemini`.
    GoogleGemini,
    /// No known marker was found in the key.
    Unknown,
}
85
86fn infer_model_family(model_key: &str) -> ModelFamilyV1 {
87    let m = model_key.trim().to_lowercase();
88    if m.contains("gpt") || m.contains("openai") {
89        return ModelFamilyV1::OpenAiGpt;
90    }
91    if m.contains("claude") {
92        return ModelFamilyV1::AnthropicClaude;
93    }
94    if m.contains("gemini") {
95        return ModelFamilyV1::GoogleGemini;
96    }
97    ModelFamilyV1::Unknown
98}
99
100pub fn select_ruleset(cfg: &TranslationConfig, model_key: Option<&str>) -> TranslationSelectionV1 {
101    let model_key = model_key.map(str::trim).filter(|s| !s.is_empty());
102    let model_key = model_key.map(std::string::ToString::to_string);
103
104    if !cfg.enabled_effective() {
105        return TranslationSelectionV1 {
106            ruleset: TranslationRulesetV1::Legacy,
107            reason_code: "disabled".to_string(),
108            reason: "translation disabled by profile".to_string(),
109            model_key,
110        };
111    }
112
113    let ruleset = normalize_ruleset(cfg.ruleset_effective());
114    match ruleset.as_str() {
115        "legacy" | "unicode" => TranslationSelectionV1 {
116            ruleset: TranslationRulesetV1::Legacy,
117            reason_code: "legacy".to_string(),
118            reason: "legacy ruleset selected".to_string(),
119            model_key,
120        },
121        "ascii" => TranslationSelectionV1 {
122            ruleset: TranslationRulesetV1::Ascii,
123            reason_code: "ascii".to_string(),
124            reason: "ascii ruleset selected".to_string(),
125            model_key,
126        },
127        "auto" => {
128            let family = model_key
129                .as_deref()
130                .map_or(ModelFamilyV1::Unknown, infer_model_family);
131            match family {
132                ModelFamilyV1::OpenAiGpt => TranslationSelectionV1 {
133                    ruleset: TranslationRulesetV1::Ascii,
134                    reason_code: "auto_openai_gpt".to_string(),
135                    reason: "auto: OpenAI/GPT tokenizer prefers ASCII over Unicode symbols"
136                        .to_string(),
137                    model_key,
138                },
139                _ => TranslationSelectionV1 {
140                    ruleset: TranslationRulesetV1::Legacy,
141                    reason_code: "auto_unknown".to_string(),
142                    reason: "auto: unknown tokenizer family; preserve legacy format".to_string(),
143                    model_key,
144                },
145            }
146        }
147        other => TranslationSelectionV1 {
148            ruleset: TranslationRulesetV1::Legacy,
149            reason_code: "unknown_ruleset".to_string(),
150            reason: format!("unknown ruleset '{other}'; using legacy"),
151            model_key,
152        },
153    }
154}
155
156fn looks_like_json(text: &str) -> bool {
157    let t = text.trim();
158    if t.is_empty() {
159        return false;
160    }
161    if !(t.starts_with('{') || t.starts_with('[')) {
162        return false;
163    }
164    serde_json::from_str::<serde_json::Value>(t).is_ok()
165}
166
/// Ordered `(from, to)` substitutions used by `translate_ascii`.
///
/// Each rule is applied in turn as a plain substring replacement over the
/// whole text, so a glyph is rewritten wherever it occurs, not only at the
/// start of a signature line.
///
/// NOTE(review): the `"⊛ "` rule yields the same output as the bare `"⊛"` rule
/// that follows it, so it looks redundant — confirm before removing.
// Prefer deterministic, minimal symbol substitutions.
const ASCII_SYMBOL_RULES: &[(&str, &str)] = &[
    // Signature/TDD glyphs (empirically expensive on GPT tokenizers)
    ("⊛ ", "+ "),
    ("⊛", "+"),
    ("λ", "fn"),
    ("§", "cl"),
    ("∂", "if"),
    ("τ", "ty"),
    ("ε", "en"),
    ("ν", "val"),
    // Common CRP/TDD symbols
    ("→", "->"),
    ("≠", "!="),
    ("≈", "~"),
    ("∴", "thus"),
    ("✓", "ok"),
    ("✗", "fail"),
    ("⚠", "warn"),
];
187
188fn translate_ascii(text: &str) -> String {
189    let mut out = text.to_string();
190    for (from, to) in ASCII_SYMBOL_RULES {
191        if out.contains(from) {
192            out = out.replace(from, to);
193        }
194    }
195
196    // Apply TokenOptimizer only on synthetic TDD signature lines (verifier-safe).
197    let opt = crate::core::neural::token_optimizer::TokenOptimizer::with_defaults();
198    let mut changed = false;
199    let mut lines: Vec<String> = Vec::new();
200    for line in out.lines() {
201        if is_synthetic_tdd_signature_line(line) {
202            let optimized = opt.optimize_line(line);
203            if optimized != line {
204                changed = true;
205            }
206            lines.push(optimized);
207        } else {
208            lines.push(line.to_string());
209        }
210    }
211    if changed {
212        out = lines.join("\n");
213    }
214
215    out
216}
217
/// Reports whether `line` looks like a synthetic TDD signature line.
///
/// Leading whitespace and then at most one `~` are skipped. What remains must
/// begin with a kind marker immediately followed by a visibility sign (`+` or
/// `-`). The marker is either a Unicode glyph (`λ § ∂ τ ε ν`) or its ASCII
/// translation (`fn cl if ty en val`).
fn is_synthetic_tdd_signature_line(line: &str) -> bool {
    // ASCII spellings of the kind markers (after symbol mapping).
    const ASCII_MARKERS: [&str; 6] = ["fn", "cl", "if", "ty", "en", "val"];

    let trimmed = line.trim_start();
    let candidate = trimmed.strip_prefix('~').unwrap_or(trimmed);

    // Unicode form: one marker glyph, then the visibility sign.
    let mut glyphs = candidate.chars();
    if let (Some('λ' | '§' | '∂' | 'τ' | 'ε' | 'ν'), Some('+' | '-')) =
        (glyphs.next(), glyphs.next())
    {
        return true;
    }

    // ASCII form: marker word, then the visibility sign.
    ASCII_MARKERS.iter().any(|marker| {
        candidate
            .strip_prefix(marker)
            .is_some_and(|rest| rest.starts_with(['+', '-']))
    })
}
241
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::{Mutex, OnceLock};

    /// Serializes tests that read or write the process-wide model env vars.
    /// A poisoned lock is recovered so one failing test does not cascade.
    fn env_lock() -> std::sync::MutexGuard<'static, ()> {
        static LOCK: OnceLock<Mutex<()>> = OnceLock::new();
        LOCK.get_or_init(|| Mutex::new(()))
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner)
    }

    #[test]
    fn ruleset_disabled_is_legacy() {
        let _lock = env_lock();
        std::env::remove_var("LEAN_CTX_MODEL");
        let cfg = TranslationConfig {
            enabled: Some(false),
            ruleset: Some("auto".to_string()),
        };
        let sel = select_ruleset(&cfg, Some("gpt-5.4"));
        assert_eq!(sel.ruleset, TranslationRulesetV1::Legacy);
        assert!(sel.reason_code.contains("disabled"));
    }

    #[test]
    fn ruleset_ascii_forced() {
        let cfg = TranslationConfig {
            enabled: Some(true),
            ruleset: Some("ascii".to_string()),
        };
        let sel = select_ruleset(&cfg, Some("claude-3.5-sonnet"));
        assert_eq!(sel.ruleset, TranslationRulesetV1::Ascii);
    }

    #[test]
    fn ruleset_auto_openai_gpt() {
        let cfg = TranslationConfig {
            enabled: Some(true),
            ruleset: Some("auto".to_string()),
        };
        let sel = select_ruleset(&cfg, Some("gpt-5.4-mini"));
        assert_eq!(sel.ruleset, TranslationRulesetV1::Ascii);
        assert!(sel.reason_code.contains("auto_openai_gpt"));
    }

    #[test]
    fn ruleset_auto_unknown_falls_back_to_legacy() {
        let cfg = TranslationConfig {
            enabled: Some(true),
            ruleset: Some("auto".to_string()),
        };
        let sel = select_ruleset(&cfg, Some("claude-3.5-sonnet"));
        assert_eq!(sel.ruleset, TranslationRulesetV1::Legacy);
        assert!(sel.reason_code.contains("auto_unknown"));
    }

    #[test]
    fn translation_skips_json_outputs() {
        let _lock = env_lock();
        std::env::set_var("LEAN_CTX_MODEL", "gpt-5.4");
        let cfg = TranslationConfig {
            enabled: Some(true),
            ruleset: Some("auto".to_string()),
        };
        let json = r#"{"ok":"✓","arrow":"→"}"#;
        let r = translate_tool_output(json, &cfg);
        // Clear the variable before asserting so a failure cannot leak the
        // model override into other tests.
        std::env::remove_var("LEAN_CTX_MODEL");
        assert!(r.skipped_json);
        assert_eq!(r.output, json);
    }

    #[test]
    fn translation_ascii_converts_signature_markers_and_optimizes_types() {
        // `translate_tool_output` reads the model env vars, so hold the lock
        // while another test may be mutating them.
        let _lock = env_lock();
        let cfg = TranslationConfig {
            enabled: Some(true),
            ruleset: Some("ascii".to_string()),
        };
        let input = "λ+foo(x)→Vec<String>";
        let r = translate_tool_output(input, &cfg);
        assert!(!r.skipped_json);
        assert!(r.output.contains("fn+foo"));
        assert!(r.output.contains("->Vec"));
        assert!(!r.output.contains("λ"));
        assert!(!r.output.contains("→"));
    }
}
327}