Skip to main content

lean_ctx/core/terse/
dictionaries.rs

1//! Domain-specific abbreviation dictionaries for terse compression.
2//!
3//! Each dictionary provides whole-word-matching abbreviations for a specific
4//! domain (git, cargo, npm, general). Unlike the legacy ABBREVIATIONS list
5//! (18 blind substring replacements), these use word-boundary-aware matching.
6
/// A single abbreviation rule: replaces `long` with `short` at word boundaries.
///
/// Both fields are `'static` string slices so that rules can be stored in the
/// `const` dictionary tables of this module.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Abbreviation {
    /// The full word or phrase to look for in the input text.
    pub long: &'static str,
    /// The text substituted for every whole-word occurrence of `long`.
    pub short: &'static str,
}
12
13pub const GENERAL: &[Abbreviation] = &[
14    Abbreviation {
15        long: "function",
16        short: "fn",
17    },
18    Abbreviation {
19        long: "configuration",
20        short: "cfg",
21    },
22    Abbreviation {
23        long: "implementation",
24        short: "impl",
25    },
26    Abbreviation {
27        long: "dependencies",
28        short: "deps",
29    },
30    Abbreviation {
31        long: "dependency",
32        short: "dep",
33    },
34    Abbreviation {
35        long: "request",
36        short: "req",
37    },
38    Abbreviation {
39        long: "response",
40        short: "res",
41    },
42    Abbreviation {
43        long: "context",
44        short: "ctx",
45    },
46    Abbreviation {
47        long: "error",
48        short: "err",
49    },
50    Abbreviation {
51        long: "return",
52        short: "ret",
53    },
54    Abbreviation {
55        long: "argument",
56        short: "arg",
57    },
58    Abbreviation {
59        long: "value",
60        short: "val",
61    },
62    Abbreviation {
63        long: "module",
64        short: "mod",
65    },
66    Abbreviation {
67        long: "package",
68        short: "pkg",
69    },
70    Abbreviation {
71        long: "directory",
72        short: "dir",
73    },
74    Abbreviation {
75        long: "parameter",
76        short: "param",
77    },
78    Abbreviation {
79        long: "variable",
80        short: "var",
81    },
82    Abbreviation {
83        long: "information",
84        short: "info",
85    },
86    Abbreviation {
87        long: "application",
88        short: "app",
89    },
90    Abbreviation {
91        long: "environment",
92        short: "env",
93    },
94    Abbreviation {
95        long: "repository",
96        short: "repo",
97    },
98    Abbreviation {
99        long: "authentication",
100        short: "auth",
101    },
102    Abbreviation {
103        long: "authorization",
104        short: "authz",
105    },
106    Abbreviation {
107        long: "description",
108        short: "desc",
109    },
110    Abbreviation {
111        long: "development",
112        short: "dev",
113    },
114    Abbreviation {
115        long: "production",
116        short: "prod",
117    },
118    Abbreviation {
119        long: "connection",
120        short: "conn",
121    },
122    Abbreviation {
123        long: "database",
124        short: "db",
125    },
126    Abbreviation {
127        long: "temporary",
128        short: "tmp",
129    },
130    Abbreviation {
131        long: "document",
132        short: "doc",
133    },
134    Abbreviation {
135        long: "maximum",
136        short: "max",
137    },
138    Abbreviation {
139        long: "minimum",
140        short: "min",
141    },
142    Abbreviation {
143        long: "number",
144        short: "num",
145    },
146    Abbreviation {
147        long: "reference",
148        short: "ref",
149    },
150    Abbreviation {
151        long: "string",
152        short: "str",
153    },
154    Abbreviation {
155        long: "message",
156        short: "msg",
157    },
158    Abbreviation {
159        long: "command",
160        short: "cmd",
161    },
162    Abbreviation {
163        long: "expression",
164        short: "expr",
165    },
166    Abbreviation {
167        long: "iteration",
168        short: "iter",
169    },
170    Abbreviation {
171        long: "previous",
172        short: "prev",
173    },
174    Abbreviation {
175        long: "current",
176        short: "cur",
177    },
178    Abbreviation {
179        long: "original",
180        short: "orig",
181    },
182    Abbreviation {
183        long: "destination",
184        short: "dst",
185    },
186    Abbreviation {
187        long: "source",
188        short: "src",
189    },
190    Abbreviation {
191        long: "attribute",
192        short: "attr",
193    },
194    Abbreviation {
195        long: "allocation",
196        short: "alloc",
197    },
198    Abbreviation {
199        long: "generation",
200        short: "gen",
201    },
202    Abbreviation {
203        long: "specification",
204        short: "spec",
205    },
206    Abbreviation {
207        long: "initialization",
208        short: "init",
209    },
210    Abbreviation {
211        long: "operation",
212        short: "op",
213    },
214    Abbreviation {
215        long: "optional",
216        short: "opt",
217    },
218    Abbreviation {
219        long: "utility",
220        short: "util",
221    },
222    Abbreviation {
223        long: "execution",
224        short: "exec",
225    },
226    Abbreviation {
227        long: "property",
228        short: "prop",
229    },
230    Abbreviation {
231        long: "statistics",
232        short: "stats",
233    },
234    Abbreviation {
235        long: "accumulator",
236        short: "acc",
237    },
238    Abbreviation {
239        long: "synchronize",
240        short: "sync",
241    },
242    Abbreviation {
243        long: "asynchronous",
244        short: "async",
245    },
246    Abbreviation {
247        long: "certificate",
248        short: "cert",
249    },
250    Abbreviation {
251        long: "identifier",
252        short: "id",
253    },
254];
255
256pub const GIT: &[Abbreviation] = &[
257    Abbreviation {
258        long: "modified",
259        short: "M",
260    },
261    Abbreviation {
262        long: "deleted",
263        short: "D",
264    },
265    Abbreviation {
266        long: "untracked",
267        short: "?",
268    },
269    Abbreviation {
270        long: "renamed",
271        short: "R",
272    },
273    Abbreviation {
274        long: "copied",
275        short: "C",
276    },
277    Abbreviation {
278        long: "insertion",
279        short: "+",
280    },
281    Abbreviation {
282        long: "deletion",
283        short: "-",
284    },
285    Abbreviation {
286        long: "upstream",
287        short: "u/",
288    },
289    Abbreviation {
290        long: "origin",
291        short: "o/",
292    },
293    Abbreviation {
294        long: "detached",
295        short: "det",
296    },
297    Abbreviation {
298        long: "conflict",
299        short: "!!",
300    },
301    Abbreviation {
302        long: "changes not staged for commit",
303        short: "unstaged",
304    },
305    Abbreviation {
306        long: "Changes to be committed",
307        short: "staged",
308    },
309    Abbreviation {
310        long: "nothing to commit, working tree clean",
311        short: "clean",
312    },
313];
314
315pub const CARGO: &[Abbreviation] = &[
316    Abbreviation {
317        long: "Compiling",
318        short: "CC",
319    },
320    Abbreviation {
321        long: "Downloading",
322        short: "DL",
323    },
324    Abbreviation {
325        long: "Downloaded",
326        short: "DL'd",
327    },
328    Abbreviation {
329        long: "Finished",
330        short: "OK",
331    },
332    Abbreviation {
333        long: "warning",
334        short: "W",
335    },
336    Abbreviation {
337        long: "test result: ok",
338        short: "PASS",
339    },
340    Abbreviation {
341        long: "test result: FAILED",
342        short: "FAIL",
343    },
344    Abbreviation {
345        long: "running",
346        short: "run",
347    },
348    Abbreviation {
349        long: "Blocking waiting for file lock on package cache",
350        short: "LOCK",
351    },
352    Abbreviation {
353        long: "Updating crates.io index",
354        short: "IDX",
355    },
356    Abbreviation {
357        long: "target/debug",
358        short: "t/d",
359    },
360    Abbreviation {
361        long: "target/release",
362        short: "t/r",
363    },
364];
365
366pub const NPM: &[Abbreviation] = &[
367    Abbreviation {
368        long: "added",
369        short: "+",
370    },
371    Abbreviation {
372        long: "removed",
373        short: "-",
374    },
375    Abbreviation {
376        long: "packages",
377        short: "pkgs",
378    },
379    Abbreviation {
380        long: "vulnerabilities",
381        short: "vulns",
382    },
383    Abbreviation {
384        long: "deprecated",
385        short: "depr",
386    },
387    Abbreviation {
388        long: "node_modules",
389        short: "n_m",
390    },
391    Abbreviation {
392        long: "devDependencies",
393        short: "devDeps",
394    },
395    Abbreviation {
396        long: "peerDependencies",
397        short: "peerDeps",
398    },
399    Abbreviation {
400        long: "optionalDependencies",
401        short: "optDeps",
402    },
403    Abbreviation {
404        long: "npm warn",
405        short: "W",
406    },
407    Abbreviation {
408        long: "npm error",
409        short: "E",
410    },
411];
412
413/// Applies whole-word abbreviations from the given dictionaries to the text.
414/// Uses a single scan: first checks which patterns exist, then applies only matches.
415pub fn apply_dictionaries(text: &str, level: DictLevel) -> String {
416    let dicts: Vec<&[Abbreviation]> = match level {
417        DictLevel::General => vec![GENERAL],
418        DictLevel::Full => vec![GENERAL, GIT, CARGO, NPM],
419    };
420
421    let mut result = text.to_string();
422    for dict in dicts {
423        for abbr in dict {
424            result = replace_whole_word(&result, abbr.long, abbr.short);
425        }
426    }
427    result
428}
429
/// Selects which abbreviation dictionaries `apply_dictionaries` uses.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DictLevel {
    /// Only the `GENERAL` dictionary.
    General,
    /// `GENERAL` plus the domain dictionaries `GIT`, `CARGO` and `NPM`.
    Full,
}
435
/// Returns `true` when the ASCII byte `b` separates words.
///
/// ASCII letters and digits, `-`, `_`, `'` and `"` are treated as part of a
/// word; everything else is a boundary. Because quotes count as word
/// characters, a directly quoted word such as `'origin'` is never rewritten.
fn is_word_boundary(b: u8) -> bool {
    !b.is_ascii_alphanumeric() && b != b'-' && b != b'_' && b != b'\'' && b != b'"'
}

/// Character-level boundary test used for the neighbours of a match.
///
/// ASCII characters defer to [`is_word_boundary`]; a non-ASCII character is a
/// boundary only if it is not alphanumeric, so a match glued to an accented or
/// non-Latin letter (`documenté`) is treated as part of a longer word.
fn is_boundary_char(c: char) -> bool {
    if c.is_ascii() {
        is_word_boundary(c as u8)
    } else {
        !c.is_alphanumeric()
    }
}

/// Replaces every whole-word occurrence of `pattern` in `text` with
/// `replacement` and returns the result.
///
/// Matching ignores ASCII case. An occurrence counts as a whole word when the
/// characters immediately before and after it (if any) are boundaries; other
/// occurrences are copied through unchanged. An empty `pattern` returns `text`
/// unchanged.
fn replace_whole_word(text: &str, pattern: &str, replacement: &str) -> String {
    if pattern.is_empty() {
        return text.to_string();
    }

    // ASCII-only folding never changes a string's byte length, so offsets found
    // in `haystack` are valid offsets (and char boundaries) in `text`. Full
    // Unicode lowercasing can grow or shrink characters (e.g. 'İ', the Kelvin
    // sign), which would make the slices below land mid-character or out of
    // range and panic.
    let needle = pattern.to_ascii_lowercase();
    let haystack = text.to_ascii_lowercase();

    let mut result = String::with_capacity(text.len());
    let mut start = 0;

    while let Some(pos) = haystack[start..].find(&needle) {
        let abs_pos = start + pos;
        let end_pos = abs_pos + needle.len();

        // Start/end of the text count as boundaries.
        let before_ok = text[..abs_pos]
            .chars()
            .next_back()
            .map_or(true, is_boundary_char);
        let after_ok = text[end_pos..]
            .chars()
            .next()
            .map_or(true, is_boundary_char);

        result.push_str(&text[start..abs_pos]);
        if before_ok && after_ok {
            result.push_str(replacement);
        } else {
            // Not a whole word: keep the original spelling (and casing).
            result.push_str(&text[abs_pos..end_pos]);
        }
        start = end_pos;
    }
    result.push_str(&text[start..]);
    result
}
474
#[cfg(test)]
mod tests {
    use super::*;

    /// Git subcommand names that must never be used as an abbreviation source.
    const GIT_SUBCOMMANDS: &[&str] = &[
        "commit", "branch", "checkout", "merge", "stash", "rebase", "push", "pull", "fetch",
        "clone", "tag", "reset", "bisect", "log", "diff", "show", "status", "add",
    ];

    /// Runs the `function` -> `fn` rule that most matcher tests exercise.
    fn shorten_function(input: &str) -> String {
        replace_whole_word(input, "function", "fn")
    }

    // --- replace_whole_word ------------------------------------------------

    #[test]
    fn whole_word_replaces_standalone() {
        assert_eq!(shorten_function("the function works"), "the fn works");
    }

    #[test]
    fn whole_word_skips_substring() {
        assert_eq!(shorten_function("dysfunction"), "dysfunction");
    }

    #[test]
    fn whole_word_at_start() {
        assert_eq!(shorten_function("function call"), "fn call");
    }

    #[test]
    fn whole_word_at_end() {
        assert_eq!(shorten_function("call function"), "call fn");
    }

    #[test]
    fn whole_word_with_punctuation() {
        assert_eq!(shorten_function("function(arg)"), "fn(arg)");
    }

    // --- apply_dictionaries ------------------------------------------------

    #[test]
    fn general_dict_applies() {
        let r = apply_dictionaries("the configuration directory", DictLevel::General);
        assert!(r.contains("cfg"));
        assert!(r.contains("dir"));
    }

    #[test]
    fn full_dict_includes_domain() {
        let r = apply_dictionaries("Compiling lean-ctx", DictLevel::Full);
        assert!(r.contains("CC"), "cargo abbreviation should apply: {r}");
    }

    // --- dictionary tables -------------------------------------------------

    #[test]
    fn dict_count_general() {
        let count = GENERAL.len();
        assert!(
            count >= 60,
            "should have 60+ general abbreviations, got {}",
            count
        );
    }

    #[test]
    fn dict_count_git() {
        let count = GIT.len();
        assert!(count >= 9, "should have 9+ git abbreviations, got {}", count);
    }

    #[test]
    fn git_dict_never_abbreviates_subcommands() {
        for abbr in GIT {
            let is_subcommand = GIT_SUBCOMMANDS.iter().any(|cmd| *cmd == abbr.long);
            assert!(
                !is_subcommand,
                "GIT dictionary must NOT abbreviate git subcommand '{}' (→ '{}'). \
                 Agents will misinterpret abbreviated output as valid commands.",
                abbr.long,
                abbr.short
            );
        }
    }

    // --- regression guards: subcommand words survive the full pipeline -----

    #[test]
    fn commit_word_survives_full_dict() {
        let result = apply_dictionaries("commit abc1234 on branch main", DictLevel::Full);
        assert!(
            result.contains("commit"),
            "word 'commit' must not be abbreviated in output: {result}"
        );
    }

    #[test]
    fn branch_word_survives_full_dict() {
        let result = apply_dictionaries(
            "Your branch is ahead of 'origin/main' by 2 commits",
            DictLevel::Full,
        );
        assert!(
            result.contains("branch"),
            "word 'branch' must not be abbreviated in output: {result}"
        );
    }
}