pithy-core 0.0.2

UltraCoS® symbolic token compression — 17-rule encoder for LLM prompts. PolyForm Noncommercial.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
//! Session-scoped symbol table -- patent claim 6.
//!
//! The encoder (`crate::encoder`) only knows the static substitution
//! table. Real conversations introduce many more recurring terms
//! (entity names, file paths, commit SHAs, function identifiers).
//! When the same multi-token term appears N+ times in a session, the
//! session table promotes it to a short alias and rewrites every
//! subsequent turn -- so savings compound across turns rather than
//! resetting each request.
//!
//! Decay-weighted utility: each observed turn multiplies every
//! per-term score by a decay factor, so stale terms lose weight over
//! time; the alias pool itself is kept bounded by the pool cap (see
//! v0.1 scope below). Promoted bindings are emitted as a single
//! prefix the proxy prepends to the next outbound prompt; any
//! receiver (model or tool) that has not seen the binding sees the
//! prefix and learns the alias once.
//!
//! v0.1 scope (this file):
//!   - Scoring: simple count + token-cost gain heuristic.
//!   - Promotion threshold: configurable; default 2 occurrences.
//!   - Pool cap: configurable; default 64 active bindings.
//!   - Substitution: longest-binding-first greedy, like encoder.rs.
//!   - Prefix format: one binding per line, `<alias>=<term>`.
//!
//! Out of v0.1 scope (tracked as session-followup):
//!   - Decay across calendar time (currently per-turn only).
//!   - Cross-tenant alias namespaces.
//!   - Cryptographic binding receipts (would live next to PQC
//!     signatures in `pithy-measure`).

use std::collections::HashMap;

use regex::{Regex, RegexBuilder};

/// Tunable thresholds for promotion and pool size.
///
/// Defaults are chosen to compound at turn ~3 on a typical agent loop
/// while keeping the alias header small enough that one-shot turns
/// do not pay for it.
#[derive(Debug, Clone, Copy)]
pub struct SessionPolicy {
    /// Minimum number of observed occurrences before a term is
    /// eligible for promotion to an alias. Occurrences are counted
    /// across the whole session and are not decayed.
    pub min_occurrences: u32,
    /// Maximum number of active bindings simultaneously held by the
    /// session. The cap is enforced at the end of each observed turn;
    /// when it is exceeded, the oldest bindings (in promotion order)
    /// are evicted first.
    pub max_active_bindings: usize,
    /// Per-turn multiplicative decay applied to every term's score.
    /// 1.0 disables decay; 0.0 zeroes every score each turn. The
    /// default 0.85 gives roughly a 4-turn half-life
    /// (0.85^4 is about 0.52). Only the score is decayed; the
    /// occurrence count that drives promotion is unaffected.
    pub turn_decay: f32,
    /// Minimum length of a candidate term: whitespace-separated words
    /// for capitalized phrases, `/`-separated segments for slash
    /// paths. 2 prevents single common words from polluting the
    /// pool.
    pub min_term_words: usize,
}

impl Default for SessionPolicy {
    /// Stock policy: promote on the second occurrence, hold at most 64
    /// bindings, decay scores by 0.85 per turn, and require candidate
    /// terms to be at least two words long.
    fn default() -> Self {
        let min_occurrences = 2;
        let max_active_bindings = 64;
        let turn_decay = 0.85;
        let min_term_words = 2;
        Self {
            min_term_words,
            turn_decay,
            max_active_bindings,
            min_occurrences,
        }
    }
}

/// One alias binding promoted by the session.
///
/// Cloned cheaply because both fields are short.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Binding {
    /// Generated short alias: the letter `s` followed by the value of
    /// the session's id counter at promotion time, e.g. `s12`.
    pub alias: String,
    /// Original term being aliased, spelled as it appeared in the
    /// observation that triggered the promotion.
    pub term: String,
}

/// Running per-term observation state kept by the session table.
#[derive(Debug, Default)]
struct CandidateState {
    /// Number of times the term has been extracted from observed
    /// text. Compared against `SessionPolicy::min_occurrences` to
    /// decide promotion; never decayed.
    occurrences: u32,
    /// Length-weighted score: grows by the term's word count on every
    /// occurrence and is multiplied by `SessionPolicy::turn_decay` at
    /// the start of every observed turn.
    score: f32,
}

/// Multi-turn alias accumulator.
///
/// Feed each turn through `observe` (or `observe_and_rewrite`); terms
/// that recur often enough are promoted to short aliases which
/// `rewrite` substitutes into text and `prefix` renders as a header.
#[derive(Debug)]
pub struct SessionSymbolTable {
    /// Thresholds governing promotion, decay and pool size.
    policy: SessionPolicy,
    /// Running observation state, keyed by ASCII-lowercased term.
    candidates: HashMap<String, CandidateState>,
    /// Active alias bindings, in promotion order (oldest first). The
    /// front of this list is what gets evicted when the pool cap is
    /// exceeded.
    bindings: Vec<Binding>,
    /// Monotonic counter used to number aliases. It only ever grows,
    /// so an alias is not handed out twice within a session.
    next_id: u32,
}

impl SessionSymbolTable {
    /// Build a table with explicit policy.
    ///
    /// The table starts with no candidates and no bindings; the first
    /// alias handed out is `s0`.
    #[must_use]
    pub fn new(policy: SessionPolicy) -> Self {
        Self {
            policy,
            candidates: HashMap::new(),
            bindings: Vec::new(),
            next_id: 0,
        }
    }

    /// Active bindings in promotion order (oldest first).
    #[must_use]
    pub fn bindings(&self) -> &[Binding] {
        &self.bindings
    }

    /// Render the current binding pool as a one-binding-per-line
    /// prefix (`<alias>=<term>\n`) that the proxy prepends to the next
    /// outbound prompt.
    ///
    /// Empty when no bindings are active so single-turn calls do not
    /// pay any prefix cost. Terms are stored with their whitespace
    /// collapsed (see [`Self::observe`]), so a binding never spans
    /// more than one line.
    #[must_use]
    pub fn prefix(&self) -> String {
        // Exact size: alias + '=' + term + '\n' per binding. An empty
        // pool yields capacity 0, which does not allocate.
        let capacity: usize = self
            .bindings
            .iter()
            .map(|b| b.alias.len() + b.term.len() + 2)
            .sum();
        let mut out = String::with_capacity(capacity);
        for b in &self.bindings {
            out.push_str(&b.alias);
            out.push('=');
            out.push_str(&b.term);
            out.push('\n');
        }
        out
    }

    /// Observe one turn of input text. Updates per-term scores,
    /// promotes any candidate whose occurrence count reaches
    /// `min_occurrences`, and then trims the pool back to
    /// `max_active_bindings`.
    ///
    /// Every extracted term has its internal whitespace collapsed to
    /// single spaces before it is counted, so the same phrase wrapped
    /// across a line break (or typed with a double space) is one
    /// candidate rather than several, and a promoted term always fits
    /// on one prefix line.
    pub fn observe(&mut self, text: &str) {
        // Decay everything first so a single high-traffic term does
        // not dominate forever.
        let decay = self.policy.turn_decay;
        for state in self.candidates.values_mut() {
            state.score *= decay;
        }

        for raw in extract_candidate_terms(text, self.policy.min_term_words) {
            let term = Self::collapse_whitespace(&raw);
            if term.is_empty() {
                continue;
            }
            // After collapsing, words are separated by exactly one
            // space. Slash paths contain no whitespace and count as
            // one word.
            let words = term.split(' ').count();
            let state = self
                .candidates
                .entry(term.to_ascii_lowercase())
                .or_default();
            state.occurrences = state.occurrences.saturating_add(1);
            // Score: occurrence count, weighted by term length so
            // longer terms (worth more tokens) outrank short noise.
            // u16 -> f32 is lossless; real word counts are tiny.
            state.score += f32::from(u16::try_from(words).unwrap_or(u16::MAX));

            let eligible = state.occurrences >= self.policy.min_occurrences;
            if eligible && !self.has_binding_for(&term) {
                self.promote(term);
            }
        }
        self.enforce_cap();
    }

    /// Apply current bindings to `text` and return the alias-rewritten
    /// version. Substitution is greedy longest-binding-first, like the
    /// static encoder: bindings are applied one after another over the
    /// whole text, ordered by descending term length (ties keep
    /// promotion order), matching case-insensitively.
    ///
    /// A binding whose pattern fails to compile is skipped rather than
    /// aborting the rewrite.
    ///
    /// NOTE(review): one regex is compiled per binding on every call.
    /// Caching them would need state next to `bindings`; worth doing
    /// if this shows up in profiles.
    #[must_use]
    pub fn rewrite(&self, text: &str) -> String {
        if self.bindings.is_empty() {
            return text.to_owned();
        }
        let mut ordered: Vec<&Binding> = self.bindings.iter().collect();
        // Stable sort: equal-length terms stay in promotion order.
        ordered.sort_by_key(|b| std::cmp::Reverse(b.term.len()));

        let mut out = text.to_owned();
        for b in ordered {
            if let Some(pat) = Self::term_pattern(&b.term) {
                if let Ok(re) = RegexBuilder::new(&pat).case_insensitive(true).build() {
                    out = re.replace_all(&out, b.alias.as_str()).into_owned();
                }
            }
        }
        out
    }

    /// Convenience: observe, then rewrite. Returns the rewritten text.
    /// `prefix()` is queried separately by the proxy so it can decide
    /// whether to emit it for this turn (every-turn vs first-turn-only
    /// is a proxy policy concern).
    pub fn observe_and_rewrite(&mut self, text: &str) -> String {
        self.observe(text);
        self.rewrite(text)
    }

    /// Collapse every run of whitespace in `raw` to a single space and
    /// drop leading/trailing whitespace.
    fn collapse_whitespace(raw: &str) -> String {
        raw.split_whitespace().collect::<Vec<_>>().join(" ")
    }

    /// Build the search pattern for one bound term, or `None` when the
    /// term has no non-whitespace content (an empty pattern would
    /// match everywhere).
    ///
    /// Each word is escaped literally and words are joined with `\s+`,
    /// so the term still matches where the text wraps it across lines.
    /// A `\b` anchor is added only on an edge that is a word
    /// character: `\b` in front of `.github/ci.yml` could only match
    /// when the preceding character is a word character, i.e. never
    /// at the start of a path, so such an edge is left unanchored.
    fn term_pattern(term: &str) -> Option<String> {
        // Deliberately narrower than the regex notion of a word
        // character: when in doubt the anchor is omitted, which can
        // only make the pattern match more, never less.
        fn is_word(c: char) -> bool {
            c.is_alphabetic() || c.is_ascii_digit() || c == '_'
        }

        let trimmed = term.trim();
        if trimmed.is_empty() {
            return None;
        }
        let body = trimmed
            .split_whitespace()
            .map(regex::escape)
            .collect::<Vec<_>>()
            .join(r"\s+");

        let mut pat = String::with_capacity(body.len() + 4);
        if trimmed.starts_with(is_word) {
            pat.push_str(r"\b");
        }
        pat.push_str(&body);
        if trimmed.ends_with(is_word) {
            pat.push_str(r"\b");
        }
        Some(pat)
    }

    /// Whether some active binding already covers `term`, compared
    /// ASCII-case-insensitively.
    fn has_binding_for(&self, term: &str) -> bool {
        self.bindings
            .iter()
            .any(|b| b.term.eq_ignore_ascii_case(term))
    }

    /// Append a binding for `term` under the next free alias
    /// (`s<next_id>`) and advance the counter.
    fn promote(&mut self, term: String) {
        let alias = format!("s{}", self.next_id);
        self.next_id += 1;
        self.bindings.push(Binding { alias, term });
    }

    /// Trim the pool back to `max_active_bindings`.
    ///
    /// Simplest stable policy: drop the oldest bindings (FIFO). Real
    /// LFU would need a parallel index; revisit when telemetry shows
    /// churn. An evicted term keeps its occurrence count, so it is
    /// promoted again, under a new alias, the next time it is seen.
    fn enforce_cap(&mut self) {
        let excess = self
            .bindings
            .len()
            .saturating_sub(self.policy.max_active_bindings);
        if excess > 0 {
            self.bindings.drain(..excess);
        }
    }
}

impl Default for SessionSymbolTable {
    /// An empty table governed by the stock `SessionPolicy`.
    fn default() -> Self {
        let policy = SessionPolicy::default();
        Self::new(policy)
    }
}

// Candidate-term extraction helpers.
//
// The v0.1 extraction heuristic deliberately under-extracts: false
// positives bloat the alias pool, while false negatives only cost the
// static-encoder savings floor (which we already get from
// `crate::encoder`).

/// Capitalized articles and determiners dropped from the front of a
/// capitalized phrase, so that a sentence start ("The X Y") collapses
/// onto the entity name ("X Y"). Matching is case-sensitive.
const LEADING_ARTICLES: &[&str] = &["The", "A", "An", "This", "That", "These", "Those"];

/// Strip one leading article/determiner from `s`.
///
/// Returns the remainder of `s` after the first word and any
/// whitespace that follows it when that first word is listed in
/// [`LEADING_ARTICLES`] and something is left over; otherwise returns
/// `s` unchanged. The whitespace run after the article is consumed in
/// full, so `"The  X Y"` yields `"X Y"` rather than `" X Y"` (which
/// would be counted as a different term).
fn strip_leading_article(s: &str) -> &str {
    match s.split_once(char::is_whitespace) {
        Some((first, rest)) if LEADING_ARTICLES.contains(&first) => {
            let rest = rest.trim_start();
            // An article on its own is kept as-is rather than being
            // reduced to an empty term.
            if rest.is_empty() {
                s
            } else {
                rest
            }
        }
        _ => s,
    }
}

fn extract_candidate_terms(text: &str, min_words: usize) -> Vec<String> {
    let mut out = Vec::new();
    // Capitalized 2-4 word phrases.
    let cap_phrase = Regex::new(r"\b([A-Z][a-zA-Z0-9_]{2,}(?:\s+[A-Z][a-zA-Z0-9_]{2,}){1,3})\b")
        .expect("cap-phrase pattern");
    for m in cap_phrase.find_iter(text) {
        let raw = m.as_str().trim();
        // Drop leading capitalized articles/determiners: "The X Y" -> "X Y".
        // Keeps the alias pool focused on entity names rather than
        // sentence starts.
        let s = strip_leading_article(raw);
        if s.split_whitespace().count() >= min_words {
            out.push(s.to_owned());
        }
    }
    // Slash-paths (file paths / module paths) of length >= 2 segments.
    let path_re = Regex::new(r"[A-Za-z0-9_\-\.]+(?:/[A-Za-z0-9_\-\.]+){1,}").expect("path pattern");
    for m in path_re.find_iter(text) {
        let s = m.as_str().trim_end_matches('.');
        // Path counts as a single multi-token candidate for the cost
        // model: the tokenizer typically turns each segment into 1-2
        // tokens, so an alias is a meaningful win.
        if min_words <= 1 || s.split('/').count() >= min_words {
            out.push(s.to_owned());
        }
    }
    out
}

// Unit tests for the session symbol table. Aliases are numbered from
// `s0` in promotion order, which is what the `s0`/`s1` assertions
// below rely on.
#[cfg(test)]
mod tests {
    use super::*;

    // A term seen once stays a candidate only.
    #[test]
    fn first_observation_does_not_promote() {
        let mut t = SessionSymbolTable::default();
        t.observe("Authentication Module is the entry point.");
        // 1 occurrence < threshold 2.
        assert_eq!(t.bindings().len(), 0);
    }

    // The second sighting reaches the default threshold; the leading
    // "The" is not part of the bound term.
    #[test]
    fn second_observation_promotes_to_binding() {
        let mut t = SessionSymbolTable::default();
        t.observe("The Authentication Module starts.");
        t.observe("The Authentication Module fires the policy.");
        assert_eq!(t.bindings().len(), 1, "{:?}", t.bindings());
        assert_eq!(t.bindings()[0].term, "Authentication Module");
    }

    // Once bound, the term is replaced by its alias in rewritten text.
    #[test]
    fn rewrite_replaces_all_occurrences() {
        let mut t = SessionSymbolTable::default();
        t.observe("Authentication Module v1.");
        t.observe("Authentication Module v2.");
        let out = t.rewrite("Authentication Module is fine.");
        assert!(out.contains("s0"));
        assert!(!out.contains("Authentication Module"));
    }

    // Two promoted terms produce exactly two `<alias>=<term>` lines.
    #[test]
    fn prefix_emits_one_line_per_binding() {
        let mut t = SessionSymbolTable::default();
        t.observe("Policy Engine here.");
        t.observe("Policy Engine again.");
        t.observe("Session Store later.");
        t.observe("Session Store again.");
        let p = t.prefix();
        assert!(p.contains("=Policy Engine"), "prefix={p}");
        assert!(p.contains("=Session Store"), "prefix={p}");
        assert_eq!(p.lines().count(), 2);
    }

    // No bindings means no prefix cost at all.
    #[test]
    fn empty_session_emits_empty_prefix() {
        let t = SessionSymbolTable::default();
        assert_eq!(t.prefix(), "");
    }

    // Three terms qualify but the cap is two, so one binding is evicted.
    #[test]
    fn pool_cap_is_enforced() {
        let policy = SessionPolicy {
            min_occurrences: 2,
            max_active_bindings: 2,
            turn_decay: 1.0,
            min_term_words: 2,
        };
        let mut t = SessionSymbolTable::new(policy);
        for term in ["Acme Service", "Beacon Service", "Cinder Service"] {
            t.observe(&format!("{term} first."));
            t.observe(&format!("{term} second."));
        }
        assert_eq!(t.bindings().len(), 2);
    }

    // Slash paths are candidates too, and get rewritten once bound.
    #[test]
    fn paths_become_candidates() {
        let mut t = SessionSymbolTable::default();
        t.observe("look at src/encoder.rs");
        t.observe("now src/encoder.rs again");
        let out = t.rewrite("src/encoder.rs is the file");
        assert!(!out.contains("src/encoder.rs"));
    }

    // When one bound term is a prefix of another, the longer one is
    // substituted first.
    #[test]
    fn longer_binding_wins_over_shorter() {
        let mut t = SessionSymbolTable::default();
        t.observe("Authentication Module Plus initial seed.");
        t.observe("Authentication Module Plus second seed.");
        t.observe("Authentication Module fires.");
        t.observe("Authentication Module fires again.");
        let out = t.rewrite("Authentication Module Plus is the longer one.");
        // The 3-word binding (s0) must win over the 2-word (s1) when both apply.
        assert!(out.contains("s0"), "{out}");
        assert!(!out.contains("Authentication Module Plus"));
    }

    // The combined call observes first, so the turn that triggers the
    // promotion is itself rewritten.
    #[test]
    fn observe_and_rewrite_returns_alias_substituted_text() {
        let mut t = SessionSymbolTable::default();
        t.observe_and_rewrite("Policy Engine boot.");
        let out = t.observe_and_rewrite("Policy Engine boot.");
        assert!(out.contains("s0"));
    }

    // With an empty pool the text comes back untouched.
    #[test]
    fn rewrite_unchanged_when_no_bindings() {
        let t = SessionSymbolTable::default();
        let s = "nothing seen yet";
        assert_eq!(t.rewrite(s), s);
    }

    #[test]
    fn decay_keeps_state_bounded_under_repeated_observation() {
        // turn_decay=0.5, occurrences accumulate but decayed score
        // stays bounded. Sanity check: scores never explode.
        let policy = SessionPolicy {
            min_occurrences: 2,
            max_active_bindings: 4,
            turn_decay: 0.5,
            min_term_words: 2,
        };
        let mut t = SessionSymbolTable::new(policy);
        for _ in 0..50 {
            t.observe("Auth Module again.");
        }
        // Promotion happened (>=2 obs); pool stays bounded; no panic.
        assert_eq!(t.bindings().len(), 1);
    }

    #[test]
    fn long_session_compound_savings_simulation() {
        // Same canonical sentence recurs. Verify alias substitution
        // materially shortens later turns vs. the original prose.
        let mut t = SessionSymbolTable::default();
        let canonical = "Authentication Module forwards to Policy Engine \
                         for Validation Service against Session Store.";
        // Prime: 2 turns promote all multi-word terms.
        for _ in 0..2 {
            t.observe(canonical);
        }
        let rewritten = t.rewrite(canonical);
        // Length is compared in bytes; the prefix is not counted here.
        assert!(
            rewritten.len() < canonical.len() * 3 / 4,
            "expected >25% length reduction, got {} -> {}",
            canonical.len(),
            rewritten.len()
        );
    }
}