Skip to main content

pithy_core/
session.rs

1//! Session-scoped symbol table -- patent claim 6.
2//!
3//! The encoder (`crate::encoder`) only knows the static substitution
4//! table. Real conversations introduce many more recurring terms
5//! (entity names, file paths, commit SHAs, function identifiers).
6//! When the same multi-token term appears N+ times in a session, the
7//! session table promotes it to a short alias and rewrites every
8//! subsequent turn -- so savings compound across turns rather than
9//! resetting each request.
10//!
//! Decay-weighted utility: every observed turn multiplies each
//! per-term score by a decay factor (an exponential half-life), so
//! stale terms lose utility over time, and a configurable cap keeps
//! the alias pool bounded. Promoted bindings are emitted as a single
//! prefix the proxy prepends to the next outbound prompt; any
//! receiver (model or tool) that has not seen the binding sees the
//! prefix and learns the alias once.
17//!
18//! v0.1 scope (this file):
19//!   - Scoring: simple count + token-cost gain heuristic.
20//!   - Promotion threshold: configurable; default 2 occurrences.
21//!   - Pool cap: configurable; default 64 active bindings.
22//!   - Substitution: longest-binding-first greedy, like encoder.rs.
23//!   - Prefix format: one binding per line, `<alias>=<term>`.
24//!
25//! Out of v0.1 scope (tracked as session-followup):
26//!   - Decay across calendar time (currently per-turn only).
27//!   - Cross-tenant alias namespaces.
28//!   - Cryptographic binding receipts (would live next to PQC
29//!     signatures in `pithy-measure`).
30
31use std::collections::HashMap;
32
33use regex::{Regex, RegexBuilder};
34
/// Knobs controlling when a term is promoted and how large the alias
/// pool may grow.
///
/// The defaults aim for savings to start compounding around turn 3
/// of a typical agent loop, while keeping the alias header small
/// enough that one-shot turns do not pay for it.
#[derive(Clone, Copy, Debug)]
pub struct SessionPolicy {
    /// How many times a term must have been observed before it may
    /// be promoted to an alias.
    pub min_occurrences: u32,
    /// Upper bound on the number of bindings active at once. When
    /// the pool grows past this, the session evicts bindings until
    /// it fits again (victim selection lives in `enforce_cap`).
    pub max_active_bindings: usize,
    /// Factor every term's score is multiplied by at the start of
    /// each observed turn. 1.0 disables decay; 0.0 forgets
    /// immediately. The default 0.85 halves a score in a little
    /// over four turns (0.85^4 ~= 0.52).
    pub turn_decay: f32,
    /// Smallest acceptable candidate, counted in whitespace-split
    /// words (slash paths are measured in `/`-separated segments
    /// instead). 2 keeps single common words out of the pool.
    pub min_term_words: usize,
}

impl Default for SessionPolicy {
    /// Policy used by `SessionSymbolTable::default()`: promote on the
    /// second sighting, hold at most 64 bindings, decay by 0.85 per
    /// turn, and require at least two words per term.
    fn default() -> Self {
        SessionPolicy {
            min_occurrences: 2,
            max_active_bindings: 64,
            turn_decay: 0.85,
            min_term_words: 2,
        }
    }
}
68
/// A single alias-to-term mapping that the session has promoted.
///
/// Both fields are owned strings, so cloning a binding copies the
/// alias and the term.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Binding {
    /// Short generated stand-in for the term, e.g. `s12`.
    pub alias: String,
    /// The original text the alias replaces, in the casing it had
    /// when it was promoted.
    pub term: String,
}
79
/// Running statistics for one observed term.
///
/// `Default` yields the "never seen" state: zero occurrences and a
/// zero score.
#[derive(Default, Debug)]
struct CandidateState {
    /// Number of times the term has been observed so far.
    occurrences: u32,
    /// Accumulated score: increased on every sighting and multiplied
    /// by the policy's `turn_decay` on every observed turn.
    score: f32,
}
85
86/// Multi-turn alias accumulator.
87#[derive(Debug)]
88pub struct SessionSymbolTable {
89    policy: SessionPolicy,
90    /// Running observation state, keyed by lowercased term.
91    candidates: HashMap<String, CandidateState>,
92    /// Active alias bindings, in promotion order.
93    bindings: Vec<Binding>,
94    /// Monotonic counter so aliases are stable across the session.
95    next_id: u32,
96}
97
98impl SessionSymbolTable {
99    /// Build a table with explicit policy.
100    #[must_use]
101    pub fn new(policy: SessionPolicy) -> Self {
102        Self {
103            policy,
104            candidates: HashMap::new(),
105            bindings: Vec::new(),
106            next_id: 0,
107        }
108    }
109
110    /// Active bindings in promotion order.
111    #[must_use]
112    pub fn bindings(&self) -> &[Binding] {
113        &self.bindings
114    }
115
116    /// Render the current binding pool as a one-binding-per-line
117    /// prefix that the proxy prepends to the next outbound prompt.
118    ///
119    /// Empty when no bindings are active so single-turn calls do not
120    /// pay any prefix cost.
121    #[must_use]
122    pub fn prefix(&self) -> String {
123        if self.bindings.is_empty() {
124            return String::new();
125        }
126        let mut out = String::with_capacity(self.bindings.len() * 24);
127        for b in &self.bindings {
128            out.push_str(&b.alias);
129            out.push('=');
130            out.push_str(&b.term);
131            out.push('\n');
132        }
133        out
134    }
135
136    /// Observe one turn of input text. Updates per-term scores,
137    /// promotes any candidate that crosses the threshold, and may
138    /// evict the lowest-utility binding to keep the pool bounded.
139    pub fn observe(&mut self, text: &str) {
140        // Decay everything first so a single high-traffic term does
141        // not dominate forever.
142        for state in self.candidates.values_mut() {
143            state.score *= self.policy.turn_decay;
144        }
145        for term in extract_candidate_terms(text, self.policy.min_term_words) {
146            let key = term.to_ascii_lowercase();
147            let state = self.candidates.entry(key.clone()).or_default();
148            state.occurrences += 1;
149            // Score: occurrence count, weighted by term length so
150            // longer terms (worth more tokens) outrank short noise.
151            let len_bonus = u32::try_from(term.split_whitespace().count()).unwrap_or(1) as f32;
152            state.score += len_bonus;
153            if state.occurrences >= self.policy.min_occurrences && !self.has_binding_for(&key) {
154                self.promote(term, key.clone());
155            }
156        }
157        self.enforce_cap();
158    }
159
160    /// Apply current bindings to `text` and return the alias-rewritten
161    /// version. Substitution is greedy longest-binding-first, like the
162    /// static encoder.
163    #[must_use]
164    pub fn rewrite(&self, text: &str) -> String {
165        if self.bindings.is_empty() {
166            return text.to_owned();
167        }
168        let mut sorted: Vec<&Binding> = self.bindings.iter().collect();
169        sorted.sort_by_key(|b| std::cmp::Reverse(b.term.len()));
170        let mut out = text.to_owned();
171        for b in sorted {
172            let pat = format!(r"\b{}\b", regex::escape(&b.term));
173            if let Ok(re) = RegexBuilder::new(&pat).case_insensitive(true).build() {
174                out = re.replace_all(&out, b.alias.as_str()).into_owned();
175            }
176        }
177        out
178    }
179
180    /// Convenience: observe, then rewrite. Returns the rewritten text.
181    /// `prefix()` is queried separately by the proxy so it can decide
182    /// whether to emit it for this turn (every-turn vs first-turn-only
183    /// is a proxy policy concern).
184    pub fn observe_and_rewrite(&mut self, text: &str) -> String {
185        self.observe(text);
186        self.rewrite(text)
187    }
188
189    fn has_binding_for(&self, lowered_term: &str) -> bool {
190        self.bindings
191            .iter()
192            .any(|b| b.term.eq_ignore_ascii_case(lowered_term))
193    }
194
195    fn promote(&mut self, term: String, _key_lower: String) {
196        let alias = format!("s{}", self.next_id);
197        self.next_id += 1;
198        self.bindings.push(Binding { alias, term });
199    }
200
201    fn enforce_cap(&mut self) {
202        if self.bindings.len() <= self.policy.max_active_bindings {
203            return;
204        }
205        // Evict bindings whose scoring candidate has decayed below
206        // the lowest active threshold. Simplest stable policy: drop
207        // the oldest binding (FIFO). Real LFU would need a parallel
208        // index; revisit when telemetry shows churn.
209        let drop = self.bindings.len() - self.policy.max_active_bindings;
210        self.bindings.drain(0..drop);
211    }
212}
213
214impl Default for SessionSymbolTable {
215    fn default() -> Self {
216        Self::new(SessionPolicy::default())
217    }
218}
219
/// Capitalized determiners that commonly open a sentence. They are
/// matched case-sensitively and removed from the front of a
/// candidate phrase so the term is the entity name, not the
/// sentence start.
const LEADING_ARTICLES: &[&str] = &["The", "A", "An", "This", "That", "These", "Those"];

/// Remove one leading determiner from `s`, e.g. `"The X Y"` becomes
/// `"X Y"`.
///
/// Whitespace between the determiner and the remainder is dropped
/// as well, so `"The  X Y"` also yields `"X Y"`. `s` is returned
/// unchanged when its first whitespace-delimited word is not in
/// `LEADING_ARTICLES`, or when nothing but whitespace follows it.
fn strip_leading_article(s: &str) -> &str {
    match s.split_once(char::is_whitespace) {
        Some((first, rest)) if LEADING_ARTICLES.contains(&first) => {
            // The phrase matcher allows runs of whitespace between
            // words; trimming here keeps a leading gap out of the
            // resulting term.
            let rest = rest.trim_start();
            if rest.is_empty() {
                s
            } else {
                rest
            }
        }
        _ => s,
    }
}
239
240fn extract_candidate_terms(text: &str, min_words: usize) -> Vec<String> {
241    let mut out = Vec::new();
242    // Capitalized 2-4 word phrases.
243    let cap_phrase = Regex::new(r"\b([A-Z][a-zA-Z0-9_]{2,}(?:\s+[A-Z][a-zA-Z0-9_]{2,}){1,3})\b")
244        .expect("cap-phrase pattern");
245    for m in cap_phrase.find_iter(text) {
246        let raw = m.as_str().trim();
247        // Drop leading capitalized articles/determiners: "The X Y" -> "X Y".
248        // Keeps the alias pool focused on entity names rather than
249        // sentence starts.
250        let s = strip_leading_article(raw);
251        if s.split_whitespace().count() >= min_words {
252            out.push(s.to_owned());
253        }
254    }
255    // Slash-paths (file paths / module paths) of length >= 2 segments.
256    let path_re = Regex::new(r"[A-Za-z0-9_\-\.]+(?:/[A-Za-z0-9_\-\.]+){1,}").expect("path pattern");
257    for m in path_re.find_iter(text) {
258        let s = m.as_str().trim_end_matches('.');
259        // Path counts as a single multi-token candidate for the cost
260        // model: the tokenizer typically turns each segment into 1-2
261        // tokens, so an alias is a meaningful win.
262        if min_words <= 1 || s.split('/').count() >= min_words {
263            out.push(s.to_owned());
264        }
265    }
266    out
267}
268
#[cfg(test)]
mod tests {
    use super::*;

    /// Feed every entry of `turns` to `table`, one `observe` call
    /// per entry, in order.
    fn observe_each(table: &mut SessionSymbolTable, turns: &[&str]) {
        for turn in turns {
            table.observe(turn);
        }
    }

    #[test]
    fn first_observation_does_not_promote() {
        let mut table = SessionSymbolTable::default();
        table.observe("Authentication Module is the entry point.");
        // One sighting is below the default threshold of two.
        assert!(table.bindings().is_empty());
    }

    #[test]
    fn second_observation_promotes_to_binding() {
        let mut table = SessionSymbolTable::default();
        observe_each(
            &mut table,
            &[
                "The Authentication Module starts.",
                "The Authentication Module fires the policy.",
            ],
        );
        let active = table.bindings();
        assert_eq!(active.len(), 1, "{:?}", active);
        // The leading "The" must not be part of the promoted term.
        assert_eq!(active[0].term, "Authentication Module");
    }

    #[test]
    fn rewrite_replaces_all_occurrences() {
        let mut table = SessionSymbolTable::default();
        observe_each(
            &mut table,
            &["Authentication Module v1.", "Authentication Module v2."],
        );
        let out = table.rewrite("Authentication Module is fine.");
        assert!(out.contains("s0"));
        assert!(!out.contains("Authentication Module"));
    }

    #[test]
    fn prefix_emits_one_line_per_binding() {
        let mut table = SessionSymbolTable::default();
        observe_each(
            &mut table,
            &[
                "Policy Engine here.",
                "Policy Engine again.",
                "Session Store later.",
                "Session Store again.",
            ],
        );
        let p = table.prefix();
        assert!(p.contains("=Policy Engine"), "prefix={p}");
        assert!(p.contains("=Session Store"), "prefix={p}");
        assert_eq!(p.lines().count(), 2);
    }

    #[test]
    fn empty_session_emits_empty_prefix() {
        let table = SessionSymbolTable::default();
        assert_eq!(table.prefix(), "");
    }

    #[test]
    fn pool_cap_is_enforced() {
        // Three terms get promoted into a pool that may hold two.
        let mut table = SessionSymbolTable::new(SessionPolicy {
            min_occurrences: 2,
            max_active_bindings: 2,
            turn_decay: 1.0,
            min_term_words: 2,
        });
        for term in ["Acme Service", "Beacon Service", "Cinder Service"] {
            let first = format!("{term} first.");
            let second = format!("{term} second.");
            table.observe(&first);
            table.observe(&second);
        }
        assert_eq!(table.bindings().len(), 2);
    }

    #[test]
    fn paths_become_candidates() {
        let mut table = SessionSymbolTable::default();
        observe_each(
            &mut table,
            &["look at src/encoder.rs", "now src/encoder.rs again"],
        );
        let out = table.rewrite("src/encoder.rs is the file");
        assert!(!out.contains("src/encoder.rs"));
    }

    #[test]
    fn longer_binding_wins_over_shorter() {
        let mut table = SessionSymbolTable::default();
        observe_each(
            &mut table,
            &[
                "Authentication Module Plus initial seed.",
                "Authentication Module Plus second seed.",
                "Authentication Module fires.",
                "Authentication Module fires again.",
            ],
        );
        let out = table.rewrite("Authentication Module Plus is the longer one.");
        // The 3-word binding (s0) must win over the 2-word (s1) when both apply.
        assert!(out.contains("s0"), "{out}");
        assert!(!out.contains("Authentication Module Plus"));
    }

    #[test]
    fn observe_and_rewrite_returns_alias_substituted_text() {
        let mut table = SessionSymbolTable::default();
        // First sighting only primes the candidate; its output is irrelevant.
        let _ = table.observe_and_rewrite("Policy Engine boot.");
        let out = table.observe_and_rewrite("Policy Engine boot.");
        assert!(out.contains("s0"));
    }

    #[test]
    fn rewrite_unchanged_when_no_bindings() {
        let table = SessionSymbolTable::default();
        let s = "nothing seen yet";
        assert_eq!(table.rewrite(s), s);
    }

    #[test]
    fn decay_keeps_state_bounded_under_repeated_observation() {
        // With turn_decay=0.5 the occurrence count keeps growing but
        // the decayed score stays bounded. Sanity check only: the
        // term is promoted exactly once and nothing panics.
        let mut table = SessionSymbolTable::new(SessionPolicy {
            min_occurrences: 2,
            max_active_bindings: 4,
            turn_decay: 0.5,
            min_term_words: 2,
        });
        (0..50).for_each(|_| table.observe("Auth Module again."));
        // Promotion happened (>=2 obs); pool stays bounded; no panic.
        assert_eq!(table.bindings().len(), 1);
    }

    #[test]
    fn long_session_compound_savings_simulation() {
        // The same canonical sentence recurs; once its terms are
        // aliased, the rewritten turn must be materially shorter
        // than the original prose.
        let canonical = "Authentication Module forwards to Policy Engine \
                         for Validation Service against Session Store.";
        let mut table = SessionSymbolTable::default();
        // Prime: 2 turns promote all multi-word terms.
        observe_each(&mut table, &[canonical, canonical]);
        let rewritten = table.rewrite(canonical);
        let budget = canonical.len() * 3 / 4;
        assert!(
            rewritten.len() < budget,
            "expected >25% length reduction, got {} -> {}",
            canonical.len(),
            rewritten.len()
        );
    }
}