use std::collections::HashMap;
use regex::{Regex, RegexBuilder};
/// Tuning knobs for a [`SessionSymbolTable`].
#[derive(Debug, Clone, Copy)]
pub struct SessionPolicy {
    /// How many times a candidate term must have been seen before it is
    /// promoted to a binding.
    pub min_occurrences: u32,
    /// Upper bound on the number of bindings kept once an observation pass
    /// has finished.
    pub max_active_bindings: usize,
    /// Factor every candidate's score is multiplied by at the start of each
    /// observation pass.
    pub turn_decay: f32,
    /// Minimum size of a candidate term: words for capitalized phrases,
    /// `/`-separated segments for paths.
    pub min_term_words: usize,
}
impl Default for SessionPolicy {
fn default() -> Self {
Self {
min_occurrences: 2,
max_active_bindings: 64,
turn_decay: 0.85,
min_term_words: 2,
}
}
}
/// A promoted term together with the short alias that stands in for it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Binding {
    /// Short replacement symbol written into rewritten text.
    pub alias: String,
    /// The original phrase or path the alias stands for.
    pub term: String,
}
/// Bookkeeping for a term that has been seen but is not necessarily bound.
///
/// Both fields start at zero via `Default`.
#[derive(Debug, Default)]
struct CandidateState {
    /// Total number of sightings so far.
    occurrences: u32,
    /// Running weight: grows with each sighting and is scaled down by the
    /// policy's decay factor on every observation pass.
    score: f32,
}
/// Per-session table that learns recurring terms and assigns them short
/// aliases.
#[derive(Debug)]
pub struct SessionSymbolTable {
    /// Thresholds and limits governing promotion, decay, and eviction.
    policy: SessionPolicy,
    /// Every term seen so far, keyed by its ASCII-lowercased form.
    candidates: HashMap<String, CandidateState>,
    /// Active bindings in promotion order (oldest first).
    bindings: Vec<Binding>,
    /// Numeric suffix the next generated alias will receive.
    next_id: u32,
}
impl SessionSymbolTable {
    /// Creates an empty table governed by `policy`; alias numbering starts at
    /// `s0`.
    #[must_use]
    pub fn new(policy: SessionPolicy) -> Self {
        Self {
            policy,
            candidates: HashMap::new(),
            bindings: Vec::new(),
            next_id: 0,
        }
    }

    /// Returns the active bindings in promotion order (oldest first).
    #[must_use]
    pub fn bindings(&self) -> &[Binding] {
        &self.bindings
    }

    /// Renders the active bindings as one `alias=term` line each.
    ///
    /// Every line, including the last, ends with `\n`. Returns an empty
    /// string when there are no bindings.
    #[must_use]
    pub fn prefix(&self) -> String {
        if self.bindings.is_empty() {
            return String::new();
        }
        let mut out = String::with_capacity(self.bindings.len() * 24);
        for b in &self.bindings {
            out.push_str(&b.alias);
            out.push('=');
            out.push_str(&b.term);
            out.push('\n');
        }
        out
    }

    /// Feeds one piece of text into the table.
    ///
    /// Steps, in order:
    /// 1. Every existing candidate's score is multiplied by
    ///    `policy.turn_decay`.
    /// 2. Each term extracted from `text` has its occurrence count bumped and
    ///    its score increased by the term's whitespace-separated word count.
    /// 3. A term whose cumulative count has reached `policy.min_occurrences`
    ///    and that has no binding yet is promoted. Repeats inside a single
    ///    `text` count individually.
    /// 4. The binding list is trimmed to `policy.max_active_bindings`.
    ///
    /// Candidate state survives eviction, so an evicted term is promoted
    /// again, under a fresh alias, the next time it is observed.
    ///
    /// NOTE(review): `score` is maintained here but nothing in this impl
    /// reads it, and `candidates` is never pruned.
    pub fn observe(&mut self, text: &str) {
        let decay = self.policy.turn_decay;
        for state in self.candidates.values_mut() {
            state.score *= decay;
        }
        for term in extract_candidate_terms(text, self.policy.min_term_words) {
            let len_bonus = u32::try_from(term.split_whitespace().count()).unwrap_or(1) as f32;
            // The lowercased key is moved into the map. The binding lookup
            // below compares ASCII-case-insensitively, so it can use `term`
            // directly and no clone of the key is needed.
            let state = self
                .candidates
                .entry(term.to_ascii_lowercase())
                .or_default();
            state.occurrences = state.occurrences.saturating_add(1);
            state.score += len_bonus;
            let ripe = state.occurrences >= self.policy.min_occurrences;
            if ripe && !self.has_binding_for(&term) {
                self.promote(term);
            }
        }
        self.enforce_cap();
    }

    /// Returns `text` with every occurrence of each bound term replaced by
    /// its alias.
    ///
    /// Matching is case-insensitive. Longer terms are substituted first so a
    /// term that contains a shorter bound term is replaced as a whole.
    ///
    /// Each edge of a term is anchored so the match cannot be glued to an
    /// adjacent word: `\b` when the term's edge character is a word
    /// character, `\B` otherwise. A plain `\b` on both sides would be wrong
    /// for terms such as `./src/lib.rs`, because `\b` only matches next to a
    /// word character and that term could never match after whitespace.
    #[must_use]
    pub fn rewrite(&self, text: &str) -> String {
        if self.bindings.is_empty() {
            return text.to_owned();
        }
        let mut sorted: Vec<&Binding> = self.bindings.iter().collect();
        sorted.sort_by_key(|b| std::cmp::Reverse(b.term.len()));
        let mut out = text.to_owned();
        for b in sorted {
            // An empty term would compile to a pattern that matches at every
            // position; never substitute for it.
            if b.term.is_empty() {
                continue;
            }
            let pat = format!(
                "{}{}{}",
                Self::edge_anchor(b.term.chars().next()),
                regex::escape(&b.term),
                Self::edge_anchor(b.term.chars().next_back()),
            );
            if let Ok(re) = RegexBuilder::new(&pat).case_insensitive(true).build() {
                // `NoExpand` inserts the alias verbatim, so a `$` in an alias
                // can never be read as a capture-group reference.
                out = re
                    .replace_all(&out, regex::NoExpand(b.alias.as_str()))
                    .into_owned();
            }
        }
        out
    }

    /// Observes `text`, then returns it rewritten with the resulting
    /// bindings, including any promoted by this very call.
    pub fn observe_and_rewrite(&mut self, text: &str) -> String {
        self.observe(text);
        self.rewrite(text)
    }

    /// Reports whether some active binding's term equals `term`, ignoring
    /// ASCII case.
    fn has_binding_for(&self, term: &str) -> bool {
        self.bindings
            .iter()
            .any(|b| b.term.eq_ignore_ascii_case(term))
    }

    /// Appends a binding for `term` under the next sequential alias (`s0`,
    /// `s1`, and so on).
    fn promote(&mut self, term: String) {
        let alias = format!("s{}", self.next_id);
        self.next_id += 1;
        self.bindings.push(Binding { alias, term });
    }

    /// Trims the binding list to `policy.max_active_bindings` by discarding
    /// the oldest bindings. Candidate state is left untouched.
    fn enforce_cap(&mut self) {
        let excess = self
            .bindings
            .len()
            .saturating_sub(self.policy.max_active_bindings);
        if excess > 0 {
            self.bindings.drain(..excess);
        }
    }

    /// Chooses the regex anchor for one edge of a term, given that edge's
    /// character: `\b` for a word character, `\B` for anything else, and no
    /// anchor when the term has no characters.
    fn edge_anchor(edge: Option<char>) -> &'static str {
        match edge {
            Some(c) if c.is_alphanumeric() || c == '_' => r"\b",
            Some(_) => r"\B",
            None => "",
        }
    }
}
/// An empty table configured with the default [`SessionPolicy`].
impl Default for SessionSymbolTable {
    fn default() -> Self {
        let policy = SessionPolicy::default();
        Self::new(policy)
    }
}
/// Determiners that may open a capitalized phrase without being part of the
/// term itself. Compared case-sensitively against the phrase's first word.
const LEADING_ARTICLES: &[&str] = &["The", "A", "An", "This", "That", "These", "Those"];

/// Drops one leading article/determiner from `s`, if present.
///
/// The remainder is returned with its leading whitespace removed, so a phrase
/// separated from its article by several spaces, a tab, or a newline still
/// yields a clean term. `s` is returned unchanged when its first word is not
/// in [`LEADING_ARTICLES`] or when nothing but whitespace follows that word.
fn strip_leading_article(s: &str) -> &str {
    match s.split_once(char::is_whitespace) {
        Some((first, rest)) if LEADING_ARTICLES.contains(&first) => {
            // Only the first whitespace character is consumed by the split;
            // trim whatever else separates the article from the phrase.
            let rest = rest.trim_start();
            if rest.is_empty() {
                s
            } else {
                rest
            }
        }
        _ => s,
    }
}
fn extract_candidate_terms(text: &str, min_words: usize) -> Vec<String> {
let mut out = Vec::new();
let cap_phrase = Regex::new(r"\b([A-Z][a-zA-Z0-9_]{2,}(?:\s+[A-Z][a-zA-Z0-9_]{2,}){1,3})\b")
.expect("cap-phrase pattern");
for m in cap_phrase.find_iter(text) {
let raw = m.as_str().trim();
let s = strip_leading_article(raw);
if s.split_whitespace().count() >= min_words {
out.push(s.to_owned());
}
}
let path_re = Regex::new(r"[A-Za-z0-9_\-\.]+(?:/[A-Za-z0-9_\-\.]+){1,}").expect("path pattern");
for m in path_re.find_iter(text) {
let s = m.as_str().trim_end_matches('.');
if min_words <= 1 || s.split('/').count() >= min_words {
out.push(s.to_owned());
}
}
out
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a default-policy table that has observed `turns` in order.
    fn table_after(turns: &[&str]) -> SessionSymbolTable {
        let mut table = SessionSymbolTable::default();
        for &turn in turns {
            table.observe(turn);
        }
        table
    }

    /// Policy with the promotion threshold and term size used throughout
    /// these tests, plus a caller-chosen cap and decay factor.
    fn policy_with(max_active_bindings: usize, turn_decay: f32) -> SessionPolicy {
        SessionPolicy {
            min_occurrences: 2,
            max_active_bindings,
            turn_decay,
            min_term_words: 2,
        }
    }

    /// A single sighting is below the default promotion threshold.
    #[test]
    fn first_observation_does_not_promote() {
        let table = table_after(&["Authentication Module is the entry point."]);
        assert_eq!(table.bindings().len(), 0);
    }

    /// The second sighting promotes, and the leading article is not part of
    /// the bound term.
    #[test]
    fn second_observation_promotes_to_binding() {
        let table = table_after(&[
            "The Authentication Module starts.",
            "The Authentication Module fires the policy.",
        ]);
        let bound = table.bindings();
        assert_eq!(bound.len(), 1, "{:?}", bound);
        assert_eq!(bound[0].term, "Authentication Module");
    }

    /// Once bound, the term disappears from rewritten text in favour of its
    /// alias.
    #[test]
    fn rewrite_replaces_all_occurrences() {
        let table = table_after(&["Authentication Module v1.", "Authentication Module v2."]);
        let rewritten = table.rewrite("Authentication Module is fine.");
        assert!(rewritten.contains("s0"));
        assert!(!rewritten.contains("Authentication Module"));
    }

    /// The prefix lists each binding on its own line.
    #[test]
    fn prefix_emits_one_line_per_binding() {
        let table = table_after(&[
            "Policy Engine here.",
            "Policy Engine again.",
            "Session Store later.",
            "Session Store again.",
        ]);
        let p = table.prefix();
        assert!(p.contains("=Policy Engine"), "prefix={p}");
        assert!(p.contains("=Session Store"), "prefix={p}");
        assert_eq!(p.lines().count(), 2);
    }

    /// A fresh table has nothing to announce.
    #[test]
    fn empty_session_emits_empty_prefix() {
        let table = table_after(&[]);
        assert_eq!(table.prefix(), "");
    }

    /// Promoting three terms under a cap of two leaves exactly two bindings.
    #[test]
    fn pool_cap_is_enforced() {
        let mut table = SessionSymbolTable::new(policy_with(2, 1.0));
        for term in ["Acme Service", "Beacon Service", "Cinder Service"] {
            let first = format!("{term} first.");
            let second = format!("{term} second.");
            table.observe(&first);
            table.observe(&second);
        }
        assert_eq!(table.bindings().len(), 2);
    }

    /// Slash-separated paths are learned and rewritten like phrases.
    #[test]
    fn paths_become_candidates() {
        let table = table_after(&["look at src/encoder.rs", "now src/encoder.rs again"]);
        let rewritten = table.rewrite("src/encoder.rs is the file");
        assert!(!rewritten.contains("src/encoder.rs"));
    }

    /// When one bound term contains another, the longer one is substituted.
    #[test]
    fn longer_binding_wins_over_shorter() {
        let table = table_after(&[
            "Authentication Module Plus initial seed.",
            "Authentication Module Plus second seed.",
            "Authentication Module fires.",
            "Authentication Module fires again.",
        ]);
        let out = table.rewrite("Authentication Module Plus is the longer one.");
        assert!(out.contains("s0"), "{out}");
        assert!(!out.contains("Authentication Module Plus"));
    }

    /// The combined call rewrites using bindings created by that same call.
    #[test]
    fn observe_and_rewrite_returns_alias_substituted_text() {
        let mut table = SessionSymbolTable::default();
        let _ = table.observe_and_rewrite("Policy Engine boot.");
        let second = table.observe_and_rewrite("Policy Engine boot.");
        assert!(second.contains("s0"));
    }

    /// With no bindings, rewriting is the identity.
    #[test]
    fn rewrite_unchanged_when_no_bindings() {
        let table = SessionSymbolTable::default();
        let input = "nothing seen yet";
        assert_eq!(table.rewrite(input), input);
    }

    /// Seeing the same term many times still yields a single binding.
    #[test]
    fn decay_keeps_state_bounded_under_repeated_observation() {
        let mut table = SessionSymbolTable::new(policy_with(4, 0.5));
        (0..50).for_each(|_| table.observe("Auth Module again."));
        assert_eq!(table.bindings().len(), 1);
    }

    /// A sentence made mostly of bound terms shrinks by more than a quarter.
    #[test]
    fn long_session_compound_savings_simulation() {
        let canonical = "Authentication Module forwards to Policy Engine \
                         for Validation Service against Session Store.";
        let table = table_after(&[canonical, canonical]);
        let rewritten = table.rewrite(canonical);
        let before = canonical.len();
        let after = rewritten.len();
        assert!(
            after < before * 3 / 4,
            "expected >25% length reduction, got {} -> {}",
            before,
            after
        );
    }
}