Skip to main content

locus_sdk/application/
manual_compression.rs

1use std::collections::{HashMap, HashSet};
2
3use crate::domain::compression::{
4    AnchorTerm, ManualCompressionDiagnostics, ManualCompressionRequest, ManualCompressionResult,
5    PhraseMode, StopwordProfile,
6};
7
/// Word lists that drive token filtering during manual compression.
///
/// Lookups in `select_content_tokens` are exact string matches against tokens
/// produced by `tokenize`, which lower-cases ASCII letters, so entries should be
/// stored lower-case.
#[derive(Debug, Clone, Default)]
pub struct CompressionLexicons {
    /// Tokens dropped from the content stream and counted as removed stopwords.
    pub stopwords: HashSet<String>,
    /// Tokens dropped from the content stream and counted as removed filler.
    pub fillers: HashSet<String>,
    /// Tokens that toggle the polarity applied to the next kept content token.
    pub negations: HashSet<String>,
}
13
14pub trait ManualCompressionLexiconProvider: Send + Sync {
15    fn build_lexicons(&self, request: &ManualCompressionRequest) -> CompressionLexicons;
16}
17
/// Lexicon provider that combines the built-in word lists with the add/remove
/// overrides carried by the request.
#[derive(Debug, Clone, Copy, Default)]
pub struct DefaultManualCompressionLexiconProvider;
20
21impl ManualCompressionLexiconProvider for DefaultManualCompressionLexiconProvider {
22    fn build_lexicons(&self, request: &ManualCompressionRequest) -> CompressionLexicons {
23        let mut stopwords = stopword_set(request.stopword_profile);
24        let mut fillers = filler_set();
25        let mut negations = negation_set();
26        apply_lexicon_overrides(request, &mut stopwords, &mut fillers, &mut negations);
27
28        CompressionLexicons {
29            stopwords,
30            fillers,
31            negations,
32        }
33    }
34}
35
36pub struct ManualCompressionService {
37    lexicon_provider: Box<dyn ManualCompressionLexiconProvider>,
38}
39
40impl ManualCompressionService {
41    pub fn new() -> Self {
42        Self::with_lexicon_provider(DefaultManualCompressionLexiconProvider)
43    }
44
45    pub fn with_lexicon_provider(provider: impl ManualCompressionLexiconProvider + 'static) -> Self {
46        Self {
47            lexicon_provider: Box::new(provider),
48        }
49    }
50
51    pub fn execute(&self, request: &ManualCompressionRequest) -> ManualCompressionResult {
52        let sentences = split_sentences(&request.text);
53        let sentence_count = sentences.len();
54
55        let all_tokens = tokenize(&request.text);
56        let lexicons = self.lexicon_provider.build_lexicons(request);
57        let stopwords = &lexicons.stopwords;
58        let filler = &lexicons.fillers;
59        let negations = &lexicons.negations;
60
61        let mut kept_tokens = Vec::new();
62
63        let token_result = select_content_tokens(
64            &all_tokens,
65            &stopwords,
66            &filler,
67            &negations,
68            request.min_token_length,
69        );
70        kept_tokens.extend(token_result.tokens);
71        let stopwords_removed = token_result.stopwords_removed;
72        let filler_removed = token_result.filler_removed;
73
74        let term_stats = build_term_stats(&kept_tokens);
75        let cooccur = sentence_cooccurrence(
76            &sentences,
77            &stopwords,
78            &filler,
79            &negations,
80            request.min_token_length,
81        );
82        let anchors = score_anchors(&term_stats, &cooccur, request.max_anchors.max(1));
83
84        let salient_phrases = match request.phrase_mode {
85            PhraseMode::None => Vec::new(),
86            PhraseMode::RakeLite => {
87                rake_lite_phrases(&sentences, &anchors, &stopwords, &filler, &negations, request.min_token_length, 5)
88            }
89        };
90
91        let anchor_topic = choose_anchor_topic(&anchors, &salient_phrases);
92        let key_points = build_key_points(
93            &sentences,
94            &anchors,
95            &stopwords,
96            &filler,
97            &negations,
98            request.min_token_length,
99            request.max_points.max(1),
100        );
101
102        let input_tokens = all_tokens.len().max(1) as f32;
103        let output_tokens = (tokenize(&anchor_topic).len()
104            + anchors.len()
105            + salient_phrases.iter().map(|p| tokenize(p).len()).sum::<usize>()
106            + key_points.iter().map(|p| tokenize(p).len()).sum::<usize>()) as f32;
107
108        let diagnostics = ManualCompressionDiagnostics {
109            tokens_total: all_tokens.len(),
110            tokens_kept: kept_tokens.len(),
111            stopwords_removed,
112            filler_removed,
113            sentences_total: sentence_count,
114        };
115
116        ManualCompressionResult {
117            anchor_topic,
118            anchor_terms: anchors,
119            key_points,
120            salient_phrases,
121            compression_ratio: (output_tokens / input_tokens).clamp(0.0, 10.0),
122            discarded_noise_ratio: ((stopwords_removed + filler_removed) as f32 / input_tokens)
123                .clamp(0.0, 1.0),
124            diagnostics,
125        }
126    }
127}
128
129impl Default for ManualCompressionService {
130    fn default() -> Self {
131        Self::new()
132    }
133}
134
/// Per-term counters collected by `build_term_stats`.
#[derive(Debug, Clone)]
struct TermStat {
    /// Number of times the term occurs in the token slice.
    freq: usize,
    /// Index of the term's first occurrence in the token slice.
    first_pos: usize,
}
140
/// Splits `text` at every `.`, `!` or `?` and returns the trimmed, non-empty
/// fragments in order.
///
/// The split is purely character based, so a period inside a number or an
/// abbreviation also ends a fragment.
fn split_sentences(text: &str) -> Vec<String> {
    let is_terminator = |ch: char| matches!(ch, '.' | '!' | '?');

    let mut sentences = Vec::new();
    for fragment in text.split(is_terminator) {
        let fragment = fragment.trim();
        if !fragment.is_empty() {
            sentences.push(fragment.to_owned());
        }
    }
    sentences
}
148
/// Breaks `text` into lower-cased tokens.
///
/// A token is a maximal run of ASCII letters, ASCII digits, `-` and `_`; every
/// other character (including non-ASCII letters) acts as a separator.
fn tokenize(text: &str) -> Vec<String> {
    let is_token_char = |ch: char| ch.is_ascii_alphanumeric() || ch == '-' || ch == '_';

    text.split(|ch: char| !is_token_char(ch))
        .filter(|piece| !piece.is_empty())
        .map(str::to_ascii_lowercase)
        .collect()
}
162
163fn normalize_lexicon_entry(value: &str) -> Option<String> {
164    let normalized = tokenize(value);
165    normalized.first().cloned()
166}
167
168fn apply_lexicon_overrides(
169    request: &ManualCompressionRequest,
170    stopwords: &mut HashSet<String>,
171    fillers: &mut HashSet<String>,
172    negations: &mut HashSet<String>,
173) {
174    for value in &request.stopwords_add {
175        if let Some(entry) = normalize_lexicon_entry(value) {
176            stopwords.insert(entry);
177        }
178    }
179    for value in &request.stopwords_remove {
180        if let Some(entry) = normalize_lexicon_entry(value) {
181            stopwords.remove(&entry);
182        }
183    }
184
185    for value in &request.fillers_add {
186        if let Some(entry) = normalize_lexicon_entry(value) {
187            fillers.insert(entry);
188        }
189    }
190    for value in &request.fillers_remove {
191        if let Some(entry) = normalize_lexicon_entry(value) {
192            fillers.remove(&entry);
193        }
194    }
195
196    for value in &request.negations_add {
197        if let Some(entry) = normalize_lexicon_entry(value) {
198            negations.insert(entry);
199        }
200    }
201    for value in &request.negations_remove {
202        if let Some(entry) = normalize_lexicon_entry(value) {
203            negations.remove(&entry);
204        }
205    }
206}
207
/// Result of `select_content_tokens`: the surviving tokens plus removal counters.
#[derive(Default)]
struct ContentTokenSelection {
    /// Kept tokens in input order; negated ones carry a `neg_` prefix.
    tokens: Vec<String>,
    /// How many input tokens were dropped because they were stopwords.
    stopwords_removed: usize,
    /// How many input tokens were dropped because they were filler.
    filler_removed: usize,
}
214
/// Returns `true` when `token` begins with the `neg_` prefix.
fn is_negated_marker(token: &str) -> bool {
    token.strip_prefix("neg_").is_some()
}
218
219fn select_content_tokens(
220    raw_tokens: &[String],
221    stopwords: &HashSet<String>,
222    filler: &HashSet<String>,
223    negations: &HashSet<String>,
224    min_len: usize,
225) -> ContentTokenSelection {
226    let mut selected = ContentTokenSelection::default();
227    let mut pending_negation = false;
228
229    for token in raw_tokens {
230        if negations.contains(token.as_str()) {
231            // Toggle polarity so repeated negations cancel each other.
232            pending_negation = !pending_negation;
233            continue;
234        }
235
236        if filler.contains(token.as_str()) {
237            selected.filler_removed += 1;
238            continue;
239        }
240
241        if stopwords.contains(token.as_str()) {
242            selected.stopwords_removed += 1;
243            continue;
244        }
245
246        if token.len() < min_len.max(1) {
247            continue;
248        }
249
250        if pending_negation {
251            selected.tokens.push(format!("neg_{token}"));
252            pending_negation = false;
253        } else {
254            selected.tokens.push(token.clone());
255        }
256    }
257
258    selected
259}
260
261fn stopword_set(profile: StopwordProfile) -> HashSet<String> {
262    let basic = [
263        "the", "a", "an", "and", "or", "for", "to", "of", "in", "on", "at", "is", "are",
264        "was", "were", "be", "been", "with", "that", "this", "it", "as", "by", "from", "we",
265    ];
266
267    let extended = [
268        "have", "has", "had", "do", "does", "did", "if", "then", "else", "when", "while", "into",
269        "about", "over", "under", "very", "more", "most", "some", "any", "all", "each", "only",
270    ];
271
272    let domain = [
273        "node", "nodes", "session", "sessions", "memory", "sttp", "context", "data", "value", "values",
274    ];
275
276    let mut set: HashSet<String> = basic.into_iter().map(str::to_string).collect();
277
278    if matches!(profile, StopwordProfile::Extended | StopwordProfile::Domain) {
279        set.extend(extended.into_iter().map(str::to_string));
280    }
281    if matches!(profile, StopwordProfile::Domain) {
282        set.extend(domain.into_iter().map(str::to_string));
283    }
284
285    set
286}
287
/// Returns the built-in set of filler words.
fn filler_set() -> HashSet<String> {
    const FILLERS: &[&str] = &[
        "basically", "actually", "really", "just", "literally", "kinda", "sorta", "maybe",
        "honestly", "frankly", "perhaps", "probably", "simply", "quite",
    ];

    FILLERS.iter().copied().map(String::from).collect()
}
309
/// Returns the built-in set of negation words.
fn negation_set() -> HashSet<String> {
    const NEGATIONS: &[&str] = &["not", "no", "never", "none", "cannot", "neither", "nor"];

    let mut set = HashSet::with_capacity(NEGATIONS.len());
    for word in NEGATIONS {
        set.insert((*word).to_owned());
    }
    set
}
316
317fn build_term_stats(tokens: &[String]) -> HashMap<String, TermStat> {
318    let mut map: HashMap<String, TermStat> = HashMap::new();
319
320    for (idx, token) in tokens.iter().enumerate() {
321        map.entry(token.clone())
322            .and_modify(|stat| stat.freq += 1)
323            .or_insert(TermStat {
324                freq: 1,
325                first_pos: idx,
326            });
327    }
328
329    map
330}
331
332fn sentence_cooccurrence(
333    sentences: &[String],
334    stopwords: &HashSet<String>,
335    filler: &HashSet<String>,
336    negations: &HashSet<String>,
337    min_len: usize,
338) -> HashMap<String, usize> {
339    let mut degree = HashMap::new();
340
341    for sentence in sentences {
342        let sentence_tokens = tokenize(sentence);
343        let mut uniq = select_content_tokens(&sentence_tokens, stopwords, filler, negations, min_len)
344            .tokens
345            .into_iter()
346            .collect::<HashSet<_>>()
347            .into_iter()
348            .collect::<Vec<_>>();
349        uniq.sort();
350
351        for token in uniq.iter() {
352            let entry = degree.entry(token.clone()).or_insert(0usize);
353            *entry += uniq.len().saturating_sub(1);
354        }
355    }
356
357    degree
358}
359
360fn score_anchors(
361    term_stats: &HashMap<String, TermStat>,
362    cooccur: &HashMap<String, usize>,
363    max_anchors: usize,
364) -> Vec<AnchorTerm> {
365    if term_stats.is_empty() {
366        return Vec::new();
367    }
368
369    let max_freq = term_stats.values().map(|s| s.freq).max().unwrap_or(1) as f32;
370    let max_pos = term_stats.values().map(|s| s.first_pos).max().unwrap_or(1) as f32;
371    let max_degree = cooccur.values().copied().max().unwrap_or(1) as f32;
372
373    let mut anchors = term_stats
374        .iter()
375        .map(|(term, stat)| {
376            let freq_norm = stat.freq as f32 / max_freq;
377            let pos_boost = 1.0 - (stat.first_pos as f32 / (max_pos + 1.0));
378            let rarity_boost = 1.0 / stat.freq as f32;
379            let centrality = *cooccur.get(term).unwrap_or(&0) as f32 / max_degree;
380
381            let score = (0.45 * freq_norm) + (0.20 * pos_boost) + (0.20 * rarity_boost) + (0.15 * centrality);
382
383            AnchorTerm {
384                term: term.clone(),
385                score,
386                evidence_count: stat.freq,
387                first_position: stat.first_pos,
388            }
389        })
390        .collect::<Vec<_>>();
391
392    anchors.sort_by(|a, b| {
393        b.score
394            .partial_cmp(&a.score)
395            .unwrap_or(std::cmp::Ordering::Equal)
396            .then_with(|| b.evidence_count.cmp(&a.evidence_count))
397            .then_with(|| a.term.cmp(&b.term))
398    });
399
400    anchors.truncate(max_anchors);
401    anchors
402}
403
404fn rake_lite_phrases(
405    sentences: &[String],
406    anchors: &[AnchorTerm],
407    stopwords: &HashSet<String>,
408    filler: &HashSet<String>,
409    negations: &HashSet<String>,
410    min_len: usize,
411    limit: usize,
412) -> Vec<String> {
413    if anchors.is_empty() {
414        return Vec::new();
415    }
416
417    let anchor_set = anchors
418        .iter()
419        .map(|a| a.term.as_str())
420        .collect::<HashSet<_>>();
421
422    let mut scored = Vec::new();
423
424    for sentence in sentences {
425        let sentence_tokens = tokenize(sentence);
426        let tokens = select_content_tokens(&sentence_tokens, stopwords, filler, negations, min_len).tokens;
427        for window in [2usize, 3usize] {
428            if tokens.len() < window {
429                continue;
430            }
431            for idx in 0..=(tokens.len() - window) {
432                let phrase = tokens[idx..idx + window].to_vec();
433                let overlap = phrase.iter().filter(|t| anchor_set.contains(t.as_str())).count();
434                if overlap > 0 {
435                    let score = (overlap as f32)
436                        + (window as f32 * 0.1)
437                        + (phrase.iter().filter(|t| is_negated_marker(t)).count() as f32 * 0.15);
438                    let phrase_text = phrase
439                        .iter()
440                        .map(|token| token.strip_prefix("neg_").map(|t| format!("not {t}")).unwrap_or_else(|| token.clone()))
441                        .collect::<Vec<_>>()
442                        .join(" ");
443                    scored.push((score, phrase_text));
444                }
445            }
446        }
447    }
448
449    scored.sort_by(|a, b| {
450        b.0.partial_cmp(&a.0)
451            .unwrap_or(std::cmp::Ordering::Equal)
452            .then_with(|| a.1.cmp(&b.1))
453    });
454
455    let mut seen = HashSet::new();
456    scored
457        .into_iter()
458        .filter_map(|(_, phrase)| {
459            if seen.insert(phrase.clone()) {
460                Some(phrase)
461            } else {
462                None
463            }
464        })
465        .take(limit)
466        .collect()
467}
468
469fn choose_anchor_topic(anchors: &[AnchorTerm], phrases: &[String]) -> String {
470    let Some(top) = anchors.first() else {
471        return "".to_string();
472    };
473
474    let top_display = top.term.strip_prefix("neg_").map(|t| format!("not {t}")).unwrap_or_else(|| top.term.clone());
475
476    if let Some(phrase) = phrases.iter().find(|phrase| phrase.contains(&top_display)) {
477        return phrase.clone();
478    }
479
480    top_display
481}
482
483fn build_key_points(
484    sentences: &[String],
485    anchors: &[AnchorTerm],
486    stopwords: &HashSet<String>,
487    filler: &HashSet<String>,
488    negations: &HashSet<String>,
489    min_len: usize,
490    max_points: usize,
491) -> Vec<String> {
492    if anchors.is_empty() {
493        return sentences.iter().take(max_points).cloned().collect();
494    }
495
496    let anchor_terms = anchors
497        .iter()
498        .map(|anchor| anchor.term.as_str())
499        .collect::<HashSet<_>>();
500
501    let mut scored = sentences
502        .iter()
503        .enumerate()
504        .map(|(idx, sentence)| {
505            let sentence_tokens = tokenize(sentence);
506            let tokens = select_content_tokens(&sentence_tokens, stopwords, filler, negations, min_len).tokens;
507            let hits = tokens
508                .iter()
509                .filter(|token| anchor_terms.contains(token.as_str()))
510                .count();
511            let score = (hits as f32 * 2.0) + (1.0 / ((idx + 1) as f32));
512            (score, idx, sentence.clone())
513        })
514        .collect::<Vec<_>>();
515
516    scored.sort_by(|a, b| {
517        b.0.partial_cmp(&a.0)
518            .unwrap_or(std::cmp::Ordering::Equal)
519            .then_with(|| a.1.cmp(&b.1))
520    });
521
522    scored
523        .into_iter()
524        .filter(|(score, _, _)| *score > 0.0)
525        .take(max_points)
526        .map(|(_, _, sentence)| sentence)
527        .collect()
528}
529
#[cfg(test)]
mod tests {
    use std::collections::HashSet;

    use super::{CompressionLexicons, ManualCompressionLexiconProvider, ManualCompressionService};
    use crate::domain::compression::{ManualCompressionRequest, ManualCompressionResult};

    /// Provider that ignores the request and returns fixed one-word lexicons.
    struct CustomLexiconProvider;

    impl ManualCompressionLexiconProvider for CustomLexiconProvider {
        fn build_lexicons(&self, _request: &ManualCompressionRequest) -> CompressionLexicons {
            CompressionLexicons {
                stopwords: single_word_set("retrieval"),
                fillers: single_word_set("fallback"),
                negations: single_word_set("hardly"),
            }
        }
    }

    /// Builds a set holding exactly `word`.
    fn single_word_set(word: &str) -> HashSet<String> {
        std::iter::once(word.to_string()).collect()
    }

    /// Builds a request for `text` with every other field left at its default.
    fn request_for(text: &str) -> ManualCompressionRequest {
        ManualCompressionRequest {
            text: text.to_string(),
            ..Default::default()
        }
    }

    /// Reports whether `result` lists `term` among its anchor terms.
    fn has_anchor(result: &ManualCompressionResult, term: &str) -> bool {
        result.anchor_terms.iter().any(|anchor| anchor.term == term)
    }

    #[test]
    fn compressor_filters_filler_and_extracts_anchor() {
        let request = request_for(
            "Basically we need retrieval fallback policy. Actually retrieval fallback keeps recall stable.",
        );

        let result = ManualCompressionService::new().execute(&request);

        assert!(has_anchor(&result, "retrieval") || has_anchor(&result, "fallback"));
        assert!(!has_anchor(&result, "basically"));
        assert!(result.discarded_noise_ratio > 0.0);
    }

    #[test]
    fn compressor_is_deterministic_for_same_input() {
        let service = ManualCompressionService::new();
        let request = request_for("session graph recall and migration policy stability");

        let left = service.execute(&request);
        let right = service.execute(&request);

        assert_eq!(left.anchor_topic, right.anchor_topic);
        assert_eq!(left.anchor_terms.len(), right.anchor_terms.len());
        assert_eq!(left.key_points, right.key_points);
    }

    #[test]
    fn compressor_cancels_double_negatives() {
        let request = request_for("the rollout is not not stable and the policy is not brittle");

        let result = ManualCompressionService::new().execute(&request);

        assert!(has_anchor(&result, "stable"));
        assert!(has_anchor(&result, "neg_brittle"));
        assert!(!has_anchor(&result, "neg_stable"));
    }

    #[test]
    fn compressor_filters_expanded_filler_lexicon() {
        let request =
            request_for("Honestly the migration policy is quite stable and honestly deterministic");

        let result = ManualCompressionService::new().execute(&request);

        assert!(!has_anchor(&result, "honestly"));
        assert!(!has_anchor(&result, "quite"));
        assert!(result.diagnostics.filler_removed >= 2);
    }

    #[test]
    fn compressor_honors_request_lexicon_overrides() {
        let request = ManualCompressionRequest {
            text: "hardly brittle retrieval retrieval fallback maybe".to_string(),
            stopwords_add: vec!["retrieval".to_string()],
            fillers_add: vec!["fallback".to_string()],
            negations_add: vec!["hardly".to_string()],
            ..Default::default()
        };

        let result = ManualCompressionService::new().execute(&request);

        assert!(!has_anchor(&result, "retrieval"));
        assert!(!has_anchor(&result, "fallback"));
        assert!(has_anchor(&result, "neg_brittle"));
    }

    #[test]
    fn compressor_supports_custom_trait_provider() {
        let service = ManualCompressionService::with_lexicon_provider(CustomLexiconProvider);
        let request = request_for("hardly brittle retrieval retrieval fallback");

        let result = service.execute(&request);

        assert!(!has_anchor(&result, "retrieval"));
        assert!(!has_anchor(&result, "fallback"));
        assert!(has_anchor(&result, "neg_brittle"));
    }
}