1use std::collections::{HashMap, HashSet};
2
3use crate::domain::compression::{
4 AnchorTerm, ManualCompressionDiagnostics, ManualCompressionRequest, ManualCompressionResult,
5 PhraseMode, StopwordProfile,
6};
7
/// Word lists that drive token filtering during manual compression.
///
/// Entries are compared against tokens with exact string equality, so they
/// should be stored in the same normalized form the tokenizer emits.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct CompressionLexicons {
    /// Tokens dropped as stopwords (counted in the stopword diagnostics).
    pub stopwords: HashSet<String>,
    /// Tokens dropped as filler words (counted in the filler diagnostics).
    pub fillers: HashSet<String>,
    /// Tokens that toggle negation of the next kept content token.
    pub negations: HashSet<String>,
}
13
/// Source of the lexicons used by [`ManualCompressionService`].
///
/// Implementors must be `Send + Sync`; the service stores the provider as a
/// boxed trait object.
pub trait ManualCompressionLexiconProvider: Send + Sync {
    /// Builds the stopword, filler and negation sets to use for `request`.
    fn build_lexicons(&self, request: &ManualCompressionRequest) -> CompressionLexicons;
}
17
/// Lexicon provider that starts from the built-in word lists and then applies
/// the add/remove overrides carried by the request.
#[derive(Default)]
pub struct DefaultManualCompressionLexiconProvider;
20
21impl ManualCompressionLexiconProvider for DefaultManualCompressionLexiconProvider {
22 fn build_lexicons(&self, request: &ManualCompressionRequest) -> CompressionLexicons {
23 let mut stopwords = stopword_set(request.stopword_profile);
24 let mut fillers = filler_set();
25 let mut negations = negation_set();
26 apply_lexicon_overrides(request, &mut stopwords, &mut fillers, &mut negations);
27
28 CompressionLexicons {
29 stopwords,
30 fillers,
31 negations,
32 }
33 }
34}
35
/// Extractive text compressor that reduces a request's text to anchor terms,
/// salient phrases and key sentences.
pub struct ManualCompressionService {
    // Boxed so callers can plug in any provider implementation at runtime.
    lexicon_provider: Box<dyn ManualCompressionLexiconProvider>,
}
39
40impl ManualCompressionService {
41 pub fn new() -> Self {
42 Self::with_lexicon_provider(DefaultManualCompressionLexiconProvider)
43 }
44
45 pub fn with_lexicon_provider(provider: impl ManualCompressionLexiconProvider + 'static) -> Self {
46 Self {
47 lexicon_provider: Box::new(provider),
48 }
49 }
50
51 pub fn execute(&self, request: &ManualCompressionRequest) -> ManualCompressionResult {
52 let sentences = split_sentences(&request.text);
53 let sentence_count = sentences.len();
54
55 let all_tokens = tokenize(&request.text);
56 let lexicons = self.lexicon_provider.build_lexicons(request);
57 let stopwords = &lexicons.stopwords;
58 let filler = &lexicons.fillers;
59 let negations = &lexicons.negations;
60
61 let mut kept_tokens = Vec::new();
62
63 let token_result = select_content_tokens(
64 &all_tokens,
65 &stopwords,
66 &filler,
67 &negations,
68 request.min_token_length,
69 );
70 kept_tokens.extend(token_result.tokens);
71 let stopwords_removed = token_result.stopwords_removed;
72 let filler_removed = token_result.filler_removed;
73
74 let term_stats = build_term_stats(&kept_tokens);
75 let cooccur = sentence_cooccurrence(
76 &sentences,
77 &stopwords,
78 &filler,
79 &negations,
80 request.min_token_length,
81 );
82 let anchors = score_anchors(&term_stats, &cooccur, request.max_anchors.max(1));
83
84 let salient_phrases = match request.phrase_mode {
85 PhraseMode::None => Vec::new(),
86 PhraseMode::RakeLite => {
87 rake_lite_phrases(&sentences, &anchors, &stopwords, &filler, &negations, request.min_token_length, 5)
88 }
89 };
90
91 let anchor_topic = choose_anchor_topic(&anchors, &salient_phrases);
92 let key_points = build_key_points(
93 &sentences,
94 &anchors,
95 &stopwords,
96 &filler,
97 &negations,
98 request.min_token_length,
99 request.max_points.max(1),
100 );
101
102 let input_tokens = all_tokens.len().max(1) as f32;
103 let output_tokens = (tokenize(&anchor_topic).len()
104 + anchors.len()
105 + salient_phrases.iter().map(|p| tokenize(p).len()).sum::<usize>()
106 + key_points.iter().map(|p| tokenize(p).len()).sum::<usize>()) as f32;
107
108 let diagnostics = ManualCompressionDiagnostics {
109 tokens_total: all_tokens.len(),
110 tokens_kept: kept_tokens.len(),
111 stopwords_removed,
112 filler_removed,
113 sentences_total: sentence_count,
114 };
115
116 ManualCompressionResult {
117 anchor_topic,
118 anchor_terms: anchors,
119 key_points,
120 salient_phrases,
121 compression_ratio: (output_tokens / input_tokens).clamp(0.0, 10.0),
122 discarded_noise_ratio: ((stopwords_removed + filler_removed) as f32 / input_tokens)
123 .clamp(0.0, 1.0),
124 diagnostics,
125 }
126 }
127}
128
129impl Default for ManualCompressionService {
130 fn default() -> Self {
131 Self::new()
132 }
133}
134
/// Per-term statistics gathered over the kept token stream.
#[derive(Debug, Clone)]
struct TermStat {
    /// Number of times the term occurs in the token stream.
    freq: usize,
    /// Index of the term's first occurrence in the token stream.
    first_pos: usize,
}
140
/// Splits `text` at every `.`, `!` or `?` and returns the trimmed, non-empty
/// fragments in order. The terminator characters are not included.
fn split_sentences(text: &str) -> Vec<String> {
    let mut sentences = Vec::new();

    for fragment in text.split(|ch: char| matches!(ch, '.' | '!' | '?')) {
        let trimmed = fragment.trim();
        if !trimmed.is_empty() {
            sentences.push(trimmed.to_owned());
        }
    }

    sentences
}
148
/// Breaks `text` into lowercase tokens.
///
/// A token is a maximal run of ASCII letters, ASCII digits, `-` and `_`.
/// Every other character (including non-ASCII letters) acts as a separator.
/// Lowercasing is ASCII-only.
fn tokenize(text: &str) -> Vec<String> {
    let is_token_char = |ch: char| ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_');

    text.split(|ch: char| !is_token_char(ch))
        .filter(|piece| !piece.is_empty())
        .map(str::to_ascii_lowercase)
        .collect()
}
162
163fn normalize_lexicon_entry(value: &str) -> Option<String> {
164 let normalized = tokenize(value);
165 normalized.first().cloned()
166}
167
168fn apply_lexicon_overrides(
169 request: &ManualCompressionRequest,
170 stopwords: &mut HashSet<String>,
171 fillers: &mut HashSet<String>,
172 negations: &mut HashSet<String>,
173) {
174 for value in &request.stopwords_add {
175 if let Some(entry) = normalize_lexicon_entry(value) {
176 stopwords.insert(entry);
177 }
178 }
179 for value in &request.stopwords_remove {
180 if let Some(entry) = normalize_lexicon_entry(value) {
181 stopwords.remove(&entry);
182 }
183 }
184
185 for value in &request.fillers_add {
186 if let Some(entry) = normalize_lexicon_entry(value) {
187 fillers.insert(entry);
188 }
189 }
190 for value in &request.fillers_remove {
191 if let Some(entry) = normalize_lexicon_entry(value) {
192 fillers.remove(&entry);
193 }
194 }
195
196 for value in &request.negations_add {
197 if let Some(entry) = normalize_lexicon_entry(value) {
198 negations.insert(entry);
199 }
200 }
201 for value in &request.negations_remove {
202 if let Some(entry) = normalize_lexicon_entry(value) {
203 negations.remove(&entry);
204 }
205 }
206}
207
/// Outcome of filtering a raw token stream through the lexicons.
#[derive(Default)]
struct ContentTokenSelection {
    /// Kept content tokens in input order; negated ones carry a `neg_` prefix.
    tokens: Vec<String>,
    /// Number of input tokens dropped because they were stopwords.
    stopwords_removed: usize,
    /// Number of input tokens dropped because they were fillers.
    filler_removed: usize,
}
214
/// Reports whether `token` carries the `neg_` prefix used to mark negated
/// content tokens.
// NOTE(review): a raw input token that literally begins with "neg_" is
// indistinguishable from a marker here — confirm whether that matters.
fn is_negated_marker(token: &str) -> bool {
    token.strip_prefix("neg_").is_some()
}
218
219fn select_content_tokens(
220 raw_tokens: &[String],
221 stopwords: &HashSet<String>,
222 filler: &HashSet<String>,
223 negations: &HashSet<String>,
224 min_len: usize,
225) -> ContentTokenSelection {
226 let mut selected = ContentTokenSelection::default();
227 let mut pending_negation = false;
228
229 for token in raw_tokens {
230 if negations.contains(token.as_str()) {
231 pending_negation = !pending_negation;
233 continue;
234 }
235
236 if filler.contains(token.as_str()) {
237 selected.filler_removed += 1;
238 continue;
239 }
240
241 if stopwords.contains(token.as_str()) {
242 selected.stopwords_removed += 1;
243 continue;
244 }
245
246 if token.len() < min_len.max(1) {
247 continue;
248 }
249
250 if pending_negation {
251 selected.tokens.push(format!("neg_{token}"));
252 pending_negation = false;
253 } else {
254 selected.tokens.push(token.clone());
255 }
256 }
257
258 selected
259}
260
261fn stopword_set(profile: StopwordProfile) -> HashSet<String> {
262 let basic = [
263 "the", "a", "an", "and", "or", "for", "to", "of", "in", "on", "at", "is", "are",
264 "was", "were", "be", "been", "with", "that", "this", "it", "as", "by", "from", "we",
265 ];
266
267 let extended = [
268 "have", "has", "had", "do", "does", "did", "if", "then", "else", "when", "while", "into",
269 "about", "over", "under", "very", "more", "most", "some", "any", "all", "each", "only",
270 ];
271
272 let domain = [
273 "node", "nodes", "session", "sessions", "memory", "sttp", "context", "data", "value", "values",
274 ];
275
276 let mut set: HashSet<String> = basic.into_iter().map(str::to_string).collect();
277
278 if matches!(profile, StopwordProfile::Extended | StopwordProfile::Domain) {
279 set.extend(extended.into_iter().map(str::to_string));
280 }
281 if matches!(profile, StopwordProfile::Domain) {
282 set.extend(domain.into_iter().map(str::to_string));
283 }
284
285 set
286}
287
/// Returns the built-in filler words (hedges and intensifiers).
fn filler_set() -> HashSet<String> {
    const FILLERS: [&str; 14] = [
        "basically", "actually", "really", "just", "literally", "kinda", "sorta",
        "maybe", "honestly", "frankly", "perhaps", "probably", "simply", "quite",
    ];

    FILLERS.iter().copied().map(String::from).collect()
}
309
/// Returns the built-in negation words.
fn negation_set() -> HashSet<String> {
    let mut negations = HashSet::new();
    for word in ["not", "no", "never", "none", "cannot", "neither", "nor"] {
        negations.insert(word.to_owned());
    }
    negations
}
316
317fn build_term_stats(tokens: &[String]) -> HashMap<String, TermStat> {
318 let mut map: HashMap<String, TermStat> = HashMap::new();
319
320 for (idx, token) in tokens.iter().enumerate() {
321 map.entry(token.clone())
322 .and_modify(|stat| stat.freq += 1)
323 .or_insert(TermStat {
324 freq: 1,
325 first_pos: idx,
326 });
327 }
328
329 map
330}
331
332fn sentence_cooccurrence(
333 sentences: &[String],
334 stopwords: &HashSet<String>,
335 filler: &HashSet<String>,
336 negations: &HashSet<String>,
337 min_len: usize,
338) -> HashMap<String, usize> {
339 let mut degree = HashMap::new();
340
341 for sentence in sentences {
342 let sentence_tokens = tokenize(sentence);
343 let mut uniq = select_content_tokens(&sentence_tokens, stopwords, filler, negations, min_len)
344 .tokens
345 .into_iter()
346 .collect::<HashSet<_>>()
347 .into_iter()
348 .collect::<Vec<_>>();
349 uniq.sort();
350
351 for token in uniq.iter() {
352 let entry = degree.entry(token.clone()).or_insert(0usize);
353 *entry += uniq.len().saturating_sub(1);
354 }
355 }
356
357 degree
358}
359
360fn score_anchors(
361 term_stats: &HashMap<String, TermStat>,
362 cooccur: &HashMap<String, usize>,
363 max_anchors: usize,
364) -> Vec<AnchorTerm> {
365 if term_stats.is_empty() {
366 return Vec::new();
367 }
368
369 let max_freq = term_stats.values().map(|s| s.freq).max().unwrap_or(1) as f32;
370 let max_pos = term_stats.values().map(|s| s.first_pos).max().unwrap_or(1) as f32;
371 let max_degree = cooccur.values().copied().max().unwrap_or(1) as f32;
372
373 let mut anchors = term_stats
374 .iter()
375 .map(|(term, stat)| {
376 let freq_norm = stat.freq as f32 / max_freq;
377 let pos_boost = 1.0 - (stat.first_pos as f32 / (max_pos + 1.0));
378 let rarity_boost = 1.0 / stat.freq as f32;
379 let centrality = *cooccur.get(term).unwrap_or(&0) as f32 / max_degree;
380
381 let score = (0.45 * freq_norm) + (0.20 * pos_boost) + (0.20 * rarity_boost) + (0.15 * centrality);
382
383 AnchorTerm {
384 term: term.clone(),
385 score,
386 evidence_count: stat.freq,
387 first_position: stat.first_pos,
388 }
389 })
390 .collect::<Vec<_>>();
391
392 anchors.sort_by(|a, b| {
393 b.score
394 .partial_cmp(&a.score)
395 .unwrap_or(std::cmp::Ordering::Equal)
396 .then_with(|| b.evidence_count.cmp(&a.evidence_count))
397 .then_with(|| a.term.cmp(&b.term))
398 });
399
400 anchors.truncate(max_anchors);
401 anchors
402}
403
404fn rake_lite_phrases(
405 sentences: &[String],
406 anchors: &[AnchorTerm],
407 stopwords: &HashSet<String>,
408 filler: &HashSet<String>,
409 negations: &HashSet<String>,
410 min_len: usize,
411 limit: usize,
412) -> Vec<String> {
413 if anchors.is_empty() {
414 return Vec::new();
415 }
416
417 let anchor_set = anchors
418 .iter()
419 .map(|a| a.term.as_str())
420 .collect::<HashSet<_>>();
421
422 let mut scored = Vec::new();
423
424 for sentence in sentences {
425 let sentence_tokens = tokenize(sentence);
426 let tokens = select_content_tokens(&sentence_tokens, stopwords, filler, negations, min_len).tokens;
427 for window in [2usize, 3usize] {
428 if tokens.len() < window {
429 continue;
430 }
431 for idx in 0..=(tokens.len() - window) {
432 let phrase = tokens[idx..idx + window].to_vec();
433 let overlap = phrase.iter().filter(|t| anchor_set.contains(t.as_str())).count();
434 if overlap > 0 {
435 let score = (overlap as f32)
436 + (window as f32 * 0.1)
437 + (phrase.iter().filter(|t| is_negated_marker(t)).count() as f32 * 0.15);
438 let phrase_text = phrase
439 .iter()
440 .map(|token| token.strip_prefix("neg_").map(|t| format!("not {t}")).unwrap_or_else(|| token.clone()))
441 .collect::<Vec<_>>()
442 .join(" ");
443 scored.push((score, phrase_text));
444 }
445 }
446 }
447 }
448
449 scored.sort_by(|a, b| {
450 b.0.partial_cmp(&a.0)
451 .unwrap_or(std::cmp::Ordering::Equal)
452 .then_with(|| a.1.cmp(&b.1))
453 });
454
455 let mut seen = HashSet::new();
456 scored
457 .into_iter()
458 .filter_map(|(_, phrase)| {
459 if seen.insert(phrase.clone()) {
460 Some(phrase)
461 } else {
462 None
463 }
464 })
465 .take(limit)
466 .collect()
467}
468
469fn choose_anchor_topic(anchors: &[AnchorTerm], phrases: &[String]) -> String {
470 let Some(top) = anchors.first() else {
471 return "".to_string();
472 };
473
474 let top_display = top.term.strip_prefix("neg_").map(|t| format!("not {t}")).unwrap_or_else(|| top.term.clone());
475
476 if let Some(phrase) = phrases.iter().find(|phrase| phrase.contains(&top_display)) {
477 return phrase.clone();
478 }
479
480 top_display
481}
482
483fn build_key_points(
484 sentences: &[String],
485 anchors: &[AnchorTerm],
486 stopwords: &HashSet<String>,
487 filler: &HashSet<String>,
488 negations: &HashSet<String>,
489 min_len: usize,
490 max_points: usize,
491) -> Vec<String> {
492 if anchors.is_empty() {
493 return sentences.iter().take(max_points).cloned().collect();
494 }
495
496 let anchor_terms = anchors
497 .iter()
498 .map(|anchor| anchor.term.as_str())
499 .collect::<HashSet<_>>();
500
501 let mut scored = sentences
502 .iter()
503 .enumerate()
504 .map(|(idx, sentence)| {
505 let sentence_tokens = tokenize(sentence);
506 let tokens = select_content_tokens(&sentence_tokens, stopwords, filler, negations, min_len).tokens;
507 let hits = tokens
508 .iter()
509 .filter(|token| anchor_terms.contains(token.as_str()))
510 .count();
511 let score = (hits as f32 * 2.0) + (1.0 / ((idx + 1) as f32));
512 (score, idx, sentence.clone())
513 })
514 .collect::<Vec<_>>();
515
516 scored.sort_by(|a, b| {
517 b.0.partial_cmp(&a.0)
518 .unwrap_or(std::cmp::Ordering::Equal)
519 .then_with(|| a.1.cmp(&b.1))
520 });
521
522 scored
523 .into_iter()
524 .filter(|(score, _, _)| *score > 0.0)
525 .take(max_points)
526 .map(|(_, _, sentence)| sentence)
527 .collect()
528}
529
#[cfg(test)]
mod tests {
    use std::collections::HashSet;

    use super::{CompressionLexicons, ManualCompressionLexiconProvider, ManualCompressionService};
    use crate::domain::compression::ManualCompressionRequest;

    /// Provider that ignores the request and returns fixed one-word lexicons.
    struct CustomLexiconProvider;

    impl ManualCompressionLexiconProvider for CustomLexiconProvider {
        fn build_lexicons(&self, _request: &ManualCompressionRequest) -> CompressionLexicons {
            let single = |word: &str| -> HashSet<String> {
                std::iter::once(word.to_string()).collect()
            };

            CompressionLexicons {
                stopwords: single("retrieval"),
                fillers: single("fallback"),
                negations: single("hardly"),
            }
        }
    }

    /// Builds a request with default settings for the given text.
    fn request_for(text: &str) -> ManualCompressionRequest {
        ManualCompressionRequest {
            text: text.to_string(),
            ..Default::default()
        }
    }

    #[test]
    fn compressor_filters_filler_and_extracts_anchor() {
        let result = ManualCompressionService::new().execute(&request_for(
            "Basically we need retrieval fallback policy. Actually retrieval fallback keeps recall stable.",
        ));
        let terms: HashSet<&str> = result.anchor_terms.iter().map(|a| a.term.as_str()).collect();

        assert!(terms.contains("retrieval") || terms.contains("fallback"));
        assert!(!terms.contains("basically"));
        assert!(result.discarded_noise_ratio > 0.0);
    }

    #[test]
    fn compressor_is_deterministic_for_same_input() {
        let service = ManualCompressionService::new();
        let request = request_for("session graph recall and migration policy stability");

        let first = service.execute(&request);
        let second = service.execute(&request);

        assert_eq!(first.anchor_topic, second.anchor_topic);
        assert_eq!(first.anchor_terms.len(), second.anchor_terms.len());
        assert_eq!(first.key_points, second.key_points);
    }

    #[test]
    fn compressor_cancels_double_negatives() {
        let result = ManualCompressionService::new().execute(&request_for(
            "the rollout is not not stable and the policy is not brittle",
        ));
        let terms: HashSet<&str> = result.anchor_terms.iter().map(|a| a.term.as_str()).collect();

        assert!(terms.contains("stable"));
        assert!(terms.contains("neg_brittle"));
        assert!(!terms.contains("neg_stable"));
    }

    #[test]
    fn compressor_filters_expanded_filler_lexicon() {
        let result = ManualCompressionService::new().execute(&request_for(
            "Honestly the migration policy is quite stable and honestly deterministic",
        ));
        let terms: HashSet<&str> = result.anchor_terms.iter().map(|a| a.term.as_str()).collect();

        assert!(!(terms.contains("honestly") || terms.contains("quite")));
        assert!(result.diagnostics.filler_removed >= 2);
    }

    #[test]
    fn compressor_honors_request_lexicon_overrides() {
        let request = ManualCompressionRequest {
            stopwords_add: vec!["retrieval".to_string()],
            fillers_add: vec!["fallback".to_string()],
            negations_add: vec!["hardly".to_string()],
            ..request_for("hardly brittle retrieval retrieval fallback maybe")
        };

        let result = ManualCompressionService::new().execute(&request);
        let terms: HashSet<&str> = result.anchor_terms.iter().map(|a| a.term.as_str()).collect();

        assert!(!terms.contains("retrieval"));
        assert!(!terms.contains("fallback"));
        assert!(terms.contains("neg_brittle"));
    }

    #[test]
    fn compressor_supports_custom_trait_provider() {
        let service = ManualCompressionService::with_lexicon_provider(CustomLexiconProvider);

        let result = service.execute(&request_for("hardly brittle retrieval retrieval fallback"));
        let terms: HashSet<&str> = result.anchor_terms.iter().map(|a| a.term.as_str()).collect();

        assert!(!terms.contains("retrieval"));
        assert!(!terms.contains("fallback"));
        assert!(terms.contains("neg_brittle"));
    }
}