Skip to main content

infiniloom_engine/document/distillation/
mod.rs

1//! Content distillation pipeline for LLM attention and token optimization.
2//!
3//! Research shows that removing noise from documents **improves** LLM accuracy
4//! by 17-21% (LLMLingua, Microsoft) — this is not just about saving tokens.
5//!
6//! The pipeline has 5 stages:
7//! 1. **Strip**: Remove zero-value content (page numbers, watermarks, boilerplate)
8//! 2. **Deduplicate**: Remove redundant content (TOC before body, repeated definitions)
9//! 3. **Compress**: Tighten language (filler phrases, hedging, verbose patterns)
10//! 4. **Score**: Rank sections by information density
11//! 5. **Arrange**: Place high-value content where LLMs attend (start/end, not middle)
12
13pub mod arrange;
14pub mod compress;
15pub mod patterns;
16pub mod score;
17pub mod strip;
18
19use crate::document::types::{DistillationLevel, Document};
20
21/// Run the distillation pipeline on a parsed document.
22pub fn distill(doc: &mut Document, level: DistillationLevel) {
23    match level {
24        DistillationLevel::None => {},
25        DistillationLevel::Minimal => {
26            strip::strip_document(doc);
27        },
28        DistillationLevel::Balanced => {
29            strip::strip_document(doc);
30            strip::deduplicate(doc);
31        },
32        DistillationLevel::Aggressive => {
33            strip::strip_document(doc);
34            strip::deduplicate(doc);
35            compress::compress_document(doc);
36        },
37        DistillationLevel::Full => {
38            strip::strip_document(doc);
39            strip::deduplicate(doc);
40            compress::compress_document(doc);
41            score::score_document(doc);
42            arrange::arrange_document(doc);
43        },
44    }
45}
46
/// Counters describing the effect of a distillation run.
///
/// NOTE(review): nothing in this module constructs or fills in this struct;
/// the per-field descriptions below are inferred from the field names and
/// should be confirmed against the code that populates them.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct DistillationStats {
    /// Presumably the number of content blocks before distillation — TODO confirm.
    pub original_blocks: usize,
    /// Presumably the number of content blocks left after distillation — TODO confirm.
    pub remaining_blocks: usize,
    /// Presumably the number of sections dropped by the pipeline — TODO confirm.
    pub sections_removed: usize,
    /// Presumably the number of filler phrases rewritten during compression — TODO confirm.
    pub filler_replacements: usize,
}
54
#[cfg(test)]
mod tests {
    use super::*;
    use crate::document::types::*;

    /// Build a level-1 section whose only content is one paragraph.
    fn single_paragraph_section(title: &'static str, text: &'static str) -> Section {
        let mut section = Section::new(1, title);
        section.content.push(ContentBlock::Paragraph(text.into()));
        section
    }

    /// Two-section Markdown fixture: an "Introduction" that opens with a
    /// filler phrase, followed by a "Requirements" section.
    fn make_test_doc() -> Document {
        let mut doc = Document::new("/tmp/test.md", DocumentFormat::Markdown);
        doc.sections.push(single_paragraph_section(
            "Introduction",
            "It is important to note that this document establishes the policy.",
        ));
        doc.sections.push(single_paragraph_section(
            "Requirements",
            "All users MUST authenticate using MFA.",
        ));
        doc
    }

    #[test]
    fn test_distill_none() {
        let mut doc = make_test_doc();
        distill(&mut doc, DistillationLevel::None);
        // Both fixture sections must still be there.
        assert_eq!(doc.sections.len(), 2);
    }

    #[test]
    fn test_distill_aggressive() {
        let mut doc = make_test_doc();
        distill(&mut doc, DistillationLevel::Aggressive);
        // The filler phrase from the introduction must be gone from the text.
        assert!(!doc.full_text().contains("It is important to note that"));
    }

    #[test]
    fn test_distill_full() {
        let mut doc = make_test_doc();
        distill(&mut doc, DistillationLevel::Full);
        // Only presence is checked here: the Requirements section must still
        // exist after the full pipeline (ranking itself is not asserted).
        let has_requirements = doc
            .sections
            .iter()
            .any(|section| section.title.as_deref() == Some("Requirements"));
        assert!(has_requirements);
    }
}