Skip to main content

infiniloom_engine/document/
types.rs

//! Core type definitions for document ingestion.
//!
//! Documents have a fundamentally different structure than code:
//! - Code: File → Symbols (functions, classes)
//! - Documents: Document → Sections → ContentBlocks (paragraphs, tables, lists)
6
7use serde::{Deserialize, Serialize};
8use std::collections::BTreeMap;
9use std::path::PathBuf;
10
11use crate::tokenizer::TokenCounts;
12
13/// A parsed document ready for LLM-optimized output.
14#[derive(Debug, Clone, Serialize, Deserialize)]
15pub struct Document {
16    /// Document title (extracted or inferred)
17    pub title: Option<String>,
18    /// Source file path
19    pub source: PathBuf,
20    /// Original format
21    pub format: DocumentFormat,
22    /// Document metadata
23    pub metadata: DocumentMetadata,
24    /// Hierarchical content structure
25    pub sections: Vec<Section>,
26    /// Token counts across models
27    pub token_count: TokenCounts,
28}
29
30impl Document {
31    /// Create a new empty document from a source path.
32    pub fn new(source: impl Into<PathBuf>, format: DocumentFormat) -> Self {
33        Self {
34            title: None,
35            source: source.into(),
36            format,
37            metadata: DocumentMetadata::default(),
38            sections: Vec::new(),
39            token_count: TokenCounts::default(),
40        }
41    }
42
43    /// Total number of sections (including nested).
44    pub fn section_count(&self) -> usize {
45        fn count(sections: &[Section]) -> usize {
46            sections.iter().map(|s| 1 + count(&s.children)).sum()
47        }
48        count(&self.sections)
49    }
50
51    /// Total number of content blocks across all sections.
52    pub fn block_count(&self) -> usize {
53        fn count(sections: &[Section]) -> usize {
54            sections
55                .iter()
56                .map(|s| s.content.len() + count(&s.children))
57                .sum()
58        }
59        count(&self.sections)
60    }
61
62    /// Flatten all text content into a single string.
63    pub fn full_text(&self) -> String {
64        let mut buf = String::new();
65        fn collect(sections: &[Section], buf: &mut String) {
66            for s in sections {
67                if let Some(title) = &s.title {
68                    buf.push_str(title);
69                    buf.push('\n');
70                }
71                for block in &s.content {
72                    buf.push_str(&block.text());
73                    buf.push('\n');
74                }
75                collect(&s.children, buf);
76            }
77        }
78        collect(&self.sections, &mut buf);
79        buf
80    }
81}
82
83/// Supported document formats.
84#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
85pub enum DocumentFormat {
86    Docx,
87    Html,
88    Markdown,
89    PlainText,
90    Csv,
91    Xlsx,
92    Pdf,
93}
94
95impl DocumentFormat {
96    /// Detect format from file extension.
97    pub fn from_extension(ext: &str) -> Option<Self> {
98        match ext.to_lowercase().as_str() {
99            "docx" => Some(Self::Docx),
100            "html" | "htm" | "xhtml" => Some(Self::Html),
101            "md" | "markdown" | "mdx" => Some(Self::Markdown),
102            "txt" | "text" | "log" | "rst" => Some(Self::PlainText),
103            "csv" | "tsv" => Some(Self::Csv),
104            "xlsx" | "xls" => Some(Self::Xlsx),
105            "pdf" => Some(Self::Pdf),
106            _ => None,
107        }
108    }
109
110    /// Human-readable format name.
111    pub fn name(&self) -> &'static str {
112        match self {
113            Self::Docx => "DOCX",
114            Self::Html => "HTML",
115            Self::Markdown => "Markdown",
116            Self::PlainText => "Plain Text",
117            Self::Csv => "CSV",
118            Self::Xlsx => "XLSX",
119            Self::Pdf => "PDF",
120        }
121    }
122}
123
124/// Document metadata.
125#[derive(Debug, Clone, Default, Serialize, Deserialize)]
126pub struct DocumentMetadata {
127    pub title: Option<String>,
128    pub author: Option<String>,
129    pub created: Option<String>,
130    pub modified: Option<String>,
131    pub subject: Option<String>,
132    pub keywords: Vec<String>,
133    /// Document version/revision (compliance)
134    pub version: Option<String>,
135    /// Effective date (compliance)
136    pub effective_date: Option<String>,
137    /// Document classification (e.g., "Internal", "Confidential")
138    pub classification: Option<String>,
139    /// Total pages (if applicable)
140    pub pages: Option<u32>,
141    /// Custom key-value metadata
142    pub custom: BTreeMap<String, String>,
143}
144
145/// A document section with optional heading and nested children.
146#[derive(Debug, Clone, Serialize, Deserialize)]
147pub struct Section {
148    /// Section ID for cross-referencing
149    pub id: Option<String>,
150    /// Heading level (1-6, 0 for no heading)
151    pub level: u8,
152    /// Section title/heading text
153    pub title: Option<String>,
154    /// Section number (e.g., "3.2.1")
155    pub number: Option<String>,
156    /// Content blocks within this section
157    pub content: Vec<ContentBlock>,
158    /// Nested subsections
159    pub children: Vec<Section>,
160    /// Information density score (set by distillation pipeline)
161    pub importance: f32,
162}
163
164impl Section {
165    /// Create a new section with a heading.
166    pub fn new(level: u8, title: impl Into<String>) -> Self {
167        Self {
168            id: None,
169            level,
170            title: Some(title.into()),
171            number: None,
172            content: Vec::new(),
173            children: Vec::new(),
174            importance: 0.5,
175        }
176    }
177
178    /// Create a root section (no heading) to hold top-level content.
179    pub fn root() -> Self {
180        Self {
181            id: None,
182            level: 0,
183            title: None,
184            number: None,
185            content: Vec::new(),
186            children: Vec::new(),
187            importance: 0.5,
188        }
189    }
190}
191
192/// A block of content within a section.
193#[derive(Debug, Clone, Serialize, Deserialize)]
194pub enum ContentBlock {
195    /// A paragraph of text
196    Paragraph(String),
197    /// A table with optional caption
198    Table(Table),
199    /// An ordered or unordered list
200    List(List),
201    /// A code block or preformatted text
202    CodeBlock(CodeBlock),
203    /// A definition (term + definition) — common in compliance docs
204    Definition(Definition),
205    /// A blockquote or callout
206    Blockquote(String),
207    /// A cross-reference to another section
208    CrossReference(CrossRef),
209    /// A horizontal rule / thematic break
210    ThematicBreak,
211    /// Raw content that couldn't be classified
212    Raw(String),
213}
214
215impl ContentBlock {
216    /// Extract plain text content from this block.
217    pub fn text(&self) -> String {
218        match self {
219            Self::Paragraph(t) | Self::Blockquote(t) | Self::Raw(t) => t.clone(),
220            Self::Table(t) => t.to_text(),
221            Self::List(l) => l.to_text(),
222            Self::CodeBlock(c) => c.content.clone(),
223            Self::Definition(d) => format!("{}: {}", d.term, d.definition),
224            Self::CrossReference(r) => r.display_text.clone(),
225            Self::ThematicBreak => String::new(),
226        }
227    }
228}
229
230/// A table with headers and rows.
231#[derive(Debug, Clone, Serialize, Deserialize)]
232pub struct Table {
233    pub caption: Option<String>,
234    pub headers: Vec<String>,
235    pub rows: Vec<Vec<String>>,
236    pub alignments: Vec<Alignment>,
237}
238
239impl Table {
240    pub fn to_text(&self) -> String {
241        let mut buf = String::new();
242        if let Some(cap) = &self.caption {
243            buf.push_str(cap);
244            buf.push('\n');
245        }
246        if !self.headers.is_empty() {
247            buf.push_str(&self.headers.join(" | "));
248            buf.push('\n');
249        }
250        for row in &self.rows {
251            buf.push_str(&row.join(" | "));
252            buf.push('\n');
253        }
254        buf
255    }
256}
257
258/// Column alignment in a table.
259#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
260pub enum Alignment {
261    Left,
262    Center,
263    Right,
264    None,
265}
266
267/// A list (ordered or unordered, possibly nested).
268#[derive(Debug, Clone, Serialize, Deserialize)]
269pub struct List {
270    pub ordered: bool,
271    pub items: Vec<ListItem>,
272}
273
274impl List {
275    pub fn to_text(&self) -> String {
276        let mut buf = String::new();
277        for (i, item) in self.items.iter().enumerate() {
278            if self.ordered {
279                buf.push_str(&format!("{}. {}\n", i + 1, item.text));
280            } else {
281                buf.push_str(&format!("- {}\n", item.text));
282            }
283            if let Some(sub) = &item.children {
284                for line in sub.to_text().lines() {
285                    buf.push_str(&format!("  {}\n", line));
286                }
287            }
288        }
289        buf
290    }
291}
292
293/// A single list item.
294#[derive(Debug, Clone, Serialize, Deserialize)]
295pub struct ListItem {
296    pub text: String,
297    pub children: Option<List>,
298}
299
300/// A code block with optional language.
301#[derive(Debug, Clone, Serialize, Deserialize)]
302pub struct CodeBlock {
303    pub language: Option<String>,
304    pub content: String,
305}
306
307/// A definition (term + explanation).
308#[derive(Debug, Clone, Serialize, Deserialize)]
309pub struct Definition {
310    pub term: String,
311    pub definition: String,
312}
313
314/// A cross-reference to another section or document.
315#[derive(Debug, Clone, Serialize, Deserialize)]
316pub struct CrossRef {
317    pub target_id: String,
318    pub display_text: String,
319    pub internal: bool,
320}
321
322/// Content classification for distillation scoring.
323#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
324pub enum ContentClass {
325    /// Normative requirement (SHALL/MUST)
326    Requirement,
327    /// Informative guidance (NOTE/EXAMPLE)
328    Informative,
329    /// Definition of a term
330    DefinitionText,
331    /// Reference to external standard
332    ExternalReference,
333    /// Data-bearing content (numbers, tables, thresholds)
334    Data,
335    /// Boilerplate (standard disclaimers, copyright)
336    Boilerplate,
337    /// General text
338    General,
339}
340
341/// Distillation level controlling how aggressively content is compressed.
342#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
343pub enum DistillationLevel {
344    /// No distillation — raw conversion only
345    None,
346    /// Strip only — remove zero-value content (safe for legal)
347    Minimal,
348    /// Strip + deduplicate (default)
349    #[default]
350    Balanced,
351    /// Strip + deduplicate + language compression
352    Aggressive,
353    /// All stages including scoring and attention arrangement
354    Full,
355}
356
357impl DistillationLevel {
358    pub fn parse_name(s: &str) -> Option<Self> {
359        match s.to_lowercase().as_str() {
360            "none" => Some(Self::None),
361            "minimal" => Some(Self::Minimal),
362            "balanced" => Some(Self::Balanced),
363            "aggressive" => Some(Self::Aggressive),
364            "full" => Some(Self::Full),
365            _ => None,
366        }
367    }
368
369    pub fn name(&self) -> &'static str {
370        match self {
371            Self::None => "none",
372            Self::Minimal => "minimal",
373            Self::Balanced => "balanced",
374            Self::Aggressive => "aggressive",
375            Self::Full => "full",
376        }
377    }
378}
379
#[cfg(test)]
mod tests {
    //! Unit tests for the document type helpers: format detection, tree
    //! counting, text flattening, and distillation-level name handling.

    use super::*;

    #[test]
    fn test_document_format_from_extension() {
        assert_eq!(DocumentFormat::from_extension("md"), Some(DocumentFormat::Markdown));
        assert_eq!(DocumentFormat::from_extension("docx"), Some(DocumentFormat::Docx));
        assert_eq!(DocumentFormat::from_extension("HTML"), Some(DocumentFormat::Html));
        assert_eq!(DocumentFormat::from_extension("csv"), Some(DocumentFormat::Csv));
        assert_eq!(DocumentFormat::from_extension("rs"), None);

        // Alias extensions map onto the same format as their primary spelling.
        assert_eq!(DocumentFormat::from_extension("htm"), Some(DocumentFormat::Html));
        assert_eq!(DocumentFormat::from_extension("tsv"), Some(DocumentFormat::Csv));
        assert_eq!(DocumentFormat::from_extension("xls"), Some(DocumentFormat::Xlsx));
        assert_eq!(DocumentFormat::from_extension("PDF"), Some(DocumentFormat::Pdf));
        assert_eq!(DocumentFormat::from_extension(""), None);
    }

    #[test]
    fn test_document_format_name() {
        assert_eq!(DocumentFormat::Docx.name(), "DOCX");
        assert_eq!(DocumentFormat::PlainText.name(), "Plain Text");
        assert_eq!(DocumentFormat::Markdown.name(), "Markdown");
    }

    #[test]
    fn test_document_section_count() {
        let mut doc = Document::new("/tmp/test.md", DocumentFormat::Markdown);
        assert_eq!(doc.section_count(), 0);

        let mut s1 = Section::new(1, "Intro");
        s1.children.push(Section::new(2, "Sub"));
        doc.sections.push(s1);
        doc.sections.push(Section::new(1, "Conclusion"));
        assert_eq!(doc.section_count(), 3);
    }

    #[test]
    fn test_document_block_count_and_full_text() {
        let mut doc = Document::new("/tmp/test.md", DocumentFormat::Markdown);
        assert_eq!(doc.block_count(), 0);
        assert_eq!(doc.full_text(), "");

        let mut intro = Section::new(1, "Intro");
        intro.content.push(ContentBlock::Paragraph("Hello".into()));
        let mut sub = Section::new(2, "Sub");
        sub.content.push(ContentBlock::ThematicBreak);
        sub.content.push(ContentBlock::Raw("tail".into()));
        intro.children.push(sub);
        doc.sections.push(intro);

        // Blocks of nested sections are included in the total.
        assert_eq!(doc.block_count(), 3);
        // Depth-first order; the thematic break contributes an empty line.
        assert_eq!(doc.full_text(), "Intro\nHello\nSub\n\ntail\n");
    }

    #[test]
    fn test_section_constructors() {
        let headed = Section::new(2, "Scope");
        assert_eq!(headed.level, 2);
        assert_eq!(headed.title.as_deref(), Some("Scope"));
        assert!(headed.content.is_empty());
        assert!(headed.children.is_empty());

        let root = Section::root();
        assert_eq!(root.level, 0);
        assert!(root.title.is_none());
    }

    #[test]
    fn test_content_block_text() {
        let p = ContentBlock::Paragraph("Hello world".into());
        assert_eq!(p.text(), "Hello world");

        let d = ContentBlock::Definition(Definition {
            term: "LLM".into(),
            definition: "Large Language Model".into(),
        });
        assert_eq!(d.text(), "LLM: Large Language Model");

        let c = ContentBlock::CodeBlock(CodeBlock {
            language: Some("rust".into()),
            content: "fn main() {}".into(),
        });
        assert_eq!(c.text(), "fn main() {}");

        let r = ContentBlock::CrossReference(CrossRef {
            target_id: "sec-3".into(),
            display_text: "Section 3".into(),
            internal: true,
        });
        assert_eq!(r.text(), "Section 3");

        assert_eq!(ContentBlock::ThematicBreak.text(), "");
    }

    #[test]
    fn test_distillation_level() {
        assert_eq!(DistillationLevel::parse_name("balanced"), Some(DistillationLevel::Balanced));
        assert_eq!(DistillationLevel::parse_name("FULL"), Some(DistillationLevel::Full));
        assert_eq!(DistillationLevel::parse_name("unknown"), None);
        assert_eq!(DistillationLevel::default(), DistillationLevel::Balanced);
    }

    #[test]
    fn test_distillation_level_name_round_trip() {
        let levels = [
            DistillationLevel::None,
            DistillationLevel::Minimal,
            DistillationLevel::Balanced,
            DistillationLevel::Aggressive,
            DistillationLevel::Full,
        ];
        for level in levels.iter().copied() {
            assert_eq!(DistillationLevel::parse_name(level.name()), Some(level));
        }
    }

    #[test]
    fn test_table_to_text() {
        let t = Table {
            caption: Some("Access Matrix".into()),
            headers: vec!["Role".into(), "Access".into()],
            rows: vec![vec!["Admin".into(), "Full".into()]],
            alignments: vec![],
        };
        let text = t.to_text();
        assert!(text.contains("Access Matrix"));
        assert!(text.contains("Role | Access"));
        assert!(text.contains("Admin | Full"));
        assert_eq!(text, "Access Matrix\nRole | Access\nAdmin | Full\n");
    }

    #[test]
    fn test_table_to_text_without_caption_or_headers() {
        let t = Table {
            caption: None,
            headers: vec![],
            rows: vec![vec!["a".into(), "b".into()]],
            alignments: vec![],
        };
        assert_eq!(t.to_text(), "a | b\n");
    }

    #[test]
    fn test_list_to_text() {
        let l = List {
            ordered: true,
            items: vec![
                ListItem { text: "First".into(), children: None },
                ListItem { text: "Second".into(), children: None },
            ],
        };
        let text = l.to_text();
        assert!(text.contains("1. First"));
        assert!(text.contains("2. Second"));
    }

    #[test]
    fn test_nested_unordered_list_to_text() {
        let nested = List {
            ordered: true,
            items: vec![ListItem { text: "Child".into(), children: None }],
        };
        let l = List {
            ordered: false,
            items: vec![
                ListItem { text: "Parent".into(), children: Some(nested) },
                ListItem { text: "Sibling".into(), children: None },
            ],
        };
        // Nested lines are indented by two spaces under their parent item.
        assert_eq!(l.to_text(), "- Parent\n  1. Child\n- Sibling\n");
    }
}