1use serde::{Deserialize, Serialize};
8use std::collections::BTreeMap;
9use std::path::PathBuf;
10
11use crate::tokenizer::TokenCounts;
12
/// A document broken down into a tree of sections, together with where it
/// came from and descriptive metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Document {
    /// Document title, if known. `Document::new` leaves this as `None`.
    pub title: Option<String>,
    /// Path the document was created from (the `source` argument of `Document::new`).
    pub source: PathBuf,
    /// Format of the document.
    pub format: DocumentFormat,
    /// Descriptive metadata (author, dates, keywords, ...).
    pub metadata: DocumentMetadata,
    /// Top-level sections; every section may carry nested child sections.
    pub sections: Vec<Section>,
    /// Token statistics for the document.
    // NOTE(review): `TokenCounts` lives in `crate::tokenizer`; what exactly it counts
    // and who fills it in is not visible from this file.
    pub token_count: TokenCounts,
}
29
30impl Document {
31 pub fn new(source: impl Into<PathBuf>, format: DocumentFormat) -> Self {
33 Self {
34 title: None,
35 source: source.into(),
36 format,
37 metadata: DocumentMetadata::default(),
38 sections: Vec::new(),
39 token_count: TokenCounts::default(),
40 }
41 }
42
43 pub fn section_count(&self) -> usize {
45 fn count(sections: &[Section]) -> usize {
46 sections.iter().map(|s| 1 + count(&s.children)).sum()
47 }
48 count(&self.sections)
49 }
50
51 pub fn block_count(&self) -> usize {
53 fn count(sections: &[Section]) -> usize {
54 sections
55 .iter()
56 .map(|s| s.content.len() + count(&s.children))
57 .sum()
58 }
59 count(&self.sections)
60 }
61
62 pub fn full_text(&self) -> String {
64 let mut buf = String::new();
65 fn collect(sections: &[Section], buf: &mut String) {
66 for s in sections {
67 if let Some(title) = &s.title {
68 buf.push_str(title);
69 buf.push('\n');
70 }
71 for block in &s.content {
72 buf.push_str(&block.text());
73 buf.push('\n');
74 }
75 collect(&s.children, buf);
76 }
77 }
78 collect(&self.sections, &mut buf);
79 buf
80 }
81}
82
/// Document formats known to this module.
///
/// The extensions listed per variant are the ones `DocumentFormat::from_extension`
/// maps to it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum DocumentFormat {
    /// DOCX (`docx`).
    Docx,
    /// HTML (`html`, `htm`, `xhtml`).
    Html,
    /// Markdown (`md`, `markdown`, `mdx`).
    Markdown,
    /// Plain text (`txt`, `text`, `log`, `rst`).
    PlainText,
    /// Delimited text (`csv`, `tsv`).
    Csv,
    /// Spreadsheet (`xlsx`, `xls`).
    Xlsx,
    /// PDF (`pdf`).
    Pdf,
}
94
95impl DocumentFormat {
96 pub fn from_extension(ext: &str) -> Option<Self> {
98 match ext.to_lowercase().as_str() {
99 "docx" => Some(Self::Docx),
100 "html" | "htm" | "xhtml" => Some(Self::Html),
101 "md" | "markdown" | "mdx" => Some(Self::Markdown),
102 "txt" | "text" | "log" | "rst" => Some(Self::PlainText),
103 "csv" | "tsv" => Some(Self::Csv),
104 "xlsx" | "xls" => Some(Self::Xlsx),
105 "pdf" => Some(Self::Pdf),
106 _ => None,
107 }
108 }
109
110 pub fn name(&self) -> &'static str {
112 match self {
113 Self::Docx => "DOCX",
114 Self::Html => "HTML",
115 Self::Markdown => "Markdown",
116 Self::PlainText => "Plain Text",
117 Self::Csv => "CSV",
118 Self::Xlsx => "XLSX",
119 Self::Pdf => "PDF",
120 }
121 }
122}
123
/// Descriptive metadata attached to a [`Document`].
///
/// Every field is optional or may be empty; `Default` yields a value with no
/// metadata at all.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct DocumentMetadata {
    /// Title recorded in the metadata, if any.
    pub title: Option<String>,
    /// Author, if any.
    pub author: Option<String>,
    /// Creation date as text; this type does not enforce a date format.
    pub created: Option<String>,
    /// Last-modified date as text; this type does not enforce a date format.
    pub modified: Option<String>,
    /// Subject, if any.
    pub subject: Option<String>,
    /// Keywords; empty when none are known.
    pub keywords: Vec<String>,
    /// Version label, if any.
    pub version: Option<String>,
    /// Effective date as text, if any.
    // NOTE(review): presumably the date the document's content takes effect — confirm.
    pub effective_date: Option<String>,
    /// Classification label, if any.
    // NOTE(review): the set of expected values is not visible here.
    pub classification: Option<String>,
    /// Page count, if known.
    pub pages: Option<u32>,
    /// Additional key/value metadata; the `BTreeMap` keeps keys in sorted order.
    pub custom: BTreeMap<String, String>,
}
144
/// One node of a document's section tree: an optional heading, the content
/// blocks directly under it, and any nested subsections.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Section {
    /// Optional identifier for the section.
    // NOTE(review): presumably what `CrossRef::target_id` refers to — confirm.
    pub id: Option<String>,
    /// Heading level; `Section::root()` uses 0.
    pub level: u8,
    /// Heading text, if the section has one.
    pub title: Option<String>,
    /// Section number as text, if any.
    pub number: Option<String>,
    /// Content blocks that belong directly to this section (children excluded).
    pub content: Vec<ContentBlock>,
    /// Nested subsections.
    pub children: Vec<Section>,
    /// Importance score; both constructors initialise it to 0.5.
    // NOTE(review): valid range and who updates it are not visible here.
    pub importance: f32,
}
163
164impl Section {
165 pub fn new(level: u8, title: impl Into<String>) -> Self {
167 Self {
168 id: None,
169 level,
170 title: Some(title.into()),
171 number: None,
172 content: Vec::new(),
173 children: Vec::new(),
174 importance: 0.5,
175 }
176 }
177
178 pub fn root() -> Self {
180 Self {
181 id: None,
182 level: 0,
183 title: None,
184 number: None,
185 content: Vec::new(),
186 children: Vec::new(),
187 importance: 0.5,
188 }
189 }
190}
191
/// A single piece of content inside a [`Section`].
///
/// `ContentBlock::text` gives the plain-text rendering of each variant.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ContentBlock {
    /// A paragraph of text.
    Paragraph(String),
    /// A table.
    Table(Table),
    /// An ordered or unordered list.
    List(List),
    /// A block of code with an optional language tag.
    CodeBlock(CodeBlock),
    /// A term together with its definition.
    Definition(Definition),
    /// Quoted text.
    Blockquote(String),
    /// A reference to another location; rendered as its display text.
    CrossReference(CrossRef),
    /// A thematic break; renders as empty text.
    ThematicBreak,
    /// Text carried through as-is.
    // NOTE(review): presumably content the parser could not classify — confirm.
    Raw(String),
}
214
215impl ContentBlock {
216 pub fn text(&self) -> String {
218 match self {
219 Self::Paragraph(t) | Self::Blockquote(t) | Self::Raw(t) => t.clone(),
220 Self::Table(t) => t.to_text(),
221 Self::List(l) => l.to_text(),
222 Self::CodeBlock(c) => c.content.clone(),
223 Self::Definition(d) => format!("{}: {}", d.term, d.definition),
224 Self::CrossReference(r) => r.display_text.clone(),
225 Self::ThematicBreak => String::new(),
226 }
227 }
228}
229
/// A table: optional caption, header cells, and rows of cells.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Table {
    /// Caption, if any; `to_text` emits it as the first line.
    pub caption: Option<String>,
    /// Header cells; may be empty, in which case `to_text` emits no header line.
    pub headers: Vec<String>,
    /// Data rows, each a list of cell strings.
    pub rows: Vec<Vec<String>>,
    /// Column alignments; not used by `to_text`.
    // NOTE(review): presumably one entry per column — confirm against the producers.
    pub alignments: Vec<Alignment>,
}
238
239impl Table {
240 pub fn to_text(&self) -> String {
241 let mut buf = String::new();
242 if let Some(cap) = &self.caption {
243 buf.push_str(cap);
244 buf.push('\n');
245 }
246 if !self.headers.is_empty() {
247 buf.push_str(&self.headers.join(" | "));
248 buf.push('\n');
249 }
250 for row in &self.rows {
251 buf.push_str(&row.join(" | "));
252 buf.push('\n');
253 }
254 buf
255 }
256}
257
/// Alignment values stored in `Table::alignments`.
// NOTE(review): presumably one value per table column — confirm against the producers.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Alignment {
    /// Left-aligned.
    Left,
    /// Centered.
    Center,
    /// Right-aligned.
    Right,
    /// No alignment specified.
    None,
}
266
/// A list of items, each of which may carry a nested sub-list.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct List {
    /// When `true`, `to_text` numbers the items (`1.`, `2.`, ...); otherwise
    /// each item is prefixed with `-`.
    pub ordered: bool,
    /// The list items, in order.
    pub items: Vec<ListItem>,
}
273
274impl List {
275 pub fn to_text(&self) -> String {
276 let mut buf = String::new();
277 for (i, item) in self.items.iter().enumerate() {
278 if self.ordered {
279 buf.push_str(&format!("{}. {}\n", i + 1, item.text));
280 } else {
281 buf.push_str(&format!("- {}\n", item.text));
282 }
283 if let Some(sub) = &item.children {
284 for line in sub.to_text().lines() {
285 buf.push_str(&format!(" {}\n", line));
286 }
287 }
288 }
289 buf
290 }
291}
292
/// A single entry of a [`List`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ListItem {
    /// The item's text.
    pub text: String,
    /// Optional nested list rendered beneath this item.
    pub children: Option<List>,
}
299
/// A block of code.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeBlock {
    /// Language tag, if one was given; not part of the block's text rendering.
    pub language: Option<String>,
    /// The code itself; this is what `ContentBlock::text` returns for a code block.
    pub content: String,
}
306
/// A term and its definition; `ContentBlock::text` renders it as `term: definition`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Definition {
    /// The term being defined.
    pub term: String,
    /// The definition text.
    pub definition: String,
}
313
/// A reference from the content to some other location.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrossRef {
    /// Identifier of the referenced target.
    // NOTE(review): presumably matches a `Section::id` for internal references — confirm.
    pub target_id: String,
    /// Text shown for the reference; this is what `ContentBlock::text` returns.
    pub display_text: String,
    /// Whether the reference is internal.
    // NOTE(review): presumably "points inside the same document" — confirm with the producers.
    pub internal: bool,
}
321
/// Classification labels for content.
// NOTE(review): nothing in the visible part of this file uses this enum; the
// per-variant notes below are inferred from the names only and should be
// confirmed against the code that assigns these classes.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum ContentClass {
    /// Presumably normative requirement text.
    Requirement,
    /// Presumably explanatory, non-normative text.
    Informative,
    /// Presumably text that defines a term.
    DefinitionText,
    /// Presumably a reference to an external source.
    ExternalReference,
    /// Presumably tabular or other data content.
    Data,
    /// Presumably repeated boilerplate.
    Boilerplate,
    /// Presumably content that fits none of the other classes.
    General,
}
340
/// Named distillation levels; the default is [`DistillationLevel::Balanced`].
///
/// The textual names are given by `DistillationLevel::name` and accepted by
/// `DistillationLevel::parse_name`.
// NOTE(review): what each level actually does is decided by the code that
// consumes this enum and is not visible here; the variants presumably run from
// least to most aggressive — confirm.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
pub enum DistillationLevel {
    /// Named `"none"`.
    None,
    /// Named `"minimal"`.
    Minimal,
    /// Named `"balanced"`; the `Default` value.
    #[default]
    Balanced,
    /// Named `"aggressive"`.
    Aggressive,
    /// Named `"full"`.
    Full,
}
356
357impl DistillationLevel {
358 pub fn parse_name(s: &str) -> Option<Self> {
359 match s.to_lowercase().as_str() {
360 "none" => Some(Self::None),
361 "minimal" => Some(Self::Minimal),
362 "balanced" => Some(Self::Balanced),
363 "aggressive" => Some(Self::Aggressive),
364 "full" => Some(Self::Full),
365 _ => None,
366 }
367 }
368
369 pub fn name(&self) -> &'static str {
370 match self {
371 Self::None => "none",
372 Self::Minimal => "minimal",
373 Self::Balanced => "balanced",
374 Self::Aggressive => "aggressive",
375 Self::Full => "full",
376 }
377 }
378}
379
#[cfg(test)]
mod tests {
    //! Unit tests for the document model: format detection, tree counting,
    //! and the plain-text renderings of blocks, tables and lists.

    use super::*;

    #[test]
    fn test_document_format_from_extension() {
        assert_eq!(DocumentFormat::from_extension("md"), Some(DocumentFormat::Markdown));
        assert_eq!(DocumentFormat::from_extension("docx"), Some(DocumentFormat::Docx));
        assert_eq!(DocumentFormat::from_extension("HTML"), Some(DocumentFormat::Html));
        assert_eq!(DocumentFormat::from_extension("csv"), Some(DocumentFormat::Csv));
        assert_eq!(DocumentFormat::from_extension("rs"), None);
    }

    #[test]
    fn test_document_format_name() {
        assert_eq!(DocumentFormat::Pdf.name(), "PDF");
        assert_eq!(DocumentFormat::Markdown.name(), "Markdown");
    }

    #[test]
    fn test_document_section_count() {
        let mut doc = Document::new("/tmp/test.md", DocumentFormat::Markdown);
        let mut s1 = Section::new(1, "Intro");
        s1.children.push(Section::new(2, "Sub"));
        doc.sections.push(s1);
        doc.sections.push(Section::new(1, "Conclusion"));
        assert_eq!(doc.section_count(), 3);
    }

    #[test]
    fn test_empty_document() {
        let doc = Document::new("/tmp/empty.md", DocumentFormat::Markdown);
        assert_eq!(doc.section_count(), 0);
        assert_eq!(doc.block_count(), 0);
        assert_eq!(doc.full_text(), "");
    }

    #[test]
    fn test_document_block_count() {
        let mut doc = Document::new("/tmp/test.md", DocumentFormat::Markdown);
        let mut intro = Section::new(1, "Intro");
        intro.content.push(ContentBlock::Paragraph("Hello".into()));
        intro.content.push(ContentBlock::ThematicBreak);
        let mut sub = Section::new(2, "Sub");
        sub.content.push(ContentBlock::Paragraph("World".into()));
        intro.children.push(sub);
        doc.sections.push(intro);
        // Blocks of nested sections are included in the total.
        assert_eq!(doc.block_count(), 3);
    }

    #[test]
    fn test_document_full_text() {
        let mut doc = Document::new("/tmp/test.md", DocumentFormat::Markdown);
        let mut intro = Section::new(1, "Intro");
        intro.content.push(ContentBlock::Paragraph("Hello".into()));
        let mut sub = Section::new(2, "Sub");
        sub.content.push(ContentBlock::Paragraph("World".into()));
        intro.children.push(sub);
        doc.sections.push(intro);
        doc.sections.push(Section::new(1, "Conclusion"));
        // Titles and blocks appear in document order, children before later siblings.
        assert_eq!(doc.full_text(), "Intro\nHello\nSub\nWorld\nConclusion\n");
    }

    #[test]
    fn test_content_block_text() {
        let p = ContentBlock::Paragraph("Hello world".into());
        assert_eq!(p.text(), "Hello world");

        let d = ContentBlock::Definition(Definition {
            term: "LLM".into(),
            definition: "Large Language Model".into(),
        });
        assert_eq!(d.text(), "LLM: Large Language Model");

        assert_eq!(ContentBlock::ThematicBreak.text(), "");
    }

    #[test]
    fn test_distillation_level() {
        assert_eq!(DistillationLevel::parse_name("balanced"), Some(DistillationLevel::Balanced));
        assert_eq!(DistillationLevel::parse_name("FULL"), Some(DistillationLevel::Full));
        assert_eq!(DistillationLevel::parse_name("unknown"), None);
        assert_eq!(DistillationLevel::default(), DistillationLevel::Balanced);
    }

    #[test]
    fn test_distillation_level_name_round_trip() {
        let levels = [
            DistillationLevel::None,
            DistillationLevel::Minimal,
            DistillationLevel::Balanced,
            DistillationLevel::Aggressive,
            DistillationLevel::Full,
        ];
        for level in levels.iter() {
            assert_eq!(DistillationLevel::parse_name(level.name()), Some(*level));
        }
    }

    #[test]
    fn test_table_to_text() {
        let t = Table {
            caption: Some("Access Matrix".into()),
            headers: vec!["Role".into(), "Access".into()],
            rows: vec![vec!["Admin".into(), "Full".into()]],
            alignments: vec![],
        };
        let text = t.to_text();
        assert!(text.contains("Access Matrix"));
        assert!(text.contains("Role | Access"));
        assert!(text.contains("Admin | Full"));
    }

    #[test]
    fn test_list_to_text() {
        let l = List {
            ordered: true,
            items: vec![
                ListItem { text: "First".into(), children: None },
                ListItem { text: "Second".into(), children: None },
            ],
        };
        let text = l.to_text();
        assert!(text.contains("1. First"));
        assert!(text.contains("2. Second"));
    }

    #[test]
    fn test_nested_unordered_list_to_text() {
        let l = List {
            ordered: false,
            items: vec![ListItem {
                text: "Parent".into(),
                children: Some(List {
                    ordered: false,
                    items: vec![ListItem { text: "Child".into(), children: None }],
                }),
            }],
        };
        let text = l.to_text();
        let lines: Vec<&str> = text.lines().collect();
        assert_eq!(lines.len(), 2);
        assert_eq!(lines[0], "- Parent");
        // The nested item is indented relative to its parent; the exact width
        // is deliberately not pinned here.
        assert_eq!(lines[1].trim_start(), "- Child");
        assert!(lines[1].starts_with(' '));
    }
}