Skip to main content

kbolt_core/ingest/
plaintext.rs

1use std::collections::HashMap;
2use std::path::Path;
3
4use crate::ingest::extract::{BlockKind, ExtractedBlock, ExtractedDocument, Extractor};
5use crate::Result;
6
/// Stateless extractor that turns plain-text input into paragraph blocks.
///
/// The type carries no data, so it is freely copyable and has a trivial
/// default; `Debug` is derived so it can appear in diagnostics and in
/// `Debug`-deriving containers.
#[derive(Debug, Clone, Copy, Default)]
pub struct PlaintextExtractor;
8
9impl Extractor for PlaintextExtractor {
10    fn supports(&self) -> &[&str] {
11        &["txt", "text", "log"]
12    }
13
14    fn profile_key(&self) -> &'static str {
15        "txt"
16    }
17
18    fn extract(&self, _path: &Path, bytes: &[u8]) -> Result<ExtractedDocument> {
19        if let Err(err) = std::str::from_utf8(bytes) {
20            return Err(kbolt_types::KboltError::InvalidInput(format!(
21                "non-utf8 plaintext input: {err}"
22            ))
23            .into());
24        }
25
26        let mut blocks = Vec::new();
27        for (offset, end) in paragraph_ranges(bytes) {
28            let text = String::from_utf8_lossy(&bytes[offset..end]).to_string();
29            blocks.push(ExtractedBlock {
30                text,
31                offset,
32                length: end.saturating_sub(offset),
33                kind: BlockKind::Paragraph,
34                heading_path: Vec::new(),
35                attrs: HashMap::new(),
36            });
37        }
38
39        Ok(ExtractedDocument {
40            blocks,
41            metadata: HashMap::new(),
42            title: None,
43        })
44    }
45}
46
47fn paragraph_ranges(bytes: &[u8]) -> Vec<(usize, usize)> {
48    let mut ranges = Vec::new();
49    let mut paragraph_start: Option<usize> = None;
50    let mut line_start = 0usize;
51
52    while line_start < bytes.len() {
53        let line_end = next_line_end(bytes, line_start);
54        let content_end = trim_line_ending(bytes, line_start, line_end);
55        let is_blank = is_blank_line(bytes, line_start, content_end);
56
57        match (paragraph_start, is_blank) {
58            (None, false) => {
59                paragraph_start = Some(line_start);
60            }
61            (Some(start), true) => {
62                let end = trim_trailing_newlines(bytes, line_start);
63                if end > start {
64                    ranges.push((start, end));
65                }
66                paragraph_start = None;
67            }
68            _ => {}
69        }
70
71        line_start = line_end;
72    }
73
74    if let Some(start) = paragraph_start {
75        let end = trim_trailing_newlines(bytes, bytes.len());
76        if end > start {
77            ranges.push((start, end));
78        }
79    }
80
81    ranges
82}
83
/// Returns the offset just past the first `\n` found at or after `start`.
///
/// When no `\n` remains (including when `start` is at or beyond the end of
/// `bytes`), the result is `bytes.len()`.
fn next_line_end(bytes: &[u8], start: usize) -> usize {
    bytes
        .iter()
        .enumerate()
        .skip(start)
        .find(|&(_, &byte)| byte == b'\n')
        .map_or(bytes.len(), |(newline_at, _)| newline_at + 1)
}
94
/// Returns the end offset of the line `bytes[start..end]` once any trailing
/// `\n` / `\r` bytes are dropped.
///
/// The result never falls below `start`; if `end` is not past `start`, `end`
/// is returned unchanged.
fn trim_line_ending(bytes: &[u8], start: usize, end: usize) -> usize {
    if end <= start {
        return end;
    }

    let terminator_len = bytes[start..end]
        .iter()
        .rev()
        .take_while(|&&byte| matches!(byte, b'\n' | b'\r'))
        .count();

    end - terminator_len
}
102
/// Reports whether `bytes[start..end]` holds nothing except spaces and tabs.
///
/// An empty range counts as blank.
fn is_blank_line(bytes: &[u8], start: usize, end: usize) -> bool {
    let line = &bytes[start..end];
    let has_visible_byte = line.iter().any(|&byte| byte != b' ' && byte != b'\t');
    !has_visible_byte
}
108
/// Walks backwards from `end` over any `\n` / `\r` bytes and returns the
/// offset just past the last byte that is neither.
///
/// Returns 0 when everything before `end` is a line terminator or `end` is 0.
fn trim_trailing_newlines(bytes: &[u8], end: usize) -> usize {
    bytes[..end]
        .iter()
        .rposition(|&byte| !matches!(byte, b'\n' | b'\r'))
        .map_or(0, |last_kept| last_kept + 1)
}
116
#[cfg(test)]
mod tests {
    use std::path::Path;

    use crate::ingest::extract::Extractor;
    use crate::ingest::plaintext::PlaintextExtractor;

    /// A lone paragraph without a trailing newline spans the whole input.
    #[test]
    fn extracts_single_paragraph_with_exact_span() {
        let document = PlaintextExtractor
            .extract(Path::new("notes/readme.txt"), b"alpha beta")
            .expect("extract plaintext");

        assert_eq!(document.blocks.len(), 1);
        let only = &document.blocks[0];
        assert_eq!((only.offset, only.length), (0, 10));
        assert_eq!(only.text, "alpha beta");
    }

    /// Blank and whitespace-only lines separate paragraphs; offsets and
    /// lengths are byte positions in the original input.
    #[test]
    fn splits_paragraphs_on_blank_lines_with_spans() {
        let source = b"first line\nsecond line\n\nthird line\n\n  \nlast line\n";
        let document = PlaintextExtractor
            .extract(Path::new("notes/readme.txt"), source)
            .expect("extract plaintext");

        let spans: Vec<(&str, usize, usize)> = document
            .blocks
            .iter()
            .map(|block| (block.text.as_str(), block.offset, block.length))
            .collect();

        assert_eq!(
            spans,
            vec![
                ("first line\nsecond line", 0, 22),
                ("third line", 24, 10),
                ("last line", 39, 9),
            ]
        );
    }

    /// Paths with extensions outside the supported list are not claimed.
    #[test]
    fn does_not_act_as_generic_path_fallback() {
        let extractor = PlaintextExtractor;
        assert_eq!(extractor.profile_key(), "txt");

        for foreign in ["docs/readme.md", "src/main.rs"] {
            assert!(!extractor.supports_path(Path::new(foreign)));
        }
    }

    /// Bytes that are not valid UTF-8 produce an error rather than a document.
    #[test]
    fn rejects_non_utf8_bytes() {
        let invalid = [0xff_u8, 0xfe, 0xfd];
        let failure = PlaintextExtractor
            .extract(Path::new("notes/data.bin"), &invalid)
            .expect_err("invalid utf8 should fail");

        assert!(failure.to_string().contains("non-utf8 plaintext input"));
    }
}