Skip to main content

kbolt_core/ingest/
plaintext.rs

1use std::collections::HashMap;
2use std::path::Path;
3
4use crate::ingest::extract::{BlockKind, ExtractedBlock, ExtractedDocument, Extractor};
5use crate::Result;
6
/// Stateless extractor that turns plain UTF-8 text into paragraph blocks.
///
/// The type has no fields; all behaviour lives in its `Extractor` implementation.
/// The derives are free for a unit struct and let callers debug-print, copy, or
/// default-construct the extractor.
#[derive(Debug, Clone, Copy, Default)]
pub struct PlaintextExtractor;
8
9impl Extractor for PlaintextExtractor {
10    fn supports(&self) -> &[&str] {
11        &["txt", "text", "log"]
12    }
13
14    fn profile_key(&self) -> &'static str {
15        "txt"
16    }
17
18    fn supports_path(&self, _path: &Path) -> bool {
19        true
20    }
21
22    fn extract(&self, _path: &Path, bytes: &[u8]) -> Result<ExtractedDocument> {
23        if let Err(err) = std::str::from_utf8(bytes) {
24            return Err(kbolt_types::KboltError::InvalidInput(format!(
25                "non-utf8 plaintext input: {err}"
26            ))
27            .into());
28        }
29
30        let mut blocks = Vec::new();
31        for (offset, end) in paragraph_ranges(bytes) {
32            let text = String::from_utf8_lossy(&bytes[offset..end]).to_string();
33            blocks.push(ExtractedBlock {
34                text,
35                offset,
36                length: end.saturating_sub(offset),
37                kind: BlockKind::Paragraph,
38                heading_path: Vec::new(),
39                attrs: HashMap::new(),
40            });
41        }
42
43        Ok(ExtractedDocument {
44            blocks,
45            metadata: HashMap::new(),
46            title: None,
47        })
48    }
49}
50
51fn paragraph_ranges(bytes: &[u8]) -> Vec<(usize, usize)> {
52    let mut ranges = Vec::new();
53    let mut paragraph_start: Option<usize> = None;
54    let mut line_start = 0usize;
55
56    while line_start < bytes.len() {
57        let line_end = next_line_end(bytes, line_start);
58        let content_end = trim_line_ending(bytes, line_start, line_end);
59        let is_blank = is_blank_line(bytes, line_start, content_end);
60
61        match (paragraph_start, is_blank) {
62            (None, false) => {
63                paragraph_start = Some(line_start);
64            }
65            (Some(start), true) => {
66                let end = trim_trailing_newlines(bytes, line_start);
67                if end > start {
68                    ranges.push((start, end));
69                }
70                paragraph_start = None;
71            }
72            _ => {}
73        }
74
75        line_start = line_end;
76    }
77
78    if let Some(start) = paragraph_start {
79        let end = trim_trailing_newlines(bytes, bytes.len());
80        if end > start {
81            ranges.push((start, end));
82        }
83    }
84
85    ranges
86}
87
/// Returns the index one past the first `\n` at or after `start`, or
/// `bytes.len()` when no newline follows (including when `start` is already
/// past the end of the buffer).
fn next_line_end(bytes: &[u8], start: usize) -> usize {
    bytes
        .get(start..)
        .and_then(|rest| rest.iter().position(|&byte| byte == b'\n'))
        .map_or(bytes.len(), |newline| start + newline + 1)
}
98
/// Returns the end of the line content within `start..end`, i.e. `end` moved
/// left past any trailing `\n` / `\r` bytes, never moving before `start`.
fn trim_line_ending(bytes: &[u8], start: usize, end: usize) -> usize {
    // An empty or inverted span has nothing to strip.
    if end <= start {
        return end;
    }

    let terminators = bytes[start..end]
        .iter()
        .rev()
        .take_while(|&&byte| byte == b'\n' || byte == b'\r')
        .count();
    end - terminators
}
106
/// Reports whether `bytes[start..end]` contains nothing but spaces and tabs.
/// An empty span counts as blank.
fn is_blank_line(bytes: &[u8], start: usize, end: usize) -> bool {
    for &byte in &bytes[start..end] {
        if byte != b' ' && byte != b'\t' {
            return false;
        }
    }
    true
}
112
/// Returns `end` moved left past every `\n` / `\r` byte that directly precedes
/// it, stopping at the first other byte or at the start of the buffer.
fn trim_trailing_newlines(bytes: &[u8], end: usize) -> usize {
    bytes[..end]
        .iter()
        .rposition(|&byte| byte != b'\n' && byte != b'\r')
        .map_or(0, |last_kept| last_kept + 1)
}
120
#[cfg(test)]
mod tests {
    use std::path::Path;

    use crate::ingest::extract::Extractor;
    use crate::ingest::plaintext::PlaintextExtractor;

    /// A single unterminated line becomes one block spanning the whole input.
    #[test]
    fn extracts_single_paragraph_with_exact_span() {
        let doc = PlaintextExtractor
            .extract(Path::new("notes/readme.txt"), b"alpha beta")
            .expect("extract plaintext");

        assert_eq!(doc.blocks.len(), 1);
        let only = &doc.blocks[0];
        assert_eq!(only.offset, 0);
        assert_eq!(only.length, 10);
        assert_eq!(only.text, "alpha beta");
    }

    /// Empty and whitespace-only lines both separate paragraphs, and each block
    /// reports the byte span of its text within the input.
    #[test]
    fn splits_paragraphs_on_blank_lines_with_spans() {
        let input = b"first line\nsecond line\n\nthird line\n\n  \nlast line\n";
        let doc = PlaintextExtractor
            .extract(Path::new("notes/readme.txt"), input)
            .expect("extract plaintext");

        // (text, offset, length) for each expected block, in document order.
        let expected: [(&str, usize, usize); 3] = [
            ("first line\nsecond line", 0, 22),
            ("third line", 24, 10),
            ("last line", 39, 9),
        ];

        assert_eq!(doc.blocks.len(), 3);
        for (block, &(text, offset, length)) in doc.blocks.iter().zip(expected.iter()) {
            assert_eq!(block.text, text);
            assert_eq!(block.offset, offset);
            assert_eq!(block.length, length);
        }
    }

    /// The extractor accepts paths whose extensions are outside its `supports` list.
    #[test]
    fn supports_path_acts_as_generic_text_fallback() {
        let extractor = PlaintextExtractor;
        assert_eq!(extractor.profile_key(), "txt");
        for candidate in ["docs/readme.md", "src/main.rs"].iter() {
            assert!(extractor.supports_path(Path::new(candidate)));
        }
    }

    /// Bytes that are not valid UTF-8 are rejected with a descriptive error.
    #[test]
    fn rejects_non_utf8_bytes() {
        let invalid: &[u8] = &[0xff, 0xfe, 0xfd];
        let err = PlaintextExtractor
            .extract(Path::new("notes/data.bin"), invalid)
            .expect_err("invalid utf8 should fail");
        let message = err.to_string();
        assert!(message.contains("non-utf8 plaintext input"));
    }
}