kbolt_core/ingest/
plaintext.rs1use std::collections::HashMap;
2use std::path::Path;
3
4use crate::ingest::extract::{BlockKind, ExtractedBlock, ExtractedDocument, Extractor};
5use crate::Result;
6
/// Extractor for plain-text input (`txt`, `text`, `log`).
///
/// Stateless: the [`Extractor`] implementation splits the input into
/// paragraph blocks separated by blank lines and records each block's byte
/// offset and length within the original buffer.
pub struct PlaintextExtractor;
8
9impl Extractor for PlaintextExtractor {
10 fn supports(&self) -> &[&str] {
11 &["txt", "text", "log"]
12 }
13
14 fn profile_key(&self) -> &'static str {
15 "txt"
16 }
17
18 fn extract(&self, _path: &Path, bytes: &[u8]) -> Result<ExtractedDocument> {
19 if let Err(err) = std::str::from_utf8(bytes) {
20 return Err(kbolt_types::KboltError::InvalidInput(format!(
21 "non-utf8 plaintext input: {err}"
22 ))
23 .into());
24 }
25
26 let mut blocks = Vec::new();
27 for (offset, end) in paragraph_ranges(bytes) {
28 let text = String::from_utf8_lossy(&bytes[offset..end]).to_string();
29 blocks.push(ExtractedBlock {
30 text,
31 offset,
32 length: end.saturating_sub(offset),
33 kind: BlockKind::Paragraph,
34 heading_path: Vec::new(),
35 attrs: HashMap::new(),
36 });
37 }
38
39 Ok(ExtractedDocument {
40 blocks,
41 metadata: HashMap::new(),
42 title: None,
43 })
44 }
45}
46
47fn paragraph_ranges(bytes: &[u8]) -> Vec<(usize, usize)> {
48 let mut ranges = Vec::new();
49 let mut paragraph_start: Option<usize> = None;
50 let mut line_start = 0usize;
51
52 while line_start < bytes.len() {
53 let line_end = next_line_end(bytes, line_start);
54 let content_end = trim_line_ending(bytes, line_start, line_end);
55 let is_blank = is_blank_line(bytes, line_start, content_end);
56
57 match (paragraph_start, is_blank) {
58 (None, false) => {
59 paragraph_start = Some(line_start);
60 }
61 (Some(start), true) => {
62 let end = trim_trailing_newlines(bytes, line_start);
63 if end > start {
64 ranges.push((start, end));
65 }
66 paragraph_start = None;
67 }
68 _ => {}
69 }
70
71 line_start = line_end;
72 }
73
74 if let Some(start) = paragraph_start {
75 let end = trim_trailing_newlines(bytes, bytes.len());
76 if end > start {
77 ranges.push((start, end));
78 }
79 }
80
81 ranges
82}
83
/// Returns the offset one past the first `\n` at or after `start`, or
/// `bytes.len()` when no newline follows (including when `start` is already
/// at or beyond the end of the buffer).
fn next_line_end(bytes: &[u8], start: usize) -> usize {
    bytes
        .get(start..)
        .and_then(|tail| tail.iter().position(|&byte| byte == b'\n'))
        .map_or(bytes.len(), |relative| start + relative + 1)
}
94
/// Returns `end` moved left past any trailing `\n`/`\r` bytes of the line
/// `bytes[start..end]`, never going below `start`.
fn trim_line_ending(bytes: &[u8], start: usize, end: usize) -> usize {
    if end <= start {
        return end;
    }

    let terminator_len = bytes[start..end]
        .iter()
        .rev()
        .take_while(|&&byte| byte == b'\n' || byte == b'\r')
        .count();

    end - terminator_len
}
102
/// Reports whether `bytes[start..end]` consists solely of spaces and tabs.
/// An empty range counts as blank.
fn is_blank_line(bytes: &[u8], start: usize, end: usize) -> bool {
    for &byte in &bytes[start..end] {
        if byte != b' ' && byte != b'\t' {
            return false;
        }
    }
    true
}
108
/// Returns `end` moved left past every `\n`/`\r` byte that immediately
/// precedes it in `bytes`; yields `0` when only terminators precede `end`.
fn trim_trailing_newlines(bytes: &[u8], end: usize) -> usize {
    bytes[..end]
        .iter()
        .rposition(|&byte| byte != b'\n' && byte != b'\r')
        .map_or(0, |last_kept| last_kept + 1)
}
116
#[cfg(test)]
mod tests {
    use std::path::Path;

    use crate::ingest::extract::Extractor;
    use crate::ingest::plaintext::PlaintextExtractor;

    /// A single unterminated line becomes one block spanning the whole input.
    #[test]
    fn extracts_single_paragraph_with_exact_span() {
        let extractor = PlaintextExtractor;
        let doc = extractor
            .extract(Path::new("notes/readme.txt"), b"alpha beta")
            .expect("extract plaintext");

        assert_eq!(doc.blocks.len(), 1);
        assert_eq!(doc.blocks[0].offset, 0);
        assert_eq!(doc.blocks[0].length, 10);
        assert_eq!(doc.blocks[0].text, "alpha beta");
    }

    /// Empty and whitespace-only lines both separate paragraphs, and each
    /// block's span points at exactly its own text in the input.
    #[test]
    fn splits_paragraphs_on_blank_lines_with_spans() {
        let extractor = PlaintextExtractor;
        let input = b"first line\nsecond line\n\nthird line\n\n \nlast line\n";
        let doc = extractor
            .extract(Path::new("notes/readme.txt"), input)
            .expect("extract plaintext");

        assert_eq!(doc.blocks.len(), 3);
        assert_eq!(doc.blocks[0].text, "first line\nsecond line");
        assert_eq!(doc.blocks[0].offset, 0);
        assert_eq!(doc.blocks[0].length, 22);

        assert_eq!(doc.blocks[1].text, "third line");
        assert_eq!(doc.blocks[1].offset, 24);
        assert_eq!(doc.blocks[1].length, 10);

        // Byte layout: "first line\n" (11) + "second line\n" (12) + "\n" (1)
        // + "third line\n" (11) + "\n" (1) + " \n" (2) = 38.
        assert_eq!(doc.blocks[2].text, "last line");
        assert_eq!(doc.blocks[2].offset, 38);
        assert_eq!(doc.blocks[2].length, 9);

        // Spans must address the block text inside the original buffer.
        for block in &doc.blocks {
            assert_eq!(
                &input[block.offset..block.offset + block.length],
                block.text.as_bytes()
            );
        }
    }

    /// CRLF terminators are excluded from block text and spans.
    #[test]
    fn handles_crlf_line_endings() {
        let extractor = PlaintextExtractor;
        let doc = extractor
            .extract(Path::new("notes/readme.txt"), b"one\r\n\r\ntwo\r\n")
            .expect("extract plaintext");

        assert_eq!(doc.blocks.len(), 2);
        assert_eq!(doc.blocks[0].text, "one");
        assert_eq!(doc.blocks[0].offset, 0);
        assert_eq!(doc.blocks[0].length, 3);
        assert_eq!(doc.blocks[1].text, "two");
        assert_eq!(doc.blocks[1].offset, 7);
        assert_eq!(doc.blocks[1].length, 3);
    }

    /// The extractor only claims its own extensions, not arbitrary paths.
    #[test]
    fn does_not_act_as_generic_path_fallback() {
        let extractor = PlaintextExtractor;
        assert_eq!(extractor.profile_key(), "txt");
        assert!(!extractor.supports_path(Path::new("docs/readme.md")));
        assert!(!extractor.supports_path(Path::new("src/main.rs")));
    }

    /// Invalid UTF-8 is reported as an error instead of being decoded lossily.
    #[test]
    fn rejects_non_utf8_bytes() {
        let extractor = PlaintextExtractor;
        let err = extractor
            .extract(Path::new("notes/data.bin"), &[0xff, 0xfe, 0xfd])
            .expect_err("invalid utf8 should fail");
        assert!(err.to_string().contains("non-utf8 plaintext input"));
    }
}