kbolt_core/ingest/
plaintext.rs1use std::collections::HashMap;
2use std::path::Path;
3
4use crate::ingest::extract::{BlockKind, ExtractedBlock, ExtractedDocument, Extractor};
5use crate::Result;
6
/// Extractor that turns UTF-8 plain text into paragraph blocks.
///
/// The type is a stateless unit struct: it holds no configuration, so it is
/// trivially copyable and can be built with [`Default`].
#[derive(Debug, Clone, Copy, Default)]
pub struct PlaintextExtractor;
8
9impl Extractor for PlaintextExtractor {
10 fn supports(&self) -> &[&str] {
11 &["txt", "text", "log"]
12 }
13
14 fn profile_key(&self) -> &'static str {
15 "txt"
16 }
17
18 fn supports_path(&self, _path: &Path) -> bool {
19 true
20 }
21
22 fn extract(&self, _path: &Path, bytes: &[u8]) -> Result<ExtractedDocument> {
23 if let Err(err) = std::str::from_utf8(bytes) {
24 return Err(kbolt_types::KboltError::InvalidInput(format!(
25 "non-utf8 plaintext input: {err}"
26 ))
27 .into());
28 }
29
30 let mut blocks = Vec::new();
31 for (offset, end) in paragraph_ranges(bytes) {
32 let text = String::from_utf8_lossy(&bytes[offset..end]).to_string();
33 blocks.push(ExtractedBlock {
34 text,
35 offset,
36 length: end.saturating_sub(offset),
37 kind: BlockKind::Paragraph,
38 heading_path: Vec::new(),
39 attrs: HashMap::new(),
40 });
41 }
42
43 Ok(ExtractedDocument {
44 blocks,
45 metadata: HashMap::new(),
46 title: None,
47 })
48 }
49}
50
51fn paragraph_ranges(bytes: &[u8]) -> Vec<(usize, usize)> {
52 let mut ranges = Vec::new();
53 let mut paragraph_start: Option<usize> = None;
54 let mut line_start = 0usize;
55
56 while line_start < bytes.len() {
57 let line_end = next_line_end(bytes, line_start);
58 let content_end = trim_line_ending(bytes, line_start, line_end);
59 let is_blank = is_blank_line(bytes, line_start, content_end);
60
61 match (paragraph_start, is_blank) {
62 (None, false) => {
63 paragraph_start = Some(line_start);
64 }
65 (Some(start), true) => {
66 let end = trim_trailing_newlines(bytes, line_start);
67 if end > start {
68 ranges.push((start, end));
69 }
70 paragraph_start = None;
71 }
72 _ => {}
73 }
74
75 line_start = line_end;
76 }
77
78 if let Some(start) = paragraph_start {
79 let end = trim_trailing_newlines(bytes, bytes.len());
80 if end > start {
81 ranges.push((start, end));
82 }
83 }
84
85 ranges
86}
87
/// Returns the offset just past the line that begins at `start`.
///
/// That is the index following the first `\n` at or after `start`, or
/// `bytes.len()` when no `\n` remains (including when `start` is at or past
/// the end of `bytes`).
fn next_line_end(bytes: &[u8], start: usize) -> usize {
    bytes
        // `get` rather than indexing: an out-of-range `start` yields `None`
        // and falls through to `bytes.len()` instead of panicking.
        .get(start..)
        .and_then(|rest| rest.iter().position(|&byte| byte == b'\n'))
        .map_or(bytes.len(), |relative| start + relative + 1)
}
98
/// Returns the end of the line's content within `start..end`, i.e. `end`
/// moved back past any trailing `\n` and `\r` bytes, but never below `start`.
///
/// When `end <= start` there is nothing to trim and `end` is returned as is.
fn trim_line_ending(bytes: &[u8], start: usize, end: usize) -> usize {
    if end <= start {
        return end;
    }

    let terminator_len = bytes[start..end]
        .iter()
        .rev()
        .take_while(|&&byte| byte == b'\n' || byte == b'\r')
        .count();

    end - terminator_len
}
106
/// Reports whether `bytes[start..end]` holds nothing but spaces and tabs.
///
/// An empty range counts as blank.
///
/// # Panics
///
/// Panics if `start..end` is not a valid range within `bytes`.
fn is_blank_line(bytes: &[u8], start: usize, end: usize) -> bool {
    for &byte in &bytes[start..end] {
        if byte != b' ' && byte != b'\t' {
            return false;
        }
    }
    true
}
112
/// Moves `end` backwards over any `\n` and `\r` bytes that immediately
/// precede it and returns the resulting offset.
///
/// The result is one past the last byte before `end` that is neither `\n`
/// nor `\r`, or `0` when no such byte exists.
///
/// # Panics
///
/// Panics if `end` is greater than `bytes.len()`.
fn trim_trailing_newlines(bytes: &[u8], end: usize) -> usize {
    bytes[..end]
        .iter()
        .rposition(|&byte| byte != b'\n' && byte != b'\r')
        .map_or(0, |last_kept| last_kept + 1)
}
120
#[cfg(test)]
mod tests {
    use std::path::Path;

    use crate::ingest::extract::Extractor;
    use crate::ingest::plaintext::PlaintextExtractor;

    /// A single line without a terminator becomes one block covering the whole input.
    #[test]
    fn extracts_single_paragraph_with_exact_span() {
        let extractor = PlaintextExtractor;
        let doc = extractor
            .extract(Path::new("notes/readme.txt"), b"alpha beta")
            .expect("extract plaintext");

        assert_eq!(doc.blocks.len(), 1);
        assert_eq!(doc.blocks[0].offset, 0);
        assert_eq!(doc.blocks[0].length, 10);
        assert_eq!(doc.blocks[0].text, "alpha beta");
    }

    /// Empty and whitespace-only lines both separate paragraphs, and offsets
    /// are byte positions in the original input.
    #[test]
    fn splits_paragraphs_on_blank_lines_with_spans() {
        let extractor = PlaintextExtractor;
        // The whitespace-only line is written with escapes (space + tab) so
        // its two-byte width is explicit; the final offset below depends on it.
        let input = b"first line\nsecond line\n\nthird line\n\n\x20\t\nlast line\n";
        let doc = extractor
            .extract(Path::new("notes/readme.txt"), input)
            .expect("extract plaintext");

        assert_eq!(doc.blocks.len(), 3);
        assert_eq!(doc.blocks[0].text, "first line\nsecond line");
        assert_eq!(doc.blocks[0].offset, 0);
        assert_eq!(doc.blocks[0].length, 22);

        assert_eq!(doc.blocks[1].text, "third line");
        assert_eq!(doc.blocks[1].offset, 24);
        assert_eq!(doc.blocks[1].length, 10);

        assert_eq!(doc.blocks[2].text, "last line");
        assert_eq!(doc.blocks[2].offset, 39);
        assert_eq!(doc.blocks[2].length, 9);
    }

    /// CRLF line endings are treated like LF: they delimit blank lines and
    /// are excluded from block text.
    #[test]
    fn handles_crlf_line_endings() {
        let extractor = PlaintextExtractor;
        let doc = extractor
            .extract(Path::new("notes/readme.txt"), b"one\r\n\r\ntwo\r\n")
            .expect("extract plaintext");

        assert_eq!(doc.blocks.len(), 2);
        assert_eq!(doc.blocks[0].text, "one");
        assert_eq!(doc.blocks[0].offset, 0);
        assert_eq!(doc.blocks[0].length, 3);

        assert_eq!(doc.blocks[1].text, "two");
        assert_eq!(doc.blocks[1].offset, 7);
        assert_eq!(doc.blocks[1].length, 3);
    }

    /// Input with no visible content produces a document without blocks.
    #[test]
    fn yields_no_blocks_without_content() {
        let extractor = PlaintextExtractor;

        let empty = extractor
            .extract(Path::new("notes/empty.txt"), b"")
            .expect("extract plaintext");
        assert!(empty.blocks.is_empty());

        let blank = extractor
            .extract(Path::new("notes/blank.txt"), b" \n\t\n\n")
            .expect("extract plaintext");
        assert!(blank.blocks.is_empty());
    }

    /// `supports_path` accepts paths whose extensions are not in `supports()`.
    #[test]
    fn supports_path_acts_as_generic_text_fallback() {
        let extractor = PlaintextExtractor;
        assert_eq!(extractor.profile_key(), "txt");
        assert!(extractor.supports_path(Path::new("docs/readme.md")));
        assert!(extractor.supports_path(Path::new("src/main.rs")));
    }

    /// Invalid UTF-8 is reported as an error rather than decoded lossily.
    #[test]
    fn rejects_non_utf8_bytes() {
        let extractor = PlaintextExtractor;
        let err = extractor
            .extract(Path::new("notes/data.bin"), &[0xff, 0xfe, 0xfd])
            .expect_err("invalid utf8 should fail");
        assert!(err.to_string().contains("non-utf8 plaintext input"));
    }
}