1use crate::engine::ExtractionEngine;
2use crate::error::Result;
3use crate::ir::{
4 BBox, Block, Confidence, Document, Line, Metadata, Page, SourceAnchor, Span, TextBlock,
5 SCHEMA_VERSION,
6};
7use crate::source::Source;
8
/// Value written to `SourceAnchor::extraction_method` for every anchor built in this file.
const EXTRACTION_METHOD: &str = "csv_native";
10
/// Stateless extraction engine for delimited text sources; all behavior lives in
/// its `ExtractionEngine` implementation.
#[derive(Debug, Default, Clone, Copy)]
pub struct CsvEngine;
13
14impl ExtractionEngine for CsvEngine {
15 fn name(&self) -> &'static str {
16 "csv-native"
17 }
18
19 fn extract(&self, source: &Source) -> Result<Document> {
20 let delimiter = delimiter_for_source(source);
21 Ok(build_document(
22 source,
23 self.name(),
24 parse_rows(source, delimiter),
25 ))
26 }
27}
28
29fn parse_rows(source: &Source, delimiter: char) -> Vec<Block> {
30 if let Some(blocks) = tesseract_tsv_blocks(source, delimiter) {
31 return blocks;
32 }
33 if let Some(blocks) = ckorzen_tsv_blocks(source, delimiter) {
34 return blocks;
35 }
36
37 source
38 .content
39 .lines()
40 .filter_map(|line| block_from_line(line, delimiter))
41 .collect()
42}
43
44fn block_from_line(line: &str, delimiter: char) -> Option<Block> {
45 let trimmed = line.trim();
46 if trimmed.is_empty() {
47 return None;
48 }
49
50 let cells = trimmed
51 .split(delimiter)
52 .map(|cell| cell.trim().trim_matches('"').to_owned())
53 .collect::<Vec<_>>();
54 let (bbox, text) = if let Some((bbox, text)) = ocr_box_row(&cells, delimiter) {
55 (Some(bbox), text)
56 } else {
57 (
58 None,
59 cells
60 .iter()
61 .filter(|cell| !cell.is_empty())
62 .cloned()
63 .collect::<Vec<_>>()
64 .join(" "),
65 )
66 };
67 let text = clean_text(&text);
68 if text.is_empty() {
69 return None;
70 }
71
72 Some(Block::Text(TextBlock {
73 text,
74 kind: "row".to_owned(),
75 bbox,
76 lines: Vec::new(),
77 source_anchors: vec![SourceAnchor {
78 page_number: 1,
79 pdf_object_ids: Vec::new(),
80 bbox,
81 extraction_method: EXTRACTION_METHOD.to_owned(),
82 }],
83 confidence: Some(Confidence {
84 score: 0.9,
85 calibrated: false,
86 }), ..Default::default()
87 }))
88}
89
90fn ckorzen_tsv_blocks(source: &Source, delimiter: char) -> Option<Vec<Block>> {
91 if delimiter != '\t' {
92 return None;
93 }
94
95 let mut lines = source.content.lines();
96 let header_line = lines.find(|line| !line.trim().is_empty())?;
97 let headers = split_delimited_cells(header_line, delimiter);
98 let feature_column = header_index(&headers, "feature")?;
99 let boxes_column = header_index(&headers, "bounding boxes")?;
100 let text_column = header_index(&headers, "text")?;
101 let required_max_index = feature_column.max(boxes_column).max(text_column);
102 let mut blocks = Vec::new();
103
104 for line in lines {
105 if line.trim().is_empty() {
106 continue;
107 }
108 let cells = split_delimited_cells(line, delimiter);
109 if cells.len() <= required_max_index {
110 continue;
111 }
112
113 let text = clean_text(&cells[text_column..].join("\t"));
114 if text.is_empty() {
115 continue;
116 }
117
118 let kind = clean_text(&cells[feature_column]);
119 let anchors = ckorzen_bounding_boxes(&cells[boxes_column]);
120 let bbox = bbox_union(anchors.iter().map(|(_, bbox)| *bbox));
121 let source_anchors = if anchors.is_empty() {
122 vec![SourceAnchor {
123 page_number: 1,
124 pdf_object_ids: Vec::new(),
125 bbox: None,
126 extraction_method: EXTRACTION_METHOD.to_owned(),
127 }]
128 } else {
129 anchors
130 .iter()
131 .map(|(page_number, bbox)| SourceAnchor {
132 page_number: *page_number,
133 pdf_object_ids: Vec::new(),
134 bbox: Some(*bbox),
135 extraction_method: EXTRACTION_METHOD.to_owned(),
136 })
137 .collect()
138 };
139
140 blocks.push(Block::Text(TextBlock {
141 text,
142 kind: if kind.is_empty() {
143 "row".to_owned()
144 } else {
145 kind
146 },
147 bbox,
148 lines: Vec::new(),
149 source_anchors,
150 confidence: Some(Confidence {
151 score: 0.9,
152 calibrated: false,
153 }), ..Default::default()
154 }));
155 }
156
157 (!blocks.is_empty()).then_some(blocks)
158}
159
160fn ckorzen_bounding_boxes(cell: &str) -> Vec<(usize, BBox)> {
161 cell.split("),")
162 .filter_map(|part| {
163 let part = part.trim().trim_start_matches('(').trim_end_matches(')');
164 let (page_number, coordinates) = part.split_once(";[")?;
165 let page_number = page_number.parse::<usize>().ok()?.max(1);
166 let coordinates = coordinates.trim_end_matches(']');
167 let coordinates = coordinates
168 .split(';')
169 .map(str::parse::<f32>)
170 .collect::<std::result::Result<Vec<_>, _>>()
171 .ok()?;
172 if coordinates.len() != 4 {
173 return None;
174 }
175 let x = coordinates[0];
176 let y = coordinates[1];
177 let width = coordinates[2] - coordinates[0];
178 let height = coordinates[3] - coordinates[1];
179 if width <= 0.0 || height <= 0.0 {
180 return None;
181 }
182 Some((
183 page_number,
184 BBox {
185 x,
186 y,
187 width,
188 height,
189 },
190 ))
191 })
192 .collect()
193}
194
/// Zero-based positions of the header columns looked up when reading a
/// Tesseract-style TSV table. Each field holds the index of the header cell
/// with the same name.
#[derive(Debug, Clone, Copy)]
struct TesseractTsvColumns {
    level: usize,
    page_num: usize,
    block_num: usize,
    par_num: usize,
    line_num: usize,
    // Located in the header and counted towards the minimum row length, but
    // its value is not read from data rows in this file.
    word_num: usize,
    left: usize,
    top: usize,
    width: usize,
    height: usize,
    conf: usize,
    text: usize,
}
210
/// One word row taken from a Tesseract-style TSV table.
#[derive(Debug)]
struct TesseractWord {
    // Whitespace-normalised text of the row.
    text: String,
    // Box built from the row's left/top/width/height cells.
    bbox: BBox,
    // Confidence as returned by `parse_confidence_cell`; `None` when the
    // cell is missing, unparseable or negative.
    confidence: Option<f32>,
}
217
218fn tesseract_tsv_blocks(source: &Source, delimiter: char) -> Option<Vec<Block>> {
219 if delimiter != '\t' {
220 return None;
221 }
222
223 let mut lines = source.content.lines();
224 let header_line = lines.find(|line| !line.trim().is_empty())?;
225 let columns = TesseractTsvColumns::from_header(&split_delimited_cells(header_line, delimiter))?;
226 let required_max_index = columns.required_max_index();
227 let mut groups: Vec<((usize, usize, usize, usize), Vec<TesseractWord>)> = Vec::new();
228
229 for line in lines {
230 if line.trim().is_empty() {
231 continue;
232 }
233 let cells = split_delimited_cells(line, delimiter);
234 if cells.len() <= required_max_index || cells.len() <= columns.text {
235 continue;
236 }
237 if parse_usize_cell(&cells, columns.level) != Some(5) {
238 continue;
239 }
240
241 let text = clean_text(&cells[columns.text..].join("\t"));
242 if text.is_empty() {
243 continue;
244 }
245
246 let Some(bbox) = tesseract_bbox(&cells, columns) else {
247 continue;
248 };
249 let page_number = parse_usize_cell(&cells, columns.page_num)
250 .unwrap_or(1)
251 .max(1);
252 let key = (
253 page_number,
254 parse_usize_cell(&cells, columns.block_num).unwrap_or(0),
255 parse_usize_cell(&cells, columns.par_num).unwrap_or(0),
256 parse_usize_cell(&cells, columns.line_num).unwrap_or(0),
257 );
258 let word = TesseractWord {
259 text,
260 bbox,
261 confidence: parse_confidence_cell(&cells, columns.conf),
262 };
263
264 if let Some((_, words)) = groups
265 .iter_mut()
266 .find(|(existing_key, _)| *existing_key == key)
267 {
268 words.push(word);
269 } else {
270 groups.push((key, vec![word]));
271 }
272 }
273
274 if groups.is_empty() {
275 return None;
276 }
277
278 Some(
279 groups
280 .into_iter()
281 .filter_map(tesseract_line_block)
282 .collect(),
283 )
284}
285
286impl TesseractTsvColumns {
287 fn from_header(headers: &[String]) -> Option<Self> {
288 Some(Self {
289 level: header_index(headers, "level")?,
290 page_num: header_index(headers, "page_num")?,
291 block_num: header_index(headers, "block_num")?,
292 par_num: header_index(headers, "par_num")?,
293 line_num: header_index(headers, "line_num")?,
294 word_num: header_index(headers, "word_num")?,
295 left: header_index(headers, "left")?,
296 top: header_index(headers, "top")?,
297 width: header_index(headers, "width")?,
298 height: header_index(headers, "height")?,
299 conf: header_index(headers, "conf")?,
300 text: header_index(headers, "text")?,
301 })
302 }
303
304 fn required_max_index(self) -> usize {
305 [
306 self.level,
307 self.page_num,
308 self.block_num,
309 self.par_num,
310 self.line_num,
311 self.word_num,
312 self.left,
313 self.top,
314 self.width,
315 self.height,
316 self.conf,
317 ]
318 .into_iter()
319 .max()
320 .unwrap_or(0)
321 }
322}
323
324fn tesseract_line_block(
325 ((page_number, _, _, _), words): ((usize, usize, usize, usize), Vec<TesseractWord>),
326) -> Option<Block> {
327 if words.is_empty() {
328 return None;
329 }
330
331 let text = words
332 .iter()
333 .map(|word| word.text.as_str())
334 .collect::<Vec<_>>()
335 .join(" ");
336 let bbox = bbox_union(words.iter().map(|word| word.bbox))?;
337 let spans = words
338 .iter()
339 .map(|word| Span {
340 text: word.text.clone(),
341 bbox: Some(word.bbox),
342 font: None,
343 size: None,
344 bold: false,
345 italic: false,
346 })
347 .collect::<Vec<_>>();
348 let confidence = average_confidence(words.iter().filter_map(|word| word.confidence));
349
350 Some(Block::Text(TextBlock {
351 text: text.clone(),
352 kind: "ocr_line".to_owned(),
353 bbox: Some(bbox),
354 lines: vec![Line {
355 text,
356 bbox: Some(bbox),
357 spans,
358 }],
359 source_anchors: vec![SourceAnchor {
360 page_number,
361 pdf_object_ids: Vec::new(),
362 bbox: Some(bbox),
363 extraction_method: EXTRACTION_METHOD.to_owned(),
364 }],
365 confidence: Some(Confidence {
366 score: confidence.unwrap_or(0.9),
367 calibrated: false,
368 }), ..Default::default()
369 }))
370}
371
/// Splits `line` on `delimiter` into owned cells.
///
/// Trailing whitespace of the line is removed first (leading whitespace is
/// kept so leading empty cells survive); each cell is then trimmed and
/// stripped of surrounding double quotes. Quoted delimiters are not treated
/// specially.
fn split_delimited_cells(line: &str, delimiter: char) -> Vec<String> {
    let mut cells = Vec::new();
    for raw in line.trim_end().split(delimiter) {
        let unquoted = raw.trim().trim_matches('"');
        cells.push(String::from(unquoted));
    }
    cells
}
378
379fn header_index(headers: &[String], name: &str) -> Option<usize> {
380 headers
381 .iter()
382 .position(|header| normalize_header(header) == name)
383}
384
/// Normalises a header cell for comparison: strips a leading byte-order mark,
/// surrounding whitespace and surrounding double quotes, then lower-cases
/// ASCII letters.
///
/// Quotes are stripped here as well as by the cell splitter because a
/// byte-order mark in front of a quoted first header (`\u{feff}"level"`)
/// hides the opening quote from the splitter, which would otherwise leave
/// `"level` and make the column lookup fail.
fn normalize_header(header: &str) -> String {
    header
        .trim()
        .trim_start_matches('\u{feff}')
        .trim()
        .trim_matches('"')
        .trim()
        .to_ascii_lowercase()
}
391
392fn tesseract_bbox(cells: &[String], columns: TesseractTsvColumns) -> Option<BBox> {
393 let x = parse_f32_cell(cells, columns.left)?;
394 let y = parse_f32_cell(cells, columns.top)?;
395 let width = parse_f32_cell(cells, columns.width)?;
396 let height = parse_f32_cell(cells, columns.height)?;
397 if width <= 0.0 || height <= 0.0 {
398 return None;
399 }
400 Some(BBox {
401 x,
402 y,
403 width,
404 height,
405 })
406}
407
/// Parses the cell at `index` as a `usize`.
///
/// Returns `None` when the index is out of range or the cell is not a valid
/// unsigned integer (no trimming is applied here).
fn parse_usize_cell(cells: &[String], index: usize) -> Option<usize> {
    cells
        .get(index)
        .and_then(|cell| cell.parse::<usize>().ok())
}
411
/// Parses the cell at `index` as a finite `f32`.
///
/// Returns `None` when the index is out of range, the cell is not a number,
/// or the parsed value is NaN or infinite. `f32` parsing accepts spellings
/// such as `"NaN"` and `"inf"` and maps overflowing literals to infinity;
/// none of these are meaningful coordinates or confidences, so they are
/// rejected here instead of leaking into boxes and scores.
fn parse_f32_cell(cells: &[String], index: usize) -> Option<f32> {
    cells
        .get(index)?
        .parse::<f32>()
        .ok()
        .filter(|value| value.is_finite())
}
415
/// Parses the cell at `index` as a confidence in the range `0.0..=1.0`.
///
/// * Missing, non-numeric, NaN, infinite or negative cells yield `None`
///   (NaN previously slipped through both range checks and was returned as a
///   score).
/// * Values above `1.0` are treated as percentages: they are divided by 100
///   and clamped to `1.0`.
/// * Values in `0.0..=1.0` are returned unchanged.
///
/// NOTE(review): a percentage of exactly `1` cannot be told apart from the
/// fraction `1.0` and is returned as `1.0`.
fn parse_confidence_cell(cells: &[String], index: usize) -> Option<f32> {
    let confidence = cells.get(index)?.parse::<f32>().ok()?;
    if !confidence.is_finite() || confidence < 0.0 {
        return None;
    }

    Some(if confidence > 1.0 {
        (confidence / 100.0).clamp(0.0, 1.0)
    } else {
        confidence
    })
}
427
428fn bbox_union(boxes: impl Iterator<Item = BBox>) -> Option<BBox> {
429 let mut min_x = f32::INFINITY;
430 let mut min_y = f32::INFINITY;
431 let mut max_x = f32::NEG_INFINITY;
432 let mut max_y = f32::NEG_INFINITY;
433 let mut has_box = false;
434 for bbox in boxes {
435 has_box = true;
436 min_x = min_x.min(bbox.x);
437 min_y = min_y.min(bbox.y);
438 max_x = max_x.max(bbox.x + bbox.width);
439 max_y = max_y.max(bbox.y + bbox.height);
440 }
441 has_box.then_some(BBox {
442 x: min_x,
443 y: min_y,
444 width: max_x - min_x,
445 height: max_y - min_y,
446 })
447}
448
/// Arithmetic mean of `confidences`, or `None` when the iterator is empty.
fn average_confidence(confidences: impl Iterator<Item = f32>) -> Option<f32> {
    let (sum, samples) = confidences.fold((0.0f32, 0usize), |(sum, samples), value| {
        (sum + value, samples + 1)
    });

    if samples == 0 {
        None
    } else {
        Some(sum / samples as f32)
    }
}
458
459fn ocr_box_row(cells: &[String], delimiter: char) -> Option<(BBox, String)> {
460 if cells.len() < 9 {
461 return None;
462 }
463 let mut coordinates = [0.0f32; 8];
464 for (index, coordinate) in coordinates.iter_mut().enumerate() {
465 *coordinate = cells[index].parse::<f32>().ok()?;
466 }
467 let xs = [
468 coordinates[0],
469 coordinates[2],
470 coordinates[4],
471 coordinates[6],
472 ];
473 let ys = [
474 coordinates[1],
475 coordinates[3],
476 coordinates[5],
477 coordinates[7],
478 ];
479 let min_x = xs.iter().copied().fold(f32::INFINITY, f32::min);
480 let max_x = xs.iter().copied().fold(f32::NEG_INFINITY, f32::max);
481 let min_y = ys.iter().copied().fold(f32::INFINITY, f32::min);
482 let max_y = ys.iter().copied().fold(f32::NEG_INFINITY, f32::max);
483 let separator = if delimiter == ',' { ", " } else { "\t" };
484 let text = cells[8..].join(&separator);
485 Some((
486 BBox {
487 x: min_x,
488 y: min_y,
489 width: max_x - min_x,
490 height: max_y - min_y,
491 },
492 text,
493 ))
494}
495
496fn build_document(source: &Source, engine_name: &str, blocks: Vec<Block>) -> Document {
497 let page_bbox = inferred_page_bbox(&blocks);
498 let (character_count, word_count) = text_counts(&blocks);
499 let block_count = blocks.len();
500 Document {
501 schema_version: SCHEMA_VERSION.to_owned(),
502 metadata: Metadata {
503 format: source.format.clone(),
504 engine: engine_name.to_owned(),
505 source: source.path.clone(),
506 title: None,
507 character_count,
508 word_count,
509 block_count,
510 file_size_bytes: source.bytes.as_ref().map(|bytes| bytes.len() as u64),
511 pdf_version: None,
512 encrypted: false,
513 },
514 pages: vec![Page {
515 number: 1,
516 width: page_bbox.map(|bbox| bbox.width),
517 height: page_bbox.map(|bbox| bbox.height),
518 rotation: None,
519 bbox: page_bbox,
520 blocks,
521 images: Vec::new(),
522 assets: Vec::new(),
523 warnings: Vec::new(), ..Default::default()
524 }],
525 assets: Vec::new(),
526 warnings: Vec::new(),
527 }
528}
529
530fn inferred_page_bbox(blocks: &[Block]) -> Option<BBox> {
531 let mut max_x = 0.0f32;
532 let mut max_y = 0.0f32;
533 let mut has_bbox = false;
534 for block in blocks {
535 let Some(bbox) = block_bbox(block) else {
536 continue;
537 };
538 has_bbox = true;
539 max_x = max_x.max(bbox.x + bbox.width);
540 max_y = max_y.max(bbox.y + bbox.height);
541 }
542 has_bbox.then_some(BBox {
543 x: 0.0,
544 y: 0.0,
545 width: max_x,
546 height: max_y,
547 })
548}
549
550fn block_bbox(block: &Block) -> Option<BBox> {
551 match block {
552 Block::Text(text) => text.bbox,
553 Block::Table(table) => table.bbox,
554 Block::Figure(figure) => figure.bbox,
555 }
556}
557
558fn text_counts(blocks: &[Block]) -> (usize, usize) {
559 let mut character_count = 0;
560 let mut word_count = 0;
561 for block in blocks {
562 let text = match block {
563 Block::Text(text) => text.text.as_str(),
564 _ => "",
565 };
566 character_count += text.chars().count();
567 word_count += text.split_whitespace().count();
568 }
569 (character_count, word_count)
570}
571
572fn delimiter_for_source(source: &Source) -> char {
573 if source
574 .path
575 .as_deref()
576 .map(|path| path.to_ascii_lowercase().ends_with(".tsv"))
577 .unwrap_or(false)
578 {
579 '\t'
580 } else {
581 ','
582 }
583}
584
/// Collapses every run of whitespace in `text` into a single space and drops
/// leading and trailing whitespace.
fn clean_text(text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !cleaned.is_empty() {
            cleaned.push(' ');
        }
        cleaned.push_str(word);
    }
    cleaned
}