dongler_core/
csv.rs

1use crate::engine::ExtractionEngine;
2use crate::error::Result;
3use crate::ir::{
4    BBox, Block, Confidence, Document, Line, Metadata, Page, SourceAnchor, Span, TextBlock,
5    SCHEMA_VERSION,
6};
7use crate::source::Source;
8
/// Label stored in `SourceAnchor::extraction_method` for every anchor this module creates.
const EXTRACTION_METHOD: &str = "csv_native";
10
/// Extraction engine for delimited text sources (comma- or tab-separated).
///
/// The type is a stateless unit struct; all behaviour lives in its
/// `ExtractionEngine` implementation.
#[derive(Debug, Default, Clone, Copy)]
pub struct CsvEngine;
13
14impl ExtractionEngine for CsvEngine {
15    fn name(&self) -> &'static str {
16        "csv-native"
17    }
18
19    fn extract(&self, source: &Source) -> Result<Document> {
20        let delimiter = delimiter_for_source(source);
21        Ok(build_document(
22            source,
23            self.name(),
24            parse_rows(source, delimiter),
25        ))
26    }
27}
28
29fn parse_rows(source: &Source, delimiter: char) -> Vec<Block> {
30    if let Some(blocks) = tesseract_tsv_blocks(source, delimiter) {
31        return blocks;
32    }
33    if let Some(blocks) = ckorzen_tsv_blocks(source, delimiter) {
34        return blocks;
35    }
36
37    source
38        .content
39        .lines()
40        .filter_map(|line| block_from_line(line, delimiter))
41        .collect()
42}
43
44fn block_from_line(line: &str, delimiter: char) -> Option<Block> {
45    let trimmed = line.trim();
46    if trimmed.is_empty() {
47        return None;
48    }
49
50    let cells = trimmed
51        .split(delimiter)
52        .map(|cell| cell.trim().trim_matches('"').to_owned())
53        .collect::<Vec<_>>();
54    let (bbox, text) = if let Some((bbox, text)) = ocr_box_row(&cells, delimiter) {
55        (Some(bbox), text)
56    } else {
57        (
58            None,
59            cells
60                .iter()
61                .filter(|cell| !cell.is_empty())
62                .cloned()
63                .collect::<Vec<_>>()
64                .join(" "),
65        )
66    };
67    let text = clean_text(&text);
68    if text.is_empty() {
69        return None;
70    }
71
72    Some(Block::Text(TextBlock {
73        text,
74        kind: "row".to_owned(),
75        bbox,
76        lines: Vec::new(),
77        source_anchors: vec![SourceAnchor {
78            page_number: 1,
79            pdf_object_ids: Vec::new(),
80            bbox,
81            extraction_method: EXTRACTION_METHOD.to_owned(),
82        }],
83        confidence: Some(Confidence {
84            score: 0.9,
85            calibrated: false,
86        }), ..Default::default()
87    }))
88}
89
90fn ckorzen_tsv_blocks(source: &Source, delimiter: char) -> Option<Vec<Block>> {
91    if delimiter != '\t' {
92        return None;
93    }
94
95    let mut lines = source.content.lines();
96    let header_line = lines.find(|line| !line.trim().is_empty())?;
97    let headers = split_delimited_cells(header_line, delimiter);
98    let feature_column = header_index(&headers, "feature")?;
99    let boxes_column = header_index(&headers, "bounding boxes")?;
100    let text_column = header_index(&headers, "text")?;
101    let required_max_index = feature_column.max(boxes_column).max(text_column);
102    let mut blocks = Vec::new();
103
104    for line in lines {
105        if line.trim().is_empty() {
106            continue;
107        }
108        let cells = split_delimited_cells(line, delimiter);
109        if cells.len() <= required_max_index {
110            continue;
111        }
112
113        let text = clean_text(&cells[text_column..].join("\t"));
114        if text.is_empty() {
115            continue;
116        }
117
118        let kind = clean_text(&cells[feature_column]);
119        let anchors = ckorzen_bounding_boxes(&cells[boxes_column]);
120        let bbox = bbox_union(anchors.iter().map(|(_, bbox)| *bbox));
121        let source_anchors = if anchors.is_empty() {
122            vec![SourceAnchor {
123                page_number: 1,
124                pdf_object_ids: Vec::new(),
125                bbox: None,
126                extraction_method: EXTRACTION_METHOD.to_owned(),
127            }]
128        } else {
129            anchors
130                .iter()
131                .map(|(page_number, bbox)| SourceAnchor {
132                    page_number: *page_number,
133                    pdf_object_ids: Vec::new(),
134                    bbox: Some(*bbox),
135                    extraction_method: EXTRACTION_METHOD.to_owned(),
136                })
137                .collect()
138        };
139
140        blocks.push(Block::Text(TextBlock {
141            text,
142            kind: if kind.is_empty() {
143                "row".to_owned()
144            } else {
145                kind
146            },
147            bbox,
148            lines: Vec::new(),
149            source_anchors,
150            confidence: Some(Confidence {
151                score: 0.9,
152                calibrated: false,
153            }), ..Default::default()
154        }));
155    }
156
157    (!blocks.is_empty()).then_some(blocks)
158}
159
160fn ckorzen_bounding_boxes(cell: &str) -> Vec<(usize, BBox)> {
161    cell.split("),")
162        .filter_map(|part| {
163            let part = part.trim().trim_start_matches('(').trim_end_matches(')');
164            let (page_number, coordinates) = part.split_once(";[")?;
165            let page_number = page_number.parse::<usize>().ok()?.max(1);
166            let coordinates = coordinates.trim_end_matches(']');
167            let coordinates = coordinates
168                .split(';')
169                .map(str::parse::<f32>)
170                .collect::<std::result::Result<Vec<_>, _>>()
171                .ok()?;
172            if coordinates.len() != 4 {
173                return None;
174            }
175            let x = coordinates[0];
176            let y = coordinates[1];
177            let width = coordinates[2] - coordinates[0];
178            let height = coordinates[3] - coordinates[1];
179            if width <= 0.0 || height <= 0.0 {
180                return None;
181            }
182            Some((
183                page_number,
184                BBox {
185                    x,
186                    y,
187                    width,
188                    height,
189                },
190            ))
191        })
192        .collect()
193}
194
/// Column positions of the headers that `TesseractTsvColumns::from_header`
/// requires to be present.
///
/// Each field holds the index, within the header row, of the header with the
/// same name as the field.
#[derive(Debug, Clone, Copy)]
struct TesseractTsvColumns {
    level: usize,
    page_num: usize,
    block_num: usize,
    par_num: usize,
    line_num: usize,
    word_num: usize,
    left: usize,
    top: usize,
    width: usize,
    height: usize,
    conf: usize,
    // The only column left out of `required_max_index`.
    text: usize,
}
210
/// One word-level row collected by `tesseract_tsv_blocks`.
#[derive(Debug)]
struct TesseractWord {
    /// Whitespace-normalised cell text.
    text: String,
    /// Box built from the row's left/top/width/height cells.
    bbox: BBox,
    /// Value from the `conf` cell; `None` when it failed to parse or was negative.
    confidence: Option<f32>,
}
217
218fn tesseract_tsv_blocks(source: &Source, delimiter: char) -> Option<Vec<Block>> {
219    if delimiter != '\t' {
220        return None;
221    }
222
223    let mut lines = source.content.lines();
224    let header_line = lines.find(|line| !line.trim().is_empty())?;
225    let columns = TesseractTsvColumns::from_header(&split_delimited_cells(header_line, delimiter))?;
226    let required_max_index = columns.required_max_index();
227    let mut groups: Vec<((usize, usize, usize, usize), Vec<TesseractWord>)> = Vec::new();
228
229    for line in lines {
230        if line.trim().is_empty() {
231            continue;
232        }
233        let cells = split_delimited_cells(line, delimiter);
234        if cells.len() <= required_max_index || cells.len() <= columns.text {
235            continue;
236        }
237        if parse_usize_cell(&cells, columns.level) != Some(5) {
238            continue;
239        }
240
241        let text = clean_text(&cells[columns.text..].join("\t"));
242        if text.is_empty() {
243            continue;
244        }
245
246        let Some(bbox) = tesseract_bbox(&cells, columns) else {
247            continue;
248        };
249        let page_number = parse_usize_cell(&cells, columns.page_num)
250            .unwrap_or(1)
251            .max(1);
252        let key = (
253            page_number,
254            parse_usize_cell(&cells, columns.block_num).unwrap_or(0),
255            parse_usize_cell(&cells, columns.par_num).unwrap_or(0),
256            parse_usize_cell(&cells, columns.line_num).unwrap_or(0),
257        );
258        let word = TesseractWord {
259            text,
260            bbox,
261            confidence: parse_confidence_cell(&cells, columns.conf),
262        };
263
264        if let Some((_, words)) = groups
265            .iter_mut()
266            .find(|(existing_key, _)| *existing_key == key)
267        {
268            words.push(word);
269        } else {
270            groups.push((key, vec![word]));
271        }
272    }
273
274    if groups.is_empty() {
275        return None;
276    }
277
278    Some(
279        groups
280            .into_iter()
281            .filter_map(tesseract_line_block)
282            .collect(),
283    )
284}
285
286impl TesseractTsvColumns {
287    fn from_header(headers: &[String]) -> Option<Self> {
288        Some(Self {
289            level: header_index(headers, "level")?,
290            page_num: header_index(headers, "page_num")?,
291            block_num: header_index(headers, "block_num")?,
292            par_num: header_index(headers, "par_num")?,
293            line_num: header_index(headers, "line_num")?,
294            word_num: header_index(headers, "word_num")?,
295            left: header_index(headers, "left")?,
296            top: header_index(headers, "top")?,
297            width: header_index(headers, "width")?,
298            height: header_index(headers, "height")?,
299            conf: header_index(headers, "conf")?,
300            text: header_index(headers, "text")?,
301        })
302    }
303
304    fn required_max_index(self) -> usize {
305        [
306            self.level,
307            self.page_num,
308            self.block_num,
309            self.par_num,
310            self.line_num,
311            self.word_num,
312            self.left,
313            self.top,
314            self.width,
315            self.height,
316            self.conf,
317        ]
318        .into_iter()
319        .max()
320        .unwrap_or(0)
321    }
322}
323
324fn tesseract_line_block(
325    ((page_number, _, _, _), words): ((usize, usize, usize, usize), Vec<TesseractWord>),
326) -> Option<Block> {
327    if words.is_empty() {
328        return None;
329    }
330
331    let text = words
332        .iter()
333        .map(|word| word.text.as_str())
334        .collect::<Vec<_>>()
335        .join(" ");
336    let bbox = bbox_union(words.iter().map(|word| word.bbox))?;
337    let spans = words
338        .iter()
339        .map(|word| Span {
340            text: word.text.clone(),
341            bbox: Some(word.bbox),
342            font: None,
343            size: None,
344            bold: false,
345            italic: false,
346        })
347        .collect::<Vec<_>>();
348    let confidence = average_confidence(words.iter().filter_map(|word| word.confidence));
349
350    Some(Block::Text(TextBlock {
351        text: text.clone(),
352        kind: "ocr_line".to_owned(),
353        bbox: Some(bbox),
354        lines: vec![Line {
355            text,
356            bbox: Some(bbox),
357            spans,
358        }],
359        source_anchors: vec![SourceAnchor {
360            page_number,
361            pdf_object_ids: Vec::new(),
362            bbox: Some(bbox),
363            extraction_method: EXTRACTION_METHOD.to_owned(),
364        }],
365        confidence: Some(Confidence {
366            score: confidence.unwrap_or(0.9),
367            calibrated: false,
368        }), ..Default::default()
369    }))
370}
371
/// Splits `line` on `delimiter` into owned cells.
///
/// Trailing whitespace of the line is dropped before splitting (so trailing
/// empty tab-separated cells disappear); each cell is then trimmed and loses
/// any surrounding double quotes. Quoted cells containing the delimiter are
/// not treated specially.
fn split_delimited_cells(line: &str, delimiter: char) -> Vec<String> {
    let mut cells = Vec::new();
    for raw in line.trim_end().split(delimiter) {
        let cell = raw.trim().trim_matches('"');
        cells.push(cell.to_owned());
    }
    cells
}
378
379fn header_index(headers: &[String], name: &str) -> Option<usize> {
380    headers
381        .iter()
382        .position(|header| normalize_header(header) == name)
383}
384
/// Canonical form of a header cell: leading U+FEFF characters removed,
/// surrounding whitespace trimmed, ASCII letters lower-cased.
fn normalize_header(header: &str) -> String {
    let without_bom = header.trim_start_matches('\u{feff}');
    without_bom
        .trim()
        .chars()
        .map(|character| character.to_ascii_lowercase())
        .collect()
}
391
392fn tesseract_bbox(cells: &[String], columns: TesseractTsvColumns) -> Option<BBox> {
393    let x = parse_f32_cell(cells, columns.left)?;
394    let y = parse_f32_cell(cells, columns.top)?;
395    let width = parse_f32_cell(cells, columns.width)?;
396    let height = parse_f32_cell(cells, columns.height)?;
397    if width <= 0.0 || height <= 0.0 {
398        return None;
399    }
400    Some(BBox {
401        x,
402        y,
403        width,
404        height,
405    })
406}
407
/// Parses the cell at `index` as `usize`; `None` when the index is out of
/// range or the cell is not a valid unsigned integer.
fn parse_usize_cell(cells: &[String], index: usize) -> Option<usize> {
    cells
        .get(index)
        .and_then(|cell| cell.parse::<usize>().ok())
}
411
/// Parses the cell at `index` as a finite `f32`.
///
/// Returns `None` when the index is out of range, the cell is not a number,
/// or the parsed value is not finite. `f32`'s parser accepts spellings such
/// as `NaN` and `inf` and turns overflowing literals into infinity; none of
/// those is a meaningful coordinate or confidence, so they are rejected here.
fn parse_f32_cell(cells: &[String], index: usize) -> Option<f32> {
    cells
        .get(index)?
        .parse::<f32>()
        .ok()
        .filter(|value| value.is_finite())
}
415
416fn parse_confidence_cell(cells: &[String], index: usize) -> Option<f32> {
417    let confidence = parse_f32_cell(cells, index)?;
418    if confidence < 0.0 {
419        return None;
420    }
421    if confidence > 1.0 {
422        Some((confidence / 100.0).clamp(0.0, 1.0))
423    } else {
424        Some(confidence)
425    }
426}
427
428fn bbox_union(boxes: impl Iterator<Item = BBox>) -> Option<BBox> {
429    let mut min_x = f32::INFINITY;
430    let mut min_y = f32::INFINITY;
431    let mut max_x = f32::NEG_INFINITY;
432    let mut max_y = f32::NEG_INFINITY;
433    let mut has_box = false;
434    for bbox in boxes {
435        has_box = true;
436        min_x = min_x.min(bbox.x);
437        min_y = min_y.min(bbox.y);
438        max_x = max_x.max(bbox.x + bbox.width);
439        max_y = max_y.max(bbox.y + bbox.height);
440    }
441    has_box.then_some(BBox {
442        x: min_x,
443        y: min_y,
444        width: max_x - min_x,
445        height: max_y - min_y,
446    })
447}
448
/// Arithmetic mean of `confidences`, or `None` when the iterator is empty.
fn average_confidence(confidences: impl Iterator<Item = f32>) -> Option<f32> {
    let (sum, samples) = confidences.fold((0.0f32, 0usize), |(sum, samples), value| {
        (sum + value, samples + 1)
    });

    if samples == 0 {
        None
    } else {
        Some(sum / samples as f32)
    }
}
458
459fn ocr_box_row(cells: &[String], delimiter: char) -> Option<(BBox, String)> {
460    if cells.len() < 9 {
461        return None;
462    }
463    let mut coordinates = [0.0f32; 8];
464    for (index, coordinate) in coordinates.iter_mut().enumerate() {
465        *coordinate = cells[index].parse::<f32>().ok()?;
466    }
467    let xs = [
468        coordinates[0],
469        coordinates[2],
470        coordinates[4],
471        coordinates[6],
472    ];
473    let ys = [
474        coordinates[1],
475        coordinates[3],
476        coordinates[5],
477        coordinates[7],
478    ];
479    let min_x = xs.iter().copied().fold(f32::INFINITY, f32::min);
480    let max_x = xs.iter().copied().fold(f32::NEG_INFINITY, f32::max);
481    let min_y = ys.iter().copied().fold(f32::INFINITY, f32::min);
482    let max_y = ys.iter().copied().fold(f32::NEG_INFINITY, f32::max);
483    let separator = if delimiter == ',' { ", " } else { "\t" };
484    let text = cells[8..].join(&separator);
485    Some((
486        BBox {
487            x: min_x,
488            y: min_y,
489            width: max_x - min_x,
490            height: max_y - min_y,
491        },
492        text,
493    ))
494}
495
496fn build_document(source: &Source, engine_name: &str, blocks: Vec<Block>) -> Document {
497    let page_bbox = inferred_page_bbox(&blocks);
498    let (character_count, word_count) = text_counts(&blocks);
499    let block_count = blocks.len();
500    Document {
501        schema_version: SCHEMA_VERSION.to_owned(),
502        metadata: Metadata {
503            format: source.format.clone(),
504            engine: engine_name.to_owned(),
505            source: source.path.clone(),
506            title: None,
507            character_count,
508            word_count,
509            block_count,
510            file_size_bytes: source.bytes.as_ref().map(|bytes| bytes.len() as u64),
511            pdf_version: None,
512            encrypted: false,
513        },
514        pages: vec![Page {
515            number: 1,
516            width: page_bbox.map(|bbox| bbox.width),
517            height: page_bbox.map(|bbox| bbox.height),
518            rotation: None,
519            bbox: page_bbox,
520            blocks,
521            images: Vec::new(),
522            assets: Vec::new(),
523            warnings: Vec::new(), ..Default::default()
524        }],
525        assets: Vec::new(),
526        warnings: Vec::new(),
527    }
528}
529
530fn inferred_page_bbox(blocks: &[Block]) -> Option<BBox> {
531    let mut max_x = 0.0f32;
532    let mut max_y = 0.0f32;
533    let mut has_bbox = false;
534    for block in blocks {
535        let Some(bbox) = block_bbox(block) else {
536            continue;
537        };
538        has_bbox = true;
539        max_x = max_x.max(bbox.x + bbox.width);
540        max_y = max_y.max(bbox.y + bbox.height);
541    }
542    has_bbox.then_some(BBox {
543        x: 0.0,
544        y: 0.0,
545        width: max_x,
546        height: max_y,
547    })
548}
549
550fn block_bbox(block: &Block) -> Option<BBox> {
551    match block {
552        Block::Text(text) => text.bbox,
553        Block::Table(table) => table.bbox,
554        Block::Figure(figure) => figure.bbox,
555    }
556}
557
558fn text_counts(blocks: &[Block]) -> (usize, usize) {
559    let mut character_count = 0;
560    let mut word_count = 0;
561    for block in blocks {
562        let text = match block {
563            Block::Text(text) => text.text.as_str(),
564            _ => "",
565        };
566        character_count += text.chars().count();
567        word_count += text.split_whitespace().count();
568    }
569    (character_count, word_count)
570}
571
572fn delimiter_for_source(source: &Source) -> char {
573    if source
574        .path
575        .as_deref()
576        .map(|path| path.to_ascii_lowercase().ends_with(".tsv"))
577        .unwrap_or(false)
578    {
579        '\t'
580    } else {
581        ','
582    }
583}
584
/// Collapses every run of whitespace in `text` into a single space and drops
/// leading and trailing whitespace.
fn clean_text(text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so an empty buffer means this is the first one.
        if !cleaned.is_empty() {
            cleaned.push(' ');
        }
        cleaned.push_str(word);
    }
    cleaned
}
dongler_core/csv.rs

dongler_core/
csv.rs