Skip to main content

dongler_core/
engine.rs

1use crate::error::Result;
2use crate::ir::{
3    BBox, Block, Confidence, Document, Line, Metadata, Page, SourceAnchor, Span, TableBlock,
4    TextBlock, SCHEMA_VERSION,
5};
6use crate::source::Source;
7
/// Converts a [`Source`] into a structured [`Document`].
pub trait ExtractionEngine {
    /// Returns the static identifier of this engine.
    fn name(&self) -> &'static str;
    /// Builds a [`Document`] from `source`, or reports why that failed.
    fn extract(&self, source: &Source) -> Result<Document>;
}
12
/// Field-less engine whose [`ExtractionEngine`] implementation reports the
/// name `"plain-text"`.
#[derive(Debug, Default, Clone, Copy)]
pub struct PlainTextEngine;
15
16impl ExtractionEngine for PlainTextEngine {
17    fn name(&self) -> &'static str {
18        "plain-text"
19    }
20
21    fn extract(&self, source: &Source) -> Result<Document> {
22        if let Some(document) = docbank_token_label_document(source, self.name()) {
23            return Ok(document);
24        }
25        if let Some(document) = latex_document(source) {
26            return Ok(document);
27        }
28        if let Some(document) = markdown_document(source) {
29            return Ok(document);
30        }
31        text_document_from_paragraphs(source, self.name(), split_paragraphs(&source.content), None)
32    }
33}
34
/// Extraction-method tag written into the source anchors of DocBank-derived blocks.
const DOCBANK_EXTRACTION_METHOD: &str = "docbank_token_labels";
/// Engine name recorded in the metadata of documents built by the LaTeX path.
const LATEX_ENGINE_NAME: &str = "latex-native";
/// Extraction-method tag written into the source anchors of LaTeX-derived blocks.
const LATEX_EXTRACTION_METHOD: &str = "latex_native";
/// Engine name recorded in the metadata of documents built by the Markdown path.
const MARKDOWN_ENGINE_NAME: &str = "markdown-native";
/// NOTE(review): presumably the extraction-method tag for Markdown-derived
/// blocks; its use is not visible in this part of the file — confirm.
const MARKDOWN_EXTRACTION_METHOD: &str = "markdown_native";
40
/// One token row parsed by `docbank_token_from_line`.
#[derive(Debug)]
struct DocBankToken {
    /// Trimmed content of the first tab-separated column.
    text: String,
    /// Trimmed content of the tenth column; always one of the labels accepted
    /// by `is_docbank_label`.
    label: String,
    /// Box built from columns two to five, read as `x0`, `y0`, `x1`, `y1`.
    bbox: BBox,
}
47
/// A run of consecutive tokens that `docbank_lines` grouped together.
#[derive(Debug)]
struct DocBankLine {
    /// Label shared by every token in the run.
    label: String,
    /// `y` coordinate of the first token in the run.
    y: f32,
    /// Largest token height seen in the run so far.
    height: f32,
    /// Tokens of the run, in input order.
    tokens: Vec<DocBankToken>,
}
55
56fn docbank_token_label_document(source: &Source, engine_name: &str) -> Option<Document> {
57    let mut tokens = Vec::new();
58    let mut non_empty_lines = 0usize;
59
60    for line in source.content.lines() {
61        if line.trim().is_empty() {
62            continue;
63        }
64        non_empty_lines += 1;
65        if let Some(token) = docbank_token_from_line(line) {
66            tokens.push(token);
67        }
68    }
69
70    if tokens.is_empty() || tokens.len() != non_empty_lines {
71        return None;
72    }
73
74    let blocks = docbank_lines(tokens)
75        .into_iter()
76        .filter_map(docbank_line_block)
77        .collect::<Vec<_>>();
78    if blocks.is_empty() {
79        return None;
80    }
81
82    let page_bbox = inferred_text_block_bbox(&blocks);
83    let plain_text = blocks
84        .iter()
85        .filter_map(|block| match block {
86            Block::Text(text) => Some(text.text.as_str()),
87            _ => None,
88        })
89        .collect::<Vec<_>>()
90        .join("\n\n");
91
92    Some(Document {
93        schema_version: SCHEMA_VERSION.to_owned(),
94        metadata: Metadata {
95            format: source.format.clone(),
96            engine: engine_name.to_owned(),
97            source: source.path.clone(),
98            title: None,
99            character_count: plain_text.chars().count(),
100            word_count: plain_text.split_whitespace().count(),
101            block_count: blocks.len(),
102            file_size_bytes: source.bytes.as_ref().map(|bytes| bytes.len() as u64),
103            pdf_version: None,
104            encrypted: false,
105        },
106        pages: vec![Page {
107            number: 1,
108            width: page_bbox.map(|bbox| bbox.width),
109            height: page_bbox.map(|bbox| bbox.height),
110            rotation: None,
111            bbox: page_bbox,
112            blocks,
113            images: Vec::new(),
114            assets: Vec::new(),
115            warnings: Vec::new(), ..Default::default()
116        }],
117        assets: Vec::new(),
118        warnings: Vec::new(),
119    })
120}
121
122fn docbank_token_from_line(line: &str) -> Option<DocBankToken> {
123    let cells = line.split('\t').collect::<Vec<_>>();
124    if cells.len() < 10 {
125        return None;
126    }
127    let text = cells[0].trim();
128    let label = cells[9].trim();
129    if text.is_empty() || !is_docbank_label(label) {
130        return None;
131    }
132
133    let x0 = cells[1].parse::<f32>().ok()?;
134    let y0 = cells[2].parse::<f32>().ok()?;
135    let x1 = cells[3].parse::<f32>().ok()?;
136    let y1 = cells[4].parse::<f32>().ok()?;
137    if x1 <= x0 || y1 <= y0 {
138        return None;
139    }
140
141    Some(DocBankToken {
142        text: text.to_owned(),
143        label: label.to_owned(),
144        bbox: BBox {
145            x: x0,
146            y: y0,
147            width: x1 - x0,
148            height: y1 - y0,
149        },
150    })
151}
152
/// Reports whether `label` is one of the recognised DocBank labels.
///
/// The comparison is exact and case-sensitive.
fn is_docbank_label(label: &str) -> bool {
    const KNOWN_LABELS: [&str; 13] = [
        "abstract",
        "author",
        "caption",
        "date",
        "equation",
        "figure",
        "footer",
        "list",
        "paragraph",
        "reference",
        "section",
        "table",
        "title",
    ];
    KNOWN_LABELS.iter().any(|known| *known == label)
}
171
172fn docbank_lines(tokens: Vec<DocBankToken>) -> Vec<DocBankLine> {
173    let mut lines = Vec::new();
174
175    for token in tokens {
176        let same_line = lines
177            .last()
178            .map(|line: &DocBankLine| {
179                line.label == token.label
180                    && (line.y - token.bbox.y).abs() <= line.height.max(token.bbox.height).max(3.0)
181            })
182            .unwrap_or(false);
183        if same_line {
184            if let Some(line) = lines.last_mut() {
185                line.height = line.height.max(token.bbox.height);
186                line.tokens.push(token);
187            }
188        } else {
189            lines.push(DocBankLine {
190                label: token.label.clone(),
191                y: token.bbox.y,
192                height: token.bbox.height,
193                tokens: vec![token],
194            });
195        }
196    }
197
198    lines
199}
200
201fn docbank_line_block(line: DocBankLine) -> Option<Block> {
202    if line.tokens.is_empty() {
203        return None;
204    }
205
206    let text = line
207        .tokens
208        .iter()
209        .map(|token| token.text.as_str())
210        .collect::<Vec<_>>()
211        .join(" ");
212    let bbox = bbox_union(line.tokens.iter().map(|token| token.bbox))?;
213    let spans = line
214        .tokens
215        .iter()
216        .map(|token| Span {
217            text: token.text.clone(),
218            bbox: Some(token.bbox),
219            font: None,
220            size: None,
221            bold: false,
222            italic: false,
223        })
224        .collect::<Vec<_>>();
225
226    Some(Block::Text(TextBlock {
227        text: text.clone(),
228        kind: line.label,
229        bbox: Some(bbox),
230        lines: vec![Line {
231            text,
232            bbox: Some(bbox),
233            spans,
234        }],
235        source_anchors: vec![SourceAnchor {
236            page_number: 1,
237            pdf_object_ids: Vec::new(),
238            bbox: Some(bbox),
239            extraction_method: DOCBANK_EXTRACTION_METHOD.to_owned(),
240        }],
241        confidence: Some(Confidence {
242            score: 0.9,
243            calibrated: false,
244        }), ..Default::default()
245    }))
246}
247
248fn inferred_text_block_bbox(blocks: &[Block]) -> Option<BBox> {
249    let mut max_x = 0.0f32;
250    let mut max_y = 0.0f32;
251    let mut has_bbox = false;
252    for block in blocks {
253        let Block::Text(text) = block else {
254            continue;
255        };
256        let Some(bbox) = text.bbox else {
257            continue;
258        };
259        has_bbox = true;
260        max_x = max_x.max(bbox.x + bbox.width);
261        max_y = max_y.max(bbox.y + bbox.height);
262    }
263    has_bbox.then_some(BBox {
264        x: 0.0,
265        y: 0.0,
266        width: max_x,
267        height: max_y,
268    })
269}
270
271fn bbox_union(boxes: impl Iterator<Item = BBox>) -> Option<BBox> {
272    let mut min_x = f32::INFINITY;
273    let mut min_y = f32::INFINITY;
274    let mut max_x = f32::NEG_INFINITY;
275    let mut max_y = f32::NEG_INFINITY;
276    let mut has_box = false;
277    for bbox in boxes {
278        has_box = true;
279        min_x = min_x.min(bbox.x);
280        min_y = min_y.min(bbox.y);
281        max_x = max_x.max(bbox.x + bbox.width);
282        max_y = max_y.max(bbox.y + bbox.height);
283    }
284    has_box.then_some(BBox {
285        x: min_x,
286        y: min_y,
287        width: max_x - min_x,
288        height: max_y - min_y,
289    })
290}
291
292fn document_from_blocks(
293    source: &Source,
294    engine_name: &str,
295    title: Option<String>,
296    blocks: Vec<Block>,
297) -> Option<Document> {
298    if blocks.is_empty() {
299        return None;
300    }
301    let plain_text = blocks
302        .iter()
303        .map(block_markdown_text)
304        .filter(|text| !text.is_empty())
305        .collect::<Vec<_>>()
306        .join("\n\n");
307
308    Some(Document {
309        schema_version: SCHEMA_VERSION.to_owned(),
310        metadata: Metadata {
311            format: source.format.clone(),
312            engine: engine_name.to_owned(),
313            source: source.path.clone(),
314            title,
315            character_count: plain_text.chars().count(),
316            word_count: plain_text.split_whitespace().count(),
317            block_count: blocks.len(),
318            file_size_bytes: source.bytes.as_ref().map(|bytes| bytes.len() as u64),
319            pdf_version: None,
320            encrypted: false,
321        },
322        pages: vec![Page {
323            number: 1,
324            width: None,
325            height: None,
326            rotation: None,
327            bbox: None,
328            blocks,
329            images: Vec::new(),
330            assets: Vec::new(),
331            warnings: Vec::new(), ..Default::default()
332        }],
333        assets: Vec::new(),
334        warnings: Vec::new(),
335    })
336}
337
338fn latex_document(source: &Source) -> Option<Document> {
339    if !is_latex_source(source) {
340        return None;
341    }
342
343    let stripped = strip_latex_comments(&source.content);
344    let title = latex_command_argument(&stripped, "title").map(|text| clean_latex_inline(&text));
345    let body = latex_document_body(&stripped);
346    let blocks = latex_blocks(body, title.clone());
347    document_from_blocks(source, LATEX_ENGINE_NAME, title, blocks)
348}
349
350fn is_latex_source(source: &Source) -> bool {
351    source
352        .path
353        .as_deref()
354        .map(|path| {
355            let path = path.to_ascii_lowercase();
356            path.ends_with(".tex")
357                || path.ends_with(".latex")
358                || path.ends_with(".ltx")
359                || path.ends_with(".tex.gz")
360                || path.ends_with(".latex.gz")
361                || path.ends_with(".ltx.gz")
362        })
363        .unwrap_or(false)
364}
365
/// Removes `%` comments from every line of `text`.
///
/// A `%` preceded by an unescaped backslash is kept. Every input line is
/// written back followed by `\n`.
fn strip_latex_comments(text: &str) -> String {
    let mut output = String::with_capacity(text.len());

    for line in text.lines() {
        let mut after_backslash = false;
        let comment_start = line.char_indices().find_map(|(offset, character)| {
            if character == '%' && !after_backslash {
                return Some(offset);
            }
            // A backslash escapes the next character unless it is itself escaped.
            after_backslash = character == '\\' && !after_backslash;
            None
        });

        output.push_str(&line[..comment_start.unwrap_or(line.len())]);
        output.push('\n');
    }

    output
}
384
/// Returns the text between `\begin{document}` and `\end{document}`.
///
/// Without a begin marker the whole text is returned; without an end marker
/// everything after the begin marker is returned.
fn latex_document_body(text: &str) -> &str {
    const BEGIN_DOCUMENT: &str = "\\begin{document}";
    const END_DOCUMENT: &str = "\\end{document}";

    match text.split_once(BEGIN_DOCUMENT) {
        None => text,
        Some((_, after_begin)) => match after_begin.split_once(END_DOCUMENT) {
            Some((body, _)) => body,
            None => after_begin,
        },
    }
}
397
398fn latex_blocks(body: &str, title: Option<String>) -> Vec<Block> {
399    let lines = body.lines().collect::<Vec<_>>();
400    let mut blocks = Vec::new();
401    let mut paragraph = Vec::new();
402    let mut index = 0usize;
403
404    if let Some(title) = title.filter(|title| !title.is_empty()) {
405        blocks.push(latex_text_block(title, "heading_1".to_owned()));
406    }
407
408    while index < lines.len() {
409        let trimmed = lines[index].trim();
410        if trimmed.is_empty() {
411            flush_latex_paragraph(&mut blocks, &mut paragraph);
412            index += 1;
413            continue;
414        }
415        if is_latex_skip_line(trimmed) {
416            flush_latex_paragraph(&mut blocks, &mut paragraph);
417            index += 1;
418            continue;
419        }
420        if let Some((level, text)) = latex_heading(trimmed) {
421            flush_latex_paragraph(&mut blocks, &mut paragraph);
422            blocks.push(latex_text_block(text, format!("heading_{level}")));
423            index += 1;
424            continue;
425        }
426        if contains_latex_begin(trimmed, "abstract") {
427            flush_latex_paragraph(&mut blocks, &mut paragraph);
428            let (environment, next_index) = collect_latex_environment(&lines, index, &["abstract"]);
429            if let Some(abstract_text) = latex_environment_body(&environment, "abstract") {
430                let text = clean_latex_inline(&abstract_text);
431                if !text.is_empty() {
432                    blocks.push(latex_text_block(text, "abstract".to_owned()));
433                }
434            }
435            index = next_index;
436            continue;
437        }
438        if contains_any_latex_begin(trimmed, &["itemize", "enumerate"]) {
439            flush_latex_paragraph(&mut blocks, &mut paragraph);
440            let (environment, next_index) =
441                collect_latex_environment(&lines, index, &["itemize", "enumerate"]);
442            if let Some(block) = latex_list_block(&environment) {
443                blocks.push(block);
444            }
445            index = next_index;
446            continue;
447        }
448        if contains_any_latex_begin(
449            trimmed,
450            &[
451                "table",
452                "table*",
453                "tabular",
454                "tabular*",
455                "tabularx",
456                "longtable",
457                "array",
458            ],
459        ) {
460            flush_latex_paragraph(&mut blocks, &mut paragraph);
461            let (environment, next_index) = collect_latex_environment(
462                &lines,
463                index,
464                &[
465                    "table",
466                    "table*",
467                    "tabular",
468                    "tabular*",
469                    "tabularx",
470                    "longtable",
471                    "array",
472                ],
473            );
474            if let Some(block) = latex_table_block(&environment) {
475                blocks.push(block);
476            }
477            index = next_index;
478            continue;
479        }
480
481        let text = clean_latex_inline(trimmed);
482        if !text.is_empty() {
483            paragraph.push(text);
484        }
485        index += 1;
486    }
487
488    flush_latex_paragraph(&mut blocks, &mut paragraph);
489    blocks
490}
491
492fn flush_latex_paragraph(blocks: &mut Vec<Block>, paragraph: &mut Vec<String>) {
493    if paragraph.is_empty() {
494        return;
495    }
496    blocks.push(latex_text_block(
497        paragraph.join(" "),
498        "paragraph".to_owned(),
499    ));
500    paragraph.clear();
501}
502
503fn is_latex_skip_line(line: &str) -> bool {
504    matches!(
505        latex_command_name_at(line, 1).as_deref(),
506        Some(
507            "author"
508                | "date"
509                | "documentclass"
510                | "end"
511                | "input"
512                | "include"
513                | "label"
514                | "maketitle"
515                | "newcommand"
516                | "renewcommand"
517                | "bibliography"
518                | "bibliographystyle"
519                | "usepackage"
520        )
521    )
522}
523
524fn latex_heading(line: &str) -> Option<(usize, String)> {
525    for (command, level) in [
526        ("part", 1usize),
527        ("chapter", 1),
528        ("section", 1),
529        ("subsection", 2),
530        ("subsubsection", 3),
531        ("paragraph", 4),
532        ("subparagraph", 5),
533    ] {
534        if let Some(text) = latex_line_command_argument(line, command) {
535            let text = clean_latex_inline(&text);
536            if !text.is_empty() {
537                return Some((level, text));
538            }
539        }
540    }
541    None
542}
543
544fn latex_line_command_argument(line: &str, command: &str) -> Option<String> {
545    let trimmed = line.trim_start();
546    let marker = format!("\\{command}");
547    if !trimmed.starts_with(&marker) {
548        return None;
549    }
550    latex_command_argument(trimmed, command)
551}
552
/// Reports whether `line` contains `\begin{name}` for at least one of `names`.
fn contains_any_latex_begin(line: &str, names: &[&str]) -> bool {
    for name in names {
        if line.contains(&format!("\\begin{{{name}}}")) {
            return true;
        }
    }
    false
}
556
/// Reports whether `line` contains the exact text `\begin{name}`.
fn contains_latex_begin(line: &str, name: &str) -> bool {
    let mut marker = String::with_capacity("\\begin{}".len() + name.len());
    marker.push_str("\\begin{");
    marker.push_str(name);
    marker.push('}');
    line.contains(marker.as_str())
}
560
/// Collects lines starting at `index` up to and including the first line that
/// contains `\end{name}` for any of `names`.
///
/// Returns the collected lines, each followed by `\n`, together with the
/// index of the first line that was not consumed. Nesting is not tracked: the
/// first matching end marker stops the collection.
fn collect_latex_environment(lines: &[&str], index: usize, names: &[&str]) -> (String, usize) {
    let end_markers: Vec<String> = names
        .iter()
        .map(|name| format!("\\end{{{name}}}"))
        .collect();

    let mut collected = String::new();
    let mut consumed = 0usize;
    for line in lines.iter().skip(index) {
        collected.push_str(line);
        collected.push('\n');
        consumed += 1;
        if end_markers
            .iter()
            .any(|marker| line.contains(marker.as_str()))
        {
            break;
        }
    }

    (collected, index + consumed)
}
578
579fn latex_list_block(environment: &str) -> Option<Block> {
580    let body = latex_environment_body(environment, "itemize")
581        .or_else(|| latex_environment_body(environment, "enumerate"))?;
582    let items = latex_item_texts(&body);
583    if items.is_empty() {
584        return None;
585    }
586    Some(latex_text_block(items.join("\n"), "list".to_owned()))
587}
588
589fn latex_item_texts(body: &str) -> Vec<String> {
590    let mut items = Vec::new();
591    let mut search_start = 0usize;
592    while let Some(relative_start) = body[search_start..].find("\\item") {
593        let item_start = search_start + relative_start;
594        let mut content_start = item_start + "\\item".len();
595        content_start = skip_latex_whitespace(body, content_start);
596        if body.as_bytes().get(content_start) == Some(&b'[') {
597            content_start = skip_latex_optional_argument(body, content_start);
598            content_start = skip_latex_whitespace(body, content_start);
599        }
600        let next_item = body[content_start..]
601            .find("\\item")
602            .map(|relative| content_start + relative)
603            .unwrap_or(body.len());
604        let text = clean_latex_inline(&body[content_start..next_item]);
605        if !text.is_empty() {
606            items.push(text);
607        }
608        search_start = next_item;
609    }
610    items
611}
612
613fn latex_table_block(environment: &str) -> Option<Block> {
614    let caption =
615        latex_command_argument(environment, "caption").map(|text| clean_latex_inline(&text));
616    let body = latex_environment_body(environment, "tabular")
617        .or_else(|| latex_environment_body(environment, "tabular*"))
618        .or_else(|| latex_environment_body(environment, "tabularx"))
619        .or_else(|| latex_environment_body(environment, "longtable"))
620        .or_else(|| latex_environment_body(environment, "array"))?;
621
622    let mut rows = split_latex_table_rows(&body)
623        .into_iter()
624        .filter_map(|row| latex_table_row(&row))
625        .collect::<Vec<_>>();
626    if rows.is_empty() {
627        return None;
628    }
629
630    let headers = if rows.len() > 1 {
631        rows.remove(0)
632    } else {
633        Vec::new()
634    };
635
636    Some(Block::Table(TableBlock {
637        headers,
638        rows,
639        caption,
640        bbox: None,
641        cells: Vec::new(),
642        source_anchors: vec![latex_source_anchor()],
643        confidence: Some(latex_confidence()), ..Default::default()
644    }))
645}
646
/// Splits a table body into rows at every `\\`.
///
/// Each segment before a `\\` is kept, even when empty. The text after the
/// last `\\` is kept only when it is not blank.
fn split_latex_table_rows(body: &str) -> Vec<String> {
    let mut rows: Vec<String> = body.split("\\\\").map(str::to_owned).collect();

    // `split` always yields a final segment; drop it when it is only whitespace.
    let trailing_is_blank = matches!(rows.last(), Some(last) if last.trim().is_empty());
    if trailing_is_blank {
        rows.pop();
    }
    rows
}
667
668fn latex_table_row(row: &str) -> Option<Vec<String>> {
669    let row = strip_latex_table_rules(row);
670    let cells = split_latex_cells(&row)
671        .into_iter()
672        .map(|cell| clean_latex_inline(&cell))
673        .filter(|cell| !cell.is_empty())
674        .collect::<Vec<_>>();
675    if cells.is_empty() {
676        None
677    } else {
678        Some(cells)
679    }
680}
681
/// Replaces table rule commands in `row` with a single space.
///
/// `\hline`, `\toprule`, `\midrule` and `\bottomrule` are replaced on their
/// own. `\cmidrule` and `\cline` are replaced together with the arguments that
/// directly follow them: an optional `[...]`, an optional `(...)` and one
/// `{...}`. Leaving those arguments behind would leak column ranges such as
/// `2-3` into the first cell of the row.
fn strip_latex_table_rules(row: &str) -> String {
    const PLAIN_RULES: [&str; 4] = ["\\hline", "\\toprule", "\\midrule", "\\bottomrule"];
    const SPANNING_RULES: [&str; 2] = ["\\cmidrule", "\\cline"];

    // Skips one `open ... close` group at the start of `text`. An absent or
    // unterminated group leaves `text` untouched.
    fn skip_group(text: &str, open: char, close: char) -> &str {
        text.strip_prefix(open)
            .and_then(|inner| {
                inner
                    .find(close)
                    .map(|end| &inner[end + close.len_utf8()..])
            })
            .unwrap_or(text)
    }

    let mut cleaned = String::with_capacity(row.len());
    let mut rest = row;
    while let Some(character) = rest.chars().next() {
        if character == '\\' {
            let spanning = SPANNING_RULES
                .iter()
                .find_map(|command| rest.strip_prefix(*command));
            if let Some(after_command) = spanning {
                let after_width = skip_group(after_command, '[', ']');
                let after_trim = skip_group(after_width, '(', ')');
                rest = skip_group(after_trim, '{', '}');
                cleaned.push(' ');
                continue;
            }

            let plain = PLAIN_RULES
                .iter()
                .find_map(|command| rest.strip_prefix(*command));
            if let Some(after_command) = plain {
                rest = after_command;
                cleaned.push(' ');
                continue;
            }
        }

        cleaned.push(character);
        rest = &rest[character.len_utf8()..];
    }
    cleaned
}
696
/// Splits a table row into cells at every `&` that is not escaped by a
/// backslash. Always returns at least one cell; cell text is not trimmed.
fn split_latex_cells(row: &str) -> Vec<String> {
    let mut cells = Vec::new();
    let mut cell_start = 0usize;
    let mut after_backslash = false;

    for (offset, character) in row.char_indices() {
        if character == '&' && !after_backslash {
            cells.push(row[cell_start..offset].to_owned());
            cell_start = offset + character.len_utf8();
        } else {
            // A backslash escapes the next character unless it is itself escaped.
            after_backslash = character == '\\' && !after_backslash;
        }
    }

    cells.push(row[cell_start..].to_owned());
    cells
}
716
717fn latex_environment_body(text: &str, name: &str) -> Option<String> {
718    let marker = format!("\\begin{{{name}}}");
719    let start = text.find(&marker)?;
720    let mut body_start = start + marker.len();
721    loop {
722        body_start = skip_latex_whitespace(text, body_start);
723        match text.as_bytes().get(body_start) {
724            Some(b'[') => body_start = skip_latex_optional_argument(text, body_start),
725            Some(b'{') => {
726                let (_, end) = read_latex_braced_argument(text, body_start)?;
727                body_start = end;
728            }
729            _ => break,
730        }
731    }
732    let end_marker = format!("\\end{{{name}}}");
733    let end = text[body_start..]
734        .find(&end_marker)
735        .map(|relative| body_start + relative)
736        .unwrap_or(text.len());
737    Some(text[body_start..end].to_owned())
738}
739
740fn latex_command_argument(text: &str, command: &str) -> Option<String> {
741    let marker = format!("\\{command}");
742    let mut search_start = 0usize;
743    while let Some(relative_start) = text[search_start..].find(&marker) {
744        let start = search_start + relative_start;
745        let mut cursor = start + marker.len();
746        if text[cursor..]
747            .chars()
748            .next()
749            .map(|character| character.is_ascii_alphabetic())
750            .unwrap_or(false)
751        {
752            search_start = cursor;
753            continue;
754        }
755        if text.as_bytes().get(cursor) == Some(&b'*') {
756            cursor += 1;
757        }
758        cursor = skip_latex_whitespace(text, cursor);
759        if text.as_bytes().get(cursor) == Some(&b'[') {
760            cursor = skip_latex_optional_argument(text, cursor);
761            cursor = skip_latex_whitespace(text, cursor);
762        }
763        if text.as_bytes().get(cursor) == Some(&b'{') {
764            let (argument, _) = read_latex_braced_argument(text, cursor)?;
765            return Some(argument);
766        }
767        search_start = cursor.max(start + 1);
768    }
769    None
770}
771
/// Reads the balanced `{...}` group that opens at byte offset `open`.
///
/// Returns the text between the outer braces and the offset just past the
/// closing brace. Braces preceded by an unescaped backslash do not count.
/// Returns `None` when `open` is not at a `{` or the group is never closed.
fn read_latex_braced_argument(text: &str, open: usize) -> Option<(String, usize)> {
    let group = text.get(open..).filter(|tail| tail.starts_with('{'))?;

    let mut depth = 0usize;
    let mut after_backslash = false;
    for (offset, character) in group.char_indices() {
        match character {
            '{' if !after_backslash => depth += 1,
            '}' if !after_backslash => {
                depth = depth.saturating_sub(1);
                if depth == 0 {
                    return Some((group[1..offset].to_owned(), open + offset + 1));
                }
            }
            _ => {}
        }
        after_backslash = character == '\\' && !after_backslash;
    }
    None
}
795
/// Returns the offset just past the `[...]` group that opens at `open`.
///
/// A `]` preceded by an unescaped backslash does not close the group. When
/// `open` is not at a `[` the offset is returned unchanged; when the group is
/// never closed only the `[` itself is skipped.
fn skip_latex_optional_argument(text: &str, open: usize) -> usize {
    if text.as_bytes().get(open).copied() != Some(b'[') {
        return open;
    }

    let inner_start = open + 1;
    let mut after_backslash = false;
    let closing = text[inner_start..].char_indices().find(|&(_, character)| {
        let closes = character == ']' && !after_backslash;
        after_backslash = character == '\\' && !after_backslash;
        closes
    });

    match closing {
        Some((offset, _)) => inner_start + offset + 1,
        None => inner_start,
    }
}
812
/// Returns the first byte offset at or after `pos` that does not hold ASCII
/// whitespace. Offsets beyond the end of `text` are returned unchanged.
fn skip_latex_whitespace(text: &str, pos: usize) -> usize {
    match text.as_bytes().get(pos..) {
        Some(rest) => {
            let blanks = rest
                .iter()
                .take_while(|byte| byte.is_ascii_whitespace())
                .count();
            pos + blanks
        }
        None => pos,
    }
}
819
820fn clean_latex_inline(text: &str) -> String {
821    let mut output = String::with_capacity(text.len());
822    let mut pos = 0usize;
823    while pos < text.len() {
824        let character = text[pos..].chars().next().unwrap();
825        if character == '\\' {
826            let next_pos = pos + character.len_utf8();
827            let Some(next_character) = text[next_pos..].chars().next() else {
828                break;
829            };
830            if next_character == '\\' {
831                output.push(' ');
832                pos = next_pos + next_character.len_utf8();
833                continue;
834            }
835            if matches!(
836                next_character,
837                '%' | '&' | '_' | '$' | '#' | '{' | '}' | '[' | ']'
838            ) {
839                output.push(next_character);
840                pos = next_pos + next_character.len_utf8();
841                continue;
842            }
843            let (name, after_name) = latex_command_name(text, next_pos);
844            if name.is_empty() {
845                pos = next_pos;
846                continue;
847            }
848            let (replacement, after_command) =
849                clean_latex_command_argument(text, &name, after_name);
850            output.push_str(&replacement);
851            pos = after_command;
852            continue;
853        }
854        if matches!(character, '{' | '}' | '$') {
855            pos += character.len_utf8();
856            continue;
857        }
858        if character == '~' {
859            output.push(' ');
860        } else {
861            output.push(character);
862        }
863        pos += character.len_utf8();
864    }
865    output.split_whitespace().collect::<Vec<_>>().join(" ")
866}
867
868fn clean_latex_command_argument(text: &str, name: &str, after_name: usize) -> (String, usize) {
869    let mut cursor = skip_latex_whitespace(text, after_name);
870    if text.as_bytes().get(cursor) == Some(&b'[') {
871        cursor = skip_latex_optional_argument(text, cursor);
872        cursor = skip_latex_whitespace(text, cursor);
873    }
874
875    if matches!(
876        name,
877        "label" | "pageref" | "ref" | "cite" | "citep" | "citet"
878    ) {
879        if text.as_bytes().get(cursor) == Some(&b'{') {
880            let (_, end) = read_latex_braced_argument(text, cursor).unwrap_or_default();
881            return (String::new(), end.max(cursor + 1));
882        }
883        return (String::new(), cursor);
884    }
885
886    if name == "href" {
887        if text.as_bytes().get(cursor) == Some(&b'{') {
888            let (_, first_end) = read_latex_braced_argument(text, cursor).unwrap_or_default();
889            let second_start = skip_latex_whitespace(text, first_end);
890            if text.as_bytes().get(second_start) == Some(&b'{') {
891                if let Some((argument, end)) = read_latex_braced_argument(text, second_start) {
892                    return (clean_latex_inline(&argument), end);
893                }
894            }
895            return (String::new(), first_end.max(cursor + 1));
896        }
897    }
898
899    if matches!(name, "multicolumn" | "multirow") {
900        let mut arguments = Vec::new();
901        for _ in 0..3 {
902            cursor = skip_latex_whitespace(text, cursor);
903            if text.as_bytes().get(cursor) != Some(&b'{') {
904                break;
905            }
906            if let Some((argument, end)) = read_latex_braced_argument(text, cursor) {
907                arguments.push(argument);
908                cursor = end;
909            }
910        }
911        return (
912            arguments
913                .last()
914                .map(|argument| clean_latex_inline(argument))
915                .unwrap_or_default(),
916            cursor,
917        );
918    }
919
920    if text.as_bytes().get(cursor) == Some(&b'{') {
921        if let Some((argument, end)) = read_latex_braced_argument(text, cursor) {
922            return (clean_latex_inline(&argument), end);
923        }
924    }
925
926    let replacement = match name {
927        "LaTeX" => "LaTeX",
928        "TeX" => "TeX",
929        "quad" | "qquad" | "enspace" | "thinspace" => " ",
930        _ => "",
931    };
932    (replacement.to_owned(), cursor)
933}
934
/// Reads a command name starting at byte offset `start`.
///
/// The name is the longest run of ASCII letters; when there is none, the
/// single character at `start` is the name. Returns the name and the offset
/// just past it, or an empty name and `start` at the end of `text`.
fn latex_command_name(text: &str, start: usize) -> (String, usize) {
    let tail = &text[start..];

    let letters = tail
        .bytes()
        .take_while(u8::is_ascii_alphabetic)
        .count();
    if letters > 0 {
        return (tail[..letters].to_owned(), start + letters);
    }

    match tail.chars().next() {
        Some(symbol) => (symbol.to_string(), start + symbol.len_utf8()),
        None => (String::new(), start),
    }
}
953
954fn latex_command_name_at(line: &str, start: usize) -> Option<String> {
955    if !line.starts_with('\\') {
956        return None;
957    }
958    let (name, _) = latex_command_name(line, start);
959    (!name.is_empty()).then_some(name)
960}
961
962fn latex_text_block(text: String, kind: String) -> Block {
963    Block::Text(TextBlock {
964        text,
965        kind,
966        bbox: None,
967        lines: Vec::new(),
968        source_anchors: vec![latex_source_anchor()],
969        confidence: Some(latex_confidence()), ..Default::default()
970    })
971}
972
973fn latex_source_anchor() -> SourceAnchor {
974    SourceAnchor {
975        page_number: 1,
976        pdf_object_ids: Vec::new(),
977        bbox: None,
978        extraction_method: LATEX_EXTRACTION_METHOD.to_owned(),
979    }
980}
981
982fn latex_confidence() -> Confidence {
983    Confidence {
984        score: 0.85,
985        calibrated: false,
986    }
987}
988
989fn markdown_document(source: &Source) -> Option<Document> {
990    if !is_markdown_source(source) {
991        return None;
992    }
993
994    let blocks = markdown_blocks(&source.content);
995    document_from_blocks(source, MARKDOWN_ENGINE_NAME, None, blocks)
996}
997
998fn is_markdown_source(source: &Source) -> bool {
999    source
1000        .path
1001        .as_deref()
1002        .map(|path| {
1003            let path = path.to_ascii_lowercase();
1004            path.ends_with(".md") || path.ends_with(".markdown")
1005        })
1006        .unwrap_or(false)
1007}
1008
1009fn markdown_blocks(content: &str) -> Vec<Block> {
1010    let lines = content.lines().collect::<Vec<_>>();
1011    let mut blocks = Vec::new();
1012    let mut paragraph = Vec::new();
1013    let mut index = 0usize;
1014
1015    while index < lines.len() {
1016        let trimmed = lines[index].trim();
1017        if trimmed.is_empty() {
1018            flush_markdown_paragraph(&mut blocks, &mut paragraph);
1019            index += 1;
1020            continue;
1021        }
1022        if let Some((level, text)) = markdown_heading(trimmed) {
1023            flush_markdown_paragraph(&mut blocks, &mut paragraph);
1024            blocks.push(markdown_text_block(text, format!("heading_{level}")));
1025            index += 1;
1026            continue;
1027        }
1028        if is_markdown_table_start(&lines, index) {
1029            flush_markdown_paragraph(&mut blocks, &mut paragraph);
1030            let (table, next_index) = markdown_table_block(&lines, index);
1031            blocks.push(table);
1032            index = next_index;
1033            continue;
1034        }
1035        if is_markdown_list_item(trimmed) {
1036            flush_markdown_paragraph(&mut blocks, &mut paragraph);
1037            let (list, next_index) = markdown_list_block(&lines, index);
1038            blocks.push(list);
1039            index = next_index;
1040            continue;
1041        }
1042
1043        paragraph.push(trimmed.to_owned());
1044        index += 1;
1045    }
1046
1047    flush_markdown_paragraph(&mut blocks, &mut paragraph);
1048    blocks
1049}
1050
1051fn flush_markdown_paragraph(blocks: &mut Vec<Block>, paragraph: &mut Vec<String>) {
1052    if paragraph.is_empty() {
1053        return;
1054    }
1055    blocks.push(markdown_text_block(
1056        paragraph.join(" "),
1057        "paragraph".to_owned(),
1058    ));
1059    paragraph.clear();
1060}
1061
1062fn markdown_heading(line: &str) -> Option<(usize, String)> {
1063    let hashes = line
1064        .chars()
1065        .take_while(|character| *character == '#')
1066        .count();
1067    if hashes == 0 || hashes > 6 {
1068        return None;
1069    }
1070    let text = line.get(hashes..)?.trim();
1071    if text.is_empty() {
1072        return None;
1073    }
1074    Some((hashes, clean_markdown_inline(text)))
1075}
1076
1077fn is_markdown_table_start(lines: &[&str], index: usize) -> bool {
1078    index + 1 < lines.len()
1079        && markdown_row_cells(lines[index]).len() >= 2
1080        && is_markdown_separator_row(lines[index + 1])
1081}
1082
1083fn markdown_table_block(lines: &[&str], index: usize) -> (Block, usize) {
1084    let headers = markdown_row_cells(lines[index]);
1085    let mut rows = Vec::new();
1086    let mut next_index = index + 2;
1087
1088    while next_index < lines.len() {
1089        let line = lines[next_index].trim();
1090        if line.is_empty() || !line.contains('|') {
1091            break;
1092        }
1093        let row = markdown_row_cells(line);
1094        if row.is_empty() {
1095            break;
1096        }
1097        rows.push(row);
1098        next_index += 1;
1099    }
1100
1101    (
1102        Block::Table(TableBlock {
1103            headers,
1104            rows,
1105            caption: None,
1106            bbox: None,
1107            cells: Vec::new(),
1108            source_anchors: vec![markdown_source_anchor()],
1109            confidence: Some(markdown_confidence()), ..Default::default()
1110        }),
1111        next_index,
1112    )
1113}
1114
1115fn markdown_row_cells(line: &str) -> Vec<String> {
1116    let trimmed = line.trim().trim_matches('|');
1117    trimmed
1118        .split('|')
1119        .map(|cell| clean_markdown_inline(cell.trim()))
1120        .collect::<Vec<_>>()
1121}
1122
/// Reports whether `line` is a table separator row such as `| --- | :-: |`.
///
/// The row must contain at least two cells, and every cell — after trimming
/// whitespace and surrounding `:` alignment markers — must be a non-empty run
/// of `-`.
fn is_markdown_separator_row(line: &str) -> bool {
    let inner = line.trim().trim_matches('|');
    let mut column_count = 0usize;
    for cell in inner.split('|') {
        let dashes = cell.trim().trim_matches(':');
        if dashes.is_empty() || dashes.chars().any(|character| character != '-') {
            return false;
        }
        column_count += 1;
    }
    column_count >= 2
}
1134
1135fn is_markdown_list_item(line: &str) -> bool {
1136    markdown_list_text(line).is_some()
1137}
1138
1139fn markdown_list_block(lines: &[&str], index: usize) -> (Block, usize) {
1140    let mut items = Vec::new();
1141    let mut next_index = index;
1142    while next_index < lines.len() {
1143        let trimmed = lines[next_index].trim();
1144        let Some(item) = markdown_list_text(trimmed) else {
1145            break;
1146        };
1147        items.push(item);
1148        next_index += 1;
1149    }
1150    (
1151        markdown_text_block(items.join("\n"), "list".to_owned()),
1152        next_index,
1153    )
1154}
1155
1156fn markdown_list_text(line: &str) -> Option<String> {
1157    if let Some(text) = line.strip_prefix("- ").or_else(|| line.strip_prefix("* ")) {
1158        return Some(clean_markdown_inline(text));
1159    }
1160    let dot = line.find('.')?;
1161    if dot == 0
1162        || dot + 1 >= line.len()
1163        || !line[..dot]
1164            .chars()
1165            .all(|character| character.is_ascii_digit())
1166    {
1167        return None;
1168    }
1169    line[dot + 1..].strip_prefix(' ').map(clean_markdown_inline)
1170}
1171
/// Normalises inline Markdown text.
///
/// Surrounding whitespace is trimmed and internal whitespace runs collapse to
/// single spaces. Backticks at the edges are removed only when that leaves no
/// backtick inside the text, i.e. the whole text was one code span
/// (`` `code` `` → `code`). Text that merely starts or ends with a code span
/// (`` `foo` bar ``) is left intact so the remaining backticks stay balanced.
fn clean_markdown_inline(text: &str) -> String {
    let trimmed = text.trim();
    let unwrapped = trimmed.trim_matches('`');
    // Interior backticks mean the edge backticks belong to separate spans;
    // stripping them would leave dangling markers behind.
    let body = if unwrapped.contains('`') {
        trimmed
    } else {
        unwrapped
    };
    body.split_whitespace().collect::<Vec<_>>().join(" ")
}
1179
1180fn markdown_text_block(text: String, kind: String) -> Block {
1181    Block::Text(TextBlock {
1182        text,
1183        kind,
1184        bbox: None,
1185        lines: Vec::new(),
1186        source_anchors: vec![markdown_source_anchor()],
1187        confidence: Some(markdown_confidence()), ..Default::default()
1188    })
1189}
1190
1191fn markdown_source_anchor() -> SourceAnchor {
1192    SourceAnchor {
1193        page_number: 1,
1194        pdf_object_ids: Vec::new(),
1195        bbox: None,
1196        extraction_method: MARKDOWN_EXTRACTION_METHOD.to_owned(),
1197    }
1198}
1199
1200fn markdown_confidence() -> Confidence {
1201    Confidence {
1202        score: 0.9,
1203        calibrated: false,
1204    }
1205}
1206
1207fn block_markdown_text(block: &Block) -> String {
1208    match block {
1209        Block::Text(text) => text.text.clone(),
1210        Block::Table(table) => {
1211            let mut rows = Vec::new();
1212            if !table.headers.is_empty() {
1213                rows.push(table.headers.join(" "));
1214            }
1215            rows.extend(table.rows.iter().map(|row| row.join(" ")));
1216            rows.join("\n")
1217        }
1218        Block::Figure(figure) => figure.caption.clone().unwrap_or_default(),
1219    }
1220}
1221
1222pub(crate) fn split_paragraphs(text: &str) -> Vec<String> {
1223    let mut paragraphs = Vec::new();
1224    let mut current = Vec::new();
1225
1226    for line in text.lines() {
1227        let trimmed = line.trim();
1228        if trimmed.is_empty() {
1229            flush_paragraph(&mut paragraphs, &mut current);
1230        } else {
1231            current.push(trimmed.to_owned());
1232        }
1233    }
1234
1235    flush_paragraph(&mut paragraphs, &mut current);
1236    paragraphs
1237}
1238
/// Appends the lines in `current`, joined with single spaces, to `paragraphs`
/// and empties `current`. Does nothing when `current` is already empty.
fn flush_paragraph(paragraphs: &mut Vec<String>, current: &mut Vec<String>) {
    if current.is_empty() {
        return;
    }
    let joined = current.join(" ");
    current.clear();
    paragraphs.push(joined);
}
1245
1246pub(crate) fn text_document_from_text(
1247    source: &Source,
1248    engine_name: &str,
1249    text: &str,
1250    title: Option<String>,
1251) -> Result<Document> {
1252    text_document_from_paragraphs(source, engine_name, split_paragraphs(text), title)
1253}
1254
1255pub(crate) fn text_document_from_paragraphs(
1256    source: &Source,
1257    engine_name: &str,
1258    paragraphs: Vec<String>,
1259    title: Option<String>,
1260) -> Result<Document> {
1261    let blocks = paragraphs
1262        .into_iter()
1263        .filter(|text| !text.trim().is_empty())
1264        .map(|text| {
1265            Block::Text(TextBlock {
1266                text,
1267                kind: "paragraph".to_owned(),
1268                bbox: None,
1269                lines: Vec::new(),
1270                source_anchors: vec![SourceAnchor {
1271                    page_number: 1,
1272                    pdf_object_ids: Vec::new(),
1273                    bbox: None,
1274                    extraction_method: engine_name.to_owned(),
1275                }],
1276                confidence: Some(Confidence {
1277                    score: 0.9,
1278                    calibrated: false,
1279                }), ..Default::default()
1280            })
1281        })
1282        .collect::<Vec<_>>();
1283    let plain_text = blocks
1284        .iter()
1285        .filter_map(|block| match block {
1286            Block::Text(text) => Some(text.text.as_str()),
1287            _ => None,
1288        })
1289        .collect::<Vec<_>>()
1290        .join("\n\n");
1291
1292    Ok(Document {
1293        schema_version: SCHEMA_VERSION.to_owned(),
1294        metadata: Metadata {
1295            format: source.format.clone(),
1296            engine: engine_name.to_owned(),
1297            source: source.path.clone(),
1298            title,
1299            character_count: plain_text.chars().count(),
1300            word_count: plain_text.split_whitespace().count(),
1301            block_count: blocks.len(),
1302            file_size_bytes: source.bytes.as_ref().map(|bytes| bytes.len() as u64),
1303            pdf_version: None,
1304            encrypted: false,
1305        },
1306        pages: vec![Page {
1307            number: 1,
1308            width: None,
1309            height: None,
1310            rotation: None,
1311            bbox: None,
1312            blocks,
1313            images: Vec::new(),
1314            assets: Vec::new(),
1315            warnings: Vec::new(), ..Default::default()
1316        }],
1317        assets: Vec::new(),
1318        warnings: Vec::new(),
1319    })
1320}