1use crate::error::Result;
2use crate::ir::{
3 BBox, Block, Confidence, Document, Line, Metadata, Page, SourceAnchor, Span, TableBlock,
4 TextBlock, SCHEMA_VERSION,
5};
6use crate::source::Source;
7
/// A backend that converts a loaded [`Source`] into an IR [`Document`].
pub trait ExtractionEngine {
    /// Returns the engine's static identifier.
    fn name(&self) -> &'static str;

    /// Builds a [`Document`] from `source`, or reports why extraction failed.
    fn extract(&self, source: &Source) -> Result<Document>;
}
12
/// Stateless engine for text-like sources.
///
/// Its [`ExtractionEngine`] implementation tries DocBank token labels, LaTeX and Markdown
/// before falling back to plain paragraph splitting.
#[derive(Debug, Default, Clone, Copy)]
pub struct PlainTextEngine;
15
16impl ExtractionEngine for PlainTextEngine {
17 fn name(&self) -> &'static str {
18 "plain-text"
19 }
20
21 fn extract(&self, source: &Source) -> Result<Document> {
22 if let Some(document) = docbank_token_label_document(source, self.name()) {
23 return Ok(document);
24 }
25 if let Some(document) = latex_document(source) {
26 return Ok(document);
27 }
28 if let Some(document) = markdown_document(source) {
29 return Ok(document);
30 }
31 text_document_from_paragraphs(source, self.name(), split_paragraphs(&source.content), None)
32 }
33}
34
/// `extraction_method` recorded on source anchors of DocBank-derived blocks.
const DOCBANK_EXTRACTION_METHOD: &str = "docbank_token_labels";
/// Engine name stored in the metadata of documents built by the LaTeX reader.
const LATEX_ENGINE_NAME: &str = "latex-native";
/// `extraction_method` recorded on source anchors of LaTeX-derived blocks.
const LATEX_EXTRACTION_METHOD: &str = "latex_native";
/// Engine name stored in the metadata of documents built by the Markdown reader.
const MARKDOWN_ENGINE_NAME: &str = "markdown-native";
/// `extraction_method` recorded on source anchors of Markdown-derived blocks.
const MARKDOWN_EXTRACTION_METHOD: &str = "markdown_native";
40
/// One token parsed from a tab-separated DocBank line.
#[derive(Debug)]
struct DocBankToken {
    /// Trimmed token text (first column).
    text: String,
    /// Trimmed structure label (tenth column), one accepted by `is_docbank_label`.
    label: String,
    /// Token box built from the `x0, y0, x1, y1` columns.
    bbox: BBox,
}
47
/// A run of consecutive DocBank tokens that share a label and vertical position.
#[derive(Debug)]
struct DocBankLine {
    /// Label shared by every token in the run.
    label: String,
    /// `y` coordinate of the first token in the run.
    y: f32,
    /// Largest token height seen in the run so far.
    height: f32,
    /// Tokens in input order.
    tokens: Vec<DocBankToken>,
}
55
56fn docbank_token_label_document(source: &Source, engine_name: &str) -> Option<Document> {
57 let mut tokens = Vec::new();
58 let mut non_empty_lines = 0usize;
59
60 for line in source.content.lines() {
61 if line.trim().is_empty() {
62 continue;
63 }
64 non_empty_lines += 1;
65 if let Some(token) = docbank_token_from_line(line) {
66 tokens.push(token);
67 }
68 }
69
70 if tokens.is_empty() || tokens.len() != non_empty_lines {
71 return None;
72 }
73
74 let blocks = docbank_lines(tokens)
75 .into_iter()
76 .filter_map(docbank_line_block)
77 .collect::<Vec<_>>();
78 if blocks.is_empty() {
79 return None;
80 }
81
82 let page_bbox = inferred_text_block_bbox(&blocks);
83 let plain_text = blocks
84 .iter()
85 .filter_map(|block| match block {
86 Block::Text(text) => Some(text.text.as_str()),
87 _ => None,
88 })
89 .collect::<Vec<_>>()
90 .join("\n\n");
91
92 Some(Document {
93 schema_version: SCHEMA_VERSION.to_owned(),
94 metadata: Metadata {
95 format: source.format.clone(),
96 engine: engine_name.to_owned(),
97 source: source.path.clone(),
98 title: None,
99 character_count: plain_text.chars().count(),
100 word_count: plain_text.split_whitespace().count(),
101 block_count: blocks.len(),
102 file_size_bytes: source.bytes.as_ref().map(|bytes| bytes.len() as u64),
103 pdf_version: None,
104 encrypted: false,
105 },
106 pages: vec![Page {
107 number: 1,
108 width: page_bbox.map(|bbox| bbox.width),
109 height: page_bbox.map(|bbox| bbox.height),
110 rotation: None,
111 bbox: page_bbox,
112 blocks,
113 images: Vec::new(),
114 assets: Vec::new(),
115 warnings: Vec::new(), ..Default::default()
116 }],
117 assets: Vec::new(),
118 warnings: Vec::new(),
119 })
120}
121
122fn docbank_token_from_line(line: &str) -> Option<DocBankToken> {
123 let cells = line.split('\t').collect::<Vec<_>>();
124 if cells.len() < 10 {
125 return None;
126 }
127 let text = cells[0].trim();
128 let label = cells[9].trim();
129 if text.is_empty() || !is_docbank_label(label) {
130 return None;
131 }
132
133 let x0 = cells[1].parse::<f32>().ok()?;
134 let y0 = cells[2].parse::<f32>().ok()?;
135 let x1 = cells[3].parse::<f32>().ok()?;
136 let y1 = cells[4].parse::<f32>().ok()?;
137 if x1 <= x0 || y1 <= y0 {
138 return None;
139 }
140
141 Some(DocBankToken {
142 text: text.to_owned(),
143 label: label.to_owned(),
144 bbox: BBox {
145 x: x0,
146 y: y0,
147 width: x1 - x0,
148 height: y1 - y0,
149 },
150 })
151}
152
/// Reports whether `label` is one of the recognised DocBank structure labels.
///
/// The comparison is exact and case-sensitive.
fn is_docbank_label(label: &str) -> bool {
    const LABELS: [&str; 13] = [
        "abstract",
        "author",
        "caption",
        "date",
        "equation",
        "figure",
        "footer",
        "list",
        "paragraph",
        "reference",
        "section",
        "table",
        "title",
    ];
    LABELS.contains(&label)
}
171
172fn docbank_lines(tokens: Vec<DocBankToken>) -> Vec<DocBankLine> {
173 let mut lines = Vec::new();
174
175 for token in tokens {
176 let same_line = lines
177 .last()
178 .map(|line: &DocBankLine| {
179 line.label == token.label
180 && (line.y - token.bbox.y).abs() <= line.height.max(token.bbox.height).max(3.0)
181 })
182 .unwrap_or(false);
183 if same_line {
184 if let Some(line) = lines.last_mut() {
185 line.height = line.height.max(token.bbox.height);
186 line.tokens.push(token);
187 }
188 } else {
189 lines.push(DocBankLine {
190 label: token.label.clone(),
191 y: token.bbox.y,
192 height: token.bbox.height,
193 tokens: vec![token],
194 });
195 }
196 }
197
198 lines
199}
200
201fn docbank_line_block(line: DocBankLine) -> Option<Block> {
202 if line.tokens.is_empty() {
203 return None;
204 }
205
206 let text = line
207 .tokens
208 .iter()
209 .map(|token| token.text.as_str())
210 .collect::<Vec<_>>()
211 .join(" ");
212 let bbox = bbox_union(line.tokens.iter().map(|token| token.bbox))?;
213 let spans = line
214 .tokens
215 .iter()
216 .map(|token| Span {
217 text: token.text.clone(),
218 bbox: Some(token.bbox),
219 font: None,
220 size: None,
221 bold: false,
222 italic: false,
223 })
224 .collect::<Vec<_>>();
225
226 Some(Block::Text(TextBlock {
227 text: text.clone(),
228 kind: line.label,
229 bbox: Some(bbox),
230 lines: vec![Line {
231 text,
232 bbox: Some(bbox),
233 spans,
234 }],
235 source_anchors: vec![SourceAnchor {
236 page_number: 1,
237 pdf_object_ids: Vec::new(),
238 bbox: Some(bbox),
239 extraction_method: DOCBANK_EXTRACTION_METHOD.to_owned(),
240 }],
241 confidence: Some(Confidence {
242 score: 0.9,
243 calibrated: false,
244 }), ..Default::default()
245 }))
246}
247
248fn inferred_text_block_bbox(blocks: &[Block]) -> Option<BBox> {
249 let mut max_x = 0.0f32;
250 let mut max_y = 0.0f32;
251 let mut has_bbox = false;
252 for block in blocks {
253 let Block::Text(text) = block else {
254 continue;
255 };
256 let Some(bbox) = text.bbox else {
257 continue;
258 };
259 has_bbox = true;
260 max_x = max_x.max(bbox.x + bbox.width);
261 max_y = max_y.max(bbox.y + bbox.height);
262 }
263 has_bbox.then_some(BBox {
264 x: 0.0,
265 y: 0.0,
266 width: max_x,
267 height: max_y,
268 })
269}
270
271fn bbox_union(boxes: impl Iterator<Item = BBox>) -> Option<BBox> {
272 let mut min_x = f32::INFINITY;
273 let mut min_y = f32::INFINITY;
274 let mut max_x = f32::NEG_INFINITY;
275 let mut max_y = f32::NEG_INFINITY;
276 let mut has_box = false;
277 for bbox in boxes {
278 has_box = true;
279 min_x = min_x.min(bbox.x);
280 min_y = min_y.min(bbox.y);
281 max_x = max_x.max(bbox.x + bbox.width);
282 max_y = max_y.max(bbox.y + bbox.height);
283 }
284 has_box.then_some(BBox {
285 x: min_x,
286 y: min_y,
287 width: max_x - min_x,
288 height: max_y - min_y,
289 })
290}
291
292fn document_from_blocks(
293 source: &Source,
294 engine_name: &str,
295 title: Option<String>,
296 blocks: Vec<Block>,
297) -> Option<Document> {
298 if blocks.is_empty() {
299 return None;
300 }
301 let plain_text = blocks
302 .iter()
303 .map(block_markdown_text)
304 .filter(|text| !text.is_empty())
305 .collect::<Vec<_>>()
306 .join("\n\n");
307
308 Some(Document {
309 schema_version: SCHEMA_VERSION.to_owned(),
310 metadata: Metadata {
311 format: source.format.clone(),
312 engine: engine_name.to_owned(),
313 source: source.path.clone(),
314 title,
315 character_count: plain_text.chars().count(),
316 word_count: plain_text.split_whitespace().count(),
317 block_count: blocks.len(),
318 file_size_bytes: source.bytes.as_ref().map(|bytes| bytes.len() as u64),
319 pdf_version: None,
320 encrypted: false,
321 },
322 pages: vec![Page {
323 number: 1,
324 width: None,
325 height: None,
326 rotation: None,
327 bbox: None,
328 blocks,
329 images: Vec::new(),
330 assets: Vec::new(),
331 warnings: Vec::new(), ..Default::default()
332 }],
333 assets: Vec::new(),
334 warnings: Vec::new(),
335 })
336}
337
338fn latex_document(source: &Source) -> Option<Document> {
339 if !is_latex_source(source) {
340 return None;
341 }
342
343 let stripped = strip_latex_comments(&source.content);
344 let title = latex_command_argument(&stripped, "title").map(|text| clean_latex_inline(&text));
345 let body = latex_document_body(&stripped);
346 let blocks = latex_blocks(body, title.clone());
347 document_from_blocks(source, LATEX_ENGINE_NAME, title, blocks)
348}
349
350fn is_latex_source(source: &Source) -> bool {
351 source
352 .path
353 .as_deref()
354 .map(|path| {
355 let path = path.to_ascii_lowercase();
356 path.ends_with(".tex")
357 || path.ends_with(".latex")
358 || path.ends_with(".ltx")
359 || path.ends_with(".tex.gz")
360 || path.ends_with(".latex.gz")
361 || path.ends_with(".ltx.gz")
362 })
363 .unwrap_or(false)
364}
365
/// Removes `%` comments from every line of `text`.
///
/// A `%` preceded by an unescaped backslash is kept. Each input line is emitted with a
/// trailing `\n`, so the output always ends with a newline when the input is non-empty.
fn strip_latex_comments(text: &str) -> String {
    let mut output = String::with_capacity(text.len());
    for line in text.lines() {
        // True only directly after a backslash that is not itself escaped.
        let mut after_backslash = false;
        let kept = line.chars().take_while(|&character| {
            if character == '%' && !after_backslash {
                return false;
            }
            after_backslash = character == '\\' && !after_backslash;
            true
        });
        output.extend(kept);
        output.push('\n');
    }
    output
}
384
/// Returns the text between `\begin{document}` and `\end{document}`.
///
/// Without a begin marker the whole input is returned; without an end marker, everything
/// after the begin marker.
fn latex_document_body(text: &str) -> &str {
    const BEGIN: &str = "\\begin{document}";
    const END: &str = "\\end{document}";

    match text.split_once(BEGIN) {
        None => text,
        Some((_, after_begin)) => match after_begin.split_once(END) {
            Some((body, _)) => body,
            None => after_begin,
        },
    }
}
397
398fn latex_blocks(body: &str, title: Option<String>) -> Vec<Block> {
399 let lines = body.lines().collect::<Vec<_>>();
400 let mut blocks = Vec::new();
401 let mut paragraph = Vec::new();
402 let mut index = 0usize;
403
404 if let Some(title) = title.filter(|title| !title.is_empty()) {
405 blocks.push(latex_text_block(title, "heading_1".to_owned()));
406 }
407
408 while index < lines.len() {
409 let trimmed = lines[index].trim();
410 if trimmed.is_empty() {
411 flush_latex_paragraph(&mut blocks, &mut paragraph);
412 index += 1;
413 continue;
414 }
415 if is_latex_skip_line(trimmed) {
416 flush_latex_paragraph(&mut blocks, &mut paragraph);
417 index += 1;
418 continue;
419 }
420 if let Some((level, text)) = latex_heading(trimmed) {
421 flush_latex_paragraph(&mut blocks, &mut paragraph);
422 blocks.push(latex_text_block(text, format!("heading_{level}")));
423 index += 1;
424 continue;
425 }
426 if contains_latex_begin(trimmed, "abstract") {
427 flush_latex_paragraph(&mut blocks, &mut paragraph);
428 let (environment, next_index) = collect_latex_environment(&lines, index, &["abstract"]);
429 if let Some(abstract_text) = latex_environment_body(&environment, "abstract") {
430 let text = clean_latex_inline(&abstract_text);
431 if !text.is_empty() {
432 blocks.push(latex_text_block(text, "abstract".to_owned()));
433 }
434 }
435 index = next_index;
436 continue;
437 }
438 if contains_any_latex_begin(trimmed, &["itemize", "enumerate"]) {
439 flush_latex_paragraph(&mut blocks, &mut paragraph);
440 let (environment, next_index) =
441 collect_latex_environment(&lines, index, &["itemize", "enumerate"]);
442 if let Some(block) = latex_list_block(&environment) {
443 blocks.push(block);
444 }
445 index = next_index;
446 continue;
447 }
448 if contains_any_latex_begin(
449 trimmed,
450 &[
451 "table",
452 "table*",
453 "tabular",
454 "tabular*",
455 "tabularx",
456 "longtable",
457 "array",
458 ],
459 ) {
460 flush_latex_paragraph(&mut blocks, &mut paragraph);
461 let (environment, next_index) = collect_latex_environment(
462 &lines,
463 index,
464 &[
465 "table",
466 "table*",
467 "tabular",
468 "tabular*",
469 "tabularx",
470 "longtable",
471 "array",
472 ],
473 );
474 if let Some(block) = latex_table_block(&environment) {
475 blocks.push(block);
476 }
477 index = next_index;
478 continue;
479 }
480
481 let text = clean_latex_inline(trimmed);
482 if !text.is_empty() {
483 paragraph.push(text);
484 }
485 index += 1;
486 }
487
488 flush_latex_paragraph(&mut blocks, &mut paragraph);
489 blocks
490}
491
492fn flush_latex_paragraph(blocks: &mut Vec<Block>, paragraph: &mut Vec<String>) {
493 if paragraph.is_empty() {
494 return;
495 }
496 blocks.push(latex_text_block(
497 paragraph.join(" "),
498 "paragraph".to_owned(),
499 ));
500 paragraph.clear();
501}
502
503fn is_latex_skip_line(line: &str) -> bool {
504 matches!(
505 latex_command_name_at(line, 1).as_deref(),
506 Some(
507 "author"
508 | "date"
509 | "documentclass"
510 | "end"
511 | "input"
512 | "include"
513 | "label"
514 | "maketitle"
515 | "newcommand"
516 | "renewcommand"
517 | "bibliography"
518 | "bibliographystyle"
519 | "usepackage"
520 )
521 )
522}
523
524fn latex_heading(line: &str) -> Option<(usize, String)> {
525 for (command, level) in [
526 ("part", 1usize),
527 ("chapter", 1),
528 ("section", 1),
529 ("subsection", 2),
530 ("subsubsection", 3),
531 ("paragraph", 4),
532 ("subparagraph", 5),
533 ] {
534 if let Some(text) = latex_line_command_argument(line, command) {
535 let text = clean_latex_inline(&text);
536 if !text.is_empty() {
537 return Some((level, text));
538 }
539 }
540 }
541 None
542}
543
544fn latex_line_command_argument(line: &str, command: &str) -> Option<String> {
545 let trimmed = line.trim_start();
546 let marker = format!("\\{command}");
547 if !trimmed.starts_with(&marker) {
548 return None;
549 }
550 latex_command_argument(trimmed, command)
551}
552
553fn contains_any_latex_begin(line: &str, names: &[&str]) -> bool {
554 names.iter().any(|name| contains_latex_begin(line, name))
555}
556
/// Reports whether `line` contains the exact text `\begin{name}`.
fn contains_latex_begin(line: &str, name: &str) -> bool {
    let marker = ["\\begin{", name, "}"].concat();
    line.contains(marker.as_str())
}
560
/// Concatenates lines starting at `index` up to and including the first line that
/// contains `\end{name}` for any of `names`.
///
/// Every collected line is followed by `\n`. Returns the collected text and the index of
/// the first line that was not consumed; without an end marker, all remaining lines are
/// consumed.
fn collect_latex_environment(lines: &[&str], index: usize, names: &[&str]) -> (String, usize) {
    let end_markers = names
        .iter()
        .map(|name| format!("\\end{{{name}}}"))
        .collect::<Vec<_>>();

    let mut collected = String::new();
    let mut consumed = 0usize;
    for line in lines.iter().skip(index) {
        collected.push_str(line);
        collected.push('\n');
        consumed += 1;
        if end_markers
            .iter()
            .any(|marker| line.contains(marker.as_str()))
        {
            break;
        }
    }
    (collected, index + consumed)
}
578
579fn latex_list_block(environment: &str) -> Option<Block> {
580 let body = latex_environment_body(environment, "itemize")
581 .or_else(|| latex_environment_body(environment, "enumerate"))?;
582 let items = latex_item_texts(&body);
583 if items.is_empty() {
584 return None;
585 }
586 Some(latex_text_block(items.join("\n"), "list".to_owned()))
587}
588
589fn latex_item_texts(body: &str) -> Vec<String> {
590 let mut items = Vec::new();
591 let mut search_start = 0usize;
592 while let Some(relative_start) = body[search_start..].find("\\item") {
593 let item_start = search_start + relative_start;
594 let mut content_start = item_start + "\\item".len();
595 content_start = skip_latex_whitespace(body, content_start);
596 if body.as_bytes().get(content_start) == Some(&b'[') {
597 content_start = skip_latex_optional_argument(body, content_start);
598 content_start = skip_latex_whitespace(body, content_start);
599 }
600 let next_item = body[content_start..]
601 .find("\\item")
602 .map(|relative| content_start + relative)
603 .unwrap_or(body.len());
604 let text = clean_latex_inline(&body[content_start..next_item]);
605 if !text.is_empty() {
606 items.push(text);
607 }
608 search_start = next_item;
609 }
610 items
611}
612
613fn latex_table_block(environment: &str) -> Option<Block> {
614 let caption =
615 latex_command_argument(environment, "caption").map(|text| clean_latex_inline(&text));
616 let body = latex_environment_body(environment, "tabular")
617 .or_else(|| latex_environment_body(environment, "tabular*"))
618 .or_else(|| latex_environment_body(environment, "tabularx"))
619 .or_else(|| latex_environment_body(environment, "longtable"))
620 .or_else(|| latex_environment_body(environment, "array"))?;
621
622 let mut rows = split_latex_table_rows(&body)
623 .into_iter()
624 .filter_map(|row| latex_table_row(&row))
625 .collect::<Vec<_>>();
626 if rows.is_empty() {
627 return None;
628 }
629
630 let headers = if rows.len() > 1 {
631 rows.remove(0)
632 } else {
633 Vec::new()
634 };
635
636 Some(Block::Table(TableBlock {
637 headers,
638 rows,
639 caption,
640 bbox: None,
641 cells: Vec::new(),
642 source_anchors: vec![latex_source_anchor()],
643 confidence: Some(latex_confidence()), ..Default::default()
644 }))
645}
646
/// Splits a tabular body into raw rows at every `\\` row terminator.
///
/// Text after the last terminator is kept only when it is not all whitespace; rows
/// between terminators are kept even when empty.
fn split_latex_table_rows(body: &str) -> Vec<String> {
    let mut rows = body
        .split("\\\\")
        .map(str::to_owned)
        .collect::<Vec<_>>();

    // `split` always yields a final piece: the remainder after the last terminator.
    let blank_tail = match rows.last() {
        Some(tail) => tail.trim().is_empty(),
        None => false,
    };
    if blank_tail {
        rows.pop();
    }
    rows
}
667
668fn latex_table_row(row: &str) -> Option<Vec<String>> {
669 let row = strip_latex_table_rules(row);
670 let cells = split_latex_cells(&row)
671 .into_iter()
672 .map(|cell| clean_latex_inline(&cell))
673 .filter(|cell| !cell.is_empty())
674 .collect::<Vec<_>>();
675 if cells.is_empty() {
676 None
677 } else {
678 Some(cells)
679 }
680}
681
/// Replaces table rule commands in `row` with a single space.
///
/// `\hline`, `\toprule`, `\midrule` and `\bottomrule` are replaced as plain text.
/// `\cline` and `\cmidrule` are removed together with their arguments, so that a column
/// range such as `{1-2}` does not leak into the cell text.
fn strip_latex_table_rules(row: &str) -> String {
    const PLAIN_RULES: [&str; 4] = ["\\hline", "\\toprule", "\\midrule", "\\bottomrule"];
    const RULES_WITH_ARGUMENTS: [&str; 2] = ["\\cmidrule", "\\cline"];

    let mut cleaned = row.to_owned();
    for command in PLAIN_RULES {
        cleaned = cleaned.replace(command, " ");
    }
    for command in RULES_WITH_ARGUMENTS {
        cleaned = strip_latex_rule_with_arguments(&cleaned, command);
    }
    cleaned
}

/// Replaces every `command` plus its directly following `[..]`, `(..)` and `{..}`
/// arguments with a single space. An unterminated argument is left in place.
fn strip_latex_rule_with_arguments(text: &str, command: &str) -> String {
    let mut output = String::with_capacity(text.len());
    let mut rest = text;
    while let Some(start) = rest.find(command) {
        output.push_str(&rest[..start]);
        output.push(' ');
        rest = &rest[start + command.len()..];
        // At most one argument of each kind, in the order the rule commands take them.
        for (open, close) in [('[', ']'), ('(', ')'), ('{', '}')] {
            if let Some(inner) = rest.strip_prefix(open) {
                if let Some(end) = inner.find(close) {
                    rest = &inner[end + close.len_utf8()..];
                }
            }
        }
    }
    output.push_str(rest);
    output
}
696
/// Splits a table row into raw cells at every `&` that is not escaped by a backslash.
///
/// Escaped ampersands stay in the cell text together with their backslash. The result
/// always contains at least one (possibly empty) cell.
fn split_latex_cells(row: &str) -> Vec<String> {
    let mut cells = vec![String::new()];
    // True only directly after a backslash that is not itself escaped.
    let mut after_backslash = false;

    for character in row.chars() {
        match character {
            '&' if !after_backslash => cells.push(String::new()),
            _ => {
                after_backslash = character == '\\' && !after_backslash;
                if let Some(cell) = cells.last_mut() {
                    cell.push(character);
                }
            }
        }
    }
    cells
}
716
717fn latex_environment_body(text: &str, name: &str) -> Option<String> {
718 let marker = format!("\\begin{{{name}}}");
719 let start = text.find(&marker)?;
720 let mut body_start = start + marker.len();
721 loop {
722 body_start = skip_latex_whitespace(text, body_start);
723 match text.as_bytes().get(body_start) {
724 Some(b'[') => body_start = skip_latex_optional_argument(text, body_start),
725 Some(b'{') => {
726 let (_, end) = read_latex_braced_argument(text, body_start)?;
727 body_start = end;
728 }
729 _ => break,
730 }
731 }
732 let end_marker = format!("\\end{{{name}}}");
733 let end = text[body_start..]
734 .find(&end_marker)
735 .map(|relative| body_start + relative)
736 .unwrap_or(text.len());
737 Some(text[body_start..end].to_owned())
738}
739
740fn latex_command_argument(text: &str, command: &str) -> Option<String> {
741 let marker = format!("\\{command}");
742 let mut search_start = 0usize;
743 while let Some(relative_start) = text[search_start..].find(&marker) {
744 let start = search_start + relative_start;
745 let mut cursor = start + marker.len();
746 if text[cursor..]
747 .chars()
748 .next()
749 .map(|character| character.is_ascii_alphabetic())
750 .unwrap_or(false)
751 {
752 search_start = cursor;
753 continue;
754 }
755 if text.as_bytes().get(cursor) == Some(&b'*') {
756 cursor += 1;
757 }
758 cursor = skip_latex_whitespace(text, cursor);
759 if text.as_bytes().get(cursor) == Some(&b'[') {
760 cursor = skip_latex_optional_argument(text, cursor);
761 cursor = skip_latex_whitespace(text, cursor);
762 }
763 if text.as_bytes().get(cursor) == Some(&b'{') {
764 let (argument, _) = read_latex_braced_argument(text, cursor)?;
765 return Some(argument);
766 }
767 search_start = cursor.max(start + 1);
768 }
769 None
770}
771
/// Reads the balanced `{..}` group that opens at byte offset `open`.
///
/// Returns the text between the outer braces and the offset just past the closing brace.
/// Braces preceded by an unescaped backslash do not count towards nesting. Returns `None`
/// when `open` does not point at `{` or the group never closes.
fn read_latex_braced_argument(text: &str, open: usize) -> Option<(String, usize)> {
    let tail = text.get(open..).filter(|tail| tail.starts_with('{'))?;

    let mut depth = 0usize;
    // True only directly after a backslash that is not itself escaped.
    let mut after_backslash = false;
    for (offset, character) in tail.char_indices() {
        if !after_backslash {
            match character {
                '{' => depth += 1,
                '}' => {
                    depth = depth.saturating_sub(1);
                    if depth == 0 {
                        let close = open + offset;
                        return Some((text[open + 1..close].to_owned(), close + 1));
                    }
                }
                _ => {}
            }
        }
        after_backslash = character == '\\' && !after_backslash;
    }
    None
}
795
/// Returns the offset just past the `[..]` argument that opens at `open`.
///
/// If `open` does not point at `[`, it is returned unchanged. If the bracket never closes
/// (ignoring `]` preceded by an unescaped backslash), only the `[` itself is skipped.
fn skip_latex_optional_argument(text: &str, open: usize) -> usize {
    if text.as_bytes().get(open) != Some(&b'[') {
        return open;
    }

    let inner_start = open + 1;
    let mut after_backslash = false;
    let closing = text[inner_start..].char_indices().find(|&(_, character)| {
        let closes = character == ']' && !after_backslash;
        after_backslash = character == '\\' && !after_backslash;
        closes
    });

    match closing {
        Some((offset, _)) => inner_start + offset + 1,
        None => inner_start,
    }
}
812
/// Returns the first offset at or after `pos` that is not ASCII whitespace.
///
/// An offset past the end of `text` is returned unchanged.
fn skip_latex_whitespace(text: &str, pos: usize) -> usize {
    let skipped = match text.as_bytes().get(pos..) {
        Some(rest) => rest
            .iter()
            .take_while(|byte| byte.is_ascii_whitespace())
            .count(),
        None => 0,
    };
    pos + skipped
}
819
820fn clean_latex_inline(text: &str) -> String {
821 let mut output = String::with_capacity(text.len());
822 let mut pos = 0usize;
823 while pos < text.len() {
824 let character = text[pos..].chars().next().unwrap();
825 if character == '\\' {
826 let next_pos = pos + character.len_utf8();
827 let Some(next_character) = text[next_pos..].chars().next() else {
828 break;
829 };
830 if next_character == '\\' {
831 output.push(' ');
832 pos = next_pos + next_character.len_utf8();
833 continue;
834 }
835 if matches!(
836 next_character,
837 '%' | '&' | '_' | '$' | '#' | '{' | '}' | '[' | ']'
838 ) {
839 output.push(next_character);
840 pos = next_pos + next_character.len_utf8();
841 continue;
842 }
843 let (name, after_name) = latex_command_name(text, next_pos);
844 if name.is_empty() {
845 pos = next_pos;
846 continue;
847 }
848 let (replacement, after_command) =
849 clean_latex_command_argument(text, &name, after_name);
850 output.push_str(&replacement);
851 pos = after_command;
852 continue;
853 }
854 if matches!(character, '{' | '}' | '$') {
855 pos += character.len_utf8();
856 continue;
857 }
858 if character == '~' {
859 output.push(' ');
860 } else {
861 output.push(character);
862 }
863 pos += character.len_utf8();
864 }
865 output.split_whitespace().collect::<Vec<_>>().join(" ")
866}
867
868fn clean_latex_command_argument(text: &str, name: &str, after_name: usize) -> (String, usize) {
869 let mut cursor = skip_latex_whitespace(text, after_name);
870 if text.as_bytes().get(cursor) == Some(&b'[') {
871 cursor = skip_latex_optional_argument(text, cursor);
872 cursor = skip_latex_whitespace(text, cursor);
873 }
874
875 if matches!(
876 name,
877 "label" | "pageref" | "ref" | "cite" | "citep" | "citet"
878 ) {
879 if text.as_bytes().get(cursor) == Some(&b'{') {
880 let (_, end) = read_latex_braced_argument(text, cursor).unwrap_or_default();
881 return (String::new(), end.max(cursor + 1));
882 }
883 return (String::new(), cursor);
884 }
885
886 if name == "href" {
887 if text.as_bytes().get(cursor) == Some(&b'{') {
888 let (_, first_end) = read_latex_braced_argument(text, cursor).unwrap_or_default();
889 let second_start = skip_latex_whitespace(text, first_end);
890 if text.as_bytes().get(second_start) == Some(&b'{') {
891 if let Some((argument, end)) = read_latex_braced_argument(text, second_start) {
892 return (clean_latex_inline(&argument), end);
893 }
894 }
895 return (String::new(), first_end.max(cursor + 1));
896 }
897 }
898
899 if matches!(name, "multicolumn" | "multirow") {
900 let mut arguments = Vec::new();
901 for _ in 0..3 {
902 cursor = skip_latex_whitespace(text, cursor);
903 if text.as_bytes().get(cursor) != Some(&b'{') {
904 break;
905 }
906 if let Some((argument, end)) = read_latex_braced_argument(text, cursor) {
907 arguments.push(argument);
908 cursor = end;
909 }
910 }
911 return (
912 arguments
913 .last()
914 .map(|argument| clean_latex_inline(argument))
915 .unwrap_or_default(),
916 cursor,
917 );
918 }
919
920 if text.as_bytes().get(cursor) == Some(&b'{') {
921 if let Some((argument, end)) = read_latex_braced_argument(text, cursor) {
922 return (clean_latex_inline(&argument), end);
923 }
924 }
925
926 let replacement = match name {
927 "LaTeX" => "LaTeX",
928 "TeX" => "TeX",
929 "quad" | "qquad" | "enspace" | "thinspace" => " ",
930 _ => "",
931 };
932 (replacement.to_owned(), cursor)
933}
934
/// Reads the command name that begins at byte offset `start`.
///
/// The name is the longest run of ASCII letters; if there is none, it is the single
/// character at `start`. Returns the name and the offset just past it, or an empty name
/// and `start` at the end of the text.
fn latex_command_name(text: &str, start: usize) -> (String, usize) {
    let rest = &text[start..];

    // ASCII letters are one byte each, so the character count is also the byte length.
    let letters = rest
        .chars()
        .take_while(|character| character.is_ascii_alphabetic())
        .count();
    if letters > 0 {
        return (rest[..letters].to_owned(), start + letters);
    }

    match rest.chars().next() {
        Some(symbol) => (symbol.to_string(), start + symbol.len_utf8()),
        None => (String::new(), start),
    }
}
953
954fn latex_command_name_at(line: &str, start: usize) -> Option<String> {
955 if !line.starts_with('\\') {
956 return None;
957 }
958 let (name, _) = latex_command_name(line, start);
959 (!name.is_empty()).then_some(name)
960}
961
962fn latex_text_block(text: String, kind: String) -> Block {
963 Block::Text(TextBlock {
964 text,
965 kind,
966 bbox: None,
967 lines: Vec::new(),
968 source_anchors: vec![latex_source_anchor()],
969 confidence: Some(latex_confidence()), ..Default::default()
970 })
971}
972
973fn latex_source_anchor() -> SourceAnchor {
974 SourceAnchor {
975 page_number: 1,
976 pdf_object_ids: Vec::new(),
977 bbox: None,
978 extraction_method: LATEX_EXTRACTION_METHOD.to_owned(),
979 }
980}
981
982fn latex_confidence() -> Confidence {
983 Confidence {
984 score: 0.85,
985 calibrated: false,
986 }
987}
988
989fn markdown_document(source: &Source) -> Option<Document> {
990 if !is_markdown_source(source) {
991 return None;
992 }
993
994 let blocks = markdown_blocks(&source.content);
995 document_from_blocks(source, MARKDOWN_ENGINE_NAME, None, blocks)
996}
997
998fn is_markdown_source(source: &Source) -> bool {
999 source
1000 .path
1001 .as_deref()
1002 .map(|path| {
1003 let path = path.to_ascii_lowercase();
1004 path.ends_with(".md") || path.ends_with(".markdown")
1005 })
1006 .unwrap_or(false)
1007}
1008
1009fn markdown_blocks(content: &str) -> Vec<Block> {
1010 let lines = content.lines().collect::<Vec<_>>();
1011 let mut blocks = Vec::new();
1012 let mut paragraph = Vec::new();
1013 let mut index = 0usize;
1014
1015 while index < lines.len() {
1016 let trimmed = lines[index].trim();
1017 if trimmed.is_empty() {
1018 flush_markdown_paragraph(&mut blocks, &mut paragraph);
1019 index += 1;
1020 continue;
1021 }
1022 if let Some((level, text)) = markdown_heading(trimmed) {
1023 flush_markdown_paragraph(&mut blocks, &mut paragraph);
1024 blocks.push(markdown_text_block(text, format!("heading_{level}")));
1025 index += 1;
1026 continue;
1027 }
1028 if is_markdown_table_start(&lines, index) {
1029 flush_markdown_paragraph(&mut blocks, &mut paragraph);
1030 let (table, next_index) = markdown_table_block(&lines, index);
1031 blocks.push(table);
1032 index = next_index;
1033 continue;
1034 }
1035 if is_markdown_list_item(trimmed) {
1036 flush_markdown_paragraph(&mut blocks, &mut paragraph);
1037 let (list, next_index) = markdown_list_block(&lines, index);
1038 blocks.push(list);
1039 index = next_index;
1040 continue;
1041 }
1042
1043 paragraph.push(trimmed.to_owned());
1044 index += 1;
1045 }
1046
1047 flush_markdown_paragraph(&mut blocks, &mut paragraph);
1048 blocks
1049}
1050
1051fn flush_markdown_paragraph(blocks: &mut Vec<Block>, paragraph: &mut Vec<String>) {
1052 if paragraph.is_empty() {
1053 return;
1054 }
1055 blocks.push(markdown_text_block(
1056 paragraph.join(" "),
1057 "paragraph".to_owned(),
1058 ));
1059 paragraph.clear();
1060}
1061
1062fn markdown_heading(line: &str) -> Option<(usize, String)> {
1063 let hashes = line
1064 .chars()
1065 .take_while(|character| *character == '#')
1066 .count();
1067 if hashes == 0 || hashes > 6 {
1068 return None;
1069 }
1070 let text = line.get(hashes..)?.trim();
1071 if text.is_empty() {
1072 return None;
1073 }
1074 Some((hashes, clean_markdown_inline(text)))
1075}
1076
1077fn is_markdown_table_start(lines: &[&str], index: usize) -> bool {
1078 index + 1 < lines.len()
1079 && markdown_row_cells(lines[index]).len() >= 2
1080 && is_markdown_separator_row(lines[index + 1])
1081}
1082
1083fn markdown_table_block(lines: &[&str], index: usize) -> (Block, usize) {
1084 let headers = markdown_row_cells(lines[index]);
1085 let mut rows = Vec::new();
1086 let mut next_index = index + 2;
1087
1088 while next_index < lines.len() {
1089 let line = lines[next_index].trim();
1090 if line.is_empty() || !line.contains('|') {
1091 break;
1092 }
1093 let row = markdown_row_cells(line);
1094 if row.is_empty() {
1095 break;
1096 }
1097 rows.push(row);
1098 next_index += 1;
1099 }
1100
1101 (
1102 Block::Table(TableBlock {
1103 headers,
1104 rows,
1105 caption: None,
1106 bbox: None,
1107 cells: Vec::new(),
1108 source_anchors: vec![markdown_source_anchor()],
1109 confidence: Some(markdown_confidence()), ..Default::default()
1110 }),
1111 next_index,
1112 )
1113}
1114
1115fn markdown_row_cells(line: &str) -> Vec<String> {
1116 let trimmed = line.trim().trim_matches('|');
1117 trimmed
1118 .split('|')
1119 .map(|cell| clean_markdown_inline(cell.trim()))
1120 .collect::<Vec<_>>()
1121}
1122
/// Reports whether `line` is a table separator row.
///
/// It must have at least two cells, each consisting only of dashes once surrounding
/// whitespace and alignment colons are removed.
fn is_markdown_separator_row(line: &str) -> bool {
    let inner = line.trim().trim_matches('|');

    let mut columns = 0usize;
    for cell in inner.split('|') {
        let dashes = cell.trim().trim_matches(':');
        if dashes.is_empty() || dashes.chars().any(|character| character != '-') {
            return false;
        }
        columns += 1;
    }
    columns >= 2
}
1134
1135fn is_markdown_list_item(line: &str) -> bool {
1136 markdown_list_text(line).is_some()
1137}
1138
1139fn markdown_list_block(lines: &[&str], index: usize) -> (Block, usize) {
1140 let mut items = Vec::new();
1141 let mut next_index = index;
1142 while next_index < lines.len() {
1143 let trimmed = lines[next_index].trim();
1144 let Some(item) = markdown_list_text(trimmed) else {
1145 break;
1146 };
1147 items.push(item);
1148 next_index += 1;
1149 }
1150 (
1151 markdown_text_block(items.join("\n"), "list".to_owned()),
1152 next_index,
1153 )
1154}
1155
1156fn markdown_list_text(line: &str) -> Option<String> {
1157 if let Some(text) = line.strip_prefix("- ").or_else(|| line.strip_prefix("* ")) {
1158 return Some(clean_markdown_inline(text));
1159 }
1160 let dot = line.find('.')?;
1161 if dot == 0
1162 || dot + 1 >= line.len()
1163 || !line[..dot]
1164 .chars()
1165 .all(|character| character.is_ascii_digit())
1166 {
1167 return None;
1168 }
1169 line[dot + 1..].strip_prefix(' ').map(clean_markdown_inline)
1170}
1171
/// Trims `text`, strips surrounding backticks and collapses whitespace runs to one space.
fn clean_markdown_inline(text: &str) -> String {
    let unquoted = text.trim().trim_matches('`');

    let mut cleaned = String::with_capacity(unquoted.len());
    for word in unquoted.split_whitespace() {
        if !cleaned.is_empty() {
            cleaned.push(' ');
        }
        cleaned.push_str(word);
    }
    cleaned
}
1179
1180fn markdown_text_block(text: String, kind: String) -> Block {
1181 Block::Text(TextBlock {
1182 text,
1183 kind,
1184 bbox: None,
1185 lines: Vec::new(),
1186 source_anchors: vec![markdown_source_anchor()],
1187 confidence: Some(markdown_confidence()), ..Default::default()
1188 })
1189}
1190
1191fn markdown_source_anchor() -> SourceAnchor {
1192 SourceAnchor {
1193 page_number: 1,
1194 pdf_object_ids: Vec::new(),
1195 bbox: None,
1196 extraction_method: MARKDOWN_EXTRACTION_METHOD.to_owned(),
1197 }
1198}
1199
1200fn markdown_confidence() -> Confidence {
1201 Confidence {
1202 score: 0.9,
1203 calibrated: false,
1204 }
1205}
1206
1207fn block_markdown_text(block: &Block) -> String {
1208 match block {
1209 Block::Text(text) => text.text.clone(),
1210 Block::Table(table) => {
1211 let mut rows = Vec::new();
1212 if !table.headers.is_empty() {
1213 rows.push(table.headers.join(" "));
1214 }
1215 rows.extend(table.rows.iter().map(|row| row.join(" ")));
1216 rows.join("\n")
1217 }
1218 Block::Figure(figure) => figure.caption.clone().unwrap_or_default(),
1219 }
1220}
1221
1222pub(crate) fn split_paragraphs(text: &str) -> Vec<String> {
1223 let mut paragraphs = Vec::new();
1224 let mut current = Vec::new();
1225
1226 for line in text.lines() {
1227 let trimmed = line.trim();
1228 if trimmed.is_empty() {
1229 flush_paragraph(&mut paragraphs, &mut current);
1230 } else {
1231 current.push(trimmed.to_owned());
1232 }
1233 }
1234
1235 flush_paragraph(&mut paragraphs, &mut current);
1236 paragraphs
1237}
1238
/// Moves the buffered lines into `paragraphs` as one space-joined entry and empties the
/// buffer. Does nothing when the buffer is empty.
fn flush_paragraph(paragraphs: &mut Vec<String>, current: &mut Vec<String>) {
    if current.is_empty() {
        return;
    }
    let joined = current.drain(..).collect::<Vec<_>>().join(" ");
    paragraphs.push(joined);
}
1245
1246pub(crate) fn text_document_from_text(
1247 source: &Source,
1248 engine_name: &str,
1249 text: &str,
1250 title: Option<String>,
1251) -> Result<Document> {
1252 text_document_from_paragraphs(source, engine_name, split_paragraphs(text), title)
1253}
1254
1255pub(crate) fn text_document_from_paragraphs(
1256 source: &Source,
1257 engine_name: &str,
1258 paragraphs: Vec<String>,
1259 title: Option<String>,
1260) -> Result<Document> {
1261 let blocks = paragraphs
1262 .into_iter()
1263 .filter(|text| !text.trim().is_empty())
1264 .map(|text| {
1265 Block::Text(TextBlock {
1266 text,
1267 kind: "paragraph".to_owned(),
1268 bbox: None,
1269 lines: Vec::new(),
1270 source_anchors: vec![SourceAnchor {
1271 page_number: 1,
1272 pdf_object_ids: Vec::new(),
1273 bbox: None,
1274 extraction_method: engine_name.to_owned(),
1275 }],
1276 confidence: Some(Confidence {
1277 score: 0.9,
1278 calibrated: false,
1279 }), ..Default::default()
1280 })
1281 })
1282 .collect::<Vec<_>>();
1283 let plain_text = blocks
1284 .iter()
1285 .filter_map(|block| match block {
1286 Block::Text(text) => Some(text.text.as_str()),
1287 _ => None,
1288 })
1289 .collect::<Vec<_>>()
1290 .join("\n\n");
1291
1292 Ok(Document {
1293 schema_version: SCHEMA_VERSION.to_owned(),
1294 metadata: Metadata {
1295 format: source.format.clone(),
1296 engine: engine_name.to_owned(),
1297 source: source.path.clone(),
1298 title,
1299 character_count: plain_text.chars().count(),
1300 word_count: plain_text.split_whitespace().count(),
1301 block_count: blocks.len(),
1302 file_size_bytes: source.bytes.as_ref().map(|bytes| bytes.len() as u64),
1303 pdf_version: None,
1304 encrypted: false,
1305 },
1306 pages: vec![Page {
1307 number: 1,
1308 width: None,
1309 height: None,
1310 rotation: None,
1311 bbox: None,
1312 blocks,
1313 images: Vec::new(),
1314 assets: Vec::new(),
1315 warnings: Vec::new(), ..Default::default()
1316 }],
1317 assets: Vec::new(),
1318 warnings: Vec::new(),
1319 })
1320}