Skip to main content

cli_pdf_to_text/
lib.rs

1use hygg_shared::normalize_file_path;
2use rayon::prelude::*;
3use std::io::{BufWriter, Cursor};
4
5mod heuristics;
6mod layout_text_output;
7mod ocr;
8mod pdf_patch;
9mod sanitize;
10mod stream;
11mod stream_recovery;
12
13pub use stream::{PdfLineKind, PdfRenderedPage, PdfStream, SharedPdfStream};
14
15use heuristics::{
16  layout_needs_plaintext_fallback, should_prefer_plaintext_output,
17};
18use sanitize::sanitize_layout_text;
19use stream_recovery::recover_sparse_code_blocks;
20
21pub(crate) fn load_patched_doc_internal(
22  canonical_path: &std::path::Path,
23) -> Result<pdf_extract::Document, Box<dyn std::error::Error>> {
24  match pdf_patch::patched_pdf_bytes(canonical_path) {
25    Ok(bytes) => match pdf_extract::Document::load_mem(&bytes) {
26      Ok(doc) => Ok(doc),
27      Err(_) => Ok(pdf_extract::Document::load(canonical_path)?),
28    },
29    Err(_) => Ok(pdf_extract::Document::load(canonical_path)?),
30  }
31}
32
33pub(crate) fn render_page_layout_internal(
34  doc: &pdf_extract::Document,
35  page_num: u32,
36) -> Option<String> {
37  let mut buf = Vec::new();
38  {
39    let mut writer = BufWriter::new(Cursor::new(&mut buf));
40    let mut output = layout_text_output::LayoutTextOutput::new(
41      &mut writer as &mut dyn std::io::Write,
42    );
43    pdf_extract::output_doc_page(doc, &mut output, page_num).ok()?;
44  }
45  String::from_utf8(buf).ok()
46}
47
48/// Extract layout-aware text from every page in parallel.
49///
50/// `pdf_extract::Document` (a re-export of `lopdf::Document`) is
51/// `Send + Sync`, so we share one parsed instance across rayon
52/// workers via reference. Per-page output is collected and
53/// concatenated in page order.
54fn extract_with_layout_text(
55  canonical_path: &std::path::Path,
56) -> Result<String, Box<dyn std::error::Error>> {
57  let doc = load_patched_doc_internal(canonical_path)?;
58  pdf_extract::print_metadata(&doc);
59
60  let mut page_nums: Vec<u32> = doc.get_pages().into_keys().collect();
61  page_nums.sort_unstable();
62
63  // par_iter().collect() preserves source order, so the resulting Vec
64  // is already in page order without an extra sort.
65  let pages: Vec<Option<String>> = page_nums
66    .par_iter()
67    .map(|&page_num| render_page_layout_internal(&doc, page_num))
68    .collect();
69
70  let mut combined = String::new();
71  for page in pages.into_iter().flatten() {
72    combined.push_str(&page);
73  }
74  Ok(combined)
75}
76
77pub fn pdf_to_text(
78  pdf_path: &str,
79) -> Result<String, Box<dyn std::error::Error>> {
80  let canonical_path = normalize_file_path(pdf_path)?;
81
82  // `redirect_stderr::redirect_stdout` works on both Windows and Unix now;
83  // suppress the noisy logging pdf_extract / lopdf write to stdout while we
84  // do the extraction passes.
85  redirect_stderr::redirect_stdout()?;
86
87  let layout_text = extract_with_layout_text(&canonical_path);
88
89  let layout_text = layout_text?;
90  let mut layout_sanitized = sanitize_layout_text(&layout_text);
91
92  if let Ok(Some(recovered)) =
93    recover_sparse_code_blocks(&canonical_path, &layout_sanitized)
94  {
95    layout_sanitized = recovered;
96  }
97
98  // Only run the slower plaintext fallback when the layout pass shows
99  // damage that the plaintext heuristic might actually prefer. On large
100  // PDFs this halves wall time.
101  let plaintext_result = if layout_needs_plaintext_fallback(&layout_sanitized) {
102    pdf_extract::extract_text(&canonical_path).ok()
103  } else {
104    None
105  };
106
107  redirect_stderr::restore_stdout()?;
108
109  if let Some(plaintext_output) = plaintext_result {
110    let plaintext_sanitized = sanitize_layout_text(&plaintext_output);
111    if should_prefer_plaintext_output(&layout_sanitized, &plaintext_sanitized) {
112      return Ok(plaintext_sanitized);
113    }
114  }
115
116  Ok(layout_sanitized)
117}
118
119pub fn pdf_to_text_with_bundled_ocr(
120  pdf_path: &str,
121) -> Result<String, Box<dyn std::error::Error>> {
122  ocr::pdf_to_text_with_bundled_ocr(pdf_path)
123}
124
125pub fn pdf_to_ansi_text(
126  pdf_path: &str,
127  col: usize,
128) -> Result<String, Box<dyn std::error::Error>> {
129  let stream = PdfStream::open(pdf_path)?;
130  pdf_stream_to_ansi_text(&stream, col)
131}
132
133pub fn pdf_to_ansi_text_with_bundled_ocr(
134  pdf_path: &str,
135  col: usize,
136) -> Result<String, Box<dyn std::error::Error>> {
137  let stream = PdfStream::open_with_bundled_ocr(pdf_path)?;
138  pdf_stream_to_ansi_text(&stream, col)
139}
140
141fn pdf_stream_to_ansi_text(
142  stream: &PdfStream,
143  col: usize,
144) -> Result<String, Box<dyn std::error::Error>> {
145  let mut output = Vec::new();
146  for page in 1..=stream.total_pages() {
147    let Some(rendered) = stream.extract_page_with_images(page, col) else {
148      continue;
149    };
150    output.extend(rendered.lines);
151    if page < stream.total_pages() {
152      output.push(String::new());
153    }
154  }
155  Ok(output.join("\n"))
156}
157
#[cfg(test)]
mod tests {
  use std::path::Path;

  use super::{pdf_to_text, should_prefer_plaintext_output};

  /// A plaintext output that only adds one extra trailing line must not be
  /// preferred over the layout output.
  #[test]
  fn keeps_layout_when_plaintext_has_no_structural_gain() {
    let layout = concat!(
      "A Heading\n",
      "Some explanatory text.\n",
      "Another paragraph.\n",
    );
    let plaintext = concat!(
      "A Heading\n",
      "Some explanatory text.\n",
      "Another paragraph.\n",
      "Noise line\n",
    );

    let prefers_plaintext = should_prefer_plaintext_output(layout, plaintext);
    assert!(!prefers_plaintext);
  }

  /// The `.gitignore` example patterns from the Pro Git sample must appear
  /// in the extracted text.
  #[test]
  fn keeps_progit_codeblock_lines_in_output() {
    const EXPECTED_PATTERNS: [&str; 6] =
      ["*.a", "!lib.a", "/TODO", "build/", "doc/*.txt", "doc/**/*.pdf"];

    let sample = Path::new(env!("CARGO_MANIFEST_DIR"))
      .join("../test-data/pdf/progit-1-50.pdf");
    // Nothing to check when the sample PDF is not present.
    if !sample.exists() {
      return;
    }

    let sample_str =
      sample.to_str().expect("test PDF path should be valid UTF-8");
    let text = pdf_to_text(sample_str)
      .expect("expected pdf_to_text to succeed for progit sample");

    // Built lazily: only evaluated when an assertion below fails.
    let excerpt = || -> Vec<&str> {
      text
        .lines()
        .skip_while(|line| {
          !line.contains("Here is another example .gitignore file:")
        })
        .take(40)
        .collect()
    };

    for expected in EXPECTED_PATTERNS {
      assert!(
        text.contains(expected),
        "expected recovered codeblock to contain {expected:?}, got excerpt around heading: {:?}",
        excerpt()
      );
    }
  }
}