1use hygg_shared::normalize_file_path;
2use rayon::prelude::*;
3use std::io::{BufWriter, Cursor};
4
5mod heuristics;
6mod layout_text_output;
7mod ocr;
8mod pdf_patch;
9mod sanitize;
10mod stream;
11mod stream_recovery;
12
13pub use stream::{PdfLineKind, PdfRenderedPage, PdfStream, SharedPdfStream};
14
15use heuristics::{
16 layout_needs_plaintext_fallback, should_prefer_plaintext_output,
17};
18use sanitize::sanitize_layout_text;
19use stream_recovery::recover_sparse_code_blocks;
20
21pub(crate) fn load_patched_doc_internal(
22 canonical_path: &std::path::Path,
23) -> Result<pdf_extract::Document, Box<dyn std::error::Error>> {
24 match pdf_patch::patched_pdf_bytes(canonical_path) {
25 Ok(bytes) => match pdf_extract::Document::load_mem(&bytes) {
26 Ok(doc) => Ok(doc),
27 Err(_) => Ok(pdf_extract::Document::load(canonical_path)?),
28 },
29 Err(_) => Ok(pdf_extract::Document::load(canonical_path)?),
30 }
31}
32
33pub(crate) fn render_page_layout_internal(
34 doc: &pdf_extract::Document,
35 page_num: u32,
36) -> Option<String> {
37 let mut buf = Vec::new();
38 {
39 let mut writer = BufWriter::new(Cursor::new(&mut buf));
40 let mut output = layout_text_output::LayoutTextOutput::new(
41 &mut writer as &mut dyn std::io::Write,
42 );
43 pdf_extract::output_doc_page(doc, &mut output, page_num).ok()?;
44 }
45 String::from_utf8(buf).ok()
46}
47
48fn extract_with_layout_text(
55 canonical_path: &std::path::Path,
56) -> Result<String, Box<dyn std::error::Error>> {
57 let doc = load_patched_doc_internal(canonical_path)?;
58 pdf_extract::print_metadata(&doc);
59
60 let mut page_nums: Vec<u32> = doc.get_pages().into_keys().collect();
61 page_nums.sort_unstable();
62
63 let pages: Vec<Option<String>> = page_nums
66 .par_iter()
67 .map(|&page_num| render_page_layout_internal(&doc, page_num))
68 .collect();
69
70 let mut combined = String::new();
71 for page in pages.into_iter().flatten() {
72 combined.push_str(&page);
73 }
74 Ok(combined)
75}
76
77pub fn pdf_to_text(
78 pdf_path: &str,
79) -> Result<String, Box<dyn std::error::Error>> {
80 let canonical_path = normalize_file_path(pdf_path)?;
81
82 redirect_stderr::redirect_stdout()?;
86
87 let layout_text = extract_with_layout_text(&canonical_path);
88
89 let layout_text = layout_text?;
90 let mut layout_sanitized = sanitize_layout_text(&layout_text);
91
92 if let Ok(Some(recovered)) =
93 recover_sparse_code_blocks(&canonical_path, &layout_sanitized)
94 {
95 layout_sanitized = recovered;
96 }
97
98 let plaintext_result = if layout_needs_plaintext_fallback(&layout_sanitized) {
102 pdf_extract::extract_text(&canonical_path).ok()
103 } else {
104 None
105 };
106
107 redirect_stderr::restore_stdout()?;
108
109 if let Some(plaintext_output) = plaintext_result {
110 let plaintext_sanitized = sanitize_layout_text(&plaintext_output);
111 if should_prefer_plaintext_output(&layout_sanitized, &plaintext_sanitized) {
112 return Ok(plaintext_sanitized);
113 }
114 }
115
116 Ok(layout_sanitized)
117}
118
119pub fn pdf_to_text_with_bundled_ocr(
120 pdf_path: &str,
121) -> Result<String, Box<dyn std::error::Error>> {
122 ocr::pdf_to_text_with_bundled_ocr(pdf_path)
123}
124
125pub fn pdf_to_ansi_text(
126 pdf_path: &str,
127 col: usize,
128) -> Result<String, Box<dyn std::error::Error>> {
129 let stream = PdfStream::open(pdf_path)?;
130 pdf_stream_to_ansi_text(&stream, col)
131}
132
133pub fn pdf_to_ansi_text_with_bundled_ocr(
134 pdf_path: &str,
135 col: usize,
136) -> Result<String, Box<dyn std::error::Error>> {
137 let stream = PdfStream::open_with_bundled_ocr(pdf_path)?;
138 pdf_stream_to_ansi_text(&stream, col)
139}
140
141fn pdf_stream_to_ansi_text(
142 stream: &PdfStream,
143 col: usize,
144) -> Result<String, Box<dyn std::error::Error>> {
145 let mut output = Vec::new();
146 for page in 1..=stream.total_pages() {
147 let Some(rendered) = stream.extract_page_with_images(page, col) else {
148 continue;
149 };
150 output.extend(rendered.lines);
151 if page < stream.total_pages() {
152 output.push(String::new());
153 }
154 }
155 Ok(output.join("\n"))
156}
157
#[cfg(test)]
mod tests {
  use std::path::Path;

  use super::{pdf_to_text, should_prefer_plaintext_output};

  /// Layout text used by the structural-gain test.
  const LAYOUT_SAMPLE: &str =
    "A Heading\nSome explanatory text.\nAnother paragraph.\n";

  /// Same content as `LAYOUT_SAMPLE` plus one extra trailing line.
  const PLAINTEXT_SAMPLE: &str =
    "A Heading\nSome explanatory text.\nAnother paragraph.\nNoise line\n";

  /// Plaintext that only adds an extra line must not replace the layout text.
  #[test]
  fn keeps_layout_when_plaintext_has_no_structural_gain() {
    let prefers_plaintext =
      should_prefer_plaintext_output(LAYOUT_SAMPLE, PLAINTEXT_SAMPLE);
    assert!(!prefers_plaintext);
  }

  /// The `.gitignore` example lines of the Pro Git sample must appear in the
  /// extracted text. The test passes without checking anything when the
  /// sample PDF is not present.
  #[test]
  fn keeps_progit_codeblock_lines_in_output() {
    let fixture = Path::new(env!("CARGO_MANIFEST_DIR"))
      .join("../test-data/pdf/progit-1-50.pdf");
    if !fixture.exists() {
      return;
    }

    let fixture_str =
      fixture.to_str().expect("test PDF path should be valid UTF-8");
    let text = pdf_to_text(fixture_str)
      .expect("expected pdf_to_text to succeed for progit sample");

    // Only evaluated when an assertion fails, to build the failure message.
    let excerpt = || -> Vec<&str> {
      text
        .lines()
        .skip_while(|line| {
          !line.contains("Here is another example .gitignore file:")
        })
        .take(40)
        .collect()
    };

    let expected_lines =
      ["*.a", "!lib.a", "/TODO", "build/", "doc/*.txt", "doc/**/*.pdf"];
    for expected in expected_lines {
      assert!(
        text.contains(expected),
        "expected recovered codeblock to contain {expected:?}, got excerpt around heading: {:?}",
        excerpt()
      );
    }
  }
}