1#![allow(clippy::useless_vec, clippy::vec_init_then_push, unused_variables)]
3
4pub mod adobe_korea1;
29pub mod detector;
30pub mod extractor;
31pub mod glyph_names;
32pub mod markdown;
33pub mod process_mode;
34pub mod structure_tree;
35pub mod tables;
36pub mod text_utils;
37pub mod tounicode;
38pub mod types;
39
40pub use detector::{
41 detect_pdf_type, detect_pdf_type_mem, detect_pdf_type_mem_with_config,
42 detect_pdf_type_with_config, DetectionConfig, PdfType, PdfTypeResult, ScanStrategy,
43};
44pub use extractor::{extract_text, extract_text_with_positions, extract_text_with_positions_pages};
45pub use markdown::{
46 to_markdown, to_markdown_from_items, to_markdown_from_items_with_rects, MarkdownOptions,
47};
48pub use process_mode::ProcessMode;
49pub use types::{LayoutComplexity, PdfLine, PdfRect, TextItem};
50
51use lopdf::Document;
52use std::collections::HashSet;
53use std::path::Path;
54use tounicode::FontCMaps;
55
/// Result of processing (or merely detecting) a single PDF document.
#[derive(Debug)]
pub struct PdfProcessResult {
    /// Classification of the document (text-based, scanned, image-based, mixed, …).
    pub pdf_type: PdfType,
    /// Extracted markdown; `None` in detect-only/analyze modes, for
    /// scanned/image-based PDFs, or when extraction produced unusable text.
    pub markdown: Option<String>,
    /// Total number of pages in the document.
    pub page_count: u32,
    /// Wall-clock time spent processing, in milliseconds (timer starts before
    /// validation/loading in the entry points).
    pub processing_time_ms: u64,
    /// 1-based page numbers whose text could not be extracted reliably and
    /// should be handed to OCR instead.
    pub pages_needing_ocr: Vec<u32>,
    /// Document title, when the detector found one.
    pub title: Option<String>,
    /// Detector confidence in `pdf_type` (presumably 0.0–1.0; 0.95 is used for
    /// forced reclassification — see `process_document`).
    pub confidence: f32,
    /// Layout complexity (pages with tables / multiple columns).
    pub layout: LayoutComplexity,
    /// True when the produced markdown shows signs of font-encoding problems.
    pub has_encoding_issues: bool,
}
83
/// Options controlling how a PDF is processed.
#[derive(Debug, Clone)]
pub struct PdfOptions {
    /// How much work to do (detect only, analyze layout, or full conversion).
    pub mode: ProcessMode,
    /// Configuration for the type-detection pass.
    pub detection: DetectionConfig,
    /// Options forwarded to markdown generation.
    pub markdown: MarkdownOptions,
    /// When set, restrict extraction to these page numbers (1-based, matching
    /// `pages_needing_ocr`).
    pub page_filter: Option<HashSet<u32>>,
}
110
impl Default for PdfOptions {
    /// Full processing of every page with default detection and markdown settings.
    fn default() -> Self {
        Self {
            mode: ProcessMode::Full,
            detection: DetectionConfig::default(),
            markdown: MarkdownOptions::default(),
            page_filter: None,
        }
    }
}
121
122impl PdfOptions {
123 pub fn new() -> Self {
125 Self::default()
126 }
127
128 pub fn detect_only() -> Self {
130 Self {
131 mode: ProcessMode::DetectOnly,
132 ..Self::default()
133 }
134 }
135
136 pub fn mode(mut self, mode: ProcessMode) -> Self {
138 self.mode = mode;
139 self
140 }
141
142 pub fn detection(mut self, config: DetectionConfig) -> Self {
144 self.detection = config;
145 self
146 }
147
148 pub fn markdown(mut self, options: MarkdownOptions) -> Self {
150 self.markdown = options;
151 self
152 }
153
154 pub fn pages(mut self, pages: impl IntoIterator<Item = u32>) -> Self {
156 self.page_filter = Some(pages.into_iter().collect());
157 self
158 }
159}
160
/// Process the PDF at `path` with default options (full markdown conversion).
///
/// # Errors
/// Returns [`PdfError`] if the file cannot be read, is not a PDF, or fails to parse.
pub fn process_pdf<P: AsRef<Path>>(path: P) -> Result<PdfProcessResult, PdfError> {
    process_pdf_with_options(path, PdfOptions::new())
}

/// Detect the type of the PDF at `path` without extracting any text.
///
/// # Errors
/// Returns [`PdfError`] if the file cannot be read, is not a PDF, or fails to parse.
pub fn detect_pdf<P: AsRef<Path>>(path: P) -> Result<PdfProcessResult, PdfError> {
    process_pdf_with_options(path, PdfOptions::detect_only())
}
179
/// Process the PDF at `path` using the supplied `options`.
///
/// Validates the file header, loads the document, then runs the shared
/// detection/extraction pipeline.
///
/// # Errors
/// Returns [`PdfError`] on I/O failure, non-PDF input, encryption, or parse errors.
pub fn process_pdf_with_options<P: AsRef<Path>>(
    path: P,
    options: PdfOptions,
) -> Result<PdfProcessResult, PdfError> {
    // Timer starts before validation so `processing_time_ms` covers everything.
    let start = std::time::Instant::now();
    validate_pdf_file(&path)?;

    let (doc, page_count) = load_document_from_path(&path)?;

    process_document(doc, page_count, options, start)
}
195
/// Process an in-memory PDF with default options (full markdown conversion).
///
/// # Errors
/// Returns [`PdfError`] if the buffer is not a PDF or fails to parse.
pub fn process_pdf_mem(buffer: &[u8]) -> Result<PdfProcessResult, PdfError> {
    process_pdf_mem_with_options(buffer, PdfOptions::new())
}

/// Detect the type of an in-memory PDF without extracting any text.
///
/// # Errors
/// Returns [`PdfError`] if the buffer is not a PDF or fails to parse.
pub fn detect_pdf_mem(buffer: &[u8]) -> Result<PdfProcessResult, PdfError> {
    process_pdf_mem_with_options(buffer, PdfOptions::detect_only())
}
205
/// Process an in-memory PDF using the supplied `options`.
///
/// Validates the `%PDF-` header, loads the document, then runs the shared
/// detection/extraction pipeline.
///
/// # Errors
/// Returns [`PdfError`] on non-PDF input, encryption, or parse errors.
pub fn process_pdf_mem_with_options(
    buffer: &[u8],
    options: PdfOptions,
) -> Result<PdfProcessResult, PdfError> {
    // Timer starts before validation so `processing_time_ms` covers everything.
    let start = std::time::Instant::now();
    validate_pdf_bytes(buffer)?;

    let (doc, page_count) = load_document_from_mem(buffer)?;

    process_document(doc, page_count, options, start)
}
220
/// Legacy entry point kept for backward compatibility; forwards to
/// [`process_pdf_with_options`].
#[deprecated(since = "0.2.0", note = "Use process_pdf_with_options instead")]
pub fn process_pdf_with_config<P: AsRef<Path>>(
    path: P,
    config: DetectionConfig,
    markdown_options: MarkdownOptions,
) -> Result<PdfProcessResult, PdfError> {
    process_pdf_with_options(
        path,
        PdfOptions::new()
            .detection(config)
            .markdown(markdown_options),
    )
}

/// Legacy entry point with a page filter; forwards to
/// [`process_pdf_with_options`].
#[deprecated(since = "0.2.0", note = "Use process_pdf_with_options instead")]
pub fn process_pdf_with_config_pages<P: AsRef<Path>>(
    path: P,
    config: DetectionConfig,
    markdown_options: MarkdownOptions,
    page_filter: Option<&HashSet<u32>>,
) -> Result<PdfProcessResult, PdfError> {
    let mut opts = PdfOptions::new()
        .detection(config)
        .markdown(markdown_options);
    // Set directly: the builder's `pages()` takes an iterator, not an Option.
    opts.page_filter = page_filter.cloned();
    process_pdf_with_options(path, opts)
}

/// Legacy in-memory entry point; forwards to
/// [`process_pdf_mem_with_options`].
#[deprecated(since = "0.2.0", note = "Use process_pdf_mem_with_options instead")]
pub fn process_pdf_mem_with_config(
    buffer: &[u8],
    config: DetectionConfig,
    markdown_options: MarkdownOptions,
) -> Result<PdfProcessResult, PdfError> {
    process_pdf_mem_with_options(
        buffer,
        PdfOptions::new()
            .detection(config)
            .markdown(markdown_options),
    )
}
269
/// Read the file at `path` into memory and parse it as a PDF document.
fn load_document_from_path<P: AsRef<Path>>(path: P) -> Result<(Document, u32), PdfError> {
    let buffer = std::fs::read(&path)?;
    load_document_from_mem(&buffer)
}

/// Parse an in-memory PDF, returning the parsed document and its page count.
///
/// Applies a pre-parse fixup for bare structure-tree names and retries with an
/// empty password when the document reports itself as encrypted.
fn load_document_from_mem(buffer: &[u8]) -> Result<(Document, u32), PdfError> {
    // Pre-process raw bytes before parsing; the result is borrowed via as_ref(),
    // so presumably a Cow-like value that avoids copying when nothing changed
    // — see structure_tree::fix_bare_struct_names.
    let fixed = structure_tree::fix_bare_struct_names(buffer);
    let buf = fixed.as_ref();

    // Many nominally "encrypted" PDFs use an empty user password; retry with ""
    // before surfacing PdfError::Encrypted.
    let doc = match Document::load_mem(buf) {
        Ok(d) => d,
        Err(ref e) if is_encrypted_lopdf_error(e) => Document::load_mem_with_password(buf, "")?,
        Err(e) => return Err(e.into()),
    };
    let page_count = doc.get_pages().len() as u32;
    Ok((doc, page_count))
}
300
/// Core pipeline shared by all entry points: detect the PDF type, optionally
/// extract positioned text, build markdown, and apply post-hoc sanity checks
/// (garbage-text detection, OCR recommendations, gid-encoded font handling).
///
/// `start` is the caller's timer so `processing_time_ms` also covers
/// validation and document loading.
fn process_document(
    doc: Document,
    page_count: u32,
    options: PdfOptions,
    start: std::time::Instant,
) -> Result<PdfProcessResult, PdfError> {
    // Phase 1: classify the document and flag pages that look like they need OCR.
    let detection = detector::detect_from_document(&doc, page_count, &options.detection)?;
    let pdf_type = detection.pdf_type;
    let pages_needing_ocr = detection.pages_needing_ocr;
    let title = detection.title;
    let confidence = detection.confidence;

    // Detect-only mode stops after classification: no extraction, no markdown.
    if options.mode == ProcessMode::DetectOnly {
        return Ok(PdfProcessResult {
            pdf_type,
            markdown: None,
            page_count,
            processing_time_ms: start.elapsed().as_millis() as u64,
            pages_needing_ocr,
            title,
            confidence,
            layout: LayoutComplexity::default(),
            has_encoding_issues: false,
        });
    }

    // Scanned / image-based documents have no usable text layer to extract.
    if matches!(pdf_type, PdfType::Scanned | PdfType::ImageBased) {
        return Ok(PdfProcessResult {
            pdf_type,
            markdown: None,
            page_count,
            processing_time_ms: start.elapsed().as_millis() as u64,
            pages_needing_ocr,
            title,
            confidence,
            layout: LayoutComplexity::default(),
            has_encoding_issues: false,
        });
    }

    // Phase 2: extract positioned text items plus the vector rects/lines used
    // later for table detection, honoring any page filter.
    let extracted = {
        let font_cmaps = FontCMaps::from_doc(&doc);
        let result = extractor::extract_positioned_text_from_doc(
            &doc,
            &font_cmaps,
            options.page_filter.as_ref(),
        );

        if pdf_type == PdfType::Mixed {
            // Mixed documents sometimes carry their real text as invisible
            // runs. If the visible text sample is garbage or empty, retry
            // with invisible text included.
            if let Ok((ref items, _, _)) = result.as_ref().map(|(e, _, _)| e) {
                let sample: String = items.iter().take(200).map(|i| i.text.as_str()).collect();
                if is_garbage_text(&sample) || sample.trim().is_empty() {
                    extractor::extract_positioned_text_include_invisible(
                        &doc,
                        &font_cmaps,
                        options.page_filter.as_ref(),
                    )
                } else {
                    result
                }
            } else {
                // Extraction failed outright; the invisible-text pass is the fallback.
                extractor::extract_positioned_text_include_invisible(
                    &doc,
                    &font_cmaps,
                    options.page_filter.as_ref(),
                )
            }
        } else {
            result
        }
    };

    // Mixed documents tolerate extraction failure (they may be mostly images);
    // every other type propagates the error.
    let extracted = if pdf_type == PdfType::Mixed {
        extracted.ok()
    } else {
        Some(extracted?)
    };

    // Tagged-PDF structure tree (if present): MCID→role mapping and tagged
    // tables, both optional inputs to markdown generation.
    let (struct_roles, struct_tables) = structure_tree::StructTree::from_doc(&doc)
        .map(|tree| {
            let page_ids = doc.get_pages();
            let roles = tree.mcid_to_roles(&page_ids);
            let tables = tree.extract_tables(&page_ids);
            if !roles.is_empty() {
                log::debug!(
                    "structure tree: {} pages with MCID roles, {} total MCIDs, {} tagged tables",
                    roles.len(),
                    tree.mcid_count(),
                    tables.len()
                );
            }
            let roles = if roles.is_empty() { None } else { Some(roles) };
            (roles, tables)
        })
        .unwrap_or((None, Vec::new()));

    // Phase 3: turn the extracted items into markdown plus layout metadata.
    let (markdown, layout, has_encoding_issues, gid_pages) = match extracted {
        Some(((items, rects, lines), page_thresholds, gid_encoded_pages)) => {
            // For TextBased PDFs, drop text from OCR-flagged pages when that
            // text is CID garbage — otherwise it would pollute the markdown
            // alongside the eventual OCR output.
            let (items, rects, lines) =
                if pages_needing_ocr.is_empty() || pdf_type != PdfType::TextBased {
                    (items, rects, lines)
                } else {
                    let ocr_set: std::collections::HashSet<u32> =
                        pages_needing_ocr.iter().copied().collect();
                    let mut garbage_pages: std::collections::HashSet<u32> =
                        std::collections::HashSet::new();
                    for &pg in &ocr_set {
                        let page_text: String = items
                            .iter()
                            .filter(|i| i.page == pg)
                            .map(|i| i.text.as_str())
                            .collect();
                        if is_cid_garbage(&page_text) {
                            garbage_pages.insert(pg);
                        }
                    }
                    if garbage_pages.is_empty() {
                        (items, rects, lines)
                    } else {
                        log::debug!(
                            "suppressing garbage text from OCR-flagged pages: {:?}",
                            garbage_pages
                        );
                        let items: Vec<_> = items
                            .into_iter()
                            .filter(|i| !garbage_pages.contains(&i.page))
                            .collect();
                        let rects: Vec<_> = rects
                            .into_iter()
                            .filter(|r| !garbage_pages.contains(&r.page))
                            .collect();
                        let lines: Vec<_> = lines
                            .into_iter()
                            .filter(|l| !garbage_pages.contains(&l.page))
                            .collect();
                        (items, rects, lines)
                    }
                };

            let layout = compute_layout_complexity(&items, &rects, &lines);

            // Analyze mode computes layout only; other modes also render markdown.
            let md = if options.mode == ProcessMode::Analyze {
                None
            } else {
                Some(markdown::to_markdown_from_items_with_rects_and_lines(
                    items,
                    options.markdown,
                    &rects,
                    &lines,
                    &page_thresholds,
                    struct_roles.as_ref(),
                    &struct_tables,
                ))
            };

            let enc = md.as_ref().is_some_and(|m| detect_encoding_issues(m));
            (md, layout, enc, gid_encoded_pages)
        }
        None => (
            None,
            LayoutComplexity::default(),
            false,
            std::collections::HashSet::new(),
        ),
    };

    // A Mixed document whose entire markdown is garbage is effectively a
    // scanned document: reclassify and drop the text.
    let (pdf_type, markdown, confidence) =
        if pdf_type == PdfType::Mixed && markdown.as_ref().is_some_and(|m| is_garbage_text(m)) {
            (PdfType::Scanned, None, 0.95)
        } else {
            (pdf_type, markdown, confidence)
        };

    // A TextBased document whose markdown is garbage needs OCR everywhere.
    let (markdown, has_encoding_issues, force_ocr_all) = if pdf_type == PdfType::TextBased
        && markdown.as_ref().is_some_and(|m| is_garbage_text(m))
    {
        log::debug!("TextBased PDF has garbage text — flagging all pages for OCR");
        (None, true, true)
    } else {
        (markdown, has_encoding_issues, false)
    };

    // Pages with gid-encoded fonts (presumably raw glyph ids with no usable
    // Unicode mapping — see extractor) cannot be decoded; merge them into the
    // OCR list, keeping it sorted and duplicate-free.
    let all_gid = !gid_pages.is_empty() && gid_pages.len() as u32 >= page_count;
    let mut pages_needing_ocr = pages_needing_ocr;
    if force_ocr_all {
        pages_needing_ocr = (1..=page_count).collect();
    }
    if !gid_pages.is_empty() {
        log::debug!("pages with gid-encoded fonts (need OCR): {:?}", gid_pages);
        for page in gid_pages {
            if !pages_needing_ocr.contains(&page) {
                pages_needing_ocr.push(page);
            }
        }
        pages_needing_ocr.sort_unstable();
    }

    // Sparse-extraction heuristic: a "text" PDF averaging under 50 chars/page
    // (and under 500 chars total) probably has an unusable text layer.
    if pdf_type == PdfType::TextBased
        && page_count > 0
        && pages_needing_ocr.is_empty()
        && markdown.is_some()
    {
        let md_len = markdown.as_ref().map_or(0, |m| m.len());
        let chars_per_page = md_len as f32 / page_count as f32;
        if chars_per_page < 50.0 && md_len < 500 {
            log::debug!(
                "sparse extraction: {:.0} chars/page — recommending OCR for all {} pages",
                chars_per_page,
                page_count
            );
            pages_needing_ocr = (1..=page_count).collect();
        }
    }

    // If every page is gid-encoded the markdown is meaningless — suppress it.
    let markdown = if all_gid {
        log::debug!(
            "all {} pages have gid-encoded fonts — suppressing markdown output",
            page_count
        );
        None
    } else {
        markdown
    };

    Ok(PdfProcessResult {
        pdf_type,
        markdown,
        page_count,
        processing_time_ms: start.elapsed().as_millis() as u64,
        pages_needing_ocr,
        title,
        confidence,
        layout,
        has_encoding_issues,
    })
}
567
/// Heuristic check for font-encoding problems in generated markdown.
///
/// Flags the text when it contains U+FFFD replacement characters, or when '$'
/// frequently appears *between letters* — a pattern produced by broken font
/// encodings (legitimate financial text has dollars next to digits, not letters).
fn detect_encoding_issues(markdown: &str) -> bool {
    // Any replacement character means a decode failure happened upstream.
    if markdown.contains('\u{FFFD}') {
        return true;
    }

    let dollar_count = markdown.matches('$').count();
    if dollar_count <= 10 {
        return false;
    }

    // Count letter-'$'-letter triples over the raw bytes; multi-byte UTF-8
    // units are never ASCII-alphabetic, so this is UTF-8 safe.
    let suspicious = markdown
        .as_bytes()
        .windows(3)
        .filter(|w| w[1] == b'$' && w[0].is_ascii_alphabetic() && w[2].is_ascii_alphabetic())
        .count();

    suspicious > 20 || suspicious * 2 > dollar_count
}
607
/// Heuristic for text that is mostly symbol noise rather than language.
///
/// Ignores whitespace and markdown syntax characters, then requires at least
/// 50 significant characters with fewer than half of them alphanumeric.
fn is_garbage_text(markdown: &str) -> bool {
    // Single pass: count alphanumeric vs. everything else, skipping
    // whitespace and common markdown punctuation ('#', '*', '|', '-').
    let (alphanum, other) = markdown
        .chars()
        .filter(|c| !c.is_whitespace() && !matches!(c, '#' | '*' | '|' | '-' | '\n'))
        .fold((0usize, 0usize), |(a, o), c| {
            if c.is_alphanumeric() {
                (a + 1, o)
            } else {
                (a, o + 1)
            }
        });

    let total = alphanum + other;
    // Short samples are inconclusive; otherwise garbage = majority non-alphanumeric.
    total >= 50 && alphanum * 2 < total
}
633
634fn is_cid_garbage(text: &str) -> bool {
642 if is_garbage_text(text) {
643 return true;
644 }
645 let mut total = 0usize;
646 let mut c1_control = 0usize;
647 let mut high_latin = 0usize;
648 for ch in text.chars() {
649 if ch.is_whitespace() {
650 continue;
651 }
652 total += 1;
653 if ('\u{0080}'..='\u{009F}').contains(&ch) {
655 c1_control += 1;
656 }
657 if ('\u{00A0}'..='\u{00FF}').contains(&ch) {
661 high_latin += 1;
662 }
663 }
664 if total < 5 {
665 return false;
666 }
667 if c1_control * 20 >= total {
669 return true;
670 }
671 let ascii_letters = text.chars().filter(|c| c.is_ascii_alphabetic()).count();
675 high_latin * 5 >= total * 2 && ascii_letters * 3 < total
676}
677
/// Compute per-page layout complexity: which pages contain tables and which
/// have multi-column text.
///
/// Mirrors the markdown pipeline's band handling: each page is split into
/// side-by-side bands, and every band is checked by the rect-based,
/// line-based, and positional table detectors in that order.
fn compute_layout_complexity(
    items: &[types::TextItem],
    rects: &[types::PdfRect],
    lines: &[types::PdfLine],
) -> LayoutComplexity {
    use markdown::analysis::calculate_font_stats_from_items;

    // Distinct page numbers that actually carry text items.
    let mut seen_pages: Vec<u32> = items.iter().map(|i| i.page).collect();
    seen_pages.sort();
    seen_pages.dedup();

    // Dominant font size feeds the positional table detector.
    let font_stats = calculate_font_stats_from_items(items);
    let base_size = font_stats.most_common_size;

    let mut pages_with_tables: Vec<u32> = Vec::new();
    for &page in &seen_pages {
        let page_items: Vec<&types::TextItem> = items.iter().filter(|i| i.page == page).collect();

        // split_side_by_side takes owned items, hence the clone.
        let owned_items: Vec<types::TextItem> = page_items.iter().map(|i| (*i).clone()).collect();
        let bands = markdown::split_side_by_side(&owned_items);

        // No side-by-side bands: treat the whole page as one band, using
        // (f32::MIN, f32::MAX) as the "full page" sentinel range.
        let band_ranges: Vec<(f32, f32)> = if bands.is_empty() {
            vec![(f32::MIN, f32::MAX)]
        } else {
            bands
        };

        let mut found_table = false;
        for &(x_lo, x_hi) in &band_ranges {
            let margin = 2.0;
            // Sentinel band keeps every item; real bands keep items whose x
            // falls inside the band expanded by `margin` on each side.
            let band_items: Vec<types::TextItem> = owned_items
                .iter()
                .filter(|item| {
                    x_lo == f32::MIN || (item.x >= x_lo - margin && item.x < x_hi + margin)
                })
                .cloned()
                .collect();

            let band_rects: Vec<types::PdfRect> = if x_lo == f32::MIN {
                rects.iter().filter(|r| r.page == page).cloned().collect()
            } else {
                markdown::filter_rects_to_band(rects, page, x_lo, x_hi)
            };

            let band_lines: Vec<types::PdfLine> = if x_lo == f32::MIN {
                lines.iter().filter(|l| l.page == page).cloned().collect()
            } else {
                markdown::filter_lines_to_band(lines, page, x_lo, x_hi)
            };

            // Detector cascade: rect-based, then line-based, then positional
            // heuristic; the first hit marks the page and stops the scan.
            let (rect_tables, _) = tables::detect_tables_from_rects(&band_items, &band_rects, page);
            if !rect_tables.is_empty() {
                found_table = true;
                break;
            }
            let line_tables = tables::detect_tables_from_lines(&band_items, &band_lines, page);
            if !line_tables.is_empty() {
                found_table = true;
                break;
            }
            let heuristic_tables = tables::detect_tables(&band_items, base_size, false);
            if !heuristic_tables.is_empty() {
                found_table = true;
                break;
            }
        }
        if found_table {
            pages_with_tables.push(page);
        }
    }

    // Column detection runs after table detection: detect_columns is told
    // whether the page has tables.
    let mut pages_with_columns: Vec<u32> = Vec::new();
    for page in seen_pages {
        let cols = extractor::detect_columns(items, page, pages_with_tables.contains(&page));
        if cols.len() >= 2 {
            pages_with_columns.push(page);
        }
    }

    let is_complex = !pages_with_tables.is_empty() || !pages_with_columns.is_empty();

    LayoutComplexity {
        is_complex,
        pages_with_tables,
        pages_with_columns,
    }
}
772
/// Errors produced by the PDF processing pipeline.
#[derive(Debug, thiserror::Error)]
pub enum PdfError {
    /// Underlying filesystem / read error.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),
    /// Parser failure that does not fit a more specific category.
    #[error("PDF parsing error: {0}")]
    Parse(String),
    /// Document is encrypted and could not be opened (even with an empty password).
    #[error("PDF is encrypted")]
    Encrypted,
    /// File has a PDF header but broken cross-reference / object structure.
    #[error("Invalid PDF structure")]
    InvalidStructure,
    /// Input is not a PDF at all; the payload describes what it looks like.
    #[error("Not a PDF: {0}")]
    NotAPdf(String),
}
786
impl From<lopdf::Error> for PdfError {
    /// Map lopdf's fine-grained error taxonomy onto this crate's coarser categories.
    fn from(e: lopdf::Error) -> Self {
        match e {
            lopdf::Error::IO(io_err) => PdfError::Io(io_err),
            // Anything password/decryption-related collapses to Encrypted.
            lopdf::Error::Decryption(_)
            | lopdf::Error::InvalidPassword
            | lopdf::Error::AlreadyEncrypted
            | lopdf::Error::UnsupportedSecurityHandler(_) => PdfError::Encrypted,
            lopdf::Error::Unimplemented(msg) if msg.contains("encrypted") => PdfError::Encrypted,
            // lopdf reports a missing %PDF magic as a parse error; surface it
            // as NotAPdf so callers can distinguish "wrong file type".
            lopdf::Error::Parse(ref pe) if pe.to_string().contains("invalid file header") => {
                PdfError::NotAPdf("invalid PDF file header".to_string())
            }
            // Broken xref/object machinery: it is a PDF, but a damaged one.
            lopdf::Error::MissingXrefEntry
            | lopdf::Error::Xref(_)
            | lopdf::Error::IndirectObject { .. }
            | lopdf::Error::ObjectIdMismatch
            | lopdf::Error::InvalidObjectStream(_)
            | lopdf::Error::InvalidOffset(_) => PdfError::InvalidStructure,
            other => PdfError::Parse(other.to_string()),
        }
    }
}
809
810pub(crate) fn is_encrypted_lopdf_error(e: &lopdf::Error) -> bool {
812 matches!(
813 e,
814 lopdf::Error::Decryption(_)
815 | lopdf::Error::InvalidPassword
816 | lopdf::Error::AlreadyEncrypted
817 | lopdf::Error::UnsupportedSecurityHandler(_)
818 ) || matches!(e, lopdf::Error::Unimplemented(msg) if msg.contains("encrypted"))
819}
820
/// Drop a UTF-8 BOM (if present) and any leading ASCII whitespace, returning
/// the remaining subslice.
fn strip_bom_and_whitespace(bytes: &[u8]) -> &[u8] {
    let without_bom = bytes.strip_prefix(b"\xEF\xBB\xBF").unwrap_or(bytes);
    let skip = without_bom
        .iter()
        .take_while(|c| c.is_ascii_whitespace())
        .count();
    &without_bom[skip..]
}
838
/// ASCII case-insensitive prefix test over byte slices.
fn starts_with_ci(haystack: &[u8], needle: &[u8]) -> bool {
    // get(..len) is None when the haystack is shorter than the needle.
    haystack
        .get(..needle.len())
        .is_some_and(|prefix| prefix.eq_ignore_ascii_case(needle))
}
849
850fn detect_file_type_hint(bytes: &[u8]) -> String {
852 if bytes.is_empty() {
853 return "file is empty".to_string();
854 }
855
856 let trimmed = strip_bom_and_whitespace(bytes);
857
858 if starts_with_ci(trimmed, b"<!doctype html")
860 || starts_with_ci(trimmed, b"<html")
861 || starts_with_ci(trimmed, b"<head")
862 || starts_with_ci(trimmed, b"<body")
863 {
864 return "file appears to be HTML".to_string();
865 }
866
867 if trimmed.starts_with(b"<?xml") || trimmed.starts_with(b"<") {
869 if starts_with_ci(trimmed, b"<?xml") {
870 return "file appears to be XML".to_string();
871 }
872 if trimmed.starts_with(b"<") && !trimmed.starts_with(b"<%") {
873 return "file appears to be XML".to_string();
874 }
875 }
876
877 if trimmed.starts_with(b"{") || trimmed.starts_with(b"[") {
879 return "file appears to be JSON".to_string();
880 }
881
882 if bytes.starts_with(&[0x89, 0x50, 0x4E, 0x47]) {
884 return "file appears to be a PNG image".to_string();
885 }
886
887 if bytes.starts_with(&[0xFF, 0xD8, 0xFF]) {
889 return "file appears to be a JPEG image".to_string();
890 }
891
892 if bytes.starts_with(&[0x50, 0x4B, 0x03, 0x04]) {
894 return "file appears to be a ZIP archive (possibly an Office document)".to_string();
895 }
896
897 let sample = &bytes[..bytes.len().min(512)];
899 let printable = sample
900 .iter()
901 .filter(|&&b| b.is_ascii_graphic() || b.is_ascii_whitespace())
902 .count();
903 if printable > sample.len() * 3 / 4 {
904 return "file appears to be plain text".to_string();
905 }
906
907 "file is not a PDF".to_string()
908}
909
910pub(crate) fn validate_pdf_bytes(buffer: &[u8]) -> Result<(), PdfError> {
914 if buffer.is_empty() {
915 return Err(PdfError::NotAPdf(detect_file_type_hint(buffer)));
916 }
917
918 let header = &buffer[..buffer.len().min(1024)];
919 let trimmed = strip_bom_and_whitespace(header);
920
921 if trimmed.starts_with(b"%PDF-") {
922 Ok(())
923 } else {
924 Err(PdfError::NotAPdf(detect_file_type_hint(buffer)))
925 }
926}
927
928pub(crate) fn validate_pdf_file<P: AsRef<Path>>(path: P) -> Result<(), PdfError> {
932 use std::io::Read;
933 let mut file = std::fs::File::open(path)?;
934 let mut buf = [0u8; 1024];
935 let n = file.read(&mut buf)?;
936 validate_pdf_bytes(&buf[..n])
937}
938
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_detect_encoding_issues_fffd() {
        // U+FFFD is an unconditional signal of a decode failure.
        assert!(detect_encoding_issues(
            "Some text with \u{FFFD} replacement"
        ));
    }

    #[test]
    fn test_detect_encoding_issues_dollar_as_space() {
        // Broken encodings that map the space glyph to '$' put dollars
        // between letters — which must be flagged.
        let garbled = "Last$advanced$Book$Programm$3th$Workshop$on$Chest$Wall$Deformities$and$More";
        assert!(detect_encoding_issues(garbled));
    }

    #[test]
    fn test_detect_encoding_issues_financial_text() {
        // Many dollars next to digits is legitimate — must not be flagged.
        let financial = "Revenue was $100M in Q1, up from $90M. Costs: $50M, $30M, $20M, $15M, $12M, $8M, $5M, $3M, $2M, $1M, $500K.";
        assert!(!detect_encoding_issues(financial));
    }

    #[test]
    fn test_detect_encoding_issues_clean_text() {
        assert!(!detect_encoding_issues(
            "Normal markdown text with no issues."
        ));
    }

    #[test]
    fn test_detect_encoding_issues_few_dollars() {
        // Below the 10-dollar threshold, letter$letter patterns are ignored.
        let text = "a$b c$d e$f";
        assert!(!detect_encoding_issues(text));
    }

    #[test]
    fn test_garbage_text_detection() {
        // Symbol soup: majority non-alphanumeric over a 50+ char sample.
        let garbage = ",&<X ~%5&8-!A ~*(!,-!U (/#!U X ~#/=U 9/%*(!U !( X \
                       (%U-(-/ V %&((8-#&&< *,(6--< %5&8-!( (,(/! #/<5U X \
                       º&( >/5 /5&(#(8-!5 *,(6--( *,%@/-A W";
        assert!(is_garbage_text(garbage));

        let normal = "This is a normal paragraph with words and sentences that contains enough characters to pass the threshold.";
        assert!(!is_garbage_text(normal));

        // Non-ASCII alphabets count as alphanumeric and must not be flagged.
        let cyrillic =
            "Роботизированные технологии комплексы для производства металлургических предприятий";
        assert!(!is_garbage_text(cyrillic));
    }

    #[test]
    fn test_cid_garbage_detection() {
        // Mojibake heavy in C1 control characters (U+0080–U+009F).
        let cid_garbage = "Ë>íÓ\tý\r\u{0088}æ&Ït\u{0094}äí;\ný;wAL¢©èåD\rü£\
                           qq\u{0096}¶Í Æ\réá; Ô 7G\u{008B}ý;èÕç¢ £ ý;C";
        assert!(
            is_cid_garbage(cid_garbage),
            "CID garbage with C1 controls should be detected"
        );

        // CJK text is alphanumeric and outside the Latin-1 high range, so the
        // high-latin heuristic must not fire.
        let korean = "본 가격표는 국내 거주 중인 외국인을 위한 한국어 가격표의 비공식 번역본입니다";
        assert!(
            !is_cid_garbage(korean),
            "Valid Korean text should not be flagged as garbage"
        );

        let japanese = "羽田空港新飛行経路に係る航空機騒音の測定結果";
        assert!(
            !is_cid_garbage(japanese),
            "Valid Japanese text should not be flagged as garbage"
        );
    }
}