use serde::{Deserialize, Serialize};
/// Strategy `AutoExtractor` follows to obtain the text of a page.
///
/// Serialized with snake_case tokens (`text_only`, `auto`, `force_ocr`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Default, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum ExtractMode {
    /// Read the native text layer directly; OCR routing is skipped.
    TextOnly,
    /// Route each page according to its classification (and
    /// `force_ocr_pages`). This is the default.
    #[default]
    Auto,
    /// Treat every page as needing OCR.
    ForceOcr,
}
/// Language or script family that selects which OCR recognition model is used.
///
/// `English` is the default. Textual aliases are parsed by
/// `OcrLanguage::from_code`; the canonical code of each variant is returned
/// by `OcrLanguage::code`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Default, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum OcrLanguage {
    /// English (default); canonical code `english`.
    #[default]
    English,
    /// Chinese; canonical code `chinese` (aliases include `zh`, `cjk`, `zh-cn`).
    Chinese,
    /// Traditional Chinese; canonical code `chinese_cht`.
    ChineseTraditional,
    /// Japanese; canonical code `japan`.
    Japanese,
    /// Korean; canonical code `korean`.
    Korean,
    /// Arabic; canonical code `arabic`.
    Arabic,
    /// Cyrillic script (aliases include `ru`, `uk`, `be`, `bg`, `sr`); canonical code `cyrillic`.
    Cyrillic,
    /// Latin script (aliases include `fr`, `de`, `es`, `it`, `pt`); canonical code `latin`.
    Latin,
    /// Devanagari script (aliases `hi`, `mr`, `ne`); canonical code `devanagari`.
    Devanagari,
    /// Tamil; canonical code `ta`.
    Tamil,
    /// Telugu; canonical code `te`.
    Telugu,
    /// Kannada; canonical code `ka`.
    Kannada,
}
/// File names and download URLs of the recognition model and character
/// dictionary for one [`OcrLanguage`], as produced by `OcrLanguage::spec`.
#[derive(Debug, Clone)]
pub struct OcrModelSpec {
    /// File name of the recognition model inside the model cache directory.
    pub rec_file: String,
    /// File name of the character dictionary inside the model cache directory.
    pub dict_file: String,
    /// URL the recognition model is downloaded from.
    pub rec_url: String,
    /// URL the character dictionary is downloaded from.
    pub dict_url: String,
}
impl OcrLanguage {
    /// Download URL of the text-detection model, which is shared by all languages.
    pub const DET_URL: &'static str =
        "https://huggingface.co/deepghs/paddleocr/resolve/main/det/ch_PP-OCRv4_det/model.onnx";

    /// Every supported language, in declaration order.
    pub const ALL: &'static [OcrLanguage] = &[
        Self::English,
        Self::Chinese,
        Self::ChineseTraditional,
        Self::Japanese,
        Self::Korean,
        Self::Arabic,
        Self::Cyrillic,
        Self::Latin,
        Self::Devanagari,
        Self::Tamil,
        Self::Telugu,
        Self::Kannada,
    ];

    /// Parses a language code or alias.
    ///
    /// Matching ignores surrounding whitespace and ASCII case. The empty
    /// string maps to [`OcrLanguage::English`]; unknown codes yield `None`.
    #[must_use]
    pub fn from_code(s: &str) -> Option<Self> {
        let normalized = s.trim().to_ascii_lowercase();
        match normalized.as_str() {
            "" | "en" | "eng" | "english" | "latin-en" => Some(Self::English),
            "zh" | "ch" | "chi" | "chinese" | "cjk" | "zh-cn" | "zh_hans" => Some(Self::Chinese),
            "zh-tw" | "zh_hant" | "chinese_cht" | "cht" | "traditional" => {
                Some(Self::ChineseTraditional)
            }
            "ja" | "jpn" | "japanese" | "japan" => Some(Self::Japanese),
            "ko" | "kor" | "korean" => Some(Self::Korean),
            "ar" | "ara" | "arabic" => Some(Self::Arabic),
            "ru" | "rus" | "russian" | "cyrillic" | "uk" | "be" | "bg" | "sr" => {
                Some(Self::Cyrillic)
            }
            "lat" | "latin" | "fr" | "de" | "es" | "it" | "pt" => Some(Self::Latin),
            "hi" | "mr" | "ne" | "devanagari" => Some(Self::Devanagari),
            "ta" | "tam" | "tamil" => Some(Self::Tamil),
            "te" | "tel" | "telugu" => Some(Self::Telugu),
            "kn" | "kan" | "ka" | "kannada" => Some(Self::Kannada),
            _ => None,
        }
    }

    /// Canonical code of the language.
    ///
    /// The code is interpolated into model file names and download URLs by
    /// [`OcrLanguage::spec`], and every code is accepted by
    /// [`OcrLanguage::from_code`].
    #[must_use]
    pub fn code(self) -> &'static str {
        match self {
            Self::English => "english",
            Self::Chinese => "chinese",
            Self::ChineseTraditional => "chinese_cht",
            Self::Japanese => "japan",
            Self::Korean => "korean",
            Self::Arabic => "arabic",
            Self::Cyrillic => "cyrillic",
            Self::Latin => "latin",
            Self::Devanagari => "devanagari",
            Self::Tamil => "ta",
            Self::Telugu => "te",
            Self::Kannada => "ka",
        }
    }

    /// File names and download URLs for this language's recognition model
    /// and dictionary.
    ///
    /// English and Chinese use fixed entries; every other language derives
    /// its names and URLs from [`OcrLanguage::code`].
    #[must_use]
    pub fn spec(self) -> OcrModelSpec {
        let (rec_file, dict_file, rec_url, dict_url) = match self {
            Self::English => (
                String::from("rec.onnx"),
                String::from("en_dict.txt"),
                String::from("https://huggingface.co/monkt/paddleocr-onnx/resolve/main/languages/english/rec.onnx"),
                String::from("https://huggingface.co/monkt/paddleocr-onnx/resolve/main/languages/english/dict.txt"),
            ),
            Self::Chinese => (
                String::from("rec_chinese.onnx"),
                String::from("chinese_dict.txt"),
                String::from("https://huggingface.co/monkt/paddleocr-onnx/resolve/main/languages/chinese/rec.onnx"),
                String::from("https://huggingface.co/monkt/paddleocr-onnx/resolve/main/languages/chinese/dict.txt"),
            ),
            other => {
                let code = other.code();
                (
                    format!("rec_{code}.onnx"),
                    format!("{code}_dict.txt"),
                    format!(
                        "https://huggingface.co/deepghs/paddleocr/resolve/main/rec/{code}_PP-OCRv3_rec/model.onnx"
                    ),
                    format!(
                        "https://raw.githubusercontent.com/PaddlePaddle/PaddleOCR/main/ppocr/utils/dict/{code}_dict.txt"
                    ),
                )
            }
        };
        OcrModelSpec {
            rec_file,
            dict_file,
            rec_url,
            dict_url,
        }
    }
}
/// Options controlling how `AutoExtractor` routes and extracts pages.
///
/// Because of `#[serde(default)]`, fields missing from a serialized form
/// take their value from `Default`, which is the `balanced()` preset.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(default, rename_all = "snake_case")]
#[non_exhaustive]
pub struct AutoExtractOptions {
    /// Overall extraction strategy.
    pub mode: ExtractMode,
    /// NOTE(review): not read by any code visible in this file; presumably
    /// enables rebuilding tables found in images — confirm against consumers.
    pub reconstruct_image_tables: bool,
    /// NOTE(review): not read by any code visible in this file — confirm
    /// semantics against consumers.
    pub emit_placeholders: bool,
    /// Requested OCR language codes. When empty, the language is
    /// auto-detected for engine selection and prefetch falls back to English.
    pub ocr_languages: Vec<String>,
    /// Optional text-confidence threshold; `classify_from_signals` uses
    /// `0.70` when this is `None`.
    pub min_text_confidence: Option<f32>,
    /// NOTE(review): not read by any code visible in this file — confirm
    /// semantics against consumers.
    pub table_confidence: Option<f32>,
    /// Page indices for which OCR is forced regardless of classification.
    pub force_ocr_pages: Vec<usize>,
}
/// The default options are the `balanced()` preset.
impl Default for AutoExtractOptions {
    fn default() -> Self {
        Self::balanced()
    }
}
impl AutoExtractOptions {
    /// Preset that reads only the native text layer: `TextOnly` mode with
    /// image-table reconstruction switched off. Everything else matches
    /// [`AutoExtractOptions::balanced`].
    #[must_use]
    pub fn fast() -> Self {
        Self {
            mode: ExtractMode::TextOnly,
            reconstruct_image_tables: false,
            ..Self::balanced()
        }
    }

    /// Default preset: `Auto` mode, image-table reconstruction and
    /// placeholders enabled, no language list, no confidence overrides and
    /// no forced-OCR pages.
    #[must_use]
    pub fn balanced() -> Self {
        let no_languages: Vec<String> = Vec::new();
        let no_forced_pages: Vec<usize> = Vec::new();
        Self {
            mode: ExtractMode::Auto,
            reconstruct_image_tables: true,
            emit_placeholders: true,
            ocr_languages: no_languages,
            min_text_confidence: None,
            table_confidence: None,
            force_ocr_pages: no_forced_pages,
        }
    }

    /// The balanced preset with explicit confidence thresholds
    /// (`min_text_confidence = 0.55`, `table_confidence = 0.45`).
    #[must_use]
    pub fn high_fidelity() -> Self {
        Self {
            min_text_confidence: Some(0.55),
            table_confidence: Some(0.45),
            ..Self::balanced()
        }
    }

    /// Starts a builder seeded by `AutoExtractOptionsBuilder::new`.
    #[must_use]
    pub fn builder() -> AutoExtractOptionsBuilder {
        AutoExtractOptionsBuilder::new()
    }
}
/// Chainable builder for [`AutoExtractOptions`].
///
/// Each setter consumes and returns the builder; `build` yields the options.
#[derive(Debug, Clone, Default)]
pub struct AutoExtractOptionsBuilder {
    // Options accumulated so far.
    opts: AutoExtractOptions,
}
impl AutoExtractOptionsBuilder {
    /// Creates a builder seeded with the `balanced` preset.
    #[must_use]
    pub fn new() -> Self {
        let opts = AutoExtractOptions::balanced();
        Self { opts }
    }

    /// Applies one in-place change to the pending options and hands the
    /// builder back for further chaining.
    fn edit(mut self, change: impl FnOnce(&mut AutoExtractOptions)) -> Self {
        change(&mut self.opts);
        self
    }

    /// Sets the extraction mode.
    #[must_use]
    pub fn mode(self, mode: ExtractMode) -> Self {
        self.edit(|o| o.mode = mode)
    }

    /// Sets the `reconstruct_image_tables` flag.
    #[must_use]
    pub fn reconstruct_image_tables(self, yes: bool) -> Self {
        self.edit(|o| o.reconstruct_image_tables = yes)
    }

    /// Sets the `emit_placeholders` flag.
    #[must_use]
    pub fn emit_placeholders(self, yes: bool) -> Self {
        self.edit(|o| o.emit_placeholders = yes)
    }

    /// Replaces the list of requested OCR language codes.
    #[must_use]
    pub fn ocr_languages<I, S>(self, langs: I) -> Self
    where
        I: IntoIterator<Item = S>,
        S: Into<String>,
    {
        let codes: Vec<String> = langs.into_iter().map(Into::into).collect();
        self.edit(move |o| o.ocr_languages = codes)
    }

    /// Sets the minimum text confidence, clamped to `0.0..=1.0`.
    #[must_use]
    pub fn min_text_confidence(self, c: f32) -> Self {
        let clamped = c.clamp(0.0, 1.0);
        self.edit(|o| o.min_text_confidence = Some(clamped))
    }

    /// Sets the table confidence, clamped to `0.0..=1.0`.
    #[must_use]
    pub fn table_confidence(self, c: f32) -> Self {
        let clamped = c.clamp(0.0, 1.0);
        self.edit(|o| o.table_confidence = Some(clamped))
    }

    /// Replaces the list of page indices for which OCR is forced.
    #[must_use]
    pub fn force_ocr_pages<I: IntoIterator<Item = usize>>(self, pages: I) -> Self {
        let forced: Vec<usize> = pages.into_iter().collect();
        self.edit(move |o| o.force_ocr_pages = forced)
    }

    /// Finishes the builder and returns the configured options.
    #[must_use]
    pub fn build(self) -> AutoExtractOptions {
        let Self { opts } = self;
        opts
    }
}
/// Classification of a page by where its text comes from.
///
/// `Scanned`, `ImageText` and `Mixed` pages are routed to OCR by
/// `AutoExtractor`; serialized with snake_case tokens.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum PageKind {
    /// The page has a usable native text layer.
    TextLayer,
    /// The page has no usable text layer and is treated as a scan.
    Scanned,
    /// Usable native text alongside partial image coverage.
    ImageText,
    /// Routed to OCR like `Scanned` and `ImageText`; `classify_from_signals`
    /// in this file never produces it.
    Mixed,
    /// The page was flagged as empty.
    Empty,
}
/// Machine-readable explanation attached to classification and extraction
/// results. Serialized with snake_case tokens.
///
/// Variants without a comment are not produced by code visible in this file.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum ReasonCode {
    /// No issue to report (for example, OCR returned text).
    Ok,
    /// Native text was accepted.
    NativeTextHighConfidence,
    /// The page was classified as scanned without a usable text layer.
    NoTextLayerPresent,
    /// Text looked repetitive, fragmented or mostly non-alphanumeric.
    TextLayerBelowThreshold,
    /// Text looked garbled or heavily fragmented.
    GlyphMappingMissing,
    EncryptedNoExtractPermission,
    ImageTableReconstructed,
    ImageTableNoStructure,
    ChartNotTranscribed,
    /// OCR was needed but was never attempted (feature disabled or no engine
    /// could be built).
    OcrRequestedButUnavailable,
    /// OCR was attempted but failed or returned no text, and native text was
    /// not trustworthy either.
    OcrLowConfidenceFallback,
    /// The page was flagged as empty.
    Empty,
}
/// Origin of a piece of extracted text. Serialized with snake_case tokens.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum ExtractSource {
    /// Text read from the native text layer.
    NativeText,
    /// Text produced by the OCR engine.
    Ocr,
    /// Not produced by code visible in this file.
    ImageTableRecovery,
    /// Native text returned after OCR was unavailable or unsuccessful, when
    /// that native text was empty or failed the quality gate.
    Fallback,
}
/// Kind of content a [`Region`] holds. Serialized with snake_case tokens.
///
/// Code visible in this file only produces `Text` and `Figure` regions.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum RegionKind {
    /// Plain text.
    Text,
    Heading,
    Table,
    /// Used by `extract_page` for the OCR-derived region of a hybrid page.
    Figure,
    Chart,
}
/// Quadrilateral described by four `[x, y]` corner points.
#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)]
pub struct Quad {
    /// The four corners, each stored as `[x, y]`.
    pub points: [[f32; 2]; 4],
}
impl Quad {
    /// Builds an axis-aligned quad from an origin `(x, y)` and a size
    /// `(w, h)`.
    ///
    /// Corners are stored in the order `(x, y + h)`, `(x + w, y + h)`,
    /// `(x + w, y)`, `(x, y)`.
    #[must_use]
    pub fn from_xywh(x: f32, y: f32, w: f32, h: f32) -> Self {
        let far_x = x + w;
        let far_y = y + h;
        let points = [[x, far_y], [far_x, far_y], [far_x, y], [x, y]];
        Self { points }
    }
}
/// Table content that can be attached to a [`Region`].
///
/// NOTE(review): no code visible in this file populates this type; the field
/// notes below are inferred from names and should be confirmed.
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub struct TableData {
    /// Cell text as a list of rows — presumably row-major; confirm.
    pub rows: Vec<Vec<String>>,
    /// Presumably whether the first row is a header — confirm.
    pub has_header: bool,
    /// Presumably a Markdown rendering of the table — confirm.
    pub markdown: String,
}
/// One extracted region of a page together with its provenance.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub struct Region {
    /// Bounding quad; `extract_page` fills it from the page media box.
    pub bbox: Quad,
    /// Kind of content in the region.
    pub kind: RegionKind,
    /// Extracted text.
    pub text: String,
    /// Optional table payload; `extract_page` always sets `None`.
    pub table: Option<TableData>,
    /// Confidence value; `extract_page` copies the page classification confidence.
    pub confidence: f32,
    /// Where the text came from.
    pub source: ExtractSource,
    /// Reason code associated with the text.
    pub reason: ReasonCode,
}
/// Overall outcome of extracting a page or a document. Serialized with
/// snake_case tokens.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum ExtractionStatus {
    /// Text was recovered and the reason code is `Ok` or
    /// `NativeTextHighConfidence` (for a document: on every page).
    Complete,
    /// Text was recovered, but with some other reason code.
    PartialSuccess,
    /// Only empty or whitespace text was recovered.
    NoTextRecovered,
}
/// Result of extracting a single page (see `AutoExtractor::extract_page`).
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub struct PageExtraction {
    /// Page index that was extracted.
    pub page: usize,
    /// Classification of the page.
    pub kind: PageKind,
    /// Final text of the page.
    pub text: String,
    /// Regions that make up the page text.
    pub regions: Vec<Region>,
    /// Confidence reported by the page classification.
    pub confidence: f32,
    /// Reason code for the page as a whole.
    pub reason: ReasonCode,
    /// Whether OCR output contributed to `text`.
    pub ocr_used: bool,
    /// Outcome for the page.
    pub status: ExtractionStatus,
}
/// Result of extracting a whole document (see
/// `AutoExtractor::extract_document`).
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub struct DocumentExtraction {
    /// Per-page results, in page order.
    pub pages: Vec<PageExtraction>,
    /// Outcome aggregated over all pages.
    pub status: ExtractionStatus,
    /// Indices of pages classified as `Scanned`, `ImageText` or `Mixed`.
    pub pages_needing_ocr: Vec<usize>,
}
/// Classification of a whole document, as returned by
/// `AutoExtractor::classify`.
///
/// NOTE(review): this type is constructed outside this file; the field notes
/// follow the field names and should be confirmed against the producer.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub struct DocumentClassification {
    /// Kind of each page — presumably in page order.
    pub pages: Vec<PageKind>,
    /// Presumably the indices of pages that need OCR.
    pub pages_needing_ocr: Vec<usize>,
    /// Aggregate summary of the page kinds.
    pub summary: DocumentSummary,
}
/// Aggregate description of a document's page kinds, computed by
/// `summarise`. Serialized with snake_case tokens.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum DocumentSummary {
    /// At least 80% of the non-empty pages are `TextLayer`.
    MostlyText,
    /// At least 80% of the non-empty pages are `Scanned`.
    MostlyScanned,
    /// Neither threshold is met.
    Mixed,
    /// There are no pages, or every page is `Empty`.
    Empty,
}
/// Class of image codec recorded in [`PageSignals`]. Serialized with
/// snake_case tokens.
///
/// `classify_from_signals` assigns a higher confidence to scanned pages whose
/// codec is `Ccitt` or `Jbig2`; it does not distinguish the other variants.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum ImageCodecClass {
    None,
    Ccitt,
    Jbig2,
    Dct,
    Other,
}
/// Prior about the software that produced the document, recorded in
/// [`PageSignals`]. Serialized with snake_case tokens.
///
/// `classify_from_signals` uses `Scanner` to raise the confidence of a
/// scanned verdict on pages without any text glyphs.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum ProducerPrior {
    Scanner,
    Authoring,
    Unknown,
}
/// Measurements of a page that `classify_from_signals` turns into a
/// [`PageKind`]. The notes describe how that function uses each field.
#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub struct PageSignals {
    /// Number of text glyphs; at least 24 are required for "usable" text.
    pub text_glyph_count: usize,
    /// Text area ratio; below `0.10` counts as sparse on image-covered pages.
    pub text_area_ratio: f32,
    /// Image area ratio; `0.80` or more is treated as full image coverage.
    pub image_area_ratio: f32,
    /// Image codec class.
    pub codec: ImageCodecClass,
    /// Invisible-text ratio; `0.50` or more keeps usable text on an
    /// image-covered page classified as `TextLayer`.
    pub invisible_text_ratio: f32,
    /// Garbled-character ratio; must not exceed `0.20` for usable text.
    pub garbled_ratio: f32,
    /// Fragmented-word ratio; must not exceed `0.80` for usable text.
    pub fragmented_word_ratio: f32,
    /// Not read by `classify_from_signals`.
    pub consecutive_repeat_ratio: f32,
    /// Vector path density; above `0.60` with no usable text and little image
    /// coverage yields `Scanned`.
    pub vector_path_density: f32,
    /// Raises the `TextLayer` confidence when set.
    pub has_reliable_structure: bool,
    /// Producer prior, consulted when the page has no text glyphs.
    pub producer_prior: ProducerPrior,
    /// Short-circuits classification to `PageKind::Empty`.
    pub page_is_empty: bool,
}
/// Classification verdict for one page together with the signals behind it.
#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub struct PageClassification {
    /// Page index.
    pub page: usize,
    /// Classified kind of the page.
    pub kind: PageKind,
    /// Confidence of the verdict.
    pub confidence: f32,
    /// Reason code explaining the verdict.
    pub reason: ReasonCode,
    /// Signals associated with the page.
    pub signals: PageSignals,
}
/// Heuristic sanity check on extracted text.
///
/// Returns `None` when the text looks acceptable or has fewer than 16
/// characters. Otherwise:
///
/// * `GlyphMappingMissing` — more than 20% of the characters are U+FFFD,
///   control characters other than U+0009..=U+000D, or private-use code
///   points; or, with at least 8 words, more than 80% of the words have at
///   most two characters.
/// * `TextLayerBelowThreshold` — with at least 8 words, more than 30% of them
///   repeat the previous word, or words are both fragmented (> 55%) and short
///   on average (< 2.5 characters); or fewer than 20% of all characters are
///   alphanumeric.
#[must_use]
pub fn text_quality_gate(text: &str) -> Option<ReasonCode> {
    // One pass over the characters gathers everything the character-level
    // checks need.
    let (mut total, mut garbled, mut alnum) = (0usize, 0usize, 0usize);
    for ch in text.chars() {
        total += 1;
        if matches!(
            ch,
            '\u{FFFD}' | '\u{0}'..='\u{8}' | '\u{E}'..='\u{1F}' | '\u{E000}'..='\u{F8FF}'
        ) {
            garbled += 1;
        }
        if ch.is_alphanumeric() {
            alnum += 1;
        }
    }
    if total < 16 {
        return None;
    }
    if garbled as f32 / total as f32 > 0.20 {
        return Some(ReasonCode::GlyphMappingMissing);
    }

    let words: Vec<&str> = text.split_whitespace().collect();
    let word_count = words.len();
    if word_count >= 8 {
        let (short_words, char_sum) = words.iter().fold((0usize, 0usize), |(short, sum), w| {
            let len = w.chars().count();
            (if len <= 2 { short + 1 } else { short }, sum + len)
        });
        let frag = short_words as f32 / word_count as f32;
        if frag > 0.80 {
            return Some(ReasonCode::GlyphMappingMissing);
        }
        let repeats = words.windows(2).filter(|pair| pair[0] == pair[1]).count();
        let repeat_ratio = repeats as f32 / word_count as f32;
        let avg_word_len = char_sum as f32 / word_count as f32;
        let fragmented_and_short = frag > 0.55 && avg_word_len < 2.5;
        if repeat_ratio > 0.30 || fragmented_and_short {
            return Some(ReasonCode::TextLayerBelowThreshold);
        }
    }

    if (alnum as f32 / total as f32) < 0.20 {
        Some(ReasonCode::TextLayerBelowThreshold)
    } else {
        None
    }
}
/// Classifies a page from its measured signals.
///
/// Returns the page kind, a confidence value and a reason code. The rules
/// are evaluated in order and the first match wins:
///
/// 1. an empty page → `Empty`;
/// 2. no usable text, dense vector paths and little image coverage → `Scanned`;
/// 3. image coverage of at least 80% → `TextLayer` when usable text is mostly
///    invisible, otherwise `Scanned` when text is sparse or unusable;
/// 4. usable text with partial image coverage → `ImageText`;
/// 5. any other usable text → `TextLayer`;
/// 6. some glyphs that are not usable → `TextLayer` (low confidence) when
///    clean and nearly image-free, otherwise `Scanned`;
/// 7. no glyphs at all → `Scanned`.
///
/// "Usable" text has at least 24 glyphs, a garbled ratio of at most `0.20`
/// and a fragmented-word ratio of at most `0.80`.
/// `cfg.min_text_confidence` (default `0.70`) only influences the confidence
/// reported by rule 5.
#[must_use]
pub fn classify_from_signals(
    s: &PageSignals,
    cfg: &AutoExtractOptions,
) -> (PageKind, f32, ReasonCode) {
    const SCAN_COVER_MIN: f32 = 0.80;
    const SPARSE_TEXT_MAX: f32 = 0.10;
    const MIN_GLYPHS: usize = 24;

    if s.page_is_empty {
        return (PageKind::Empty, 0.99, ReasonCode::Empty);
    }

    let clean = s.garbled_ratio <= 0.20 && s.fragmented_word_ratio <= 0.80;
    let usable = s.text_glyph_count >= MIN_GLYPHS && clean;
    let image_cover = s.image_area_ratio;

    if !usable && s.vector_path_density > 0.60 && image_cover < 0.20 {
        return (PageKind::Scanned, 0.80, ReasonCode::NoTextLayerPresent);
    }

    if image_cover >= SCAN_COVER_MIN {
        if usable && s.invisible_text_ratio >= 0.50 {
            return (PageKind::TextLayer, 0.85, ReasonCode::NativeTextHighConfidence);
        }
        if !usable || s.text_area_ratio < SPARSE_TEXT_MAX {
            let ccitt_or_jbig2 =
                matches!(s.codec, ImageCodecClass::Ccitt | ImageCodecClass::Jbig2);
            let confidence = if ccitt_or_jbig2 { 0.95 } else { 0.85 };
            return (PageKind::Scanned, confidence, ReasonCode::NoTextLayerPresent);
        }
    }

    if usable {
        if image_cover > 0.05 && image_cover < SCAN_COVER_MIN {
            return (PageKind::ImageText, 0.75, ReasonCode::Ok);
        }
        let floor = cfg.min_text_confidence.unwrap_or(0.70).max(0.80);
        let confidence = if s.has_reliable_structure {
            (floor + 0.10).min(0.99)
        } else {
            floor
        };
        return (PageKind::TextLayer, confidence, ReasonCode::NativeTextHighConfidence);
    }

    if s.text_glyph_count > 0 {
        return if clean && image_cover < 0.05 {
            (PageKind::TextLayer, 0.60, ReasonCode::NativeTextHighConfidence)
        } else {
            (PageKind::Scanned, 0.80, ReasonCode::GlyphMappingMissing)
        };
    }

    let confidence = if matches!(s.producer_prior, ProducerPrior::Scanner) {
        0.85
    } else {
        0.70
    };
    (PageKind::Scanned, confidence, ReasonCode::NoTextLayerPresent)
}
/// Summarises a document from the kinds of its pages.
///
/// `Empty` pages are ignored. If nothing remains the summary is `Empty`;
/// otherwise it is `MostlyText` when at least 80% of the remaining pages are
/// `TextLayer`, `MostlyScanned` when at least 80% are `Scanned`, and `Mixed`
/// in every other case.
#[must_use]
pub fn summarise(pages: &[PageKind]) -> DocumentSummary {
    let (mut counted, mut text, mut scanned) = (0usize, 0usize, 0usize);
    for kind in pages {
        match kind {
            PageKind::Empty => continue,
            PageKind::TextLayer => text += 1,
            PageKind::Scanned => scanned += 1,
            _ => {}
        }
        counted += 1;
    }

    if counted == 0 {
        return DocumentSummary::Empty;
    }
    // Integer form of "share >= 80%".
    let threshold = counted * 80;
    if text * 100 >= threshold {
        DocumentSummary::MostlyText
    } else if scanned * 100 >= threshold {
        DocumentSummary::MostlyScanned
    } else {
        DocumentSummary::Mixed
    }
}
use crate::converters::ConversionOptions;
use crate::document::PdfDocument;
/// Page- and document-level text extractor that chooses between the native
/// text layer and OCR according to its [`AutoExtractOptions`].
#[derive(Debug, Clone)]
pub struct AutoExtractor {
    // Options fixed at construction time.
    opts: AutoExtractOptions,
}
/// The default extractor is the one returned by `AutoExtractor::new`.
impl Default for AutoExtractor {
    fn default() -> Self {
        Self::new()
    }
}
impl AutoExtractor {
#[must_use]
pub fn new() -> Self {
Self {
opts: AutoExtractOptions::balanced(),
}
}
#[must_use]
pub fn text_only() -> Self {
Self {
opts: AutoExtractOptions::fast(),
}
}
/// Creates an extractor that uses the supplied options.
#[must_use]
pub fn with(opts: AutoExtractOptions) -> Self {
    Self { opts }
}
/// Returns the options this extractor was created with.
#[must_use]
pub fn options(&self) -> &AutoExtractOptions {
    &self.opts
}
/// Resolves the directory in which OCR model files are cached.
///
/// Resolution order:
///
/// 1. `PDF_OXIDE_MODEL_DIR`, used verbatim when set to a non-empty value;
/// 2. on Windows, `LOCALAPPDATA`, then `USERPROFILE`; elsewhere,
///    `XDG_CACHE_HOME` (only when it is an absolute path), then
///    `$HOME/.cache`;
/// 3. the relative directory `.cache` when none of the above is available.
///
/// For cases 2 and 3 the result has `pdf_oxide/models` appended.
///
/// Environment variables that are set but empty are treated as unset, so an
/// empty value can no longer yield an empty or unintentionally relative cache
/// path. An empty or relative `XDG_CACHE_HOME` is ignored, as the XDG Base
/// Directory specification requires.
#[must_use]
pub fn model_cache_dir() -> std::path::PathBuf {
    use std::path::PathBuf;

    // A set-but-empty variable carries no usable location.
    let non_empty_dir = |name: &str| -> Option<PathBuf> {
        std::env::var_os(name)
            .filter(|value| !value.is_empty())
            .map(PathBuf::from)
    };

    if let Some(explicit) = non_empty_dir("PDF_OXIDE_MODEL_DIR") {
        return explicit;
    }

    #[cfg(windows)]
    let base = non_empty_dir("LOCALAPPDATA").or_else(|| non_empty_dir("USERPROFILE"));
    #[cfg(not(windows))]
    let base = non_empty_dir("XDG_CACHE_HOME")
        // The XDG specification says relative paths must be ignored.
        .filter(|dir| dir.is_absolute())
        .or_else(|| non_empty_dir("HOME").map(|home| home.join(".cache")));

    base.unwrap_or_else(|| PathBuf::from(".cache"))
        .join("pdf_oxide")
        .join("models")
}
pub fn prefetch_models(langs: &[OcrLanguage]) -> crate::Result<std::path::PathBuf> {
let dir = Self::model_cache_dir();
std::fs::create_dir_all(&dir).map_err(crate::error::Error::Io)?;
#[cfg(feature = "ocr")]
{
let mut want: Vec<OcrLanguage> = langs.to_vec();
if want.is_empty() {
want.push(OcrLanguage::English);
}
Self::http_fetch(OcrLanguage::DET_URL, &dir.join("det.onnx"))?;
for l in want {
let s = l.spec();
Self::http_fetch(&s.rec_url, &dir.join(&s.rec_file))?;
let dp = dir.join(&s.dict_file);
Self::http_fetch(&s.dict_url, &dp)?;
if let Ok(c) = std::fs::read_to_string(&dp) {
if c.lines().last() != Some(" ") {
let _ = std::fs::write(&dp, format!("{}\n ", c.trim_end_matches('\n')));
}
}
}
}
#[cfg(not(feature = "ocr"))]
{
let _ = langs; }
Ok(dir)
}
/// Prefetches the models for English only; see `prefetch_models`.
///
/// # Errors
///
/// Propagates any error from `prefetch_models`.
pub fn prefetch_models_default() -> crate::Result<std::path::PathBuf> {
    Self::prefetch_models(&[OcrLanguage::English])
}
/// Reports whether this build was compiled with the `ocr` feature, which is
/// what gates model downloads in `prefetch_models`.
#[must_use]
pub fn prefetch_available() -> bool {
    cfg!(feature = "ocr")
}
/// Prefetches the models for the languages configured in this extractor's
/// options.
///
/// Codes that `OcrLanguage::from_code` does not recognise are skipped; if
/// none remain, English is fetched.
///
/// # Errors
///
/// Propagates any error from `prefetch_models`.
pub fn prefetch(&self) -> crate::Result<std::path::PathBuf> {
    let mut requested: Vec<OcrLanguage> = Vec::new();
    for code in &self.opts.ocr_languages {
        if let Some(lang) = OcrLanguage::from_code(code) {
            requested.push(lang);
        }
    }
    if requested.is_empty() {
        requested.push(OcrLanguage::English);
    }
    Self::prefetch_models(&requested)
}
/// Downloads `url` to `dest`, doing nothing when `dest` already exists as a
/// file.
///
/// The whole response body is read into memory and rejected when it is
/// shorter than 256 bytes. It is then written to a sibling path that has the
/// `part` extension and renamed onto `dest`.
///
/// # Errors
///
/// Request, body-read, undersized-body, write and rename failures are all
/// reported as `Error::Io`.
#[cfg(feature = "ocr")]
fn http_fetch(url: &str, dest: &std::path::Path) -> crate::Result<()> {
    use std::io::Read;
    // An existing file is trusted as-is; there is no re-validation.
    if dest.is_file() {
        return Ok(());
    }
    // Wraps a message in the crate's I/O error variant.
    let ioerr = |m: String| crate::error::Error::Io(std::io::Error::other(m));
    // One agent per download, with a 180-second overall timeout.
    let agent = ureq::Agent::config_builder()
        .timeout_global(Some(std::time::Duration::from_secs(180)))
        .build()
        .new_agent();
    let mut resp = agent
        .get(url)
        .call()
        .map_err(|e| ioerr(format!("prefetch GET {url}: {e}")))?;
    let mut buf = Vec::new();
    resp.body_mut()
        .as_reader()
        .read_to_end(&mut buf)
        .map_err(|e| ioerr(format!("prefetch read {url}: {e}")))?;
    // A tiny body is treated as an error page rather than a model file.
    if buf.len() < 256 {
        return Err(ioerr(format!(
            "prefetch {url}: response too small ({} bytes) — likely an error page",
            buf.len()
        )));
    }
    // Write to a temporary sibling first so `dest` only appears once the
    // body has been written out completely.
    let tmp = dest.with_extension("part");
    std::fs::write(&tmp, &buf).map_err(crate::error::Error::Io)?;
    std::fs::rename(&tmp, dest).map_err(crate::error::Error::Io)?;
    Ok(())
}
/// Returns a JSON document describing every downloadable OCR model.
///
/// The object has a `detector` entry (file name and URL), a `languages`
/// array with one entry per language in `OcrLanguage::ALL` (canonical code,
/// file names and URLs from `OcrLanguage::spec`) and a free-text `note`.
#[must_use]
pub fn model_manifest() -> String {
    // Driven by `OcrLanguage::ALL` rather than a hand-written copy of the
    // variant list, so a newly added language cannot be left out.
    let entries: Vec<serde_json::Value> = OcrLanguage::ALL
        .iter()
        .map(|l| {
            let s = l.spec();
            serde_json::json!({
                "language": l.code(),
                "rec_file": s.rec_file,
                "dict_file": s.dict_file,
                "rec_url": s.rec_url,
                "dict_url": s.dict_url,
            })
        })
        .collect();
    serde_json::json!({
        "detector": { "file": "det.onnx", "url": OcrLanguage::DET_URL },
        "languages": entries,
        "note": "Hebrew has no upstream PaddleOCR recognition model; \
                 the loader is ready if one is provided.",
    })
    .to_string()
}
/// Classifies every page of `doc` by delegating to
/// `PdfDocument::classify_document`. The extractor's options are not
/// consulted here.
///
/// # Errors
///
/// Propagates any error from `classify_document`.
pub fn classify(&self, doc: &PdfDocument) -> crate::Result<DocumentClassification> {
    doc.classify_document()
}
/// Extracts the text of one page.
///
/// In `TextOnly` mode the native text layer is returned directly. Otherwise
/// the page is classified and routed to native extraction or OCR.
///
/// # Errors
///
/// Propagates errors from native extraction, classification and routing.
pub fn extract_text(&self, doc: &PdfDocument, page: usize) -> crate::Result<String> {
    if self.opts.mode == ExtractMode::TextOnly {
        return doc.extract_text(page);
    }
    let classification = doc.classify_page(page)?;
    let (text, _source, _reason) = self.route(doc, page, &classification)?;
    Ok(text)
}
/// Maps a language code to the file names of its recognition model and
/// dictionary.
///
/// Known codes use the names from `OcrLanguage::spec`. Unknown codes are
/// trimmed, lower-cased and turned into `rec_<code>.onnx` and
/// `<code>_dict.txt`.
#[cfg(feature = "ocr")]
fn ocr_lang_files(lang: &str) -> (String, String) {
    if let Some(known) = OcrLanguage::from_code(lang) {
        let OcrModelSpec {
            rec_file,
            dict_file,
            ..
        } = known.spec();
        return (rec_file, dict_file);
    }
    let custom = lang.trim().to_ascii_lowercase();
    (format!("rec_{custom}.onnx"), format!("{custom}_dict.txt"))
}
/// Guesses an OCR language from script statistics of existing native text.
///
/// The text of `page` is sampled, falling back to page 0 when it is blank.
/// Within the first 8000 characters, Han, Cyrillic, Arabic and Devanagari
/// code points and ASCII letters are counted. The most frequent non-Latin
/// script is returned when it has at least four characters and at least half
/// as many characters as there are ASCII letters; otherwise the result is
/// `None`.
#[cfg(feature = "ocr")]
#[must_use]
fn detect_ocr_language(doc: &PdfDocument, page: usize) -> Option<OcrLanguage> {
    let page_text = doc.extract_text(page).unwrap_or_default();
    let sample = if page_text.trim().is_empty() {
        doc.extract_text(0).unwrap_or_default()
    } else {
        page_text
    };
    if sample.trim().is_empty() {
        return None;
    }

    let mut han = 0usize;
    let mut cyrillic = 0usize;
    let mut arabic = 0usize;
    let mut devanagari = 0usize;
    let mut latin = 0usize;
    for ch in sample.chars().take(8000) {
        match ch {
            '\u{4E00}'..='\u{9FFF}' | '\u{3400}'..='\u{4DBF}' => han += 1,
            '\u{0400}'..='\u{04FF}' => cyrillic += 1,
            '\u{0600}'..='\u{06FF}' | '\u{0750}'..='\u{077F}' => arabic += 1,
            '\u{0900}'..='\u{097F}' => devanagari += 1,
            'A'..='Z' | 'a'..='z' => latin += 1,
            _ => {}
        }
    }

    let candidates = [
        (han, OcrLanguage::Chinese),
        (cyrillic, OcrLanguage::Cyrillic),
        (arabic, OcrLanguage::Arabic),
        (devanagari, OcrLanguage::Devanagari),
    ];
    let (count, language) = candidates.into_iter().max_by_key(|&(count, _)| count)?;
    if count >= 4 && count * 2 >= latin {
        Some(language)
    } else {
        None
    }
}
/// Builds an OCR engine for `page`.
///
/// The configured `ocr_languages` are used when present; otherwise the
/// language is auto-detected from the document's native text, and an empty
/// request is passed on when detection finds nothing. Returns `None` when
/// `load_ocr_engine` cannot load an engine.
#[cfg(feature = "ocr")]
#[must_use]
fn build_ocr_engine(&self, doc: &PdfDocument, page: usize) -> Option<crate::ocr::OcrEngine> {
    let configured = &self.opts.ocr_languages;
    let requested: Vec<String> = if configured.is_empty() {
        Self::detect_ocr_language(doc, page)
            .into_iter()
            .map(|lang| lang.code().to_string())
            .collect()
    } else {
        configured.clone()
    };
    Self::load_ocr_engine(&requested)
}
/// Loads an OCR engine from the model cache directory.
///
/// Requires `det.onnx` to be present. The non-blank entries of `langs` are
/// tried in order, followed by `"english"`; the first language whose
/// recognition model and dictionary both exist and for which the engine can
/// be created wins. Returns `None` when nothing can be loaded.
#[cfg(feature = "ocr")]
#[must_use]
fn load_ocr_engine(langs: &[String]) -> Option<crate::ocr::OcrEngine> {
    let cache = Self::model_cache_dir();
    let det = cache.join("det.onnx");
    if !det.is_file() {
        return None;
    }

    langs
        .iter()
        .map(String::as_str)
        .filter(|code| !code.trim().is_empty())
        .chain(std::iter::once("english"))
        .find_map(|code| {
            let (rec_name, dict_name) = Self::ocr_lang_files(code);
            let rec = cache.join(&rec_name);
            let dict = cache.join(&dict_name);
            if !(rec.is_file() && dict.is_file()) {
                return None;
            }
            crate::ocr::OcrEngine::new(&det, &rec, &dict, crate::ocr::OcrConfig::default()).ok()
        })
}
/// Chooses between native extraction and OCR for one classified page.
///
/// Returns the text together with its source and a reason code:
///
/// * Pages that do not need OCR yield their native text with the
///   classification's reason.
/// * Pages that need OCR (forced by mode or page list, or classified as
///   `Scanned`, `ImageText` or `Mixed`) yield the OCR text when the `ocr`
///   feature is enabled, an engine can be built and OCR returns non-blank
///   text.
/// * Otherwise the native text is returned: as `NativeText` when it is
///   non-blank and passes `text_quality_gate`, else as `Fallback` with
///   `OcrLowConfidenceFallback` (OCR ran) or `OcrRequestedButUnavailable`
///   (OCR never ran).
///
/// # Errors
///
/// Only native-text extraction errors are propagated; OCR failures are
/// logged and trigger the fallback.
fn route(
    &self,
    doc: &PdfDocument,
    page: usize,
    cls: &PageClassification,
) -> crate::Result<(String, ExtractSource, ReasonCode)> {
    let forced =
        self.opts.mode == ExtractMode::ForceOcr || self.opts.force_ocr_pages.contains(&page);
    let wants_ocr =
        forced || matches!(cls.kind, PageKind::Scanned | PageKind::ImageText | PageKind::Mixed);
    if !wants_ocr {
        let text = doc.extract_text(page)?;
        return Ok((text, ExtractSource::NativeText, cls.reason));
    }

    #[allow(unused_mut)]
    let mut ocr_attempted = false;
    #[cfg(feature = "ocr")]
    {
        if let Some(engine) = self.build_ocr_engine(doc, page) {
            // From here on OCR has actually run, whatever its outcome.
            ocr_attempted = true;
            let outcome = crate::ocr::extract_text_with_ocr(
                doc,
                page,
                Some(&engine),
                crate::ocr::OcrExtractOptions::default(),
            );
            match outcome {
                Ok(text) if !text.trim().is_empty() => {
                    return Ok((text, ExtractSource::Ocr, ReasonCode::Ok));
                }
                Ok(_) => log::warn!(
                    "auto-extract: OCR produced no text for page \
                     {page} (kind={:?}); falling back to native text",
                    cls.kind
                ),
                Err(e) => log::warn!(
                    "auto-extract: OCR failed for page {page}: {e}; \
                     falling back to native text"
                ),
            }
        } else {
            log::warn!(
                "auto-extract: page {page} (kind={:?}) needs OCR but \
                 no models in {} — run scripts/setup_ocr_models.sh or \
                 set PDF_OXIDE_MODEL_DIR; falling back to native text",
                cls.kind,
                Self::model_cache_dir().display()
            );
        }
    }
    #[cfg(not(feature = "ocr"))]
    {
        log::warn!(
            "auto-extract: OCR unavailable (ocr feature not enabled) \
             for page {page} (kind={:?}); falling back to native \
             text (reason OcrRequestedButUnavailable)",
            cls.kind
        );
    }

    let native = doc.extract_text(page)?;
    let trustworthy = !native.trim().is_empty() && text_quality_gate(&native).is_none();
    if trustworthy {
        return Ok((native, ExtractSource::NativeText, ReasonCode::NativeTextHighConfidence));
    }
    let reason = if ocr_attempted {
        ReasonCode::OcrLowConfidenceFallback
    } else {
        ReasonCode::OcrRequestedButUnavailable
    };
    Ok((native, ExtractSource::Fallback, reason))
}
/// Converts one page to Markdown using default conversion options.
///
/// The extractor's own options are not consulted; this delegates directly to
/// `PdfDocument::to_markdown`.
///
/// # Errors
///
/// Propagates any error from `to_markdown`.
pub fn extract_markdown(&self, doc: &PdfDocument, page: usize) -> crate::Result<String> {
    doc.to_markdown(page, &ConversionOptions::default())
}
/// Converts one page to HTML using default conversion options.
///
/// The extractor's own options are not consulted; this delegates directly to
/// `PdfDocument::to_html`.
///
/// # Errors
///
/// Propagates any error from `to_html`.
pub fn extract_html(&self, doc: &PdfDocument, page: usize) -> crate::Result<String> {
    doc.to_html(page, &ConversionOptions::default())
}
/// Extracts the text of every page and joins the pages with a blank line
/// (`"\n\n"`).
///
/// # Errors
///
/// Stops at, and returns, the first error from the page count lookup or from
/// extracting a page.
pub fn extract_document_text(&self, doc: &PdfDocument) -> crate::Result<String> {
    let page_count = doc.page_count()?;
    let mut combined = String::new();
    for index in 0..page_count {
        let page_text = self.extract_text(doc, index)?;
        if index != 0 {
            combined.push_str("\n\n");
        }
        combined.push_str(&page_text);
    }
    Ok(combined)
}
/// Converts every page to Markdown and joins the pages with a blank line
/// (`"\n\n"`).
///
/// # Errors
///
/// Stops at, and returns, the first error from the page count lookup or from
/// converting a page.
pub fn extract_document_markdown(&self, doc: &PdfDocument) -> crate::Result<String> {
    let page_count = doc.page_count()?;
    let mut combined = String::new();
    for index in 0..page_count {
        let page_markdown = self.extract_markdown(doc, index)?;
        if index != 0 {
            combined.push_str("\n\n");
        }
        combined.push_str(&page_markdown);
    }
    Ok(combined)
}
/// Extracts one page into a structured [`PageExtraction`].
///
/// The page is classified first. With the `ocr` feature enabled, pages
/// classified as `ImageText` or `Mixed` (outside `TextOnly` mode) take a
/// hybrid path: native text and OCR text are gathered independently, each
/// becomes its own region, and the page text is the merge of both (or
/// whichever one exists). If neither yields text, or for any other page, the
/// single-region path below is used: native text in `TextOnly` mode,
/// otherwise whatever `route` returns.
///
/// Every region's bounding quad is the page media box, or a zero-sized quad
/// when the media box is unavailable.
///
/// # Errors
///
/// Propagates errors from classification, and from native extraction and
/// routing on the single-region path. On the hybrid path, native-extraction
/// and OCR errors are swallowed and treated as "no text".
pub fn extract_page(&self, doc: &PdfDocument, page: usize) -> crate::Result<PageExtraction> {
    let cls = doc.classify_page(page)?;
    #[cfg(feature = "ocr")]
    {
        // Hybrid path: keep native and OCR text side by side.
        if matches!(cls.kind, PageKind::ImageText | PageKind::Mixed)
            && !matches!(self.opts.mode, ExtractMode::TextOnly)
        {
            let native = doc.extract_text(page).unwrap_or_default();
            // OCR text, or `None` when no engine is available, OCR fails or
            // the result is blank.
            let ocr: Option<String> = self
                .build_ocr_engine(doc, page)
                .and_then(|e| {
                    crate::ocr::ocr_page(
                        doc,
                        page,
                        &e,
                        &crate::ocr::OcrExtractOptions::default(),
                    )
                    .ok()
                })
                .filter(|t| !t.trim().is_empty());
            // Normalise the media box corners into origin + size.
            let bbox = doc
                .get_page_media_box(page)
                .map(|(x0, y0, x1, y1)| {
                    Quad::from_xywh(x0.min(x1), y0.min(y1), (x1 - x0).abs(), (y1 - y0).abs())
                })
                .unwrap_or(Quad::from_xywh(0.0, 0.0, 0.0, 0.0));
            let mut regions = Vec::new();
            if !native.trim().is_empty() {
                // NOTE(review): text that fails the quality gate is tagged
                // `Ok` rather than with the gate's own reason code — confirm
                // this is intended.
                let nr = if text_quality_gate(&native).is_none() {
                    ReasonCode::NativeTextHighConfidence
                } else {
                    ReasonCode::Ok
                };
                regions.push(Region {
                    bbox,
                    kind: RegionKind::Text,
                    text: native.clone(),
                    table: None,
                    confidence: cls.confidence,
                    source: ExtractSource::NativeText,
                    reason: nr,
                });
            }
            if let Some(o) = ocr.as_ref() {
                regions.push(Region {
                    bbox,
                    kind: RegionKind::Figure,
                    text: o.clone(),
                    table: None,
                    confidence: cls.confidence,
                    source: ExtractSource::Ocr,
                    reason: ReasonCode::Ok,
                });
            }
            // With no region at all, fall through to the single-region path.
            if !regions.is_empty() {
                let text = match ocr.as_ref() {
                    Some(o) if !native.trim().is_empty() => {
                        crate::ocr::merge_native_and_ocr(&native, o)
                    },
                    Some(o) => o.clone(),
                    None => native.clone(),
                };
                let ocr_used = ocr.is_some();
                let status = if text.trim().is_empty() {
                    ExtractionStatus::NoTextRecovered
                } else {
                    ExtractionStatus::Complete
                };
                return Ok(PageExtraction {
                    page,
                    kind: cls.kind,
                    text,
                    regions,
                    confidence: cls.confidence,
                    reason: ReasonCode::Ok,
                    ocr_used,
                    status,
                });
            }
        }
    }
    // Single-region path.
    let (text, source, reason) = if matches!(self.opts.mode, ExtractMode::TextOnly) {
        (doc.extract_text(page)?, ExtractSource::NativeText, cls.reason)
    } else {
        self.route(doc, page, &cls)?
    };
    let ocr_used = source == ExtractSource::Ocr;
    let bbox = doc
        .get_page_media_box(page)
        .map(|(x0, y0, x1, y1)| {
            Quad::from_xywh(x0.min(x1), y0.min(y1), (x1 - x0).abs(), (y1 - y0).abs())
        })
        .unwrap_or(Quad::from_xywh(0.0, 0.0, 0.0, 0.0));
    // Blank text wins over the reason code; only `Ok` and
    // `NativeTextHighConfidence` count as complete.
    let status = if text.trim().is_empty() {
        ExtractionStatus::NoTextRecovered
    } else if matches!(reason, ReasonCode::Ok | ReasonCode::NativeTextHighConfidence) {
        ExtractionStatus::Complete
    } else {
        ExtractionStatus::PartialSuccess
    };
    let region = Region {
        bbox,
        kind: RegionKind::Text,
        text: text.clone(),
        table: None,
        confidence: cls.confidence,
        source,
        reason,
    };
    Ok(PageExtraction {
        page,
        kind: cls.kind,
        text,
        regions: vec![region],
        confidence: cls.confidence,
        reason,
        ocr_used,
        status,
    })
}
/// Extracts every page of the document and aggregates the results.
///
/// `pages_needing_ocr` lists the indices of pages classified as `Scanned`,
/// `ImageText` or `Mixed`. The overall status is `NoTextRecovered` when no
/// page has non-blank text, `Complete` when every page's reason is `Ok` or
/// `NativeTextHighConfidence`, and `PartialSuccess` otherwise.
///
/// # Errors
///
/// Stops at, and returns, the first error from the page count lookup or from
/// extracting a page.
pub fn extract_document(&self, doc: &PdfDocument) -> crate::Result<DocumentExtraction> {
    let page_count = doc.page_count()?;
    let mut pages = Vec::with_capacity(page_count);
    for index in 0..page_count {
        pages.push(self.extract_page(doc, index)?);
    }

    let pages_needing_ocr: Vec<usize> = pages
        .iter()
        .enumerate()
        .filter(|(_, extraction)| {
            matches!(
                extraction.kind,
                PageKind::Scanned | PageKind::ImageText | PageKind::Mixed
            )
        })
        .map(|(index, _)| index)
        .collect();

    let has_text = pages.iter().any(|extraction| !extraction.text.trim().is_empty());
    let every_page_ok = pages.iter().all(|extraction| {
        matches!(
            extraction.reason,
            ReasonCode::Ok | ReasonCode::NativeTextHighConfidence
        )
    });
    let status = match (has_text, every_page_ok) {
        (false, _) => ExtractionStatus::NoTextRecovered,
        (true, true) => ExtractionStatus::Complete,
        (true, false) => ExtractionStatus::PartialSuccess,
    };

    Ok(DocumentExtraction {
        pages,
        status,
        pages_needing_ocr,
    })
}
}
// Unit tests for the option presets, serde wire format, quality gate,
// classification cascade, document summary and extractor construction.
#[cfg(test)]
mod tests {
    use super::*;
    // The derived enum default and the options default both select `Auto`.
    #[test]
    fn extract_mode_default_is_auto() {
        assert_eq!(ExtractMode::default(), ExtractMode::Auto);
        assert_eq!(AutoExtractOptions::default().mode, ExtractMode::Auto);
    }
    // Each preset exposes the mode and flags it promises.
    #[test]
    fn presets_have_expected_shape() {
        assert_eq!(AutoExtractOptions::fast().mode, ExtractMode::TextOnly);
        assert!(!AutoExtractOptions::fast().reconstruct_image_tables);
        assert_eq!(AutoExtractOptions::balanced().mode, ExtractMode::Auto);
        assert!(AutoExtractOptions::balanced().reconstruct_image_tables);
        assert_eq!(AutoExtractOptions::high_fidelity().mode, ExtractMode::Auto);
        assert!(AutoExtractOptions::high_fidelity()
            .min_text_confidence
            .is_some());
        assert_eq!(AutoExtractOptions::default(), AutoExtractOptions::balanced());
    }
    // Builder setters overwrite fields; an out-of-range confidence is clamped.
    #[test]
    fn builder_mirrors_ocrconfigbuilder_shape() {
        let o = AutoExtractOptions::builder()
            .mode(ExtractMode::ForceOcr)
            .reconstruct_image_tables(false)
            .ocr_languages(["en", "de"])
            .min_text_confidence(2.0) .force_ocr_pages([0, 2])
            .build();
        assert_eq!(o.mode, ExtractMode::ForceOcr);
        assert!(!o.reconstruct_image_tables);
        assert_eq!(o.ocr_languages, vec!["en".to_string(), "de".to_string()]);
        assert_eq!(o.min_text_confidence, Some(1.0)); assert_eq!(o.force_ocr_pages, vec![0, 2]);
    }
    // Options survive a JSON round trip; missing fields take the balanced
    // defaults.
    #[test]
    fn options_json_roundtrip_is_stable() {
        let o = AutoExtractOptions::high_fidelity();
        let js = serde_json::to_string(&o).expect("serialize");
        assert!(js.contains("\"mode\":\"auto\""));
        let back: AutoExtractOptions = serde_json::from_str(&js).expect("deserialize");
        assert_eq!(o, back);
        let partial: AutoExtractOptions =
            serde_json::from_str(r#"{"mode":"force_ocr"}"#).expect("partial");
        assert_eq!(partial.mode, ExtractMode::ForceOcr);
        assert!(partial.reconstruct_image_tables); }
    // Enum variants serialize as snake_case tokens.
    #[test]
    fn reason_and_enum_wire_tokens_are_snake_case_frozen() {
        assert_eq!(
            serde_json::to_string(&ReasonCode::OcrRequestedButUnavailable).unwrap(),
            "\"ocr_requested_but_unavailable\""
        );
        assert_eq!(
            serde_json::to_string(&ExtractSource::ImageTableRecovery).unwrap(),
            "\"image_table_recovery\""
        );
        assert_eq!(serde_json::to_string(&PageKind::ImageText).unwrap(), "\"image_text\"");
    }
    // Checks the first and third corners produced by `Quad::from_xywh`.
    #[test]
    fn quad_from_xywh_is_tl_tr_br_bl() {
        let q = Quad::from_xywh(10.0, 20.0, 30.0, 40.0);
        assert_eq!(q.points[0], [10.0, 60.0]); assert_eq!(q.points[2], [40.0, 20.0]); }
    // Baseline signals: all zero, not empty, unknown producer.
    fn sig() -> PageSignals {
        PageSignals {
            text_glyph_count: 0,
            text_area_ratio: 0.0,
            image_area_ratio: 0.0,
            codec: ImageCodecClass::None,
            invisible_text_ratio: 0.0,
            garbled_ratio: 0.0,
            fragmented_word_ratio: 0.0,
            consecutive_repeat_ratio: 0.0,
            vector_path_density: 0.0,
            has_reliable_structure: false,
            producer_prior: ProducerPrior::Unknown,
            page_is_empty: false,
        }
    }
    // Replacement-character garbage is flagged; ordinary prose passes.
    #[test]
    fn quality_gate_flags_cid_garbage_and_passes_clean() {
        let garbage: String = "\u{FFFD}".repeat(40);
        assert_eq!(text_quality_gate(&garbage), Some(ReasonCode::GlyphMappingMissing));
        assert_eq!(
            text_quality_gate("The quick brown fox jumps over the lazy dog repeatedly."),
            None
        );
    }
    // Single-letter fragments and consecutive repeated words are both flagged.
    #[test]
    fn quality_gate_catches_column_scramble_and_fragmentation() {
        let frag = "a b c d e f g h i j k l m n o p q r s t";
        assert_eq!(text_quality_gate(frag), Some(ReasonCode::GlyphMappingMissing));
        let scramble = "alpha alpha beta beta gamma gamma delta delta epsilon epsilon zeta zeta";
        assert_eq!(text_quality_gate(scramble), Some(ReasonCode::TextLayerBelowThreshold));
    }
    // Walks the classification cascade: empty, scanned, sparse text over a
    // scan, hybrid image + text, and a plain text layer.
    #[test]
    fn cascade_empty_scanned_sparse_over_scan_hybrid_textlayer() {
        let mut s = sig();
        s.page_is_empty = true;
        assert_eq!(classify_from_signals(&s, &AutoExtractOptions::balanced()).0, PageKind::Empty);
        let mut s = sig();
        s.image_area_ratio = 0.97;
        s.codec = ImageCodecClass::Ccitt;
        let (k, c, _) = classify_from_signals(&s, &AutoExtractOptions::balanced());
        assert_eq!(k, PageKind::Scanned);
        assert!(c >= 0.95);
        let mut s = sig();
        s.image_area_ratio = 0.95;
        s.text_glyph_count = 60; s.text_area_ratio = 0.02; assert_eq!(classify_from_signals(&s, &AutoExtractOptions::balanced()).0, PageKind::Scanned);
        let mut s = sig();
        s.text_glyph_count = 800;
        s.text_area_ratio = 0.5;
        s.image_area_ratio = 0.25;
        assert_eq!(
            classify_from_signals(&s, &AutoExtractOptions::balanced()).0,
            PageKind::ImageText
        );
        let mut s = sig();
        s.text_glyph_count = 1200;
        s.text_area_ratio = 0.6;
        s.has_reliable_structure = true;
        let (k, c, r) = classify_from_signals(&s, &AutoExtractOptions::balanced());
        assert_eq!(k, PageKind::TextLayer);
        assert_eq!(r, ReasonCode::NativeTextHighConfidence);
        assert!(c >= 0.90);
    }
    // A full-page image with plentiful, mostly invisible text stays
    // `TextLayer`.
    #[test]
    fn cascade_keeps_good_ocr_sidecar_over_scan() {
        let mut s = sig();
        s.image_area_ratio = 0.96;
        s.text_glyph_count = 1500;
        s.text_area_ratio = 0.55;
        s.invisible_text_ratio = 0.95;
        assert_eq!(
            classify_from_signals(&s, &AutoExtractOptions::balanced()).0,
            PageKind::TextLayer
        );
    }
    // `summarise` ignores empty pages and applies the 80% thresholds.
    #[test]
    fn summary_is_aggregate_only_never_forced_mode() {
        use PageKind::*;
        assert_eq!(summarise(&[]), DocumentSummary::Empty);
        assert_eq!(summarise(&[Empty, Empty]), DocumentSummary::Empty);
        assert_eq!(
            summarise(&[TextLayer, TextLayer, TextLayer, TextLayer, Empty]),
            DocumentSummary::MostlyText
        );
        assert_eq!(
            summarise(&[Scanned, Scanned, Scanned, Scanned]),
            DocumentSummary::MostlyScanned
        );
        assert_eq!(summarise(&[TextLayer, Scanned, ImageText, Scanned]), DocumentSummary::Mixed);
    }
    // Constructors, the manifest and the cache-dir lookup all work without a
    // document.
    #[test]
    fn auto_extractor_construction_is_cheap_and_infallible() {
        assert_eq!(AutoExtractor::new().options().mode, ExtractMode::Auto);
        assert_eq!(AutoExtractor::default().options().mode, ExtractMode::Auto);
        assert_eq!(AutoExtractor::text_only().options().mode, ExtractMode::TextOnly);
        let ae = AutoExtractor::with(AutoExtractOptions::high_fidelity());
        assert!(ae.options().min_text_confidence.is_some());
        let mm = AutoExtractor::model_manifest();
        assert!(mm.contains("det.onnx") && mm.contains("english"));
        assert!(AutoExtractor::model_cache_dir()
            .to_string_lossy()
            .contains("pdf_oxide"));
    }
}