use super::error::{PdfError, PdfResult};
use super::ExtractionProgress;
use serde::{Deserialize, Serialize};
use std::path::Path;
use std::process::{Command, Stdio};
/// Identifies which PDF extraction backend is requested or was used.
///
/// The enum is `Copy`, serde-serializable and defaults to [`Backend::Auto`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
pub enum Backend {
/// No specific backend; reported as available when either Marker or Nougat is.
/// NOTE(review): how `Auto` is resolved to a concrete backend is not visible in this file.
#[default]
Auto,
/// The Marker backend (see `MarkerBackend`).
Marker,
/// The Nougat backend (see `NougatBackend`).
Nougat,
}
impl Backend {
    /// Returns the lowercase identifier of this backend variant
    /// (`"auto"`, `"marker"` or `"nougat"`).
    pub fn name(&self) -> &'static str {
        match *self {
            Self::Auto => "auto",
            Self::Marker => "marker",
            Self::Nougat => "nougat",
        }
    }

    /// Probes whether this backend can be used on the current machine.
    ///
    /// The concrete variants delegate to their availability checks, each of which
    /// spawns a subprocess. `Auto` is available as soon as one concrete backend is;
    /// Marker is probed first and Nougat is only probed if Marker is missing.
    pub fn is_available(&self) -> bool {
        match *self {
            Self::Marker => check_marker_available(),
            Self::Nougat => check_nougat_available(),
            // `any` short-circuits, so the probe order and count match `a || b`.
            Self::Auto => [Self::Marker, Self::Nougat]
                .iter()
                .any(Self::is_available),
        }
    }
}
/// Capability summary reported by a backend through `PdfBackend::info`.
///
/// The backends in this file fill every field with fixed constants, except
/// `gpu_available`, which they derive from the configured device string.
#[derive(Debug, Clone)]
pub struct BackendCapabilities {
/// Maximum number of pages the backend reports it can process concurrently.
pub max_concurrent_pages: usize,
/// Whether the backend reports support for recognising mathematics.
pub supports_math_ocr: bool,
/// Whether the backend reports support for tables.
pub supports_tables: bool,
/// Whether the backend reports support for figures.
pub supports_figures: bool,
/// Throughput figure in pages per second on CPU
/// (presumably a rough estimate rather than a measurement — confirm).
pub pages_per_second_cpu: f32,
/// Throughput figure in pages per second on GPU
/// (presumably a rough estimate rather than a measurement — confirm).
pub pages_per_second_gpu: f32,
/// `true` when the backend's configured device string starts with `"cuda"`;
/// no actual hardware probe is performed by the backends in this file.
pub gpu_available: bool,
}
/// Descriptive information about a backend, as returned by `PdfBackend::info`.
#[derive(Debug, Clone)]
pub struct BackendInfo {
/// Which backend this record describes.
pub backend: Backend,
/// Version string reported by the backend's Python package, or `"unknown"`
/// when the version query exits unsuccessfully.
pub version: String,
/// The backends in this file store the configured Python interpreter path here.
pub path: String,
/// What the backend reports it can do.
pub capabilities: BackendCapabilities,
}
/// Extraction result for one page.
///
/// The backends in this file process a document as a whole and emit exactly one
/// `ExtractedPage` (numbered 1) holding the entire output.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedPage {
/// Page number; the backends in this file always use 1.
pub page_number: usize,
/// LaTeX produced from the backend output.
pub latex: String,
/// The backend's Markdown-style output before conversion to LaTeX, if kept.
pub markdown: Option<String>,
/// Number of equations counted in `latex`.
pub equation_count: usize,
/// Number of figures; the backends in this file always report 0.
pub figure_count: usize,
/// Number of tables; the backends in this file always report 0.
pub table_count: usize,
/// Confidence score; the backends in this file use a fixed constant per backend.
/// NOTE(review): the intended range (presumably 0.0..=1.0) is not enforced here.
pub confidence: f32,
/// Warnings attached to this page; the backends in this file leave it empty.
pub warnings: Vec<String>,
}
/// Complete result of extracting one PDF.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedDocument {
/// Path of the source PDF, converted lossily to a string.
pub source_path: String,
/// Backend that produced this document.
pub backend: Backend,
/// Per-page results.
pub pages: Vec<ExtractedPage>,
/// LaTeX for the whole document.
pub latex: String,
/// Document metadata; the backends in this file leave it at its default.
pub metadata: DocumentMetadata,
/// Counters and timing collected during extraction.
pub stats: ExtractionStats,
}
impl ExtractedDocument {
    /// Number of pages held in `pages`.
    pub fn page_count(&self) -> usize {
        let Self { pages, .. } = self;
        pages.len()
    }

    /// Sum of `equation_count` over all pages.
    pub fn equation_count(&self) -> usize {
        self.pages
            .iter()
            .fold(0, |running, page| running + page.equation_count)
    }

    /// Arithmetic mean of the per-page confidence, or `0.0` for a document
    /// without pages (avoids dividing by zero).
    pub fn average_confidence(&self) -> f32 {
        let page_total = self.pages.len();
        if page_total == 0 {
            return 0.0;
        }
        let confidence_sum: f32 = self.pages.iter().map(|page| page.confidence).sum();
        confidence_sum / page_total as f32
    }
}
/// Bibliographic and file-level metadata of a document.
///
/// Every field is optional or may be empty; `Default` yields an entirely empty record.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct DocumentMetadata {
/// Document title, if known.
pub title: Option<String>,
/// Author names; empty when unknown.
pub authors: Vec<String>,
/// Abstract text, if known.
pub abstract_text: Option<String>,
/// Keywords; empty when unknown.
pub keywords: Vec<String>,
/// Creation date as a string.
/// NOTE(review): the expected date format is not established in this file.
pub created: Option<String>,
/// PDF version string, if known.
pub pdf_version: Option<String>,
}
/// Counters and timing for one extraction run.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ExtractionStats {
/// Elapsed time of the extraction in milliseconds.
pub processing_time_ms: u64,
/// Number of pages extracted successfully.
pub successful_pages: usize,
/// Number of pages that failed to extract.
pub failed_pages: usize,
/// Total number of characters in the extracted output.
pub total_characters: usize,
/// Total number of equations across all pages.
pub total_equations: usize,
/// Total number of figures across all pages.
pub total_figures: usize,
/// Total number of tables across all pages.
pub total_tables: usize,
}
/// Common interface of the PDF extraction backends.
///
/// `extract` is generic, so this trait cannot be used as `dyn PdfBackend`.
pub trait PdfBackend {
/// Returns version, location and capability information for the backend.
fn info(&self) -> PdfResult<BackendInfo>;
/// Extracts the PDF at `path`, reporting progress through the `progress` callback.
///
/// Returns the extracted document, or a `PdfError` when extraction fails.
fn extract<P, F>(&self, path: P, progress: &F) -> PdfResult<ExtractedDocument>
where
P: AsRef<Path>,
F: Fn(ExtractionProgress) + Send + Sync;
/// Reports whether the backend can currently be used.
fn is_available(&self) -> bool;
/// Returns the `Backend` variant that corresponds to this implementation.
fn backend_type(&self) -> Backend;
}
/// Backend that runs Marker through a Python interpreter.
pub struct MarkerBackend {
// Python interpreter used to run the Marker script and to query its version.
python_path: String,
// Optional Marker location set via `with_marker_path`.
// NOTE(review): no code visible in this file reads this field.
marker_path: Option<String>,
// Device string exported to the subprocess as `TORCH_DEVICE`; a value starting
// with "cuda" makes `info()` report a GPU.
device: String,
}
impl MarkerBackend {
    /// Creates a Marker backend with the defaults: interpreter `python3`,
    /// no explicit Marker path, device `cpu`.
    ///
    /// # Errors
    /// Returns `PdfError::BackendNotAvailable` when `check_marker_available`
    /// reports that Marker cannot be imported.
    pub fn new() -> PdfResult<Self> {
        if check_marker_available() {
            Ok(Self {
                python_path: String::from("python3"),
                marker_path: None,
                device: String::from("cpu"),
            })
        } else {
            Err(PdfError::BackendNotAvailable {
                backend: "marker".into(),
                reason: "Marker is not installed. Install with: pip install marker-pdf".into(),
            })
        }
    }

    /// Builder step: replaces the Python interpreter path.
    pub fn with_python_path(self, path: impl Into<String>) -> Self {
        Self {
            python_path: path.into(),
            ..self
        }
    }

    /// Builder step: records an explicit Marker path.
    pub fn with_marker_path(self, path: impl Into<String>) -> Self {
        Self {
            marker_path: Some(path.into()),
            ..self
        }
    }

    /// Builder step: replaces the device string (for example `"cpu"` or `"cuda"`).
    pub fn with_device(self, device: impl Into<String>) -> Self {
        Self {
            device: device.into(),
            ..self
        }
    }
}
/// Marker-backed implementation of [`PdfBackend`].
///
/// Extraction shells out to the configured Python interpreter, runs Marker's
/// `PdfConverter` on the whole file and reports the result as a single page.
impl PdfBackend for MarkerBackend {
    /// Describes this backend: Marker's version plus fixed capability figures.
    ///
    /// `gpu_available` is `true` when the configured device string starts with
    /// `"cuda"`; the hardware itself is not probed.
    ///
    /// # Errors
    /// Propagates the error from `get_marker_version` when the interpreter
    /// cannot be started.
    fn info(&self) -> PdfResult<BackendInfo> {
        let version = get_marker_version(&self.python_path)?;
        Ok(BackendInfo {
            backend: Backend::Marker,
            version,
            path: self.python_path.clone(),
            capabilities: BackendCapabilities {
                max_concurrent_pages: 4,
                supports_math_ocr: true,
                supports_tables: true,
                supports_figures: true,
                pages_per_second_cpu: 1.0,
                pages_per_second_gpu: 5.0,
                gpu_available: self.device.starts_with("cuda"),
            },
        })
    }

    /// Converts the PDF at `path` to LaTeX by running Marker in a Python subprocess.
    ///
    /// `progress` is invoked twice: once before the subprocess starts
    /// (`Analyzing`, 0 %) and once after the output has been converted
    /// (`Postprocessing`, 100 %).
    ///
    /// # Errors
    /// * `PdfError::FileNotFound` if `path` does not exist.
    /// * `PdfError::BackendFailed` if the interpreter cannot be spawned or exits
    ///   with a non-zero status (its stderr is included in the message).
    /// * `PdfError::InvalidOutput` if the subprocess stdout is not valid JSON.
    fn extract<P, F>(&self, path: P, progress: &F) -> PdfResult<ExtractedDocument>
    where
        P: AsRef<Path>,
        F: Fn(ExtractionProgress) + Send + Sync,
    {
        // The PDF path reaches Python as `sys.argv[1]` instead of being spliced
        // into the script text, so quotes, backslashes or newlines in the path
        // can neither break the script nor inject code into it.
        const MARKER_SCRIPT: &str = r#"
import sys
import json
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
# Initialize models
model_dict = create_model_dict()
# Convert PDF
converter = PdfConverter(artifact_dict=model_dict)
rendered = converter(sys.argv[1])
# Output as JSON
result = {
    "markdown": rendered.markdown,
    "metadata": rendered.metadata if hasattr(rendered, 'metadata') else {},
}
print(json.dumps(result))
"#;

        let path = path.as_ref();
        if !path.exists() {
            return Err(PdfError::FileNotFound(path.to_path_buf()));
        }
        let start_time = std::time::Instant::now();
        progress(ExtractionProgress {
            current_page: 0,
            total_pages: 0,
            percent: 0.0,
            stage: super::ExtractionStage::Analyzing,
        });

        // With `python -c <script> <arg>`, everything after the script is passed
        // through untouched and shows up as `sys.argv[1..]`.
        let output = Command::new(&self.python_path)
            .arg("-c")
            .arg(MARKER_SCRIPT)
            .arg(path)
            .env("TORCH_DEVICE", &self.device)
            .stdout(Stdio::piped())
            .stderr(Stdio::piped())
            .output()
            .map_err(|e| PdfError::BackendFailed {
                backend: "marker".into(),
                message: format!("Failed to execute Python: {}", e),
            })?;
        if !output.status.success() {
            let stderr = String::from_utf8_lossy(&output.stderr);
            return Err(PdfError::BackendFailed {
                backend: "marker".into(),
                message: format!("Marker failed: {}", stderr),
            });
        }

        let stdout = String::from_utf8_lossy(&output.stdout);
        let result: serde_json::Value =
            serde_json::from_str(&stdout).map_err(|e| PdfError::InvalidOutput {
                backend: "marker".into(),
                reason: format!("Failed to parse JSON output: {}", e),
            })?;
        // A missing or non-string "markdown" entry is treated as an empty document.
        let markdown = result["markdown"].as_str().unwrap_or("").to_string();
        let latex = markdown_to_latex(&markdown);

        // `$$` both opens and closes a display equation, so a pair counts once.
        let equation_count = latex.matches("\\begin{equation}").count()
            + latex.matches("\\[").count()
            + latex.matches("$$").count() / 2;
        let total_characters = latex.chars().count();
        let processing_time = start_time.elapsed();

        progress(ExtractionProgress {
            current_page: 1,
            total_pages: 1,
            percent: 100.0,
            stage: super::ExtractionStage::Postprocessing,
        });

        // Marker converts the file in one go, so the whole output is one page.
        Ok(ExtractedDocument {
            source_path: path.to_string_lossy().to_string(),
            backend: Backend::Marker,
            pages: vec![ExtractedPage {
                page_number: 1,
                latex: latex.clone(),
                markdown: Some(markdown),
                equation_count,
                figure_count: 0,
                table_count: 0,
                confidence: 0.9,
                warnings: Vec::new(),
            }],
            latex,
            metadata: DocumentMetadata::default(),
            stats: ExtractionStats {
                processing_time_ms: processing_time.as_millis() as u64,
                successful_pages: 1,
                failed_pages: 0,
                total_characters,
                total_equations: equation_count,
                total_figures: 0,
                total_tables: 0,
            },
        })
    }

    /// Re-runs the Marker import probe. The probe uses the default `python3`
    /// interpreter, not the one configured on this instance.
    fn is_available(&self) -> bool {
        check_marker_available()
    }

    /// Always [`Backend::Marker`].
    fn backend_type(&self) -> Backend {
        Backend::Marker
    }
}
/// Backend that runs the Nougat command-line tool.
pub struct NougatBackend {
// Python interpreter used to query the installed Nougat version.
python_path: String,
// Optional Nougat location set via `with_nougat_path`.
nougat_path: Option<String>,
// Device string; a value starting with "cuda" makes `info()` report a GPU.
device: String,
// Model tag passed to the Nougat CLI through `--model`.
model_tag: String,
}
impl NougatBackend {
    /// Creates a Nougat backend with the defaults: interpreter `python3`,
    /// no explicit Nougat path, device `cpu`, model tag `0.1.0-base`.
    ///
    /// # Errors
    /// Returns `PdfError::BackendNotAvailable` when `check_nougat_available`
    /// reports that the `nougat` command cannot be run.
    pub fn new() -> PdfResult<Self> {
        if check_nougat_available() {
            Ok(Self {
                python_path: String::from("python3"),
                nougat_path: None,
                device: String::from("cpu"),
                model_tag: String::from("0.1.0-base"),
            })
        } else {
            Err(PdfError::BackendNotAvailable {
                backend: "nougat".into(),
                reason: "Nougat is not installed. Install with: pip install nougat-ocr".into(),
            })
        }
    }

    /// Builder step: replaces the Python interpreter path.
    pub fn with_python_path(self, path: impl Into<String>) -> Self {
        Self {
            python_path: path.into(),
            ..self
        }
    }

    /// Builder step: records an explicit Nougat path.
    pub fn with_nougat_path(self, path: impl Into<String>) -> Self {
        Self {
            nougat_path: Some(path.into()),
            ..self
        }
    }

    /// Builder step: replaces the device string (for example `"cpu"` or `"cuda"`).
    pub fn with_device(self, device: impl Into<String>) -> Self {
        Self {
            device: device.into(),
            ..self
        }
    }

    /// Builder step: replaces the model tag handed to Nougat.
    pub fn with_model_tag(self, tag: impl Into<String>) -> Self {
        Self {
            model_tag: tag.into(),
            ..self
        }
    }
}
/// Nougat-backed implementation of [`PdfBackend`].
///
/// Extraction runs the Nougat command-line tool on the whole file, captures the
/// Mathpix Markdown it prints and reports the result as a single page.
impl PdfBackend for NougatBackend {
    /// Describes this backend: Nougat's version plus fixed capability figures.
    ///
    /// `gpu_available` is `true` when the configured device string starts with
    /// `"cuda"`; the hardware itself is not probed.
    ///
    /// # Errors
    /// Propagates the error from `get_nougat_version` when the interpreter
    /// cannot be started.
    fn info(&self) -> PdfResult<BackendInfo> {
        let version = get_nougat_version(&self.python_path)?;
        Ok(BackendInfo {
            backend: Backend::Nougat,
            version,
            path: self.python_path.clone(),
            capabilities: BackendCapabilities {
                max_concurrent_pages: 1,
                supports_math_ocr: true,
                supports_tables: true,
                supports_figures: false,
                pages_per_second_cpu: 0.1,
                pages_per_second_gpu: 1.0,
                gpu_available: self.device.starts_with("cuda"),
            },
        })
    }

    /// Converts the PDF at `path` to LaTeX by running the Nougat CLI.
    ///
    /// The executable is the path given to `with_nougat_path`, or `nougat`
    /// looked up on `PATH` when none was set.
    ///
    /// `progress` is invoked twice: once before the subprocess starts
    /// (`Analyzing`, 0 %) and once after the output has been converted
    /// (`Postprocessing`, 100 %).
    ///
    /// # Errors
    /// * `PdfError::FileNotFound` if `path` does not exist.
    /// * `PdfError::BackendFailed` if the executable cannot be spawned or exits
    ///   with a non-zero status (its stderr is included in the message).
    fn extract<P, F>(&self, path: P, progress: &F) -> PdfResult<ExtractedDocument>
    where
        P: AsRef<Path>,
        F: Fn(ExtractionProgress) + Send + Sync,
    {
        let path = path.as_ref();
        if !path.exists() {
            return Err(PdfError::FileNotFound(path.to_path_buf()));
        }
        let start_time = std::time::Instant::now();
        progress(ExtractionProgress {
            current_page: 0,
            total_pages: 0,
            percent: 0.0,
            stage: super::ExtractionStage::Analyzing,
        });

        // Honour an explicitly configured executable; otherwise rely on PATH.
        let executable = self.nougat_path.as_deref().unwrap_or("nougat");
        // No `--out` is passed on purpose: that option names an output
        // *directory*, and only without it does Nougat print the generated
        // Mathpix Markdown to stdout, which is what gets captured below.
        // NOTE(review): `self.device` is not forwarded to the CLI here — confirm
        // whether device selection needs to be wired through.
        let output = Command::new(executable)
            .arg(path)
            .arg("--model")
            .arg(&self.model_tag)
            .arg("--no-skipping")
            .stdout(Stdio::piped())
            .stderr(Stdio::piped())
            .output()
            .map_err(|e| PdfError::BackendFailed {
                backend: "nougat".into(),
                message: format!("Failed to execute nougat: {}", e),
            })?;
        if !output.status.success() {
            let stderr = String::from_utf8_lossy(&output.stderr);
            return Err(PdfError::BackendFailed {
                backend: "nougat".into(),
                message: format!("Nougat failed: {}", stderr),
            });
        }

        let mmd_content = String::from_utf8_lossy(&output.stdout).to_string();
        let latex = mathpix_markdown_to_latex(&mmd_content);

        // `$$` both opens and closes a display equation, so a pair counts once.
        let equation_count = latex.matches("\\begin{equation}").count()
            + latex.matches("\\[").count()
            + latex.matches("$$").count() / 2
            + latex.matches("\\begin{align}").count();
        let total_characters = latex.chars().count();
        let processing_time = start_time.elapsed();

        progress(ExtractionProgress {
            current_page: 1,
            total_pages: 1,
            percent: 100.0,
            stage: super::ExtractionStage::Postprocessing,
        });

        // The CLI output is not split per page, so everything is one page.
        Ok(ExtractedDocument {
            source_path: path.to_string_lossy().to_string(),
            backend: Backend::Nougat,
            pages: vec![ExtractedPage {
                page_number: 1,
                latex: latex.clone(),
                markdown: Some(mmd_content),
                equation_count,
                figure_count: 0,
                table_count: 0,
                confidence: 0.95,
                warnings: Vec::new(),
            }],
            latex,
            metadata: DocumentMetadata::default(),
            stats: ExtractionStats {
                processing_time_ms: processing_time.as_millis() as u64,
                successful_pages: 1,
                failed_pages: 0,
                total_characters,
                total_equations: equation_count,
                total_figures: 0,
                total_tables: 0,
            },
        })
    }

    /// Re-runs the Nougat probe, which looks for `nougat` on `PATH`.
    fn is_available(&self) -> bool {
        check_nougat_available()
    }

    /// Always [`Backend::Nougat`].
    fn backend_type(&self) -> Backend {
        Backend::Nougat
    }
}
/// Returns `true` when `python3 -c "import marker"` can be spawned and exits
/// successfully. Output of the probe is discarded; a spawn failure counts as
/// "not available".
fn check_marker_available() -> bool {
    let mut probe = Command::new("python3");
    probe.arg("-c").arg("import marker");
    probe.stdout(Stdio::null()).stderr(Stdio::null());
    match probe.status() {
        Ok(exit) => exit.success(),
        Err(_) => false,
    }
}
/// Returns `true` when `nougat --help` can be spawned and exits successfully.
/// Output of the probe is discarded; a spawn failure counts as "not available".
fn check_nougat_available() -> bool {
    let mut probe = Command::new("nougat");
    probe.arg("--help");
    probe.stdout(Stdio::null()).stderr(Stdio::null());
    match probe.status() {
        Ok(exit) => exit.success(),
        Err(_) => false,
    }
}
/// Asks the interpreter at `python_path` for `marker.__version__`.
///
/// Returns the trimmed stdout on success and `"unknown"` when the interpreter
/// runs but exits with a failure status.
///
/// # Errors
/// `PdfError::BackendFailed` when the interpreter cannot be spawned.
fn get_marker_version(python_path: &str) -> PdfResult<String> {
    let spawned = Command::new(python_path)
        .arg("-c")
        .arg("import marker; print(marker.__version__)")
        .output();
    let output = match spawned {
        Ok(output) => output,
        Err(e) => {
            return Err(PdfError::BackendFailed {
                backend: "marker".into(),
                message: format!("Failed to get version: {}", e),
            });
        }
    };
    if !output.status.success() {
        return Ok("unknown".to_string());
    }
    let reported = String::from_utf8_lossy(&output.stdout);
    Ok(reported.trim().to_string())
}
/// Asks the interpreter at `python_path` for `nougat.__version__`.
///
/// Returns the trimmed stdout on success and `"unknown"` when the interpreter
/// runs but exits with a failure status.
///
/// # Errors
/// `PdfError::BackendFailed` when the interpreter cannot be spawned.
fn get_nougat_version(python_path: &str) -> PdfResult<String> {
    let spawned = Command::new(python_path)
        .arg("-c")
        .arg("import nougat; print(nougat.__version__)")
        .output();
    let output = match spawned {
        Ok(output) => output,
        Err(e) => {
            return Err(PdfError::BackendFailed {
                backend: "nougat".into(),
                message: format!("Failed to get version: {}", e),
            });
        }
    };
    if !output.status.success() {
        return Ok("unknown".to_string());
    }
    let reported = String::from_utf8_lossy(&output.stdout);
    Ok(reported.trim().to_string())
}
fn markdown_to_latex(markdown: &str) -> String {
let mut latex = markdown.to_string();
latex = latex.replace("# ", "\\section{");
latex = latex.replace("## ", "\\subsection{");
latex = latex.replace("### ", "\\subsubsection{");
latex = regex::Regex::new(r"`([^`]+)`")
.unwrap()
.replace_all(&latex, r"\texttt{$1}")
.to_string();
latex = regex::Regex::new(r"\*\*([^*]+)\*\*")
.unwrap()
.replace_all(&latex, r"\textbf{$1}")
.to_string();
latex = regex::Regex::new(r"\*([^*]+)\*")
.unwrap()
.replace_all(&latex, r"\textit{$1}")
.to_string();
latex
}
fn mathpix_markdown_to_latex(mmd: &str) -> String {
let mut latex = mmd.to_string();
latex = regex::Regex::new(r"^# (.+)$")
.unwrap()
.replace_all(&latex, r"\section{$1}")
.to_string();
latex = regex::Regex::new(r"^## (.+)$")
.unwrap()
.replace_all(&latex, r"\subsection{$1}")
.to_string();
latex = regex::Regex::new(r"^### (.+)$")
.unwrap()
.replace_all(&latex, r"\subsubsection{$1}")
.to_string();
latex = regex::Regex::new(r"\*\*(.+?)\*\*")
.unwrap()
.replace_all(&latex, r"\textbf{$1}")
.to_string();
latex = regex::Regex::new(r"\*(.+?)\*")
.unwrap()
.replace_all(&latex, r"\textit{$1}")
.to_string();
latex
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Every backend variant maps to its lowercase identifier.
    #[test]
    fn test_backend_names() {
        let expected = [
            (Backend::Auto, "auto"),
            (Backend::Marker, "marker"),
            (Backend::Nougat, "nougat"),
        ];
        for &(backend, name) in expected.iter() {
            assert_eq!(backend.name(), name);
        }
    }

    /// Headings, bold and italic markup all show up as LaTeX commands.
    #[test]
    fn test_markdown_to_latex() {
        let converted = markdown_to_latex("# Title\n**bold** and *italic*");
        let fragments = ["\\section{", "\\textbf{bold}", "\\textit{italic}"];
        for &fragment in fragments.iter() {
            assert!(converted.contains(fragment));
        }
    }
}