use anyhow::Result;
use rand::Rng;
use std::collections::HashMap;
use std::path::Path;
use crate::models::Reference;
/// Integer type used to identify a page of the PDF.
///
/// It is used as a key (or key component) for the per-page collections in
/// `ParserConfig`: `pdf_figures`, `sections` and `math_texts`.
// NOTE(review): whether page numbers are 0- or 1-based is not visible here — confirm against callers.
pub type PageNumber = i16;
/// State and file locations shared by the PDF parsing steps.
///
/// Holds the paths of the working files (the PDF itself plus derived text and
/// XML files), per-page data gathered while parsing, and flags that switch
/// optional processing on or off.
#[derive(Debug, Clone, PartialEq)]
pub struct ParserConfig {
/// Path of the PDF file being processed.
pub pdf_path: String,
/// Path of the derived text file; `ParserConfig::new` gives it a `.text.html` suffix.
pub pdf_text_path: String,
/// Figure file paths keyed by page number; `clean_files` deletes these files.
pub pdf_figures: HashMap<PageNumber, String>,
/// Path of the derived XML file; `ParserConfig::new` gives it a `.xml` suffix.
pub pdf_xml_path: String,
/// Sections as `(page number, text)` pairs.
// NOTE(review): the String is presumably the section title — confirm against the code that fills it.
pub sections: Vec<(PageNumber, String)>,
/// String key/value metadata about the PDF. `pdf_width` and `pdf_height`
/// read the `"page_width"` and `"page_height"` entries and parse them as `i32`.
// NOTE(review): the error messages suggest this map is filled from `pdfinfo` output — confirm.
pub pdf_info: HashMap<String, String>,
/// Flag for LLM usage; `ParserConfig::new` sets it to `false`.
// NOTE(review): what exactly the LLM is used for is not visible in this file section.
pub use_llm: bool,
/// Math text keyed by `(page number, index)`.
// NOTE(review): the meaning of the `usize` component (line? block? token index?) is not visible here — confirm.
pub math_texts: HashMap<(PageNumber, usize), String>,
/// Flag for reference extraction; `ParserConfig::new` sets it to `false`.
pub extract_references: bool,
/// Collected references; empty after `ParserConfig::new`.
pub references: Vec<Reference>,
}
impl ParserConfig {
    /// Creates a configuration whose working files live under `/tmp`.
    ///
    /// A random number in `10000..99999` (upper bound excluded) is used as a
    /// suffix, producing three sibling paths:
    ///
    /// * `/tmp/pdf_<n>.pdf`       -> `pdf_path`
    /// * `/tmp/pdf_<n>.text.html` -> `pdf_text_path`
    /// * `/tmp/pdf_<n>.xml`       -> `pdf_xml_path`
    ///
    /// No file is created here; only the path strings are computed. The random
    /// suffix is not checked for collisions with existing files. Every
    /// collection starts empty and both feature flags start as `false`.
    pub fn new() -> ParserConfig {
        let random_value = rand::rng().random_range(10000..99999);
        // All three working files share the same stem and differ only in their suffix.
        let stem = format!("/tmp/pdf_{}", random_value);

        ParserConfig {
            pdf_path: format!("{}.pdf", stem),
            pdf_text_path: format!("{}.text.html", stem),
            pdf_figures: HashMap::new(),
            pdf_xml_path: format!("{}.xml", stem),
            sections: Vec::new(),
            pdf_info: HashMap::new(),
            use_llm: false,
            math_texts: HashMap::new(),
            extract_references: false,
            references: Vec::new(),
        }
    }

    /// Looks up `key` in `pdf_info` and parses the value as an `i32`.
    ///
    /// `label` is only used to word the "not available" error message.
    fn page_dimension(&self, key: &str, label: &str) -> anyhow::Result<i32> {
        self.pdf_info
            .get(key)
            .ok_or_else(|| {
                anyhow::anyhow!("PDF {} not available - pdfinfo may have failed", label)
            })?
            .parse::<i32>()
            .map_err(|e| anyhow::anyhow!("Invalid {} value: {}", key, e))
    }

    /// Returns the page width stored under the `"page_width"` key of `pdf_info`.
    ///
    /// # Errors
    ///
    /// Fails when the key is missing or its value does not parse as an `i32`.
    pub fn pdf_width(&self) -> anyhow::Result<i32> {
        self.page_dimension("page_width", "width")
    }

    /// Returns the page height stored under the `"page_height"` key of `pdf_info`.
    ///
    /// # Errors
    ///
    /// Fails when the key is missing or its value does not parse as an `i32`.
    pub fn pdf_height(&self) -> anyhow::Result<i32> {
        self.page_dimension("page_height", "height")
    }

    /// Deletes the working files referenced by this configuration: the PDF,
    /// the text file, the XML file and every figure in `pdf_figures`.
    ///
    /// Files that do not exist are skipped silently. Every path is attempted
    /// even if an earlier removal fails, so one undeletable file does not
    /// leave the others behind.
    ///
    /// # Errors
    ///
    /// Returns the first I/O error encountered, other than "not found".
    pub fn clean_files(&self) -> Result<()> {
        /// Removes `path`, treating an already-missing file as success.
        ///
        /// Removing directly (instead of checking `exists()` first) avoids a
        /// race where the file disappears between the check and the removal.
        fn remove_if_present(path: &Path) -> std::io::Result<()> {
            match std::fs::remove_file(path) {
                Err(err) if err.kind() != std::io::ErrorKind::NotFound => Err(err),
                _ => Ok(()),
            }
        }

        let fixed_paths = [&self.pdf_path, &self.pdf_text_path, &self.pdf_xml_path];
        let mut first_error: Option<std::io::Error> = None;

        for path in fixed_paths.iter().copied().chain(self.pdf_figures.values()) {
            if let Err(err) = remove_if_present(Path::new(path)) {
                // Keep going so the remaining files are still cleaned up;
                // only the first failure is reported to the caller.
                if first_error.is_none() {
                    first_error = Some(err);
                }
            }
        }

        match first_error {
            Some(err) => Err(err.into()),
            None => Ok(()),
        }
    }
}