use anyhow::Result;
use scraper::html;
use std::collections::HashMap;
use crate::cleaner;
use crate::config::{PageNumber, ParserConfig};
use crate::converter::pdf2html;
use crate::extracter::{adjst_columns, extract_tables, get_text_area};
use crate::llm;
use crate::models::{Block, Coordinate, Line, Page, PaperOutput, Section};
fn parse_attr<T: std::str::FromStr>(
element: &scraper::ElementRef,
attr: &str,
element_type: &str,
) -> Result<T>
where
T::Err: std::fmt::Display,
{
element
.value()
.attr(attr)
.ok_or_else(|| anyhow::anyhow!("{} element missing '{}' attribute", element_type, attr))?
.parse::<T>()
.map_err(|e| {
anyhow::anyhow!(
"Invalid '{}' attribute in {} element: {}",
attr,
element_type,
e
)
})
}
pub(crate) fn parse_html2pages(config: &mut ParserConfig, html: html::Html) -> Result<Vec<Page>> {
let mut pages = Vec::new();
let section_title_regex = regex::Regex::new(r"^\d+\.?\s*").unwrap();
let whitespace_regex = regex::Regex::new(r"\s+").unwrap();
let page_selector = scraper::Selector::parse("page").unwrap();
let _pages = html.select(&page_selector);
for (_page_number, page) in _pages.enumerate() {
let page_number = (_page_number + 1) as PageNumber;
let page_width: f32 = parse_attr(&page, "width", "page")?;
let page_height: f32 = parse_attr(&page, "height", "page")?;
let mut _page = Page::new(page_width, page_height, page_number);
let fig_path = config.pdf_figures.get(&page_number).ok_or_else(|| {
anyhow::anyhow!(
"No figure path found for page {}. PDF processing may have failed.",
page_number
)
})?;
extract_tables(
fig_path,
&mut _page.tables,
_page.width as i32,
_page.height as i32,
);
let block_selector = scraper::Selector::parse("block").unwrap();
let _blocks = page.select(&block_selector);
for block in _blocks {
let block_xmin: f32 = parse_attr(&block, "xmin", "block")?;
let block_ymin: f32 = parse_attr(&block, "ymin", "block")?;
let block_xmax: f32 = parse_attr(&block, "xmax", "block")?;
let block_ymax: f32 = parse_attr(&block, "ymax", "block")?;
let mut _block = Block::new(
block_xmin,
block_ymin,
block_xmax - block_xmin,
block_ymax - block_ymin,
);
let line_selector = scraper::Selector::parse("line").unwrap();
let _lines = block.select(&line_selector);
'line_iter: for line in _lines {
let line_xmin: f32 = parse_attr(&line, "xmin", "line")?;
let line_ymin: f32 = parse_attr(&line, "ymin", "line")?;
let line_xmax: f32 = parse_attr(&line, "xmax", "line")?;
let line_ymax: f32 = parse_attr(&line, "ymax", "line")?;
let mut _line = Line::new(
line_xmin,
line_ymin,
line_xmax - line_xmin,
line_ymax - line_ymin,
);
let line_text: String = line.text().collect::<String>();
let line_text_normalized =
whitespace_regex.replace_all(line_text.trim(), " ").to_string();
let line_text_stripped =
section_title_regex.replace(&line_text_normalized, "").trim().to_string();
let is_section_title = config.sections.iter().any(|(_, section)| {
line_text_stripped.to_lowercase() == section.to_lowercase()
});
if !is_section_title {
for table in _page.tables.iter() {
let line_coord =
Coordinate::from_object(_line.x, _line.y, _line.width, _line.height);
if line_coord.is_contained_in(&table) {
continue 'line_iter;
}
}
}
let word_selector = scraper::Selector::parse("word").unwrap();
let _words = line.select(&word_selector);
for word in _words {
let word_xmin: f32 = parse_attr(&word, "xmin", "word")?;
let word_ymin: f32 = parse_attr(&word, "ymin", "word")?;
let word_xmax: f32 = parse_attr(&word, "xmax", "word")?;
let word_ymax: f32 = parse_attr(&word, "ymax", "word")?;
let text = word.text().collect::<String>();
_line.add_word(
text.clone(),
word_xmin,
word_ymin,
word_xmax - word_xmin,
word_ymax - word_ymin,
);
}
if _line.get_text().trim().len() > 0 {
_block.lines.push(_line);
}
}
if _block.lines.len() > 0 {
_page.blocks.push(_block);
}
}
if _page.blocks.len() > 0 {
pages.push(_page);
}
}
return Ok(pages);
}
pub(crate) fn parse_extract_textarea(
config: &mut ParserConfig,
pages: &mut Vec<Page>,
) -> Result<()> {
let section_titles =
config.sections.iter().map(|(_, section)| section.to_lowercase()).collect::<Vec<String>>();
let text_area = get_text_area(&pages);
let title_index_regex = regex::Regex::new(r"\d+\.").unwrap();
let full_text_mode = config.sections.is_empty();
if full_text_mode {
tracing::info!("Using full text extraction mode (no sections detected)");
}
for page in pages.iter_mut() {
let mut remove_indices: Vec<usize> = Vec::new();
let width = if page.number_of_columns == 2 {
page.width / 2.2
} else {
page.width / 1.1
};
for (i, block) in page.blocks.iter_mut().enumerate() {
let block_coord = Coordinate::from_object(block.x, block.y, block.width, block.height);
let iou = text_area.iou(&block_coord);
let block_text = block.get_text();
let block_text = title_index_regex.replace(&block_text, "").trim().to_string();
if (iou - 0.0).abs() < 1e-6 {
remove_indices.push(i);
} else if !full_text_mode
&& !section_titles.contains(&block_text.to_lowercase())
&& (block.width / width < 0.3 && block.lines.len() < 4)
{
remove_indices.push(i);
}
}
for i in remove_indices.iter().rev() {
page.blocks.remove(*i);
}
}
return Ok(());
}
pub(crate) fn parse_extract_section_text(
config: &mut ParserConfig,
pages: &mut Vec<Page>,
) -> Result<()> {
if config.sections.is_empty() {
tracing::info!("Full text extraction mode: assigning all blocks to 'Content' section");
for page in pages.iter_mut() {
for block in page.blocks.iter_mut() {
block.section = "Content".to_string();
}
}
return Ok(());
}
let mut current_section = "Abstract".to_string();
if cfg!(test) {
tracing::info!("Initial section: {}", current_section);
}
let title_regex = regex::Regex::new(r"\d+\.").unwrap();
for page in pages.iter_mut() {
let page_number = page.page_number;
for block in page.blocks.iter_mut() {
for line in block.lines.iter_mut() {
let text = line.get_text();
let text = title_regex.replace(&text, "").trim().to_string();
if config.sections.iter().any(|(pg, section)| {
if *pg < 0 {
text.to_lowercase() == section.to_lowercase()
} else {
text.to_lowercase() == section.to_lowercase() && pg == &page_number
}
}) {
current_section = text;
}
block.section = current_section.clone();
}
}
}
return Ok(());
}
pub fn collect_references_text(pages: &[Page]) -> String {
let mut references_text = String::new();
let mut in_references_fallback = false;
for page in pages {
for block in &page.blocks {
let block_text = block.get_text();
let section_lower = block.section.to_lowercase();
if section_lower == "references" || section_lower == "bibliography" {
let text_trimmed = block_text.trim();
if text_trimmed.eq_ignore_ascii_case("references")
|| text_trimmed.eq_ignore_ascii_case("bibliography")
{
continue;
}
references_text.push_str(&block_text);
references_text.push('\n');
continue;
}
if !in_references_fallback {
let text_lower = block_text.to_lowercase().trim().to_string();
if text_lower == "references" || text_lower == "bibliography" {
in_references_fallback = true;
continue;
}
}
if in_references_fallback {
references_text.push_str(&block_text);
references_text.push('\n');
}
}
}
references_text.trim().to_string()
}
pub async fn extract_references(
pages: &[Page],
config: &mut ParserConfig,
verbose: bool,
) -> Result<()> {
let references_text = collect_references_text(pages);
if references_text.is_empty() {
if verbose {
tracing::info!("No References section found, skipping reference extraction");
}
return Ok(());
}
if verbose {
tracing::info!(
"Extracting references from {} characters of text",
references_text.len()
);
}
match llm::extract_references_llm(&references_text).await {
Ok(refs) => {
if verbose {
tracing::info!("Extracted {} references", refs.len());
}
config.references = refs;
}
Err(e) => {
tracing::warn!("Reference extraction failed: {}", e);
}
}
Ok(())
}
pub fn pages2paper_output(pages: &Vec<Page>, config: &ParserConfig) -> PaperOutput {
let sections = Section::from_pages_with_math(pages, &config.math_texts);
PaperOutput {
sections,
references: config.references.clone(),
}
}
pub async fn parse(
path_or_url: &str,
config: &mut ParserConfig,
verbose: bool,
) -> Result<Vec<Page>> {
let time = std::time::Instant::now();
if verbose {
tracing::info!("Parsing PDF: {}", path_or_url);
}
if config.use_llm {
if llm::is_llm_available() {
if verbose {
tracing::info!("LLM processing enabled (OPENAI_API_KEY detected)");
}
} else {
tracing::warn!(
"OPENAI_API_KEY not set. Skipping LLM-enhanced processing. \
Math formulas may not be extracted correctly."
);
config.use_llm = false;
}
}
let html = pdf2html(path_or_url, config, verbose, time).await?;
if verbose {
tracing::info!(
"Converted PDF into HTML in {:.2}s",
time.elapsed().as_secs()
);
}
let mut pages = parse_html2pages(config, html)?;
if verbose {
tracing::info!(
"Parsed HTML into Pages in {:.2}s, found {} pages",
time.elapsed().as_secs(),
pages.len()
);
}
parse_extract_textarea(config, &mut pages)?;
if verbose {
tracing::info!("Extracted Text Area in {:.2}s", time.elapsed().as_secs());
}
adjst_columns(&mut pages, config)?;
if verbose {
tracing::info!("Adjusted Columns in {:.2}s", time.elapsed().as_secs());
}
if config.use_llm && !config.sections.is_empty() {
if verbose {
tracing::info!("Running LLM section validation...");
}
let max_pages = 3.min(config.pdf_figures.len());
let mut first_page_images: Vec<String> = Vec::new();
for pg in 1..=(max_pages as PageNumber) {
if let Some(path) = config.pdf_figures.get(&pg) {
first_page_images.push(path.clone());
}
}
if first_page_images.is_empty() {
let mut all_keys: Vec<PageNumber> = config.pdf_figures.keys().copied().collect();
all_keys.sort();
for key in all_keys.iter().take(3) {
if let Some(path) = config.pdf_figures.get(key) {
first_page_images.push(path.clone());
}
}
}
match llm::validate_sections(&first_page_images).await {
Ok(llm_sections) if !llm_sections.is_empty() => {
let merged = llm::merge_sections(&config.sections, &llm_sections);
if verbose {
tracing::info!(
"LLM section validation: {} font-based → {} merged sections",
config.sections.len(),
merged.len()
);
}
config.sections = merged;
}
Ok(_) => {
if verbose {
tracing::info!("LLM returned empty sections, keeping font-based results");
}
}
Err(e) => {
tracing::warn!(
"LLM section validation failed: {}. Using font-based results.",
e
);
}
}
}
parse_extract_section_text(config, &mut pages)?;
if verbose {
tracing::info!("Extracted Sections in {:.2}s", time.elapsed().as_secs());
}
cleaner::classify_blocks(&mut pages);
if verbose {
let caption_count: usize = pages
.iter()
.flat_map(|p| &p.blocks)
.filter(|b| b.block_type == crate::models::BlockType::Caption)
.count();
tracing::info!(
"Classified blocks in {:.2}s ({} captions detected)",
time.elapsed().as_secs(),
caption_count
);
}
let math_texts = if config.use_llm {
if verbose {
tracing::info!("Running LLM math extraction...");
}
let math_threshold = 0.3f32;
let mut pages_with_math: Vec<(PageNumber, String)> = Vec::new();
for page in &pages {
let page_text = page.get_text();
let density = llm::estimate_math_density(&page_text);
if density >= math_threshold {
if let Some(img_path) = config.pdf_figures.get(&page.page_number) {
pages_with_math.push((page.page_number, img_path.clone()));
if verbose {
tracing::info!(
"Page {} has math density {:.2}, queuing for LLM extraction",
page.page_number,
density
);
}
}
}
}
let mut math_texts: HashMap<(PageNumber, usize), String> = HashMap::new();
if !pages_with_math.is_empty() {
use futures::stream::{self, StreamExt};
let results: Vec<(PageNumber, Result<String>)> = stream::iter(pages_with_math)
.map(|(page_num, image_path)| async move {
let result = llm::extract_page_text_with_math(&image_path, page_num).await;
(page_num, result)
})
.buffer_unordered(5)
.collect()
.await;
for (page_num, result) in results {
match result {
Ok(llm_text) if !llm_text.is_empty() => {
let converted = llm::convert_latex_to_math_tags(&llm_text);
if let Some(page) = pages.iter().find(|p| p.page_number == page_num) {
let aligned = llm::align_llm_text_to_blocks(&converted, &page.blocks);
let aligned_count = aligned.len();
for (block_idx, math_text) in aligned {
math_texts.insert((page_num, block_idx), math_text);
}
if verbose {
tracing::info!(
"LLM aligned {} blocks for page {}",
aligned_count,
page_num
);
}
}
}
Ok(_) => {}
Err(e) => {
tracing::warn!("LLM math extraction failed for page {}: {}", page_num, e);
}
}
}
}
math_texts
} else {
if verbose {
tracing::info!("Using heuristic math markup (LLM not available)...");
}
llm::apply_heuristic_math_markup(&pages)
};
config.math_texts = math_texts;
for value in config.math_texts.values_mut() {
*value = llm::unicode_math_to_latex(value);
}
if verbose {
tracing::info!(
"Math markup complete in {:.2}s ({} blocks with math)",
time.elapsed().as_secs(),
config.math_texts.len()
);
}
if config.extract_references {
if llm::is_llm_available() {
if verbose {
tracing::info!("Extracting references...");
}
extract_references(&pages, config, verbose).await?;
if verbose {
tracing::info!(
"Reference extraction complete in {:.2}s ({} references)",
time.elapsed().as_secs(),
config.references.len()
);
}
} else {
tracing::warn!("Reference extraction requires OPENAI_API_KEY. Skipping.");
}
}
if verbose {
tracing::info!("Finished Parsing in {:.2}s", time.elapsed().as_secs());
}
return Ok(pages);
}
pub fn pages2json(pages: &Vec<Page>) -> String {
let sections = Section::from_pages(pages);
let mut json_data = Vec::<HashMap<&str, String>>::new();
for section in sections.iter() {
let mut data = HashMap::new();
data.insert("title", section.title.clone());
data.insert("contents", section.get_text());
json_data.push(data);
}
let json = serde_json::to_string(&json_data).unwrap();
return json;
}
pub fn pages2json_with_math(pages: &Vec<Page>, config: &crate::config::ParserConfig) -> String {
let sections = Section::from_pages_with_math(pages, &config.math_texts);
serde_json::to_string(§ions).unwrap_or_else(|_| "[]".to_string())
}
pub fn pages2sections(pages: &Vec<Page>, config: &crate::config::ParserConfig) -> Vec<Section> {
Section::from_pages_with_math(pages, &config.math_texts)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::config::ParserConfig;
use crate::models::{Coordinate, Section};
use crate::parser::pages2json;
use crate::parser::parse;
use crate::test_utils::{BuiltinPaper, TestPapers};
#[test_log::test(tokio::test)]
async fn test_parse_extract_sections_1() {
let tp = TestPapers::setup().await.expect("setup test papers");
let paper = tp.get_by_title(BuiltinPaper::AttentionIsAllYouNeed).unwrap();
let mut config = ParserConfig::new();
let mut pages = parse(
paper.dest_path(&tp.tmp_dir).to_str().unwrap(),
&mut config,
true,
)
.await
.unwrap();
match parse_extract_section_text(&mut config, &mut pages) {
Ok(()) => {}
Err(e) => {
tracing::error!("Failed to extract sections: {}", e);
assert!(false, "Failed to extract sections");
}
}
for (idx, section) in config.sections.iter().enumerate() {
tracing::info!("section {}: {}", idx, section.1);
}
assert!(config.sections.len() >= 5);
}
#[test_log::test(tokio::test)]
async fn test_parse_extract_sections_2() {
let tp = TestPapers::setup().await.expect("setup test papers");
let paper = tp.get_by_title(BuiltinPaper::MemAgent).unwrap();
let mut config = ParserConfig::new();
let mut pages = parse(
paper.dest_path(&tp.tmp_dir).to_str().unwrap(),
&mut config,
true,
)
.await
.unwrap();
match parse_extract_section_text(&mut config, &mut pages) {
Ok(()) => {}
Err(e) => {
tracing::error!("Failed to extract sections: {}", e);
assert!(false, "Failed to extract sections");
}
}
for (idx, section) in config.sections.iter().enumerate() {
tracing::info!("section {}: {}", idx, section.1);
}
assert!(config.sections.len() >= 6);
}
#[test_log::test(tokio::test)]
async fn test_parse_extract_sections_3() {
let tp = TestPapers::setup().await.expect("setup test papers");
let paper = tp.get_by_title(BuiltinPaper::UnsupervisedDialoguePolicies).unwrap();
let mut config = ParserConfig::new();
let mut pages = parse(
paper.dest_path(&tp.tmp_dir).to_str().unwrap(),
&mut config,
true,
)
.await
.unwrap();
match parse_extract_section_text(&mut config, &mut pages) {
Ok(()) => {}
Err(e) => {
tracing::error!("Failed to extract sections: {}", e);
assert!(false, "Failed to extract sections");
}
}
for (idx, section) in config.sections.iter().enumerate() {
tracing::info!("section {}: {}", idx, section.1);
}
assert!(config.sections.len() >= 9);
}
#[test_log::test(tokio::test)]
async fn test_parse_1() {
let mut config = ParserConfig::new();
let url = "https://arxiv.org/pdf/2308.10379";
let res = parse(url, &mut config, true).await;
let pages = res.unwrap();
assert!(pages.len() > 0, "No pages found");
for page in pages {
tracing::info!(
"page: {}: ({}, {})",
page.page_number,
page.width,
page.height
);
for block in &page.blocks {
let block_coord =
Coordinate::from_object(block.x, block.y, block.width, block.height);
tracing::info!(
" {} [({},{})x({},{})]:{}",
block.section,
block_coord.top_left.x as i32,
block_coord.top_left.y as i32,
block_coord.bottom_right.x as i32,
block_coord.bottom_right.y as i32,
block.get_text()
);
}
}
let _ = config.clean_files();
}
#[test_log::test(tokio::test)]
async fn test_parse_2() {
let mut config = ParserConfig::new();
let url = "https://arxiv.org/pdf/1706.03762";
let res = parse(url, &mut config, true).await;
let pages = res.unwrap();
assert!(pages.len() > 0);
for page in pages {
tracing::info!(
"page: {}: ({}, {})",
page.page_number,
page.width,
page.height
);
for block in &page.blocks {
let block_coord =
Coordinate::from_object(block.x, block.y, block.width, block.height);
tracing::info!(
" {} [({},{})x({},{})]:{}",
block.section,
block_coord.top_left.x as i32,
block_coord.top_left.y as i32,
block_coord.bottom_right.x as i32,
block_coord.bottom_right.y as i32,
block.get_text()
);
}
}
let _ = config.clean_files();
}
#[test_log::test(tokio::test)]
async fn test_parse_local_sample() {
let tp = TestPapers::setup().await.expect("setup test papers");
let paper = &tp.papers[0];
let path = paper.dest_path(&tp.tmp_dir);
let mut config = ParserConfig::new();
let pages =
parse(path.to_str().unwrap(), &mut config, true).await.expect("parse local sample");
assert!(pages.len() > 0, "No pages found");
let _ = config.clean_files();
let _ = tp.cleanup();
}
#[test_log::test(tokio::test)]
async fn test_pdf_to_json() {
let tp = TestPapers::setup().await.expect("setup test papers");
for built_in_paper in BuiltinPaper::ALL.iter() {
if matches!(built_in_paper, BuiltinPaper::ZepTemporalKnowledgeGraph) {
continue;
}
tracing::info!("Testing paper: {}", built_in_paper);
let paper = tp.get_by_title(*built_in_paper).expect("paper not found");
let mut config = ParserConfig::new();
let filepath = paper.dest_path(&tp.tmp_dir);
assert!(filepath.exists(), "file not found: {}", filepath.display());
let pages = match parse(filepath.to_str().unwrap(), &mut config, true).await {
Ok(pages) => pages,
Err(e) => {
tracing::error!("Failed to parse {}: {}", paper.filename, e);
continue;
}
};
let sections = Section::from_pages(&pages);
assert!(sections.len() > 2, "At least 3 sections are expected");
for section in sections.iter() {
assert!(section.title.len() > 0);
assert!(section.contents.len() > 0);
tracing::info!("{}", section.title);
}
let json = serde_json::to_string(§ions).unwrap();
assert!(json.len() > 0);
let json = pages2json(&pages);
assert!(json.len() > 0);
let _ = config.clean_files();
}
let _ = tp.cleanup();
}
#[test_log::test(tokio::test)]
async fn test_parse_zep_non_standard_format() {
let tp = TestPapers::setup().await.expect("setup papers");
let paper =
tp.get_by_title(BuiltinPaper::ZepTemporalKnowledgeGraph).expect("Zep paper not found");
let filepath = paper.dest_path(&tp.tmp_dir);
assert!(filepath.exists(), "file not found: {}", filepath.display());
let mut config = ParserConfig::new();
let result = parse(filepath.to_str().unwrap(), &mut config, true).await;
let pages =
result.expect("Zep paper should parse successfully");
tracing::info!("Zep paper parsed successfully with {} pages", pages.len());
assert!(pages.len() > 0, "Should have at least one page");
assert!(
!config.sections.is_empty(),
"Zep paper should have sections detected via fallback"
);
let section_names: Vec<&str> = config.sections.iter().map(|(_, s)| s.as_str()).collect();
tracing::info!("Detected sections: {:?}", section_names);
assert!(
section_names.iter().any(|s| s.to_lowercase() == "introduction"),
"Should detect Introduction section, got: {:?}", section_names
);
let mut total_blocks = 0;
for page in &pages {
for block in &page.blocks {
total_blocks += 1;
assert!(
!block.section.is_empty(),
"All blocks should have a section assigned"
);
}
}
assert!(total_blocks > 0, "Should have at least one block");
tracing::info!("Total blocks with sections: {}", total_blocks);
let sections = Section::from_pages(&pages);
assert!(sections.len() >= 1, "Should have at least one section");
let total_content: usize = sections.iter().map(|s| s.contents.len()).sum();
assert!(total_content > 0, "Sections should have content");
tracing::info!(
"Produced {} sections with {} total content blocks",
sections.len(),
total_content
);
let _ = config.clean_files();
let _ = tp.cleanup();
}
#[test_log::test(tokio::test)]
async fn test_parse_long_document() {
let tp = TestPapers::setup().await.expect("setup test papers");
let paper = tp
.get_by_title(BuiltinPaper::ImageGenerationSafety)
.expect("ImageGenerationSafety paper not found");
let filepath = paper.dest_path(&tp.tmp_dir);
assert!(filepath.exists(), "file not found: {}", filepath.display());
let mut config = ParserConfig::new();
let result = parse(filepath.to_str().unwrap(), &mut config, true).await;
let pages = result.expect("Long document should parse successfully");
tracing::info!("Long document parsed: {} pages", pages.len());
assert!(pages.len() > 0, "Should have at least one page");
let section_names: Vec<String> =
config.sections.iter().map(|(_, s)| s.to_lowercase()).collect();
tracing::info!("Detected sections: {:?}", config.sections);
assert!(
section_names.iter().any(|s| s.contains("introduction")),
"Expected 'Introduction' section in {:?}",
section_names
);
let sections = Section::from_pages(&pages);
assert!(sections.len() >= 1, "Should have at least 1 section");
for section in §ions {
tracing::info!(
"Section: {} ({} contents)",
section.title,
section.contents.len()
);
}
let _ = config.clean_files();
let _ = tp.cleanup();
}
#[test_log::test(tokio::test)]
async fn test_parse_llm_disabled_fallback() {
let tp = TestPapers::setup().await.expect("setup test papers");
let paper = tp.get_by_title(BuiltinPaper::AttentionIsAllYouNeed).unwrap();
let filepath = paper.dest_path(&tp.tmp_dir);
let mut config = ParserConfig::new();
config.use_llm = true;
let result = parse(filepath.to_str().unwrap(), &mut config, true).await;
let pages = result.expect("Parse should succeed even without LLM");
assert!(pages.len() > 0, "Should have pages");
if !llm::is_llm_available() {
assert!(
!config.use_llm,
"use_llm should be false when API key is not set"
);
}
let sections = Section::from_pages(&pages);
assert!(sections.len() >= 1, "Should produce at least 1 section");
let _ = config.clean_files();
let _ = tp.cleanup();
}
#[test]
fn test_math_density_estimation() {
let normal =
"This is a normal paragraph about machine learning and natural language processing.";
assert!(
llm::estimate_math_density(normal) < 0.1,
"Normal text should have low math density"
);
let math_like = "f ( x ) = a b c d e f g h i j k";
assert!(
llm::estimate_math_density(math_like) > 0.0,
"Math-like text should have some density"
);
assert_eq!(
llm::estimate_math_density(""),
0.0,
"Empty text should have 0 density"
);
}
#[test]
fn test_merge_sections() {
let font_based: Vec<(PageNumber, String)> = vec![
(1, "Abstract".to_string()),
(1, "Introduction".to_string()),
(3, "Method".to_string()),
(5, "Conclusion".to_string()),
(6, "References".to_string()),
];
let llm_sections = vec![
"Abstract".to_string(),
"Introduction".to_string(),
"Related Work".to_string(), "Method".to_string(),
"Conclusion".to_string(),
"References".to_string(),
"Appendix".to_string(), ];
let merged = llm::merge_sections(&font_based, &llm_sections);
assert_eq!(merged.len(), llm_sections.len());
let related = merged.iter().find(|(_, s)| s == "Related Work").unwrap();
assert_eq!(related.0, -1, "LLM-only section should have page=-1");
let appendix = merged.iter().find(|(_, s)| s == "Appendix").unwrap();
assert_eq!(appendix.0, -1, "LLM-only section should have page=-1");
let intro = merged.iter().find(|(_, s)| s == "Introduction").unwrap();
assert_eq!(intro.0, 1, "Font-based section should keep page number");
}
#[test_log::test(tokio::test)]
async fn test_llm_section_validation_conditional() {
if !llm::is_llm_available() {
tracing::info!("Skipping LLM test - OPENAI_API_KEY not set");
return;
}
let tp = TestPapers::setup().await.expect("setup test papers");
let paper = tp.get_by_title(BuiltinPaper::AttentionIsAllYouNeed).unwrap();
let filepath = paper.dest_path(&tp.tmp_dir);
let mut config = ParserConfig::new();
config.use_llm = true;
let pages = parse(filepath.to_str().unwrap(), &mut config, true)
.await
.expect("Parse with LLM should succeed");
assert!(pages.len() > 0);
let sections = Section::from_pages(&pages);
let section_titles: Vec<&str> = sections.iter().map(|s| s.title.as_str()).collect();
tracing::info!("LLM-validated sections: {:?}", section_titles);
assert!(
section_titles.iter().any(|t| t.to_lowercase().contains("introduction")),
"Introduction should be present"
);
let _ = config.clean_files();
let _ = tp.cleanup();
}
#[test_log::test(tokio::test)]
async fn test_collect_references_text() {
let tp = TestPapers::setup().await.expect("setup test papers");
let paper = tp.get_by_title(BuiltinPaper::AttentionIsAllYouNeed).unwrap();
let filepath = paper.dest_path(&tp.tmp_dir);
let mut config = ParserConfig::new();
let pages = parse(filepath.to_str().unwrap(), &mut config, false)
.await
.expect("Parse should succeed");
let mut section_counts: std::collections::HashMap<String, usize> =
std::collections::HashMap::new();
for page in &pages {
for block in &page.blocks {
*section_counts.entry(block.section.clone()).or_insert(0) += 1;
}
}
for (section, count) in §ion_counts {
tracing::info!("Section '{}': {} blocks", section, count);
}
let refs_text = collect_references_text(&pages);
tracing::info!("Collected references text: {} characters", refs_text.len());
tracing::info!(
"References text preview: {:?}",
refs_text.chars().take(200).collect::<String>()
);
let _ = config.clean_files();
let _ = tp.cleanup();
}
#[test_log::test(tokio::test)]
async fn test_paper_output_format() {
let tp = TestPapers::setup().await.expect("setup test papers");
let paper = tp.get_by_title(BuiltinPaper::AttentionIsAllYouNeed).unwrap();
let filepath = paper.dest_path(&tp.tmp_dir);
let mut config = ParserConfig::new();
let pages = parse(filepath.to_str().unwrap(), &mut config, false)
.await
.expect("Parse should succeed");
let output = pages2paper_output(&pages, &config);
assert!(!output.sections.is_empty(), "Should have sections");
assert!(
output.references.is_empty(),
"References should be empty without extract_references flag"
);
let json = serde_json::to_string_pretty(&output).unwrap();
assert!(json.contains("\"sections\""));
assert!(!json.contains("\"references\""));
let _ = config.clean_files();
let _ = tp.cleanup();
}
#[test_log::test(tokio::test)]
async fn test_extract_references_conditional() {
if !llm::is_llm_available() {
tracing::info!("Skipping reference extraction test - OPENAI_API_KEY not set");
return;
}
let tp = TestPapers::setup().await.expect("setup test papers");
let paper = tp.get_by_title(BuiltinPaper::AttentionIsAllYouNeed).unwrap();
let filepath = paper.dest_path(&tp.tmp_dir);
let mut config = ParserConfig::new();
config.use_llm = true;
config.extract_references = true;
let pages = parse(filepath.to_str().unwrap(), &mut config, true)
.await
.expect("Parse with reference extraction should succeed");
assert!(!pages.is_empty());
tracing::info!("Extracted {} references", config.references.len());
assert!(
config.references.len() > 5,
"Should extract multiple references"
);
if let Some(first_ref) = config.references.first() {
tracing::info!("First reference: {:?}", first_ref);
assert!(
first_ref.title.is_some() || first_ref.authors.is_some(),
"Reference should have title or authors"
);
}
let output = pages2paper_output(&pages, &config);
assert!(
!output.references.is_empty(),
"PaperOutput should have references"
);
let json = serde_json::to_string_pretty(&output).unwrap();
assert!(json.contains("\"references\""));
tracing::info!("PaperOutput JSON has {} bytes", json.len());
let _ = config.clean_files();
let _ = tp.cleanup();
}
#[test_log::test(tokio::test)]
async fn test_math_markup_heuristic_corpus() {
let tp = TestPapers::setup().await.expect("setup test papers");
let paper = tp.get_by_title(BuiltinPaper::AttentionIsAllYouNeed).unwrap();
let filepath = paper.dest_path(&tp.tmp_dir);
let mut config = ParserConfig::new();
config.use_llm = false;
let pages = parse(filepath.to_str().unwrap(), &mut config, false)
.await
.expect("Parse should succeed");
assert!(!pages.is_empty());
let sections = Section::from_pages_with_math(&pages, &config.math_texts);
let has_math = sections.iter().any(|s| s.math_contents.is_some());
tracing::info!(
"Heuristic math markup: {} math_texts entries, has_math_contents: {}",
config.math_texts.len(),
has_math
);
assert!(
config.math_texts.len() > 0,
"Math-heavy paper should have math_texts entries from heuristic markup"
);
let _ = config.clean_files();
let _ = tp.cleanup();
}
#[test_log::test(tokio::test)]
async fn test_math_markup_no_math_flag() {
let tp = TestPapers::setup().await.expect("setup test papers");
let paper = tp.get_by_title(BuiltinPaper::AttentionIsAllYouNeed).unwrap();
let filepath = paper.dest_path(&tp.tmp_dir);
let mut config = ParserConfig::new();
config.use_llm = false;
let pages = parse(filepath.to_str().unwrap(), &mut config, false)
.await
.expect("Parse should succeed");
config.math_texts.clear();
let sections = Section::from_pages_with_math(&pages, &config.math_texts);
for section in §ions {
assert!(
section.math_contents.is_none(),
"Section '{}' should have no math_contents when math_texts is empty",
section.title
);
}
let _ = config.clean_files();
let _ = tp.cleanup();
}
#[test_log::test(tokio::test)]
async fn test_math_markup_latex_format() {
let tp = TestPapers::setup().await.expect("setup test papers");
let paper = tp.get_by_title(BuiltinPaper::AttentionIsAllYouNeed).unwrap();
let filepath = paper.dest_path(&tp.tmp_dir);
let mut config = ParserConfig::new();
config.use_llm = false;
let _pages = parse(filepath.to_str().unwrap(), &mut config, false)
.await
.expect("Parse should succeed");
for ((page, block_idx), text) in &config.math_texts {
if text.contains("<math>") {
let math_tag_re = regex::Regex::new(r"<math[^>]*>(.*?)</math>").unwrap();
for cap in math_tag_re.captures_iter(text) {
let math_content = &cap[1];
let has_unicode_greek = math_content.chars().any(|c| {
('\u{03B1}'..='\u{03C9}').contains(&c)
|| ('\u{0391}'..='\u{03A9}').contains(&c)
});
if has_unicode_greek {
tracing::warn!(
"Page {} block {}: math content still has Unicode Greek: {}",
page,
block_idx,
math_content
);
}
}
}
}
tracing::info!(
"Checked {} math_texts entries for LaTeX format",
config.math_texts.len()
);
let _ = config.clean_files();
let _ = tp.cleanup();
}
}