use futures::future::join_all;
use tracing::{debug, info, warn};
use crate::error::Result;
use crate::index::parse::pdf::PdfPage;
use crate::llm::LlmClient;
use super::assigner::{PageAssigner, PageAssignerConfig};
use super::detector::{TocDetector, TocDetectorConfig};
use super::parser::{TocParser, TocParserConfig};
use super::repairer::{IndexRepairer, RepairerConfig};
use super::structure_extractor::{StructureExtractor, StructureExtractorConfig};
use super::types::{ProcessingMode, TocEntry, VerificationReport};
use super::verifier::{IndexVerifier, VerifierConfig};
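/// Configuration for the TOC processing pipeline: the per-stage configs plus
/// the thresholds that drive repair and refinement of oversized entries.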
#[derive(Debug, Clone)]
pub struct TocProcessorConfig {
pub detector: TocDetectorConfig,
pub parser: TocParserConfig,
pub assigner: PageAssignerConfig,
pub verifier: VerifierConfig,
pub repairer: RepairerConfig,
/// Minimum verification accuracy required before a mode's result is accepted.
pub accuracy_threshold: f32,
/// Maximum number of verify/repair cycles per extraction attempt.
pub max_repair_attempts: usize,
/// Page span above which an entry is considered oversized.
pub max_pages_per_entry: usize,
/// Token count above which an oversized entry is split into sub-entries.
pub max_tokens_per_entry: usize,
}
impl Default for TocProcessorConfig {
fn default() -> Self {
Self {
detector: TocDetectorConfig::default(),
parser: TocParserConfig::default(),
assigner: PageAssignerConfig::default(),
verifier: VerifierConfig::default(),
repairer: RepairerConfig::default(),
accuracy_threshold: 0.6,
max_repair_attempts: 3,
max_pages_per_entry: 30,
max_tokens_per_entry: 20000,
}
}
}
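/// Orchestrates TOC extraction end to end: detection, parsing, page
/// assignment, verification, repair, and refinement of oversized entries,
/// degrading to simpler processing modes when results fall below the
/// accuracy threshold.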
pub struct TocProcessor {
config: TocProcessorConfig,
detector: TocDetector,
parser: TocParser,
assigner: PageAssigner,
verifier: IndexVerifier,
repairer: IndexRepairer,
llm_client: Option<LlmClient>,
}
impl TocProcessor {
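/// Creates a processor with the default configuration and no external LLM client.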
pub fn new() -> Self {
Self::with_config(TocProcessorConfig::default())
}
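/// Creates a processor with the default configuration whose stages all share
/// the given LLM client.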
pub fn with_llm_client(client: LlmClient) -> Self {
info!("TocProcessor: created with external LLM client");
let config = TocProcessorConfig::default();
Self {
detector: TocDetector::with_client(config.detector.clone(), client.clone()),
parser: TocParser::with_client(client.clone()),
assigner: PageAssigner::with_client(client.clone()),
verifier: IndexVerifier::with_client(client.clone()),
repairer: IndexRepairer::with_client(client.clone()),
llm_client: Some(client),
config,
}
}
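/// Creates a processor from an explicit configuration, without an external LLM client.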
pub fn with_config(config: TocProcessorConfig) -> Self {
info!("TocProcessor: created with config (no external LLM client)");
Self {
detector: TocDetector::new(config.detector.clone()),
parser: TocParser::new(config.parser.clone()),
assigner: PageAssigner::new(config.assigner.clone()),
verifier: IndexVerifier::new(config.verifier.clone()),
repairer: IndexRepairer::new(config.repairer.clone()),
llm_client: None,
config,
}
}
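/// Runs the full extraction pipeline over the parsed pages: detects a TOC,
/// picks an initial processing mode, extracts entries with graceful
/// degradation, and finally refines oversized entries.
///
/// A minimal usage sketch (doctest ignored; assumes `pages` was produced by
/// the PDF parser elsewhere):
///
/// ```ignore
/// let processor = TocProcessor::new();
/// let entries = processor.process(&pages).await?;
/// for entry in &entries {
///     println!("{} (level {})", entry.title, entry.level);
/// }
/// ```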
pub async fn process(&self, pages: &[PdfPage]) -> Result<Vec<TocEntry>> {
if pages.is_empty() {
return Ok(Vec::new());
}
info!("Processing {} pages for TOC extraction", pages.len());
let detection = self.detector.detect(pages).await?;
let initial_mode = if !detection.found {
info!("No TOC found in document");
ProcessingMode::NoToc
} else if detection.has_page_numbers {
info!(
"TOC found on pages {:?}, has page numbers",
detection.pages
);
ProcessingMode::TocWithPageNumbers
} else {
info!(
"TOC found on pages {:?}, no page numbers",
detection.pages
);
ProcessingMode::TocWithoutPageNumbers
};
let entries = self
.process_with_degradation(initial_mode, &detection, pages)
.await?;
self.refine_large_entries(entries, pages).await
}
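/// Tries `initial_mode` and degrades to simpler modes whenever an attempt
/// errors, produces no entries, or verifies below the accuracy threshold;
/// once no further degradation is possible, the best-effort result (or the
/// final error) is returned.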
async fn process_with_degradation(
&self,
initial_mode: ProcessingMode,
detection: &super::types::TocDetection,
pages: &[PdfPage],
) -> Result<Vec<TocEntry>> {
let mut mode = initial_mode;
loop {
info!("Attempting extraction with mode {:?}", mode);
let result = match mode {
ProcessingMode::TocWithPageNumbers => {
self.process_toc_with_page_numbers(detection, pages).await
}
ProcessingMode::TocWithoutPageNumbers => {
self.process_toc_without_page_numbers(detection, pages).await
}
ProcessingMode::NoToc => {
return self.process_without_toc(pages).await;
}
};
match result {
Ok(entries) if !entries.is_empty() => {
let mut mutable_entries = entries;
let report = self
.verify_and_repair(&mut mutable_entries, pages)
.await?;
if report.accuracy >= self.config.accuracy_threshold {
info!(
"Mode {:?} succeeded: {} entries, accuracy {:.1}%",
mode,
mutable_entries.len(),
report.accuracy * 100.0
);
return Ok(mutable_entries);
}
warn!(
"Mode {:?} accuracy {:.1}% below threshold {:.1}%",
mode,
report.accuracy * 100.0,
self.config.accuracy_threshold * 100.0
);
match mode.degrade() {
Some(next) => {
info!("Degrading from {:?} to {:?}", mode, next);
mode = next;
}
None => {
warn!("No further degradation possible, returning best effort");
return Ok(mutable_entries);
}
}
}
Ok(_) => {
warn!("Mode {:?} produced no entries", mode);
match mode.degrade() {
Some(next) => {
mode = next;
}
None => return Ok(Vec::new()),
}
}
Err(e) => {
warn!("Mode {:?} failed: {}", mode, e);
match mode.degrade() {
Some(next) => {
mode = next;
}
None => return Err(e),
}
}
}
}
}
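/// Parses the detected TOC text into entries and assigns physical pages,
/// keeping any page numbers the parser read from the TOC.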
async fn process_toc_with_page_numbers(
&self,
detection: &super::types::TocDetection,
pages: &[PdfPage],
) -> Result<Vec<TocEntry>> {
let toc_text = self.extract_toc_text(pages, &detection.pages);
if toc_text.trim().is_empty() {
return Ok(Vec::new());
}
let mut entries = self.parser.parse(&toc_text).await?;
if entries.is_empty() {
return Ok(Vec::new());
}
self.assigner.assign(&mut entries, pages).await?;
Ok(entries)
}
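/// Parses the detected TOC text into entries, clears any page numbers the
/// parser may have guessed, then assigns physical pages from the content.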
async fn process_toc_without_page_numbers(
&self,
detection: &super::types::TocDetection,
pages: &[PdfPage],
) -> Result<Vec<TocEntry>> {
let toc_text = self.extract_toc_text(pages, &detection.pages);
if toc_text.trim().is_empty() {
return Ok(Vec::new());
}
let mut entries = self.parser.parse(&toc_text).await?;
if entries.is_empty() {
return Ok(Vec::new());
}
for entry in &mut entries {
entry.toc_page = None;
}
self.assigner.assign(&mut entries, pages).await?;
Ok(entries)
}
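/// Falls back to extracting document structure directly from page content
/// when no TOC is available.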
async fn process_without_toc(&self, pages: &[PdfPage]) -> Result<Vec<TocEntry>> {
info!("Extracting structure from page content (no TOC available)");
let extractor = match &self.llm_client {
Some(client) => {
StructureExtractor::with_client(StructureExtractorConfig::default(), client.clone())
}
None => StructureExtractor::new(StructureExtractorConfig::default()),
};
extractor.extract(pages).await
}
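/// Concatenates the text of the detected TOC pages; `toc_pages` holds
/// 1-based page numbers.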
fn extract_toc_text(&self, pages: &[PdfPage], toc_pages: &[usize]) -> String {
toc_pages
.iter()
.filter_map(|&page_num| page_num.checked_sub(1).and_then(|idx| pages.get(idx)))
.map(|page| page.text.as_str())
.collect::<Vec<_>>()
.join("\n\n")
}
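/// Alternates verification and repair until the accuracy threshold is met,
/// the verifier reports no errors, a repair pass fixes nothing, or
/// `max_repair_attempts` is exhausted; returns the resulting report.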
async fn verify_and_repair(
&self,
entries: &mut [TocEntry],
pages: &[PdfPage],
) -> Result<VerificationReport> {
let mut attempts = 0;
while attempts < self.config.max_repair_attempts {
let report = self.verifier.verify(entries, pages).await?;
if report.accuracy >= self.config.accuracy_threshold {
debug!(
"Verification passed: accuracy {:.1}%",
report.accuracy * 100.0
);
return Ok(report);
}
if report.errors.is_empty() {
return Ok(report);
}
let repaired = self.repairer.repair(entries, &report.errors, pages).await?;
if repaired == 0 {
debug!("No repairs possible");
return Ok(report);
}
attempts += 1;
debug!("Repair attempt {} complete", attempts);
}
self.verifier.verify(entries, pages).await
}
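/// Replaces entries whose page span and token count both exceed the
/// configured limits with sub-entries extracted from just their pages;
/// entries that cannot be refined are kept unchanged.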
async fn refine_large_entries(
&self,
entries: Vec<TocEntry>,
pages: &[PdfPage],
) -> Result<Vec<TocEntry>> {
if entries.is_empty() {
return Ok(entries);
}
let page_count = pages.len();
let next_pages: Vec<Option<usize>> = entries
.iter()
.enumerate()
.map(|(i, _)| entries.get(i + 1).and_then(|e| e.physical_page))
.collect();
let llm_client = self.llm_client.clone();
let oversized_futures: Vec<_> = entries
.iter()
.enumerate()
.filter(|(i, entry)| {
let span = entry_page_span(entry, next_pages[*i], page_count);
let tokens = entry_token_count(entry, pages, self.config.max_pages_per_entry);
span > self.config.max_pages_per_entry
&& tokens > self.config.max_tokens_per_entry
})
.map(|(i, entry)| {
let start = entry.physical_page.unwrap_or(1);
let end = next_pages[i].unwrap_or(page_count);
let sub_pages: Vec<PdfPage> = pages
.iter()
.filter(|p| p.number >= start && p.number <= end)
.cloned()
.collect();
let entry_title = entry.title.clone();
let entry_level = entry.level;
let llm_client = llm_client.clone();
async move {
if sub_pages.is_empty() {
return (i, Vec::new());
}
debug!(
"Refining oversized entry '{}' (pages {}-{})",
entry_title, start, end
);
let extractor = match &llm_client {
Some(client) => StructureExtractor::with_client(
StructureExtractorConfig::default(),
client.clone(),
),
None => StructureExtractor::new(StructureExtractorConfig::default()),
};
match extractor.extract(&sub_pages).await {
Ok(sub_entries) => {
let skip = if sub_entries
.first()
.map(|e| e.title.trim() == entry_title.trim())
.unwrap_or(false)
{
1
} else {
0
};
let refined: Vec<TocEntry> = sub_entries[skip..]
.iter()
.map(|sub| {
TocEntry::new(&sub.title, sub.level + entry_level)
.with_physical_page(sub.physical_page.unwrap_or(start))
.with_confidence(sub.confidence * 0.9)
})
.collect();
info!(
"Refined '{}' into {} sub-entries",
entry_title,
refined.len()
);
(i, refined)
}
Err(e) => {
warn!("Sub-extraction failed for '{}': {}", entry_title, e);
(i, Vec::new())
}
}
}
})
.collect();
let extraction_results = join_all(oversized_futures).await;
let mut refined_map = std::collections::HashMap::new();
for (idx, sub_entries) in extraction_results {
if !sub_entries.is_empty() {
refined_map.insert(idx, sub_entries);
}
}
let mut result = Vec::with_capacity(entries.len() * 2);
for (i, entry) in entries.into_iter().enumerate() {
if let Some(sub_entries) = refined_map.remove(&i) {
result.extend(sub_entries);
} else {
result.push(entry);
}
}
Ok(result)
}
}
impl Default for TocProcessor {
fn default() -> Self {
Self::new()
}
}
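/// Number of physical pages an entry spans, from its own page to the next
/// entry's page (or the end of the document when it is the last entry).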
fn entry_page_span(entry: &TocEntry, next_physical_page: Option<usize>, total_pages: usize) -> usize {
let start = entry.physical_page.unwrap_or(1);
let end = next_physical_page.unwrap_or(total_pages);
end.saturating_sub(start)
}
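/// Approximate token count for an entry: sums page token counts starting at
/// the entry's physical page, capped at `max_pages` pages.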
fn entry_token_count(entry: &TocEntry, pages: &[PdfPage], max_pages: usize) -> usize {
let start = entry.physical_page.unwrap_or(1);
pages
.iter()
.filter(|p| p.number >= start)
.take(max_pages)
.map(|p| p.token_count)
.sum()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_processor_creation() {
let processor = TocProcessor::new();
assert_eq!(processor.config.accuracy_threshold, 0.6);
}
#[tokio::test]
async fn test_empty_pages() {
let processor = TocProcessor::new();
let entries = processor.process(&[]).await.unwrap();
assert!(entries.is_empty());
}
}