use tracing::debug;
use crate::config::LlmConfig;
use crate::error::Result;
use super::types::TocEntry;
use crate::llm::LlmClient;
/// Settings that control how [`TocParser`] drives the LLM.
#[derive(Debug, Clone)]
pub struct TocParserConfig {
/// LLM configuration; `TocParser::new` clones it and converts it with
/// `into()` to build the argument for `LlmClient::new`.
pub llm_config: LlmConfig,
/// Maximum number of check-and-continue rounds performed by the
/// completeness verification loop.
pub max_retries: usize,
/// When `true`, `TocParser::parse` follows a non-empty initial parse with
/// the completeness verification pass.
pub verify_completeness: bool,
}
impl Default for TocParserConfig {
fn default() -> Self {
Self {
llm_config: LlmConfig::default(),
max_retries: 3,
verify_completeness: true,
}
}
}
/// Turns raw table-of-contents text into [`TocEntry`] values by prompting an
/// LLM through an [`LlmClient`].
pub struct TocParser {
/// Parser settings (retry budget and whether to verify completeness).
config: TocParserConfig,
/// Client used for every `complete_json` request issued by the parser.
client: LlmClient,
}
impl TocParser {
/// Builds a parser from `config`.
///
/// The client is created from a clone of `config.llm_config`, converted with
/// `into()` to whatever `LlmClient::new` accepts; `config` itself is stored
/// unchanged.
pub fn new(config: TocParserConfig) -> Self {
    Self {
        // Evaluated first, while `config` is still only borrowed.
        client: LlmClient::new(config.llm_config.clone().into()),
        config,
    }
}
/// Wraps an already constructed `client`, pairing it with
/// `TocParserConfig::default()`.
pub fn with_client(client: LlmClient) -> Self {
    let config = TocParserConfig::default();
    Self { config, client }
}
/// Builds a parser (and its client) from `TocParserConfig::default()`.
pub fn with_defaults() -> Self {
    let config = TocParserConfig::default();
    Self::new(config)
}
/// Parses `toc_text` into a flat list of TOC entries.
///
/// Whitespace-only input yields an empty list without issuing any request.
/// Otherwise the text is parsed once; if that produced entries and
/// `verify_completeness` is enabled, the result is handed to the
/// verification pass, which may append further entries.
///
/// # Errors
///
/// Propagates any error returned by the underlying parsing and
/// verification steps.
pub async fn parse(&self, toc_text: &str) -> Result<Vec<TocEntry>> {
    if toc_text.trim().is_empty() {
        return Ok(Vec::new());
    }

    let parsed = self.parse_with_llm(toc_text).await?;
    debug!("Initial parse: {} entries", parsed.len());

    // Nothing to verify when the first pass came back empty or verification is off.
    if parsed.is_empty() || !self.config.verify_completeness {
        return Ok(parsed);
    }
    self.verify_and_complete(toc_text, parsed).await
}
/// Runs the first extraction pass over `toc_text`.
///
/// The model is asked for a JSON array of `{title, level, page}` objects.
/// Each element becomes a [`TocEntry`] built from its title and level, with
/// the TOC page attached only when a page number was returned.
///
/// # Errors
///
/// Propagates any error from `LlmClient::complete_json`.
async fn parse_with_llm(&self, toc_text: &str) -> Result<Vec<TocEntry>> {
    // One element of the JSON array requested from the model.
    #[derive(serde::Deserialize)]
    struct RawEntry {
        title: String,
        level: usize,
        #[serde(default)]
        page: Option<usize>,
    }

    const SYSTEM_PROMPT: &str = r#"You are a document structure extraction expert.
Your task is to parse a Table of Contents (TOC) into a structured format.
Rules:
1. Extract all sections and subsections
2. Determine the hierarchy level (1 = top level, 2 = subsection, etc.)
3. Extract page numbers if present
4. Preserve original titles exactly (only fix spacing issues)
5. If the TOC seems incomplete, extract what you can see"#;

    let prompt = format!(
        r#"Parse this Table of Contents:
{}
Return a JSON array:
[
{{
"title": "Section Title",
"level": 1,
"page": 10
}},
...
]
Notes:
- "level" should reflect the hierarchy (1, 2, 3...)
- "page" is optional if not present in TOC
- Only output the JSON array, no other text"#,
        toc_text
    );

    let raw: Vec<RawEntry> = self.client.complete_json(SYSTEM_PROMPT, &prompt).await?;

    let mut entries = Vec::with_capacity(raw.len());
    for item in raw {
        let entry = TocEntry::new(item.title, item.level);
        // A page is attached only when the model supplied one.
        entries.push(match item.page {
            Some(page) => entry.with_toc_page(page),
            None => entry,
        });
    }
    Ok(entries)
}
/// Repeatedly checks `entries` against `toc_text` and asks for missing
/// entries until the model reports the list complete.
///
/// At most `max_retries` rounds are run. Each round first asks whether the
/// current entries are complete; if not, it requests the remaining entries
/// and appends them. The loop also stops early when a continuation round
/// returns nothing. Whatever has been collected so far is returned.
///
/// # Errors
///
/// Propagates any error from the completeness check or the continuation
/// request.
async fn verify_and_complete(
    &self,
    toc_text: &str,
    mut entries: Vec<TocEntry>,
) -> Result<Vec<TocEntry>> {
    // `round` is 1-based so it can be logged directly.
    for round in 1..=self.config.max_retries {
        if self.check_completeness(toc_text, &entries).await? {
            debug!("TOC parsing complete after {} attempts", round);
            return Ok(entries);
        }

        debug!(
            "TOC incomplete, attempting continuation (attempt {})",
            round
        );

        let extra = self.continue_parsing(toc_text, &entries).await?;
        if extra.is_empty() {
            // The model had nothing to add; further rounds would not help.
            break;
        }
        entries.extend(extra);
    }
    Ok(entries)
}
/// Asks the model whether `entries` fully cover `toc_text`.
///
/// Only the entry titles are sent, as a pretty-printed JSON array, next to
/// the original TOC text. If serializing the titles fails, an empty string
/// is sent in their place.
///
/// Returns the `complete` flag from the model's JSON reply.
///
/// # Errors
///
/// Propagates any error from `LlmClient::complete_json`.
async fn check_completeness(&self, toc_text: &str, entries: &[TocEntry]) -> Result<bool> {
    // Shape of the JSON reply requested from the model.
    #[derive(serde::Deserialize)]
    struct Verdict {
        complete: bool,
    }

    let system = "You are a document analysis assistant. Determine if the parsed entries completely represent the original TOC.";

    let titles: Vec<_> = entries.iter().map(|entry| &entry.title).collect();
    let titles_json = serde_json::to_string_pretty(&titles).unwrap_or_default();

    let prompt = format!(
        r#"Original TOC:
{}
Parsed entries:
{}
Is the parsing complete? Reply with JSON:
{{"complete": true/false}}"#,
        toc_text, titles_json
    );

    let verdict: Verdict = self.client.complete_json(system, &prompt).await?;
    Ok(verdict.complete)
}
/// Asks the model for entries that earlier passes missed.
///
/// The prompt contains the full `toc_text` plus the titles of the last five
/// entries of `existing` (fewer if `existing` is shorter). The titles are
/// listed in their original order, so the final title shown is the most
/// recently parsed one and marks where parsing stopped.
///
/// Returns the additional entries. The prompt instructs the model to reply
/// with an empty array when nothing was missed, which yields an empty vector.
///
/// # Errors
///
/// Propagates any error from `LlmClient::complete_json`.
async fn continue_parsing(
    &self,
    toc_text: &str,
    existing: &[TocEntry],
) -> Result<Vec<TocEntry>> {
    // One element of the JSON array requested from the model.
    #[derive(serde::Deserialize)]
    struct ParsedEntry {
        title: String,
        level: usize,
        #[serde(default)]
        page: Option<usize>,
    }

    // Number of trailing titles shown to the model; the prompt text below says "last 5".
    const CONTEXT_TITLES: usize = 5;

    let system = "You are a document structure extraction expert. Continue parsing the TOC from where it was left off.";

    // Take the tail as a slice so the titles stay in document order.
    // Iterating with `rev().take(5)` would list them newest-first and hide
    // the real stopping point from the model.
    let tail_start = existing.len().saturating_sub(CONTEXT_TITLES);
    let last_titles: Vec<_> = existing[tail_start..].iter().map(|e| &e.title).collect();

    let user = format!(
        r#"Original TOC:
{}
Already parsed (last 5):
{:?}
Extract the REMAINING entries that were missed. Return a JSON array:
[
{{"title": "...", "level": N, "page": M}},
...
]
If nothing was missed, return an empty array: []"#,
        toc_text, last_titles
    );

    let entries: Vec<ParsedEntry> = self.client.complete_json(system, &user).await?;

    Ok(entries
        .into_iter()
        .map(|e| {
            let entry = TocEntry::new(e.title, e.level);
            // A page is attached only when the model supplied one.
            match e.page {
                Some(page) => entry.with_toc_page(page),
                None => entry,
            }
        })
        .collect())
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Live smoke test: parses a tiny TOC through the real LLM backend and
    /// checks that at least one entry comes back. Skipped when
    /// `OPENAI_API_KEY` is not set.
    #[tokio::test]
    async fn test_parse_simple_toc() {
        // Decide whether to skip before building anything, so an environment
        // without a key never constructs an LLM client.
        if std::env::var("OPENAI_API_KEY").is_err() {
            eprintln!("skipping test_parse_simple_toc: OPENAI_API_KEY is not set");
            return;
        }

        let parser = TocParser::with_defaults();
        let toc_text = r#"
Chapter 1. Introduction 1
1.1 Background 2
1.2 Objectives 5
Chapter 2. Methods 10
"#;
        let entries = parser.parse(toc_text).await.unwrap();
        assert!(!entries.is_empty());
    }
}