use std::collections::HashMap;
use futures::stream::{self, StreamExt};
use tracing::{debug, info};
use crate::config::LlmConfig;
use crate::error::Result;
use crate::index::parse::pdf::PdfPage;
use super::types::{PageOffset, TocEntry};
use crate::llm::LlmClient;
/// Configuration for [`PageAssigner`].
#[derive(Debug, Clone)]
pub struct PageAssignerConfig {
    /// Number of TOC entries sampled as anchors when estimating the
    /// toc-page -> physical-page offset.
    pub anchor_count: usize,
    /// LLM settings used to construct the internal client.
    pub llm_config: LlmConfig,
    /// Declared tolerance for offset disagreement between anchors.
    /// NOTE(review): not read anywhere in this file — confirm whether it
    /// should gate `calculate_offset` or can be removed.
    pub max_offset_variance: usize,
}
impl Default for PageAssignerConfig {
fn default() -> Self {
Self {
anchor_count: 5,
llm_config: LlmConfig::default(),
max_offset_variance: 3,
}
}
}
/// Assigns physical page numbers to TOC entries, either by estimating a
/// constant toc->physical page offset from sampled anchors or by asking an
/// LLM to locate each title directly.
pub struct PageAssigner {
    /// Tunables; see [`PageAssignerConfig`].
    config: PageAssignerConfig,
    /// LLM client used for title localisation.
    client: LlmClient,
}
impl PageAssigner {
pub fn new(config: PageAssignerConfig) -> Self {
let client = LlmClient::new(config.llm_config.clone().into());
Self { config, client }
}
pub fn with_client(client: LlmClient) -> Self {
Self {
config: PageAssignerConfig::default(),
client,
}
}
pub fn with_defaults() -> Self {
Self::new(PageAssignerConfig::default())
}
pub async fn assign(&self, entries: &mut [TocEntry], pages: &[PdfPage]) -> Result<()> {
if entries.is_empty() {
return Ok(());
}
let has_toc_pages = entries.iter().any(|e| e.toc_page.is_some());
if has_toc_pages {
self.assign_with_offset(entries, pages).await
} else {
self.assign_with_llm(entries, pages).await
}
}
async fn assign_with_offset(&self, entries: &mut [TocEntry], pages: &[PdfPage]) -> Result<()> {
info!("Assigning pages using offset calculation");
let anchors = self.select_anchors(entries, self.config.anchor_count);
let offset = self.calculate_offset(anchors, pages).await?;
if offset.confidence < 0.5 {
debug!("Offset confidence too low, falling back to LLM positioning");
return self.assign_with_llm(entries, pages).await;
}
info!(
"Calculated offset: {} (confidence: {})",
offset.offset, offset.confidence
);
for entry in entries.iter_mut() {
if let Some(toc_page) = entry.toc_page {
let physical = offset.apply(toc_page);
entry.physical_page = Some(physical.min(pages.len()));
}
}
Ok(())
}
fn select_anchors<'a>(&self, entries: &'a [TocEntry], count: usize) -> Vec<&'a TocEntry> {
let with_pages: Vec<_> = entries.iter().filter(|e| e.toc_page.is_some()).collect();
if with_pages.len() <= count {
return with_pages;
}
let step = with_pages.len() as f32 / count as f32;
(0..count)
.map(|i| with_pages[(i as f32 * step) as usize])
.collect()
}
    /// Estimates the constant toc-page -> physical-page offset by asking the
    /// LLM to locate each anchor title in a small window of pages around its
    /// printed TOC page number, then taking the mode of the observed offsets.
    ///
    /// Returns a zero-confidence `PageOffset` when there are no anchors or
    /// when no anchor could be located.
    async fn calculate_offset(
        &self,
        anchors: Vec<&TocEntry>,
        pages: &[PdfPage],
    ) -> Result<PageOffset> {
        if anchors.is_empty() {
            return Ok(PageOffset::new(0, 0, 0.0));
        }
        let anchor_count = anchors.len();
        // Owned copies so the per-anchor futures do not borrow `&self`.
        let client = self.client.clone();
        let pages_owned = pages.to_vec();
        let futures: Vec<_> = anchors
            .into_iter()
            .map(|anchor| {
                let title = anchor.title.clone();
                // Safe: `select_anchors` only yields entries with Some(toc_page).
                let toc_page = anchor.toc_page.unwrap();
                let client = client.clone();
                let pages = pages_owned.clone();
                async move {
                    // Search a +/-3-page window around the printed page number.
                    let range_pages = Self::pages_around(&pages, toc_page, 3);
                    if range_pages.is_empty() {
                        return (0, false);
                    }
                    let content = Self::format_range_pages(&range_pages);
                    match Self::locate_with_client(&client, &title, &content).await {
                        Ok(Some(physical)) => {
                            let offset = physical as i32 - toc_page as i32;
                            debug!(
                                "Anchor '{}' found: toc={}, physical={}, offset={}",
                                title, toc_page, physical, offset
                            );
                            (offset, true)
                        }
                        // Client error and "not found" both count as a miss.
                        _ => (0, false),
                    }
                }
            })
            .collect();
        // Up to 5 lookups in flight. Completion order is irrelevant here:
        // each result is a self-contained (offset, success) pair.
        let verified_offsets: Vec<_> = stream::iter(futures)
            .buffer_unordered(5)
            .collect()
            .await;
        let successful: Vec<_> = verified_offsets
            .iter()
            .filter(|(_, success)| *success)
            .map(|(offset, _)| *offset)
            .collect();
        if successful.is_empty() {
            return Ok(PageOffset::new(0, 0, 0.0));
        }
        // Mode of observed offsets; confidence = fraction of anchors verified.
        let mode = Self::calculate_mode_static(&successful);
        let sample_count = successful.len();
        let confidence = sample_count as f32 / anchor_count as f32;
        Ok(PageOffset::new(mode, sample_count, confidence))
    }
    /// Most frequent value in `values` (0 for an empty slice); thin
    /// instance-method wrapper over [`Self::calculate_mode_static`].
    fn calculate_mode(&self, values: &[i32]) -> i32 {
        Self::calculate_mode_static(values)
    }
fn calculate_mode_static(values: &[i32]) -> i32 {
let mut counts: HashMap<i32, usize> = HashMap::new();
for &v in values {
*counts.entry(v).or_insert(0) += 1;
}
counts
.into_iter()
.max_by_key(|&(_, count)| count)
.map(|(v, _)| v)
.unwrap_or(0)
}
fn pages_around(pages: &[PdfPage], center: usize, range: usize) -> Vec<PdfPage> {
let start = center.saturating_sub(range).max(1);
let end = (center + range).min(pages.len());
(start..=end)
.filter_map(|i| pages.get(i - 1).cloned())
.collect()
}
fn format_range_pages(pages: &[PdfPage]) -> String {
pages
.iter()
.map(|p| {
format!(
"<page_{}>\n{}\n</page_{}>",
p.number,
&p.text[..p.text.len().min(500)],
p.number
)
})
.collect::<Vec<_>>()
.join("\n\n")
}
async fn locate_with_client(
client: &LlmClient,
title: &str,
content: &str,
) -> Result<Option<usize>> {
let system = "You are a document analysis assistant. Find which page contains a specific section title.";
let user = format!(
r#"Find which page contains the section titled: "{}"
Pages:
{}
Reply in JSON format:
{{"page": <page_number or null>}}"#,
title, content
);
#[derive(serde::Deserialize)]
struct LocateResult {
page: Option<usize>,
}
let result: LocateResult = client.complete_json(system, &user).await?;
Ok(result.page)
}
async fn assign_with_llm(&self, entries: &mut [TocEntry], pages: &[PdfPage]) -> Result<()> {
info!("Assigning pages using LLM positioning");
let client = self.client.clone();
let pages_owned = pages.to_vec();
let total = entries.len();
let futures: Vec<_> = entries.iter().map(|entry| {
let title = entry.title.clone();
let client = client.clone();
let pages = pages_owned.clone();
async move {
let groups = Self::group_pages_owned(&pages, 5);
Self::locate_title_in_groups_static(&client, &title, &groups).await
}
}).collect();
let results: Vec<_> = stream::iter(futures)
.buffer_unordered(5)
.collect()
.await;
info!("Assigned pages for {}/{} entries", results.len(), total);
for (entry, result) in entries.iter_mut().zip(results.into_iter()) {
let physical = result?;
entry.physical_page = physical;
entry.confidence = if physical.is_some() { 0.8 } else { 0.3 };
}
Ok(())
}
fn group_pages_owned(pages: &[PdfPage], group_size: usize) -> Vec<Vec<PdfPage>> {
pages
.chunks(group_size)
.map(|chunk| chunk.to_vec())
.collect()
}
async fn locate_title_in_groups_static(
client: &LlmClient,
title: &str,
groups: &[Vec<PdfPage>],
) -> Result<Option<usize>> {
let system = "You are a document analysis assistant. Find which page contains a specific section title.";
for group in groups {
let content = group
.iter()
.map(|p| {
format!(
"<page_{}>\n{}\n</page_{}>",
p.number,
&p.text[..p.text.len().min(300)],
p.number
)
})
.collect::<Vec<_>>()
.join("\n\n");
let user = format!(
r#"Find which page contains the section titled: "{}"
Pages:
{}
Reply in JSON format:
{{"found": true/false, "page": <page_number if found>}}"#,
title, content
);
#[derive(serde::Deserialize)]
struct SearchResult {
found: bool,
page: Option<usize>,
}
let result: SearchResult = client.complete_json(system, &user).await?;
if result.found {
return Ok(result.page);
}
}
Ok(None)
}
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_select_anchors() {
        // Four candidates with TOC pages and count = 2: evenly-spaced
        // sampling must yield exactly two anchors.
        let assigner = PageAssigner::with_defaults();
        let entries = [
            TocEntry::new("Chapter 1", 1).with_toc_page(1),
            TocEntry::new("Chapter 2", 1).with_toc_page(10),
            TocEntry::new("Chapter 3", 1).with_toc_page(20),
            TocEntry::new("Chapter 4", 1).with_toc_page(30),
        ];
        assert_eq!(assigner.select_anchors(&entries, 2).len(), 2);
    }

    #[test]
    fn test_calculate_mode() {
        let assigner = PageAssigner::with_defaults();
        assert_eq!(assigner.calculate_mode(&[2, 2, 2, 3, 3, 4]), 2);
        assert_eq!(assigner.calculate_mode(&[1, 1, 2, 2, 2]), 2);
    }
}