1use std::collections::HashMap;
7use tracing::{debug, info};
8
9use crate::config::LlmConfig;
10use crate::error::Result;
11use crate::parser::pdf::PdfPage;
12
13use super::types::{PageOffset, TocEntry};
14use crate::llm::LlmClient;
15
/// Settings consumed by [`PageAssigner`] when mapping TOC entries to physical pages.
#[derive(Debug, Clone)]
pub struct PageAssignerConfig {
    /// Number of TOC entries sampled as anchors when estimating the page offset.
    pub anchor_count: usize,

    /// LLM settings; cloned and converted to build the assigner's `LlmClient`.
    pub llm_config: LlmConfig,

    /// NOTE(review): not read anywhere in the visible code; presumably the tolerated
    /// spread between anchor offsets — confirm before relying on it.
    pub max_offset_variance: usize,
}
28
29impl Default for PageAssignerConfig {
30 fn default() -> Self {
31 Self {
32 anchor_count: 5,
33 llm_config: LlmConfig::default(),
34 max_offset_variance: 3,
35 }
36 }
37}
38
/// Fills in the physical page of TOC entries, either by estimating a constant
/// TOC-to-physical page offset or by asking an LLM to locate each title.
pub struct PageAssigner {
    /// Settings supplied at construction time.
    config: PageAssignerConfig,
    /// LLM client built from `config.llm_config`.
    client: LlmClient,
}
44
45impl PageAssigner {
46 pub fn new(config: PageAssignerConfig) -> Self {
48 let client = LlmClient::new(config.llm_config.clone().into());
49 Self { config, client }
50 }
51
52 pub fn with_defaults() -> Self {
54 Self::new(PageAssignerConfig::default())
55 }
56
57 pub async fn assign(&self, entries: &mut [TocEntry], pages: &[PdfPage]) -> Result<()> {
63 if entries.is_empty() {
64 return Ok(());
65 }
66
67 let has_toc_pages = entries.iter().any(|e| e.toc_page.is_some());
69
70 if has_toc_pages {
71 self.assign_with_offset(entries, pages).await
72 } else {
73 self.assign_with_llm(entries, pages).await
74 }
75 }
76
77 async fn assign_with_offset(&self, entries: &mut [TocEntry], pages: &[PdfPage]) -> Result<()> {
79 info!("Assigning pages using offset calculation");
80
81 let anchors = self.select_anchors(entries, self.config.anchor_count);
83
84 let offset = self.calculate_offset(anchors, pages).await?;
86
87 if offset.confidence < 0.5 {
88 debug!("Offset confidence too low, falling back to LLM positioning");
89 return self.assign_with_llm(entries, pages).await;
90 }
91
92 info!(
93 "Calculated offset: {} (confidence: {})",
94 offset.offset, offset.confidence
95 );
96
97 for entry in entries.iter_mut() {
99 if let Some(toc_page) = entry.toc_page {
100 let physical = offset.apply(toc_page);
101 entry.physical_page = Some(physical.min(pages.len()));
102 }
103 }
104
105 Ok(())
106 }
107
108 fn select_anchors<'a>(&self, entries: &'a [TocEntry], count: usize) -> Vec<&'a TocEntry> {
110 let with_pages: Vec<_> = entries.iter().filter(|e| e.toc_page.is_some()).collect();
112
113 if with_pages.len() <= count {
114 return with_pages;
115 }
116
117 let step = with_pages.len() as f32 / count as f32;
119 (0..count)
120 .map(|i| with_pages[(i as f32 * step) as usize])
121 .collect()
122 }
123
124 async fn calculate_offset(
126 &self,
127 anchors: Vec<&TocEntry>,
128 pages: &[PdfPage],
129 ) -> Result<PageOffset> {
130 if anchors.is_empty() {
131 return Ok(PageOffset::new(0, 0, 0.0));
132 }
133
134 let anchor_count = anchors.len();
135 let mut verified_offsets: Vec<(i32, bool)> = Vec::new();
136
137 for anchor in anchors {
138 let toc_page = anchor.toc_page.unwrap();
139
140 if let Some(physical) = self
142 .locate_title_in_range(anchor.title.as_str(), pages, toc_page)
143 .await?
144 {
145 let offset = physical as i32 - toc_page as i32;
146 verified_offsets.push((offset, true));
147 debug!(
148 "Anchor '{}' found: toc={}, physical={}, offset={}",
149 anchor.title, toc_page, physical, offset
150 );
151 } else {
152 verified_offsets.push((0, false));
153 }
154 }
155
156 let successful: Vec<_> = verified_offsets
158 .iter()
159 .filter(|(_, success)| *success)
160 .map(|(offset, _)| *offset)
161 .collect();
162
163 if successful.is_empty() {
164 return Ok(PageOffset::new(0, 0, 0.0));
165 }
166
167 let mode = self.calculate_mode(&successful);
168 let sample_count = successful.len();
169 let confidence = sample_count as f32 / anchor_count as f32;
170
171 Ok(PageOffset::new(mode, sample_count, confidence))
172 }
173
174 fn calculate_mode(&self, values: &[i32]) -> i32 {
176 let mut counts: HashMap<i32, usize> = HashMap::new();
177 for &v in values {
178 *counts.entry(v).or_insert(0) += 1;
179 }
180 counts
181 .into_iter()
182 .max_by_key(|&(_, count)| count)
183 .map(|(v, _)| v)
184 .unwrap_or(0)
185 }
186
187 async fn locate_title_in_range(
189 &self,
190 title: &str,
191 pages: &[PdfPage],
192 near_page: usize,
193 ) -> Result<Option<usize>> {
194 let start = (near_page.saturating_sub(3)).max(1);
196 let end = (near_page + 3).min(pages.len());
197
198 let range_pages: Vec<_> = (start..=end).filter_map(|i| pages.get(i - 1)).collect();
199
200 if range_pages.is_empty() {
201 return Ok(None);
202 }
203
204 let content = range_pages
206 .iter()
207 .map(|p| {
208 format!(
209 "<page_{}>\n{}\n</page_{}>",
210 p.number,
211 &p.text[..p.text.len().min(500)],
212 p.number
213 )
214 })
215 .collect::<Vec<_>>()
216 .join("\n\n");
217
218 let system = "You are a document analysis assistant. Find which page contains a specific section title.";
219 let user = format!(
220 r#"Find which page contains the section titled: "{}"
221
222Pages:
223{}
224
225Reply in JSON format:
226{{"page": <page_number or null>}}"#,
227 title, content
228 );
229
230 #[derive(serde::Deserialize)]
231 struct LocateResult {
232 page: Option<usize>,
233 }
234
235 let result: LocateResult = self.client.complete_json(system, &user).await?;
236 Ok(result.page)
237 }
238
239 async fn assign_with_llm(&self, entries: &mut [TocEntry], pages: &[PdfPage]) -> Result<()> {
241 info!("Assigning pages using LLM positioning");
242
243 let page_groups = self.group_pages(pages, 5);
245
246 for entry in entries.iter_mut() {
247 let physical = self
248 .locate_title_in_groups(entry.title.as_str(), &page_groups)
249 .await?;
250 entry.physical_page = physical;
251 entry.confidence = if physical.is_some() { 0.8 } else { 0.3 };
252 }
253
254 Ok(())
255 }
256
257 fn group_pages<'a>(&self, pages: &'a [PdfPage], group_size: usize) -> Vec<Vec<&'a PdfPage>> {
259 pages
260 .chunks(group_size)
261 .map(|chunk| chunk.iter().collect())
262 .collect()
263 }
264
265 async fn locate_title_in_groups(
267 &self,
268 title: &str,
269 groups: &[Vec<&PdfPage>],
270 ) -> Result<Option<usize>> {
271 let system = "You are a document analysis assistant. Find which page contains a specific section title.";
272
273 for group in groups {
274 let content = group
275 .iter()
276 .map(|p| {
277 format!(
278 "<page_{}>\n{}\n</page_{}>",
279 p.number,
280 &p.text[..p.text.len().min(300)],
281 p.number
282 )
283 })
284 .collect::<Vec<_>>()
285 .join("\n\n");
286
287 let user = format!(
288 r#"Find which page contains the section titled: "{}"
289
290Pages:
291{}
292
293Reply in JSON format:
294{{"found": true/false, "page": <page_number if found>}}"#,
295 title, content
296 );
297
298 #[derive(serde::Deserialize)]
299 struct SearchResult {
300 found: bool,
301 page: Option<usize>,
302 }
303
304 let result: SearchResult = self.client.complete_json(system, &user).await?;
305
306 if result.found {
307 return Ok(result.page);
308 }
309 }
310
311 Ok(None)
312 }
313}
314
#[cfg(test)]
mod tests {
    use super::*;

    /// Asking for two anchors out of four paged entries yields exactly two.
    #[test]
    fn test_select_anchors() {
        let assigner = PageAssigner::with_defaults();

        let entries = [
            TocEntry::new("Chapter 1", 1).with_toc_page(1),
            TocEntry::new("Chapter 2", 1).with_toc_page(10),
            TocEntry::new("Chapter 3", 1).with_toc_page(20),
            TocEntry::new("Chapter 4", 1).with_toc_page(30),
        ];

        let picked = assigner.select_anchors(&entries, 2);
        assert_eq!(picked.len(), 2);
    }

    /// The mode is the value with the highest occurrence count.
    #[test]
    fn test_calculate_mode() {
        let assigner = PageAssigner::with_defaults();

        assert_eq!(assigner.calculate_mode(&[2, 2, 2, 3, 3, 4]), 2);
        assert_eq!(assigner.calculate_mode(&[1, 1, 2, 2, 2]), 2);
    }
}