Skip to main content

vectorless/parser/toc/
assigner.rs

1// Copyright (c) 2026 vectorless developers
2// SPDX-License-Identifier: Apache-2.0
3
4//! Page assigner - assigns physical page numbers to TOC entries.
5
6use std::collections::HashMap;
7use tracing::{debug, info};
8
9use crate::config::LlmConfig;
10use crate::error::Result;
11use crate::parser::pdf::PdfPage;
12
13use super::types::{PageOffset, TocEntry};
14use crate::llm::LlmClient;
15
16/// Page assigner configuration.
17#[derive(Debug, Clone)]
18pub struct PageAssignerConfig {
19    /// Number of anchor points for offset calculation.
20    pub anchor_count: usize,
21
22    /// LLM configuration.
23    pub llm_config: LlmConfig,
24
25    /// Maximum offset variance allowed.
26    pub max_offset_variance: usize,
27}
28
29impl Default for PageAssignerConfig {
30    fn default() -> Self {
31        Self {
32            anchor_count: 5,
33            llm_config: LlmConfig::default(),
34            max_offset_variance: 3,
35        }
36    }
37}
38
39/// Page assigner - assigns physical page numbers to TOC entries.
40pub struct PageAssigner {
41    config: PageAssignerConfig,
42    client: LlmClient,
43}
44
45impl PageAssigner {
46    /// Create a new page assigner.
47    pub fn new(config: PageAssignerConfig) -> Self {
48        let client = LlmClient::new(config.llm_config.clone().into());
49        Self { config, client }
50    }
51
52    /// Create an assigner with default configuration.
53    pub fn with_defaults() -> Self {
54        Self::new(PageAssignerConfig::default())
55    }
56
57    /// Assign physical pages to TOC entries.
58    ///
59    /// Strategy:
60    /// 1. If entries have TOC pages → calculate offset → apply offset
61    /// 2. If no TOC pages → use LLM to locate each entry
62    pub async fn assign(&self, entries: &mut [TocEntry], pages: &[PdfPage]) -> Result<()> {
63        if entries.is_empty() {
64            return Ok(());
65        }
66
67        // Check if we have TOC page numbers
68        let has_toc_pages = entries.iter().any(|e| e.toc_page.is_some());
69
70        if has_toc_pages {
71            self.assign_with_offset(entries, pages).await
72        } else {
73            self.assign_with_llm(entries, pages).await
74        }
75    }
76
77    /// Assign pages using offset calculation.
78    async fn assign_with_offset(&self, entries: &mut [TocEntry], pages: &[PdfPage]) -> Result<()> {
79        info!("Assigning pages using offset calculation");
80
81        // Step 1: Select anchor entries
82        let anchors = self.select_anchors(entries, self.config.anchor_count);
83
84        // Step 2: Verify anchors and calculate offset
85        let offset = self.calculate_offset(anchors, pages).await?;
86
87        if offset.confidence < 0.5 {
88            debug!("Offset confidence too low, falling back to LLM positioning");
89            return self.assign_with_llm(entries, pages).await;
90        }
91
92        info!(
93            "Calculated offset: {} (confidence: {})",
94            offset.offset, offset.confidence
95        );
96
97        // Step 3: Apply offset to all entries
98        for entry in entries.iter_mut() {
99            if let Some(toc_page) = entry.toc_page {
100                let physical = offset.apply(toc_page);
101                entry.physical_page = Some(physical.min(pages.len()));
102            }
103        }
104
105        Ok(())
106    }
107
108    /// Select anchor entries for offset calculation.
109    fn select_anchors<'a>(&self, entries: &'a [TocEntry], count: usize) -> Vec<&'a TocEntry> {
110        // Select entries with TOC pages, evenly distributed
111        let with_pages: Vec<_> = entries.iter().filter(|e| e.toc_page.is_some()).collect();
112
113        if with_pages.len() <= count {
114            return with_pages;
115        }
116
117        // Select evenly distributed entries
118        let step = with_pages.len() as f32 / count as f32;
119        (0..count)
120            .map(|i| with_pages[(i as f32 * step) as usize])
121            .collect()
122    }
123
124    /// Calculate page offset by verifying anchors.
125    async fn calculate_offset(
126        &self,
127        anchors: Vec<&TocEntry>,
128        pages: &[PdfPage],
129    ) -> Result<PageOffset> {
130        if anchors.is_empty() {
131            return Ok(PageOffset::new(0, 0, 0.0));
132        }
133
134        let anchor_count = anchors.len();
135        let mut verified_offsets: Vec<(i32, bool)> = Vec::new();
136
137        for anchor in anchors {
138            let toc_page = anchor.toc_page.unwrap();
139
140            // Find the physical page where this title appears
141            if let Some(physical) = self
142                .locate_title_in_range(anchor.title.as_str(), pages, toc_page)
143                .await?
144            {
145                let offset = physical as i32 - toc_page as i32;
146                verified_offsets.push((offset, true));
147                debug!(
148                    "Anchor '{}' found: toc={}, physical={}, offset={}",
149                    anchor.title, toc_page, physical, offset
150                );
151            } else {
152                verified_offsets.push((0, false));
153            }
154        }
155
156        // Calculate the mode (most common offset)
157        let successful: Vec<_> = verified_offsets
158            .iter()
159            .filter(|(_, success)| *success)
160            .map(|(offset, _)| *offset)
161            .collect();
162
163        if successful.is_empty() {
164            return Ok(PageOffset::new(0, 0, 0.0));
165        }
166
167        let mode = self.calculate_mode(&successful);
168        let sample_count = successful.len();
169        let confidence = sample_count as f32 / anchor_count as f32;
170
171        Ok(PageOffset::new(mode, sample_count, confidence))
172    }
173
174    /// Calculate mode of offset values.
175    fn calculate_mode(&self, values: &[i32]) -> i32 {
176        let mut counts: HashMap<i32, usize> = HashMap::new();
177        for &v in values {
178            *counts.entry(v).or_insert(0) += 1;
179        }
180        counts
181            .into_iter()
182            .max_by_key(|&(_, count)| count)
183            .map(|(v, _)| v)
184            .unwrap_or(0)
185    }
186
187    /// Locate a title in a range of pages using LLM.
188    async fn locate_title_in_range(
189        &self,
190        title: &str,
191        pages: &[PdfPage],
192        near_page: usize,
193    ) -> Result<Option<usize>> {
194        // Search in a range around the expected page
195        let start = (near_page.saturating_sub(3)).max(1);
196        let end = (near_page + 3).min(pages.len());
197
198        let range_pages: Vec<_> = (start..=end).filter_map(|i| pages.get(i - 1)).collect();
199
200        if range_pages.is_empty() {
201            return Ok(None);
202        }
203
204        // Use LLM to find the exact page
205        let content = range_pages
206            .iter()
207            .map(|p| {
208                format!(
209                    "<page_{}>\n{}\n</page_{}>",
210                    p.number,
211                    &p.text[..p.text.len().min(500)],
212                    p.number
213                )
214            })
215            .collect::<Vec<_>>()
216            .join("\n\n");
217
218        let system = "You are a document analysis assistant. Find which page contains a specific section title.";
219        let user = format!(
220            r#"Find which page contains the section titled: "{}"
221
222Pages:
223{}
224
225Reply in JSON format:
226{{"page": <page_number or null>}}"#,
227            title, content
228        );
229
230        #[derive(serde::Deserialize)]
231        struct LocateResult {
232            page: Option<usize>,
233        }
234
235        let result: LocateResult = self.client.complete_json(system, &user).await?;
236        Ok(result.page)
237    }
238
239    /// Assign pages using LLM for each entry.
240    async fn assign_with_llm(&self, entries: &mut [TocEntry], pages: &[PdfPage]) -> Result<()> {
241        info!("Assigning pages using LLM positioning");
242
243        // Group pages for efficient processing
244        let page_groups = self.group_pages(pages, 5);
245
246        for entry in entries.iter_mut() {
247            let physical = self
248                .locate_title_in_groups(entry.title.as_str(), &page_groups)
249                .await?;
250            entry.physical_page = physical;
251            entry.confidence = if physical.is_some() { 0.8 } else { 0.3 };
252        }
253
254        Ok(())
255    }
256
257    /// Group pages for batch processing.
258    fn group_pages<'a>(&self, pages: &'a [PdfPage], group_size: usize) -> Vec<Vec<&'a PdfPage>> {
259        pages
260            .chunks(group_size)
261            .map(|chunk| chunk.iter().collect())
262            .collect()
263    }
264
265    /// Locate a title across page groups.
266    async fn locate_title_in_groups(
267        &self,
268        title: &str,
269        groups: &[Vec<&PdfPage>],
270    ) -> Result<Option<usize>> {
271        let system = "You are a document analysis assistant. Find which page contains a specific section title.";
272
273        for group in groups {
274            let content = group
275                .iter()
276                .map(|p| {
277                    format!(
278                        "<page_{}>\n{}\n</page_{}>",
279                        p.number,
280                        &p.text[..p.text.len().min(300)],
281                        p.number
282                    )
283                })
284                .collect::<Vec<_>>()
285                .join("\n\n");
286
287            let user = format!(
288                r#"Find which page contains the section titled: "{}"
289
290Pages:
291{}
292
293Reply in JSON format:
294{{"found": true/false, "page": <page_number if found>}}"#,
295                title, content
296            );
297
298            #[derive(serde::Deserialize)]
299            struct SearchResult {
300                found: bool,
301                page: Option<usize>,
302            }
303
304            let result: SearchResult = self.client.complete_json(system, &user).await?;
305
306            if result.found {
307                return Ok(result.page);
308            }
309        }
310
311        Ok(None)
312    }
313}
314
#[cfg(test)]
mod tests {
    use super::*;

    /// Four level-1 entries with TOC pages 1, 10, 20 and 30.
    fn sample_entries() -> Vec<TocEntry> {
        vec![
            TocEntry::new("Chapter 1", 1).with_toc_page(1),
            TocEntry::new("Chapter 2", 1).with_toc_page(10),
            TocEntry::new("Chapter 3", 1).with_toc_page(20),
            TocEntry::new("Chapter 4", 1).with_toc_page(30),
        ]
    }

    #[test]
    fn test_select_anchors() {
        let assigner = PageAssigner::with_defaults();
        let entries = sample_entries();

        let anchors = assigner.select_anchors(&entries, 2);
        assert_eq!(anchors.len(), 2);

        // Two anchors out of four candidates: the first and the third.
        assert_eq!(anchors[0].toc_page, Some(1));
        assert_eq!(anchors[1].toc_page, Some(20));
    }

    #[test]
    fn test_select_anchors_returns_all_when_few_candidates() {
        let assigner = PageAssigner::with_defaults();
        let entries = sample_entries();

        // Asking for more anchors than there are candidates returns them all.
        let anchors = assigner.select_anchors(&entries, 10);
        assert_eq!(anchors.len(), entries.len());
    }

    #[test]
    fn test_calculate_mode() {
        let assigner = PageAssigner::with_defaults();

        let values = vec![2, 2, 2, 3, 3, 4];
        assert_eq!(assigner.calculate_mode(&values), 2);

        let values = vec![1, 1, 2, 2, 2];
        assert_eq!(assigner.calculate_mode(&values), 2);

        // No samples at all falls back to a zero offset.
        assert_eq!(assigner.calculate_mode(&[]), 0);
    }
}