1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
//! Chunk-to-page mapper for RAG-aligned PDF editing
//!
//! Maps `DocumentChunk` page numbers to PDF page indices and extracts
//! only the pages relevant to retrieved chunks.
use std::io::Cursor;
use crate::ai::chunking::DocumentChunk;
use super::OperationError;
use super::OperationResult;
/// Maps RAG chunks to their corresponding PDF pages and extracts relevant pages.
///
/// This is a field-less unit struct: it carries no state and exists only to
/// group the associated functions `pages_for_chunks` and
/// `extract_pages_for_chunks`, both of which are called without an instance.
pub struct ChunkPageMapper;
impl ChunkPageMapper {
/// Collect the 0-indexed PDF page indices referenced by a set of chunks.
///
/// Every entry in a chunk's `page_numbers` is treated as a 1-indexed page
/// number (the chunker convention) and shifted down by one. Entries equal to
/// zero have no corresponding page and are skipped.
///
/// # Returns
///
/// The page indices in ascending order with duplicates removed. The result
/// is empty when no chunk contributes a non-zero page number.
pub fn pages_for_chunks(chunks: &[&DocumentChunk]) -> Vec<usize> {
    let mut indices: Vec<usize> = chunks
        .iter()
        .flat_map(|chunk| chunk.page_numbers.iter().copied())
        // `checked_sub` yields `None` for page number 0, which drops it,
        // and `Some(n - 1)` otherwise (1-indexed → 0-indexed).
        .filter_map(|number| number.checked_sub(1))
        .collect();

    // Sorting first makes equal indices adjacent so `dedup` removes them all.
    indices.sort_unstable();
    indices.dedup();
    indices
}
/// Build a new PDF containing only the pages referenced by the given chunks.
///
/// The page set is computed with `pages_for_chunks`, so each referenced page
/// is copied once and pages are added to the output in ascending index order.
///
/// # Arguments
///
/// * `pdf_bytes` - The original PDF file bytes
/// * `chunks` - Chunks whose pages should be extracted
///
/// # Returns
///
/// The serialized bytes of the new PDF holding only the relevant pages.
///
/// # Errors
///
/// * `OperationError::NoPagesToProcess` - the chunks reference no pages
/// * `OperationError::ParseError` - the PDF could not be read, its page count
///   could not be determined, or a page could not be loaded or converted
/// * `OperationError::PageIndexOutOfBounds` - a referenced page index is not
///   less than the document's page count
/// * `OperationError::PdfError` - serializing the output document failed
pub fn extract_pages_for_chunks(
    pdf_bytes: &[u8],
    chunks: &[&DocumentChunk],
) -> OperationResult<Vec<u8>> {
    // Bail out before parsing anything when there is nothing to extract.
    let wanted = Self::pages_for_chunks(chunks);
    if wanted.is_empty() {
        return Err(OperationError::NoPagesToProcess);
    }

    // Parse the source PDF straight from the in-memory byte slice.
    let source = crate::parser::PdfReader::new(Cursor::new(pdf_bytes))
        .map_err(|err| OperationError::ParseError(err.to_string()))?
        .into_document();
    let total_pages = source
        .page_count()
        .map_err(|err| OperationError::ParseError(err.to_string()))? as usize;

    // Reject the request up front if any index falls outside the document;
    // the first offending index (in ascending order) is reported.
    if let Some(&out_of_range) = wanted.iter().find(|&&idx| idx >= total_pages) {
        return Err(OperationError::PageIndexOutOfBounds(out_of_range, total_pages));
    }

    // Copy each requested page, with its content, into a fresh document.
    let mut extracted = crate::document::Document::new();
    for index in wanted {
        let parsed = source
            .get_page(index as u32)
            .map_err(|err| OperationError::ParseError(err.to_string()))?;
        let page = crate::page::Page::from_parsed_with_content(&parsed, &source)
            .map_err(|err| OperationError::ParseError(err.to_string()))?;
        extracted.add_page(page);
    }

    extracted.to_bytes().map_err(OperationError::PdfError)
}
}