use std::collections::HashMap;
use std::io::Cursor;
use crate::ai::chunking::DocumentChunk;
use crate::annotations::MarkupAnnotation;
use crate::geometry::{Point, Rectangle};
use crate::graphics::Color;
use crate::text::extraction::{ExtractedText, ExtractionOptions, TextFragment};
/// Separator assumed to sit between the texts of consecutive pages.
///
/// `TextPositionIndex::build` adds this string's length to the running global
/// offset after every page except the last one.
const PAGE_SEPARATOR: &str = "\n\n";
/// A text fragment's bounding box paired with the span it occupies in the
/// concatenated text of all pages.
///
/// Instances are created by `TextPositionIndex::build`.
#[derive(Debug, Clone)]
pub struct IndexedFragment {
/// Zero-based position of the page in the slice given to `build`.
pub page: usize,
/// Global start offset of the fragment (inclusive). Despite the name, `build`
/// derives it from `str::find` and `str::len`, so it is a byte offset.
pub start_char: usize,
/// Global end offset of the fragment (exclusive), in the same units as `start_char`.
pub end_char: usize,
/// Copied from the source fragment's `x`.
pub x: f64,
/// Copied from the source fragment's `y`.
pub y: f64,
/// Copied from the source fragment's `width`.
pub width: f64,
/// Copied from the source fragment's `height`.
pub height: f64,
}
impl IndexedFragment {
    /// Converts the stored position and size into a [`Rectangle`].
    ///
    /// The values are forwarded unchanged, in the order `x`, `y`, `width`,
    /// `height`, to `Rectangle::from_position_and_size`.
    pub fn to_rectangle(&self) -> Rectangle {
        // All four fields are `f64`, so they can be copied out of the borrow.
        let Self {
            x,
            y,
            width,
            height,
            ..
        } = *self;
        Rectangle::from_position_and_size(x, y, width, height)
    }
}
/// Index that maps spans of the concatenated page text to positioned fragments.
#[derive(Debug)]
pub struct TextPositionIndex {
/// Indexed fragments, stored page by page in the order `build` matched them.
entries: Vec<IndexedFragment>,
/// Global offset at which each page's text begins; one element per input page.
page_offsets: Vec<usize>,
}
impl TextPositionIndex {
    /// Builds an index over the fragments of `pages`.
    ///
    /// Each page starts at a global offset equal to the lengths of all
    /// preceding page texts plus one `PAGE_SEPARATOR` per preceding page.
    /// Within a page, each non-empty fragment is located by searching the
    /// page text forward from the end of the previous match; fragments whose
    /// text cannot be found from that point are left out of the index.
    ///
    /// All offsets are byte offsets (`str::find` / `str::len`).
    pub fn build(pages: &[ExtractedText]) -> Self {
        let mut entries = Vec::new();
        let mut page_offsets = Vec::new();
        let mut page_base = 0usize;

        for (page_no, extracted) in pages.iter().enumerate() {
            page_offsets.push(page_base);

            // Byte position in this page's text after the last matched fragment.
            let mut cursor = 0usize;
            let candidates = extracted.fragments.iter().filter(|f| !f.text.is_empty());
            for frag in candidates {
                let relative = match extracted.text[cursor..].find(frag.text.as_str()) {
                    Some(pos) => pos,
                    // Not found past the cursor: skip without moving the cursor.
                    None => continue,
                };
                let begin = cursor + relative;
                let finish = begin + frag.text.len();
                entries.push(IndexedFragment {
                    page: page_no,
                    start_char: page_base + begin,
                    end_char: page_base + finish,
                    x: frag.x,
                    y: frag.y,
                    width: frag.width,
                    height: frag.height,
                });
                cursor = finish;
            }

            page_base += extracted.text.len();
            // A separator follows every page except the last.
            let has_next_page = page_no + 1 < pages.len();
            if has_next_page {
                page_base += PAGE_SEPARATOR.len();
            }
        }

        Self {
            entries,
            page_offsets,
        }
    }

    /// Returns every indexed fragment whose span overlaps the half-open
    /// range `start..end`, in index order.
    ///
    /// An empty or inverted range (`start >= end`) yields no fragments.
    pub fn fragments_for_range(&self, start: usize, end: usize) -> Vec<&IndexedFragment> {
        let mut hits = Vec::new();
        if start < end {
            for entry in &self.entries {
                let overlaps = entry.end_char > start && entry.start_char < end;
                if overlaps {
                    hits.push(entry);
                }
            }
        }
        hits
    }

    /// Returns the global offset at which `page` starts, or `None` if the
    /// index holds no such page.
    pub fn page_offset(&self, page: usize) -> Option<usize> {
        match self.page_offsets.get(page) {
            Some(&offset) => Some(offset),
            None => None,
        }
    }

    /// Number of indexed fragments.
    pub fn len(&self) -> usize {
        self.entries.len()
    }

    /// Whether the index contains no fragments.
    pub fn is_empty(&self) -> bool {
        self.entries.is_empty()
    }

    /// All indexed fragments as a slice.
    pub fn entries(&self) -> &[IndexedFragment] {
        self.entries.as_slice()
    }
}
/// Builds the rectangle covering a text fragment.
///
/// The first corner is the fragment's `(x, y)`; the second is that point
/// shifted by the fragment's `width` and `height`.
pub fn fragment_to_highlight_rect(frag: &TextFragment) -> Rectangle {
    let origin = Point::new(frag.x, frag.y);
    let opposite = Point::new(frag.x + frag.width, frag.y + frag.height);
    Rectangle::new(origin, opposite)
}
/// Visual settings for highlight annotations.
#[derive(Debug, Clone)]
pub struct HighlightStyle {
/// Color given to each highlight annotation.
pub color: Color,
/// Requested opacity of the highlight.
/// NOTE(review): `SourceHighlighter::highlight_chunks` in this file reads only
/// `color`; confirm whether the opacity is meant to be applied as well.
pub opacity: f64,
}
impl Default for HighlightStyle {
fn default() -> Self {
Self {
color: Color::Rgb(1.0, 1.0, 0.0), opacity: 0.5,
}
}
}
impl HighlightStyle {
    /// Creates a style with the default color and opacity.
    pub fn new() -> Self {
        <Self as Default>::default()
    }

    /// Returns the style with its color replaced by `color`.
    pub fn with_color(self, color: Color) -> Self {
        Self { color, ..self }
    }

    /// Returns the style with its opacity replaced by `opacity`.
    ///
    /// The value is stored as given; no range check is performed.
    pub fn with_opacity(self, opacity: f64) -> Self {
        Self { opacity, ..self }
    }
}
/// Errors returned by `SourceHighlighter::highlight_chunks`.
///
/// Each variant carries the string form of the underlying error.
#[derive(Debug, thiserror::Error)]
pub enum SourceHighlighterError {
/// Creating the PDF reader or extracting text from the document failed.
#[error("text extraction failed: {0}")]
TextExtractionFailed(String),
/// Reading the page count, fetching a page, or rebuilding a page failed.
#[error("page reconstruction failed: {0}")]
PageReconstructionFailed(String),
/// Serializing the output document to bytes failed.
#[error("write failed: {0}")]
WriteFailed(String),
}
/// `Result` alias whose error type is `SourceHighlighterError`.
pub type SourceHighlighterResult<T> = Result<T, SourceHighlighterError>;
/// Stateless entry point for adding highlight annotations to a PDF; all
/// functionality is exposed through associated functions.
pub struct SourceHighlighter;
impl SourceHighlighter {
    /// Returns a new PDF in which the text covered by `chunks` is highlighted.
    ///
    /// The input is parsed, its text is extracted with `preserve_layout`
    /// enabled, and a [`TextPositionIndex`] is built over the result. For each
    /// chunk, the fragments overlapping
    /// `metadata.position.start_char..metadata.position.end_char` are looked
    /// up, and every page is rebuilt into a fresh document with one highlight
    /// annotation per matched fragment.
    ///
    /// A fragment covered by more than one chunk (overlapping or repeated
    /// chunks) is highlighted exactly once, so annotations never stack.
    ///
    /// Only `style.color` is applied to the annotations; `style.opacity` is
    /// not read here.
    ///
    /// If `chunks` is empty, a copy of `pdf_bytes` is returned without parsing.
    ///
    /// # Errors
    ///
    /// * [`SourceHighlighterError::TextExtractionFailed`] if the reader cannot
    ///   be created or text extraction fails.
    /// * [`SourceHighlighterError::PageReconstructionFailed`] if the page count
    ///   cannot be read or a page cannot be fetched or rebuilt.
    /// * [`SourceHighlighterError::WriteFailed`] if serializing the output fails.
    pub fn highlight_chunks(
        pdf_bytes: &[u8],
        chunks: &[&DocumentChunk],
        style: HighlightStyle,
    ) -> SourceHighlighterResult<Vec<u8>> {
        if chunks.is_empty() {
            return Ok(pdf_bytes.to_vec());
        }

        let cursor = Cursor::new(pdf_bytes);
        let reader = crate::parser::PdfReader::new(cursor)
            .map_err(|e| SourceHighlighterError::TextExtractionFailed(e.to_string()))?;
        let document = reader.into_document();

        let options = ExtractionOptions {
            preserve_layout: true,
            ..Default::default()
        };
        let extracted_pages = document
            .extract_text_with_options(options)
            .map_err(|e| SourceHighlighterError::TextExtractionFailed(e.to_string()))?;
        let index = TextPositionIndex::build(&extracted_pages);

        let mut annotations_by_page: HashMap<usize, Vec<Rectangle>> = HashMap::new();
        // Fragments already scheduled for highlighting, keyed by page and span.
        // Chunks may overlap, and without this the same fragment would receive
        // one annotation per covering chunk.
        let mut highlighted: std::collections::HashSet<(usize, usize, usize)> =
            std::collections::HashSet::new();
        for chunk in chunks {
            let start = chunk.metadata.position.start_char;
            let end = chunk.metadata.position.end_char;
            for frag in index.fragments_for_range(start, end) {
                // `insert` returns false when the key was already present.
                if !highlighted.insert((frag.page, frag.start_char, frag.end_char)) {
                    continue;
                }
                annotations_by_page
                    .entry(frag.page)
                    .or_default()
                    .push(frag.to_rectangle());
            }
        }

        let page_count = document
            .page_count()
            .map_err(|e| SourceHighlighterError::PageReconstructionFailed(e.to_string()))?;
        let mut output_doc = crate::document::Document::new();
        for page_idx in 0..page_count {
            let parsed_page = document
                .get_page(page_idx)
                .map_err(|e| SourceHighlighterError::PageReconstructionFailed(e.to_string()))?;
            let mut page = crate::page::Page::from_parsed_with_content(&parsed_page, &document)
                .map_err(|e| SourceHighlighterError::PageReconstructionFailed(e.to_string()))?;
            // The index keys pages by `usize`; convert the loop counter to match.
            if let Some(rects) = annotations_by_page.get(&(page_idx as usize)) {
                for rect in rects {
                    let highlight =
                        MarkupAnnotation::highlight(*rect).with_color(style.color.clone());
                    page.add_annotation(highlight.to_annotation());
                }
            }
            output_doc.add_page(page);
        }

        output_doc
            .to_bytes()
            .map_err(|e| SourceHighlighterError::WriteFailed(e.to_string()))
    }
}
// Unit tests for `TextPositionIndex` and `fragment_to_highlight_rect`.
// All offsets asserted below are global offsets as computed by `build`.
#[cfg(test)]
mod tests {
use super::*;
// Builds a `TextFragment` with the given text and box; the remaining fields
// are fixed filler values.
fn make_fragment(text: &str, x: f64, y: f64, width: f64, height: f64) -> TextFragment {
TextFragment {
text: text.to_string(),
x,
y,
width,
height,
font_size: 12.0,
font_name: None,
is_bold: false,
is_italic: false,
color: None,
space_decisions: Vec::new(),
}
}
// Builds a page whose text is the fragment texts joined by single spaces.
fn make_extracted(fragments: Vec<TextFragment>) -> ExtractedText {
let text = fragments
.iter()
.map(|f| f.text.as_str())
.collect::<Vec<_>>()
.join(" ");
ExtractedText { text, fragments }
}
// A single fragment covering the whole page text is indexed at 0..5.
#[test]
fn test_index_single_fragment() {
let page = ExtractedText {
text: "Hello".to_string(),
fragments: vec![make_fragment("Hello", 100.0, 700.0, 50.0, 12.0)],
};
let index = TextPositionIndex::build(&[page]);
assert_eq!(index.len(), 1);
let results = index.fragments_for_range(0, 5);
assert_eq!(results.len(), 1);
assert_eq!(results[0].start_char, 0);
assert_eq!(results[0].end_char, 5);
assert!((results[0].x - 100.0).abs() < 0.01);
}
// "World" occupies 6..11, so that query must return only the middle fragment.
#[test]
fn test_index_multiple_fragments() {
let page = ExtractedText {
text: "Hello World Test".to_string(),
fragments: vec![
make_fragment("Hello", 100.0, 700.0, 50.0, 12.0),
make_fragment("World", 160.0, 700.0, 55.0, 12.0),
make_fragment("Test", 225.0, 700.0, 40.0, 12.0),
],
};
let index = TextPositionIndex::build(&[page]);
assert_eq!(index.len(), 3);
let results = index.fragments_for_range(6, 11);
assert_eq!(results.len(), 1);
assert!((results[0].x - 160.0).abs() < 0.01);
}
// Page one spans 0..8 and, after the 2-byte separator, page two spans 10..18;
// the range 5..15 therefore overlaps a fragment on each page.
#[test]
fn test_index_cross_page() {
let page1 = ExtractedText {
text: "Page one".to_string(),
fragments: vec![make_fragment("Page one", 72.0, 700.0, 80.0, 12.0)],
};
let page2 = ExtractedText {
text: "Page two".to_string(),
fragments: vec![make_fragment("Page two", 72.0, 700.0, 80.0, 12.0)],
};
let index = TextPositionIndex::build(&[page1, page2]);
assert_eq!(index.len(), 2);
let results = index.fragments_for_range(5, 15);
assert_eq!(results.len(), 2);
assert_eq!(results[0].page, 0);
assert_eq!(results[1].page, 1);
}
// A range with start == end matches nothing, even inside a fragment.
#[test]
fn test_index_empty_range() {
let page = ExtractedText {
text: "Hello".to_string(),
fragments: vec![make_fragment("Hello", 100.0, 700.0, 50.0, 12.0)],
};
let index = TextPositionIndex::build(&[page]);
let results = index.fragments_for_range(2, 2);
assert!(results.is_empty(), "Empty range should return no results");
}
// Spans are half-open: a query starting at the fragment's end offset (5)
// does not overlap it.
#[test]
fn test_index_exact_boundary() {
let page = ExtractedText {
text: "Hello".to_string(),
fragments: vec![make_fragment("Hello", 100.0, 700.0, 50.0, 12.0)],
};
let index = TextPositionIndex::build(&[page]);
let results = index.fragments_for_range(0, 5);
assert_eq!(results.len(), 1);
let results = index.fragments_for_range(5, 10);
assert!(results.is_empty());
}
// A query entirely past the end of the indexed text returns nothing.
#[test]
fn test_index_no_overlap() {
let page = ExtractedText {
text: "Hello".to_string(),
fragments: vec![make_fragment("Hello", 100.0, 700.0, 50.0, 12.0)],
};
let index = TextPositionIndex::build(&[page]);
let results = index.fragments_for_range(100, 200);
assert!(
results.is_empty(),
"Query far beyond text should return nothing"
);
}
// The rectangle's corners are (x, y) and (x + width, y + height).
#[test]
fn test_fragment_to_highlight_rect_conversion() {
let frag = make_fragment("Test", 100.0, 500.0, 200.0, 15.0);
let rect = fragment_to_highlight_rect(&frag);
assert!((rect.lower_left.x - 100.0).abs() < 0.01);
assert!((rect.lower_left.y - 500.0).abs() < 0.01);
assert!((rect.upper_right.x - 300.0).abs() < 0.01);
assert!((rect.upper_right.y - 515.0).abs() < 0.01);
}
// Every fragment of a page built by `make_extracted` is found and tagged
// with page 0.
#[test]
fn test_index_build_from_extracted() {
let fragments = vec![
make_fragment("Alpha", 72.0, 750.0, 45.0, 12.0),
make_fragment("Beta", 130.0, 750.0, 35.0, 12.0),
make_fragment("Gamma", 180.0, 750.0, 50.0, 12.0),
];
let page = make_extracted(fragments);
let index = TextPositionIndex::build(&[page]);
assert_eq!(index.len(), 3);
for entry in index.entries() {
assert_eq!(entry.page, 0);
}
}
// A query covering the whole document returns fragments carrying the index
// of the page they came from.
#[test]
fn test_index_fragments_grouped_by_page() {
let page1 = ExtractedText {
text: "AAA".to_string(),
fragments: vec![make_fragment("AAA", 72.0, 700.0, 30.0, 12.0)],
};
let page2 = ExtractedText {
text: "BBB CCC".to_string(),
fragments: vec![
make_fragment("BBB", 72.0, 700.0, 30.0, 12.0),
make_fragment("CCC", 110.0, 700.0, 30.0, 12.0),
],
};
let index = TextPositionIndex::build(&[page1, page2]);
let all = index.fragments_for_range(0, 100);
assert_eq!(all.len(), 3);
let page0_frags: Vec<_> = all.iter().filter(|f| f.page == 0).collect();
let page1_frags: Vec<_> = all.iter().filter(|f| f.page == 1).collect();
assert_eq!(page0_frags.len(), 1);
assert_eq!(page1_frags.len(), 2);
}
// Each page is 5 bytes long and is followed by the 2-byte separator, so the
// pages start at 0, 7 and 14; a fourth page does not exist.
#[test]
fn test_index_page_offsets() {
let page1 = ExtractedText {
text: "ABCDE".to_string(), fragments: vec![make_fragment("ABCDE", 72.0, 700.0, 50.0, 12.0)],
};
let page2 = ExtractedText {
text: "FGHIJ".to_string(), fragments: vec![make_fragment("FGHIJ", 72.0, 700.0, 50.0, 12.0)],
};
let page3 = ExtractedText {
text: "KLMNO".to_string(), fragments: vec![make_fragment("KLMNO", 72.0, 700.0, 50.0, 12.0)],
};
let index = TextPositionIndex::build(&[page1, page2, page3]);
assert_eq!(index.page_offset(0), Some(0));
assert_eq!(index.page_offset(1), Some(7));
assert_eq!(index.page_offset(2), Some(14));
assert_eq!(index.page_offset(3), None);
}
// NOTE(review): despite the test name, the middle fragment below has EMPTY
// text (not whitespace). `build` skips empty fragments, so only "Hello" and
// "World" are indexed.
#[test]
fn test_index_whitespace_only_fragments() {
let page = ExtractedText {
text: "Hello World".to_string(),
fragments: vec![
make_fragment("Hello", 72.0, 700.0, 50.0, 12.0),
make_fragment("", 130.0, 700.0, 10.0, 12.0), make_fragment("World", 145.0, 700.0, 55.0, 12.0),
],
};
let index = TextPositionIndex::build(&[page]);
assert_eq!(index.len(), 2);
let results = index.fragments_for_range(0, 20);
assert_eq!(results.len(), 2);
}
// Entries come out in non-decreasing `start_char` order when the fragments
// appear in text order.
#[test]
fn test_index_preserves_order() {
let page = ExtractedText {
text: "AAA BBB CCC DDD".to_string(),
fragments: vec![
make_fragment("AAA", 72.0, 700.0, 30.0, 12.0),
make_fragment("BBB", 110.0, 700.0, 30.0, 12.0),
make_fragment("CCC", 150.0, 700.0, 30.0, 12.0),
make_fragment("DDD", 190.0, 700.0, 30.0, 12.0),
],
};
let index = TextPositionIndex::build(&[page]);
let entries = index.entries();
for i in 1..entries.len() {
assert!(
entries[i].start_char >= entries[i - 1].start_char,
"Entries should be ordered by start_char"
);
}
}
}