use crate::document::PdfDocument;
use crate::error::{Error, Result};
use crate::geometry::Rect;
use crate::layout::TextSpan;
use regex::{Regex, RegexBuilder};
#[derive(Debug, Clone, serde::Serialize)]
pub struct SearchResult {
pub page: usize,
pub text: String,
pub bbox: Rect,
pub start_index: usize,
pub end_index: usize,
pub span_boxes: Vec<Rect>,
}
#[derive(Debug, Clone, Default, serde::Deserialize)]
pub struct SearchOptions {
pub case_insensitive: bool,
pub literal: bool,
pub whole_word: bool,
pub max_results: usize,
pub page_range: Option<(usize, usize)>,
}
impl SearchOptions {
pub fn new() -> Self {
Self::default()
}
pub fn case_insensitive() -> Self {
Self {
case_insensitive: true,
..Default::default()
}
}
pub fn with_case_insensitive(mut self, value: bool) -> Self {
self.case_insensitive = value;
self
}
pub fn with_literal(mut self, value: bool) -> Self {
self.literal = value;
self
}
pub fn with_whole_word(mut self, value: bool) -> Self {
self.whole_word = value;
self
}
pub fn with_max_results(mut self, max: usize) -> Self {
self.max_results = max;
self
}
pub fn with_page_range(mut self, start: usize, end: usize) -> Self {
self.page_range = Some((start, end));
self
}
}
pub struct TextSearcher;
impl TextSearcher {
pub fn search(
doc: &mut PdfDocument,
pattern: &str,
options: &SearchOptions,
) -> Result<Vec<SearchResult>> {
let regex = Self::build_regex(pattern, options)?;
let page_count = doc.page_count()?;
let (start_page, end_page) = options
.page_range
.unwrap_or((0, page_count.saturating_sub(1)));
let end_page = end_page.min(page_count.saturating_sub(1));
let mut results = Vec::new();
for page in start_page..=end_page {
let page_results = Self::search_page(doc, page, ®ex, options)?;
results.extend(page_results);
if options.max_results > 0 && results.len() >= options.max_results {
results.truncate(options.max_results);
break;
}
}
Ok(results)
}
pub fn search_page(
doc: &mut PdfDocument,
page: usize,
regex: &Regex,
options: &SearchOptions,
) -> Result<Vec<SearchResult>> {
let spans = doc.extract_spans(page)?;
let (full_text, span_positions) = Self::build_text_with_positions(&spans);
let mut results = Vec::new();
for mat in regex.find_iter(&full_text) {
let start = mat.start();
let end = mat.end();
let matched_text = mat.as_str().to_string();
let (bbox, span_boxes) = Self::compute_match_bbox(start, end, &spans, &span_positions);
results.push(SearchResult {
page,
text: matched_text,
bbox,
start_index: start,
end_index: end,
span_boxes,
});
if options.max_results > 0 && results.len() >= options.max_results {
break;
}
}
Ok(results)
}
fn build_regex(pattern: &str, options: &SearchOptions) -> Result<Regex> {
let mut pattern_str = if options.literal {
regex::escape(pattern)
} else {
pattern.to_string()
};
if options.whole_word {
pattern_str = format!(r"\b{}\b", pattern_str);
}
RegexBuilder::new(&pattern_str)
.case_insensitive(options.case_insensitive)
.build()
.map_err(|e| Error::InvalidPdf(format!("Invalid regex pattern: {}", e)))
}
fn build_text_with_positions(spans: &[TextSpan]) -> (String, Vec<(usize, usize, usize)>) {
let mut full_text = String::new();
let mut positions = Vec::new();
for (idx, span) in spans.iter().enumerate() {
let start = full_text.len();
full_text.push_str(&span.text);
let end = full_text.len();
positions.push((start, end, idx));
if idx < spans.len() - 1 && !span.text.ends_with(' ') {
full_text.push(' ');
}
}
(full_text, positions)
}
fn compute_match_bbox(
match_start: usize,
match_end: usize,
spans: &[TextSpan],
span_positions: &[(usize, usize, usize)],
) -> (Rect, Vec<Rect>) {
let mut span_boxes = Vec::new();
let mut combined_bbox: Option<Rect> = None;
for &(span_start, span_end, span_idx) in span_positions {
if span_start < match_end && span_end > match_start {
let span = &spans[span_idx];
span_boxes.push(span.bbox);
if let Some(ref mut bbox) = combined_bbox {
*bbox = bbox.union(&span.bbox);
} else {
combined_bbox = Some(span.bbox);
}
}
}
(combined_bbox.unwrap_or_else(|| Rect::new(0.0, 0.0, 0.0, 0.0)), span_boxes)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_search_options_default() {
let opts = SearchOptions::default();
assert!(!opts.case_insensitive);
assert!(!opts.literal);
assert!(!opts.whole_word);
assert_eq!(opts.max_results, 0);
assert!(opts.page_range.is_none());
}
#[test]
fn test_search_options_builder() {
let opts = SearchOptions::new()
.with_case_insensitive(true)
.with_literal(true)
.with_whole_word(true)
.with_max_results(10)
.with_page_range(0, 5);
assert!(opts.case_insensitive);
assert!(opts.literal);
assert!(opts.whole_word);
assert_eq!(opts.max_results, 10);
assert_eq!(opts.page_range, Some((0, 5)));
}
#[test]
fn test_build_regex_simple() {
let opts = SearchOptions::default();
let regex = TextSearcher::build_regex("hello", &opts).unwrap();
assert!(regex.is_match("hello world"));
assert!(!regex.is_match("HELLO world"));
}
#[test]
fn test_build_regex_case_insensitive() {
let opts = SearchOptions::case_insensitive();
let regex = TextSearcher::build_regex("hello", &opts).unwrap();
assert!(regex.is_match("hello world"));
assert!(regex.is_match("HELLO world"));
assert!(regex.is_match("HeLLo world"));
}
#[test]
fn test_build_regex_literal() {
let opts = SearchOptions::new().with_literal(true);
let regex = TextSearcher::build_regex("a.b", &opts).unwrap();
assert!(regex.is_match("a.b"));
assert!(!regex.is_match("axb")); }
#[test]
fn test_build_regex_whole_word() {
let opts = SearchOptions::new().with_whole_word(true);
let regex = TextSearcher::build_regex("cat", &opts).unwrap();
assert!(regex.is_match("the cat sat"));
assert!(!regex.is_match("category"));
assert!(!regex.is_match("concatenate"));
}
#[test]
fn test_build_text_with_positions() {
let spans = vec![
TextSpan {
artifact_type: None,
text: "Hello".to_string(),
bbox: Rect::new(0.0, 0.0, 50.0, 12.0),
font_name: "Arial".to_string(),
font_size: 12.0,
font_weight: crate::layout::FontWeight::Normal,
is_italic: false,
is_monospace: false,
color: crate::layout::Color {
r: 0.0,
g: 0.0,
b: 0.0,
},
mcid: None,
sequence: 0,
split_boundary_before: false,
offset_semantic: false,
char_spacing: 0.0,
word_spacing: 0.0,
horizontal_scaling: 100.0,
primary_detected: false,
char_widths: vec![],
},
TextSpan {
artifact_type: None,
text: "World".to_string(),
bbox: Rect::new(55.0, 0.0, 105.0, 12.0),
font_name: "Arial".to_string(),
font_size: 12.0,
font_weight: crate::layout::FontWeight::Normal,
is_italic: false,
is_monospace: false,
color: crate::layout::Color {
r: 0.0,
g: 0.0,
b: 0.0,
},
mcid: None,
sequence: 1,
split_boundary_before: false,
offset_semantic: false,
char_spacing: 0.0,
word_spacing: 0.0,
horizontal_scaling: 100.0,
primary_detected: false,
char_widths: vec![],
},
];
let (text, positions) = TextSearcher::build_text_with_positions(&spans);
assert_eq!(text, "Hello World");
assert_eq!(positions.len(), 2);
assert_eq!(positions[0], (0, 5, 0)); assert_eq!(positions[1], (6, 11, 1)); }
}