pub mod char;
pub mod chars;
pub mod search;
pub mod segment;
pub mod segments;
use crate::bindgen::{FPDF_TEXTPAGE, FPDF_WCHAR, FPDF_WIDESTRING};
use crate::bindings::PdfiumLibraryBindings;
use crate::error::PdfiumError;
use crate::pdf::document::page::annotation::PdfPageAnnotation;
use crate::pdf::document::page::annotation::PdfPageAnnotationCommon;
use crate::pdf::document::page::object::private::internal::PdfPageObjectPrivate;
use crate::pdf::document::page::object::text::PdfPageTextObject;
use crate::pdf::document::page::text::chars::{PdfPageTextCharIndex, PdfPageTextChars};
use crate::pdf::document::page::text::search::{PdfPageTextSearch, PdfSearchOptions};
use crate::pdf::document::page::text::segments::PdfPageTextSegments;
use crate::pdf::document::page::PdfPage;
use crate::pdf::points::PdfPoints;
use crate::pdf::rect::PdfRect;
use crate::pdfium::PdfiumLibraryBindingsAccessor;
use crate::utils::mem::{create_byte_buffer, create_sized_buffer};
use crate::utils::utf16le::{
get_pdfium_utf16le_bytes_from_str, get_string_from_pdfium_utf16le_bytes,
};
use bytemuck::cast_slice;
use std::fmt::{Display, Formatter};
use std::marker::PhantomData;
use std::os::raw::{c_double, c_int};
use std::ptr::null_mut;
/// Text-extraction view over a single [PdfPage], backed by a Pdfium `FPDF_TEXTPAGE` handle.
///
/// The handle is released with `FPDFText_ClosePage()` when this value is dropped.
pub struct PdfPageText<'a> {
// Pdfium handle of the text page; closed in the `Drop` implementation.
text_page_handle: FPDF_TEXTPAGE,
// The page this text belongs to; supplies the document and page handles
// passed on to character collections.
page: &'a PdfPage<'a>,
// Zero-sized marker tying the handle to the `'a` lifetime.
lifetime: PhantomData<&'a FPDF_TEXTPAGE>,
}
impl<'a> PdfPageText<'a> {
pub(crate) fn from_pdfium(text_page_handle: FPDF_TEXTPAGE, page: &'a PdfPage<'a>) -> Self {
PdfPageText {
text_page_handle,
page,
lifetime: PhantomData,
}
}
/// Returns the internal `FPDF_TEXTPAGE` handle held by this [PdfPageText].
#[inline]
pub(crate) fn text_page_handle(&self) -> FPDF_TEXTPAGE {
self.text_page_handle
}
/// Returns the character count that `FPDFText_CountChars()` reports for this text page.
///
/// The value is passed through from Pdfium without any adjustment.
#[inline]
pub fn len(&self) -> i32 {
    let handle = self.text_page_handle();

    unsafe { self.bindings().FPDFText_CountChars(handle) }
}
/// Returns `true` if this text page contains no characters.
///
/// [PdfPageText::len] forwards Pdfium's raw count, which the Pdfium API documents
/// as -1 when an error occurs. Any non-positive count is therefore treated as
/// empty, rather than only an exact zero.
#[inline]
pub fn is_empty(&self) -> bool {
    self.len() <= 0
}
/// Returns a [PdfPageTextSegments] collection spanning every character on this text
/// page: it starts at index 0 and covers [PdfPageText::len] characters.
#[inline]
pub fn segments(&self) -> PdfPageTextSegments<'_> {
    let char_count = self.len();

    PdfPageTextSegments::new(self, 0, char_count, self.bindings())
}
/// Returns a [PdfPageTextSegments] collection restricted to `count` characters
/// beginning at character index `start`.
///
/// Both values are cast to `i32` before being handed to the collection.
#[inline]
pub fn segments_subset(
    &self,
    start: PdfPageTextCharIndex,
    count: PdfPageTextCharIndex,
) -> PdfPageTextSegments<'_> {
    let (first, length) = (start as i32, count as i32);

    PdfPageTextSegments::new(self, first, length, self.bindings())
}
/// Returns a [PdfPageTextChars] collection covering every character index on this
/// text page, from 0 up to (but excluding) [PdfPageText::len].
#[inline]
pub fn chars(&self) -> PdfPageTextChars<'_> {
    let page = self.page;
    let every_index = (0..self.len()).collect();

    PdfPageTextChars::new(
        page.document_handle(),
        page.page_handle(),
        self.text_page_handle(),
        every_index,
    )
}
/// Returns the characters on this text page that Pdfium attributes to the given
/// [PdfPageTextObject].
///
/// Every character of the page is visited, and its owning text object, as reported by
/// `FPDFText_GetTextObject()`, is compared against the handle of `object`.
/// This function currently always returns `Ok`.
#[cfg(any(
    feature = "pdfium_future",
    feature = "pdfium_7543",
    feature = "pdfium_7350",
    feature = "pdfium_7215",
    feature = "pdfium_7123",
    feature = "pdfium_6996",
    feature = "pdfium_6721",
    feature = "pdfium_6666",
    feature = "pdfium_6611",
))]
#[inline]
pub fn chars_for_object(
    &self,
    object: &PdfPageTextObject,
) -> Result<PdfPageTextChars<'_>, PdfiumError> {
    let text_page = self.text_page_handle();

    // Keep only the indices of the characters owned by the target text object.
    let owned_indices = self
        .chars()
        .iter()
        .filter_map(|char| {
            let index = char.index() as i32;
            let owner = unsafe {
                self.bindings()
                    .FPDFText_GetTextObject(text_page, index)
            };

            if owner == object.object_handle() {
                Some(index)
            } else {
                None
            }
        })
        .collect();

    Ok(PdfPageTextChars::new(
        self.page.document_handle(),
        self.page.page_handle(),
        text_page,
        owned_indices,
    ))
}
/// Returns the characters that lie within the bounds of the given [PdfPageAnnotation].
///
/// # Errors
///
/// An error from retrieving the annotation bounds is propagated as-is. Any error
/// from [PdfPageText::chars_inside_rect] is replaced with
/// [PdfiumError::NoCharsInAnnotation].
#[inline]
pub fn chars_for_annotation(
    &self,
    annotation: &PdfPageAnnotation,
) -> Result<PdfPageTextChars<'_>, PdfiumError> {
    let bounds = annotation.bounds()?;

    match self.chars_inside_rect(bounds) {
        Ok(chars) => Ok(chars),
        Err(_) => Err(PdfiumError::NoCharsInAnnotation),
    }
}
#[inline]
pub fn chars_inside_rect(&self, rect: PdfRect) -> Result<PdfPageTextChars<'_>, PdfiumError> {
let tolerance_x = rect.width() / 2.0;
let tolerance_y = rect.height() / 2.0;
let center_height = rect.bottom() + tolerance_y;
match (
Self::get_char_index_near_point(
self.text_page_handle(),
rect.left(),
tolerance_x,
center_height,
tolerance_y,
self.bindings(),
),
Self::get_char_index_near_point(
self.text_page_handle(),
rect.right(),
tolerance_x,
center_height,
tolerance_y,
self.bindings(),
),
) {
(Some(start), Some(end)) => Ok(PdfPageTextChars::new(
self.page.document_handle(),
self.page.page_handle(),
self.text_page_handle(),
(start as i32..=end as i32 + 1).collect(),
)),
(Some(start), None) => Ok(PdfPageTextChars::new(
self.page.document_handle(),
self.page.page_handle(),
self.text_page_handle(),
(start as i32..=start as i32 + 1).collect(),
)),
(None, Some(end)) => Ok(PdfPageTextChars::new(
self.page.document_handle(),
self.page.page_handle(),
self.text_page_handle(),
(end as i32..=end as i32 + 1).collect(),
)),
_ => Err(PdfiumError::NoCharsInRect),
}
}
/// Looks up the index of the character at or near the point (`x`, `y`) on the given
/// text page, using `FPDFText_GetCharIndexAtPos()` with the supplied tolerances.
///
/// Returns `None` when Pdfium answers with -1 or -3. The Pdfium API documentation
/// describes these as "no character within tolerance" and "an error occurred"
/// respectively. Any other value is returned as the character index.
pub(crate) fn get_char_index_near_point(
    text_page_handle: FPDF_TEXTPAGE,
    x: PdfPoints,
    tolerance_x: PdfPoints,
    y: PdfPoints,
    tolerance_y: PdfPoints,
    bindings: &dyn PdfiumLibraryBindings,
) -> Option<PdfPageTextCharIndex> {
    // Pdfium expects the position first (x, y), followed by the two tolerances.
    let found = unsafe {
        bindings.FPDFText_GetCharIndexAtPos(
            text_page_handle,
            x.value as c_double,
            y.value as c_double,
            tolerance_x.value as c_double,
            tolerance_y.value as c_double,
        )
    };

    match found {
        -1 | -3 => None,
        index => Some(index as PdfPageTextCharIndex),
    }
}
/// Returns the text found inside the rectangle given by the containing page's
/// `page_size()`, as extracted by [PdfPageText::inside_rect].
pub fn all(&self) -> String {
    let page_bounds = self.page.page_size();

    self.inside_rect(page_bounds)
}
/// Returns the text that `FPDFText_GetBoundedText()` reports inside the given rectangle.
///
/// Pdfium is called twice: first with a null buffer to learn how many characters are
/// available, then with a buffer of exactly that size to receive them. An empty string
/// is returned if no characters are available, or if the retrieved data cannot be
/// converted into a string.
///
/// # Panics
///
/// Panics if the second call reports a different character count than the first.
pub fn inside_rect(&self, rect: PdfRect) -> String {
    let (left, top, right, bottom) = (
        rect.left().value as f64,
        rect.top().value as f64,
        rect.right().value as f64,
        rect.bottom().value as f64,
    );
    let text_page = self.text_page_handle();

    // Sizing pass: a null buffer makes Pdfium report the number of characters.
    let required = unsafe {
        self.bindings().FPDFText_GetBoundedText(
            text_page,
            left,
            top,
            right,
            bottom,
            null_mut(),
            0,
        )
    };

    if required == 0 {
        return String::new();
    }

    // Retrieval pass: fill a buffer of exactly the reported size.
    let mut utf16 = create_sized_buffer(required as usize);
    let written = unsafe {
        self.bindings().FPDFText_GetBoundedText(
            text_page,
            left,
            top,
            right,
            bottom,
            utf16.as_mut_ptr(),
            required,
        )
    };

    assert_eq!(written, required);

    get_string_from_pdfium_utf16le_bytes(cast_slice(utf16.as_slice()).to_vec())
        .unwrap_or_default()
}
/// Returns the text that `FPDFTextObj_GetText()` reports for the given
/// [PdfPageTextObject] on this text page.
///
/// Pdfium is called twice: first with a null buffer to learn the required buffer
/// length, then with a byte buffer of exactly that length. An empty string is returned
/// if the required length is zero, or if the retrieved bytes cannot be converted into
/// a string.
///
/// # Panics
///
/// Panics if the second call reports a different length than the first.
pub fn for_object(&self, object: &PdfPageTextObject) -> String {
    let text_page = self.text_page_handle();

    // Sizing pass: a null buffer makes Pdfium report the length it needs.
    let required = unsafe {
        self.bindings().FPDFTextObj_GetText(
            object.object_handle(),
            text_page,
            null_mut(),
            0,
        )
    };

    if required == 0 {
        return String::new();
    }

    // Retrieval pass: the byte buffer is handed to Pdfium as a wide-char buffer.
    let mut bytes = create_byte_buffer(required as usize);
    let written = unsafe {
        self.bindings().FPDFTextObj_GetText(
            object.object_handle(),
            text_page,
            bytes.as_mut_ptr() as *mut FPDF_WCHAR,
            required,
        )
    };

    assert_eq!(written, required);

    get_string_from_pdfium_utf16le_bytes(bytes).unwrap_or_default()
}
/// Returns the text found inside the bounds of the given [PdfPageAnnotation], as
/// extracted by [PdfPageText::inside_rect].
///
/// # Errors
///
/// Propagates any error raised while retrieving the annotation bounds.
#[inline]
pub fn for_annotation(&self, annotation: &PdfPageAnnotation) -> Result<String, PdfiumError> {
    Ok(self.inside_rect(annotation.bounds()?))
}
#[inline]
pub fn search(
&self,
text: &str,
options: &PdfSearchOptions,
) -> Result<PdfPageTextSearch<'_>, PdfiumError> {
self.search_from(text, options, 0)
}
/// Starts a search for `text` on this text page, beginning at the given character
/// index, using `FPDFText_FindStart()`.
///
/// The search target is converted to UTF-16LE bytes before being passed to Pdfium,
/// and `index` is cast to a C `int`.
///
/// # Errors
///
/// Returns [PdfiumError::TextSearchTargetIsEmpty] if `text` is empty.
pub fn search_from(
    &self,
    text: &str,
    options: &PdfSearchOptions,
    index: PdfPageTextCharIndex,
) -> Result<PdfPageTextSearch<'_>, PdfiumError> {
    if text.is_empty() {
        return Err(PdfiumError::TextSearchTargetIsEmpty);
    }

    // Held in a named local so the buffer clearly outlives the FFI call that reads it.
    let needle = get_pdfium_utf16le_bytes_from_str(text);

    let search_handle = unsafe {
        self.bindings().FPDFText_FindStart(
            self.text_page_handle(),
            needle.as_ptr() as FPDF_WIDESTRING,
            options.as_pdfium(),
            index as c_int,
        )
    };

    Ok(PdfPageTextSearch::from_pdfium(search_handle, self))
}
}
impl<'a> Display for PdfPageText<'a> {
    /// Writes the string returned by [PdfPageText::all] to the formatter.
    #[inline]
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let text = self.all();

        f.write_str(&text)
    }
}
impl<'a> Drop for PdfPageText<'a> {
    /// Closes the underlying `FPDF_TEXTPAGE` handle by calling `FPDFText_ClosePage()`.
    #[inline]
    fn drop(&mut self) {
        let handle = self.text_page_handle();

        unsafe {
            self.bindings().FPDFText_ClosePage(handle);
        }
    }
}
// Empty impl: relies entirely on the trait's provided items.
// NOTE(review): presumably this is what supplies the `self.bindings()` calls used
// throughout this type; confirm against the trait definition.
impl<'a> PdfiumLibraryBindingsAccessor<'a> for PdfPageText<'a> {}
// Only compiled when the `thread_safe` feature is enabled.
// NOTE(review): this type wraps a Pdfium handle, so soundness presumably depends on
// access to Pdfium being serialized elsewhere when the feature is on; confirm.
#[cfg(feature = "thread_safe")]
unsafe impl<'a> Send for PdfPageText<'a> {}
// Only compiled when the `thread_safe` feature is enabled.
// NOTE(review): shared access from several threads presumably relies on Pdfium calls
// being serialized elsewhere when the feature is on; confirm.
#[cfg(feature = "thread_safe")]
unsafe impl<'a> Sync for PdfPageText<'a> {}
#[cfg(test)]
mod tests {
use itertools::Itertools;
use std::ffi::OsStr;
use std::fs;
use crate::prelude::*;
use crate::utils::test::test_bind_to_pdfium;
#[test]
fn test_overlapping_chars_results() -> Result<(), PdfiumError> {
    // Several text objects are created at exactly the same position. Each one must
    // still report its own text and characters when queried through the page text.
    const SAMPLES: [&str; 3] = ["AAAAAA", "BBBBBB", "CDCDCDE"];

    let pdfium = test_bind_to_pdfium();
    let mut document = pdfium.create_new_pdf()?;
    let mut page = document
        .pages_mut()
        .create_page_at_start(PdfPagePaperSize::a4())?;
    let font = document.fonts_mut().courier();

    let mut objects = Vec::with_capacity(SAMPLES.len());

    for sample in SAMPLES.iter() {
        objects.push(page.objects_mut().create_text_object(
            PdfPoints::ZERO,
            PdfPoints::ZERO,
            *sample,
            font,
            PdfPoints::new(10.0),
        )?);
    }

    let page_text = page.text()?;

    for (object, expected) in objects.iter().zip(SAMPLES.iter()) {
        assert!(test_one_overlapping_text_object_results(
            object, &page_text, *expected
        )?);
    }

    Ok(())
}
// Checks that a single text object yields the expected text, both directly and through
// the page text, and that its characters match the expected string one by one.
// Returns Ok(false) if the given object is not a text object.
fn test_one_overlapping_text_object_results(
    object: &PdfPageObject,
    page_text: &PdfPageText,
    expected: &str,
) -> Result<bool, PdfiumError> {
    let txt = match object.as_text_object() {
        Some(txt) => txt,
        None => return Ok(false),
    };

    assert_eq!(txt.text().trim(), expected);
    assert_eq!(page_text.for_object(txt).trim(), expected);

    for (index, char) in txt.chars(&page_text)?.iter().enumerate() {
        let actual = char.unicode_char();

        assert_eq!(txt.text().chars().nth(index), actual);
        assert_eq!(expected.chars().nth(index), actual);
    }

    Ok(true)
}
#[test]
fn test_text_chars_results_equality() -> Result<(), PdfiumError> {
    // For every PDF in the test directory, the text of each text object must equal
    // the concatenation of the characters retrieved for that object.
    let pdfium = test_bind_to_pdfium();

    let samples: Vec<_> = fs::read_dir("./test/")
        .unwrap()
        .filter_map(|entry| entry.ok().map(|e| e.path()))
        .filter(|path| path.extension() == Some(OsStr::new("pdf")))
        .collect();

    assert!(!samples.is_empty());

    for sample in samples {
        println!("Testing all text objects in file {}", sample.display());

        let document = pdfium.load_pdf_from_file(&sample, None)?;

        for page in document.pages().iter() {
            let text = page.text()?;

            for object in page.objects().iter() {
                let obj = match object.as_text_object() {
                    Some(obj) => obj,
                    None => continue,
                };

                let chars = obj
                    .chars(&text)?
                    .iter()
                    .filter_map(|char| char.unicode_string())
                    .join("");

                assert_eq!(obj.text().trim(), chars.replace("\0", "").trim());
            }
        }
    }

    Ok(())
}
}