1pub mod char;
5pub mod chars;
6pub mod search;
7pub mod segment;
8pub mod segments;
9
10use crate::bindgen::{FPDF_TEXTPAGE, FPDF_WCHAR, FPDF_WIDESTRING};
11use crate::bindings::PdfiumLibraryBindings;
12use crate::error::PdfiumError;
13use crate::pdf::document::page::annotation::PdfPageAnnotation;
14use crate::pdf::document::page::annotation::PdfPageAnnotationCommon;
15use crate::pdf::document::page::object::private::internal::PdfPageObjectPrivate;
16use crate::pdf::document::page::object::text::PdfPageTextObject;
17use crate::pdf::document::page::text::chars::{PdfPageTextCharIndex, PdfPageTextChars};
18use crate::pdf::document::page::text::search::{PdfPageTextSearch, PdfSearchOptions};
19use crate::pdf::document::page::text::segments::PdfPageTextSegments;
20use crate::pdf::document::page::PdfPage;
21use crate::pdf::points::PdfPoints;
22use crate::pdf::rect::PdfRect;
23use crate::pdfium::PdfiumLibraryBindingsAccessor;
24use crate::utils::mem::{create_byte_buffer, create_sized_buffer};
25use crate::utils::utf16le::{
26 get_pdfium_utf16le_bytes_from_str, get_string_from_pdfium_utf16le_bytes,
27};
28use bytemuck::cast_slice;
29use std::fmt::{Display, Formatter};
30use std::marker::PhantomData;
31use std::os::raw::{c_double, c_int};
32use std::ptr::null_mut;
33
/// The text layer of a single [PdfPage], backed by a Pdfium `FPDF_TEXTPAGE` handle.
///
/// The handle is released with `FPDFText_ClosePage()` when this value is dropped.
pub struct PdfPageText<'a> {
    // Raw Pdfium text page handle; closed by this type's `Drop` implementation.
    text_page_handle: FPDF_TEXTPAGE,
    // The page this text belongs to; supplies the document and page handles
    // and the page size used by the methods on this type.
    page: &'a PdfPage<'a>,
    // Zero-sized marker carrying the `'a` lifetime alongside the raw handle.
    lifetime: PhantomData<&'a FPDF_TEXTPAGE>,
}
52
53impl<'a> PdfPageText<'a> {
54 pub(crate) fn from_pdfium(text_page_handle: FPDF_TEXTPAGE, page: &'a PdfPage<'a>) -> Self {
55 PdfPageText {
56 text_page_handle,
57 page,
58 lifetime: PhantomData,
59 }
60 }
61
62 #[inline]
64 pub(crate) fn text_page_handle(&self) -> FPDF_TEXTPAGE {
65 self.text_page_handle
66 }
67
68 #[inline]
73 pub fn len(&self) -> i32 {
74 unsafe { self.bindings().FPDFText_CountChars(self.text_page_handle()) }
75 }
76
77 #[inline]
79 pub fn is_empty(&self) -> bool {
80 self.len() == 0
81 }
82
83 #[inline]
85 pub fn segments(&self) -> PdfPageTextSegments<'_> {
86 PdfPageTextSegments::new(self, 0, self.len(), self.bindings())
87 }
88
89 #[inline]
92 pub fn segments_subset(
93 &self,
94 start: PdfPageTextCharIndex,
95 count: PdfPageTextCharIndex,
96 ) -> PdfPageTextSegments<'_> {
97 PdfPageTextSegments::new(self, start as i32, count as i32, self.bindings())
98 }
99
100 #[inline]
102 pub fn chars(&self) -> PdfPageTextChars<'_> {
103 PdfPageTextChars::new(
104 self.page.document_handle(),
105 self.page.page_handle(),
106 self.text_page_handle(),
107 (0..self.len()).collect(),
108 )
109 }
110
111 #[cfg(any(
112 feature = "pdfium_future",
113 feature = "pdfium_7763",
114 feature = "pdfium_7543",
115 feature = "pdfium_7350",
116 feature = "pdfium_7215",
117 feature = "pdfium_7123",
118 feature = "pdfium_6996",
119 feature = "pdfium_6721",
120 feature = "pdfium_6666",
121 feature = "pdfium_6611",
122 ))]
123 #[inline]
128 pub fn chars_for_object(
129 &self,
130 object: &PdfPageTextObject,
131 ) -> Result<PdfPageTextChars<'_>, PdfiumError> {
132 Ok(PdfPageTextChars::new(
133 self.page.document_handle(),
134 self.page.page_handle(),
135 self.text_page_handle(),
136 self.chars()
137 .iter()
138 .filter(|char| {
139 (unsafe {
140 self.bindings()
141 .FPDFText_GetTextObject(self.text_page_handle(), char.index() as i32)
142 }) == object.object_handle()
143 })
144 .map(|char| char.index() as i32)
145 .collect(),
146 ))
147 }
148
149 #[inline]
154 pub fn chars_for_annotation(
155 &self,
156 annotation: &PdfPageAnnotation,
157 ) -> Result<PdfPageTextChars<'_>, PdfiumError> {
158 self.chars_inside_rect(annotation.bounds()?)
159 .map_err(|_| PdfiumError::NoCharsInAnnotation)
160 }
161
162 #[inline]
165 pub fn chars_inside_rect<'b>(
166 &'b self,
167 rect: PdfRect,
168 ) -> Result<PdfPageTextChars<'a>, PdfiumError> {
169 let tolerance_x = rect.width() / 2.0;
170 let tolerance_y = rect.height() / 2.0;
171 let center_height = rect.bottom() + tolerance_y;
172
173 match (
174 Self::get_char_index_near_point(
175 self.text_page_handle(),
176 rect.left(),
177 tolerance_x,
178 center_height,
179 tolerance_y,
180 self.bindings(),
181 ),
182 Self::get_char_index_near_point(
183 self.text_page_handle(),
184 rect.right(),
185 tolerance_x,
186 center_height,
187 tolerance_y,
188 self.bindings(),
189 ),
190 ) {
191 (Some(start), Some(end)) => Ok(PdfPageTextChars::new(
192 self.page.document_handle(),
193 self.page.page_handle(),
194 self.text_page_handle(),
195 (start as i32..=end as i32 + 1).collect(),
196 )),
197 (Some(start), None) => Ok(PdfPageTextChars::new(
198 self.page.document_handle(),
199 self.page.page_handle(),
200 self.text_page_handle(),
201 (start as i32..=start as i32 + 1).collect(),
202 )),
203 (None, Some(end)) => Ok(PdfPageTextChars::new(
204 self.page.document_handle(),
205 self.page.page_handle(),
206 self.text_page_handle(),
207 (end as i32..=end as i32 + 1).collect(),
208 )),
209 _ => Err(PdfiumError::NoCharsInRect),
210 }
211 }
212
213 pub(crate) fn get_char_index_near_point(
217 text_page_handle: FPDF_TEXTPAGE,
218 x: PdfPoints,
219 tolerance_x: PdfPoints,
220 y: PdfPoints,
221 tolerance_y: PdfPoints,
222 bindings: &dyn PdfiumLibraryBindings,
223 ) -> Option<PdfPageTextCharIndex> {
224 match unsafe {
225 bindings.FPDFText_GetCharIndexAtPos(
226 text_page_handle,
227 x.value as c_double,
228 y.value as c_double,
229 tolerance_x.value as c_double,
230 tolerance_y.value as c_double,
231 )
232 } {
233 -1 => None, -3 => None, index => Some(index as PdfPageTextCharIndex),
236 }
237 }
238
239 pub fn all(&self) -> String {
246 self.inside_rect(self.page.page_size())
247 }
248
249 pub fn inside_rect(&self, rect: PdfRect) -> String {
257 let left = rect.left().value as f64;
267
268 let top = rect.top().value as f64;
269
270 let right = rect.right().value as f64;
271
272 let bottom = rect.bottom().value as f64;
273
274 let chars_count = unsafe {
275 self.bindings().FPDFText_GetBoundedText(
276 self.text_page_handle(),
277 left,
278 top,
279 right,
280 bottom,
281 null_mut(),
282 0,
283 )
284 };
285
286 if chars_count == 0 {
287 return String::new();
290 }
291
292 let mut buffer = create_sized_buffer(chars_count as usize);
293
294 let result = unsafe {
295 self.bindings().FPDFText_GetBoundedText(
296 self.text_page_handle(),
297 left,
298 top,
299 right,
300 bottom,
301 buffer.as_mut_ptr(),
302 chars_count,
303 )
304 };
305
306 assert_eq!(result, chars_count);
307
308 get_string_from_pdfium_utf16le_bytes(cast_slice(buffer.as_slice()).to_vec())
309 .unwrap_or_default()
310 }
311
312 pub fn for_object(&self, object: &PdfPageTextObject) -> String {
315 let buffer_length = unsafe {
325 self.bindings().FPDFTextObj_GetText(
326 object.object_handle(),
327 self.text_page_handle(),
328 null_mut(),
329 0,
330 )
331 };
332
333 if buffer_length == 0 {
334 return String::new();
337 }
338
339 let mut buffer = create_byte_buffer(buffer_length as usize);
340
341 let result = unsafe {
342 self.bindings().FPDFTextObj_GetText(
343 object.object_handle(),
344 self.text_page_handle(),
345 buffer.as_mut_ptr() as *mut FPDF_WCHAR,
346 buffer_length,
347 )
348 };
349
350 assert_eq!(result, buffer_length);
351
352 get_string_from_pdfium_utf16le_bytes(buffer).unwrap_or_default()
353 }
354
355 #[inline]
363 pub fn for_annotation(&self, annotation: &PdfPageAnnotation) -> Result<String, PdfiumError> {
364 let bounds = annotation.bounds()?;
365
366 Ok(self.inside_rect(bounds))
367 }
368
369 #[inline]
372 pub fn search(
373 &self,
374 text: &str,
375 options: &PdfSearchOptions,
376 ) -> Result<PdfPageTextSearch<'_>, PdfiumError> {
377 self.search_from(text, options, 0)
378 }
379
380 pub fn search_from(
384 &self,
385 text: &str,
386 options: &PdfSearchOptions,
387 index: PdfPageTextCharIndex,
388 ) -> Result<PdfPageTextSearch<'_>, PdfiumError> {
389 if text.is_empty() {
390 Err(PdfiumError::TextSearchTargetIsEmpty)
391 } else {
392 Ok(PdfPageTextSearch::from_pdfium(
393 unsafe {
394 self.bindings().FPDFText_FindStart(
395 self.text_page_handle(),
396 get_pdfium_utf16le_bytes_from_str(text).as_ptr() as FPDF_WIDESTRING,
397 options.as_pdfium(),
398 index as c_int,
399 )
400 },
401 self,
402 ))
403 }
404 }
405}
406
407impl<'a> Display for PdfPageText<'a> {
408 #[inline]
409 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
410 f.write_str(self.all().as_str())
411 }
412}
413
414impl<'a> Drop for PdfPageText<'a> {
415 #[inline]
417 fn drop(&mut self) {
418 unsafe {
419 self.bindings().FPDFText_ClosePage(self.text_page_handle());
420 }
421 }
422}
423
// Opts into the accessor trait with no overrides; the methods on `PdfPageText`
// reach the Pdfium bindings through `self.bindings()`, which is not defined in this
// file and presumably comes from this trait. TODO confirm against the trait definition.
impl<'a> PdfiumLibraryBindingsAccessor<'a> for PdfPageText<'a> {}
425
// `PdfPageText` holds a raw Pdfium handle, so these marker traits must be opted
// into manually. They are only provided when the `thread_safe` feature is enabled.
// NOTE(review): soundness presumably relies on that feature serializing access to
// Pdfium — confirm against the bindings implementation.
#[cfg(feature = "thread_safe")]
unsafe impl<'a> Send for PdfPageText<'a> {}

#[cfg(feature = "thread_safe")]
unsafe impl<'a> Sync for PdfPageText<'a> {}
431
#[cfg(test)]
mod tests {
    use itertools::Itertools;
    use std::ffi::OsStr;
    use std::fs;

    use crate::prelude::*;
    use crate::utils::test::test_bind_to_pdfium;

    /// Creates three text objects at the same position on a fresh page and checks that
    /// the text and characters reported for each object are that object's own.
    #[test]
    fn test_overlapping_chars_results() -> Result<(), PdfiumError> {
        let pdfium = test_bind_to_pdfium();

        let mut document = pdfium.create_new_pdf()?;

        let mut page = document
            .pages_mut()
            .create_page_at_start(PdfPagePaperSize::a4())?;

        let font = document.fonts_mut().courier();
        let font_size = PdfPoints::new(10.0);

        // All three objects share the same origin, so their characters overlap.
        let txt1 = page.objects_mut().create_text_object(
            PdfPoints::ZERO,
            PdfPoints::ZERO,
            "AAAAAA",
            font,
            font_size,
        )?;

        let txt2 = page.objects_mut().create_text_object(
            PdfPoints::ZERO,
            PdfPoints::ZERO,
            "BBBBBB",
            font,
            font_size,
        )?;

        let txt3 = page.objects_mut().create_text_object(
            PdfPoints::ZERO,
            PdfPoints::ZERO,
            "CDCDCDE",
            font,
            font_size,
        )?;

        let page_text = page.text()?;

        let cases = [(&txt1, "AAAAAA"), (&txt2, "BBBBBB"), (&txt3, "CDCDCDE")];

        for (object, expected) in cases.iter() {
            assert!(test_one_overlapping_text_object_results(
                *object, &page_text, *expected
            )?);
        }

        Ok(())
    }

    /// Checks one object against its expected text. Returns `Ok(false)` if the object
    /// is not a text object, and `Ok(true)` once every assertion has passed.
    fn test_one_overlapping_text_object_results(
        object: &PdfPageObject,
        page_text: &PdfPageText,
        expected: &str,
    ) -> Result<bool, PdfiumError> {
        let txt = match object.as_text_object() {
            Some(txt) => txt,
            None => return Ok(false),
        };

        assert_eq!(txt.text().trim(), expected);
        assert_eq!(page_text.for_object(txt).trim(), expected);

        // Each character must match both the object's own text and the expected text.
        for (index, char) in txt.chars(&page_text)?.iter().enumerate() {
            assert_eq!(txt.text().chars().nth(index), char.unicode_char());
            assert_eq!(expected.chars().nth(index), char.unicode_char());
        }

        Ok(true)
    }

    /// For every PDF file in `./test/`, checks that the characters reported for each
    /// text object concatenate to that object's text.
    #[test]
    fn test_text_chars_results_equality() -> Result<(), PdfiumError> {
        let pdfium = test_bind_to_pdfium();

        let pdf_extension = Some(OsStr::new("pdf"));

        let samples: Vec<_> = fs::read_dir("./test/")
            .unwrap()
            .filter_map(|entry| entry.ok().map(|e| e.path()))
            .filter(|path| path.extension() == pdf_extension)
            .collect();

        assert!(samples.len() > 0);

        for sample in samples {
            println!("Testing all text objects in file {}", sample.display());

            let document = pdfium.load_pdf_from_file(&sample, None)?;

            for page in document.pages().iter() {
                let text = page.text()?;

                for object in page.objects().iter() {
                    if let Some(obj) = object.as_text_object() {
                        let joined = obj
                            .chars(&text)?
                            .iter()
                            .filter_map(|char| char.unicode_string())
                            .join("");

                        // NUL characters are stripped before comparing.
                        let cleaned = joined.replace("\0", "");

                        assert_eq!(obj.text().trim(), cleaned.trim());
                    }
                }
            }
        }

        Ok(())
    }

    /// Compile-time check that a single character can outlive both the segment and
    /// the characters collection it was obtained from.
    #[test]
    fn test_text_segment_chars_char_lifetimes() -> Result<(), PdfiumError> {
        let pdfium = test_bind_to_pdfium();
        let document = pdfium.load_pdf_from_file("./test/export-test.pdf", None)?;
        let page = document.pages().first()?;
        let text = page.text()?;

        let _char = {
            let chars = {
                let segment = text.segments().first()?;

                segment.chars()?
            }; // The segment has gone out of scope here.

            chars.first()?
        }; // The characters collection has gone out of scope here.

        Ok(())
    }
}