1pub mod char;
5pub mod chars;
6pub mod search;
7pub mod segment;
8pub mod segments;
9
10use crate::bindgen::{FPDF_TEXTPAGE, FPDF_WCHAR, FPDF_WIDESTRING};
11use crate::bindings::PdfiumLibraryBindings;
12use crate::error::PdfiumError;
13use crate::pdf::document::page::annotation::PdfPageAnnotation;
14use crate::pdf::document::page::annotation::PdfPageAnnotationCommon;
15use crate::pdf::document::page::object::private::internal::PdfPageObjectPrivate;
16use crate::pdf::document::page::object::text::PdfPageTextObject;
17use crate::pdf::document::page::text::chars::{PdfPageTextCharIndex, PdfPageTextChars};
18use crate::pdf::document::page::text::search::{PdfPageTextSearch, PdfSearchOptions};
19use crate::pdf::document::page::text::segments::PdfPageTextSegments;
20use crate::pdf::document::page::PdfPage;
21use crate::pdf::points::PdfPoints;
22use crate::pdf::rect::PdfRect;
23use crate::pdfium::PdfiumLibraryBindingsAccessor;
24use crate::utils::mem::{create_byte_buffer, create_sized_buffer};
25use crate::utils::utf16le::{
26 get_pdfium_utf16le_bytes_from_str, get_string_from_pdfium_utf16le_bytes,
27};
28use bytemuck::cast_slice;
29use std::fmt::{Display, Formatter};
30use std::marker::PhantomData;
31use std::os::raw::{c_double, c_int};
32use std::ptr::null_mut;
33
34pub struct PdfPageText<'a> {
48 text_page_handle: FPDF_TEXTPAGE,
49 page: &'a PdfPage<'a>,
50 lifetime: PhantomData<&'a FPDF_TEXTPAGE>,
51}
52
53impl<'a> PdfPageText<'a> {
54 pub(crate) fn from_pdfium(text_page_handle: FPDF_TEXTPAGE, page: &'a PdfPage<'a>) -> Self {
55 PdfPageText {
56 text_page_handle,
57 page,
58 lifetime: PhantomData,
59 }
60 }
61
62 #[inline]
64 pub(crate) fn text_page_handle(&self) -> FPDF_TEXTPAGE {
65 self.text_page_handle
66 }
67
68 #[inline]
73 pub fn len(&self) -> i32 {
74 unsafe { self.bindings().FPDFText_CountChars(self.text_page_handle()) }
75 }
76
77 #[inline]
79 pub fn is_empty(&self) -> bool {
80 self.len() == 0
81 }
82
83 #[inline]
85 pub fn segments(&self) -> PdfPageTextSegments<'_> {
86 PdfPageTextSegments::new(self, 0, self.len(), self.bindings())
87 }
88
89 #[inline]
92 pub fn segments_subset(
93 &self,
94 start: PdfPageTextCharIndex,
95 count: PdfPageTextCharIndex,
96 ) -> PdfPageTextSegments<'_> {
97 PdfPageTextSegments::new(self, start as i32, count as i32, self.bindings())
98 }
99
100 #[inline]
102 pub fn chars(&self) -> PdfPageTextChars<'_> {
103 PdfPageTextChars::new(
104 self.page.document_handle(),
105 self.page.page_handle(),
106 self.text_page_handle(),
107 (0..self.len()).collect(),
108 )
109 }
110
/// Returns the characters on this text page that Pdfium attributes to the given
/// [PdfPageTextObject].
///
/// Every character index on the page is tested with `FPDFText_GetTextObject()`; an index
/// is kept when the returned object handle equals the handle of `object`. The scan is
/// therefore linear in the number of characters on the page.
///
/// The current implementation always returns `Ok`.
#[cfg(any(
    feature = "pdfium_future",
    feature = "pdfium_7543",
    feature = "pdfium_7350",
    feature = "pdfium_7215",
    feature = "pdfium_7123",
    feature = "pdfium_6996",
    feature = "pdfium_6721",
    feature = "pdfium_6666",
    feature = "pdfium_6611",
))]
#[inline]
pub fn chars_for_object(
    &self,
    object: &PdfPageTextObject,
) -> Result<PdfPageTextChars<'_>, PdfiumError> {
    let wanted = object.object_handle();
    let text_page = self.text_page_handle();

    let matching_indices = self
        .chars()
        .iter()
        .map(|char| char.index() as i32)
        .filter(|&index| {
            let owner = unsafe { self.bindings().FPDFText_GetTextObject(text_page, index) };

            owner == wanted
        })
        .collect();

    Ok(PdfPageTextChars::new(
        self.page.document_handle(),
        self.page.page_handle(),
        text_page,
        matching_indices,
    ))
}
147
148 #[inline]
153 pub fn chars_for_annotation(
154 &self,
155 annotation: &PdfPageAnnotation,
156 ) -> Result<PdfPageTextChars<'_>, PdfiumError> {
157 self.chars_inside_rect(annotation.bounds()?)
158 .map_err(|_| PdfiumError::NoCharsInAnnotation)
159 }
160
161 #[inline]
164 pub fn chars_inside_rect(&self, rect: PdfRect) -> Result<PdfPageTextChars<'_>, PdfiumError> {
165 let tolerance_x = rect.width() / 2.0;
166 let tolerance_y = rect.height() / 2.0;
167 let center_height = rect.bottom() + tolerance_y;
168
169 match (
170 Self::get_char_index_near_point(
171 self.text_page_handle(),
172 rect.left(),
173 tolerance_x,
174 center_height,
175 tolerance_y,
176 self.bindings(),
177 ),
178 Self::get_char_index_near_point(
179 self.text_page_handle(),
180 rect.right(),
181 tolerance_x,
182 center_height,
183 tolerance_y,
184 self.bindings(),
185 ),
186 ) {
187 (Some(start), Some(end)) => Ok(PdfPageTextChars::new(
188 self.page.document_handle(),
189 self.page.page_handle(),
190 self.text_page_handle(),
191 (start as i32..=end as i32 + 1).collect(),
192 )),
193 (Some(start), None) => Ok(PdfPageTextChars::new(
194 self.page.document_handle(),
195 self.page.page_handle(),
196 self.text_page_handle(),
197 (start as i32..=start as i32 + 1).collect(),
198 )),
199 (None, Some(end)) => Ok(PdfPageTextChars::new(
200 self.page.document_handle(),
201 self.page.page_handle(),
202 self.text_page_handle(),
203 (end as i32..=end as i32 + 1).collect(),
204 )),
205 _ => Err(PdfiumError::NoCharsInRect),
206 }
207 }
208
209 pub(crate) fn get_char_index_near_point(
213 text_page_handle: FPDF_TEXTPAGE,
214 x: PdfPoints,
215 tolerance_x: PdfPoints,
216 y: PdfPoints,
217 tolerance_y: PdfPoints,
218 bindings: &dyn PdfiumLibraryBindings,
219 ) -> Option<PdfPageTextCharIndex> {
220 match unsafe {
221 bindings.FPDFText_GetCharIndexAtPos(
222 text_page_handle,
223 x.value as c_double,
224 y.value as c_double,
225 tolerance_x.value as c_double,
226 tolerance_y.value as c_double,
227 )
228 } {
229 -1 => None, -3 => None, index => Some(index as PdfPageTextCharIndex),
232 }
233 }
234
235 pub fn all(&self) -> String {
242 self.inside_rect(self.page.page_size())
243 }
244
245 pub fn inside_rect(&self, rect: PdfRect) -> String {
253 let left = rect.left().value as f64;
263
264 let top = rect.top().value as f64;
265
266 let right = rect.right().value as f64;
267
268 let bottom = rect.bottom().value as f64;
269
270 let chars_count = unsafe {
271 self.bindings().FPDFText_GetBoundedText(
272 self.text_page_handle(),
273 left,
274 top,
275 right,
276 bottom,
277 null_mut(),
278 0,
279 )
280 };
281
282 if chars_count == 0 {
283 return String::new();
286 }
287
288 let mut buffer = create_sized_buffer(chars_count as usize);
289
290 let result = unsafe {
291 self.bindings().FPDFText_GetBoundedText(
292 self.text_page_handle(),
293 left,
294 top,
295 right,
296 bottom,
297 buffer.as_mut_ptr(),
298 chars_count,
299 )
300 };
301
302 assert_eq!(result, chars_count);
303
304 get_string_from_pdfium_utf16le_bytes(cast_slice(buffer.as_slice()).to_vec())
305 .unwrap_or_default()
306 }
307
308 pub fn for_object(&self, object: &PdfPageTextObject) -> String {
311 let buffer_length = unsafe {
321 self.bindings().FPDFTextObj_GetText(
322 object.object_handle(),
323 self.text_page_handle(),
324 null_mut(),
325 0,
326 )
327 };
328
329 if buffer_length == 0 {
330 return String::new();
333 }
334
335 let mut buffer = create_byte_buffer(buffer_length as usize);
336
337 let result = unsafe {
338 self.bindings().FPDFTextObj_GetText(
339 object.object_handle(),
340 self.text_page_handle(),
341 buffer.as_mut_ptr() as *mut FPDF_WCHAR,
342 buffer_length,
343 )
344 };
345
346 assert_eq!(result, buffer_length);
347
348 get_string_from_pdfium_utf16le_bytes(buffer).unwrap_or_default()
349 }
350
351 #[inline]
359 pub fn for_annotation(&self, annotation: &PdfPageAnnotation) -> Result<String, PdfiumError> {
360 let bounds = annotation.bounds()?;
361
362 Ok(self.inside_rect(bounds))
363 }
364
365 #[inline]
368 pub fn search(
369 &self,
370 text: &str,
371 options: &PdfSearchOptions,
372 ) -> Result<PdfPageTextSearch<'_>, PdfiumError> {
373 self.search_from(text, options, 0)
374 }
375
376 pub fn search_from(
380 &self,
381 text: &str,
382 options: &PdfSearchOptions,
383 index: PdfPageTextCharIndex,
384 ) -> Result<PdfPageTextSearch<'_>, PdfiumError> {
385 if text.is_empty() {
386 Err(PdfiumError::TextSearchTargetIsEmpty)
387 } else {
388 Ok(PdfPageTextSearch::from_pdfium(
389 unsafe {
390 self.bindings().FPDFText_FindStart(
391 self.text_page_handle(),
392 get_pdfium_utf16le_bytes_from_str(text).as_ptr() as FPDF_WIDESTRING,
393 options.as_pdfium(),
394 index as c_int,
395 )
396 },
397 self,
398 ))
399 }
400 }
401}
402
403impl<'a> Display for PdfPageText<'a> {
404 #[inline]
405 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
406 f.write_str(self.all().as_str())
407 }
408}
409
410impl<'a> Drop for PdfPageText<'a> {
411 #[inline]
413 fn drop(&mut self) {
414 unsafe {
415 self.bindings().FPDFText_ClosePage(self.text_page_handle());
416 }
417 }
418}
419
420impl<'a> PdfiumLibraryBindingsAccessor<'a> for PdfPageText<'a> {}
421
// These impls are compiled only when the `thread_safe` feature is enabled. They assert
// that a `PdfPageText`, which wraps a raw Pdfium handle, may be moved to and shared
// between threads.
// NOTE(review): the soundness argument is not visible in this file — presumably the
// `thread_safe` feature serializes access to Pdfium elsewhere; confirm.
#[cfg(feature = "thread_safe")]
unsafe impl<'a> Send for PdfPageText<'a> {}

#[cfg(feature = "thread_safe")]
unsafe impl<'a> Sync for PdfPageText<'a> {}
427
428#[cfg(test)]
429mod tests {
430 use itertools::Itertools;
431 use std::ffi::OsStr;
432 use std::fs;
433
434 use crate::prelude::*;
435 use crate::utils::test::test_bind_to_pdfium;
436
/// Creates three text objects at the same position on a fresh A4 page and checks that
/// text and character retrieval for each object yields only that object's own text,
/// even though all three objects overlap.
#[test]
fn test_overlapping_chars_results() -> Result<(), PdfiumError> {
    let pdfium = test_bind_to_pdfium();

    let mut document = pdfium.create_new_pdf()?;

    let mut page = document
        .pages_mut()
        .create_page_at_start(PdfPagePaperSize::a4())?;

    let font = document.fonts_mut().courier();

    // Every object shares the same origin and font size, so they sit on top of each other.
    let origin = PdfPoints::ZERO;
    let font_size = PdfPoints::new(10.0);

    let txt1 = page
        .objects_mut()
        .create_text_object(origin, origin, "AAAAAA", font, font_size)?;

    let txt2 = page
        .objects_mut()
        .create_text_object(origin, origin, "BBBBBB", font, font_size)?;

    let txt3 = page
        .objects_mut()
        .create_text_object(origin, origin, "CDCDCDE", font, font_size)?;

    let page_text = page.text()?;

    for (object, expected) in [(&txt1, "AAAAAA"), (&txt2, "BBBBBB"), (&txt3, "CDCDCDE")] {
        assert!(test_one_overlapping_text_object_results(
            object, &page_text, expected
        )?);
    }

    Ok(())
}
495
496 fn test_one_overlapping_text_object_results(
497 object: &PdfPageObject,
498 page_text: &PdfPageText,
499 expected: &str,
500 ) -> Result<bool, PdfiumError> {
501 if let Some(txt) = object.as_text_object() {
502 assert_eq!(txt.text().trim(), expected);
503 assert_eq!(page_text.for_object(txt).trim(), expected);
504
505 for (index, char) in txt.chars(&page_text)?.iter().enumerate() {
506 assert_eq!(txt.text().chars().nth(index), char.unicode_char());
507 assert_eq!(expected.chars().nth(index), char.unicode_char());
508 }
509
510 Ok(true)
511 } else {
512 Ok(false)
513 }
514 }
515
/// For every `.pdf` file in `./test/`, checks that the text of each text object equals
/// the concatenation of that object's individual characters (ignoring NUL characters and
/// surrounding whitespace).
#[test]
fn test_text_chars_results_equality() -> Result<(), PdfiumError> {
    let pdfium = test_bind_to_pdfium();

    let pdf_extension = Some(OsStr::new("pdf"));

    // Directory entries that cannot be read are skipped rather than failing the test.
    let samples = fs::read_dir("./test/")
        .unwrap()
        .filter_map(|entry| entry.ok().map(|e| e.path()))
        .filter(|path| path.extension() == pdf_extension)
        .collect::<Vec<_>>();

    assert!(samples.len() > 0);

    for sample in samples {
        println!("Testing all text objects in file {}", sample.display());

        let document = pdfium.load_pdf_from_file(&sample, None)?;

        for page in document.pages().iter() {
            let text = page.text()?;

            for object in page.objects().iter() {
                let obj = match object.as_text_object() {
                    Some(obj) => obj,
                    None => continue,
                };

                let chars = obj
                    .chars(&text)?
                    .iter()
                    .filter_map(|char| char.unicode_string())
                    .join("");

                assert_eq!(obj.text().trim(), chars.replace("\0", "").trim());
            }
        }
    }

    Ok(())
}
558}