1pub mod char;
5pub mod chars;
6pub mod search;
7pub mod segment;
8pub mod segments;
9
10use crate::bindgen::{FPDF_TEXTPAGE, FPDF_WCHAR, FPDF_WIDESTRING};
11use crate::bindings::PdfiumLibraryBindings;
12use crate::error::PdfiumError;
13use crate::pdf::document::page::annotation::PdfPageAnnotation;
14use crate::pdf::document::page::annotation::PdfPageAnnotationCommon;
15use crate::pdf::document::page::object::private::internal::PdfPageObjectPrivate;
16use crate::pdf::document::page::object::text::PdfPageTextObject;
17use crate::pdf::document::page::text::chars::{PdfPageTextCharIndex, PdfPageTextChars};
18use crate::pdf::document::page::text::search::{PdfPageTextSearch, PdfSearchOptions};
19use crate::pdf::document::page::text::segments::PdfPageTextSegments;
20use crate::pdf::document::page::PdfPage;
21use crate::pdf::points::PdfPoints;
22use crate::pdf::rect::PdfRect;
23use crate::utils::mem::{create_byte_buffer, create_sized_buffer};
24use crate::utils::utf16le::{
25 get_pdfium_utf16le_bytes_from_str, get_string_from_pdfium_utf16le_bytes,
26};
27use bytemuck::cast_slice;
28use std::fmt::{Display, Formatter};
29use std::os::raw::{c_double, c_int};
30use std::ptr::null_mut;
31
/// The text of a single [PdfPage], accessed through a Pdfium text page handle.
///
/// The wrapped handle is released with `FPDFText_ClosePage` when this value is dropped
/// (see the `Drop` implementation in this file).
pub struct PdfPageText<'a> {
    /// Raw Pdfium handle identifying the text page that every query in this type targets.
    text_page_handle: FPDF_TEXTPAGE,
    /// The page this text belongs to; supplies the document and page handles, and the
    /// page size used by [PdfPageText::all].
    page: &'a PdfPage<'a>,
    /// The Pdfium bindings through which all FFI calls made by this type are dispatched.
    bindings: &'a dyn PdfiumLibraryBindings,
}
50
51impl<'a> PdfPageText<'a> {
52 pub(crate) fn from_pdfium(
53 text_page_handle: FPDF_TEXTPAGE,
54 page: &'a PdfPage<'a>,
55 bindings: &'a dyn PdfiumLibraryBindings,
56 ) -> Self {
57 PdfPageText {
58 text_page_handle,
59 page,
60 bindings,
61 }
62 }
63
64 #[inline]
66 pub(crate) fn text_page_handle(&self) -> FPDF_TEXTPAGE {
67 self.text_page_handle
68 }
69
70 #[inline]
72 pub fn bindings(&self) -> &'a dyn PdfiumLibraryBindings {
73 self.bindings
74 }
75
76 #[inline]
81 pub fn len(&self) -> i32 {
82 self.bindings.FPDFText_CountChars(self.text_page_handle())
83 }
84
85 #[inline]
87 pub fn is_empty(&self) -> bool {
88 self.len() == 0
89 }
90
91 #[inline]
93 pub fn segments(&self) -> PdfPageTextSegments {
94 PdfPageTextSegments::new(self, 0, self.len(), self.bindings())
95 }
96
97 #[inline]
100 pub fn segments_subset(
101 &self,
102 start: PdfPageTextCharIndex,
103 count: PdfPageTextCharIndex,
104 ) -> PdfPageTextSegments {
105 PdfPageTextSegments::new(self, start as i32, count as i32, self.bindings())
106 }
107
108 #[inline]
110 pub fn chars(&self) -> PdfPageTextChars {
111 PdfPageTextChars::new(
112 self.page.document_handle(),
113 self.page.page_handle(),
114 self.text_page_handle(),
115 (0..self.len()).collect(),
116 self.bindings(),
117 )
118 }
119
120 #[cfg(any(
121 feature = "pdfium_future",
122 feature = "pdfium_7350",
123 feature = "pdfium_7215",
124 feature = "pdfium_7123",
125 feature = "pdfium_6996",
126 feature = "pdfium_6721",
127 feature = "pdfium_6666",
128 feature = "pdfium_6611",
129 ))]
130 #[inline]
135 pub fn chars_for_object(
136 &self,
137 object: &PdfPageTextObject,
138 ) -> Result<PdfPageTextChars, PdfiumError> {
139 Ok(PdfPageTextChars::new(
140 self.page.document_handle(),
141 self.page.page_handle(),
142 self.text_page_handle(),
143 self.chars()
144 .iter()
145 .filter(|char| {
146 self.bindings
147 .FPDFText_GetTextObject(self.text_page_handle(), char.index() as i32)
148 == object.object_handle()
149 })
150 .map(|char| char.index() as i32)
151 .collect(),
152 self.bindings(),
153 ))
154 }
155
156 #[inline]
161 pub fn chars_for_annotation(
162 &self,
163 annotation: &PdfPageAnnotation,
164 ) -> Result<PdfPageTextChars, PdfiumError> {
165 self.chars_inside_rect(annotation.bounds()?)
166 .map_err(|_| PdfiumError::NoCharsInAnnotation)
167 }
168
169 #[inline]
172 pub fn chars_inside_rect(&self, rect: PdfRect) -> Result<PdfPageTextChars, PdfiumError> {
173 let tolerance_x = rect.width() / 2.0;
174 let tolerance_y = rect.height() / 2.0;
175 let center_height = rect.bottom() + tolerance_y;
176
177 match (
178 Self::get_char_index_near_point(
179 self.text_page_handle(),
180 rect.left(),
181 tolerance_x,
182 center_height,
183 tolerance_y,
184 self.bindings(),
185 ),
186 Self::get_char_index_near_point(
187 self.text_page_handle(),
188 rect.right(),
189 tolerance_x,
190 center_height,
191 tolerance_y,
192 self.bindings(),
193 ),
194 ) {
195 (Some(start), Some(end)) => Ok(PdfPageTextChars::new(
196 self.page.document_handle(),
197 self.page.page_handle(),
198 self.text_page_handle(),
199 (start as i32..=end as i32 + 1).collect(),
200 self.bindings,
201 )),
202 (Some(start), None) => Ok(PdfPageTextChars::new(
203 self.page.document_handle(),
204 self.page.page_handle(),
205 self.text_page_handle(),
206 (start as i32..=start as i32 + 1).collect(),
207 self.bindings,
208 )),
209 (None, Some(end)) => Ok(PdfPageTextChars::new(
210 self.page.document_handle(),
211 self.page.page_handle(),
212 self.text_page_handle(),
213 (end as i32..=end as i32 + 1).collect(),
214 self.bindings,
215 )),
216 _ => Err(PdfiumError::NoCharsInRect),
217 }
218 }
219
220 pub(crate) fn get_char_index_near_point(
224 text_page_handle: FPDF_TEXTPAGE,
225 x: PdfPoints,
226 tolerance_x: PdfPoints,
227 y: PdfPoints,
228 tolerance_y: PdfPoints,
229 bindings: &dyn PdfiumLibraryBindings,
230 ) -> Option<PdfPageTextCharIndex> {
231 match bindings.FPDFText_GetCharIndexAtPos(
232 text_page_handle,
233 x.value as c_double,
234 y.value as c_double,
235 tolerance_x.value as c_double,
236 tolerance_y.value as c_double,
237 ) {
238 -1 => None, -3 => None, index => Some(index as PdfPageTextCharIndex),
241 }
242 }
243
244 pub fn all(&self) -> String {
251 self.inside_rect(self.page.page_size())
252 }
253
254 pub fn inside_rect(&self, rect: PdfRect) -> String {
262 let left = rect.left().value as f64;
272
273 let top = rect.top().value as f64;
274
275 let right = rect.right().value as f64;
276
277 let bottom = rect.bottom().value as f64;
278
279 let chars_count = self.bindings().FPDFText_GetBoundedText(
280 self.text_page_handle(),
281 left,
282 top,
283 right,
284 bottom,
285 null_mut(),
286 0,
287 );
288
289 if chars_count == 0 {
290 return String::new();
293 }
294
295 let mut buffer = create_sized_buffer(chars_count as usize);
296
297 let result = self.bindings().FPDFText_GetBoundedText(
298 self.text_page_handle(),
299 left,
300 top,
301 right,
302 bottom,
303 buffer.as_mut_ptr(),
304 chars_count,
305 );
306
307 assert_eq!(result, chars_count);
308
309 get_string_from_pdfium_utf16le_bytes(cast_slice(buffer.as_slice()).to_vec())
310 .unwrap_or_default()
311 }
312
313 pub fn for_object(&self, object: &PdfPageTextObject) -> String {
316 let buffer_length = self.bindings().FPDFTextObj_GetText(
326 object.object_handle(),
327 self.text_page_handle(),
328 null_mut(),
329 0,
330 );
331
332 if buffer_length == 0 {
333 return String::new();
336 }
337
338 let mut buffer = create_byte_buffer(buffer_length as usize);
339
340 let result = self.bindings().FPDFTextObj_GetText(
341 object.object_handle(),
342 self.text_page_handle(),
343 buffer.as_mut_ptr() as *mut FPDF_WCHAR,
344 buffer_length,
345 );
346
347 assert_eq!(result, buffer_length);
348
349 get_string_from_pdfium_utf16le_bytes(buffer).unwrap_or_default()
350 }
351
352 #[inline]
360 pub fn for_annotation(&self, annotation: &PdfPageAnnotation) -> Result<String, PdfiumError> {
361 let bounds = annotation.bounds()?;
362
363 Ok(self.inside_rect(bounds))
364 }
365
366 #[inline]
369 pub fn search(
370 &self,
371 text: &str,
372 options: &PdfSearchOptions,
373 ) -> Result<PdfPageTextSearch, PdfiumError> {
374 self.search_from(text, options, 0)
375 }
376
377 pub fn search_from(
381 &self,
382 text: &str,
383 options: &PdfSearchOptions,
384 index: PdfPageTextCharIndex,
385 ) -> Result<PdfPageTextSearch, PdfiumError> {
386 if text.is_empty() {
387 Err(PdfiumError::TextSearchTargetIsEmpty)
388 } else {
389 Ok(PdfPageTextSearch::from_pdfium(
390 self.bindings().FPDFText_FindStart(
391 self.text_page_handle(),
392 get_pdfium_utf16le_bytes_from_str(text).as_ptr() as FPDF_WIDESTRING,
393 options.as_pdfium(),
394 index as c_int,
395 ),
396 self,
397 self.bindings(),
398 ))
399 }
400 }
401}
402
403impl<'a> Display for PdfPageText<'a> {
404 #[inline]
405 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
406 f.write_str(self.all().as_str())
407 }
408}
409
410impl<'a> Drop for PdfPageText<'a> {
411 #[inline]
413 fn drop(&mut self) {
414 self.bindings().FPDFText_ClosePage(self.text_page_handle());
415 }
416}
417
418#[cfg(test)]
419mod tests {
420 use itertools::Itertools;
421 use std::ffi::OsStr;
422 use std::fs;
423
424 use crate::prelude::*;
425 use crate::utils::test::test_bind_to_pdfium;
426
427 #[test]
428 fn test_overlapping_chars_results() -> Result<(), PdfiumError> {
429 let pdfium = test_bind_to_pdfium();
434
435 let mut document = pdfium.create_new_pdf()?;
438
439 let mut page = document
440 .pages_mut()
441 .create_page_at_start(PdfPagePaperSize::a4())?;
442
443 let font = document.fonts_mut().courier();
444
445 let txt1 = page.objects_mut().create_text_object(
446 PdfPoints::ZERO,
447 PdfPoints::ZERO,
448 "AAAAAA",
449 font,
450 PdfPoints::new(10.0),
451 )?;
452
453 let txt2 = page.objects_mut().create_text_object(
454 PdfPoints::ZERO,
455 PdfPoints::ZERO,
456 "BBBBBB",
457 font,
458 PdfPoints::new(10.0),
459 )?;
460
461 let txt3 = page.objects_mut().create_text_object(
462 PdfPoints::ZERO,
463 PdfPoints::ZERO,
464 "CDCDCDE",
465 font,
466 PdfPoints::new(10.0),
467 )?;
468
469 let page_text = page.text()?;
470
471 assert!(test_one_overlapping_text_object_results(
474 &txt1, &page_text, "AAAAAA"
475 )?);
476 assert!(test_one_overlapping_text_object_results(
477 &txt2, &page_text, "BBBBBB"
478 )?);
479 assert!(test_one_overlapping_text_object_results(
480 &txt3, &page_text, "CDCDCDE"
481 )?);
482
483 Ok(())
484 }
485
486 fn test_one_overlapping_text_object_results(
487 object: &PdfPageObject,
488 page_text: &PdfPageText,
489 expected: &str,
490 ) -> Result<bool, PdfiumError> {
491 if let Some(txt) = object.as_text_object() {
492 assert_eq!(txt.text().trim(), expected);
493 assert_eq!(page_text.for_object(txt).trim(), expected);
494
495 for (index, char) in txt.chars(&page_text)?.iter().enumerate() {
496 assert_eq!(txt.text().chars().nth(index), char.unicode_char());
497 assert_eq!(expected.chars().nth(index), char.unicode_char());
498 }
499
500 Ok(true)
501 } else {
502 Ok(false)
503 }
504 }
505
506 #[test]
507 fn test_text_chars_results_equality() -> Result<(), PdfiumError> {
508 let pdfium = test_bind_to_pdfium();
512
513 let samples = fs::read_dir("./test/")
514 .unwrap()
515 .filter_map(|entry| match entry {
516 Ok(e) => Some(e.path()),
517 Err(_) => None,
518 })
519 .filter(|path| path.extension() == Some(OsStr::new("pdf")))
520 .collect::<Vec<_>>();
521
522 assert!(samples.len() > 0);
523
524 for sample in samples {
525 println!("Testing all text objects in file {}", sample.display());
526
527 let document = pdfium.load_pdf_from_file(&sample, None)?;
528
529 for page in document.pages().iter() {
530 let text = page.text()?;
531
532 for object in page.objects().iter() {
533 if let Some(obj) = object.as_text_object() {
534 let chars = obj
535 .chars(&text)?
536 .iter()
537 .filter_map(|char| char.unicode_string())
538 .join("");
539
540 assert_eq!(obj.text().trim(), chars.replace("\0", "").trim());
541 }
542 }
543 }
544 }
545
546 Ok(())
547 }
548}