pdfium_render/pdf/document/page/text.rs
1//! Defines the [PdfPageText] struct, exposing functionality related to the
2//! collection of Unicode characters visible on a single [PdfPage].
3
4pub mod char;
5pub mod chars;
6pub mod search;
7pub mod segment;
8pub mod segments;
9
10use crate::bindgen::{FPDF_TEXTPAGE, FPDF_WCHAR, FPDF_WIDESTRING};
11use crate::bindings::PdfiumLibraryBindings;
12use crate::error::PdfiumError;
13use crate::pdf::document::page::annotation::PdfPageAnnotation;
14use crate::pdf::document::page::annotation::PdfPageAnnotationCommon;
15use crate::pdf::document::page::object::private::internal::PdfPageObjectPrivate;
16use crate::pdf::document::page::object::text::PdfPageTextObject;
17use crate::pdf::document::page::object::PdfPageObjectCommon;
18use crate::pdf::document::page::objects::common::PdfPageObjectsCommon;
19use crate::pdf::document::page::text::chars::{PdfPageTextCharIndex, PdfPageTextChars};
20use crate::pdf::document::page::text::search::{PdfPageTextSearch, PdfSearchOptions};
21use crate::pdf::document::page::text::segments::PdfPageTextSegments;
22use crate::pdf::document::page::{PdfPage, PdfPageContentRegenerationStrategy, PdfPageIndexCache};
23use crate::pdf::document::pages::PdfPageIndex;
24use crate::pdf::points::PdfPoints;
25use crate::pdf::rect::PdfRect;
26use crate::utils::mem::{create_byte_buffer, create_sized_buffer};
27use crate::utils::utf16le::{
28 get_pdfium_utf16le_bytes_from_str, get_string_from_pdfium_utf16le_bytes,
29};
30use bytemuck::cast_slice;
31use std::fmt::{Display, Formatter};
32use std::os::raw::{c_double, c_int};
33use std::ptr::null_mut;
34
35/// The collection of Unicode characters visible on a single [PdfPage].
36///
37/// Use the [PdfPageText::all()] function to easily return all characters in the containing
38/// [PdfPage] in the order in which they are defined in the PDF file.
39///
40/// Use the [PdfPageText::search()] function to initialise a new [PdfPageTextSearch] object,
41/// yielding the results of searching for a target string within the character collection.
42///
43/// In complex custom layouts, the order in which characters are defined in the document
44/// and the order in which they appear visually during rendering (and thus the order in
45/// which they are read by a user) may not necessarily match.
46///
47/// [PdfPageText] implements both the [ToString] and the [Display] traits.
48pub struct PdfPageText<'a> {
49 text_page_handle: FPDF_TEXTPAGE,
50 page: &'a PdfPage<'a>,
51 bindings: &'a dyn PdfiumLibraryBindings,
52}
53
54impl<'a> PdfPageText<'a> {
55 pub(crate) fn from_pdfium(
56 text_page_handle: FPDF_TEXTPAGE,
57 page: &'a PdfPage<'a>,
58 bindings: &'a dyn PdfiumLibraryBindings,
59 ) -> Self {
60 PdfPageText {
61 text_page_handle,
62 page,
63 bindings,
64 }
65 }
66
67 /// Returns the internal `FPDF_TEXTPAGE` handle for this [PdfPageText].
68 #[inline]
69 pub(crate) fn text_page_handle(&self) -> FPDF_TEXTPAGE {
70 self.text_page_handle
71 }
72
73 /// Returns the [PdfiumLibraryBindings] used by this [PdfPageText].
74 #[inline]
75 pub fn bindings(&self) -> &'a dyn PdfiumLibraryBindings {
76 self.bindings
77 }
78
79 /// Returns the total number of characters in all text segments in the containing [PdfPage].
80 ///
81 /// The character count includes whitespace and newlines, and so may differ slightly
82 /// from the result of calling `PdfPageText::all().len()`.
83 #[inline]
84 pub fn len(&self) -> i32 {
85 self.bindings.FPDFText_CountChars(self.text_page_handle())
86 }
87
88 /// Returns `true` if there are no characters in any text box collection in the containing [PdfPage].
89 #[inline]
90 pub fn is_empty(&self) -> bool {
91 self.len() == 0
92 }
93
94 /// Returns a collection of all the `PdfPageTextSegment` text segments in the containing [PdfPage].
95 #[inline]
96 pub fn segments(&self) -> PdfPageTextSegments {
97 PdfPageTextSegments::new(self, 0, self.len(), self.bindings())
98 }
99
100 /// Returns a subset of the `PdfPageTextSegment` text segments in the containing [PdfPage].
101 /// Only text segments containing characters in the given index range will be included.
102 #[inline]
103 pub fn segments_subset(
104 &self,
105 start: PdfPageTextCharIndex,
106 count: PdfPageTextCharIndex,
107 ) -> PdfPageTextSegments {
108 PdfPageTextSegments::new(self, start as i32, count as i32, self.bindings())
109 }
110
111 /// Returns a collection of all the `PdfPageTextChar` characters in the containing [PdfPage].
112 #[inline]
113 pub fn chars(&self) -> PdfPageTextChars {
114 PdfPageTextChars::new(
115 self.page.document_handle(),
116 self.page.page_handle(),
117 self.text_page_handle(),
118 0,
119 self.len(),
120 self.bindings(),
121 )
122 }
123
124 /// Returns a collection of all the `PdfPageTextChar` characters in the given [PdfPageTextObject].
125 ///
126 /// The return result will be empty if the given [PdfPageTextObject] is not attached to the
127 /// containing [PdfPage].
128 #[inline]
129 pub fn chars_for_object(
130 &self,
131 object: &PdfPageTextObject,
132 ) -> Result<PdfPageTextChars, PdfiumError> {
133 // To avoid any possibility of returning the wrong characters in the event
134 // of overlapping text objects, we create a new page, create a copy of the target
135 // text object on the new page, and return the PdfPageTextChars object _for the
136 // copy_, rather than the object itself.
137
138 let page_index = self
139 .bindings()
140 .FPDF_GetPageCount(self.page.document_handle());
141
142 let (document_handle, start_index, end_index) = {
143 // We must avoid several potential lifetime traps. First, the newly created page
144 // and its text page must live at least as long as the PdfPageTextChars object we
145 // return; second, we need to tidy up both the text page and the page once
146 // the PdfPageTextChars object we return falls out of scope (indeed, we need to
147 // delete the newly created page from the document).
148
149 // To manage the lifetimes correctly, we give the PdfPageTextChars object itself
150 // ownership over the newly created page and its text page. The PdfPageTextChars
151 // object will take responsibility for disposing of its own parent objects
152 // when it falls out of scope, including removing the page from the document.
153
154 // We cannot transfer the ownership of a new PdfPage instance to PdfPageTextChars
155 // because PdfPageTextChars is itself created as an indirect child of a PdfPage.
156 // This creates a cyclical relationship between the two objects. To avoid intractable
157 // borrowing problems, we pass raw handles only.
158
159 // Create the new temporary page...
160
161 let page_handle = self.bindings().FPDFPage_New(
162 self.page.document_handle(),
163 page_index,
164 self.page.width().value as c_double,
165 self.page.height().value as c_double,
166 );
167
168 let mut new_page = PdfPage::from_pdfium(
169 self.page.document_handle(),
170 page_handle,
171 None,
172 None,
173 self.bindings,
174 );
175
176 PdfPageIndexCache::cache_props_for_page(
177 self.page.document_handle(),
178 page_handle,
179 page_index as PdfPageIndex,
180 PdfPageContentRegenerationStrategy::AutomaticOnEveryChange,
181 );
182
183 // ... copy the target object onto the new page...
184
185 let copy = object.try_copy_impl(self.page.document_handle(), self.bindings)?;
186 let copy = new_page.objects_mut().add_object(copy)?;
187
188 // ... get the character range for the target object's bounds...
189
190 let bounds = copy.bounds()?;
191 let text_page = new_page.text()?;
192 let tolerance_x = bounds.width() / 2.0;
193 let tolerance_y = bounds.height() / 2.0;
194 let center_height = bounds.bottom() + tolerance_y;
195
196 let start_index = Self::get_char_index_near_point(
197 text_page.text_page_handle(),
198 bounds.left(),
199 tolerance_x,
200 center_height,
201 tolerance_y,
202 self.bindings(),
203 )
204 .ok_or(PdfiumError::NoCharsInRect)?;
205
206 let end_index = Self::get_char_index_near_point(
207 text_page.text_page_handle(),
208 bounds.right(),
209 tolerance_x,
210 center_height,
211 tolerance_y,
212 self.bindings(),
213 )
214 .map(|end| end.saturating_sub(start_index))
215 .ok_or(PdfiumError::NoCharsInRect)?;
216
217 (new_page.document_handle(), start_index, end_index)
218 };
219
220 // ... and use raw handles and indices to create a new PdfPageTextChars instance
221 // that isn't bound to the lifetime of the current object.
222
223 Ok(PdfPageTextChars::new_with_owned_page(
224 document_handle,
225 page_index,
226 start_index as i32,
227 end_index as i32 + 1,
228 self.bindings(),
229 ))
230 }
231
232 /// Returns a collection of all the `PdfPageTextChar` characters in the given [PdfPageAnnotation].
233 ///
234 /// The return result will be empty if the given [PdfPageAnnotation] is not attached to the
235 /// containing [PdfPage].
236 #[inline]
237 pub fn chars_for_annotation(
238 &self,
239 annotation: &PdfPageAnnotation,
240 ) -> Result<PdfPageTextChars, PdfiumError> {
241 self.chars_inside_rect(annotation.bounds()?)
242 .map_err(|_| PdfiumError::NoCharsInAnnotation)
243 }
244
245 /// Returns a collection of all the `PdfPageTextChar` characters that lie within the bounds of
246 /// the given [PdfRect] in the containing [PdfPage].
247 #[inline]
248 pub fn chars_inside_rect(&self, rect: PdfRect) -> Result<PdfPageTextChars, PdfiumError> {
249 let tolerance_x = rect.width() / 2.0;
250 let tolerance_y = rect.height() / 2.0;
251 let center_height = rect.bottom() + tolerance_y;
252
253 let chars = self.chars();
254
255 match (
256 chars.get_char_near_point(rect.left(), tolerance_x, center_height, tolerance_y),
257 chars.get_char_near_point(rect.right(), tolerance_x, center_height, tolerance_y),
258 ) {
259 (Some(start), Some(end)) => Ok(PdfPageTextChars::new(
260 self.page.document_handle(),
261 self.page.page_handle(),
262 self.text_page_handle(),
263 start.index() as i32,
264 end.index().saturating_sub(start.index()) as i32 + 1,
265 self.bindings,
266 )),
267 _ => Err(PdfiumError::NoCharsInRect),
268 }
269 }
270
271 /// Returns the character near to the given x and y positions on the containing [PdfPage],
272 /// if any. The returned character will be no further from the given positions than the given
273 /// tolerance values.
274 pub(crate) fn get_char_index_near_point(
275 text_page_handle: FPDF_TEXTPAGE,
276 x: PdfPoints,
277 tolerance_x: PdfPoints,
278 y: PdfPoints,
279 tolerance_y: PdfPoints,
280 bindings: &dyn PdfiumLibraryBindings,
281 ) -> Option<PdfPageTextCharIndex> {
282 match bindings.FPDFText_GetCharIndexAtPos(
283 text_page_handle,
284 x.value as c_double,
285 y.value as c_double,
286 tolerance_x.value as c_double,
287 tolerance_y.value as c_double,
288 ) {
289 -1 => None, // No character at position within tolerances
290 -3 => None, // An error occurred, but we'll eat it
291 index => Some(index as PdfPageTextCharIndex),
292 }
293 }
294
295 /// Returns all characters that lie within the containing [PdfPage], in the order in which
296 /// they are defined in the document, concatenated into a single string.
297 ///
298 /// In complex custom layouts, the order in which characters are defined in the document
299 /// and the order in which they appear visually during rendering (and thus the order in
300 /// which they are read by a user) may not necessarily match.
301 pub fn all(&self) -> String {
302 self.inside_rect(self.page.page_size())
303 }
304
305 /// Returns all characters that lie within the bounds of the given [PdfRect] in the
306 /// containing [PdfPage], in the order in which they are defined in the document,
307 /// concatenated into a single string.
308 ///
309 /// In complex custom layouts, the order in which characters are defined in the document
310 /// and the order in which they appear visually during rendering (and thus the order in
311 /// which they are read by a user) may not necessarily match.
312 pub fn inside_rect(&self, rect: PdfRect) -> String {
313 // Retrieving the bounded text from Pdfium is a two-step operation. First, we call
314 // FPDFText_GetBoundedText() with a null buffer; this will retrieve the length of
315 // the bounded text in _characters_ (not _bytes_!). If the length is zero, then there is
316 // no text within the given rectangle's boundaries.
317
318 // If the length is non-zero, then we reserve a buffer (sized in words rather than bytes,
319 // to allow for two bytes per character) and call FPDFText_GetBoundedText() again with a
320 // pointer to the buffer; this will write the bounded text to the buffer in UTF16-LE format.
321
322 let left = rect.left().value as f64;
323
324 let top = rect.top().value as f64;
325
326 let right = rect.right().value as f64;
327
328 let bottom = rect.bottom().value as f64;
329
330 let chars_count = self.bindings().FPDFText_GetBoundedText(
331 self.text_page_handle(),
332 left,
333 top,
334 right,
335 bottom,
336 null_mut(),
337 0,
338 );
339
340 if chars_count == 0 {
341 // No text lies within the given rectangle.
342
343 return String::new();
344 }
345
346 let mut buffer = create_sized_buffer(chars_count as usize);
347
348 let result = self.bindings().FPDFText_GetBoundedText(
349 self.text_page_handle(),
350 left,
351 top,
352 right,
353 bottom,
354 buffer.as_mut_ptr(),
355 chars_count,
356 );
357
358 assert_eq!(result, chars_count);
359
360 get_string_from_pdfium_utf16le_bytes(cast_slice(buffer.as_slice()).to_vec())
361 .unwrap_or_default()
362 }
363
364 /// Returns all characters assigned to the given [PdfPageTextObject] in this [PdfPageText] object,
365 /// concatenated into a single string.
366 pub fn for_object(&self, object: &PdfPageTextObject) -> String {
367 // Retrieving the string value from Pdfium is a two-step operation. First, we call
368 // FPDFTextObj_GetText() with a null buffer; this will retrieve the length of
369 // the text in bytes, assuming the page object exists. If the length is zero,
370 // then there is no text.
371
372 // If the length is non-zero, then we reserve a byte buffer of the given
373 // length and call FPDFTextObj_GetText() again with a pointer to the buffer;
374 // this will write the text for the page object into the buffer.
375
376 let buffer_length = self.bindings().FPDFTextObj_GetText(
377 object.object_handle(),
378 self.text_page_handle(),
379 null_mut(),
380 0,
381 );
382
383 if buffer_length == 0 {
384 // There is no text.
385
386 return String::new();
387 }
388
389 let mut buffer = create_byte_buffer(buffer_length as usize);
390
391 let result = self.bindings().FPDFTextObj_GetText(
392 object.object_handle(),
393 self.text_page_handle(),
394 buffer.as_mut_ptr() as *mut FPDF_WCHAR,
395 buffer_length,
396 );
397
398 assert_eq!(result, buffer_length);
399
400 get_string_from_pdfium_utf16le_bytes(buffer).unwrap_or_default()
401 }
402
403 /// Returns all characters that lie within the bounds of the given [PdfPageAnnotation] in the
404 /// containing [PdfPage], in the order in which they are defined in the document,
405 /// concatenated into a single string.
406 ///
407 /// In complex custom layouts, the order in which characters are defined in the document
408 /// and the order in which they appear visually during rendering (and thus the order in
409 /// which they are read by a user) may not necessarily match.
410 #[inline]
411 pub fn for_annotation(&self, annotation: &PdfPageAnnotation) -> Result<String, PdfiumError> {
412 let bounds = annotation.bounds()?;
413
414 Ok(self.inside_rect(bounds))
415 }
416
417 /// Starts a search for the given text string, returning a new [PdfPageTextSearch]
418 /// object that can be used to step through the search results.
419 #[inline]
420 pub fn search(&self, text: &str, options: &PdfSearchOptions) -> PdfPageTextSearch {
421 self.search_from(text, options, 0)
422 }
423
424 /// Starts a search for the given test string from the given character position,
425 /// returning a new [PdfPageTextSearch] object that can be used to step through
426 /// the search results.
427 pub fn search_from(
428 &self,
429 text: &str,
430 options: &PdfSearchOptions,
431 index: PdfPageTextCharIndex,
432 ) -> PdfPageTextSearch {
433 PdfPageTextSearch::from_pdfium(
434 self.bindings().FPDFText_FindStart(
435 self.text_page_handle(),
436 get_pdfium_utf16le_bytes_from_str(text).as_ptr() as FPDF_WIDESTRING,
437 options.as_pdfium(),
438 index as c_int,
439 ),
440 self,
441 self.bindings(),
442 )
443 }
444}
445
446impl<'a> Display for PdfPageText<'a> {
447 #[inline]
448 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
449 f.write_str(self.all().as_str())
450 }
451}
452
453impl<'a> Drop for PdfPageText<'a> {
454 /// Closes the [PdfPageText] collection, releasing held memory.
455 #[inline]
456 fn drop(&mut self) {
457 self.bindings().FPDFText_ClosePage(self.text_page_handle());
458 }
459}
460
461#[cfg(test)]
462mod tests {
463 use crate::prelude::*;
464 use crate::utils::test::test_bind_to_pdfium;
465
#[test]
fn test_overlapping_chars_results() -> Result<(), PdfiumError> {
    // Makes sure that .chars_for_object() returns the correct characters for each
    // text object when several text objects overlap one another.
    // For more details, see: https://github.com/ajrcarey/pdfium-render/issues/98

    let pdfium = test_bind_to_pdfium();

    let mut document = pdfium.create_new_pdf()?;

    let mut page = document
        .pages_mut()
        .create_page_at_start(PdfPagePaperSize::a4())?;

    let font = document.fonts_mut().courier();

    // Every text object is placed at the page origin with the same font and size,
    // so that all three overlap.

    let expected_texts = ["AAAAAA", "BBBBBB", "CDCDCDE"];

    let mut text_objects = Vec::with_capacity(expected_texts.len());

    for &text in expected_texts.iter() {
        text_objects.push(page.objects_mut().create_text_object(
            PdfPoints::ZERO,
            PdfPoints::ZERO,
            text,
            font,
            PdfPoints::new(10.0),
        )?);
    }

    let page_text = page.text()?;

    // The results for each object must not be affected by the other two.

    for (object, &expected) in text_objects.iter().zip(expected_texts.iter()) {
        assert!(test_one_overlapping_text_object_results(
            object, &page_text, expected
        )?);
    }

    Ok(())
}
524
525 fn test_one_overlapping_text_object_results(
526 object: &PdfPageObject,
527 page_text: &PdfPageText,
528 expected: &str,
529 ) -> Result<bool, PdfiumError> {
530 if let Some(txt) = object.as_text_object() {
531 assert_eq!(txt.text().trim(), expected);
532 assert_eq!(page_text.for_object(txt).trim(), expected);
533
534 for (index, char) in txt.chars(&page_text)?.iter().enumerate() {
535 assert_eq!(txt.text().chars().nth(index), char.unicode_char());
536 assert_eq!(expected.chars().nth(index), char.unicode_char());
537 }
538
539 Ok(true)
540 } else {
541 Ok(false)
542 }
543 }
544}