pdfium_render/pdf/document/page/
text.rs

1//! Defines the [PdfPageText] struct, exposing functionality related to the
2//! collection of Unicode characters visible on a single [PdfPage].
3
4pub mod char;
5pub mod chars;
6pub mod search;
7pub mod segment;
8pub mod segments;
9
10use crate::bindgen::{FPDF_TEXTPAGE, FPDF_WCHAR, FPDF_WIDESTRING};
11use crate::bindings::PdfiumLibraryBindings;
12use crate::error::PdfiumError;
13use crate::pdf::document::page::annotation::PdfPageAnnotation;
14use crate::pdf::document::page::annotation::PdfPageAnnotationCommon;
15use crate::pdf::document::page::object::private::internal::PdfPageObjectPrivate;
16use crate::pdf::document::page::object::text::PdfPageTextObject;
17use crate::pdf::document::page::object::PdfPageObjectCommon;
18use crate::pdf::document::page::objects::common::PdfPageObjectsCommon;
19use crate::pdf::document::page::text::chars::{PdfPageTextCharIndex, PdfPageTextChars};
20use crate::pdf::document::page::text::search::{PdfPageTextSearch, PdfSearchOptions};
21use crate::pdf::document::page::text::segments::PdfPageTextSegments;
22use crate::pdf::document::page::{PdfPage, PdfPageContentRegenerationStrategy, PdfPageIndexCache};
23use crate::pdf::document::pages::PdfPageIndex;
24use crate::pdf::points::PdfPoints;
25use crate::pdf::rect::PdfRect;
26use crate::utils::mem::{create_byte_buffer, create_sized_buffer};
27use crate::utils::utf16le::{
28    get_pdfium_utf16le_bytes_from_str, get_string_from_pdfium_utf16le_bytes,
29};
30use bytemuck::cast_slice;
31use std::fmt::{Display, Formatter};
32use std::os::raw::{c_double, c_int};
33use std::ptr::null_mut;
34
35/// The collection of Unicode characters visible on a single [PdfPage].
36///
37/// Use the [PdfPageText::all()] function to easily return all characters in the containing
38/// [PdfPage] in the order in which they are defined in the PDF file.
39///
40/// Use the [PdfPageText::search()] function to initialise a new [PdfPageTextSearch] object,
41/// yielding the results of searching for a target string within the character collection.
42///
43/// In complex custom layouts, the order in which characters are defined in the document
44/// and the order in which they appear visually during rendering (and thus the order in
45/// which they are read by a user) may not necessarily match.
46///
47/// [PdfPageText] implements both the [ToString] and the [Display] traits.
48pub struct PdfPageText<'a> {
49    text_page_handle: FPDF_TEXTPAGE,
50    page: &'a PdfPage<'a>,
51    bindings: &'a dyn PdfiumLibraryBindings,
52}
53
54impl<'a> PdfPageText<'a> {
55    pub(crate) fn from_pdfium(
56        text_page_handle: FPDF_TEXTPAGE,
57        page: &'a PdfPage<'a>,
58        bindings: &'a dyn PdfiumLibraryBindings,
59    ) -> Self {
60        PdfPageText {
61            text_page_handle,
62            page,
63            bindings,
64        }
65    }
66
67    /// Returns the internal `FPDF_TEXTPAGE` handle for this [PdfPageText].
68    #[inline]
69    pub(crate) fn text_page_handle(&self) -> FPDF_TEXTPAGE {
70        self.text_page_handle
71    }
72
73    /// Returns the [PdfiumLibraryBindings] used by this [PdfPageText].
74    #[inline]
75    pub fn bindings(&self) -> &'a dyn PdfiumLibraryBindings {
76        self.bindings
77    }
78
79    /// Returns the total number of characters in all text segments in the containing [PdfPage].
80    ///
81    /// The character count includes whitespace and newlines, and so may differ slightly
82    /// from the result of calling `PdfPageText::all().len()`.
83    #[inline]
84    pub fn len(&self) -> i32 {
85        self.bindings.FPDFText_CountChars(self.text_page_handle())
86    }
87
88    /// Returns `true` if there are no characters in any text box collection in the containing [PdfPage].
89    #[inline]
90    pub fn is_empty(&self) -> bool {
91        self.len() == 0
92    }
93
94    /// Returns a collection of all the `PdfPageTextSegment` text segments in the containing [PdfPage].
95    #[inline]
96    pub fn segments(&self) -> PdfPageTextSegments {
97        PdfPageTextSegments::new(self, 0, self.len(), self.bindings())
98    }
99
100    /// Returns a subset of the `PdfPageTextSegment` text segments in the containing [PdfPage].
101    /// Only text segments containing characters in the given index range will be included.
102    #[inline]
103    pub fn segments_subset(
104        &self,
105        start: PdfPageTextCharIndex,
106        count: PdfPageTextCharIndex,
107    ) -> PdfPageTextSegments {
108        PdfPageTextSegments::new(self, start as i32, count as i32, self.bindings())
109    }
110
111    /// Returns a collection of all the `PdfPageTextChar` characters in the containing [PdfPage].
112    #[inline]
113    pub fn chars(&self) -> PdfPageTextChars {
114        PdfPageTextChars::new(
115            self.page.document_handle(),
116            self.page.page_handle(),
117            self.text_page_handle(),
118            0,
119            self.len(),
120            self.bindings(),
121        )
122    }
123
124    /// Returns a collection of all the `PdfPageTextChar` characters in the given [PdfPageTextObject].
125    ///
126    /// The return result will be empty if the given [PdfPageTextObject] is not attached to the
127    /// containing [PdfPage].
128    #[inline]
129    pub fn chars_for_object(
130        &self,
131        object: &PdfPageTextObject,
132    ) -> Result<PdfPageTextChars, PdfiumError> {
133        // To avoid any possibility of returning the wrong characters in the event
134        // of overlapping text objects, we create a new page, create a copy of the target
135        // text object on the new page, and return the PdfPageTextChars object _for the
136        // copy_, rather than the object itself.
137
138        let page_index = self
139            .bindings()
140            .FPDF_GetPageCount(self.page.document_handle());
141
142        let (document_handle, start_index, end_index) = {
143            // We must avoid several potential lifetime traps. First, the newly created page
144            // and its text page must live at least as long as the PdfPageTextChars object we
145            // return; second, we need to tidy up both the text page and the page once
146            // the PdfPageTextChars object we return falls out of scope (indeed, we need to
147            // delete the newly created page from the document).
148
149            // To manage the lifetimes correctly, we give the PdfPageTextChars object itself
150            // ownership over the newly created page and its text page. The PdfPageTextChars
151            // object will take responsibility for disposing of its own parent objects
152            // when it falls out of scope, including removing the page from the document.
153
154            // We cannot transfer the ownership of a new PdfPage instance to PdfPageTextChars
155            // because PdfPageTextChars is itself created as an indirect child of a PdfPage.
156            // This creates a cyclical relationship between the two objects. To avoid intractable
157            // borrowing problems, we pass raw handles only.
158
159            // Create the new temporary page...
160
161            let page_handle = self.bindings().FPDFPage_New(
162                self.page.document_handle(),
163                page_index,
164                self.page.width().value as c_double,
165                self.page.height().value as c_double,
166            );
167
168            let mut new_page = PdfPage::from_pdfium(
169                self.page.document_handle(),
170                page_handle,
171                None,
172                None,
173                self.bindings,
174            );
175
176            PdfPageIndexCache::cache_props_for_page(
177                self.page.document_handle(),
178                page_handle,
179                page_index as PdfPageIndex,
180                PdfPageContentRegenerationStrategy::AutomaticOnEveryChange,
181            );
182
183            // ... copy the target object onto the new page...
184
185            let copy = object.try_copy_impl(self.page.document_handle(), self.bindings)?;
186            let copy = new_page.objects_mut().add_object(copy)?;
187
188            // ... get the character range for the target object's bounds...
189
190            let bounds = copy.bounds()?;
191            let text_page = new_page.text()?;
192            let tolerance_x = bounds.width() / 2.0;
193            let tolerance_y = bounds.height() / 2.0;
194            let center_height = bounds.bottom() + tolerance_y;
195
196            let start_index = Self::get_char_index_near_point(
197                text_page.text_page_handle(),
198                bounds.left(),
199                tolerance_x,
200                center_height,
201                tolerance_y,
202                self.bindings(),
203            )
204            .ok_or(PdfiumError::NoCharsInRect)?;
205
206            let end_index = Self::get_char_index_near_point(
207                text_page.text_page_handle(),
208                bounds.right(),
209                tolerance_x,
210                center_height,
211                tolerance_y,
212                self.bindings(),
213            )
214            .map(|end| end.saturating_sub(start_index))
215            .ok_or(PdfiumError::NoCharsInRect)?;
216
217            (new_page.document_handle(), start_index, end_index)
218        };
219
220        // ... and use raw handles and indices to create a new PdfPageTextChars instance
221        // that isn't bound to the lifetime of the current object.
222
223        Ok(PdfPageTextChars::new_with_owned_page(
224            document_handle,
225            page_index,
226            start_index as i32,
227            end_index as i32 + 1,
228            self.bindings(),
229        ))
230    }
231
232    /// Returns a collection of all the `PdfPageTextChar` characters in the given [PdfPageAnnotation].
233    ///
234    /// The return result will be empty if the given [PdfPageAnnotation] is not attached to the
235    /// containing [PdfPage].
236    #[inline]
237    pub fn chars_for_annotation(
238        &self,
239        annotation: &PdfPageAnnotation,
240    ) -> Result<PdfPageTextChars, PdfiumError> {
241        self.chars_inside_rect(annotation.bounds()?)
242            .map_err(|_| PdfiumError::NoCharsInAnnotation)
243    }
244
245    /// Returns a collection of all the `PdfPageTextChar` characters that lie within the bounds of
246    /// the given [PdfRect] in the containing [PdfPage].
247    #[inline]
248    pub fn chars_inside_rect(&self, rect: PdfRect) -> Result<PdfPageTextChars, PdfiumError> {
249        let tolerance_x = rect.width() / 2.0;
250        let tolerance_y = rect.height() / 2.0;
251        let center_height = rect.bottom() + tolerance_y;
252
253        let chars = self.chars();
254
255        match (
256            chars.get_char_near_point(rect.left(), tolerance_x, center_height, tolerance_y),
257            chars.get_char_near_point(rect.right(), tolerance_x, center_height, tolerance_y),
258        ) {
259            (Some(start), Some(end)) => Ok(PdfPageTextChars::new(
260                self.page.document_handle(),
261                self.page.page_handle(),
262                self.text_page_handle(),
263                start.index() as i32,
264                end.index().saturating_sub(start.index()) as i32 + 1,
265                self.bindings,
266            )),
267            _ => Err(PdfiumError::NoCharsInRect),
268        }
269    }
270
271    /// Returns the character near to the given x and y positions on the containing [PdfPage],
272    /// if any. The returned character will be no further from the given positions than the given
273    /// tolerance values.
274    pub(crate) fn get_char_index_near_point(
275        text_page_handle: FPDF_TEXTPAGE,
276        x: PdfPoints,
277        tolerance_x: PdfPoints,
278        y: PdfPoints,
279        tolerance_y: PdfPoints,
280        bindings: &dyn PdfiumLibraryBindings,
281    ) -> Option<PdfPageTextCharIndex> {
282        match bindings.FPDFText_GetCharIndexAtPos(
283            text_page_handle,
284            x.value as c_double,
285            y.value as c_double,
286            tolerance_x.value as c_double,
287            tolerance_y.value as c_double,
288        ) {
289            -1 => None, // No character at position within tolerances
290            -3 => None, // An error occurred, but we'll eat it
291            index => Some(index as PdfPageTextCharIndex),
292        }
293    }
294
295    /// Returns all characters that lie within the containing [PdfPage], in the order in which
296    /// they are defined in the document, concatenated into a single string.
297    ///
298    /// In complex custom layouts, the order in which characters are defined in the document
299    /// and the order in which they appear visually during rendering (and thus the order in
300    /// which they are read by a user) may not necessarily match.
301    pub fn all(&self) -> String {
302        self.inside_rect(self.page.page_size())
303    }
304
305    /// Returns all characters that lie within the bounds of the given [PdfRect] in the
306    /// containing [PdfPage], in the order in which they are defined in the document,
307    /// concatenated into a single string.
308    ///
309    /// In complex custom layouts, the order in which characters are defined in the document
310    /// and the order in which they appear visually during rendering (and thus the order in
311    /// which they are read by a user) may not necessarily match.
312    pub fn inside_rect(&self, rect: PdfRect) -> String {
313        // Retrieving the bounded text from Pdfium is a two-step operation. First, we call
314        // FPDFText_GetBoundedText() with a null buffer; this will retrieve the length of
315        // the bounded text in _characters_ (not _bytes_!). If the length is zero, then there is
316        // no text within the given rectangle's boundaries.
317
318        // If the length is non-zero, then we reserve a buffer (sized in words rather than bytes,
319        // to allow for two bytes per character) and call FPDFText_GetBoundedText() again with a
320        // pointer to the buffer; this will write the bounded text to the buffer in UTF16-LE format.
321
322        let left = rect.left().value as f64;
323
324        let top = rect.top().value as f64;
325
326        let right = rect.right().value as f64;
327
328        let bottom = rect.bottom().value as f64;
329
330        let chars_count = self.bindings().FPDFText_GetBoundedText(
331            self.text_page_handle(),
332            left,
333            top,
334            right,
335            bottom,
336            null_mut(),
337            0,
338        );
339
340        if chars_count == 0 {
341            // No text lies within the given rectangle.
342
343            return String::new();
344        }
345
346        let mut buffer = create_sized_buffer(chars_count as usize);
347
348        let result = self.bindings().FPDFText_GetBoundedText(
349            self.text_page_handle(),
350            left,
351            top,
352            right,
353            bottom,
354            buffer.as_mut_ptr(),
355            chars_count,
356        );
357
358        assert_eq!(result, chars_count);
359
360        get_string_from_pdfium_utf16le_bytes(cast_slice(buffer.as_slice()).to_vec())
361            .unwrap_or_default()
362    }
363
364    /// Returns all characters assigned to the given [PdfPageTextObject] in this [PdfPageText] object,
365    /// concatenated into a single string.
366    pub fn for_object(&self, object: &PdfPageTextObject) -> String {
367        // Retrieving the string value from Pdfium is a two-step operation. First, we call
368        // FPDFTextObj_GetText() with a null buffer; this will retrieve the length of
369        // the text in bytes, assuming the page object exists. If the length is zero,
370        // then there is no text.
371
372        // If the length is non-zero, then we reserve a byte buffer of the given
373        // length and call FPDFTextObj_GetText() again with a pointer to the buffer;
374        // this will write the text for the page object into the buffer.
375
376        let buffer_length = self.bindings().FPDFTextObj_GetText(
377            object.object_handle(),
378            self.text_page_handle(),
379            null_mut(),
380            0,
381        );
382
383        if buffer_length == 0 {
384            // There is no text.
385
386            return String::new();
387        }
388
389        let mut buffer = create_byte_buffer(buffer_length as usize);
390
391        let result = self.bindings().FPDFTextObj_GetText(
392            object.object_handle(),
393            self.text_page_handle(),
394            buffer.as_mut_ptr() as *mut FPDF_WCHAR,
395            buffer_length,
396        );
397
398        assert_eq!(result, buffer_length);
399
400        get_string_from_pdfium_utf16le_bytes(buffer).unwrap_or_default()
401    }
402
403    /// Returns all characters that lie within the bounds of the given [PdfPageAnnotation] in the
404    /// containing [PdfPage], in the order in which they are defined in the document,
405    /// concatenated into a single string.
406    ///
407    /// In complex custom layouts, the order in which characters are defined in the document
408    /// and the order in which they appear visually during rendering (and thus the order in
409    /// which they are read by a user) may not necessarily match.
410    #[inline]
411    pub fn for_annotation(&self, annotation: &PdfPageAnnotation) -> Result<String, PdfiumError> {
412        let bounds = annotation.bounds()?;
413
414        Ok(self.inside_rect(bounds))
415    }
416
417    /// Starts a search for the given text string, returning a new [PdfPageTextSearch]
418    /// object that can be used to step through the search results.
419    #[inline]
420    pub fn search(&self, text: &str, options: &PdfSearchOptions) -> PdfPageTextSearch {
421        self.search_from(text, options, 0)
422    }
423
424    /// Starts a search for the given test string from the given character position,
425    /// returning a new [PdfPageTextSearch] object that can be used to step through
426    /// the search results.
427    pub fn search_from(
428        &self,
429        text: &str,
430        options: &PdfSearchOptions,
431        index: PdfPageTextCharIndex,
432    ) -> PdfPageTextSearch {
433        PdfPageTextSearch::from_pdfium(
434            self.bindings().FPDFText_FindStart(
435                self.text_page_handle(),
436                get_pdfium_utf16le_bytes_from_str(text).as_ptr() as FPDF_WIDESTRING,
437                options.as_pdfium(),
438                index as c_int,
439            ),
440            self,
441            self.bindings(),
442        )
443    }
444}
445
446impl<'a> Display for PdfPageText<'a> {
447    #[inline]
448    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
449        f.write_str(self.all().as_str())
450    }
451}
452
453impl<'a> Drop for PdfPageText<'a> {
454    /// Closes the [PdfPageText] collection, releasing held memory.
455    #[inline]
456    fn drop(&mut self) {
457        self.bindings().FPDFText_ClosePage(self.text_page_handle());
458    }
459}
460
#[cfg(test)]
mod tests {
    use crate::prelude::*;
    use crate::utils::test::test_bind_to_pdfium;

    /// Checks that `.chars_for_object()` returns only the characters of the requested text
    /// object when several text objects occupy the same position on the page.
    /// For more details, see: https://github.com/ajrcarey/pdfium-render/issues/98
    #[test]
    fn test_overlapping_chars_results() -> Result<(), PdfiumError> {
        let pdfium = test_bind_to_pdfium();

        // Build a one-page document holding three text objects, all placed at the origin
        // so that they overlap.

        let mut document = pdfium.create_new_pdf()?;

        let mut page = document
            .pages_mut()
            .create_page_at_start(PdfPagePaperSize::a4())?;

        let font = document.fonts_mut().courier();
        let font_size = PdfPoints::new(10.0);

        let txt1 = page.objects_mut().create_text_object(
            PdfPoints::ZERO,
            PdfPoints::ZERO,
            "AAAAAA",
            font,
            font_size,
        )?;

        let txt2 = page.objects_mut().create_text_object(
            PdfPoints::ZERO,
            PdfPoints::ZERO,
            "BBBBBB",
            font,
            font_size,
        )?;

        let txt3 = page.objects_mut().create_text_object(
            PdfPoints::ZERO,
            PdfPoints::ZERO,
            "CDCDCDE",
            font,
            font_size,
        )?;

        let page_text = page.text()?;

        // Each object must yield exactly its own text, regardless of the overlap.

        let cases = [(&txt1, "AAAAAA"), (&txt2, "BBBBBB"), (&txt3, "CDCDCDE")];

        for &(object, expected) in cases.iter() {
            assert!(test_one_overlapping_text_object_results(
                object, &page_text, expected
            )?);
        }

        Ok(())
    }

    /// Compares the text of one page object against the expected string, both as a whole
    /// and character by character. Returns `Ok(false)` if the object is not a text object.
    fn test_one_overlapping_text_object_results(
        object: &PdfPageObject,
        page_text: &PdfPageText,
        expected: &str,
    ) -> Result<bool, PdfiumError> {
        let txt = match object.as_text_object() {
            Some(txt) => txt,
            None => return Ok(false),
        };

        assert_eq!(txt.text().trim(), expected);
        assert_eq!(page_text.for_object(txt).trim(), expected);

        for (index, page_char) in txt.chars(page_text)?.iter().enumerate() {
            let actual = page_char.unicode_char();

            assert_eq!(txt.text().chars().nth(index), actual);
            assert_eq!(expected.chars().nth(index), actual);
        }

        Ok(true)
    }
}
544}