pdfium_render/pdf/document/page/
text.rs

1//! Defines the [PdfPageText] struct, exposing functionality related to the
2//! collection of Unicode characters visible on a single [PdfPage].
3
4pub mod char;
5pub mod chars;
6pub mod search;
7pub mod segment;
8pub mod segments;
9
10use crate::bindgen::{FPDF_TEXTPAGE, FPDF_WCHAR, FPDF_WIDESTRING};
11use crate::bindings::PdfiumLibraryBindings;
12use crate::error::PdfiumError;
13use crate::pdf::document::page::annotation::PdfPageAnnotation;
14use crate::pdf::document::page::annotation::PdfPageAnnotationCommon;
15use crate::pdf::document::page::object::private::internal::PdfPageObjectPrivate;
16use crate::pdf::document::page::object::text::PdfPageTextObject;
17use crate::pdf::document::page::text::chars::{PdfPageTextCharIndex, PdfPageTextChars};
18use crate::pdf::document::page::text::search::{PdfPageTextSearch, PdfSearchOptions};
19use crate::pdf::document::page::text::segments::PdfPageTextSegments;
20use crate::pdf::document::page::PdfPage;
21use crate::pdf::points::PdfPoints;
22use crate::pdf::rect::PdfRect;
23use crate::utils::mem::{create_byte_buffer, create_sized_buffer};
24use crate::utils::utf16le::{
25    get_pdfium_utf16le_bytes_from_str, get_string_from_pdfium_utf16le_bytes,
26};
27use bytemuck::cast_slice;
28use std::fmt::{Display, Formatter};
29use std::os::raw::{c_double, c_int};
30use std::ptr::null_mut;
31
32#[cfg(any(
33    feature = "pdfium_future",
34    feature = "pdfium_7350",
35    feature = "pdfium_7215",
36    feature = "pdfium_7123",
37    feature = "pdfium_6996",
38    feature = "pdfium_6721",
39    feature = "pdfium_6666",
40    feature = "pdfium_6611",
41))]
42use crate::pdf::document::page::object::PdfPageObjectCommon;
43
44/// The collection of Unicode characters visible on a single [PdfPage].
45///
46/// Use the [PdfPageText::all()] function to easily return all characters in the containing
47/// [PdfPage] in the order in which they are defined in the PDF file.
48///
49/// Use the [PdfPageText::search()] function to initialise a new [PdfPageTextSearch] object,
50/// yielding the results of searching for a target string within the character collection.
51///
52/// In complex custom layouts, the order in which characters are defined in the document
53/// and the order in which they appear visually during rendering (and thus the order in
54/// which they are read by a user) may not necessarily match.
55///
56/// [PdfPageText] implements both the [ToString] and the [Display] traits.
57pub struct PdfPageText<'a> {
58    text_page_handle: FPDF_TEXTPAGE,
59    page: &'a PdfPage<'a>,
60    bindings: &'a dyn PdfiumLibraryBindings,
61}
62
63impl<'a> PdfPageText<'a> {
64    pub(crate) fn from_pdfium(
65        text_page_handle: FPDF_TEXTPAGE,
66        page: &'a PdfPage<'a>,
67        bindings: &'a dyn PdfiumLibraryBindings,
68    ) -> Self {
69        PdfPageText {
70            text_page_handle,
71            page,
72            bindings,
73        }
74    }
75
76    /// Returns the internal `FPDF_TEXTPAGE` handle for this [PdfPageText].
77    #[inline]
78    pub(crate) fn text_page_handle(&self) -> FPDF_TEXTPAGE {
79        self.text_page_handle
80    }
81
82    /// Returns the [PdfiumLibraryBindings] used by this [PdfPageText].
83    #[inline]
84    pub fn bindings(&self) -> &'a dyn PdfiumLibraryBindings {
85        self.bindings
86    }
87
88    /// Returns the total number of characters in all text segments in the containing [PdfPage].
89    ///
90    /// The character count includes whitespace and newlines, and so may differ slightly
91    /// from the result of calling `PdfPageText::all().len()`.
92    #[inline]
93    pub fn len(&self) -> i32 {
94        self.bindings.FPDFText_CountChars(self.text_page_handle())
95    }
96
97    /// Returns `true` if there are no characters in any text box collection in the containing [PdfPage].
98    #[inline]
99    pub fn is_empty(&self) -> bool {
100        self.len() == 0
101    }
102
103    /// Returns a collection of all the `PdfPageTextSegment` text segments in the containing [PdfPage].
104    #[inline]
105    pub fn segments(&self) -> PdfPageTextSegments {
106        PdfPageTextSegments::new(self, 0, self.len(), self.bindings())
107    }
108
109    /// Returns a subset of the `PdfPageTextSegment` text segments in the containing [PdfPage].
110    /// Only text segments containing characters in the given index range will be included.
111    #[inline]
112    pub fn segments_subset(
113        &self,
114        start: PdfPageTextCharIndex,
115        count: PdfPageTextCharIndex,
116    ) -> PdfPageTextSegments {
117        PdfPageTextSegments::new(self, start as i32, count as i32, self.bindings())
118    }
119
120    /// Returns a collection of all the `PdfPageTextChar` characters in the containing [PdfPage].
121    #[inline]
122    pub fn chars(&self) -> PdfPageTextChars {
123        PdfPageTextChars::new(
124            self.page.document_handle(),
125            self.page.page_handle(),
126            self.text_page_handle(),
127            (0..self.len()).collect(),
128            self.bindings(),
129        )
130    }
131
132    #[cfg(any(
133        feature = "pdfium_future",
134        feature = "pdfium_7350",
135        feature = "pdfium_7215",
136        feature = "pdfium_7123",
137        feature = "pdfium_6996",
138        feature = "pdfium_6721",
139        feature = "pdfium_6666",
140        feature = "pdfium_6611",
141    ))]
142    /// Returns a collection of all the `PdfPageTextChar` characters in the given [PdfPageTextObject].
143    ///
144    /// The return result will be empty if the given [PdfPageTextObject] is not attached to the
145    /// containing [PdfPage].
146    #[inline]
147    pub fn chars_for_object(
148        &self,
149        object: &PdfPageTextObject,
150    ) -> Result<PdfPageTextChars, PdfiumError> {
151        let chars_inside_bounds = self
152            .chars_inside_rect(object.bounds()?.to_rect())
153            .map_err(|_| PdfiumError::NoCharsInPageObject)?;
154
155        // The collection contains _all_ characters inside the bounds of the
156        // given text object, including characters from any overlapping objects.
157        // Filter the collection so it contains only characters from the
158        // given text object.
159
160        Ok(PdfPageTextChars::new(
161            self.page.document_handle(),
162            self.page.page_handle(),
163            self.text_page_handle(),
164            chars_inside_bounds
165                .iter()
166                .filter(|char| {
167                    self.bindings
168                        .FPDFText_GetTextObject(self.text_page_handle(), char.index() as i32)
169                        == object.object_handle()
170                })
171                .map(|char| char.index() as i32)
172                .collect(),
173            self.bindings(),
174        ))
175    }
176
177    /// Returns a collection of all the `PdfPageTextChar` characters in the given [PdfPageAnnotation].
178    ///
179    /// The return result will be empty if the given [PdfPageAnnotation] is not attached to the
180    /// containing [PdfPage].
181    #[inline]
182    pub fn chars_for_annotation(
183        &self,
184        annotation: &PdfPageAnnotation,
185    ) -> Result<PdfPageTextChars, PdfiumError> {
186        self.chars_inside_rect(annotation.bounds()?)
187            .map_err(|_| PdfiumError::NoCharsInAnnotation)
188    }
189
190    /// Returns a collection of all the `PdfPageTextChar` characters that lie within the bounds of
191    /// the given [PdfRect] in the containing [PdfPage].
192    #[inline]
193    pub fn chars_inside_rect(&self, rect: PdfRect) -> Result<PdfPageTextChars, PdfiumError> {
194        let tolerance_x = rect.width() / 2.0;
195        let tolerance_y = rect.height() / 2.0;
196        let center_height = rect.bottom() + tolerance_y;
197
198        let chars = self.chars();
199
200        match (
201            chars.get_char_near_point(rect.left(), tolerance_x, center_height, tolerance_y),
202            chars.get_char_near_point(rect.right(), tolerance_x, center_height, tolerance_y),
203        ) {
204            (Some(start), Some(end)) => Ok(PdfPageTextChars::new(
205                self.page.document_handle(),
206                self.page.page_handle(),
207                self.text_page_handle(),
208                (start.index() as i32..end.index().saturating_sub(start.index()) as i32 + 1)
209                    .collect(),
210                self.bindings,
211            )),
212            _ => Err(PdfiumError::NoCharsInRect),
213        }
214    }
215
216    /// Returns the character near to the given x and y positions on the containing [PdfPage],
217    /// if any. The returned character will be no further from the given positions than the given
218    /// tolerance values.
219    pub(crate) fn get_char_index_near_point(
220        text_page_handle: FPDF_TEXTPAGE,
221        x: PdfPoints,
222        tolerance_x: PdfPoints,
223        y: PdfPoints,
224        tolerance_y: PdfPoints,
225        bindings: &dyn PdfiumLibraryBindings,
226    ) -> Option<PdfPageTextCharIndex> {
227        match bindings.FPDFText_GetCharIndexAtPos(
228            text_page_handle,
229            x.value as c_double,
230            y.value as c_double,
231            tolerance_x.value as c_double,
232            tolerance_y.value as c_double,
233        ) {
234            -1 => None, // No character at position within tolerances
235            -3 => None, // An error occurred, but we'll eat it
236            index => Some(index as PdfPageTextCharIndex),
237        }
238    }
239
240    /// Returns all characters that lie within the containing [PdfPage], in the order in which
241    /// they are defined in the document, concatenated into a single string.
242    ///
243    /// In complex custom layouts, the order in which characters are defined in the document
244    /// and the order in which they appear visually during rendering (and thus the order in
245    /// which they are read by a user) may not necessarily match.
246    pub fn all(&self) -> String {
247        self.inside_rect(self.page.page_size())
248    }
249
250    /// Returns all characters that lie within the bounds of the given [PdfRect] in the
251    /// containing [PdfPage], in the order in which they are defined in the document,
252    /// concatenated into a single string.
253    ///
254    /// In complex custom layouts, the order in which characters are defined in the document
255    /// and the order in which they appear visually during rendering (and thus the order in
256    /// which they are read by a user) may not necessarily match.
257    pub fn inside_rect(&self, rect: PdfRect) -> String {
258        // Retrieving the bounded text from Pdfium is a two-step operation. First, we call
259        // FPDFText_GetBoundedText() with a null buffer; this will retrieve the length of
260        // the bounded text in _characters_ (not _bytes_!). If the length is zero, then there is
261        // no text within the given rectangle's boundaries.
262
263        // If the length is non-zero, then we reserve a buffer (sized in words rather than bytes,
264        // to allow for two bytes per character) and call FPDFText_GetBoundedText() again with a
265        // pointer to the buffer; this will write the bounded text to the buffer in UTF16-LE format.
266
267        let left = rect.left().value as f64;
268
269        let top = rect.top().value as f64;
270
271        let right = rect.right().value as f64;
272
273        let bottom = rect.bottom().value as f64;
274
275        let chars_count = self.bindings().FPDFText_GetBoundedText(
276            self.text_page_handle(),
277            left,
278            top,
279            right,
280            bottom,
281            null_mut(),
282            0,
283        );
284
285        if chars_count == 0 {
286            // No text lies within the given rectangle.
287
288            return String::new();
289        }
290
291        let mut buffer = create_sized_buffer(chars_count as usize);
292
293        let result = self.bindings().FPDFText_GetBoundedText(
294            self.text_page_handle(),
295            left,
296            top,
297            right,
298            bottom,
299            buffer.as_mut_ptr(),
300            chars_count,
301        );
302
303        assert_eq!(result, chars_count);
304
305        get_string_from_pdfium_utf16le_bytes(cast_slice(buffer.as_slice()).to_vec())
306            .unwrap_or_default()
307    }
308
309    /// Returns all characters assigned to the given [PdfPageTextObject] in this [PdfPageText] object,
310    /// concatenated into a single string.
311    pub fn for_object(&self, object: &PdfPageTextObject) -> String {
312        // Retrieving the string value from Pdfium is a two-step operation. First, we call
313        // FPDFTextObj_GetText() with a null buffer; this will retrieve the length of
314        // the text in bytes, assuming the page object exists. If the length is zero,
315        // then there is no text.
316
317        // If the length is non-zero, then we reserve a byte buffer of the given
318        // length and call FPDFTextObj_GetText() again with a pointer to the buffer;
319        // this will write the text for the page object into the buffer.
320
321        let buffer_length = self.bindings().FPDFTextObj_GetText(
322            object.object_handle(),
323            self.text_page_handle(),
324            null_mut(),
325            0,
326        );
327
328        if buffer_length == 0 {
329            // There is no text.
330
331            return String::new();
332        }
333
334        let mut buffer = create_byte_buffer(buffer_length as usize);
335
336        let result = self.bindings().FPDFTextObj_GetText(
337            object.object_handle(),
338            self.text_page_handle(),
339            buffer.as_mut_ptr() as *mut FPDF_WCHAR,
340            buffer_length,
341        );
342
343        assert_eq!(result, buffer_length);
344
345        get_string_from_pdfium_utf16le_bytes(buffer).unwrap_or_default()
346    }
347
348    /// Returns all characters that lie within the bounds of the given [PdfPageAnnotation] in the
349    /// containing [PdfPage], in the order in which they are defined in the document,
350    /// concatenated into a single string.
351    ///
352    /// In complex custom layouts, the order in which characters are defined in the document
353    /// and the order in which they appear visually during rendering (and thus the order in
354    /// which they are read by a user) may not necessarily match.
355    #[inline]
356    pub fn for_annotation(&self, annotation: &PdfPageAnnotation) -> Result<String, PdfiumError> {
357        let bounds = annotation.bounds()?;
358
359        Ok(self.inside_rect(bounds))
360    }
361
362    /// Starts a search for the given text string, returning a new [PdfPageTextSearch]
363    /// object that can be used to step through the search results.
364    #[inline]
365    pub fn search(
366        &self,
367        text: &str,
368        options: &PdfSearchOptions,
369    ) -> Result<PdfPageTextSearch, PdfiumError> {
370        self.search_from(text, options, 0)
371    }
372
373    /// Starts a search for the given test string from the given character position,
374    /// returning a new [PdfPageTextSearch] object that can be used to step through
375    /// the search results.
376    pub fn search_from(
377        &self,
378        text: &str,
379        options: &PdfSearchOptions,
380        index: PdfPageTextCharIndex,
381    ) -> Result<PdfPageTextSearch, PdfiumError> {
382        if text.is_empty() {
383            Err(PdfiumError::TextSearchTargetIsEmpty)
384        } else {
385            Ok(PdfPageTextSearch::from_pdfium(
386                self.bindings().FPDFText_FindStart(
387                    self.text_page_handle(),
388                    get_pdfium_utf16le_bytes_from_str(text).as_ptr() as FPDF_WIDESTRING,
389                    options.as_pdfium(),
390                    index as c_int,
391                ),
392                self,
393                self.bindings(),
394            ))
395        }
396    }
397}
398
399impl<'a> Display for PdfPageText<'a> {
400    #[inline]
401    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
402        f.write_str(self.all().as_str())
403    }
404}
405
406impl<'a> Drop for PdfPageText<'a> {
407    /// Closes the [PdfPageText] collection, releasing held memory.
408    #[inline]
409    fn drop(&mut self) {
410        self.bindings().FPDFText_ClosePage(self.text_page_handle());
411    }
412}
413
414#[cfg(test)]
415mod tests {
416    use crate::prelude::*;
417    use crate::utils::test::test_bind_to_pdfium;
418
419    #[test]
420    fn test_overlapping_chars_results() -> Result<(), PdfiumError> {
421        // Test to make sure the result of the .chars_for_object() function returns the
422        // correct results in the event of overlapping text objects.
423        // For more details, see: https://github.com/ajrcarey/pdfium-render/issues/98
424
425        let pdfium = test_bind_to_pdfium();
426
427        // Create a new document with three overlapping text objects.
428
429        let mut document = pdfium.create_new_pdf()?;
430
431        let mut page = document
432            .pages_mut()
433            .create_page_at_start(PdfPagePaperSize::a4())?;
434
435        let font = document.fonts_mut().courier();
436
437        let txt1 = page.objects_mut().create_text_object(
438            PdfPoints::ZERO,
439            PdfPoints::ZERO,
440            "AAAAAA",
441            font,
442            PdfPoints::new(10.0),
443        )?;
444
445        let txt2 = page.objects_mut().create_text_object(
446            PdfPoints::ZERO,
447            PdfPoints::ZERO,
448            "BBBBBB",
449            font,
450            PdfPoints::new(10.0),
451        )?;
452
453        let txt3 = page.objects_mut().create_text_object(
454            PdfPoints::ZERO,
455            PdfPoints::ZERO,
456            "CDCDCDE",
457            font,
458            PdfPoints::new(10.0),
459        )?;
460
461        let page_text = page.text()?;
462
463        // Check the results for all three objects are not affected by overlapping.
464
465        assert!(test_one_overlapping_text_object_results(
466            &txt1, &page_text, "AAAAAA"
467        )?);
468        assert!(test_one_overlapping_text_object_results(
469            &txt2, &page_text, "BBBBBB"
470        )?);
471        assert!(test_one_overlapping_text_object_results(
472            &txt3, &page_text, "CDCDCDE"
473        )?);
474
475        Ok(())
476    }
477
478    fn test_one_overlapping_text_object_results(
479        object: &PdfPageObject,
480        page_text: &PdfPageText,
481        expected: &str,
482    ) -> Result<bool, PdfiumError> {
483        if let Some(txt) = object.as_text_object() {
484            assert_eq!(txt.text().trim(), expected);
485            assert_eq!(page_text.for_object(txt).trim(), expected);
486
487            for (index, char) in txt.chars(&page_text)?.iter().enumerate() {
488                assert_eq!(txt.text().chars().nth(index), char.unicode_char());
489                assert_eq!(expected.chars().nth(index), char.unicode_char());
490            }
491
492            Ok(true)
493        } else {
494            Ok(false)
495        }
496    }
497}