pdfium_render/pdf/document/page/
text.rs

1//! Defines the [PdfPageText] struct, exposing functionality related to the
2//! collection of Unicode characters visible on a single [PdfPage].
3
4pub mod char;
5pub mod chars;
6pub mod search;
7pub mod segment;
8pub mod segments;
9
10use crate::bindgen::{FPDF_TEXTPAGE, FPDF_WCHAR, FPDF_WIDESTRING};
11use crate::bindings::PdfiumLibraryBindings;
12use crate::error::PdfiumError;
13use crate::pdf::document::page::annotation::PdfPageAnnotation;
14use crate::pdf::document::page::annotation::PdfPageAnnotationCommon;
15use crate::pdf::document::page::object::private::internal::PdfPageObjectPrivate;
16use crate::pdf::document::page::object::text::PdfPageTextObject;
17use crate::pdf::document::page::text::chars::{PdfPageTextCharIndex, PdfPageTextChars};
18use crate::pdf::document::page::text::search::{PdfPageTextSearch, PdfSearchOptions};
19use crate::pdf::document::page::text::segments::PdfPageTextSegments;
20use crate::pdf::document::page::PdfPage;
21use crate::pdf::points::PdfPoints;
22use crate::pdf::rect::PdfRect;
23use crate::utils::mem::{create_byte_buffer, create_sized_buffer};
24use crate::utils::utf16le::{
25    get_pdfium_utf16le_bytes_from_str, get_string_from_pdfium_utf16le_bytes,
26};
27use bytemuck::cast_slice;
28use std::fmt::{Display, Formatter};
29use std::os::raw::{c_double, c_int};
30use std::ptr::null_mut;
31
32#[cfg(any(
33    feature = "pdfium_future",
34    feature = "pdfium_7123",
35    feature = "pdfium_6996",
36    feature = "pdfium_6721",
37    feature = "pdfium_6666",
38    feature = "pdfium_6611",
39))]
40use crate::pdf::document::page::object::PdfPageObjectCommon;
41
42/// The collection of Unicode characters visible on a single [PdfPage].
43///
44/// Use the [PdfPageText::all()] function to easily return all characters in the containing
45/// [PdfPage] in the order in which they are defined in the PDF file.
46///
47/// Use the [PdfPageText::search()] function to initialise a new [PdfPageTextSearch] object,
48/// yielding the results of searching for a target string within the character collection.
49///
50/// In complex custom layouts, the order in which characters are defined in the document
51/// and the order in which they appear visually during rendering (and thus the order in
52/// which they are read by a user) may not necessarily match.
53///
54/// [PdfPageText] implements both the [ToString] and the [Display] traits.
55pub struct PdfPageText<'a> {
56    text_page_handle: FPDF_TEXTPAGE,
57    page: &'a PdfPage<'a>,
58    bindings: &'a dyn PdfiumLibraryBindings,
59}
60
61impl<'a> PdfPageText<'a> {
62    pub(crate) fn from_pdfium(
63        text_page_handle: FPDF_TEXTPAGE,
64        page: &'a PdfPage<'a>,
65        bindings: &'a dyn PdfiumLibraryBindings,
66    ) -> Self {
67        PdfPageText {
68            text_page_handle,
69            page,
70            bindings,
71        }
72    }
73
74    /// Returns the internal `FPDF_TEXTPAGE` handle for this [PdfPageText].
75    #[inline]
76    pub(crate) fn text_page_handle(&self) -> FPDF_TEXTPAGE {
77        self.text_page_handle
78    }
79
80    /// Returns the [PdfiumLibraryBindings] used by this [PdfPageText].
81    #[inline]
82    pub fn bindings(&self) -> &'a dyn PdfiumLibraryBindings {
83        self.bindings
84    }
85
86    /// Returns the total number of characters in all text segments in the containing [PdfPage].
87    ///
88    /// The character count includes whitespace and newlines, and so may differ slightly
89    /// from the result of calling `PdfPageText::all().len()`.
90    #[inline]
91    pub fn len(&self) -> i32 {
92        self.bindings.FPDFText_CountChars(self.text_page_handle())
93    }
94
95    /// Returns `true` if there are no characters in any text box collection in the containing [PdfPage].
96    #[inline]
97    pub fn is_empty(&self) -> bool {
98        self.len() == 0
99    }
100
101    /// Returns a collection of all the `PdfPageTextSegment` text segments in the containing [PdfPage].
102    #[inline]
103    pub fn segments(&self) -> PdfPageTextSegments {
104        PdfPageTextSegments::new(self, 0, self.len(), self.bindings())
105    }
106
107    /// Returns a subset of the `PdfPageTextSegment` text segments in the containing [PdfPage].
108    /// Only text segments containing characters in the given index range will be included.
109    #[inline]
110    pub fn segments_subset(
111        &self,
112        start: PdfPageTextCharIndex,
113        count: PdfPageTextCharIndex,
114    ) -> PdfPageTextSegments {
115        PdfPageTextSegments::new(self, start as i32, count as i32, self.bindings())
116    }
117
118    /// Returns a collection of all the `PdfPageTextChar` characters in the containing [PdfPage].
119    #[inline]
120    pub fn chars(&self) -> PdfPageTextChars {
121        PdfPageTextChars::new(
122            self.page.document_handle(),
123            self.page.page_handle(),
124            self.text_page_handle(),
125            (0..self.len()).collect(),
126            self.bindings(),
127        )
128    }
129
130    #[cfg(any(
131        feature = "pdfium_future",
132        feature = "pdfium_7123",
133        feature = "pdfium_6996",
134        feature = "pdfium_6721",
135        feature = "pdfium_6666",
136        feature = "pdfium_6611",
137    ))]
138    /// Returns a collection of all the `PdfPageTextChar` characters in the given [PdfPageTextObject].
139    ///
140    /// The return result will be empty if the given [PdfPageTextObject] is not attached to the
141    /// containing [PdfPage].
142    #[inline]
143    pub fn chars_for_object(
144        &self,
145        object: &PdfPageTextObject,
146    ) -> Result<PdfPageTextChars, PdfiumError> {
147        let chars_inside_bounds = self
148            .chars_inside_rect(object.bounds()?.to_rect())
149            .map_err(|_| PdfiumError::NoCharsInPageObject)?;
150
151        // The collection contains _all_ characters inside the bounds of the
152        // given text object, including characters from any overlapping objects.
153        // Filter the collection so it contains only characters from the
154        // given text object.
155
156        Ok(PdfPageTextChars::new(
157            self.page.document_handle(),
158            self.page.page_handle(),
159            self.text_page_handle(),
160            chars_inside_bounds
161                .iter()
162                .filter(|char| {
163                    self.bindings
164                        .FPDFText_GetTextObject(self.text_page_handle(), char.index() as i32)
165                        == object.object_handle()
166                })
167                .map(|char| char.index() as i32)
168                .collect(),
169            self.bindings(),
170        ))
171    }
172
173    /// Returns a collection of all the `PdfPageTextChar` characters in the given [PdfPageAnnotation].
174    ///
175    /// The return result will be empty if the given [PdfPageAnnotation] is not attached to the
176    /// containing [PdfPage].
177    #[inline]
178    pub fn chars_for_annotation(
179        &self,
180        annotation: &PdfPageAnnotation,
181    ) -> Result<PdfPageTextChars, PdfiumError> {
182        self.chars_inside_rect(annotation.bounds()?)
183            .map_err(|_| PdfiumError::NoCharsInAnnotation)
184    }
185
186    /// Returns a collection of all the `PdfPageTextChar` characters that lie within the bounds of
187    /// the given [PdfRect] in the containing [PdfPage].
188    #[inline]
189    pub fn chars_inside_rect(&self, rect: PdfRect) -> Result<PdfPageTextChars, PdfiumError> {
190        let tolerance_x = rect.width() / 2.0;
191        let tolerance_y = rect.height() / 2.0;
192        let center_height = rect.bottom() + tolerance_y;
193
194        let chars = self.chars();
195
196        match (
197            chars.get_char_near_point(rect.left(), tolerance_x, center_height, tolerance_y),
198            chars.get_char_near_point(rect.right(), tolerance_x, center_height, tolerance_y),
199        ) {
200            (Some(start), Some(end)) => Ok(PdfPageTextChars::new(
201                self.page.document_handle(),
202                self.page.page_handle(),
203                self.text_page_handle(),
204                (start.index() as i32..end.index().saturating_sub(start.index()) as i32 + 1)
205                    .collect(),
206                self.bindings,
207            )),
208            _ => Err(PdfiumError::NoCharsInRect),
209        }
210    }
211
212    /// Returns the character near to the given x and y positions on the containing [PdfPage],
213    /// if any. The returned character will be no further from the given positions than the given
214    /// tolerance values.
215    pub(crate) fn get_char_index_near_point(
216        text_page_handle: FPDF_TEXTPAGE,
217        x: PdfPoints,
218        tolerance_x: PdfPoints,
219        y: PdfPoints,
220        tolerance_y: PdfPoints,
221        bindings: &dyn PdfiumLibraryBindings,
222    ) -> Option<PdfPageTextCharIndex> {
223        match bindings.FPDFText_GetCharIndexAtPos(
224            text_page_handle,
225            x.value as c_double,
226            y.value as c_double,
227            tolerance_x.value as c_double,
228            tolerance_y.value as c_double,
229        ) {
230            -1 => None, // No character at position within tolerances
231            -3 => None, // An error occurred, but we'll eat it
232            index => Some(index as PdfPageTextCharIndex),
233        }
234    }
235
236    /// Returns all characters that lie within the containing [PdfPage], in the order in which
237    /// they are defined in the document, concatenated into a single string.
238    ///
239    /// In complex custom layouts, the order in which characters are defined in the document
240    /// and the order in which they appear visually during rendering (and thus the order in
241    /// which they are read by a user) may not necessarily match.
242    pub fn all(&self) -> String {
243        self.inside_rect(self.page.page_size())
244    }
245
246    /// Returns all characters that lie within the bounds of the given [PdfRect] in the
247    /// containing [PdfPage], in the order in which they are defined in the document,
248    /// concatenated into a single string.
249    ///
250    /// In complex custom layouts, the order in which characters are defined in the document
251    /// and the order in which they appear visually during rendering (and thus the order in
252    /// which they are read by a user) may not necessarily match.
253    pub fn inside_rect(&self, rect: PdfRect) -> String {
254        // Retrieving the bounded text from Pdfium is a two-step operation. First, we call
255        // FPDFText_GetBoundedText() with a null buffer; this will retrieve the length of
256        // the bounded text in _characters_ (not _bytes_!). If the length is zero, then there is
257        // no text within the given rectangle's boundaries.
258
259        // If the length is non-zero, then we reserve a buffer (sized in words rather than bytes,
260        // to allow for two bytes per character) and call FPDFText_GetBoundedText() again with a
261        // pointer to the buffer; this will write the bounded text to the buffer in UTF16-LE format.
262
263        let left = rect.left().value as f64;
264
265        let top = rect.top().value as f64;
266
267        let right = rect.right().value as f64;
268
269        let bottom = rect.bottom().value as f64;
270
271        let chars_count = self.bindings().FPDFText_GetBoundedText(
272            self.text_page_handle(),
273            left,
274            top,
275            right,
276            bottom,
277            null_mut(),
278            0,
279        );
280
281        if chars_count == 0 {
282            // No text lies within the given rectangle.
283
284            return String::new();
285        }
286
287        let mut buffer = create_sized_buffer(chars_count as usize);
288
289        let result = self.bindings().FPDFText_GetBoundedText(
290            self.text_page_handle(),
291            left,
292            top,
293            right,
294            bottom,
295            buffer.as_mut_ptr(),
296            chars_count,
297        );
298
299        assert_eq!(result, chars_count);
300
301        get_string_from_pdfium_utf16le_bytes(cast_slice(buffer.as_slice()).to_vec())
302            .unwrap_or_default()
303    }
304
305    /// Returns all characters assigned to the given [PdfPageTextObject] in this [PdfPageText] object,
306    /// concatenated into a single string.
307    pub fn for_object(&self, object: &PdfPageTextObject) -> String {
308        // Retrieving the string value from Pdfium is a two-step operation. First, we call
309        // FPDFTextObj_GetText() with a null buffer; this will retrieve the length of
310        // the text in bytes, assuming the page object exists. If the length is zero,
311        // then there is no text.
312
313        // If the length is non-zero, then we reserve a byte buffer of the given
314        // length and call FPDFTextObj_GetText() again with a pointer to the buffer;
315        // this will write the text for the page object into the buffer.
316
317        let buffer_length = self.bindings().FPDFTextObj_GetText(
318            object.object_handle(),
319            self.text_page_handle(),
320            null_mut(),
321            0,
322        );
323
324        if buffer_length == 0 {
325            // There is no text.
326
327            return String::new();
328        }
329
330        let mut buffer = create_byte_buffer(buffer_length as usize);
331
332        let result = self.bindings().FPDFTextObj_GetText(
333            object.object_handle(),
334            self.text_page_handle(),
335            buffer.as_mut_ptr() as *mut FPDF_WCHAR,
336            buffer_length,
337        );
338
339        assert_eq!(result, buffer_length);
340
341        get_string_from_pdfium_utf16le_bytes(buffer).unwrap_or_default()
342    }
343
344    /// Returns all characters that lie within the bounds of the given [PdfPageAnnotation] in the
345    /// containing [PdfPage], in the order in which they are defined in the document,
346    /// concatenated into a single string.
347    ///
348    /// In complex custom layouts, the order in which characters are defined in the document
349    /// and the order in which they appear visually during rendering (and thus the order in
350    /// which they are read by a user) may not necessarily match.
351    #[inline]
352    pub fn for_annotation(&self, annotation: &PdfPageAnnotation) -> Result<String, PdfiumError> {
353        let bounds = annotation.bounds()?;
354
355        Ok(self.inside_rect(bounds))
356    }
357
358    /// Starts a search for the given text string, returning a new [PdfPageTextSearch]
359    /// object that can be used to step through the search results.
360    #[inline]
361    pub fn search(&self, text: &str, options: &PdfSearchOptions) -> PdfPageTextSearch {
362        self.search_from(text, options, 0)
363    }
364
365    /// Starts a search for the given test string from the given character position,
366    /// returning a new [PdfPageTextSearch] object that can be used to step through
367    /// the search results.
368    pub fn search_from(
369        &self,
370        text: &str,
371        options: &PdfSearchOptions,
372        index: PdfPageTextCharIndex,
373    ) -> PdfPageTextSearch {
374        PdfPageTextSearch::from_pdfium(
375            self.bindings().FPDFText_FindStart(
376                self.text_page_handle(),
377                get_pdfium_utf16le_bytes_from_str(text).as_ptr() as FPDF_WIDESTRING,
378                options.as_pdfium(),
379                index as c_int,
380            ),
381            self,
382            self.bindings(),
383        )
384    }
385}
386
387impl<'a> Display for PdfPageText<'a> {
388    #[inline]
389    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
390        f.write_str(self.all().as_str())
391    }
392}
393
394impl<'a> Drop for PdfPageText<'a> {
395    /// Closes the [PdfPageText] collection, releasing held memory.
396    #[inline]
397    fn drop(&mut self) {
398        self.bindings().FPDFText_ClosePage(self.text_page_handle());
399    }
400}
401
402#[cfg(test)]
403mod tests {
404    use crate::prelude::*;
405    use crate::utils::test::test_bind_to_pdfium;
406
407    #[test]
408    fn test_overlapping_chars_results() -> Result<(), PdfiumError> {
409        // Test to make sure the result of the .chars_for_object() function returns the
410        // correct results in the event of overlapping text objects.
411        // For more details, see: https://github.com/ajrcarey/pdfium-render/issues/98
412
413        let pdfium = test_bind_to_pdfium();
414
415        // Create a new document with three overlapping text objects.
416
417        let mut document = pdfium.create_new_pdf()?;
418
419        let mut page = document
420            .pages_mut()
421            .create_page_at_start(PdfPagePaperSize::a4())?;
422
423        let font = document.fonts_mut().courier();
424
425        let txt1 = page.objects_mut().create_text_object(
426            PdfPoints::ZERO,
427            PdfPoints::ZERO,
428            "AAAAAA",
429            font,
430            PdfPoints::new(10.0),
431        )?;
432
433        let txt2 = page.objects_mut().create_text_object(
434            PdfPoints::ZERO,
435            PdfPoints::ZERO,
436            "BBBBBB",
437            font,
438            PdfPoints::new(10.0),
439        )?;
440
441        let txt3 = page.objects_mut().create_text_object(
442            PdfPoints::ZERO,
443            PdfPoints::ZERO,
444            "CDCDCDE",
445            font,
446            PdfPoints::new(10.0),
447        )?;
448
449        let page_text = page.text()?;
450
451        // Check the results for all three objects are not affected by overlapping.
452
453        assert!(test_one_overlapping_text_object_results(
454            &txt1, &page_text, "AAAAAA"
455        )?);
456        assert!(test_one_overlapping_text_object_results(
457            &txt2, &page_text, "BBBBBB"
458        )?);
459        assert!(test_one_overlapping_text_object_results(
460            &txt3, &page_text, "CDCDCDE"
461        )?);
462
463        Ok(())
464    }
465
466    fn test_one_overlapping_text_object_results(
467        object: &PdfPageObject,
468        page_text: &PdfPageText,
469        expected: &str,
470    ) -> Result<bool, PdfiumError> {
471        if let Some(txt) = object.as_text_object() {
472            assert_eq!(txt.text().trim(), expected);
473            assert_eq!(page_text.for_object(txt).trim(), expected);
474
475            for (index, char) in txt.chars(&page_text)?.iter().enumerate() {
476                assert_eq!(txt.text().chars().nth(index), char.unicode_char());
477                assert_eq!(expected.chars().nth(index), char.unicode_char());
478            }
479
480            Ok(true)
481        } else {
482            Ok(false)
483        }
484    }
485}