Skip to main content

pdfium_render/pdf/document/page/
text.rs

1//! Defines the [PdfPageText] struct, exposing functionality related to the
2//! collection of Unicode characters visible on a single [PdfPage].
3
4pub mod char;
5pub mod chars;
6pub mod search;
7pub mod segment;
8pub mod segments;
9
10use crate::bindgen::{FPDF_TEXTPAGE, FPDF_WCHAR, FPDF_WIDESTRING};
11use crate::bindings::PdfiumLibraryBindings;
12use crate::error::PdfiumError;
13use crate::pdf::document::page::annotation::PdfPageAnnotation;
14use crate::pdf::document::page::annotation::PdfPageAnnotationCommon;
15use crate::pdf::document::page::object::private::internal::PdfPageObjectPrivate;
16use crate::pdf::document::page::object::text::PdfPageTextObject;
17use crate::pdf::document::page::text::chars::{PdfPageTextCharIndex, PdfPageTextChars};
18use crate::pdf::document::page::text::search::{PdfPageTextSearch, PdfSearchOptions};
19use crate::pdf::document::page::text::segments::PdfPageTextSegments;
20use crate::pdf::document::page::PdfPage;
21use crate::pdf::points::PdfPoints;
22use crate::pdf::rect::PdfRect;
23use crate::pdfium::PdfiumLibraryBindingsAccessor;
24use crate::utils::mem::{create_byte_buffer, create_sized_buffer};
25use crate::utils::utf16le::{
26    get_pdfium_utf16le_bytes_from_str, get_string_from_pdfium_utf16le_bytes,
27};
28use bytemuck::cast_slice;
29use std::fmt::{Display, Formatter};
30use std::marker::PhantomData;
31use std::os::raw::{c_double, c_int};
32use std::ptr::null_mut;
33
/// A collection of all Unicode characters on a single [PdfPage].
///
/// Use the [PdfPageText::all()] function to easily return all characters in the containing
/// [PdfPage] in the order in which they are defined in the PDF file.
///
/// Use the [PdfPageText::search()] function to initialize a new [PdfPageTextSearch] object,
/// yielding the results of searching for a target string within the character collection.
///
/// In complex custom layouts, the order in which characters are defined in the document
/// and the order in which they appear visually during rendering (and thus the order in
/// which they are read by a user) may not necessarily match.
///
/// [PdfPageText] implements both the [ToString] and the [Display] traits.
///
/// The underlying Pdfium text page is closed when this value is dropped.
pub struct PdfPageText<'a> {
    // Raw Pdfium text page handle. It is passed to the Pdfium text functions called by
    // this type and released with `FPDFText_ClosePage()` on drop.
    text_page_handle: FPDF_TEXTPAGE,
    // The page this text was loaded from. It supplies the document and page handles
    // needed to build character collections, and the page size used by `all()`.
    page: &'a PdfPage<'a>,
    // Zero-sized marker associating the handle with the lifetime `'a`.
    lifetime: PhantomData<&'a FPDF_TEXTPAGE>,
}
52
53impl<'a> PdfPageText<'a> {
54    pub(crate) fn from_pdfium(text_page_handle: FPDF_TEXTPAGE, page: &'a PdfPage<'a>) -> Self {
55        PdfPageText {
56            text_page_handle,
57            page,
58            lifetime: PhantomData,
59        }
60    }
61
    /// Returns the internal `FPDF_TEXTPAGE` handle for this [PdfPageText].
    ///
    /// The handle is returned by value; this [PdfPageText] still closes it when dropped.
    #[inline]
    pub(crate) fn text_page_handle(&self) -> FPDF_TEXTPAGE {
        self.text_page_handle
    }
67
68    /// Returns the total number of characters in all text segments in the containing [PdfPage].
69    ///
70    /// The character count includes whitespace and newlines, and so may differ slightly
71    /// from the result of calling `PdfPageText::all().len()`.
72    #[inline]
73    pub fn len(&self) -> i32 {
74        unsafe { self.bindings().FPDFText_CountChars(self.text_page_handle()) }
75    }
76
77    /// Returns `true` if there are no characters in any text box collection in the containing [PdfPage].
78    #[inline]
79    pub fn is_empty(&self) -> bool {
80        self.len() == 0
81    }
82
83    /// Returns a collection of all the `PdfPageTextSegment` text segments in the containing [PdfPage].
84    #[inline]
85    pub fn segments(&self) -> PdfPageTextSegments<'_> {
86        PdfPageTextSegments::new(self, 0, self.len(), self.bindings())
87    }
88
89    /// Returns a subset of the `PdfPageTextSegment` text segments in the containing [PdfPage].
90    /// Only text segments containing characters in the given index range will be included.
91    #[inline]
92    pub fn segments_subset(
93        &self,
94        start: PdfPageTextCharIndex,
95        count: PdfPageTextCharIndex,
96    ) -> PdfPageTextSegments<'_> {
97        PdfPageTextSegments::new(self, start as i32, count as i32, self.bindings())
98    }
99
100    /// Returns a collection of all the `PdfPageTextChar` characters in the containing [PdfPage].
101    #[inline]
102    pub fn chars(&self) -> PdfPageTextChars<'_> {
103        PdfPageTextChars::new(
104            self.page.document_handle(),
105            self.page.page_handle(),
106            self.text_page_handle(),
107            (0..self.len()).collect(),
108        )
109    }
110
111    #[cfg(any(
112        feature = "pdfium_future",
113        feature = "pdfium_7543",
114        feature = "pdfium_7350",
115        feature = "pdfium_7215",
116        feature = "pdfium_7123",
117        feature = "pdfium_6996",
118        feature = "pdfium_6721",
119        feature = "pdfium_6666",
120        feature = "pdfium_6611",
121    ))]
122    /// Returns a collection of all the `PdfPageTextChar` characters in the given [PdfPageTextObject].
123    ///
124    /// The return result will be empty if the given [PdfPageTextObject] is not attached to the
125    /// containing [PdfPage].
126    #[inline]
127    pub fn chars_for_object(
128        &self,
129        object: &PdfPageTextObject,
130    ) -> Result<PdfPageTextChars<'_>, PdfiumError> {
131        Ok(PdfPageTextChars::new(
132            self.page.document_handle(),
133            self.page.page_handle(),
134            self.text_page_handle(),
135            self.chars()
136                .iter()
137                .filter(|char| {
138                    (unsafe {
139                        self.bindings()
140                            .FPDFText_GetTextObject(self.text_page_handle(), char.index() as i32)
141                    }) == object.object_handle()
142                })
143                .map(|char| char.index() as i32)
144                .collect(),
145        ))
146    }
147
148    /// Returns a collection of all the `PdfPageTextChar` characters in the given [PdfPageAnnotation].
149    ///
150    /// The return result will be empty if the given [PdfPageAnnotation] is not attached to the
151    /// containing [PdfPage].
152    #[inline]
153    pub fn chars_for_annotation(
154        &self,
155        annotation: &PdfPageAnnotation,
156    ) -> Result<PdfPageTextChars<'_>, PdfiumError> {
157        self.chars_inside_rect(annotation.bounds()?)
158            .map_err(|_| PdfiumError::NoCharsInAnnotation)
159    }
160
161    /// Returns a collection of all the `PdfPageTextChar` characters that lie within the bounds of
162    /// the given [PdfRect] in the containing [PdfPage].
163    #[inline]
164    pub fn chars_inside_rect(&self, rect: PdfRect) -> Result<PdfPageTextChars<'_>, PdfiumError> {
165        let tolerance_x = rect.width() / 2.0;
166        let tolerance_y = rect.height() / 2.0;
167        let center_height = rect.bottom() + tolerance_y;
168
169        match (
170            Self::get_char_index_near_point(
171                self.text_page_handle(),
172                rect.left(),
173                tolerance_x,
174                center_height,
175                tolerance_y,
176                self.bindings(),
177            ),
178            Self::get_char_index_near_point(
179                self.text_page_handle(),
180                rect.right(),
181                tolerance_x,
182                center_height,
183                tolerance_y,
184                self.bindings(),
185            ),
186        ) {
187            (Some(start), Some(end)) => Ok(PdfPageTextChars::new(
188                self.page.document_handle(),
189                self.page.page_handle(),
190                self.text_page_handle(),
191                (start as i32..=end as i32 + 1).collect(),
192            )),
193            (Some(start), None) => Ok(PdfPageTextChars::new(
194                self.page.document_handle(),
195                self.page.page_handle(),
196                self.text_page_handle(),
197                (start as i32..=start as i32 + 1).collect(),
198            )),
199            (None, Some(end)) => Ok(PdfPageTextChars::new(
200                self.page.document_handle(),
201                self.page.page_handle(),
202                self.text_page_handle(),
203                (end as i32..=end as i32 + 1).collect(),
204            )),
205            _ => Err(PdfiumError::NoCharsInRect),
206        }
207    }
208
209    /// Returns the character near to the given x and y positions on the containing [PdfPage],
210    /// if any. The returned character will be no further from the given positions than the given
211    /// tolerance values.
212    pub(crate) fn get_char_index_near_point(
213        text_page_handle: FPDF_TEXTPAGE,
214        x: PdfPoints,
215        tolerance_x: PdfPoints,
216        y: PdfPoints,
217        tolerance_y: PdfPoints,
218        bindings: &dyn PdfiumLibraryBindings,
219    ) -> Option<PdfPageTextCharIndex> {
220        match unsafe {
221            bindings.FPDFText_GetCharIndexAtPos(
222                text_page_handle,
223                x.value as c_double,
224                y.value as c_double,
225                tolerance_x.value as c_double,
226                tolerance_y.value as c_double,
227            )
228        } {
229            -1 => None, // No character at position within tolerances
230            -3 => None, // An error occurred, but we'll eat it
231            index => Some(index as PdfPageTextCharIndex),
232        }
233    }
234
235    /// Returns all characters that lie within the containing [PdfPage], in the order in which
236    /// they are defined in the document, concatenated into a single string.
237    ///
238    /// In complex custom layouts, the order in which characters are defined in the document
239    /// and the order in which they appear visually during rendering (and thus the order in
240    /// which they are read by a user) may not necessarily match.
241    pub fn all(&self) -> String {
242        self.inside_rect(self.page.page_size())
243    }
244
245    /// Returns all characters that lie within the bounds of the given [PdfRect] in the
246    /// containing [PdfPage], in the order in which they are defined in the document,
247    /// concatenated into a single string.
248    ///
249    /// In complex custom layouts, the order in which characters are defined in the document
250    /// and the order in which they appear visually during rendering (and thus the order in
251    /// which they are read by a user) may not necessarily match.
252    pub fn inside_rect(&self, rect: PdfRect) -> String {
253        // Retrieving the bounded text from Pdfium is a two-step operation. First, we call
254        // FPDFText_GetBoundedText() with a null buffer; this will retrieve the length of
255        // the bounded text in _characters_ (not _bytes_!). If the length is zero, then there is
256        // no text within the given rectangle's boundaries.
257
258        // If the length is non-zero, then we reserve a buffer (sized in words rather than bytes,
259        // to allow for two bytes per character) and call FPDFText_GetBoundedText() again with a
260        // pointer to the buffer; this will write the bounded text to the buffer in UTF16-LE format.
261
262        let left = rect.left().value as f64;
263
264        let top = rect.top().value as f64;
265
266        let right = rect.right().value as f64;
267
268        let bottom = rect.bottom().value as f64;
269
270        let chars_count = unsafe {
271            self.bindings().FPDFText_GetBoundedText(
272                self.text_page_handle(),
273                left,
274                top,
275                right,
276                bottom,
277                null_mut(),
278                0,
279            )
280        };
281
282        if chars_count == 0 {
283            // No text lies within the given rectangle.
284
285            return String::new();
286        }
287
288        let mut buffer = create_sized_buffer(chars_count as usize);
289
290        let result = unsafe {
291            self.bindings().FPDFText_GetBoundedText(
292                self.text_page_handle(),
293                left,
294                top,
295                right,
296                bottom,
297                buffer.as_mut_ptr(),
298                chars_count,
299            )
300        };
301
302        assert_eq!(result, chars_count);
303
304        get_string_from_pdfium_utf16le_bytes(cast_slice(buffer.as_slice()).to_vec())
305            .unwrap_or_default()
306    }
307
308    /// Returns all characters assigned to the given [PdfPageTextObject] in this [PdfPageText] object,
309    /// concatenated into a single string.
310    pub fn for_object(&self, object: &PdfPageTextObject) -> String {
311        // Retrieving the string value from Pdfium is a two-step operation. First, we call
312        // FPDFTextObj_GetText() with a null buffer; this will retrieve the length of
313        // the text in bytes, assuming the page object exists. If the length is zero,
314        // then there is no text.
315
316        // If the length is non-zero, then we reserve a byte buffer of the given
317        // length and call FPDFTextObj_GetText() again with a pointer to the buffer;
318        // this will write the text for the page object into the buffer.
319
320        let buffer_length = unsafe {
321            self.bindings().FPDFTextObj_GetText(
322                object.object_handle(),
323                self.text_page_handle(),
324                null_mut(),
325                0,
326            )
327        };
328
329        if buffer_length == 0 {
330            // There is no text.
331
332            return String::new();
333        }
334
335        let mut buffer = create_byte_buffer(buffer_length as usize);
336
337        let result = unsafe {
338            self.bindings().FPDFTextObj_GetText(
339                object.object_handle(),
340                self.text_page_handle(),
341                buffer.as_mut_ptr() as *mut FPDF_WCHAR,
342                buffer_length,
343            )
344        };
345
346        assert_eq!(result, buffer_length);
347
348        get_string_from_pdfium_utf16le_bytes(buffer).unwrap_or_default()
349    }
350
351    /// Returns all characters that lie within the bounds of the given [PdfPageAnnotation] in the
352    /// containing [PdfPage], in the order in which they are defined in the document,
353    /// concatenated into a single string.
354    ///
355    /// In complex custom layouts, the order in which characters are defined in the document
356    /// and the order in which they appear visually during rendering (and thus the order in
357    /// which they are read by a user) may not necessarily match.
358    #[inline]
359    pub fn for_annotation(&self, annotation: &PdfPageAnnotation) -> Result<String, PdfiumError> {
360        let bounds = annotation.bounds()?;
361
362        Ok(self.inside_rect(bounds))
363    }
364
365    /// Starts a search for the given text string, returning a new [PdfPageTextSearch]
366    /// object that can be used to step through the search results.
367    #[inline]
368    pub fn search(
369        &self,
370        text: &str,
371        options: &PdfSearchOptions,
372    ) -> Result<PdfPageTextSearch<'_>, PdfiumError> {
373        self.search_from(text, options, 0)
374    }
375
376    /// Starts a search for the given test string from the given character position,
377    /// returning a new [PdfPageTextSearch] object that can be used to step through
378    /// the search results.
379    pub fn search_from(
380        &self,
381        text: &str,
382        options: &PdfSearchOptions,
383        index: PdfPageTextCharIndex,
384    ) -> Result<PdfPageTextSearch<'_>, PdfiumError> {
385        if text.is_empty() {
386            Err(PdfiumError::TextSearchTargetIsEmpty)
387        } else {
388            Ok(PdfPageTextSearch::from_pdfium(
389                unsafe {
390                    self.bindings().FPDFText_FindStart(
391                        self.text_page_handle(),
392                        get_pdfium_utf16le_bytes_from_str(text).as_ptr() as FPDF_WIDESTRING,
393                        options.as_pdfium(),
394                        index as c_int,
395                    )
396                },
397                self,
398            ))
399        }
400    }
401}
402
403impl<'a> Display for PdfPageText<'a> {
404    #[inline]
405    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
406        f.write_str(self.all().as_str())
407    }
408}
409
410impl<'a> Drop for PdfPageText<'a> {
411    /// Closes the [PdfPageText] collection, releasing held memory.
412    #[inline]
413    fn drop(&mut self) {
414        unsafe {
415            self.bindings().FPDFText_ClosePage(self.text_page_handle());
416        }
417    }
418}
419
// The impl body is empty, so every trait item uses its default definition.
// NOTE(review): this is presumably what provides the `self.bindings()` calls used
// throughout this file; confirm against the trait definition.
impl<'a> PdfiumLibraryBindingsAccessor<'a> for PdfPageText<'a> {}
421
// Only compiled when the `thread_safe` feature is enabled.
// NOTE(review): this is a manual promise that moving a [PdfPageText] to another thread is
// sound; presumably it relies on access to Pdfium being serialized under this feature.
// Confirm before relying on it.
#[cfg(feature = "thread_safe")]
unsafe impl<'a> Send for PdfPageText<'a> {}
424
// Only compiled when the `thread_safe` feature is enabled.
// NOTE(review): this is a manual promise that sharing a `&PdfPageText` between threads is
// sound; presumably it relies on access to Pdfium being serialized under this feature.
// Confirm before relying on it.
#[cfg(feature = "thread_safe")]
unsafe impl<'a> Sync for PdfPageText<'a> {}
427
428#[cfg(test)]
429mod tests {
430    use itertools::Itertools;
431    use std::ffi::OsStr;
432    use std::fs;
433
434    use crate::prelude::*;
435    use crate::utils::test::test_bind_to_pdfium;
436
437    #[test]
438    fn test_overlapping_chars_results() -> Result<(), PdfiumError> {
439        // Test to make sure the result of the .chars_for_object() function returns the
440        // correct results in the event of overlapping text objects.
441        // For more details, see: https://github.com/ajrcarey/pdfium-render/issues/98
442
443        let pdfium = test_bind_to_pdfium();
444
445        // Create a new document with three overlapping text objects.
446
447        let mut document = pdfium.create_new_pdf()?;
448
449        let mut page = document
450            .pages_mut()
451            .create_page_at_start(PdfPagePaperSize::a4())?;
452
453        let font = document.fonts_mut().courier();
454
455        let txt1 = page.objects_mut().create_text_object(
456            PdfPoints::ZERO,
457            PdfPoints::ZERO,
458            "AAAAAA",
459            font,
460            PdfPoints::new(10.0),
461        )?;
462
463        let txt2 = page.objects_mut().create_text_object(
464            PdfPoints::ZERO,
465            PdfPoints::ZERO,
466            "BBBBBB",
467            font,
468            PdfPoints::new(10.0),
469        )?;
470
471        let txt3 = page.objects_mut().create_text_object(
472            PdfPoints::ZERO,
473            PdfPoints::ZERO,
474            "CDCDCDE",
475            font,
476            PdfPoints::new(10.0),
477        )?;
478
479        let page_text = page.text()?;
480
481        // Check the results for all three objects are not affected by overlapping.
482
483        assert!(test_one_overlapping_text_object_results(
484            &txt1, &page_text, "AAAAAA"
485        )?);
486        assert!(test_one_overlapping_text_object_results(
487            &txt2, &page_text, "BBBBBB"
488        )?);
489        assert!(test_one_overlapping_text_object_results(
490            &txt3, &page_text, "CDCDCDE"
491        )?);
492
493        Ok(())
494    }
495
496    fn test_one_overlapping_text_object_results(
497        object: &PdfPageObject,
498        page_text: &PdfPageText,
499        expected: &str,
500    ) -> Result<bool, PdfiumError> {
501        if let Some(txt) = object.as_text_object() {
502            assert_eq!(txt.text().trim(), expected);
503            assert_eq!(page_text.for_object(txt).trim(), expected);
504
505            for (index, char) in txt.chars(&page_text)?.iter().enumerate() {
506                assert_eq!(txt.text().chars().nth(index), char.unicode_char());
507                assert_eq!(expected.chars().nth(index), char.unicode_char());
508            }
509
510            Ok(true)
511        } else {
512            Ok(false)
513        }
514    }
515
516    #[test]
517    fn test_text_chars_results_equality() -> Result<(), PdfiumError> {
518        // For all available test documents, check that the results of
519        // PdfPageObjectText::text() and PdfPageObjectText::chars() match.
520
521        let pdfium = test_bind_to_pdfium();
522
523        let samples = fs::read_dir("./test/")
524            .unwrap()
525            .filter_map(|entry| match entry {
526                Ok(e) => Some(e.path()),
527                Err(_) => None,
528            })
529            .filter(|path| path.extension() == Some(OsStr::new("pdf")))
530            .collect::<Vec<_>>();
531
532        assert!(samples.len() > 0);
533
534        for sample in samples {
535            println!("Testing all text objects in file {}", sample.display());
536
537            let document = pdfium.load_pdf_from_file(&sample, None)?;
538
539            for page in document.pages().iter() {
540                let text = page.text()?;
541
542                for object in page.objects().iter() {
543                    if let Some(obj) = object.as_text_object() {
544                        let chars = obj
545                            .chars(&text)?
546                            .iter()
547                            .filter_map(|char| char.unicode_string())
548                            .join("");
549
550                        assert_eq!(obj.text().trim(), chars.replace("\0", "").trim());
551                    }
552                }
553            }
554        }
555
556        Ok(())
557    }
558}