Skip to main content

pdfium_render/pdf/document/page/
text.rs

1//! Defines the [PdfPageText] struct, exposing functionality related to the
2//! collection of Unicode characters visible on a single [PdfPage].
3
4pub mod char;
5pub mod chars;
6pub mod search;
7pub mod segment;
8pub mod segments;
9
10use crate::bindgen::{FPDF_TEXTPAGE, FPDF_WCHAR, FPDF_WIDESTRING};
11use crate::bindings::PdfiumLibraryBindings;
12use crate::error::PdfiumError;
13use crate::pdf::document::page::annotation::PdfPageAnnotation;
14use crate::pdf::document::page::annotation::PdfPageAnnotationCommon;
15use crate::pdf::document::page::object::private::internal::PdfPageObjectPrivate;
16use crate::pdf::document::page::object::text::PdfPageTextObject;
17use crate::pdf::document::page::text::chars::{PdfPageTextCharIndex, PdfPageTextChars};
18use crate::pdf::document::page::text::search::{PdfPageTextSearch, PdfSearchOptions};
19use crate::pdf::document::page::text::segments::PdfPageTextSegments;
20use crate::pdf::document::page::PdfPage;
21use crate::pdf::points::PdfPoints;
22use crate::pdf::rect::PdfRect;
23use crate::pdfium::PdfiumLibraryBindingsAccessor;
24use crate::utils::mem::{create_byte_buffer, create_sized_buffer};
25use crate::utils::utf16le::{
26    get_pdfium_utf16le_bytes_from_str, get_string_from_pdfium_utf16le_bytes,
27};
28use bytemuck::cast_slice;
29use std::fmt::{Display, Formatter};
30use std::marker::PhantomData;
31use std::os::raw::{c_double, c_int};
32use std::ptr::null_mut;
33
/// A collection of all Unicode characters on a single [PdfPage].
///
/// Use the [PdfPageText::all()] function to easily return all characters in the containing
/// [PdfPage] in the order in which they are defined in the PDF file.
///
/// Use the [PdfPageText::search()] function to initialize a new [PdfPageTextSearch] object,
/// yielding the results of searching for a target string within the character collection.
///
/// In complex custom layouts, the order in which characters are defined in the document
/// and the order in which they appear visually during rendering (and thus the order in
/// which they are read by a user) may not necessarily match.
///
/// [PdfPageText] implements both the [ToString] and the [Display] traits.
///
/// The underlying Pdfium text page is closed with `FPDFText_ClosePage()` when this
/// object is dropped.
pub struct PdfPageText<'a> {
    // Raw Pdfium text page handle; handed to every `FPDFText_*` call made by this object
    // and closed in this type's `Drop` implementation.
    text_page_handle: FPDF_TEXTPAGE,
    // The containing page; supplies the document and page handles used when building
    // character collections, and the page size used by `all()`.
    page: &'a PdfPage<'a>,
    // Zero-sized marker associating the lifetime `'a` with the text page handle.
    lifetime: PhantomData<&'a FPDF_TEXTPAGE>,
}
52
53impl<'a> PdfPageText<'a> {
54    pub(crate) fn from_pdfium(text_page_handle: FPDF_TEXTPAGE, page: &'a PdfPage<'a>) -> Self {
55        PdfPageText {
56            text_page_handle,
57            page,
58            lifetime: PhantomData,
59        }
60    }
61
    /// Returns the internal `FPDF_TEXTPAGE` handle for this [PdfPageText].
    ///
    /// The handle is returned by value; this object still closes it when dropped.
    #[inline]
    pub(crate) fn text_page_handle(&self) -> FPDF_TEXTPAGE {
        self.text_page_handle
    }
67
68    /// Returns the total number of characters in all text segments in the containing [PdfPage].
69    ///
70    /// The character count includes whitespace and newlines, and so may differ slightly
71    /// from the result of calling `PdfPageText::all().len()`.
72    #[inline]
73    pub fn len(&self) -> i32 {
74        unsafe { self.bindings().FPDFText_CountChars(self.text_page_handle()) }
75    }
76
77    /// Returns `true` if there are no characters in any text box collection in the containing [PdfPage].
78    #[inline]
79    pub fn is_empty(&self) -> bool {
80        self.len() == 0
81    }
82
83    /// Returns a collection of all the `PdfPageTextSegment` text segments in the containing [PdfPage].
84    #[inline]
85    pub fn segments(&self) -> PdfPageTextSegments<'_> {
86        PdfPageTextSegments::new(self, 0, self.len(), self.bindings())
87    }
88
89    /// Returns a subset of the `PdfPageTextSegment` text segments in the containing [PdfPage].
90    /// Only text segments containing characters in the given index range will be included.
91    #[inline]
92    pub fn segments_subset(
93        &self,
94        start: PdfPageTextCharIndex,
95        count: PdfPageTextCharIndex,
96    ) -> PdfPageTextSegments<'_> {
97        PdfPageTextSegments::new(self, start as i32, count as i32, self.bindings())
98    }
99
100    /// Returns a collection of all the `PdfPageTextChar` characters in the containing [PdfPage].
101    #[inline]
102    pub fn chars(&self) -> PdfPageTextChars<'_> {
103        PdfPageTextChars::new(
104            self.page.document_handle(),
105            self.page.page_handle(),
106            self.text_page_handle(),
107            (0..self.len()).collect(),
108        )
109    }
110
111    #[cfg(any(
112        feature = "pdfium_future",
113        feature = "pdfium_7763",
114        feature = "pdfium_7543",
115        feature = "pdfium_7350",
116        feature = "pdfium_7215",
117        feature = "pdfium_7123",
118        feature = "pdfium_6996",
119        feature = "pdfium_6721",
120        feature = "pdfium_6666",
121        feature = "pdfium_6611",
122    ))]
123    /// Returns a collection of all the `PdfPageTextChar` characters in the given [PdfPageTextObject].
124    ///
125    /// The return result will be empty if the given [PdfPageTextObject] is not attached to the
126    /// containing [PdfPage].
127    #[inline]
128    pub fn chars_for_object(
129        &self,
130        object: &PdfPageTextObject,
131    ) -> Result<PdfPageTextChars<'_>, PdfiumError> {
132        Ok(PdfPageTextChars::new(
133            self.page.document_handle(),
134            self.page.page_handle(),
135            self.text_page_handle(),
136            self.chars()
137                .iter()
138                .filter(|char| {
139                    (unsafe {
140                        self.bindings()
141                            .FPDFText_GetTextObject(self.text_page_handle(), char.index() as i32)
142                    }) == object.object_handle()
143                })
144                .map(|char| char.index() as i32)
145                .collect(),
146        ))
147    }
148
149    /// Returns a collection of all the `PdfPageTextChar` characters in the given [PdfPageAnnotation].
150    ///
151    /// The return result will be empty if the given [PdfPageAnnotation] is not attached to the
152    /// containing [PdfPage].
153    #[inline]
154    pub fn chars_for_annotation(
155        &self,
156        annotation: &PdfPageAnnotation,
157    ) -> Result<PdfPageTextChars<'_>, PdfiumError> {
158        self.chars_inside_rect(annotation.bounds()?)
159            .map_err(|_| PdfiumError::NoCharsInAnnotation)
160    }
161
162    /// Returns a collection of all the `PdfPageTextChar` characters that lie within the bounds of
163    /// the given [PdfRect] in the containing [PdfPage].
164    #[inline]
165    pub fn chars_inside_rect<'b>(
166        &'b self,
167        rect: PdfRect,
168    ) -> Result<PdfPageTextChars<'a>, PdfiumError> {
169        let tolerance_x = rect.width() / 2.0;
170        let tolerance_y = rect.height() / 2.0;
171        let center_height = rect.bottom() + tolerance_y;
172
173        match (
174            Self::get_char_index_near_point(
175                self.text_page_handle(),
176                rect.left(),
177                tolerance_x,
178                center_height,
179                tolerance_y,
180                self.bindings(),
181            ),
182            Self::get_char_index_near_point(
183                self.text_page_handle(),
184                rect.right(),
185                tolerance_x,
186                center_height,
187                tolerance_y,
188                self.bindings(),
189            ),
190        ) {
191            (Some(start), Some(end)) => Ok(PdfPageTextChars::new(
192                self.page.document_handle(),
193                self.page.page_handle(),
194                self.text_page_handle(),
195                (start as i32..=end as i32 + 1).collect(),
196            )),
197            (Some(start), None) => Ok(PdfPageTextChars::new(
198                self.page.document_handle(),
199                self.page.page_handle(),
200                self.text_page_handle(),
201                (start as i32..=start as i32 + 1).collect(),
202            )),
203            (None, Some(end)) => Ok(PdfPageTextChars::new(
204                self.page.document_handle(),
205                self.page.page_handle(),
206                self.text_page_handle(),
207                (end as i32..=end as i32 + 1).collect(),
208            )),
209            _ => Err(PdfiumError::NoCharsInRect),
210        }
211    }
212
213    /// Returns the character near to the given x and y positions on the containing [PdfPage],
214    /// if any. The returned character will be no further from the given positions than the given
215    /// tolerance values.
216    pub(crate) fn get_char_index_near_point(
217        text_page_handle: FPDF_TEXTPAGE,
218        x: PdfPoints,
219        tolerance_x: PdfPoints,
220        y: PdfPoints,
221        tolerance_y: PdfPoints,
222        bindings: &dyn PdfiumLibraryBindings,
223    ) -> Option<PdfPageTextCharIndex> {
224        match unsafe {
225            bindings.FPDFText_GetCharIndexAtPos(
226                text_page_handle,
227                x.value as c_double,
228                y.value as c_double,
229                tolerance_x.value as c_double,
230                tolerance_y.value as c_double,
231            )
232        } {
233            -1 => None, // No character at position within tolerances
234            -3 => None, // An error occurred, but we'll eat it
235            index => Some(index as PdfPageTextCharIndex),
236        }
237    }
238
239    /// Returns all characters that lie within the containing [PdfPage], in the order in which
240    /// they are defined in the document, concatenated into a single string.
241    ///
242    /// In complex custom layouts, the order in which characters are defined in the document
243    /// and the order in which they appear visually during rendering (and thus the order in
244    /// which they are read by a user) may not necessarily match.
245    pub fn all(&self) -> String {
246        self.inside_rect(self.page.page_size())
247    }
248
249    /// Returns all characters that lie within the bounds of the given [PdfRect] in the
250    /// containing [PdfPage], in the order in which they are defined in the document,
251    /// concatenated into a single string.
252    ///
253    /// In complex custom layouts, the order in which characters are defined in the document
254    /// and the order in which they appear visually during rendering (and thus the order in
255    /// which they are read by a user) may not necessarily match.
256    pub fn inside_rect(&self, rect: PdfRect) -> String {
257        // Retrieving the bounded text from Pdfium is a two-step operation. First, we call
258        // FPDFText_GetBoundedText() with a null buffer; this will retrieve the length of
259        // the bounded text in _characters_ (not _bytes_!). If the length is zero, then there is
260        // no text within the given rectangle's boundaries.
261
262        // If the length is non-zero, then we reserve a buffer (sized in words rather than bytes,
263        // to allow for two bytes per character) and call FPDFText_GetBoundedText() again with a
264        // pointer to the buffer; this will write the bounded text to the buffer in UTF16-LE format.
265
266        let left = rect.left().value as f64;
267
268        let top = rect.top().value as f64;
269
270        let right = rect.right().value as f64;
271
272        let bottom = rect.bottom().value as f64;
273
274        let chars_count = unsafe {
275            self.bindings().FPDFText_GetBoundedText(
276                self.text_page_handle(),
277                left,
278                top,
279                right,
280                bottom,
281                null_mut(),
282                0,
283            )
284        };
285
286        if chars_count == 0 {
287            // No text lies within the given rectangle.
288
289            return String::new();
290        }
291
292        let mut buffer = create_sized_buffer(chars_count as usize);
293
294        let result = unsafe {
295            self.bindings().FPDFText_GetBoundedText(
296                self.text_page_handle(),
297                left,
298                top,
299                right,
300                bottom,
301                buffer.as_mut_ptr(),
302                chars_count,
303            )
304        };
305
306        assert_eq!(result, chars_count);
307
308        get_string_from_pdfium_utf16le_bytes(cast_slice(buffer.as_slice()).to_vec())
309            .unwrap_or_default()
310    }
311
312    /// Returns all characters assigned to the given [PdfPageTextObject] in this [PdfPageText] object,
313    /// concatenated into a single string.
314    pub fn for_object(&self, object: &PdfPageTextObject) -> String {
315        // Retrieving the string value from Pdfium is a two-step operation. First, we call
316        // FPDFTextObj_GetText() with a null buffer; this will retrieve the length of
317        // the text in bytes, assuming the page object exists. If the length is zero,
318        // then there is no text.
319
320        // If the length is non-zero, then we reserve a byte buffer of the given
321        // length and call FPDFTextObj_GetText() again with a pointer to the buffer;
322        // this will write the text for the page object into the buffer.
323
324        let buffer_length = unsafe {
325            self.bindings().FPDFTextObj_GetText(
326                object.object_handle(),
327                self.text_page_handle(),
328                null_mut(),
329                0,
330            )
331        };
332
333        if buffer_length == 0 {
334            // There is no text.
335
336            return String::new();
337        }
338
339        let mut buffer = create_byte_buffer(buffer_length as usize);
340
341        let result = unsafe {
342            self.bindings().FPDFTextObj_GetText(
343                object.object_handle(),
344                self.text_page_handle(),
345                buffer.as_mut_ptr() as *mut FPDF_WCHAR,
346                buffer_length,
347            )
348        };
349
350        assert_eq!(result, buffer_length);
351
352        get_string_from_pdfium_utf16le_bytes(buffer).unwrap_or_default()
353    }
354
355    /// Returns all characters that lie within the bounds of the given [PdfPageAnnotation] in the
356    /// containing [PdfPage], in the order in which they are defined in the document,
357    /// concatenated into a single string.
358    ///
359    /// In complex custom layouts, the order in which characters are defined in the document
360    /// and the order in which they appear visually during rendering (and thus the order in
361    /// which they are read by a user) may not necessarily match.
362    #[inline]
363    pub fn for_annotation(&self, annotation: &PdfPageAnnotation) -> Result<String, PdfiumError> {
364        let bounds = annotation.bounds()?;
365
366        Ok(self.inside_rect(bounds))
367    }
368
369    /// Starts a search for the given text string, returning a new [PdfPageTextSearch]
370    /// object that can be used to step through the search results.
371    #[inline]
372    pub fn search(
373        &self,
374        text: &str,
375        options: &PdfSearchOptions,
376    ) -> Result<PdfPageTextSearch<'_>, PdfiumError> {
377        self.search_from(text, options, 0)
378    }
379
380    /// Starts a search for the given test string from the given character position,
381    /// returning a new [PdfPageTextSearch] object that can be used to step through
382    /// the search results.
383    pub fn search_from(
384        &self,
385        text: &str,
386        options: &PdfSearchOptions,
387        index: PdfPageTextCharIndex,
388    ) -> Result<PdfPageTextSearch<'_>, PdfiumError> {
389        if text.is_empty() {
390            Err(PdfiumError::TextSearchTargetIsEmpty)
391        } else {
392            Ok(PdfPageTextSearch::from_pdfium(
393                unsafe {
394                    self.bindings().FPDFText_FindStart(
395                        self.text_page_handle(),
396                        get_pdfium_utf16le_bytes_from_str(text).as_ptr() as FPDF_WIDESTRING,
397                        options.as_pdfium(),
398                        index as c_int,
399                    )
400                },
401                self,
402            ))
403        }
404    }
405}
406
407impl<'a> Display for PdfPageText<'a> {
408    #[inline]
409    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
410        f.write_str(self.all().as_str())
411    }
412}
413
414impl<'a> Drop for PdfPageText<'a> {
415    /// Closes the [PdfPageText] collection, releasing held memory.
416    #[inline]
417    fn drop(&mut self) {
418        unsafe {
419            self.bindings().FPDFText_ClosePage(self.text_page_handle());
420        }
421    }
422}
423
// Opts PdfPageText into the bindings accessor trait using only the trait's defaults.
// NOTE(review): presumably this is what supplies the `bindings()` accessor called
// throughout this file — confirm against the trait definition.
impl<'a> PdfiumLibraryBindingsAccessor<'a> for PdfPageText<'a> {}
425
// PdfPageText holds a raw Pdfium handle, so it is only marked Send and Sync
// when the `thread_safe` feature is enabled.
// NOTE(review): soundness presumably relies on the `thread_safe` feature serializing
// access to Pdfium elsewhere in the crate — confirm before relying on it.
#[cfg(feature = "thread_safe")]
unsafe impl<'a> Send for PdfPageText<'a> {}

#[cfg(feature = "thread_safe")]
unsafe impl<'a> Sync for PdfPageText<'a> {}
431
432#[cfg(test)]
433mod tests {
434    use itertools::Itertools;
435    use std::ffi::OsStr;
436    use std::fs;
437
438    use crate::prelude::*;
439    use crate::utils::test::test_bind_to_pdfium;
440
441    #[test]
442    fn test_overlapping_chars_results() -> Result<(), PdfiumError> {
443        // Test to make sure the result of the .chars_for_object() function returns the
444        // correct results in the event of overlapping text objects.
445        // For more details, see: https://github.com/ajrcarey/pdfium-render/issues/98
446
447        let pdfium = test_bind_to_pdfium();
448
449        // Create a new document with three overlapping text objects.
450
451        let mut document = pdfium.create_new_pdf()?;
452
453        let mut page = document
454            .pages_mut()
455            .create_page_at_start(PdfPagePaperSize::a4())?;
456
457        let font = document.fonts_mut().courier();
458
459        let txt1 = page.objects_mut().create_text_object(
460            PdfPoints::ZERO,
461            PdfPoints::ZERO,
462            "AAAAAA",
463            font,
464            PdfPoints::new(10.0),
465        )?;
466
467        let txt2 = page.objects_mut().create_text_object(
468            PdfPoints::ZERO,
469            PdfPoints::ZERO,
470            "BBBBBB",
471            font,
472            PdfPoints::new(10.0),
473        )?;
474
475        let txt3 = page.objects_mut().create_text_object(
476            PdfPoints::ZERO,
477            PdfPoints::ZERO,
478            "CDCDCDE",
479            font,
480            PdfPoints::new(10.0),
481        )?;
482
483        let page_text = page.text()?;
484
485        // Check the results for all three objects are not affected by overlapping.
486
487        assert!(test_one_overlapping_text_object_results(
488            &txt1, &page_text, "AAAAAA"
489        )?);
490        assert!(test_one_overlapping_text_object_results(
491            &txt2, &page_text, "BBBBBB"
492        )?);
493        assert!(test_one_overlapping_text_object_results(
494            &txt3, &page_text, "CDCDCDE"
495        )?);
496
497        Ok(())
498    }
499
500    fn test_one_overlapping_text_object_results(
501        object: &PdfPageObject,
502        page_text: &PdfPageText,
503        expected: &str,
504    ) -> Result<bool, PdfiumError> {
505        if let Some(txt) = object.as_text_object() {
506            assert_eq!(txt.text().trim(), expected);
507            assert_eq!(page_text.for_object(txt).trim(), expected);
508
509            for (index, char) in txt.chars(&page_text)?.iter().enumerate() {
510                assert_eq!(txt.text().chars().nth(index), char.unicode_char());
511                assert_eq!(expected.chars().nth(index), char.unicode_char());
512            }
513
514            Ok(true)
515        } else {
516            Ok(false)
517        }
518    }
519
520    #[test]
521    fn test_text_chars_results_equality() -> Result<(), PdfiumError> {
522        // For all available test documents, check that the results of
523        // PdfPageObjectText::text() and PdfPageObjectText::chars() match.
524
525        let pdfium = test_bind_to_pdfium();
526
527        let samples = fs::read_dir("./test/")
528            .unwrap()
529            .filter_map(|entry| match entry {
530                Ok(e) => Some(e.path()),
531                Err(_) => None,
532            })
533            .filter(|path| path.extension() == Some(OsStr::new("pdf")))
534            .collect::<Vec<_>>();
535
536        assert!(samples.len() > 0);
537
538        for sample in samples {
539            println!("Testing all text objects in file {}", sample.display());
540
541            let document = pdfium.load_pdf_from_file(&sample, None)?;
542
543            for page in document.pages().iter() {
544                let text = page.text()?;
545
546                for object in page.objects().iter() {
547                    if let Some(obj) = object.as_text_object() {
548                        let chars = obj
549                            .chars(&text)?
550                            .iter()
551                            .filter_map(|char| char.unicode_string())
552                            .join("");
553
554                        assert_eq!(obj.text().trim(), chars.replace("\0", "").trim());
555                    }
556                }
557            }
558        }
559
560        Ok(())
561    }
562
    #[test]
    fn test_text_segment_chars_char_lifetimes() -> Result<(), PdfiumError> {
        // Lifetimes of segments, text chars, and text char should be bound to the
        // lifetime of page text, but not necessarily to one another. See:
        // https://github.com/ajrcarey/pdfium-render/pull/248

        // This is primarily a compile-time check: the nested scopes below only compile
        // if each inner value can outlive the value it was obtained from. There are
        // deliberately no runtime assertions beyond the fallible calls succeeding.

        let pdfium = test_bind_to_pdfium();
        let document = pdfium.load_pdf_from_file("./test/export-test.pdf", None)?;
        let page = document.pages().first()?;
        let text = page.text()?;

        // The character must remain usable after the collection it came from is gone.
        let _char = {
            // The character collection must remain usable after its segment is gone.
            let chars = {
                let segment = text.segments().first()?;

                segment.chars()?
            }; // PdfPageTextSegment object is dropped here

            chars.first()?
        }; // PdfPageTextChars object is dropped here

        Ok(())
    }
586}