pdfium_render/pdf/document/page/
text.rs

1//! Defines the [PdfPageText] struct, exposing functionality related to the
2//! collection of Unicode characters visible on a single [PdfPage].
3
4pub mod char;
5pub mod chars;
6pub mod search;
7pub mod segment;
8pub mod segments;
9
10use crate::bindgen::{FPDF_TEXTPAGE, FPDF_WCHAR, FPDF_WIDESTRING};
11use crate::bindings::PdfiumLibraryBindings;
12use crate::error::PdfiumError;
13use crate::pdf::document::page::annotation::PdfPageAnnotation;
14use crate::pdf::document::page::annotation::PdfPageAnnotationCommon;
15use crate::pdf::document::page::object::private::internal::PdfPageObjectPrivate;
16use crate::pdf::document::page::object::text::PdfPageTextObject;
17use crate::pdf::document::page::text::chars::{PdfPageTextCharIndex, PdfPageTextChars};
18use crate::pdf::document::page::text::search::{PdfPageTextSearch, PdfSearchOptions};
19use crate::pdf::document::page::text::segments::PdfPageTextSegments;
20use crate::pdf::document::page::PdfPage;
21use crate::pdf::points::PdfPoints;
22use crate::pdf::rect::PdfRect;
23use crate::utils::mem::{create_byte_buffer, create_sized_buffer};
24use crate::utils::utf16le::{
25    get_pdfium_utf16le_bytes_from_str, get_string_from_pdfium_utf16le_bytes,
26};
27use bytemuck::cast_slice;
28use std::fmt::{Display, Formatter};
29use std::os::raw::{c_double, c_int};
30use std::ptr::null_mut;
31
32/// The collection of Unicode characters visible on a single [PdfPage].
33///
34/// Use the [PdfPageText::all()] function to easily return all characters in the containing
35/// [PdfPage] in the order in which they are defined in the PDF file.
36///
37/// Use the [PdfPageText::search()] function to initialise a new [PdfPageTextSearch] object,
38/// yielding the results of searching for a target string within the character collection.
39///
40/// In complex custom layouts, the order in which characters are defined in the document
41/// and the order in which they appear visually during rendering (and thus the order in
42/// which they are read by a user) may not necessarily match.
43///
44/// [PdfPageText] implements both the [ToString] and the [Display] traits.
45pub struct PdfPageText<'a> {
46    text_page_handle: FPDF_TEXTPAGE,
47    page: &'a PdfPage<'a>,
48    bindings: &'a dyn PdfiumLibraryBindings,
49}
50
51impl<'a> PdfPageText<'a> {
52    pub(crate) fn from_pdfium(
53        text_page_handle: FPDF_TEXTPAGE,
54        page: &'a PdfPage<'a>,
55        bindings: &'a dyn PdfiumLibraryBindings,
56    ) -> Self {
57        PdfPageText {
58            text_page_handle,
59            page,
60            bindings,
61        }
62    }
63
64    /// Returns the internal `FPDF_TEXTPAGE` handle for this [PdfPageText].
65    #[inline]
66    pub(crate) fn text_page_handle(&self) -> FPDF_TEXTPAGE {
67        self.text_page_handle
68    }
69
70    /// Returns the [PdfiumLibraryBindings] used by this [PdfPageText].
71    #[inline]
72    pub fn bindings(&self) -> &'a dyn PdfiumLibraryBindings {
73        self.bindings
74    }
75
76    /// Returns the total number of characters in all text segments in the containing [PdfPage].
77    ///
78    /// The character count includes whitespace and newlines, and so may differ slightly
79    /// from the result of calling `PdfPageText::all().len()`.
80    #[inline]
81    pub fn len(&self) -> i32 {
82        self.bindings.FPDFText_CountChars(self.text_page_handle())
83    }
84
85    /// Returns `true` if there are no characters in any text box collection in the containing [PdfPage].
86    #[inline]
87    pub fn is_empty(&self) -> bool {
88        self.len() == 0
89    }
90
91    /// Returns a collection of all the `PdfPageTextSegment` text segments in the containing [PdfPage].
92    #[inline]
93    pub fn segments(&self) -> PdfPageTextSegments {
94        PdfPageTextSegments::new(self, 0, self.len(), self.bindings())
95    }
96
97    /// Returns a subset of the `PdfPageTextSegment` text segments in the containing [PdfPage].
98    /// Only text segments containing characters in the given index range will be included.
99    #[inline]
100    pub fn segments_subset(
101        &self,
102        start: PdfPageTextCharIndex,
103        count: PdfPageTextCharIndex,
104    ) -> PdfPageTextSegments {
105        PdfPageTextSegments::new(self, start as i32, count as i32, self.bindings())
106    }
107
108    /// Returns a collection of all the `PdfPageTextChar` characters in the containing [PdfPage].
109    #[inline]
110    pub fn chars(&self) -> PdfPageTextChars {
111        PdfPageTextChars::new(
112            self.page.document_handle(),
113            self.page.page_handle(),
114            self.text_page_handle(),
115            (0..self.len()).collect(),
116            self.bindings(),
117        )
118    }
119
120    #[cfg(any(
121        feature = "pdfium_future",
122        feature = "pdfium_7350",
123        feature = "pdfium_7215",
124        feature = "pdfium_7123",
125        feature = "pdfium_6996",
126        feature = "pdfium_6721",
127        feature = "pdfium_6666",
128        feature = "pdfium_6611",
129    ))]
130    /// Returns a collection of all the `PdfPageTextChar` characters in the given [PdfPageTextObject].
131    ///
132    /// The return result will be empty if the given [PdfPageTextObject] is not attached to the
133    /// containing [PdfPage].
134    #[inline]
135    pub fn chars_for_object(
136        &self,
137        object: &PdfPageTextObject,
138    ) -> Result<PdfPageTextChars, PdfiumError> {
139        Ok(PdfPageTextChars::new(
140            self.page.document_handle(),
141            self.page.page_handle(),
142            self.text_page_handle(),
143            self.chars()
144                .iter()
145                .filter(|char| {
146                    self.bindings
147                        .FPDFText_GetTextObject(self.text_page_handle(), char.index() as i32)
148                        == object.object_handle()
149                })
150                .map(|char| char.index() as i32)
151                .collect(),
152            self.bindings(),
153        ))
154    }
155
156    /// Returns a collection of all the `PdfPageTextChar` characters in the given [PdfPageAnnotation].
157    ///
158    /// The return result will be empty if the given [PdfPageAnnotation] is not attached to the
159    /// containing [PdfPage].
160    #[inline]
161    pub fn chars_for_annotation(
162        &self,
163        annotation: &PdfPageAnnotation,
164    ) -> Result<PdfPageTextChars, PdfiumError> {
165        self.chars_inside_rect(annotation.bounds()?)
166            .map_err(|_| PdfiumError::NoCharsInAnnotation)
167    }
168
169    /// Returns a collection of all the `PdfPageTextChar` characters that lie within the bounds of
170    /// the given [PdfRect] in the containing [PdfPage].
171    #[inline]
172    pub fn chars_inside_rect(&self, rect: PdfRect) -> Result<PdfPageTextChars, PdfiumError> {
173        let tolerance_x = rect.width() / 2.0;
174        let tolerance_y = rect.height() / 2.0;
175        let center_height = rect.bottom() + tolerance_y;
176
177        match (
178            Self::get_char_index_near_point(
179                self.text_page_handle(),
180                rect.left(),
181                tolerance_x,
182                center_height,
183                tolerance_y,
184                self.bindings(),
185            ),
186            Self::get_char_index_near_point(
187                self.text_page_handle(),
188                rect.right(),
189                tolerance_x,
190                center_height,
191                tolerance_y,
192                self.bindings(),
193            ),
194        ) {
195            (Some(start), Some(end)) => Ok(PdfPageTextChars::new(
196                self.page.document_handle(),
197                self.page.page_handle(),
198                self.text_page_handle(),
199                (start as i32..=end as i32 + 1).collect(),
200                self.bindings,
201            )),
202            (Some(start), None) => Ok(PdfPageTextChars::new(
203                self.page.document_handle(),
204                self.page.page_handle(),
205                self.text_page_handle(),
206                (start as i32..=start as i32 + 1).collect(),
207                self.bindings,
208            )),
209            (None, Some(end)) => Ok(PdfPageTextChars::new(
210                self.page.document_handle(),
211                self.page.page_handle(),
212                self.text_page_handle(),
213                (end as i32..=end as i32 + 1).collect(),
214                self.bindings,
215            )),
216            _ => Err(PdfiumError::NoCharsInRect),
217        }
218    }
219
220    /// Returns the character near to the given x and y positions on the containing [PdfPage],
221    /// if any. The returned character will be no further from the given positions than the given
222    /// tolerance values.
223    pub(crate) fn get_char_index_near_point(
224        text_page_handle: FPDF_TEXTPAGE,
225        x: PdfPoints,
226        tolerance_x: PdfPoints,
227        y: PdfPoints,
228        tolerance_y: PdfPoints,
229        bindings: &dyn PdfiumLibraryBindings,
230    ) -> Option<PdfPageTextCharIndex> {
231        match bindings.FPDFText_GetCharIndexAtPos(
232            text_page_handle,
233            x.value as c_double,
234            y.value as c_double,
235            tolerance_x.value as c_double,
236            tolerance_y.value as c_double,
237        ) {
238            -1 => None, // No character at position within tolerances
239            -3 => None, // An error occurred, but we'll eat it
240            index => Some(index as PdfPageTextCharIndex),
241        }
242    }
243
244    /// Returns all characters that lie within the containing [PdfPage], in the order in which
245    /// they are defined in the document, concatenated into a single string.
246    ///
247    /// In complex custom layouts, the order in which characters are defined in the document
248    /// and the order in which they appear visually during rendering (and thus the order in
249    /// which they are read by a user) may not necessarily match.
250    pub fn all(&self) -> String {
251        self.inside_rect(self.page.page_size())
252    }
253
254    /// Returns all characters that lie within the bounds of the given [PdfRect] in the
255    /// containing [PdfPage], in the order in which they are defined in the document,
256    /// concatenated into a single string.
257    ///
258    /// In complex custom layouts, the order in which characters are defined in the document
259    /// and the order in which they appear visually during rendering (and thus the order in
260    /// which they are read by a user) may not necessarily match.
261    pub fn inside_rect(&self, rect: PdfRect) -> String {
262        // Retrieving the bounded text from Pdfium is a two-step operation. First, we call
263        // FPDFText_GetBoundedText() with a null buffer; this will retrieve the length of
264        // the bounded text in _characters_ (not _bytes_!). If the length is zero, then there is
265        // no text within the given rectangle's boundaries.
266
267        // If the length is non-zero, then we reserve a buffer (sized in words rather than bytes,
268        // to allow for two bytes per character) and call FPDFText_GetBoundedText() again with a
269        // pointer to the buffer; this will write the bounded text to the buffer in UTF16-LE format.
270
271        let left = rect.left().value as f64;
272
273        let top = rect.top().value as f64;
274
275        let right = rect.right().value as f64;
276
277        let bottom = rect.bottom().value as f64;
278
279        let chars_count = self.bindings().FPDFText_GetBoundedText(
280            self.text_page_handle(),
281            left,
282            top,
283            right,
284            bottom,
285            null_mut(),
286            0,
287        );
288
289        if chars_count == 0 {
290            // No text lies within the given rectangle.
291
292            return String::new();
293        }
294
295        let mut buffer = create_sized_buffer(chars_count as usize);
296
297        let result = self.bindings().FPDFText_GetBoundedText(
298            self.text_page_handle(),
299            left,
300            top,
301            right,
302            bottom,
303            buffer.as_mut_ptr(),
304            chars_count,
305        );
306
307        assert_eq!(result, chars_count);
308
309        get_string_from_pdfium_utf16le_bytes(cast_slice(buffer.as_slice()).to_vec())
310            .unwrap_or_default()
311    }
312
313    /// Returns all characters assigned to the given [PdfPageTextObject] in this [PdfPageText] object,
314    /// concatenated into a single string.
315    pub fn for_object(&self, object: &PdfPageTextObject) -> String {
316        // Retrieving the string value from Pdfium is a two-step operation. First, we call
317        // FPDFTextObj_GetText() with a null buffer; this will retrieve the length of
318        // the text in bytes, assuming the page object exists. If the length is zero,
319        // then there is no text.
320
321        // If the length is non-zero, then we reserve a byte buffer of the given
322        // length and call FPDFTextObj_GetText() again with a pointer to the buffer;
323        // this will write the text for the page object into the buffer.
324
325        let buffer_length = self.bindings().FPDFTextObj_GetText(
326            object.object_handle(),
327            self.text_page_handle(),
328            null_mut(),
329            0,
330        );
331
332        if buffer_length == 0 {
333            // There is no text.
334
335            return String::new();
336        }
337
338        let mut buffer = create_byte_buffer(buffer_length as usize);
339
340        let result = self.bindings().FPDFTextObj_GetText(
341            object.object_handle(),
342            self.text_page_handle(),
343            buffer.as_mut_ptr() as *mut FPDF_WCHAR,
344            buffer_length,
345        );
346
347        assert_eq!(result, buffer_length);
348
349        get_string_from_pdfium_utf16le_bytes(buffer).unwrap_or_default()
350    }
351
352    /// Returns all characters that lie within the bounds of the given [PdfPageAnnotation] in the
353    /// containing [PdfPage], in the order in which they are defined in the document,
354    /// concatenated into a single string.
355    ///
356    /// In complex custom layouts, the order in which characters are defined in the document
357    /// and the order in which they appear visually during rendering (and thus the order in
358    /// which they are read by a user) may not necessarily match.
359    #[inline]
360    pub fn for_annotation(&self, annotation: &PdfPageAnnotation) -> Result<String, PdfiumError> {
361        let bounds = annotation.bounds()?;
362
363        Ok(self.inside_rect(bounds))
364    }
365
366    /// Starts a search for the given text string, returning a new [PdfPageTextSearch]
367    /// object that can be used to step through the search results.
368    #[inline]
369    pub fn search(
370        &self,
371        text: &str,
372        options: &PdfSearchOptions,
373    ) -> Result<PdfPageTextSearch, PdfiumError> {
374        self.search_from(text, options, 0)
375    }
376
377    /// Starts a search for the given test string from the given character position,
378    /// returning a new [PdfPageTextSearch] object that can be used to step through
379    /// the search results.
380    pub fn search_from(
381        &self,
382        text: &str,
383        options: &PdfSearchOptions,
384        index: PdfPageTextCharIndex,
385    ) -> Result<PdfPageTextSearch, PdfiumError> {
386        if text.is_empty() {
387            Err(PdfiumError::TextSearchTargetIsEmpty)
388        } else {
389            Ok(PdfPageTextSearch::from_pdfium(
390                self.bindings().FPDFText_FindStart(
391                    self.text_page_handle(),
392                    get_pdfium_utf16le_bytes_from_str(text).as_ptr() as FPDF_WIDESTRING,
393                    options.as_pdfium(),
394                    index as c_int,
395                ),
396                self,
397                self.bindings(),
398            ))
399        }
400    }
401}
402
403impl<'a> Display for PdfPageText<'a> {
404    #[inline]
405    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
406        f.write_str(self.all().as_str())
407    }
408}
409
410impl<'a> Drop for PdfPageText<'a> {
411    /// Closes the [PdfPageText] collection, releasing held memory.
412    #[inline]
413    fn drop(&mut self) {
414        self.bindings().FPDFText_ClosePage(self.text_page_handle());
415    }
416}
417
418#[cfg(test)]
419mod tests {
420    use itertools::Itertools;
421    use std::ffi::OsStr;
422    use std::fs;
423
424    use crate::prelude::*;
425    use crate::utils::test::test_bind_to_pdfium;
426
427    #[test]
428    fn test_overlapping_chars_results() -> Result<(), PdfiumError> {
429        // Test to make sure the result of the .chars_for_object() function returns the
430        // correct results in the event of overlapping text objects.
431        // For more details, see: https://github.com/ajrcarey/pdfium-render/issues/98
432
433        let pdfium = test_bind_to_pdfium();
434
435        // Create a new document with three overlapping text objects.
436
437        let mut document = pdfium.create_new_pdf()?;
438
439        let mut page = document
440            .pages_mut()
441            .create_page_at_start(PdfPagePaperSize::a4())?;
442
443        let font = document.fonts_mut().courier();
444
445        let txt1 = page.objects_mut().create_text_object(
446            PdfPoints::ZERO,
447            PdfPoints::ZERO,
448            "AAAAAA",
449            font,
450            PdfPoints::new(10.0),
451        )?;
452
453        let txt2 = page.objects_mut().create_text_object(
454            PdfPoints::ZERO,
455            PdfPoints::ZERO,
456            "BBBBBB",
457            font,
458            PdfPoints::new(10.0),
459        )?;
460
461        let txt3 = page.objects_mut().create_text_object(
462            PdfPoints::ZERO,
463            PdfPoints::ZERO,
464            "CDCDCDE",
465            font,
466            PdfPoints::new(10.0),
467        )?;
468
469        let page_text = page.text()?;
470
471        // Check the results for all three objects are not affected by overlapping.
472
473        assert!(test_one_overlapping_text_object_results(
474            &txt1, &page_text, "AAAAAA"
475        )?);
476        assert!(test_one_overlapping_text_object_results(
477            &txt2, &page_text, "BBBBBB"
478        )?);
479        assert!(test_one_overlapping_text_object_results(
480            &txt3, &page_text, "CDCDCDE"
481        )?);
482
483        Ok(())
484    }
485
486    fn test_one_overlapping_text_object_results(
487        object: &PdfPageObject,
488        page_text: &PdfPageText,
489        expected: &str,
490    ) -> Result<bool, PdfiumError> {
491        if let Some(txt) = object.as_text_object() {
492            assert_eq!(txt.text().trim(), expected);
493            assert_eq!(page_text.for_object(txt).trim(), expected);
494
495            for (index, char) in txt.chars(&page_text)?.iter().enumerate() {
496                assert_eq!(txt.text().chars().nth(index), char.unicode_char());
497                assert_eq!(expected.chars().nth(index), char.unicode_char());
498            }
499
500            Ok(true)
501        } else {
502            Ok(false)
503        }
504    }
505
506    #[test]
507    fn test_text_chars_results_equality() -> Result<(), PdfiumError> {
508        // For all available test documents, check that the results of
509        // PdfPageObjectText::text() and PdfPageObjectText::chars() match.
510
511        let pdfium = test_bind_to_pdfium();
512
513        let samples = fs::read_dir("./test/")
514            .unwrap()
515            .filter_map(|entry| match entry {
516                Ok(e) => Some(e.path()),
517                Err(_) => None,
518            })
519            .filter(|path| path.extension() == Some(OsStr::new("pdf")))
520            .collect::<Vec<_>>();
521
522        assert!(samples.len() > 0);
523
524        for sample in samples {
525            println!("Testing all text objects in file {}", sample.display());
526
527            let document = pdfium.load_pdf_from_file(&sample, None)?;
528
529            for page in document.pages().iter() {
530                let text = page.text()?;
531
532                for object in page.objects().iter() {
533                    if let Some(obj) = object.as_text_object() {
534                        let chars = obj
535                            .chars(&text)?
536                            .iter()
537                            .filter_map(|char| char.unicode_string())
538                            .join("");
539
540                        assert_eq!(obj.text().trim(), chars.replace("\0", "").trim());
541                    }
542                }
543            }
544        }
545
546        Ok(())
547    }
548}