pdfium_render/pdf/document/page/
text.rs1pub mod char;
5pub mod chars;
6pub mod search;
7pub mod segment;
8pub mod segments;
9
10use crate::bindgen::{FPDF_TEXTPAGE, FPDF_WCHAR, FPDF_WIDESTRING};
11use crate::bindings::PdfiumLibraryBindings;
12use crate::error::PdfiumError;
13use crate::pdf::document::page::annotation::PdfPageAnnotation;
14use crate::pdf::document::page::annotation::PdfPageAnnotationCommon;
15use crate::pdf::document::page::object::private::internal::PdfPageObjectPrivate;
16use crate::pdf::document::page::object::text::PdfPageTextObject;
17use crate::pdf::document::page::text::chars::{PdfPageTextCharIndex, PdfPageTextChars};
18use crate::pdf::document::page::text::search::{PdfPageTextSearch, PdfSearchOptions};
19use crate::pdf::document::page::text::segments::PdfPageTextSegments;
20use crate::pdf::document::page::PdfPage;
21use crate::pdf::points::PdfPoints;
22use crate::pdf::rect::PdfRect;
23use crate::utils::mem::{create_byte_buffer, create_sized_buffer};
24use crate::utils::utf16le::{
25 get_pdfium_utf16le_bytes_from_str, get_string_from_pdfium_utf16le_bytes,
26};
27use bytemuck::cast_slice;
28use std::fmt::{Display, Formatter};
29use std::os::raw::{c_double, c_int};
30use std::ptr::null_mut;
31
32#[cfg(any(
33 feature = "pdfium_future",
34 feature = "pdfium_7123",
35 feature = "pdfium_6996",
36 feature = "pdfium_6721",
37 feature = "pdfium_6666",
38 feature = "pdfium_6611",
39))]
40use crate::pdf::document::page::object::PdfPageObjectCommon;
41
42pub struct PdfPageText<'a> {
56 text_page_handle: FPDF_TEXTPAGE,
57 page: &'a PdfPage<'a>,
58 bindings: &'a dyn PdfiumLibraryBindings,
59}
60
61impl<'a> PdfPageText<'a> {
62 pub(crate) fn from_pdfium(
63 text_page_handle: FPDF_TEXTPAGE,
64 page: &'a PdfPage<'a>,
65 bindings: &'a dyn PdfiumLibraryBindings,
66 ) -> Self {
67 PdfPageText {
68 text_page_handle,
69 page,
70 bindings,
71 }
72 }
73
74 #[inline]
76 pub(crate) fn text_page_handle(&self) -> FPDF_TEXTPAGE {
77 self.text_page_handle
78 }
79
80 #[inline]
82 pub fn bindings(&self) -> &'a dyn PdfiumLibraryBindings {
83 self.bindings
84 }
85
86 #[inline]
91 pub fn len(&self) -> i32 {
92 self.bindings.FPDFText_CountChars(self.text_page_handle())
93 }
94
95 #[inline]
97 pub fn is_empty(&self) -> bool {
98 self.len() == 0
99 }
100
101 #[inline]
103 pub fn segments(&self) -> PdfPageTextSegments {
104 PdfPageTextSegments::new(self, 0, self.len(), self.bindings())
105 }
106
107 #[inline]
110 pub fn segments_subset(
111 &self,
112 start: PdfPageTextCharIndex,
113 count: PdfPageTextCharIndex,
114 ) -> PdfPageTextSegments {
115 PdfPageTextSegments::new(self, start as i32, count as i32, self.bindings())
116 }
117
118 #[inline]
120 pub fn chars(&self) -> PdfPageTextChars {
121 PdfPageTextChars::new(
122 self.page.document_handle(),
123 self.page.page_handle(),
124 self.text_page_handle(),
125 (0..self.len()).collect(),
126 self.bindings(),
127 )
128 }
129
130 #[cfg(any(
131 feature = "pdfium_future",
132 feature = "pdfium_7123",
133 feature = "pdfium_6996",
134 feature = "pdfium_6721",
135 feature = "pdfium_6666",
136 feature = "pdfium_6611",
137 ))]
138 #[inline]
143 pub fn chars_for_object(
144 &self,
145 object: &PdfPageTextObject,
146 ) -> Result<PdfPageTextChars, PdfiumError> {
147 let chars_inside_bounds = self
148 .chars_inside_rect(object.bounds()?.to_rect())
149 .map_err(|_| PdfiumError::NoCharsInPageObject)?;
150
151 Ok(PdfPageTextChars::new(
157 self.page.document_handle(),
158 self.page.page_handle(),
159 self.text_page_handle(),
160 chars_inside_bounds
161 .iter()
162 .filter(|char| {
163 self.bindings
164 .FPDFText_GetTextObject(self.text_page_handle(), char.index() as i32)
165 == object.object_handle()
166 })
167 .map(|char| char.index() as i32)
168 .collect(),
169 self.bindings(),
170 ))
171 }
172
173 #[inline]
178 pub fn chars_for_annotation(
179 &self,
180 annotation: &PdfPageAnnotation,
181 ) -> Result<PdfPageTextChars, PdfiumError> {
182 self.chars_inside_rect(annotation.bounds()?)
183 .map_err(|_| PdfiumError::NoCharsInAnnotation)
184 }
185
186 #[inline]
189 pub fn chars_inside_rect(&self, rect: PdfRect) -> Result<PdfPageTextChars, PdfiumError> {
190 let tolerance_x = rect.width() / 2.0;
191 let tolerance_y = rect.height() / 2.0;
192 let center_height = rect.bottom() + tolerance_y;
193
194 let chars = self.chars();
195
196 match (
197 chars.get_char_near_point(rect.left(), tolerance_x, center_height, tolerance_y),
198 chars.get_char_near_point(rect.right(), tolerance_x, center_height, tolerance_y),
199 ) {
200 (Some(start), Some(end)) => Ok(PdfPageTextChars::new(
201 self.page.document_handle(),
202 self.page.page_handle(),
203 self.text_page_handle(),
204 (start.index() as i32..end.index().saturating_sub(start.index()) as i32 + 1)
205 .collect(),
206 self.bindings,
207 )),
208 _ => Err(PdfiumError::NoCharsInRect),
209 }
210 }
211
212 pub(crate) fn get_char_index_near_point(
216 text_page_handle: FPDF_TEXTPAGE,
217 x: PdfPoints,
218 tolerance_x: PdfPoints,
219 y: PdfPoints,
220 tolerance_y: PdfPoints,
221 bindings: &dyn PdfiumLibraryBindings,
222 ) -> Option<PdfPageTextCharIndex> {
223 match bindings.FPDFText_GetCharIndexAtPos(
224 text_page_handle,
225 x.value as c_double,
226 y.value as c_double,
227 tolerance_x.value as c_double,
228 tolerance_y.value as c_double,
229 ) {
230 -1 => None, -3 => None, index => Some(index as PdfPageTextCharIndex),
233 }
234 }
235
236 pub fn all(&self) -> String {
243 self.inside_rect(self.page.page_size())
244 }
245
246 pub fn inside_rect(&self, rect: PdfRect) -> String {
254 let left = rect.left().value as f64;
264
265 let top = rect.top().value as f64;
266
267 let right = rect.right().value as f64;
268
269 let bottom = rect.bottom().value as f64;
270
271 let chars_count = self.bindings().FPDFText_GetBoundedText(
272 self.text_page_handle(),
273 left,
274 top,
275 right,
276 bottom,
277 null_mut(),
278 0,
279 );
280
281 if chars_count == 0 {
282 return String::new();
285 }
286
287 let mut buffer = create_sized_buffer(chars_count as usize);
288
289 let result = self.bindings().FPDFText_GetBoundedText(
290 self.text_page_handle(),
291 left,
292 top,
293 right,
294 bottom,
295 buffer.as_mut_ptr(),
296 chars_count,
297 );
298
299 assert_eq!(result, chars_count);
300
301 get_string_from_pdfium_utf16le_bytes(cast_slice(buffer.as_slice()).to_vec())
302 .unwrap_or_default()
303 }
304
305 pub fn for_object(&self, object: &PdfPageTextObject) -> String {
308 let buffer_length = self.bindings().FPDFTextObj_GetText(
318 object.object_handle(),
319 self.text_page_handle(),
320 null_mut(),
321 0,
322 );
323
324 if buffer_length == 0 {
325 return String::new();
328 }
329
330 let mut buffer = create_byte_buffer(buffer_length as usize);
331
332 let result = self.bindings().FPDFTextObj_GetText(
333 object.object_handle(),
334 self.text_page_handle(),
335 buffer.as_mut_ptr() as *mut FPDF_WCHAR,
336 buffer_length,
337 );
338
339 assert_eq!(result, buffer_length);
340
341 get_string_from_pdfium_utf16le_bytes(buffer).unwrap_or_default()
342 }
343
344 #[inline]
352 pub fn for_annotation(&self, annotation: &PdfPageAnnotation) -> Result<String, PdfiumError> {
353 let bounds = annotation.bounds()?;
354
355 Ok(self.inside_rect(bounds))
356 }
357
358 #[inline]
361 pub fn search(&self, text: &str, options: &PdfSearchOptions) -> PdfPageTextSearch {
362 self.search_from(text, options, 0)
363 }
364
365 pub fn search_from(
369 &self,
370 text: &str,
371 options: &PdfSearchOptions,
372 index: PdfPageTextCharIndex,
373 ) -> PdfPageTextSearch {
374 PdfPageTextSearch::from_pdfium(
375 self.bindings().FPDFText_FindStart(
376 self.text_page_handle(),
377 get_pdfium_utf16le_bytes_from_str(text).as_ptr() as FPDF_WIDESTRING,
378 options.as_pdfium(),
379 index as c_int,
380 ),
381 self,
382 self.bindings(),
383 )
384 }
385}
386
387impl<'a> Display for PdfPageText<'a> {
388 #[inline]
389 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
390 f.write_str(self.all().as_str())
391 }
392}
393
394impl<'a> Drop for PdfPageText<'a> {
395 #[inline]
397 fn drop(&mut self) {
398 self.bindings().FPDFText_ClosePage(self.text_page_handle());
399 }
400}
401
#[cfg(test)]
mod tests {
    use crate::prelude::*;
    use crate::utils::test::test_bind_to_pdfium;

    /// Creates three text objects at the same origin, so that they overlap, and
    /// checks that the text and characters retrieved for each object are that
    /// object's own.
    #[test]
    fn test_overlapping_chars_results() -> Result<(), PdfiumError> {
        let pdfium = test_bind_to_pdfium();

        let mut document = pdfium.create_new_pdf()?;

        let mut page = document
            .pages_mut()
            .create_page_at_start(PdfPagePaperSize::a4())?;

        let font = document.fonts_mut().courier();
        let font_size = PdfPoints::new(10.0);

        // All three objects share the same position on the page.
        let txt1 = page.objects_mut().create_text_object(
            PdfPoints::ZERO,
            PdfPoints::ZERO,
            "AAAAAA",
            font,
            font_size,
        )?;

        let txt2 = page.objects_mut().create_text_object(
            PdfPoints::ZERO,
            PdfPoints::ZERO,
            "BBBBBB",
            font,
            font_size,
        )?;

        let txt3 = page.objects_mut().create_text_object(
            PdfPoints::ZERO,
            PdfPoints::ZERO,
            "CDCDCDE",
            font,
            font_size,
        )?;

        let page_text = page.text()?;

        let cases = [(&txt1, "AAAAAA"), (&txt2, "BBBBBB"), (&txt3, "CDCDCDE")];

        for &(object, expected) in cases.iter() {
            assert!(test_one_overlapping_text_object_results(
                object, &page_text, expected
            )?);
        }

        Ok(())
    }

    /// Checks a single page object against its expected text.
    ///
    /// Returns `Ok(false)` if the object is not a text object. Otherwise asserts
    /// that the object's text, the page text for the object, and each of the
    /// object's characters match `expected`, and returns `Ok(true)`.
    fn test_one_overlapping_text_object_results(
        object: &PdfPageObject,
        page_text: &PdfPageText,
        expected: &str,
    ) -> Result<bool, PdfiumError> {
        match object.as_text_object() {
            None => Ok(false),
            Some(txt) => {
                let object_text = txt.text();

                assert_eq!(object_text.trim(), expected);
                assert_eq!(page_text.for_object(txt).trim(), expected);

                let object_chars = txt.chars(page_text)?;

                for (position, character) in object_chars.iter().enumerate() {
                    let actual = character.unicode_char();

                    assert_eq!(object_text.chars().nth(position), actual);
                    assert_eq!(expected.chars().nth(position), actual);
                }

                Ok(true)
            }
        }
    }
}