oxidize_pdf/text/plaintext/
extractor.rs1use super::types::{LineBreakMode, PlainTextConfig, PlainTextResult};
7use crate::parser::content::{ContentOperation, ContentParser};
8use crate::parser::document::PdfDocument;
9use crate::parser::objects::PdfObject;
10use crate::parser::page_tree::ParsedPage;
11use crate::parser::ParseResult;
12use crate::text::encoding::TextEncoding;
13use crate::text::extraction_cmap::{CMapTextExtractor, FontInfo};
14use std::collections::HashMap;
15use std::io::{Read, Seek};
16
/// Identity transform in six-element `[a, b, c, d, e, f]` form.
///
/// `BeginText` resets both the text matrix and the text line matrix to this value.
const IDENTITY: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
19
/// Text-related graphics state tracked while walking a page's content operations.
#[derive(Debug, Clone)]
struct TextState {
    /// Current text matrix; its translation is the origin used for each shown string.
    text_matrix: [f64; 6],
    /// Matrix at the start of the current line; line-move operations are applied relative to it.
    text_line_matrix: [f64; 6],
    /// Vertical distance subtracted by `NextLine`, as set by `SetLeading`.
    leading: f64,
    /// Size from the most recent `SetFont`; scales the space-insertion threshold.
    font_size: f64,
    /// Name from the most recent `SetFont`, used as the font cache key.
    font_name: Option<String>,
}
29
30impl Default for TextState {
31 fn default() -> Self {
32 Self {
33 text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
34 text_line_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
35 leading: 0.0,
36 font_size: 0.0,
37 font_name: None,
38 }
39 }
40}
41
42pub struct PlainTextExtractor {
107 config: PlainTextConfig,
109 font_cache: HashMap<String, FontInfo>,
111}
112
113impl Default for PlainTextExtractor {
114 fn default() -> Self {
115 Self::new()
116 }
117}
118
119impl PlainTextExtractor {
120 pub fn new() -> Self {
130 Self {
131 config: PlainTextConfig::default(),
132 font_cache: HashMap::new(),
133 }
134 }
135
136 pub fn with_config(config: PlainTextConfig) -> Self {
147 Self {
148 config,
149 font_cache: HashMap::new(),
150 }
151 }
152
153 pub fn extract<R: Read + Seek>(
182 &mut self,
183 document: &PdfDocument<R>,
184 page_index: u32,
185 ) -> ParseResult<PlainTextResult> {
186 let page = document.get_page(page_index)?;
188
189 self.extract_font_resources(&page, document)?;
191
192 let streams = page.content_streams_with_document(document)?;
194
195 let mut extracted_text = String::with_capacity(4096);
197 let mut state = TextState::default();
198 let mut in_text_object = false;
199 let mut last_x = 0.0;
200 let mut last_y = 0.0;
201
202 for stream_data in streams {
204 let operations = match ContentParser::parse_content(&stream_data) {
205 Ok(ops) => ops,
206 Err(e) => {
207 tracing::debug!("Warning: Failed to parse content stream, skipping: {}", e);
208 continue;
209 }
210 };
211
212 for op in operations {
213 match op {
214 ContentOperation::BeginText => {
215 in_text_object = true;
216 state.text_matrix = IDENTITY;
217 state.text_line_matrix = IDENTITY;
218 }
219
220 ContentOperation::EndText => {
221 in_text_object = false;
222 }
223
224 ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
225 state.text_matrix =
226 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
227 state.text_line_matrix =
228 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
229 }
230
231 ContentOperation::MoveText(tx, ty) => {
232 let new_matrix = multiply_matrix(
233 &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
234 &state.text_line_matrix,
235 );
236 state.text_matrix = new_matrix;
237 state.text_line_matrix = new_matrix;
238 }
239
240 ContentOperation::NextLine => {
241 let new_matrix = multiply_matrix(
242 &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
243 &state.text_line_matrix,
244 );
245 state.text_matrix = new_matrix;
246 state.text_line_matrix = new_matrix;
247 }
248
249 ContentOperation::ShowText(text) => {
250 if in_text_object {
251 let decoded = self.decode_text::<R>(&text, &state)?;
252
253 let (x, y) = transform_point(0.0, 0.0, &state.text_matrix);
255
256 if !extracted_text.is_empty() {
258 let dx = x - last_x;
259 let dy = (y - last_y).abs();
260
261 if dy > self.config.newline_threshold {
262 extracted_text.push('\n');
263 } else if dx > self.config.space_threshold * state.font_size {
264 extracted_text.push(' ');
265 }
266 }
267
268 extracted_text.push_str(&decoded);
269 last_x = x;
270 last_y = y;
271 }
272 }
273
274 ContentOperation::SetFont(name, size) => {
275 state.font_name = Some(name);
276 state.font_size = size as f64;
277 }
278
279 ContentOperation::SetLeading(leading) => {
280 state.leading = leading as f64;
281 }
282
283 _ => {
284 }
286 }
287 }
288 }
289
290 let processed_text = self.apply_line_break_mode(&extracted_text);
292
293 Ok(PlainTextResult::new(processed_text))
294 }
295
296 pub fn extract_lines<R: Read + Seek>(
320 &mut self,
321 document: &PdfDocument<R>,
322 page_index: u32,
323 ) -> ParseResult<Vec<String>> {
324 let result = self.extract(document, page_index)?;
325
326 Ok(result.text.lines().map(|line| line.to_string()).collect())
327 }
328
329 fn extract_font_resources<R: Read + Seek>(
331 &mut self,
332 page: &ParsedPage,
333 document: &PdfDocument<R>,
334 ) -> ParseResult<()> {
335 if let Some(resources) = page.get_resources() {
340 if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
341 for (font_name, font_obj) in font_dict.0.iter() {
343 if let Some(font_ref) = font_obj.as_reference() {
344 if let Ok(PdfObject::Dictionary(font_dict)) =
345 document.get_object(font_ref.0, font_ref.1)
346 {
347 let mut cmap_extractor: CMapTextExtractor<R> = CMapTextExtractor::new();
349
350 if let Ok(font_info) =
351 cmap_extractor.extract_font_info(&font_dict, document)
352 {
353 self.font_cache.insert(font_name.0.clone(), font_info);
354 }
355 }
356 }
357 }
358 }
359 }
360
361 Ok(())
362 }
363
364 fn decode_text<R: Read + Seek>(
366 &self,
367 text_bytes: &[u8],
368 state: &TextState,
369 ) -> ParseResult<String> {
370 if let Some(ref font_name) = state.font_name {
372 if let Some(font_info) = self.font_cache.get(font_name) {
373 if let Ok(decoded) =
374 crate::text::extraction_cmap::decode_text_with_font(text_bytes, font_info)
375 {
376 return Ok(decoded);
377 }
378 }
379 }
380
381 let encoding = if let Some(ref font_name) = state.font_name {
383 let font_lower = font_name.as_bytes();
385 if font_lower
386 .iter()
387 .any(|&b| b.to_ascii_lowercase() == b'r' && font_name.contains("roman"))
388 {
389 TextEncoding::MacRomanEncoding
390 } else if font_name.contains("WinAnsi") || font_name.contains("winansi") {
391 TextEncoding::WinAnsiEncoding
392 } else if font_name.contains("Standard") || font_name.contains("standard") {
393 TextEncoding::StandardEncoding
394 } else if font_name.contains("PdfDoc") || font_name.contains("pdfdoc") {
395 TextEncoding::PdfDocEncoding
396 } else if font_name.starts_with("Times")
397 || font_name.starts_with("Helvetica")
398 || font_name.starts_with("Courier")
399 {
400 TextEncoding::WinAnsiEncoding
401 } else {
402 TextEncoding::PdfDocEncoding
403 }
404 } else {
405 TextEncoding::WinAnsiEncoding
406 };
407
408 Ok(encoding.decode(text_bytes))
409 }
410
411 fn apply_line_break_mode(&self, text: &str) -> String {
413 match self.config.line_break_mode {
414 LineBreakMode::Auto => self.auto_line_breaks(text),
415 LineBreakMode::PreserveAll => text.to_string(),
416 LineBreakMode::Normalize => self.normalize_line_breaks(text),
417 }
418 }
419
420 fn auto_line_breaks(&self, text: &str) -> String {
422 let lines: Vec<&str> = text.lines().collect();
423 let mut result = String::with_capacity(text.len());
424
425 for (i, line) in lines.iter().enumerate() {
426 let trimmed = line.trim_end();
427
428 if trimmed.is_empty() {
429 result.push('\n');
430 continue;
431 }
432
433 result.push_str(line);
434
435 if i < lines.len() - 1 {
436 let next_line = lines[i + 1].trim_start();
437
438 let ends_with_punct = trimmed.ends_with('.')
439 || trimmed.ends_with('!')
440 || trimmed.ends_with('?')
441 || trimmed.ends_with(':');
442
443 let next_is_empty = next_line.is_empty();
444
445 if ends_with_punct || next_is_empty {
446 result.push('\n');
447 } else {
448 result.push(' ');
449 }
450 }
451 }
452
453 result
454 }
455
456 fn normalize_line_breaks(&self, text: &str) -> String {
458 let lines: Vec<&str> = text.lines().collect();
459 let mut result = String::with_capacity(text.len());
460
461 for (i, line) in lines.iter().enumerate() {
462 let trimmed = line.trim_end();
463
464 if trimmed.is_empty() {
465 result.push('\n');
466 continue;
467 }
468
469 if trimmed.ends_with('-') && i < lines.len() - 1 {
470 let next_line = lines[i + 1].trim_start();
471 if !next_line.is_empty() {
472 result.push_str(&trimmed[..trimmed.len() - 1]);
473 continue;
474 }
475 }
476
477 result.push_str(line);
478
479 if i < lines.len() - 1 {
480 result.push('\n');
481 }
482 }
483
484 result
485 }
486
487 pub fn config(&self) -> &PlainTextConfig {
499 &self.config
500 }
501}
502
503#[inline]
505fn is_identity(matrix: &[f64; 6]) -> bool {
506 matrix[0] == 1.0
507 && matrix[1] == 0.0
508 && matrix[2] == 0.0
509 && matrix[3] == 1.0
510 && matrix[4] == 0.0
511 && matrix[5] == 0.0
512}
513
514#[inline]
516fn multiply_matrix(m1: &[f64; 6], m2: &[f64; 6]) -> [f64; 6] {
517 if is_identity(m1) {
519 return *m2;
520 }
521 if is_identity(m2) {
523 return *m1;
524 }
525
526 [
528 m1[0] * m2[0] + m1[1] * m2[2],
529 m1[0] * m2[1] + m1[1] * m2[3],
530 m1[2] * m2[0] + m1[3] * m2[2],
531 m1[2] * m2[1] + m1[3] * m2[3],
532 m1[4] * m2[0] + m1[5] * m2[2] + m2[4],
533 m1[4] * m2[1] + m1[5] * m2[3] + m2[5],
534 ]
535}
536
/// Applies the transform `matrix`, given in `[a, b, c, d, e, f]` form, to the point `(x, y)`.
///
/// Returns `(a*x + c*y + e, b*x + d*y + f)`.
#[inline]
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
544
#[cfg(test)]
mod tests {
    //! Unit tests for the extractor's configuration plumbing, its line-break post-processing
    //! and the matrix helpers.

    use super::*;

    #[test]
    fn test_new() {
        let extractor = PlainTextExtractor::new();
        assert_eq!(extractor.config.space_threshold, 0.3);
    }

    #[test]
    fn test_with_config() {
        let config = PlainTextConfig::dense();
        let extractor = PlainTextExtractor::with_config(config.clone());
        assert_eq!(extractor.config, config);
    }

    #[test]
    fn test_default() {
        let extractor = PlainTextExtractor::default();
        assert_eq!(extractor.config, PlainTextConfig::default());
    }

    #[test]
    fn test_normalize_line_breaks_hyphenated() {
        let extractor = PlainTextExtractor::new();
        let text = "This is a docu-\nment with hyphen-\nated words.";
        let normalized = extractor.normalize_line_breaks(text);
        assert_eq!(normalized, "This is a document with hyphenated words.");
    }

    #[test]
    fn test_normalize_line_breaks_no_hyphen() {
        let extractor = PlainTextExtractor::new();
        let text = "This is a normal\ntext without\nhyphens.";
        let normalized = extractor.normalize_line_breaks(text);
        assert_eq!(normalized, "This is a normal\ntext without\nhyphens.");
    }

    #[test]
    fn test_auto_line_breaks_punctuation() {
        let extractor = PlainTextExtractor::new();
        let text = "First sentence.\nSecond sentence.\nThird sentence.";
        let processed = extractor.auto_line_breaks(text);
        assert_eq!(
            processed,
            "First sentence.\nSecond sentence.\nThird sentence."
        );
    }

    #[test]
    fn test_auto_line_breaks_wrapped() {
        let extractor = PlainTextExtractor::new();
        let text = "This is a long line that\nwas wrapped in the PDF\nfor layout purposes";
        let processed = extractor.auto_line_breaks(text);
        assert!(processed.contains("long line that was"));
        assert!(processed.contains("wrapped in the PDF for"));
    }

    #[test]
    fn test_auto_line_breaks_empty_lines() {
        let extractor = PlainTextExtractor::new();
        let text = "Paragraph one.\n\nParagraph two.\n\nParagraph three.";
        let processed = extractor.auto_line_breaks(text);
        assert!(processed.contains("\n\n"));
    }

    #[test]
    fn test_apply_line_break_mode_preserve_all() {
        let extractor = PlainTextExtractor::with_config(PlainTextConfig {
            line_break_mode: LineBreakMode::PreserveAll,
            ..Default::default()
        });
        let text = "Line 1\nLine 2\nLine 3";
        let processed = extractor.apply_line_break_mode(text);
        assert_eq!(processed, text);
    }

    #[test]
    fn test_apply_line_break_mode_normalize() {
        let extractor = PlainTextExtractor::with_config(PlainTextConfig {
            line_break_mode: LineBreakMode::Normalize,
            ..Default::default()
        });
        let text = "docu-\nment";
        let processed = extractor.apply_line_break_mode(text);
        assert_eq!(processed, "document");
    }

    #[test]
    fn test_apply_line_break_mode_auto() {
        let extractor = PlainTextExtractor::with_config(PlainTextConfig {
            line_break_mode: LineBreakMode::Auto,
            ..Default::default()
        });
        let text = "First sentence.\nSecond part";
        let processed = extractor.apply_line_break_mode(text);
        assert!(processed.contains("First sentence.\nSecond"));
    }

    #[test]
    fn test_config_getter() {
        let config = PlainTextConfig::loose();
        let extractor = PlainTextExtractor::with_config(config.clone());
        assert_eq!(extractor.config(), &config);
    }

    #[test]
    fn test_is_identity() {
        assert!(is_identity(&[1.0, 0.0, 0.0, 1.0, 0.0, 0.0]));
        assert!(!is_identity(&[1.0, 0.0, 0.0, 1.0, 10.0, 0.0]));
        assert!(!is_identity(&[2.0, 0.0, 0.0, 1.0, 0.0, 0.0]));
    }

    #[test]
    fn test_multiply_matrix() {
        let m1 = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
        let m2 = [1.0, 0.0, 0.0, 1.0, 5.0, 15.0];
        let result = multiply_matrix(&m1, &m2);
        assert_eq!(result, [1.0, 0.0, 0.0, 1.0, 15.0, 35.0]);
    }

    #[test]
    fn test_multiply_matrix_identity_operand() {
        let identity = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
        let m = [2.0, 3.0, 4.0, 5.0, 6.0, 7.0];
        assert_eq!(multiply_matrix(&identity, &m), m);
        assert_eq!(multiply_matrix(&m, &identity), m);
    }

    #[test]
    fn test_multiply_matrix_scale_then_translate() {
        // Scaling by 2 first doubles the point, then the second matrix shifts it.
        let scale = [2.0, 0.0, 0.0, 2.0, 1.0, 1.0];
        let shift = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
        let result = multiply_matrix(&scale, &shift);
        assert_eq!(result, [2.0, 0.0, 0.0, 2.0, 11.0, 21.0]);
    }

    #[test]
    fn test_transform_point() {
        let matrix = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
        let (x, y) = transform_point(5.0, 10.0, &matrix);
        assert_eq!(x, 15.0);
        assert_eq!(y, 30.0);
    }
}