oxidize_pdf/text/plaintext/
extractor.rs1use super::types::{LineBreakMode, PlainTextConfig, PlainTextResult};
7use crate::parser::content::{ContentOperation, ContentParser};
8use crate::parser::document::PdfDocument;
9use crate::parser::objects::PdfObject;
10use crate::parser::page_tree::ParsedPage;
11use crate::parser::ParseResult;
12use crate::text::encoding::TextEncoding;
13use crate::text::extraction_cmap::{CMapTextExtractor, FontInfo};
14use std::collections::HashMap;
15use std::io::{Read, Seek};
16
/// The identity affine matrix in `[a, b, c, d, e, f]` form.
const IDENTITY: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];

/// Text-related graphics state tracked while walking a content stream.
#[derive(Debug, Clone)]
struct TextState {
    /// Current text matrix; its translation part gives the position of the
    /// next text run.
    text_matrix: [f64; 6],
    /// Matrix recorded at the start of the current line; line-relative moves
    /// are applied on top of it.
    text_line_matrix: [f64; 6],
    /// Vertical distance used when advancing to the next line.
    leading: f64,
    /// Size of the currently selected font.
    font_size: f64,
    /// Resource name of the currently selected font, if one has been set.
    font_name: Option<String>,
}

impl Default for TextState {
    /// Returns a state with identity matrices, zero leading, zero font size
    /// and no font selected.
    fn default() -> Self {
        Self {
            // Use the shared constant rather than repeating the literal so the
            // initial state cannot drift from the value restored elsewhere.
            text_matrix: IDENTITY,
            text_line_matrix: IDENTITY,
            leading: 0.0,
            font_size: 0.0,
            font_name: None,
        }
    }
}
41
42pub struct PlainTextExtractor {
107 config: PlainTextConfig,
109 font_cache: HashMap<String, FontInfo>,
111 cmap_extractor: CMapTextExtractor<std::fs::File>,
113}
114
115impl Default for PlainTextExtractor {
116 fn default() -> Self {
117 Self::new()
118 }
119}
120
121impl PlainTextExtractor {
122 pub fn new() -> Self {
132 Self {
133 config: PlainTextConfig::default(),
134 font_cache: HashMap::new(),
135 cmap_extractor: CMapTextExtractor::new(),
136 }
137 }
138
139 pub fn with_config(config: PlainTextConfig) -> Self {
150 Self {
151 config,
152 font_cache: HashMap::new(),
153 cmap_extractor: CMapTextExtractor::new(),
154 }
155 }
156
157 pub fn extract<R: Read + Seek>(
186 &mut self,
187 document: &PdfDocument<R>,
188 page_index: u32,
189 ) -> ParseResult<PlainTextResult> {
190 let page = document.get_page(page_index)?;
192
193 self.extract_font_resources(&page, document)?;
195
196 let streams = page.content_streams_with_document(document)?;
198
199 let mut extracted_text = String::with_capacity(4096);
201 let mut state = TextState::default();
202 let mut in_text_object = false;
203 let mut last_x = 0.0;
204 let mut last_y = 0.0;
205
206 for stream_data in streams {
208 let operations = match ContentParser::parse_content(&stream_data) {
209 Ok(ops) => ops,
210 Err(e) => {
211 tracing::debug!("Warning: Failed to parse content stream, skipping: {}", e);
212 continue;
213 }
214 };
215
216 for op in operations {
217 match op {
218 ContentOperation::BeginText => {
219 in_text_object = true;
220 state.text_matrix = IDENTITY;
221 state.text_line_matrix = IDENTITY;
222 }
223
224 ContentOperation::EndText => {
225 in_text_object = false;
226 }
227
228 ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
229 state.text_matrix =
230 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
231 state.text_line_matrix =
232 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
233 }
234
235 ContentOperation::MoveText(tx, ty) => {
236 let new_matrix = multiply_matrix(
237 &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
238 &state.text_line_matrix,
239 );
240 state.text_matrix = new_matrix;
241 state.text_line_matrix = new_matrix;
242 }
243
244 ContentOperation::NextLine => {
245 let new_matrix = multiply_matrix(
246 &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
247 &state.text_line_matrix,
248 );
249 state.text_matrix = new_matrix;
250 state.text_line_matrix = new_matrix;
251 }
252
253 ContentOperation::ShowText(text) => {
254 if in_text_object {
255 let decoded = self.decode_text::<R>(&text, &state)?;
256
257 let (x, y) = transform_point(0.0, 0.0, &state.text_matrix);
259
260 if !extracted_text.is_empty() {
262 let dx = x - last_x;
263 let dy = (y - last_y).abs();
264
265 if dy > self.config.newline_threshold {
266 extracted_text.push('\n');
267 } else if dx > self.config.space_threshold * state.font_size {
268 extracted_text.push(' ');
269 }
270 }
271
272 extracted_text.push_str(&decoded);
273 last_x = x;
274 last_y = y;
275 }
276 }
277
278 ContentOperation::SetFont(name, size) => {
279 state.font_name = Some(name);
280 state.font_size = size as f64;
281 }
282
283 ContentOperation::SetLeading(leading) => {
284 state.leading = leading as f64;
285 }
286
287 _ => {
288 }
290 }
291 }
292 }
293
294 let processed_text = self.apply_line_break_mode(&extracted_text);
296
297 Ok(PlainTextResult::new(processed_text))
298 }
299
300 pub fn extract_lines<R: Read + Seek>(
324 &mut self,
325 document: &PdfDocument<R>,
326 page_index: u32,
327 ) -> ParseResult<Vec<String>> {
328 let result = self.extract(document, page_index)?;
329
330 Ok(result.text.lines().map(|line| line.to_string()).collect())
331 }
332
333 fn extract_font_resources<R: Read + Seek>(
335 &mut self,
336 page: &ParsedPage,
337 document: &PdfDocument<R>,
338 ) -> ParseResult<()> {
339 if let Some(resources) = page.get_resources() {
344 if let Some(PdfObject::Dictionary(font_dict)) = resources.get("Font") {
345 for (font_name, font_obj) in font_dict.0.iter() {
347 if let Some(font_ref) = font_obj.as_reference() {
348 if let Ok(PdfObject::Dictionary(font_dict)) =
349 document.get_object(font_ref.0, font_ref.1)
350 {
351 let mut cmap_extractor: CMapTextExtractor<R> = CMapTextExtractor::new();
353
354 if let Ok(font_info) =
355 cmap_extractor.extract_font_info(&font_dict, document)
356 {
357 self.font_cache.insert(font_name.0.clone(), font_info);
358 }
359 }
360 }
361 }
362 }
363 }
364
365 Ok(())
366 }
367
368 fn decode_text<R: Read + Seek>(
370 &self,
371 text_bytes: &[u8],
372 state: &TextState,
373 ) -> ParseResult<String> {
374 if let Some(ref font_name) = state.font_name {
376 if let Some(font_info) = self.font_cache.get(font_name) {
377 if let Ok(decoded) = self
378 .cmap_extractor
379 .decode_text_with_font(text_bytes, font_info)
380 {
381 return Ok(decoded);
382 }
383 }
384 }
385
386 let encoding = if let Some(ref font_name) = state.font_name {
388 let font_lower = font_name.as_bytes();
390 if font_lower
391 .iter()
392 .any(|&b| b.to_ascii_lowercase() == b'r' && font_name.contains("roman"))
393 {
394 TextEncoding::MacRomanEncoding
395 } else if font_name.contains("WinAnsi") || font_name.contains("winansi") {
396 TextEncoding::WinAnsiEncoding
397 } else if font_name.contains("Standard") || font_name.contains("standard") {
398 TextEncoding::StandardEncoding
399 } else if font_name.contains("PdfDoc") || font_name.contains("pdfdoc") {
400 TextEncoding::PdfDocEncoding
401 } else if font_name.starts_with("Times")
402 || font_name.starts_with("Helvetica")
403 || font_name.starts_with("Courier")
404 {
405 TextEncoding::WinAnsiEncoding
406 } else {
407 TextEncoding::PdfDocEncoding
408 }
409 } else {
410 TextEncoding::WinAnsiEncoding
411 };
412
413 Ok(encoding.decode(text_bytes))
414 }
415
416 fn apply_line_break_mode(&self, text: &str) -> String {
418 match self.config.line_break_mode {
419 LineBreakMode::Auto => self.auto_line_breaks(text),
420 LineBreakMode::PreserveAll => text.to_string(),
421 LineBreakMode::Normalize => self.normalize_line_breaks(text),
422 }
423 }
424
425 fn auto_line_breaks(&self, text: &str) -> String {
427 let lines: Vec<&str> = text.lines().collect();
428 let mut result = String::with_capacity(text.len());
429
430 for (i, line) in lines.iter().enumerate() {
431 let trimmed = line.trim_end();
432
433 if trimmed.is_empty() {
434 result.push('\n');
435 continue;
436 }
437
438 result.push_str(line);
439
440 if i < lines.len() - 1 {
441 let next_line = lines[i + 1].trim_start();
442
443 let ends_with_punct = trimmed.ends_with('.')
444 || trimmed.ends_with('!')
445 || trimmed.ends_with('?')
446 || trimmed.ends_with(':');
447
448 let next_is_empty = next_line.is_empty();
449
450 if ends_with_punct || next_is_empty {
451 result.push('\n');
452 } else {
453 result.push(' ');
454 }
455 }
456 }
457
458 result
459 }
460
461 fn normalize_line_breaks(&self, text: &str) -> String {
463 let lines: Vec<&str> = text.lines().collect();
464 let mut result = String::with_capacity(text.len());
465
466 for (i, line) in lines.iter().enumerate() {
467 let trimmed = line.trim_end();
468
469 if trimmed.is_empty() {
470 result.push('\n');
471 continue;
472 }
473
474 if trimmed.ends_with('-') && i < lines.len() - 1 {
475 let next_line = lines[i + 1].trim_start();
476 if !next_line.is_empty() {
477 result.push_str(&trimmed[..trimmed.len() - 1]);
478 continue;
479 }
480 }
481
482 result.push_str(line);
483
484 if i < lines.len() - 1 {
485 result.push('\n');
486 }
487 }
488
489 result
490 }
491
492 pub fn config(&self) -> &PlainTextConfig {
504 &self.config
505 }
506}
507
/// Returns `true` when every component of `matrix` compares equal to the
/// corresponding component of `[1, 0, 0, 1, 0, 0]`.
#[inline]
fn is_identity(matrix: &[f64; 6]) -> bool {
    *matrix == [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
}

/// Multiplies two affine matrices given in `[a, b, c, d, e, f]` form and
/// returns `m1 * m2`.
///
/// If either operand is the identity, the other operand is returned as-is
/// without doing any arithmetic.
#[inline]
fn multiply_matrix(m1: &[f64; 6], m2: &[f64; 6]) -> [f64; 6] {
    if is_identity(m1) {
        return *m2;
    }
    if is_identity(m2) {
        return *m1;
    }

    let [a1, b1, c1, d1, e1, f1] = *m1;
    let [a2, b2, c2, d2, e2, f2] = *m2;

    [
        a1 * a2 + b1 * c2,
        a1 * b2 + b1 * d2,
        c1 * a2 + d1 * c2,
        c1 * b2 + d1 * d2,
        e1 * a2 + f1 * c2 + e2,
        e1 * b2 + f1 * d2 + f2,
    ]
}
541
/// Applies the affine `matrix` (`[a, b, c, d, e, f]`) to the point `(x, y)`
/// and returns `(a*x + c*y + e, b*x + d*y + f)`.
#[inline]
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
549
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an extractor whose only non-default setting is the line-break mode.
    fn extractor_with_mode(mode: LineBreakMode) -> PlainTextExtractor {
        PlainTextExtractor::with_config(PlainTextConfig {
            line_break_mode: mode,
            ..Default::default()
        })
    }

    // --- construction -----------------------------------------------------

    #[test]
    fn test_new() {
        assert_eq!(PlainTextExtractor::new().config.space_threshold, 0.3);
    }

    #[test]
    fn test_with_config() {
        let dense = PlainTextConfig::dense();
        let extractor = PlainTextExtractor::with_config(dense.clone());
        assert_eq!(extractor.config, dense);
    }

    #[test]
    fn test_default() {
        let expected = PlainTextConfig::default();
        assert_eq!(PlainTextExtractor::default().config, expected);
    }

    // --- normalize mode ---------------------------------------------------

    #[test]
    fn test_normalize_line_breaks_hyphenated() {
        let input = "This is a docu-\nment with hyphen-\nated words.";
        let output = PlainTextExtractor::new().normalize_line_breaks(input);
        assert_eq!(output, "This is a document with hyphenated words.");
    }

    #[test]
    fn test_normalize_line_breaks_no_hyphen() {
        let input = "This is a normal\ntext without\nhyphens.";
        let output = PlainTextExtractor::new().normalize_line_breaks(input);
        assert_eq!(output, "This is a normal\ntext without\nhyphens.");
    }

    // --- auto mode --------------------------------------------------------

    #[test]
    fn test_auto_line_breaks_punctuation() {
        let input = "First sentence.\nSecond sentence.\nThird sentence.";
        let output = PlainTextExtractor::new().auto_line_breaks(input);
        assert_eq!(
            output,
            "First sentence.\nSecond sentence.\nThird sentence."
        );
    }

    #[test]
    fn test_auto_line_breaks_wrapped() {
        let input = "This is a long line that\nwas wrapped in the PDF\nfor layout purposes";
        let output = PlainTextExtractor::new().auto_line_breaks(input);
        for expected in ["long line that was", "wrapped in the PDF for"].iter() {
            assert!(output.contains(expected));
        }
    }

    #[test]
    fn test_auto_line_breaks_empty_lines() {
        let input = "Paragraph one.\n\nParagraph two.\n\nParagraph three.";
        let output = PlainTextExtractor::new().auto_line_breaks(input);
        assert!(output.contains("\n\n"));
    }

    // --- mode dispatch ----------------------------------------------------

    #[test]
    fn test_apply_line_break_mode_preserve_all() {
        let input = "Line 1\nLine 2\nLine 3";
        let output = extractor_with_mode(LineBreakMode::PreserveAll).apply_line_break_mode(input);
        assert_eq!(output, input);
    }

    #[test]
    fn test_apply_line_break_mode_normalize() {
        let output = extractor_with_mode(LineBreakMode::Normalize).apply_line_break_mode("docu-\nment");
        assert_eq!(output, "document");
    }

    #[test]
    fn test_apply_line_break_mode_auto() {
        let input = "First sentence.\nSecond part";
        let output = extractor_with_mode(LineBreakMode::Auto).apply_line_break_mode(input);
        assert!(output.contains("First sentence.\nSecond"));
    }

    #[test]
    fn test_config_getter() {
        let loose = PlainTextConfig::loose();
        let extractor = PlainTextExtractor::with_config(loose.clone());
        assert_eq!(extractor.config(), &loose);
    }

    // --- matrix helpers ---------------------------------------------------

    #[test]
    fn test_multiply_matrix() {
        let first = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
        let second = [1.0, 0.0, 0.0, 1.0, 5.0, 15.0];
        assert_eq!(
            multiply_matrix(&first, &second),
            [1.0, 0.0, 0.0, 1.0, 15.0, 35.0]
        );
    }

    #[test]
    fn test_transform_point() {
        let translate = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
        let (x, y) = transform_point(5.0, 10.0, &translate);
        assert_eq!(x, 15.0);
        assert_eq!(y, 30.0);
    }
}