oxidize_pdf/text/
mod.rs

1pub mod cmap;
2mod encoding;
3mod extraction;
4mod extraction_cmap;
5mod flow;
6mod font;
7pub mod font_manager;
8pub mod fonts;
9mod header_footer;
10mod layout;
11mod list;
12pub mod metrics;
13pub mod ocr;
14pub mod table;
15pub mod validation;
16
17#[cfg(test)]
18mod cmap_tests;
19
20#[cfg(feature = "ocr-tesseract")]
21pub mod tesseract_provider;
22
23pub use encoding::TextEncoding;
24pub use extraction::{ExtractedText, ExtractionOptions, TextExtractor, TextFragment};
25pub use flow::{TextAlign, TextFlowContext};
26pub use font::{Font, FontEncoding, FontFamily, FontWithEncoding};
27pub use font_manager::{CustomFont, FontDescriptor, FontFlags, FontManager, FontMetrics, FontType};
28pub use header_footer::{HeaderFooter, HeaderFooterOptions, HeaderFooterPosition};
29pub use layout::{ColumnContent, ColumnLayout, ColumnOptions, TextFormat};
30pub use list::{
31    BulletStyle, ListElement, ListItem, ListOptions, ListStyle as ListStyleEnum, OrderedList,
32    OrderedListStyle, UnorderedList,
33};
34pub use metrics::{measure_char, measure_text, split_into_words};
35pub use ocr::{
36    CharacterConfidence, CorrectionCandidate, CorrectionReason, CorrectionSuggestion,
37    CorrectionType, FragmentType, ImagePreprocessing, MockOcrProvider, OcrEngine, OcrError,
38    OcrOptions, OcrPostProcessor, OcrProcessingResult, OcrProvider, OcrRegion, OcrResult,
39    OcrTextFragment, WordConfidence,
40};
41pub use table::{HeaderStyle, Table, TableCell, TableOptions};
42pub use validation::{MatchType, TextMatch, TextValidationResult, TextValidator};
43
44#[cfg(feature = "ocr-tesseract")]
45pub use tesseract_provider::{RustyTesseractConfig, RustyTesseractProvider};
46
47use crate::error::Result;
48use crate::Color;
49use std::fmt::Write;
50
/// How glyphs are painted when text is shown.
///
/// Each variant carries an explicit discriminant (0 through 7); casting a variant with
/// `as u8` yields that number.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TextRenderingMode {
    /// Fill the glyphs (the default mode).
    Fill = 0,
    /// Stroke the glyph outlines.
    Stroke = 1,
    /// Fill the glyphs, then stroke their outlines.
    FillStroke = 2,
    /// Paint nothing (useful for searchable text laid over images).
    Invisible = 3,
    /// Fill the glyphs and add them to the clipping path.
    FillClip = 4,
    /// Stroke the glyph outlines and add them to the clipping path.
    StrokeClip = 5,
    /// Fill and stroke the glyphs and add them to the clipping path.
    FillStrokeClip = 6,
    /// Add the glyphs to the clipping path without painting them.
    Clip = 7,
}
71
/// Accumulates PDF text-showing operators together with the state used to emit them.
///
/// Setters only record values; the operators themselves are appended to `operations`
/// when `write` is called.
#[derive(Clone)]
pub struct TextContext {
    /// Operator text accumulated so far.
    operations: String,
    /// Font named in the `Tf` operator of the next `write`.
    current_font: Font,
    /// Font size passed to `Tf`; also used by `write_line` to compute the line advance.
    font_size: f64,
    /// Six-element text matrix. The code in this file only ever updates elements 4 and 5,
    /// which it treats as the current x and y position.
    text_matrix: [f64; 6],
    /// Position recorded by `at`; consumed (reset to `None`) by the next `write`.
    pending_position: Option<(f64, f64)>,
    /// Character spacing emitted with `Tc` when set.
    character_spacing: Option<f64>,
    /// Word spacing emitted with `Tw` when set.
    word_spacing: Option<f64>,
    /// Horizontal scaling factor; multiplied by 100 when emitted with `Tz`.
    horizontal_scaling: Option<f64>,
    /// Leading emitted with `TL` when set.
    leading: Option<f64>,
    /// Text rise emitted with `Ts` when set.
    text_rise: Option<f64>,
    /// Rendering mode emitted with `Tr` (as its numeric discriminant) when set.
    rendering_mode: Option<TextRenderingMode>,
    /// Fill color emitted with `rg`, `g` or `k` depending on the color variant.
    fill_color: Option<Color>,
    /// Stroke color emitted with `RG`, `G` or `K` depending on the color variant.
    stroke_color: Option<Color>,
}
91
92impl Default for TextContext {
93    fn default() -> Self {
94        Self::new()
95    }
96}
97
98impl TextContext {
99    pub fn new() -> Self {
100        Self {
101            operations: String::new(),
102            current_font: Font::Helvetica,
103            font_size: 12.0,
104            text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
105            pending_position: None,
106            character_spacing: None,
107            word_spacing: None,
108            horizontal_scaling: None,
109            leading: None,
110            text_rise: None,
111            rendering_mode: None,
112            fill_color: None,
113            stroke_color: None,
114        }
115    }
116
117    pub fn set_font(&mut self, font: Font, size: f64) -> &mut Self {
118        self.current_font = font;
119        self.font_size = size;
120        self
121    }
122
123    /// Get the current font
124    #[allow(dead_code)]
125    pub(crate) fn current_font(&self) -> &Font {
126        &self.current_font
127    }
128
129    pub fn at(&mut self, x: f64, y: f64) -> &mut Self {
130        // Update text_matrix immediately and store for write() operation
131        self.text_matrix[4] = x;
132        self.text_matrix[5] = y;
133        self.pending_position = Some((x, y));
134        self
135    }
136
137    pub fn write(&mut self, text: &str) -> Result<&mut Self> {
138        // Begin text object
139        self.operations.push_str("BT\n");
140
141        // Set font
142        writeln!(
143            &mut self.operations,
144            "/{} {} Tf",
145            self.current_font.pdf_name(),
146            self.font_size
147        )
148        .expect("Writing to String should never fail");
149
150        // Apply text state parameters
151        self.apply_text_state_parameters();
152
153        // Set text position using pending_position if available, otherwise use text_matrix
154        let (x, y) = if let Some((px, py)) = self.pending_position.take() {
155            // Use and consume the pending position
156            (px, py)
157        } else {
158            // Fallback to text_matrix values
159            (self.text_matrix[4], self.text_matrix[5])
160        };
161
162        writeln!(&mut self.operations, "{:.2} {:.2} Td", x, y)
163            .expect("Writing to String should never fail");
164
165        // Encode text using WinAnsiEncoding
166        let encoding = TextEncoding::WinAnsiEncoding;
167        let encoded_bytes = encoding.encode(text);
168
169        // Show text as a literal string
170        self.operations.push('(');
171        for &byte in &encoded_bytes {
172            match byte {
173                b'(' => self.operations.push_str("\\("),
174                b')' => self.operations.push_str("\\)"),
175                b'\\' => self.operations.push_str("\\\\"),
176                b'\n' => self.operations.push_str("\\n"),
177                b'\r' => self.operations.push_str("\\r"),
178                b'\t' => self.operations.push_str("\\t"),
179                // For bytes in the printable ASCII range, write as is
180                0x20..=0x7E => self.operations.push(byte as char),
181                // For other bytes, write as octal escape sequences
182                _ => write!(&mut self.operations, "\\{byte:03o}")
183                    .expect("Writing to String should never fail"),
184            }
185        }
186        self.operations.push_str(") Tj\n");
187
188        // End text object
189        self.operations.push_str("ET\n");
190
191        Ok(self)
192    }
193
194    pub fn write_line(&mut self, text: &str) -> Result<&mut Self> {
195        self.write(text)?;
196        self.text_matrix[5] -= self.font_size * 1.2; // Move down for next line
197        Ok(self)
198    }
199
200    pub fn set_character_spacing(&mut self, spacing: f64) -> &mut Self {
201        self.character_spacing = Some(spacing);
202        self
203    }
204
205    pub fn set_word_spacing(&mut self, spacing: f64) -> &mut Self {
206        self.word_spacing = Some(spacing);
207        self
208    }
209
210    pub fn set_horizontal_scaling(&mut self, scale: f64) -> &mut Self {
211        self.horizontal_scaling = Some(scale);
212        self
213    }
214
215    pub fn set_leading(&mut self, leading: f64) -> &mut Self {
216        self.leading = Some(leading);
217        self
218    }
219
220    pub fn set_text_rise(&mut self, rise: f64) -> &mut Self {
221        self.text_rise = Some(rise);
222        self
223    }
224
225    /// Set the text rendering mode
226    pub fn set_rendering_mode(&mut self, mode: TextRenderingMode) -> &mut Self {
227        self.rendering_mode = Some(mode);
228        self
229    }
230
231    /// Set the text fill color
232    pub fn set_fill_color(&mut self, color: Color) -> &mut Self {
233        self.fill_color = Some(color);
234        self
235    }
236
237    /// Set the text stroke color
238    pub fn set_stroke_color(&mut self, color: Color) -> &mut Self {
239        self.stroke_color = Some(color);
240        self
241    }
242
243    /// Apply text state parameters to the operations string
244    fn apply_text_state_parameters(&mut self) {
245        // Character spacing (Tc)
246        if let Some(spacing) = self.character_spacing {
247            writeln!(&mut self.operations, "{spacing:.2} Tc")
248                .expect("Writing to String should never fail");
249        }
250
251        // Word spacing (Tw)
252        if let Some(spacing) = self.word_spacing {
253            writeln!(&mut self.operations, "{spacing:.2} Tw")
254                .expect("Writing to String should never fail");
255        }
256
257        // Horizontal scaling (Tz)
258        if let Some(scale) = self.horizontal_scaling {
259            writeln!(&mut self.operations, "{:.2} Tz", scale * 100.0)
260                .expect("Writing to String should never fail");
261        }
262
263        // Leading (TL)
264        if let Some(leading) = self.leading {
265            writeln!(&mut self.operations, "{leading:.2} TL")
266                .expect("Writing to String should never fail");
267        }
268
269        // Text rise (Ts)
270        if let Some(rise) = self.text_rise {
271            writeln!(&mut self.operations, "{rise:.2} Ts")
272                .expect("Writing to String should never fail");
273        }
274
275        // Text rendering mode (Tr)
276        if let Some(mode) = self.rendering_mode {
277            writeln!(&mut self.operations, "{} Tr", mode as u8)
278                .expect("Writing to String should never fail");
279        }
280
281        // Fill color
282        if let Some(color) = self.fill_color {
283            match color {
284                Color::Rgb(r, g, b) => {
285                    writeln!(&mut self.operations, "{r:.3} {g:.3} {b:.3} rg")
286                        .expect("Writing to String should never fail");
287                }
288                Color::Gray(gray) => {
289                    writeln!(&mut self.operations, "{gray:.3} g")
290                        .expect("Writing to String should never fail");
291                }
292                Color::Cmyk(c, m, y, k) => {
293                    writeln!(&mut self.operations, "{c:.3} {m:.3} {y:.3} {k:.3} k")
294                        .expect("Writing to String should never fail");
295                }
296            }
297        }
298
299        // Stroke color
300        if let Some(color) = self.stroke_color {
301            match color {
302                Color::Rgb(r, g, b) => {
303                    writeln!(&mut self.operations, "{r:.3} {g:.3} {b:.3} RG")
304                        .expect("Writing to String should never fail");
305                }
306                Color::Gray(gray) => {
307                    writeln!(&mut self.operations, "{gray:.3} G")
308                        .expect("Writing to String should never fail");
309                }
310                Color::Cmyk(c, m, y, k) => {
311                    writeln!(&mut self.operations, "{c:.3} {m:.3} {y:.3} {k:.3} K")
312                        .expect("Writing to String should never fail");
313                }
314            }
315        }
316    }
317
318    pub(crate) fn generate_operations(&self) -> Result<Vec<u8>> {
319        Ok(self.operations.as_bytes().to_vec())
320    }
321
322    /// Get the current font size
323    pub fn font_size(&self) -> f64 {
324        self.font_size
325    }
326
327    /// Get the current text matrix
328    pub fn text_matrix(&self) -> [f64; 6] {
329        self.text_matrix
330    }
331
332    /// Get the current position
333    pub fn position(&self) -> (f64, f64) {
334        (self.text_matrix[4], self.text_matrix[5])
335    }
336
337    /// Clear all operations and reset text state parameters
338    pub fn clear(&mut self) {
339        self.operations.clear();
340        self.character_spacing = None;
341        self.word_spacing = None;
342        self.horizontal_scaling = None;
343        self.leading = None;
344        self.text_rise = None;
345        self.rendering_mode = None;
346        self.fill_color = None;
347        self.stroke_color = None;
348    }
349
350    /// Get the raw operations string
351    pub fn operations(&self) -> &str {
352        &self.operations
353    }
354
355    /// Generate text state operations for testing purposes
356    #[cfg(test)]
357    pub fn generate_text_state_operations(&self) -> String {
358        let mut ops = String::new();
359
360        // Character spacing (Tc)
361        if let Some(spacing) = self.character_spacing {
362            writeln!(&mut ops, "{spacing:.2} Tc").unwrap();
363        }
364
365        // Word spacing (Tw)
366        if let Some(spacing) = self.word_spacing {
367            writeln!(&mut ops, "{spacing:.2} Tw").unwrap();
368        }
369
370        // Horizontal scaling (Tz)
371        if let Some(scale) = self.horizontal_scaling {
372            writeln!(&mut ops, "{:.2} Tz", scale * 100.0).unwrap();
373        }
374
375        // Leading (TL)
376        if let Some(leading) = self.leading {
377            writeln!(&mut ops, "{leading:.2} TL").unwrap();
378        }
379
380        // Text rise (Ts)
381        if let Some(rise) = self.text_rise {
382            writeln!(&mut ops, "{rise:.2} Ts").unwrap();
383        }
384
385        // Text rendering mode (Tr)
386        if let Some(mode) = self.rendering_mode {
387            writeln!(&mut ops, "{} Tr", mode as u8).unwrap();
388        }
389
390        ops
391    }
392}
393
#[cfg(test)]
mod tests {
    use super::*;

    /// Configures a fresh context via `configure` and returns its text-state operators.
    fn text_state_ops(configure: impl FnOnce(&mut TextContext)) -> String {
        let mut context = TextContext::new();
        configure(&mut context);
        context.generate_text_state_operations()
    }

    /// Text-state operators produced when only the rendering mode is set.
    fn rendering_mode_ops(mode: TextRenderingMode) -> String {
        text_state_ops(|context| {
            context.set_rendering_mode(mode);
        })
    }

    /// Applies the pending state parameters and returns a snapshot of all operations.
    fn applied_ops(context: &mut TextContext) -> String {
        context.apply_text_state_parameters();
        context.operations().to_owned()
    }

    #[test]
    fn test_text_context_new() {
        let TextContext {
            operations,
            current_font,
            font_size,
            text_matrix,
            ..
        } = TextContext::new();

        assert_eq!(current_font, Font::Helvetica);
        assert_eq!(font_size, 12.0);
        assert_eq!(text_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
        assert!(operations.is_empty());
    }

    #[test]
    fn test_text_context_default() {
        let context: TextContext = Default::default();
        assert_eq!(context.current_font, Font::Helvetica);
        assert_eq!(context.font_size, 12.0);
    }

    #[test]
    fn test_set_font() {
        let mut context = TextContext::new();
        context.set_font(Font::TimesBold, 14.0);

        assert_eq!(context.current_font, Font::TimesBold);
        assert_eq!(context.font_size, 14.0);
    }

    #[test]
    fn test_position() {
        let mut context = TextContext::new();
        context.at(100.0, 200.0);

        let (x, y) = context.position();
        assert_eq!(x, 100.0);
        assert_eq!(y, 200.0);
        assert_eq!(context.text_matrix[4], 100.0);
        assert_eq!(context.text_matrix[5], 200.0);
    }

    #[test]
    fn test_write_simple_text() {
        let mut context = TextContext::new();
        context.write("Hello").unwrap();

        let ops = context.operations();
        assert!(ops.contains("BT\n"));
        assert!(ops.contains("ET\n"));
        assert!(ops.contains("/Helvetica 12 Tf"));
        assert!(ops.contains("(Hello) Tj"));
    }

    #[test]
    fn test_write_text_with_escaping() {
        let mut context = TextContext::new();
        context.write("(Hello)").unwrap();

        assert!(context.operations().contains("(\\(Hello\\)) Tj"));
    }

    #[test]
    fn test_write_line() {
        let mut context = TextContext::new();
        let initial_y = context.text_matrix[5];

        context.write_line("Line 1").unwrap();
        let new_y = context.text_matrix[5];

        // The y position drops by font_size * 1.2.
        assert!(new_y < initial_y);
        assert_eq!(new_y, initial_y - 12.0 * 1.2);
    }

    #[test]
    fn test_character_spacing() {
        let ops = text_state_ops(|context| {
            context.set_character_spacing(2.5);
        });
        assert!(ops.contains("2.50 Tc"));
    }

    #[test]
    fn test_word_spacing() {
        let ops = text_state_ops(|context| {
            context.set_word_spacing(1.5);
        });
        assert!(ops.contains("1.50 Tw"));
    }

    #[test]
    fn test_horizontal_scaling() {
        let ops = text_state_ops(|context| {
            context.set_horizontal_scaling(1.25);
        });
        // 1.25 is emitted as a percentage.
        assert!(ops.contains("125.00 Tz"));
    }

    #[test]
    fn test_leading() {
        let ops = text_state_ops(|context| {
            context.set_leading(15.0);
        });
        assert!(ops.contains("15.00 TL"));
    }

    #[test]
    fn test_text_rise() {
        let ops = text_state_ops(|context| {
            context.set_text_rise(3.0);
        });
        assert!(ops.contains("3.00 Ts"));
    }

    #[test]
    fn test_clear() {
        let mut context = TextContext::new();

        context.write("Hello").unwrap();
        assert!(!context.operations().is_empty());

        context.clear();
        assert!(context.operations().is_empty());
    }

    #[test]
    fn test_generate_operations() {
        let mut context = TextContext::new();
        context.write("Test").unwrap();

        let bytes = context.generate_operations().unwrap();
        let as_text = String::from_utf8(bytes).unwrap();
        assert_eq!(as_text, context.operations());
    }

    #[test]
    fn test_method_chaining() {
        let mut context = TextContext::new();
        context
            .set_font(Font::Courier, 10.0)
            .at(50.0, 100.0)
            .set_character_spacing(1.0)
            .set_word_spacing(2.0);

        assert_eq!(context.current_font(), &Font::Courier);
        assert_eq!(context.font_size(), 10.0);

        let (x, y) = context.position();
        assert_eq!(x, 50.0);
        assert_eq!(y, 100.0);
    }

    #[test]
    fn test_text_matrix_access() {
        let mut context = TextContext::new();
        context.at(25.0, 75.0);

        assert_eq!(context.text_matrix(), [1.0, 0.0, 0.0, 1.0, 25.0, 75.0]);
    }

    #[test]
    fn test_special_characters_encoding() {
        let mut context = TextContext::new();
        context.write("Test\nLine\tTab").unwrap();

        let ops = context.operations();
        assert!(ops.contains("\\n"));
        assert!(ops.contains("\\t"));
    }

    #[test]
    fn test_rendering_mode_fill() {
        assert!(rendering_mode_ops(TextRenderingMode::Fill).contains("0 Tr"));
    }

    #[test]
    fn test_rendering_mode_stroke() {
        assert!(rendering_mode_ops(TextRenderingMode::Stroke).contains("1 Tr"));
    }

    #[test]
    fn test_rendering_mode_fill_stroke() {
        assert!(rendering_mode_ops(TextRenderingMode::FillStroke).contains("2 Tr"));
    }

    #[test]
    fn test_rendering_mode_invisible() {
        assert!(rendering_mode_ops(TextRenderingMode::Invisible).contains("3 Tr"));
    }

    #[test]
    fn test_rendering_mode_fill_clip() {
        assert!(rendering_mode_ops(TextRenderingMode::FillClip).contains("4 Tr"));
    }

    #[test]
    fn test_rendering_mode_stroke_clip() {
        assert!(rendering_mode_ops(TextRenderingMode::StrokeClip).contains("5 Tr"));
    }

    #[test]
    fn test_rendering_mode_fill_stroke_clip() {
        assert!(rendering_mode_ops(TextRenderingMode::FillStrokeClip).contains("6 Tr"));
    }

    #[test]
    fn test_rendering_mode_clip() {
        assert!(rendering_mode_ops(TextRenderingMode::Clip).contains("7 Tr"));
    }

    #[test]
    fn test_text_state_parameters_chaining() {
        let ops = text_state_ops(|context| {
            context
                .set_character_spacing(1.5)
                .set_word_spacing(2.0)
                .set_horizontal_scaling(1.1)
                .set_leading(14.0)
                .set_text_rise(0.5)
                .set_rendering_mode(TextRenderingMode::FillStroke);
        });

        assert!(ops.contains("1.50 Tc"));
        assert!(ops.contains("2.00 Tw"));
        assert!(ops.contains("110.00 Tz"));
        assert!(ops.contains("14.00 TL"));
        assert!(ops.contains("0.50 Ts"));
        assert!(ops.contains("2 Tr"));
    }

    #[test]
    fn test_all_text_state_operators_generated() {
        // Set every text-state parameter, one call at a time.
        let ops = text_state_ops(|context| {
            context.set_character_spacing(1.0); // Tc
            context.set_word_spacing(2.0); // Tw
            context.set_horizontal_scaling(1.2); // Tz
            context.set_leading(15.0); // TL
            context.set_text_rise(1.0); // Ts
            context.set_rendering_mode(TextRenderingMode::Stroke); // Tr
        });

        // Every PDF text-state operator must appear.
        assert!(
            ops.contains("Tc"),
            "Character spacing operator (Tc) not found"
        );
        assert!(ops.contains("Tw"), "Word spacing operator (Tw) not found");
        assert!(
            ops.contains("Tz"),
            "Horizontal scaling operator (Tz) not found"
        );
        assert!(ops.contains("TL"), "Leading operator (TL) not found");
        assert!(ops.contains("Ts"), "Text rise operator (Ts) not found");
        assert!(
            ops.contains("Tr"),
            "Text rendering mode operator (Tr) not found"
        );
    }

    #[test]
    fn test_text_color_operations() {
        use crate::Color;

        let mut context = TextContext::new();

        // RGB fill color
        context.set_fill_color(Color::rgb(1.0, 0.0, 0.0));
        let ops = applied_ops(&mut context);
        assert!(
            ops.contains("1.000 0.000 0.000 rg"),
            "RGB fill color operator (rg) not found in: {ops}"
        );

        // RGB stroke color, after a reset
        context.clear();
        context.set_stroke_color(Color::rgb(0.0, 1.0, 0.0));
        let ops = applied_ops(&mut context);
        assert!(
            ops.contains("0.000 1.000 0.000 RG"),
            "RGB stroke color operator (RG) not found in: {ops}"
        );

        // Grayscale fill color, after a reset
        context.clear();
        context.set_fill_color(Color::gray(0.5));
        let ops = applied_ops(&mut context);
        assert!(
            ops.contains("0.500 g"),
            "Gray fill color operator (g) not found in: {ops}"
        );

        // CMYK stroke color, after a reset
        context.clear();
        context.set_stroke_color(Color::cmyk(0.2, 0.3, 0.4, 0.1));
        let ops = applied_ops(&mut context);
        assert!(
            ops.contains("0.200 0.300 0.400 0.100 K"),
            "CMYK stroke color operator (K) not found in: {ops}"
        );

        // Fill and stroke colors set together
        context.clear();
        context.set_fill_color(Color::rgb(1.0, 0.0, 0.0));
        context.set_stroke_color(Color::rgb(0.0, 0.0, 1.0));
        let ops = applied_ops(&mut context);
        assert!(
            ops.contains("1.000 0.000 0.000 rg") && ops.contains("0.000 0.000 1.000 RG"),
            "Both fill and stroke colors not found in: {ops}"
        );
    }
}
751}