oxidize_pdf/text/
mod.rs

1pub mod cmap;
2mod encoding;
3mod extraction;
4mod extraction_cmap;
5mod flow;
6mod font;
7pub mod fonts;
8mod header_footer;
9mod layout;
10mod list;
11mod metrics;
12pub mod ocr;
13mod table;
14mod table_advanced;
15
16#[cfg(test)]
17mod cmap_tests;
18
19#[cfg(feature = "ocr-tesseract")]
20pub mod tesseract_provider;
21
22pub use encoding::TextEncoding;
23pub use extraction::{ExtractedText, ExtractionOptions, TextExtractor, TextFragment};
24pub use flow::{TextAlign, TextFlowContext};
25pub use font::{Font, FontEncoding, FontFamily, FontWithEncoding};
26pub use header_footer::{HeaderFooter, HeaderFooterOptions, HeaderFooterPosition};
27pub use layout::{ColumnContent, ColumnLayout, ColumnOptions, TextFormat};
28pub use list::{
29    BulletStyle, ListElement, ListItem, ListOptions, ListStyle as ListStyleEnum, OrderedList,
30    OrderedListStyle, UnorderedList,
31};
32pub use metrics::{measure_char, measure_text, split_into_words};
33pub use ocr::{
34    FragmentType, ImagePreprocessing, MockOcrProvider, OcrEngine, OcrError, OcrOptions,
35    OcrProcessingResult, OcrProvider, OcrResult, OcrTextFragment,
36};
37pub use table::{HeaderStyle, Table, TableCell, TableOptions};
38pub use table_advanced::{
39    AdvancedTable, AdvancedTableCell, AdvancedTableOptions, AlternatingRowColors, BorderLine,
40    BorderStyle, CellContent, CellPadding, ColumnDefinition, ColumnWidth, LineStyle, TableRow,
41    VerticalAlign,
42};
43
44use crate::error::Result;
45use std::fmt::Write;
46
/// Text rendering mode for PDF text operations.
///
/// Each variant carries an explicit discriminant in the range 0..=7. When a
/// mode is set on a `TextContext`, that discriminant is written (via
/// `mode as u8`) as the operand of the `Tr` operator, so the numeric values
/// are part of the generated output and must not be reordered.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TextRenderingMode {
    /// Fill text (default)
    Fill = 0,
    /// Stroke text
    Stroke = 1,
    /// Fill and stroke text
    FillStroke = 2,
    /// Invisible text (for searchable text over images)
    Invisible = 3,
    /// Fill text and add to path for clipping
    FillClip = 4,
    /// Stroke text and add to path for clipping
    StrokeClip = 5,
    /// Fill and stroke text and add to path for clipping
    FillStrokeClip = 6,
    /// Add text to path for clipping (invisible)
    Clip = 7,
}
67
/// Builder that accumulates PDF text operators as a string.
///
/// The context remembers a font, a font size, a position and a set of
/// optional text state parameters. Each call to `write` appends one complete
/// `BT` … `ET` text object to `operations` using the values current at that
/// moment.
#[derive(Clone)]
pub struct TextContext {
    /// Operator text appended so far; cleared by `clear`.
    operations: String,
    /// Font whose `pdf_name()` is emitted with the `Tf` operator.
    current_font: Font,
    /// Font size emitted with `Tf`; also scales the line advance in `write_line`.
    font_size: f64,
    /// Six-element text matrix. In this file only elements 4 (x) and 5 (y)
    /// are ever modified or emitted; the first four keep their initial values.
    text_matrix: [f64; 6],
    // Text state parameters
    // Each one is `None` until its setter is called; `None` means the
    // corresponding operator is not emitted at all.
    /// Operand for `Tc`.
    character_spacing: Option<f64>,
    /// Operand for `Tw`.
    word_spacing: Option<f64>,
    /// Scale factor; multiplied by 100 before being emitted with `Tz`.
    horizontal_scaling: Option<f64>,
    /// Operand for `TL`.
    leading: Option<f64>,
    /// Operand for `Ts`.
    text_rise: Option<f64>,
    /// Mode whose discriminant is emitted with `Tr`.
    rendering_mode: Option<TextRenderingMode>,
}
82
83impl Default for TextContext {
84    fn default() -> Self {
85        Self::new()
86    }
87}
88
89impl TextContext {
90    pub fn new() -> Self {
91        Self {
92            operations: String::new(),
93            current_font: Font::Helvetica,
94            font_size: 12.0,
95            text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
96            character_spacing: None,
97            word_spacing: None,
98            horizontal_scaling: None,
99            leading: None,
100            text_rise: None,
101            rendering_mode: None,
102        }
103    }
104
105    pub fn set_font(&mut self, font: Font, size: f64) -> &mut Self {
106        self.current_font = font;
107        self.font_size = size;
108        self
109    }
110
111    /// Get the current font
112    pub(crate) fn current_font(&self) -> &Font {
113        &self.current_font
114    }
115
116    pub fn at(&mut self, x: f64, y: f64) -> &mut Self {
117        self.text_matrix[4] = x;
118        self.text_matrix[5] = y;
119        self
120    }
121
122    pub fn write(&mut self, text: &str) -> Result<&mut Self> {
123        // Begin text object
124        self.operations.push_str("BT\n");
125
126        // Set font
127        writeln!(
128            &mut self.operations,
129            "/{} {} Tf",
130            self.current_font.pdf_name(),
131            self.font_size
132        )
133        .unwrap();
134
135        // Apply text state parameters
136        self.apply_text_state_parameters();
137
138        // Set text position
139        writeln!(
140            &mut self.operations,
141            "{:.2} {:.2} Td",
142            self.text_matrix[4], self.text_matrix[5]
143        )
144        .unwrap();
145
146        // Encode text using WinAnsiEncoding
147        let encoding = TextEncoding::WinAnsiEncoding;
148        let encoded_bytes = encoding.encode(text);
149
150        // Show text as a literal string
151        self.operations.push('(');
152        for &byte in &encoded_bytes {
153            match byte {
154                b'(' => self.operations.push_str("\\("),
155                b')' => self.operations.push_str("\\)"),
156                b'\\' => self.operations.push_str("\\\\"),
157                b'\n' => self.operations.push_str("\\n"),
158                b'\r' => self.operations.push_str("\\r"),
159                b'\t' => self.operations.push_str("\\t"),
160                // For bytes in the printable ASCII range, write as is
161                0x20..=0x7E => self.operations.push(byte as char),
162                // For other bytes, write as octal escape sequences
163                _ => write!(&mut self.operations, "\\{byte:03o}").unwrap(),
164            }
165        }
166        self.operations.push_str(") Tj\n");
167
168        // End text object
169        self.operations.push_str("ET\n");
170
171        Ok(self)
172    }
173
174    pub fn write_line(&mut self, text: &str) -> Result<&mut Self> {
175        self.write(text)?;
176        self.text_matrix[5] -= self.font_size * 1.2; // Move down for next line
177        Ok(self)
178    }
179
180    pub fn set_character_spacing(&mut self, spacing: f64) -> &mut Self {
181        self.character_spacing = Some(spacing);
182        self
183    }
184
185    pub fn set_word_spacing(&mut self, spacing: f64) -> &mut Self {
186        self.word_spacing = Some(spacing);
187        self
188    }
189
190    pub fn set_horizontal_scaling(&mut self, scale: f64) -> &mut Self {
191        self.horizontal_scaling = Some(scale);
192        self
193    }
194
195    pub fn set_leading(&mut self, leading: f64) -> &mut Self {
196        self.leading = Some(leading);
197        self
198    }
199
200    pub fn set_text_rise(&mut self, rise: f64) -> &mut Self {
201        self.text_rise = Some(rise);
202        self
203    }
204
205    /// Set the text rendering mode
206    pub fn set_rendering_mode(&mut self, mode: TextRenderingMode) -> &mut Self {
207        self.rendering_mode = Some(mode);
208        self
209    }
210
211    /// Apply text state parameters to the operations string
212    fn apply_text_state_parameters(&mut self) {
213        // Character spacing (Tc)
214        if let Some(spacing) = self.character_spacing {
215            writeln!(&mut self.operations, "{:.2} Tc", spacing).unwrap();
216        }
217
218        // Word spacing (Tw)
219        if let Some(spacing) = self.word_spacing {
220            writeln!(&mut self.operations, "{:.2} Tw", spacing).unwrap();
221        }
222
223        // Horizontal scaling (Tz)
224        if let Some(scale) = self.horizontal_scaling {
225            writeln!(&mut self.operations, "{:.2} Tz", scale * 100.0).unwrap();
226        }
227
228        // Leading (TL)
229        if let Some(leading) = self.leading {
230            writeln!(&mut self.operations, "{:.2} TL", leading).unwrap();
231        }
232
233        // Text rise (Ts)
234        if let Some(rise) = self.text_rise {
235            writeln!(&mut self.operations, "{:.2} Ts", rise).unwrap();
236        }
237
238        // Text rendering mode (Tr)
239        if let Some(mode) = self.rendering_mode {
240            writeln!(&mut self.operations, "{} Tr", mode as u8).unwrap();
241        }
242    }
243
244    pub(crate) fn generate_operations(&self) -> Result<Vec<u8>> {
245        Ok(self.operations.as_bytes().to_vec())
246    }
247
248    /// Get the current font size
249    pub fn font_size(&self) -> f64 {
250        self.font_size
251    }
252
253    /// Get the current text matrix
254    pub fn text_matrix(&self) -> [f64; 6] {
255        self.text_matrix
256    }
257
258    /// Get the current position
259    pub fn position(&self) -> (f64, f64) {
260        (self.text_matrix[4], self.text_matrix[5])
261    }
262
263    /// Clear all operations and reset text state parameters
264    pub fn clear(&mut self) {
265        self.operations.clear();
266        self.character_spacing = None;
267        self.word_spacing = None;
268        self.horizontal_scaling = None;
269        self.leading = None;
270        self.text_rise = None;
271        self.rendering_mode = None;
272    }
273
274    /// Get the raw operations string
275    pub fn operations(&self) -> &str {
276        &self.operations
277    }
278
279    /// Generate text state operations for testing purposes
280    #[cfg(test)]
281    pub fn generate_text_state_operations(&self) -> String {
282        let mut ops = String::new();
283
284        // Character spacing (Tc)
285        if let Some(spacing) = self.character_spacing {
286            writeln!(&mut ops, "{:.2} Tc", spacing).unwrap();
287        }
288
289        // Word spacing (Tw)
290        if let Some(spacing) = self.word_spacing {
291            writeln!(&mut ops, "{:.2} Tw", spacing).unwrap();
292        }
293
294        // Horizontal scaling (Tz)
295        if let Some(scale) = self.horizontal_scaling {
296            writeln!(&mut ops, "{:.2} Tz", scale * 100.0).unwrap();
297        }
298
299        // Leading (TL)
300        if let Some(leading) = self.leading {
301            writeln!(&mut ops, "{:.2} TL", leading).unwrap();
302        }
303
304        // Text rise (Ts)
305        if let Some(rise) = self.text_rise {
306            writeln!(&mut ops, "{:.2} Ts", rise).unwrap();
307        }
308
309        // Text rendering mode (Tr)
310        if let Some(mode) = self.rendering_mode {
311            writeln!(&mut ops, "{} Tr", mode as u8).unwrap();
312        }
313
314        ops
315    }
316}
317
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a fresh context, lets `configure` adjust it, and returns the
    /// serialized text state operators.
    fn text_state_ops(configure: impl FnOnce(&mut TextContext)) -> String {
        let mut ctx = TextContext::new();
        configure(&mut ctx);
        ctx.generate_text_state_operations()
    }

    /// Serialized text state of a fresh context with only `mode` selected.
    fn rendering_mode_ops(mode: TextRenderingMode) -> String {
        text_state_ops(|ctx| {
            ctx.set_rendering_mode(mode);
        })
    }

    #[test]
    fn test_text_context_new() {
        let ctx = TextContext::new();
        assert_eq!(ctx.current_font, Font::Helvetica);
        assert_eq!(ctx.font_size, 12.0);
        assert_eq!(ctx.text_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
        assert!(ctx.operations.is_empty());
    }

    #[test]
    fn test_text_context_default() {
        let ctx = TextContext::default();
        assert_eq!(ctx.current_font, Font::Helvetica);
        assert_eq!(ctx.font_size, 12.0);
    }

    #[test]
    fn test_set_font() {
        let mut ctx = TextContext::new();
        ctx.set_font(Font::TimesBold, 14.0);
        assert_eq!(ctx.current_font, Font::TimesBold);
        assert_eq!(ctx.font_size, 14.0);
    }

    #[test]
    fn test_position() {
        let mut ctx = TextContext::new();
        ctx.at(100.0, 200.0);

        // The accessor and the raw matrix must agree.
        let (x, y) = ctx.position();
        assert_eq!(x, 100.0);
        assert_eq!(y, 200.0);
        assert_eq!(ctx.text_matrix[4], 100.0);
        assert_eq!(ctx.text_matrix[5], 200.0);
    }

    #[test]
    fn test_write_simple_text() {
        let mut ctx = TextContext::new();
        ctx.write("Hello").unwrap();

        let ops = ctx.operations();
        for fragment in ["BT\n", "ET\n", "/Helvetica 12 Tf", "(Hello) Tj"] {
            assert!(ops.contains(fragment));
        }
    }

    #[test]
    fn test_write_text_with_escaping() {
        let mut ctx = TextContext::new();
        ctx.write("(Hello)").unwrap();

        assert!(ctx.operations().contains("(\\(Hello\\)) Tj"));
    }

    #[test]
    fn test_write_line() {
        let mut ctx = TextContext::new();
        let initial_y = ctx.text_matrix[5];
        ctx.write_line("Line 1").unwrap();

        // Y position should have moved down
        let new_y = ctx.text_matrix[5];
        assert!(new_y < initial_y);
        assert_eq!(new_y, initial_y - 12.0 * 1.2); // font_size * 1.2
    }

    #[test]
    fn test_character_spacing() {
        let ops = text_state_ops(|ctx| {
            ctx.set_character_spacing(2.5);
        });
        assert!(ops.contains("2.50 Tc"));
    }

    #[test]
    fn test_word_spacing() {
        let ops = text_state_ops(|ctx| {
            ctx.set_word_spacing(1.5);
        });
        assert!(ops.contains("1.50 Tw"));
    }

    #[test]
    fn test_horizontal_scaling() {
        let ops = text_state_ops(|ctx| {
            ctx.set_horizontal_scaling(1.25);
        });
        assert!(ops.contains("125.00 Tz")); // 1.25 * 100
    }

    #[test]
    fn test_leading() {
        let ops = text_state_ops(|ctx| {
            ctx.set_leading(15.0);
        });
        assert!(ops.contains("15.00 TL"));
    }

    #[test]
    fn test_text_rise() {
        let ops = text_state_ops(|ctx| {
            ctx.set_text_rise(3.0);
        });
        assert!(ops.contains("3.00 Ts"));
    }

    #[test]
    fn test_clear() {
        let mut ctx = TextContext::new();
        ctx.write("Hello").unwrap();
        assert!(!ctx.operations().is_empty());

        ctx.clear();
        assert!(ctx.operations().is_empty());
    }

    #[test]
    fn test_generate_operations() {
        let mut ctx = TextContext::new();
        ctx.write("Test").unwrap();

        // The byte form must round-trip to the string form.
        let ops_string = String::from_utf8(ctx.generate_operations().unwrap()).unwrap();
        assert_eq!(ops_string, ctx.operations());
    }

    #[test]
    fn test_method_chaining() {
        let mut ctx = TextContext::new();
        ctx.set_font(Font::Courier, 10.0)
            .at(50.0, 100.0)
            .set_character_spacing(1.0)
            .set_word_spacing(2.0);

        assert_eq!(ctx.current_font(), &Font::Courier);
        assert_eq!(ctx.font_size(), 10.0);
        let (x, y) = ctx.position();
        assert_eq!(x, 50.0);
        assert_eq!(y, 100.0);
    }

    #[test]
    fn test_text_matrix_access() {
        let mut ctx = TextContext::new();
        ctx.at(25.0, 75.0);

        assert_eq!(ctx.text_matrix(), [1.0, 0.0, 0.0, 1.0, 25.0, 75.0]);
    }

    #[test]
    fn test_special_characters_encoding() {
        let mut ctx = TextContext::new();
        ctx.write("Test\nLine\tTab").unwrap();

        let ops = ctx.operations();
        assert!(ops.contains("\\n"));
        assert!(ops.contains("\\t"));
    }

    #[test]
    fn test_rendering_mode_fill() {
        assert!(rendering_mode_ops(TextRenderingMode::Fill).contains("0 Tr"));
    }

    #[test]
    fn test_rendering_mode_stroke() {
        assert!(rendering_mode_ops(TextRenderingMode::Stroke).contains("1 Tr"));
    }

    #[test]
    fn test_rendering_mode_fill_stroke() {
        assert!(rendering_mode_ops(TextRenderingMode::FillStroke).contains("2 Tr"));
    }

    #[test]
    fn test_rendering_mode_invisible() {
        assert!(rendering_mode_ops(TextRenderingMode::Invisible).contains("3 Tr"));
    }

    #[test]
    fn test_rendering_mode_fill_clip() {
        assert!(rendering_mode_ops(TextRenderingMode::FillClip).contains("4 Tr"));
    }

    #[test]
    fn test_rendering_mode_stroke_clip() {
        assert!(rendering_mode_ops(TextRenderingMode::StrokeClip).contains("5 Tr"));
    }

    #[test]
    fn test_rendering_mode_fill_stroke_clip() {
        assert!(rendering_mode_ops(TextRenderingMode::FillStrokeClip).contains("6 Tr"));
    }

    #[test]
    fn test_rendering_mode_clip() {
        assert!(rendering_mode_ops(TextRenderingMode::Clip).contains("7 Tr"));
    }

    #[test]
    fn test_text_state_parameters_chaining() {
        let ops = text_state_ops(|ctx| {
            ctx.set_character_spacing(1.5)
                .set_word_spacing(2.0)
                .set_horizontal_scaling(1.1)
                .set_leading(14.0)
                .set_text_rise(0.5)
                .set_rendering_mode(TextRenderingMode::FillStroke);
        });

        let expected_lines = [
            "1.50 Tc",
            "2.00 Tw",
            "110.00 Tz",
            "14.00 TL",
            "0.50 Ts",
            "2 Tr",
        ];
        for line in expected_lines {
            assert!(ops.contains(line));
        }
    }

    #[test]
    fn test_all_text_state_operators_generated() {
        // Set every parameter with a separate call, one operator each.
        let ops = text_state_ops(|ctx| {
            ctx.set_character_spacing(1.0); // Tc
            ctx.set_word_spacing(2.0); // Tw
            ctx.set_horizontal_scaling(1.2); // Tz
            ctx.set_leading(15.0); // TL
            ctx.set_text_rise(1.0); // Ts
            ctx.set_rendering_mode(TextRenderingMode::Stroke); // Tr
        });

        // Verify all PDF text state operators are present
        let expectations = [
            ("Tc", "Character spacing operator (Tc) not found"),
            ("Tw", "Word spacing operator (Tw) not found"),
            ("Tz", "Horizontal scaling operator (Tz) not found"),
            ("TL", "Leading operator (TL) not found"),
            ("Ts", "Text rise operator (Ts) not found"),
            ("Tr", "Text rendering mode operator (Tr) not found"),
        ];
        for (operator, message) in expectations {
            assert!(ops.contains(operator), "{message}");
        }
    }
}