oxidize_pdf/text/
mod.rs

1mod encoding;
2mod extraction;
3mod flow;
4mod font;
5mod metrics;
6pub mod ocr;
7
8#[cfg(feature = "ocr-tesseract")]
9pub mod tesseract_provider;
10
11pub use encoding::TextEncoding;
12pub use extraction::{ExtractedText, ExtractionOptions, TextExtractor, TextFragment};
13pub use flow::{TextAlign, TextFlowContext};
14pub use font::{Font, FontFamily};
15pub use metrics::{measure_char, measure_text, split_into_words};
16pub use ocr::{
17    FragmentType, ImagePreprocessing, MockOcrProvider, OcrEngine, OcrError, OcrOptions,
18    OcrProcessingResult, OcrProvider, OcrResult, OcrTextFragment,
19};
20
21use crate::error::Result;
22use std::fmt::Write;
23
24#[derive(Clone)]
25pub struct TextContext {
26    operations: String,
27    current_font: Font,
28    font_size: f64,
29    text_matrix: [f64; 6],
30}
31
32impl Default for TextContext {
33    fn default() -> Self {
34        Self::new()
35    }
36}
37
38impl TextContext {
39    pub fn new() -> Self {
40        Self {
41            operations: String::new(),
42            current_font: Font::Helvetica,
43            font_size: 12.0,
44            text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
45        }
46    }
47
48    pub fn set_font(&mut self, font: Font, size: f64) -> &mut Self {
49        self.current_font = font;
50        self.font_size = size;
51        self
52    }
53
54    pub fn at(&mut self, x: f64, y: f64) -> &mut Self {
55        self.text_matrix[4] = x;
56        self.text_matrix[5] = y;
57        self
58    }
59
60    pub fn write(&mut self, text: &str) -> Result<&mut Self> {
61        // Begin text object
62        self.operations.push_str("BT\n");
63
64        // Set font
65        writeln!(
66            &mut self.operations,
67            "/{} {} Tf",
68            self.current_font.pdf_name(),
69            self.font_size
70        )
71        .unwrap();
72
73        // Set text position
74        writeln!(
75            &mut self.operations,
76            "{:.2} {:.2} Td",
77            self.text_matrix[4], self.text_matrix[5]
78        )
79        .unwrap();
80
81        // Encode text using WinAnsiEncoding
82        let encoding = TextEncoding::WinAnsiEncoding;
83        let encoded_bytes = encoding.encode(text);
84
85        // Show text as a literal string
86        self.operations.push('(');
87        for &byte in &encoded_bytes {
88            match byte {
89                b'(' => self.operations.push_str("\\("),
90                b')' => self.operations.push_str("\\)"),
91                b'\\' => self.operations.push_str("\\\\"),
92                b'\n' => self.operations.push_str("\\n"),
93                b'\r' => self.operations.push_str("\\r"),
94                b'\t' => self.operations.push_str("\\t"),
95                // For bytes in the printable ASCII range, write as is
96                0x20..=0x7E => self.operations.push(byte as char),
97                // For other bytes, write as octal escape sequences
98                _ => write!(&mut self.operations, "\\{byte:03o}").unwrap(),
99            }
100        }
101        self.operations.push_str(") Tj\n");
102
103        // End text object
104        self.operations.push_str("ET\n");
105
106        Ok(self)
107    }
108
109    pub fn write_line(&mut self, text: &str) -> Result<&mut Self> {
110        self.write(text)?;
111        self.text_matrix[5] -= self.font_size * 1.2; // Move down for next line
112        Ok(self)
113    }
114
115    pub fn set_character_spacing(&mut self, spacing: f64) -> &mut Self {
116        writeln!(&mut self.operations, "{spacing:.2} Tc").unwrap();
117        self
118    }
119
120    pub fn set_word_spacing(&mut self, spacing: f64) -> &mut Self {
121        writeln!(&mut self.operations, "{spacing:.2} Tw").unwrap();
122        self
123    }
124
125    pub fn set_horizontal_scaling(&mut self, scale: f64) -> &mut Self {
126        writeln!(&mut self.operations, "{:.2} Tz", scale * 100.0).unwrap();
127        self
128    }
129
130    pub fn set_leading(&mut self, leading: f64) -> &mut Self {
131        writeln!(&mut self.operations, "{leading:.2} TL").unwrap();
132        self
133    }
134
135    pub fn set_text_rise(&mut self, rise: f64) -> &mut Self {
136        writeln!(&mut self.operations, "{rise:.2} Ts").unwrap();
137        self
138    }
139
140    pub(crate) fn generate_operations(&self) -> Result<Vec<u8>> {
141        Ok(self.operations.as_bytes().to_vec())
142    }
143}