oxidize_pdf/text/
mod.rs

1mod encoding;
2mod extraction;
3mod flow;
4mod font;
5mod metrics;
6pub mod ocr;
7
8#[cfg(feature = "ocr-tesseract")]
9pub mod tesseract_provider;
10
11pub use encoding::TextEncoding;
12pub use extraction::{ExtractedText, ExtractionOptions, TextExtractor, TextFragment};
13pub use flow::{TextAlign, TextFlowContext};
14pub use font::{Font, FontFamily};
15pub use metrics::{measure_char, measure_text, split_into_words};
16pub use ocr::{
17    FragmentType, ImagePreprocessing, MockOcrProvider, OcrEngine, OcrError, OcrOptions,
18    OcrProcessingResult, OcrProvider, OcrResult, OcrTextFragment,
19};
20
21use crate::error::Result;
22use std::fmt::Write;
23
24#[derive(Clone)]
25pub struct TextContext {
26    operations: String,
27    current_font: Font,
28    font_size: f64,
29    text_matrix: [f64; 6],
30}
31
32impl Default for TextContext {
33    fn default() -> Self {
34        Self::new()
35    }
36}
37
38impl TextContext {
39    pub fn new() -> Self {
40        Self {
41            operations: String::new(),
42            current_font: Font::Helvetica,
43            font_size: 12.0,
44            text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
45        }
46    }
47
48    pub fn set_font(&mut self, font: Font, size: f64) -> &mut Self {
49        self.current_font = font;
50        self.font_size = size;
51        self
52    }
53
54    pub fn at(&mut self, x: f64, y: f64) -> &mut Self {
55        self.text_matrix[4] = x;
56        self.text_matrix[5] = y;
57        self
58    }
59
60    pub fn write(&mut self, text: &str) -> Result<&mut Self> {
61        // Begin text object
62        self.operations.push_str("BT\n");
63
64        // Set font
65        writeln!(
66            &mut self.operations,
67            "/{} {} Tf",
68            self.current_font.pdf_name(),
69            self.font_size
70        )
71        .unwrap();
72
73        // Set text position
74        writeln!(
75            &mut self.operations,
76            "{:.2} {:.2} Td",
77            self.text_matrix[4], self.text_matrix[5]
78        )
79        .unwrap();
80
81        // Encode text using WinAnsiEncoding
82        let encoding = TextEncoding::WinAnsiEncoding;
83        let encoded_bytes = encoding.encode(text);
84
85        // Show text as a literal string
86        self.operations.push('(');
87        for &byte in &encoded_bytes {
88            match byte {
89                b'(' => self.operations.push_str("\\("),
90                b')' => self.operations.push_str("\\)"),
91                b'\\' => self.operations.push_str("\\\\"),
92                b'\n' => self.operations.push_str("\\n"),
93                b'\r' => self.operations.push_str("\\r"),
94                b'\t' => self.operations.push_str("\\t"),
95                // For bytes in the printable ASCII range, write as is
96                0x20..=0x7E => self.operations.push(byte as char),
97                // For other bytes, write as octal escape sequences
98                _ => write!(&mut self.operations, "\\{byte:03o}").unwrap(),
99            }
100        }
101        self.operations.push_str(") Tj\n");
102
103        // End text object
104        self.operations.push_str("ET\n");
105
106        Ok(self)
107    }
108
109    pub fn write_line(&mut self, text: &str) -> Result<&mut Self> {
110        self.write(text)?;
111        self.text_matrix[5] -= self.font_size * 1.2; // Move down for next line
112        Ok(self)
113    }
114
115    pub fn set_character_spacing(&mut self, spacing: f64) -> &mut Self {
116        writeln!(&mut self.operations, "{spacing:.2} Tc").unwrap();
117        self
118    }
119
120    pub fn set_word_spacing(&mut self, spacing: f64) -> &mut Self {
121        writeln!(&mut self.operations, "{spacing:.2} Tw").unwrap();
122        self
123    }
124
125    pub fn set_horizontal_scaling(&mut self, scale: f64) -> &mut Self {
126        writeln!(&mut self.operations, "{:.2} Tz", scale * 100.0).unwrap();
127        self
128    }
129
130    pub fn set_leading(&mut self, leading: f64) -> &mut Self {
131        writeln!(&mut self.operations, "{leading:.2} TL").unwrap();
132        self
133    }
134
135    pub fn set_text_rise(&mut self, rise: f64) -> &mut Self {
136        writeln!(&mut self.operations, "{rise:.2} Ts").unwrap();
137        self
138    }
139
140    pub(crate) fn generate_operations(&self) -> Result<Vec<u8>> {
141        Ok(self.operations.as_bytes().to_vec())
142    }
143    
144    /// Get the current font
145    pub fn current_font(&self) -> Font {
146        self.current_font
147    }
148    
149    /// Get the current font size
150    pub fn font_size(&self) -> f64 {
151        self.font_size
152    }
153    
154    /// Get the current text matrix
155    pub fn text_matrix(&self) -> [f64; 6] {
156        self.text_matrix
157    }
158    
159    /// Get the current position
160    pub fn position(&self) -> (f64, f64) {
161        (self.text_matrix[4], self.text_matrix[5])
162    }
163    
164    /// Clear all operations
165    pub fn clear(&mut self) {
166        self.operations.clear();
167    }
168    
169    /// Get the raw operations string
170    pub fn operations(&self) -> &str {
171        &self.operations
172    }
173}
174
#[cfg(test)]
mod tests {
    use super::*;

    /// Text matrix a freshly constructed context is expected to hold.
    const IDENTITY: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];

    /// Applies `configure` to a new context and returns what it emitted.
    fn emitted(configure: impl FnOnce(&mut TextContext)) -> String {
        let mut ctx = TextContext::new();
        configure(&mut ctx);
        ctx.operations().to_owned()
    }

    #[test]
    fn test_text_context_new() {
        let ctx = TextContext::new();
        assert_eq!(ctx.current_font, Font::Helvetica);
        assert_eq!(ctx.font_size, 12.0);
        assert_eq!(ctx.text_matrix, IDENTITY);
        assert!(ctx.operations.is_empty());
    }

    #[test]
    fn test_text_context_default() {
        let ctx = TextContext::default();
        assert_eq!(ctx.current_font, Font::Helvetica);
        assert_eq!(ctx.font_size, 12.0);
    }

    #[test]
    fn test_set_font() {
        let mut ctx = TextContext::new();
        ctx.set_font(Font::TimesBold, 14.0);
        assert_eq!(ctx.current_font, Font::TimesBold);
        assert_eq!(ctx.font_size, 14.0);
    }

    #[test]
    fn test_position() {
        let mut ctx = TextContext::new();
        ctx.at(100.0, 200.0);

        // The accessor and the raw matrix must agree.
        let (x, y) = ctx.position();
        assert_eq!(x, 100.0);
        assert_eq!(y, 200.0);
        assert_eq!(ctx.text_matrix[4], 100.0);
        assert_eq!(ctx.text_matrix[5], 200.0);
    }

    #[test]
    fn test_write_simple_text() {
        let ops = emitted(|ctx| {
            ctx.write("Hello").unwrap();
        });

        assert!(ops.contains("BT\n"));
        assert!(ops.contains("ET\n"));
        assert!(ops.contains("/Helvetica 12 Tf"));
        assert!(ops.contains("(Hello) Tj"));
    }

    #[test]
    fn test_write_text_with_escaping() {
        let ops = emitted(|ctx| {
            ctx.write("(Hello)").unwrap();
        });

        assert!(ops.contains("(\\(Hello\\)) Tj"));
    }

    #[test]
    fn test_write_line() {
        let mut ctx = TextContext::new();
        let before = ctx.text_matrix[5];
        ctx.write_line("Line 1").unwrap();
        let after = ctx.text_matrix[5];

        // Y position should have moved down by font_size * 1.2
        assert!(after < before);
        assert_eq!(after, before - 12.0 * 1.2);
    }

    #[test]
    fn test_character_spacing() {
        let ops = emitted(|ctx| {
            ctx.set_character_spacing(2.5);
        });
        assert!(ops.contains("2.50 Tc"));
    }

    #[test]
    fn test_word_spacing() {
        let ops = emitted(|ctx| {
            ctx.set_word_spacing(1.5);
        });
        assert!(ops.contains("1.50 Tw"));
    }

    #[test]
    fn test_horizontal_scaling() {
        let ops = emitted(|ctx| {
            ctx.set_horizontal_scaling(1.25);
        });
        assert!(ops.contains("125.00 Tz")); // 1.25 * 100
    }

    #[test]
    fn test_leading() {
        let ops = emitted(|ctx| {
            ctx.set_leading(15.0);
        });
        assert!(ops.contains("15.00 TL"));
    }

    #[test]
    fn test_text_rise() {
        let ops = emitted(|ctx| {
            ctx.set_text_rise(3.0);
        });
        assert!(ops.contains("3.00 Ts"));
    }

    #[test]
    fn test_clear() {
        let mut ctx = TextContext::new();
        ctx.write("Hello").unwrap();
        assert!(!ctx.operations().is_empty());

        ctx.clear();
        assert!(ctx.operations().is_empty());
    }

    #[test]
    fn test_generate_operations() {
        let mut ctx = TextContext::new();
        ctx.write("Test").unwrap();

        let as_text = String::from_utf8(ctx.generate_operations().unwrap()).unwrap();
        assert_eq!(as_text, ctx.operations());
    }

    #[test]
    fn test_method_chaining() {
        let mut ctx = TextContext::new();
        ctx.set_font(Font::Courier, 10.0)
            .at(50.0, 100.0)
            .set_character_spacing(1.0)
            .set_word_spacing(2.0);

        assert_eq!(ctx.current_font(), Font::Courier);
        assert_eq!(ctx.font_size(), 10.0);
        assert_eq!(ctx.position(), (50.0, 100.0));
    }

    #[test]
    fn test_text_matrix_access() {
        let mut ctx = TextContext::new();
        ctx.at(25.0, 75.0);

        assert_eq!(ctx.text_matrix(), [1.0, 0.0, 0.0, 1.0, 25.0, 75.0]);
    }

    #[test]
    fn test_special_characters_encoding() {
        let ops = emitted(|ctx| {
            ctx.write("Test\nLine\tTab").unwrap();
        });

        assert!(ops.contains("\\n"));
        assert!(ops.contains("\\t"));
    }
}
344        assert!(ops.contains("\\t"));
345    }
346}