ruvector_scipix/output/
formatter.rs

1//! Multi-format output formatter with batch processing and streaming support
2
3use super::*;
4use crate::output::{html, latex, mmd, smiles};
5use std::io::Write;
6
7/// Configuration for output formatting
8#[derive(Debug, Clone)]
9pub struct FormatterConfig {
10    /// Target output formats
11    pub formats: Vec<OutputFormat>,
12
13    /// Enable pretty printing (where applicable)
14    pub pretty: bool,
15
16    /// Include confidence scores in output
17    pub include_confidence: bool,
18
19    /// Include bounding box data
20    pub include_bbox: bool,
21
22    /// Math delimiter style for LaTeX/MMD
23    pub math_delimiters: MathDelimiters,
24
25    /// HTML rendering engine
26    pub html_engine: HtmlEngine,
27
28    /// Enable streaming for large documents
29    pub streaming: bool,
30}
31
32impl Default for FormatterConfig {
33    fn default() -> Self {
34        Self {
35            formats: vec![OutputFormat::Text],
36            pretty: true,
37            include_confidence: false,
38            include_bbox: false,
39            math_delimiters: MathDelimiters::default(),
40            html_engine: HtmlEngine::MathJax,
41            streaming: false,
42        }
43    }
44}
45
/// Opening and closing markers placed around inline and display math.
#[derive(Debug, Clone)]
pub struct MathDelimiters {
    /// Marker that opens an inline math span.
    pub inline_start: String,
    /// Marker that closes an inline math span.
    pub inline_end: String,
    /// Marker that opens a display math block.
    pub display_start: String,
    /// Marker that closes a display math block.
    pub display_end: String,
}
54
55impl Default for MathDelimiters {
56    fn default() -> Self {
57        Self {
58            inline_start: "$".to_string(),
59            inline_end: "$".to_string(),
60            display_start: "$$".to_string(),
61            display_end: "$$".to_string(),
62        }
63    }
64}
65
/// Selects the engine handed to the HTML formatter when HTML is generated.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HtmlEngine {
    /// The MathJax engine (the value used by `FormatterConfig::default()`).
    MathJax,
    /// The KaTeX engine.
    KaTeX,
    /// Raw output; the exact treatment is decided by the HTML formatter,
    /// which is defined outside this file.
    Raw,
}
73
74/// Main output formatter
75pub struct OutputFormatter {
76    config: FormatterConfig,
77}
78
79impl OutputFormatter {
80    /// Create a new formatter with default configuration
81    pub fn new() -> Self {
82        Self {
83            config: FormatterConfig::default(),
84        }
85    }
86
87    /// Create a formatter with custom configuration
88    pub fn with_config(config: FormatterConfig) -> Self {
89        Self { config }
90    }
91
92    /// Format a single OCR result
93    pub fn format_result(&self, result: &OcrResult) -> Result<FormatsData, String> {
94        let mut formats = FormatsData::default();
95
96        for format in &self.config.formats {
97            let output = self.format_single(result, *format)?;
98            self.set_format_output(&mut formats, *format, output);
99        }
100
101        Ok(formats)
102    }
103
104    /// Format multiple results in batch
105    pub fn format_batch(&self, results: &[OcrResult]) -> Result<Vec<FormatsData>, String> {
106        results
107            .iter()
108            .map(|result| self.format_result(result))
109            .collect()
110    }
111
112    /// Stream format results to a writer
113    pub fn format_stream<W: Write>(
114        &self,
115        results: &[OcrResult],
116        writer: &mut W,
117        format: OutputFormat,
118    ) -> Result<(), String> {
119        for (i, result) in results.iter().enumerate() {
120            let output = self.format_single(result, format)?;
121            writer
122                .write_all(output.as_bytes())
123                .map_err(|e| format!("Write error: {}", e))?;
124
125            // Add separator between results
126            if i < results.len() - 1 {
127                writer
128                    .write_all(b"\n\n---\n\n")
129                    .map_err(|e| format!("Write error: {}", e))?;
130            }
131        }
132
133        Ok(())
134    }
135
136    /// Format a single result to a specific format
137    fn format_single(&self, result: &OcrResult, format: OutputFormat) -> Result<String, String> {
138        match format {
139            OutputFormat::Text => self.format_text(result),
140            OutputFormat::LaTeX => self.format_latex(result, false),
141            OutputFormat::LaTeXStyled => self.format_latex(result, true),
142            OutputFormat::Mmd => self.format_mmd(result),
143            OutputFormat::Html => self.format_html(result),
144            OutputFormat::Smiles => self.format_smiles(result),
145            OutputFormat::Docx => self.format_docx(result),
146            OutputFormat::MathML => self.format_mathml(result),
147            OutputFormat::AsciiMath => self.format_asciimath(result),
148        }
149    }
150
151    fn format_text(&self, result: &OcrResult) -> Result<String, String> {
152        if let Some(text) = &result.formats.text {
153            return Ok(text.clone());
154        }
155
156        // Fallback: extract text from line data
157        if let Some(line_data) = &result.line_data {
158            let text = line_data
159                .iter()
160                .map(|line| line.text.as_str())
161                .collect::<Vec<_>>()
162                .join("\n");
163            return Ok(text);
164        }
165
166        Err("No text content available".to_string())
167    }
168
169    fn format_latex(&self, result: &OcrResult, styled: bool) -> Result<String, String> {
170        let latex_content = if styled {
171            result.formats.latex_styled.as_ref()
172                .or(result.formats.latex_normal.as_ref())
173        } else {
174            result.formats.latex_normal.as_ref()
175        };
176
177        if let Some(latex) = latex_content {
178            if styled {
179                // Wrap in document with packages
180                Ok(latex::LaTeXFormatter::new()
181                    .with_packages(vec![
182                        "amsmath".to_string(),
183                        "amssymb".to_string(),
184                        "graphicx".to_string(),
185                    ])
186                    .format_document(latex))
187            } else {
188                Ok(latex.clone())
189            }
190        } else {
191            Err("No LaTeX content available".to_string())
192        }
193    }
194
195    fn format_mmd(&self, result: &OcrResult) -> Result<String, String> {
196        if let Some(mmd) = &result.formats.mmd {
197            return Ok(mmd.clone());
198        }
199
200        // Generate MMD from line data
201        if let Some(line_data) = &result.line_data {
202            let formatter = mmd::MmdFormatter::with_delimiters(
203                self.config.math_delimiters.clone()
204            );
205            return Ok(formatter.format(line_data));
206        }
207
208        Err("No MMD content available".to_string())
209    }
210
211    fn format_html(&self, result: &OcrResult) -> Result<String, String> {
212        if let Some(html) = &result.formats.html {
213            return Ok(html.clone());
214        }
215
216        // Generate HTML with math rendering
217        let content = self.format_text(result)?;
218        let formatter = html::HtmlFormatter::new()
219            .with_engine(self.config.html_engine)
220            .with_styling(self.config.pretty);
221
222        Ok(formatter.format(&content, result.line_data.as_deref()))
223    }
224
225    fn format_smiles(&self, result: &OcrResult) -> Result<String, String> {
226        if let Some(smiles) = &result.formats.smiles {
227            return Ok(smiles.clone());
228        }
229
230        // Generate SMILES if we have chemical structure data
231        let generator = smiles::SmilesGenerator::new();
232        generator.generate_from_result(result)
233    }
234
235    fn format_docx(&self, _result: &OcrResult) -> Result<String, String> {
236        // DOCX requires binary format, return placeholder
237        Err("DOCX format requires binary output - use save_docx() instead".to_string())
238    }
239
240    fn format_mathml(&self, result: &OcrResult) -> Result<String, String> {
241        if let Some(mathml) = &result.formats.mathml {
242            return Ok(mathml.clone());
243        }
244
245        Err("MathML generation not yet implemented".to_string())
246    }
247
248    fn format_asciimath(&self, result: &OcrResult) -> Result<String, String> {
249        if let Some(asciimath) = &result.formats.asciimath {
250            return Ok(asciimath.clone());
251        }
252
253        Err("AsciiMath conversion not yet implemented".to_string())
254    }
255
256    fn set_format_output(&self, formats: &mut FormatsData, format: OutputFormat, output: String) {
257        match format {
258            OutputFormat::Text => formats.text = Some(output),
259            OutputFormat::LaTeX => formats.latex_normal = Some(output),
260            OutputFormat::LaTeXStyled => formats.latex_styled = Some(output),
261            OutputFormat::Mmd => formats.mmd = Some(output),
262            OutputFormat::Html => formats.html = Some(output),
263            OutputFormat::Smiles => formats.smiles = Some(output),
264            OutputFormat::MathML => formats.mathml = Some(output),
265            OutputFormat::AsciiMath => formats.asciimath = Some(output),
266            OutputFormat::Docx => {}, // Binary format, handled separately
267        }
268    }
269}
270
271impl Default for OutputFormatter {
272    fn default() -> Self {
273        Self::new()
274    }
275}
276
277/// Builder for OutputFormatter configuration
278pub struct FormatterBuilder {
279    config: FormatterConfig,
280}
281
282impl FormatterBuilder {
283    pub fn new() -> Self {
284        Self {
285            config: FormatterConfig::default(),
286        }
287    }
288
289    pub fn formats(mut self, formats: Vec<OutputFormat>) -> Self {
290        self.config.formats = formats;
291        self
292    }
293
294    pub fn add_format(mut self, format: OutputFormat) -> Self {
295        self.config.formats.push(format);
296        self
297    }
298
299    pub fn pretty(mut self, pretty: bool) -> Self {
300        self.config.pretty = pretty;
301        self
302    }
303
304    pub fn include_confidence(mut self, include: bool) -> Self {
305        self.config.include_confidence = include;
306        self
307    }
308
309    pub fn include_bbox(mut self, include: bool) -> Self {
310        self.config.include_bbox = include;
311        self
312    }
313
314    pub fn math_delimiters(mut self, delimiters: MathDelimiters) -> Self {
315        self.config.math_delimiters = delimiters;
316        self
317    }
318
319    pub fn html_engine(mut self, engine: HtmlEngine) -> Self {
320        self.config.html_engine = engine;
321        self
322    }
323
324    pub fn streaming(mut self, streaming: bool) -> Self {
325        self.config.streaming = streaming;
326        self
327    }
328
329    pub fn build(self) -> OutputFormatter {
330        OutputFormatter::with_config(self.config)
331    }
332}
333
334impl Default for FormatterBuilder {
335    fn default() -> Self {
336        Self::new()
337    }
338}
339
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an `OcrResult` fixture that carries only plain text and plain
    /// LaTeX output (no line data, no other formats).
    fn create_test_result() -> OcrResult {
        OcrResult {
            request_id: "test_123".to_string(),
            version: "3.0".to_string(),
            image_width: 800,
            image_height: 600,
            is_printed: true,
            is_handwritten: false,
            auto_rotate_confidence: 0.95,
            auto_rotate_degrees: 0,
            confidence: 0.98,
            confidence_rate: 0.97,
            formats: FormatsData {
                text: Some("E = mc^2".to_string()),
                latex_normal: Some(r"E = mc^2".to_string()),
                ..Default::default()
            },
            line_data: None,
            error: None,
            metadata: HashMap::new(),
        }
    }

    #[test]
    fn test_format_text() {
        let formatter = OutputFormatter::new();
        let result = create_test_result();

        let output = formatter.format_single(&result, OutputFormat::Text).unwrap();
        assert_eq!(output, "E = mc^2");
    }

    #[test]
    fn test_format_latex() {
        let formatter = OutputFormatter::new();
        let result = create_test_result();

        let output = formatter.format_single(&result, OutputFormat::LaTeX).unwrap();
        assert!(output.contains("mc^2"));
    }

    #[test]
    fn test_builder() {
        // A fresh builder starts from `FormatterConfig::default()`, whose
        // format list already holds `OutputFormat::Text`, and `add_format`
        // only appends. Clear the list first so the count below reflects
        // exactly the two formats added here.
        let formatter = FormatterBuilder::new()
            .formats(Vec::new())
            .add_format(OutputFormat::Text)
            .add_format(OutputFormat::LaTeX)
            .pretty(true)
            .include_confidence(true)
            .build();

        assert_eq!(formatter.config.formats.len(), 2);
        assert!(formatter.config.pretty);
        assert!(formatter.config.include_confidence);
    }

    #[test]
    fn test_batch_format() {
        let formatter = OutputFormatter::new();
        let results = vec![create_test_result(), create_test_result()];

        let outputs = formatter.format_batch(&results).unwrap();
        assert_eq!(outputs.len(), 2);
    }

    #[test]
    fn test_format_stream_separates_results() {
        let formatter = OutputFormatter::new();
        let results = vec![create_test_result(), create_test_result()];
        let mut buffer: Vec<u8> = Vec::new();

        formatter
            .format_stream(&results, &mut buffer, OutputFormat::Text)
            .unwrap();

        // One separator between the two results, none after the last one.
        assert_eq!(buffer, b"E = mc^2\n\n---\n\nE = mc^2".to_vec());
    }
}