ruvector_scipix/output/
mod.rs

//! Output formatting module for Scipix OCR results
//!
//! Supports multiple output formats:
//! - Text: Plain text extraction
//! - LaTeX: Mathematical notation (normal and styled variants)
//! - Scipix Markdown (mmd): Enhanced markdown with math
//! - MathML: XML-based mathematical markup
//! - AsciiMath: ASCII math notation
//! - HTML: Web-ready output with math rendering
//! - SMILES: Chemical structure notation
//! - DOCX: Microsoft Word format (Office Math ML)
11
12use serde::{Deserialize, Serialize};
13use std::collections::HashMap;
14
15pub mod formatter;
16pub mod mmd;
17pub mod latex;
18pub mod html;
19pub mod docx;
20pub mod json;
21pub mod smiles;
22
23pub use formatter::{OutputFormatter, MathDelimiters, HtmlEngine};
24pub use json::ApiResponse;
25
26/// Output format types supported by Scipix OCR
27#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
28#[serde(rename_all = "snake_case")]
29pub enum OutputFormat {
30    /// Plain text output
31    Text,
32    /// LaTeX mathematical notation
33    #[serde(rename = "latex_normal")]
34    LaTeX,
35    /// Styled LaTeX with custom packages
36    #[serde(rename = "latex_styled")]
37    LaTeXStyled,
38    /// Mathematical Markup Language
39    #[serde(rename = "mathml")]
40    MathML,
41    /// Scipix Markdown (enhanced markdown)
42    #[serde(rename = "mmd")]
43    Mmd,
44    /// ASCII Math notation
45    #[serde(rename = "asciimath")]
46    AsciiMath,
47    /// HTML with embedded math
48    Html,
49    /// Chemical structure notation
50    #[serde(rename = "smiles")]
51    Smiles,
52    /// Microsoft Word format
53    Docx,
54}
55
56impl OutputFormat {
57    /// Get the file extension for this format
58    pub fn extension(&self) -> &'static str {
59        match self {
60            OutputFormat::Text => "txt",
61            OutputFormat::LaTeX | OutputFormat::LaTeXStyled => "tex",
62            OutputFormat::MathML => "xml",
63            OutputFormat::Mmd => "mmd",
64            OutputFormat::AsciiMath => "txt",
65            OutputFormat::Html => "html",
66            OutputFormat::Smiles => "smi",
67            OutputFormat::Docx => "docx",
68        }
69    }
70
71    /// Get the MIME type for this format
72    pub fn mime_type(&self) -> &'static str {
73        match self {
74            OutputFormat::Text | OutputFormat::AsciiMath => "text/plain",
75            OutputFormat::LaTeX | OutputFormat::LaTeXStyled => "application/x-latex",
76            OutputFormat::MathML => "application/mathml+xml",
77            OutputFormat::Mmd => "text/markdown",
78            OutputFormat::Html => "text/html",
79            OutputFormat::Smiles => "chemical/x-daylight-smiles",
80            OutputFormat::Docx => "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
81        }
82    }
83}
84
85/// Complete OCR result with all possible output formats
86#[derive(Debug, Clone, Serialize, Deserialize)]
87pub struct OcrResult {
88    /// Request identifier
89    pub request_id: String,
90
91    /// Version of the OCR engine
92    pub version: String,
93
94    /// Image dimensions
95    pub image_width: u32,
96    pub image_height: u32,
97
98    /// Processing status
99    pub is_printed: bool,
100    pub is_handwritten: bool,
101    pub auto_rotate_confidence: f32,
102    pub auto_rotate_degrees: i32,
103
104    /// Confidence scores
105    pub confidence: f32,
106    pub confidence_rate: f32,
107
108    /// Available output formats
109    pub formats: FormatsData,
110
111    /// Detailed line and word data
112    #[serde(skip_serializing_if = "Option::is_none")]
113    pub line_data: Option<Vec<LineData>>,
114
115    /// Error information if processing failed
116    #[serde(skip_serializing_if = "Option::is_none")]
117    pub error: Option<String>,
118
119    /// Processing metadata
120    #[serde(flatten)]
121    pub metadata: HashMap<String, serde_json::Value>,
122}
123
124/// All available output format data
125#[derive(Debug, Clone, Default, Serialize, Deserialize)]
126pub struct FormatsData {
127    #[serde(skip_serializing_if = "Option::is_none")]
128    pub text: Option<String>,
129
130    #[serde(skip_serializing_if = "Option::is_none")]
131    pub latex_normal: Option<String>,
132
133    #[serde(skip_serializing_if = "Option::is_none")]
134    pub latex_styled: Option<String>,
135
136    #[serde(skip_serializing_if = "Option::is_none")]
137    pub latex_simplified: Option<String>,
138
139    #[serde(skip_serializing_if = "Option::is_none")]
140    pub mathml: Option<String>,
141
142    #[serde(skip_serializing_if = "Option::is_none")]
143    pub asciimath: Option<String>,
144
145    #[serde(skip_serializing_if = "Option::is_none")]
146    pub mmd: Option<String>,
147
148    #[serde(skip_serializing_if = "Option::is_none")]
149    pub html: Option<String>,
150
151    #[serde(skip_serializing_if = "Option::is_none")]
152    pub smiles: Option<String>,
153}
154
155/// Line-level OCR data with positioning
156#[derive(Debug, Clone, Serialize, Deserialize)]
157pub struct LineData {
158    /// Line type: text, math, table, image, etc.
159    #[serde(rename = "type")]
160    pub line_type: String,
161
162    /// Content in various formats
163    pub text: String,
164
165    #[serde(skip_serializing_if = "Option::is_none")]
166    pub latex: Option<String>,
167
168    /// Bounding box coordinates
169    pub bbox: BoundingBox,
170
171    /// Confidence score
172    pub confidence: f32,
173
174    /// Word-level data
175    #[serde(skip_serializing_if = "Option::is_none")]
176    pub words: Option<Vec<WordData>>,
177}
178
179/// Word-level OCR data
180#[derive(Debug, Clone, Serialize, Deserialize)]
181pub struct WordData {
182    pub text: String,
183    pub bbox: BoundingBox,
184    pub confidence: f32,
185
186    #[serde(skip_serializing_if = "Option::is_none")]
187    pub latex: Option<String>,
188}
189
190/// Bounding box coordinates (x, y, width, height)
191#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
192pub struct BoundingBox {
193    pub x: f32,
194    pub y: f32,
195    pub width: f32,
196    pub height: f32,
197}
198
199impl BoundingBox {
200    pub fn new(x: f32, y: f32, width: f32, height: f32) -> Self {
201        Self { x, y, width, height }
202    }
203
204    pub fn area(&self) -> f32 {
205        self.width * self.height
206    }
207
208    pub fn center(&self) -> (f32, f32) {
209        (self.x + self.width / 2.0, self.y + self.height / 2.0)
210    }
211}
212
213/// Convert between output formats
214pub fn convert_format(content: &str, from: OutputFormat, to: OutputFormat) -> Result<String, String> {
215    // Simple pass-through for same format
216    if from == to {
217        return Ok(content.to_string());
218    }
219
220    // Format-specific conversions
221    match (from, to) {
222        (OutputFormat::LaTeX, OutputFormat::Text) => {
223            // Strip LaTeX commands for plain text
224            Ok(strip_latex(content))
225        }
226        (OutputFormat::Mmd, OutputFormat::LaTeX) => {
227            // Extract LaTeX from markdown
228            Ok(extract_latex_from_mmd(content))
229        }
230        (OutputFormat::LaTeX, OutputFormat::Html) => {
231            // Wrap LaTeX in HTML with MathJax
232            Ok(format!(
233                r#"<!DOCTYPE html>
234<html>
235<head>
236    <script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
237    <script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
238</head>
239<body>
240    <p>\({}\)</p>
241</body>
242</html>"#,
243                content
244            ))
245        }
246        _ => Err(format!("Conversion from {:?} to {:?} not supported", from, to)),
247    }
248}
249
/// Reduce a LaTeX fragment to approximate plain text.
///
/// The input is scanned once, left to right:
/// - math delimiters (`\(`, `\)`, `\[`, `\]`, `$$`) are removed;
/// - `\text{..}`, `\mathrm{..}`, `\mathbf{..}` and `\mathit{..}` are replaced
///   by their content: the command, its opening brace and the *matching*
///   closing brace are dropped;
/// - `\\` and the spacing commands `\,`, `\;`, `\:`, `\!`, `\quad`, `\qquad`
///   each become a single space;
/// - escaped braces (`\{`, `\}`) are copied verbatim and never counted as
///   grouping braces;
/// - everything else, including balanced braces of other commands such as
///   `\frac{a}{b}`, is copied unchanged.
///
/// A `}` with no matching opener is dropped. The result is trimmed of leading
/// and trailing whitespace.
fn strip_latex(content: &str) -> String {
    // Returns what follows the first token in `tokens` that prefixes `text`.
    fn after_any<'a>(text: &'a str, tokens: &[&str]) -> Option<&'a str> {
        tokens.iter().find_map(|&token| text.strip_prefix(token))
    }

    const SPACERS: [&str; 7] = ["\\\\", "\\,", "\\;", "\\:", "\\!", "\\qquad", "\\quad"];
    const ESCAPED_BRACES: [&str; 2] = ["\\{", "\\}"];
    const DELIMITERS: [&str; 5] = ["\\(", "\\)", "\\[", "\\]", "$$"];
    const WRAPPERS: [&str; 4] = ["\\text{", "\\mathrm{", "\\mathbf{", "\\mathit{"];

    let mut plain = String::with_capacity(content.len());
    // One entry per currently open `{`: `true` when the brace was opened by a
    // stripped wrapper command, so its closing brace must be dropped as well.
    let mut open_groups: Vec<bool> = Vec::new();
    let mut rest = content;

    while let Some(ch) = rest.chars().next() {
        if ch == '\\' || ch == '$' {
            // `\\` is tested first so that a line break followed by `(`, `[`
            // or `{` is not misread as a delimiter or an escaped brace.
            if let Some(tail) = after_any(rest, &SPACERS) {
                plain.push(' ');
                rest = tail;
                continue;
            }
            if let Some(tail) = after_any(rest, &ESCAPED_BRACES) {
                plain.push_str(&rest[..rest.len() - tail.len()]);
                rest = tail;
                continue;
            }
            if let Some(tail) = after_any(rest, &DELIMITERS) {
                rest = tail;
                continue;
            }
            if let Some(tail) = after_any(rest, &WRAPPERS) {
                open_groups.push(true);
                rest = tail;
                continue;
            }
        }

        match ch {
            '{' => {
                open_groups.push(false);
                plain.push(ch);
            }
            '}' => {
                // Keep the brace only when it closes a group that was kept.
                if open_groups.pop() == Some(false) {
                    plain.push(ch);
                }
            }
            _ => plain.push(ch),
        }
        rest = &rest[ch.len_utf8()..];
    }

    plain.trim().to_string()
}
272
/// Collect the math segments of a markdown document.
///
/// Both inline math (`$..$`) and display math (`$$..$$`) are recognized. The
/// content of each closed segment is returned without its delimiters, in
/// document order, with segments separated by a blank line (`"\n\n"`).
///
/// Details:
/// - a backslash and the character after it are treated as one unit, so `\$`
///   is never a delimiter and a trailing `\\` does not swallow the `$$` that
///   follows it; inside math the pair is copied verbatim;
/// - an inline segment is closed by the next `$`, a display segment only by
///   `$$` (a lone `$` inside display math is kept as content);
/// - a segment that is still open at the end of the input is discarded;
/// - text outside math is ignored.
fn extract_latex_from_mmd(content: &str) -> String {
    // Which delimiter opened the segment that is currently being collected.
    #[derive(Clone, Copy)]
    enum Span {
        Inline,
        Display,
    }

    let mut segments: Vec<String> = Vec::new();
    let mut current = String::new();
    let mut open: Option<Span> = None;
    let mut rest = content;

    while let Some(ch) = rest.chars().next() {
        // Each arm yields the number of bytes it consumed from `rest`.
        let consumed = match ch {
            '\\' => {
                // Backslash plus the following character (if any).
                let width: usize = rest.chars().take(2).map(char::len_utf8).sum();
                if open.is_some() {
                    current.push_str(&rest[..width]);
                }
                width
            }
            '$' => {
                let doubled = rest.starts_with("$$");
                match (open, doubled) {
                    (None, true) => {
                        open = Some(Span::Display);
                        2
                    }
                    (None, false) => {
                        open = Some(Span::Inline);
                        1
                    }
                    (Some(Span::Inline), _) => {
                        segments.push(std::mem::take(&mut current));
                        open = None;
                        1
                    }
                    (Some(Span::Display), true) => {
                        segments.push(std::mem::take(&mut current));
                        open = None;
                        2
                    }
                    (Some(Span::Display), false) => {
                        current.push('$');
                        1
                    }
                }
            }
            other => {
                if open.is_some() {
                    current.push(other);
                }
                other.len_utf8()
            }
        };
        rest = &rest[consumed..];
    }

    segments.join("\n\n")
}
304
#[cfg(test)]
mod tests {
    use super::*;

    /// Spot-checks the format-to-extension mapping.
    #[test]
    fn test_output_format_extension() {
        let expected = [
            (OutputFormat::Text, "txt"),
            (OutputFormat::LaTeX, "tex"),
            (OutputFormat::Html, "html"),
            (OutputFormat::Mmd, "mmd"),
        ];
        for &(format, extension) in &expected {
            assert_eq!(format.extension(), extension);
        }
    }

    /// Spot-checks the format-to-MIME-type mapping.
    #[test]
    fn test_output_format_mime_type() {
        let expected = [
            (OutputFormat::Text, "text/plain"),
            (OutputFormat::LaTeX, "application/x-latex"),
            (OutputFormat::Html, "text/html"),
        ];
        for &(format, mime) in &expected {
            assert_eq!(format.mime_type(), mime);
        }
    }

    /// Area and centre of a 100x50 box anchored at (10, 20).
    #[test]
    fn test_bounding_box() {
        let bbox = BoundingBox::new(10.0, 20.0, 100.0, 50.0);
        assert_eq!(bbox.area(), 5000.0);
        assert_eq!(bbox.center(), (60.0, 45.0));
    }

    /// The content of wrapper commands survives stripping.
    #[test]
    fn test_strip_latex() {
        let output = strip_latex(r"\text{Hello } \mathbf{World}");
        for word in &["Hello", "World"] {
            assert!(output.contains(word));
        }
    }

    /// Converting a format to itself returns the content unchanged.
    #[test]
    fn test_convert_same_format() {
        let content = "test content";
        let converted = convert_format(content, OutputFormat::Text, OutputFormat::Text);
        assert_eq!(converted, Ok(content.to_string()));
    }
}