oxify_connect_vision/
types.rs

//! Core types for OCR/Vision processing results.
2
3use serde::{Deserialize, Serialize};
4
5/// Result of OCR/Vision processing on an image.
6#[derive(Debug, Clone, Serialize, Deserialize)]
7pub struct OcrResult {
8    /// Extracted plain text (all blocks concatenated).
9    pub text: String,
10
11    /// Layout-preserved Markdown representation.
12    /// Tables, headers, and lists are formatted appropriately.
13    pub markdown: String,
14
15    /// Individual text blocks with position and metadata.
16    pub blocks: Vec<TextBlock>,
17
18    /// Processing metadata.
19    pub metadata: OcrMetadata,
20}
21
22impl OcrResult {
23    /// Create a new empty OCR result.
24    pub fn empty() -> Self {
25        Self {
26            text: String::new(),
27            markdown: String::new(),
28            blocks: Vec::new(),
29            metadata: OcrMetadata::default(),
30        }
31    }
32
33    /// Create a simple OCR result with just text.
34    pub fn from_text(text: impl Into<String>) -> Self {
35        let text = text.into();
36        Self {
37            markdown: text.clone(),
38            text,
39            blocks: Vec::new(),
40            metadata: OcrMetadata::default(),
41        }
42    }
43}
44
45/// A single text block extracted from an image.
46#[derive(Debug, Clone, Serialize, Deserialize)]
47pub struct TextBlock {
48    /// Extracted text content.
49    pub text: String,
50
51    /// Bounding box coordinates: [x1, y1, x2, y2].
52    /// Coordinates are normalized to [0, 1] range relative to image dimensions.
53    pub bbox: [f32; 4],
54
55    /// Confidence score (0.0 to 1.0).
56    pub confidence: f32,
57
58    /// Semantic role of this block.
59    pub role: BlockRole,
60
61    /// Reading order index (0-based).
62    pub order: usize,
63}
64
65impl TextBlock {
66    /// Create a new text block with default values.
67    pub fn new(text: impl Into<String>) -> Self {
68        Self {
69            text: text.into(),
70            bbox: [0.0, 0.0, 1.0, 1.0],
71            confidence: 1.0,
72            role: BlockRole::Text,
73            order: 0,
74        }
75    }
76
77    /// Set the bounding box.
78    pub fn with_bbox(mut self, bbox: [f32; 4]) -> Self {
79        self.bbox = bbox;
80        self
81    }
82
83    /// Set the confidence score.
84    pub fn with_confidence(mut self, confidence: f32) -> Self {
85        self.confidence = confidence;
86        self
87    }
88
89    /// Set the block role.
90    pub fn with_role(mut self, role: BlockRole) -> Self {
91        self.role = role;
92        self
93    }
94
95    /// Set the reading order.
96    pub fn with_order(mut self, order: usize) -> Self {
97        self.order = order;
98        self
99    }
100}
101
102/// Semantic role of a text block.
103#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
104#[serde(rename_all = "snake_case")]
105pub enum BlockRole {
106    /// Document title.
107    Title,
108    /// Section header.
109    Header,
110    /// Regular text paragraph.
111    #[default]
112    Text,
113    /// Table content.
114    Table,
115    /// List item.
116    List,
117    /// Image caption.
118    Caption,
119    /// Footer content.
120    Footer,
121    /// Page number.
122    PageNumber,
123    /// Code block.
124    Code,
125    /// Quote or citation.
126    Quote,
127    /// Other/unknown role.
128    Other,
129}
130
131impl BlockRole {
132    /// Get the Markdown prefix for this role.
133    pub fn markdown_prefix(&self) -> &'static str {
134        match self {
135            BlockRole::Title => "# ",
136            BlockRole::Header => "## ",
137            BlockRole::List => "- ",
138            BlockRole::Quote => "> ",
139            BlockRole::Code => "```\n",
140            _ => "",
141        }
142    }
143
144    /// Get the Markdown suffix for this role.
145    pub fn markdown_suffix(&self) -> &'static str {
146        match self {
147            BlockRole::Code => "\n```",
148            _ => "",
149        }
150    }
151}
152
153/// Metadata about the OCR processing.
154#[derive(Debug, Clone, Serialize, Deserialize, Default)]
155pub struct OcrMetadata {
156    /// Provider used for OCR.
157    pub provider: String,
158
159    /// Model name/version.
160    pub model: Option<String>,
161
162    /// Processing time in milliseconds.
163    pub processing_time_ms: u64,
164
165    /// Image dimensions (width, height).
166    pub image_size: Option<(u32, u32)>,
167
168    /// Detected language(s).
169    pub languages: Vec<String>,
170
171    /// Number of pages (for multi-page documents).
172    pub page_count: u32,
173
174    /// Current page number (1-indexed).
175    pub current_page: u32,
176}
177
/// The ways an image can be handed to OCR processing.
#[derive(Clone, Debug)]
pub enum ImageInput {
    /// The image as raw bytes already held in memory.
    Bytes(Vec<u8>),

    /// The image as a Base64-encoded string.
    Base64(String),

    /// A filesystem path pointing at the image.
    Path(String),

    /// A URL from which the image would be fetched.
    Url(String),
}
193
194impl ImageInput {
195    /// Convert to raw bytes.
196    pub async fn to_bytes(&self) -> Result<Vec<u8>, VisionInputError> {
197        match self {
198            ImageInput::Bytes(bytes) => Ok(bytes.clone()),
199            ImageInput::Base64(encoded) => {
200                use base64::Engine;
201                base64::engine::general_purpose::STANDARD
202                    .decode(encoded)
203                    .map_err(|e| VisionInputError::Base64Decode(e.to_string()))
204            }
205            ImageInput::Path(path) => tokio::fs::read(path)
206                .await
207                .map_err(|e| VisionInputError::FileRead(e.to_string())),
208            ImageInput::Url(_url) => {
209                // URL fetching would require reqwest dependency
210                Err(VisionInputError::UrlNotSupported)
211            }
212        }
213    }
214}
215
216/// Error type for image input processing.
217#[derive(Debug, thiserror::Error)]
218pub enum VisionInputError {
219    #[error("Failed to decode base64: {0}")]
220    Base64Decode(String),
221
222    #[error("Failed to read file: {0}")]
223    FileRead(String),
224
225    #[error("URL input is not supported in this build")]
226    UrlNotSupported,
227}
228
229/// Output format for OCR results.
230#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
231#[serde(rename_all = "snake_case")]
232pub enum OutputFormat {
233    /// Plain text output.
234    Text,
235    /// Markdown-formatted output (preserves layout).
236    #[default]
237    Markdown,
238    /// Structured JSON output.
239    Json,
240    /// All formats (for maximum flexibility).
241    All,
242}
243
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_ocr_result_from_text() {
        let result = OcrResult::from_text("Hello, World!");
        assert_eq!(result.text, "Hello, World!");
        assert_eq!(result.markdown, "Hello, World!");
        assert!(result.blocks.is_empty());
    }

    #[test]
    fn test_ocr_result_empty() {
        let result = OcrResult::empty();
        assert!(result.text.is_empty());
        assert!(result.markdown.is_empty());
        assert!(result.blocks.is_empty());
    }

    #[test]
    fn test_text_block_defaults() {
        let block = TextBlock::new("Test");
        assert_eq!(block.text, "Test");
        assert_eq!(block.bbox, [0.0, 0.0, 1.0, 1.0]);
        assert_eq!(block.confidence, 1.0);
        assert_eq!(block.role, BlockRole::Text);
        assert_eq!(block.order, 0);
    }

    #[test]
    fn test_text_block_builder() {
        let block = TextBlock::new("Test")
            .with_bbox([0.1, 0.2, 0.3, 0.4])
            .with_confidence(0.95)
            .with_role(BlockRole::Header)
            .with_order(1);

        assert_eq!(block.text, "Test");
        assert_eq!(block.bbox, [0.1, 0.2, 0.3, 0.4]);
        assert_eq!(block.confidence, 0.95);
        assert_eq!(block.role, BlockRole::Header);
        assert_eq!(block.order, 1);
    }

    #[test]
    fn test_block_role_markdown() {
        assert_eq!(BlockRole::Title.markdown_prefix(), "# ");
        assert_eq!(BlockRole::Header.markdown_prefix(), "## ");
        assert_eq!(BlockRole::List.markdown_prefix(), "- ");
        assert_eq!(BlockRole::Quote.markdown_prefix(), "> ");
        assert_eq!(BlockRole::Text.markdown_prefix(), "");

        // A code block is the only role with both an opening and a closing marker.
        assert_eq!(BlockRole::Code.markdown_prefix(), "```\n");
        assert_eq!(BlockRole::Code.markdown_suffix(), "\n```");
        assert_eq!(BlockRole::Text.markdown_suffix(), "");
        assert_eq!(BlockRole::Title.markdown_suffix(), "");
    }

    #[test]
    fn test_default_variants() {
        assert_eq!(BlockRole::default(), BlockRole::Text);
        assert_eq!(OutputFormat::default(), OutputFormat::Markdown);
    }

    #[test]
    fn test_enum_serialization_names() {
        // `rename_all = "snake_case"` must apply to multi-word variants.
        let json = serde_json::to_string(&BlockRole::PageNumber).unwrap();
        assert_eq!(json, "\"page_number\"");
        let role: BlockRole = serde_json::from_str("\"page_number\"").unwrap();
        assert_eq!(role, BlockRole::PageNumber);

        let json = serde_json::to_string(&OutputFormat::Markdown).unwrap();
        assert_eq!(json, "\"markdown\"");
    }

    #[test]
    fn test_ocr_result_serialization() {
        let result = OcrResult::from_text("Test");
        let json = serde_json::to_string(&result).unwrap();
        let deserialized: OcrResult = serde_json::from_str(&json).unwrap();
        assert_eq!(deserialized.text, result.text);
        assert_eq!(deserialized.markdown, result.markdown);
        assert_eq!(deserialized.blocks.len(), result.blocks.len());
    }
}
286}