Skip to main content

quantum_sdk/
vision.rs

1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3
4use crate::client::Client;
5use crate::error::Result;
6
// ---------------------------------------------------------------------------
// Request
// ---------------------------------------------------------------------------
10
11/// Request body for vision analysis endpoints.
12#[derive(Debug, Clone, Serialize, Default)]
13pub struct VisionRequest {
14    /// Base64-encoded image (with or without data: prefix).
15    #[serde(skip_serializing_if = "Option::is_none")]
16    pub image_base64: Option<String>,
17
18    /// Image URL (fetched by the model provider).
19    #[serde(skip_serializing_if = "Option::is_none")]
20    pub image_url: Option<String>,
21
22    /// Model to use. Default: gemini-2.5-flash.
23    #[serde(skip_serializing_if = "Option::is_none")]
24    pub model: Option<String>,
25
26    /// Analysis profile: "combined" (default), "scene", "objects", "ocr", "quality".
27    #[serde(skip_serializing_if = "Option::is_none")]
28    pub profile: Option<String>,
29
30    /// Domain context for relevance checking.
31    #[serde(skip_serializing_if = "Option::is_none")]
32    pub context: Option<VisionContext>,
33}
34
35/// Domain context for relevance analysis.
36#[derive(Debug, Clone, Serialize, Deserialize, Default)]
37pub struct VisionContext {
38    /// Installation type (e.g. "solar", "heat_pump", "ev_charger").
39    #[serde(skip_serializing_if = "Option::is_none")]
40    pub installation_type: Option<String>,
41
42    /// Phase (e.g. "pre_install", "installation", "post_install").
43    #[serde(skip_serializing_if = "Option::is_none")]
44    pub phase: Option<String>,
45
46    /// Expected items for relevance checking.
47    #[serde(skip_serializing_if = "Option::is_none")]
48    pub expected_items: Option<Vec<String>>,
49}
50
// ---------------------------------------------------------------------------
// Response
// ---------------------------------------------------------------------------
54
55/// Full vision analysis response.
56#[derive(Debug, Clone, Deserialize, Default)]
57pub struct VisionResponse {
58    /// Scene description.
59    #[serde(default)]
60    pub caption: Option<String>,
61
62    /// Suggested tags (lowercase_snake_case).
63    #[serde(default)]
64    pub tags: Vec<String>,
65
66    /// Detected objects with bounding boxes.
67    #[serde(default)]
68    pub objects: Vec<DetectedObject>,
69
70    /// Image quality assessment.
71    #[serde(default)]
72    pub quality: Option<QualityAssessment>,
73
74    /// Relevance check against context.
75    #[serde(default)]
76    pub relevance: Option<RelevanceCheck>,
77
78    /// Extracted text and overlay metadata.
79    #[serde(default)]
80    pub ocr: Option<OcrResult>,
81
82    /// Model used.
83    #[serde(default)]
84    pub model: String,
85
86    /// Cost in ticks.
87    #[serde(default)]
88    pub cost_ticks: i64,
89
90    /// Request identifier.
91    #[serde(default)]
92    pub request_id: String,
93}
94
95/// A detected object with bounding box.
96#[derive(Debug, Clone, Serialize, Deserialize, Default)]
97pub struct DetectedObject {
98    /// Object label.
99    pub label: String,
100
101    /// Detection confidence (0.0 - 1.0).
102    #[serde(default)]
103    pub confidence: f64,
104
105    /// Bounding box: [y_min, x_min, y_max, x_max] normalised to 0-1000.
106    #[serde(default)]
107    pub bounding_box: [i32; 4],
108}
109
110/// Image quality assessment.
111#[derive(Debug, Clone, Deserialize, Default)]
112pub struct QualityAssessment {
113    /// Overall rating: "good", "acceptable", "poor".
114    #[serde(default)]
115    pub overall: String,
116
117    /// Quality score (0.0 - 1.0).
118    #[serde(default)]
119    pub score: f64,
120
121    /// Blur level: "none", "slight", "significant".
122    #[serde(default)]
123    pub blur: String,
124
125    /// Lighting: "well_lit", "dim", "dark".
126    #[serde(default)]
127    pub darkness: String,
128
129    /// Resolution: "high", "adequate", "low".
130    #[serde(default)]
131    pub resolution: String,
132
133    /// Exposure: "correct", "over", "under".
134    #[serde(default)]
135    pub exposure: String,
136
137    /// Specific issues found.
138    #[serde(default)]
139    pub issues: Vec<String>,
140}
141
142/// Relevance check against expected content.
143#[derive(Debug, Clone, Deserialize, Default)]
144pub struct RelevanceCheck {
145    /// Whether the image is relevant to the context.
146    #[serde(default)]
147    pub relevant: bool,
148
149    /// Relevance score (0.0 - 1.0).
150    #[serde(default)]
151    pub score: f64,
152
153    /// Items expected based on context.
154    #[serde(default)]
155    pub expected_items: Vec<String>,
156
157    /// Items actually found in the image.
158    #[serde(default)]
159    pub found_items: Vec<String>,
160
161    /// Expected but not found.
162    #[serde(default)]
163    pub missing_items: Vec<String>,
164
165    /// Found but not expected.
166    #[serde(default)]
167    pub unexpected_items: Vec<String>,
168
169    /// Additional notes.
170    #[serde(default)]
171    pub notes: Option<String>,
172}
173
174/// OCR / text extraction result.
175#[derive(Debug, Clone, Deserialize, Default)]
176pub struct OcrResult {
177    /// All extracted text concatenated.
178    #[serde(default)]
179    pub text: Option<String>,
180
181    /// Extracted metadata (GPS, timestamp, address, etc.).
182    #[serde(default)]
183    pub metadata: HashMap<String, String>,
184
185    /// Individual text overlays with positions.
186    #[serde(default)]
187    pub overlays: Vec<TextOverlay>,
188}
189
190/// A detected text region in the image.
191#[derive(Debug, Clone, Deserialize, Default)]
192pub struct TextOverlay {
193    /// Extracted text content.
194    #[serde(default)]
195    pub text: String,
196
197    /// Bounding box: [y_min, x_min, y_max, x_max] normalised to 0-1000.
198    #[serde(default)]
199    pub bounding_box: Option<[i32; 4]>,
200
201    /// Overlay type: "gps", "timestamp", "address", "label", "other".
202    #[serde(rename = "type", default)]
203    pub overlay_type: Option<String>,
204}
205
// ---------------------------------------------------------------------------
// Client methods
// ---------------------------------------------------------------------------
209
210impl Client {
211    /// Full combined vision analysis (scene + objects + quality + OCR + relevance).
212    pub async fn vision_analyze(&self, req: &VisionRequest) -> Result<VisionResponse> {
213        let (resp, _meta) = self
214            .post_json::<VisionRequest, VisionResponse>("/qai/v1/vision/analyze", req)
215            .await?;
216        Ok(resp)
217    }
218
219    /// Object detection with bounding boxes.
220    pub async fn vision_detect(&self, req: &VisionRequest) -> Result<VisionResponse> {
221        let (resp, _meta) = self
222            .post_json::<VisionRequest, VisionResponse>("/qai/v1/vision/detect", req)
223            .await?;
224        Ok(resp)
225    }
226
227    /// Scene description and tags.
228    pub async fn vision_describe(&self, req: &VisionRequest) -> Result<VisionResponse> {
229        let (resp, _meta) = self
230            .post_json::<VisionRequest, VisionResponse>("/qai/v1/vision/describe", req)
231            .await?;
232        Ok(resp)
233    }
234
235    /// Text extraction and overlay metadata (OCR).
236    pub async fn vision_ocr(&self, req: &VisionRequest) -> Result<VisionResponse> {
237        let (resp, _meta) = self
238            .post_json::<VisionRequest, VisionResponse>("/qai/v1/vision/ocr", req)
239            .await?;
240        Ok(resp)
241    }
242
243    /// Image quality assessment.
244    pub async fn vision_quality(&self, req: &VisionRequest) -> Result<VisionResponse> {
245        let (resp, _meta) = self
246            .post_json::<VisionRequest, VisionResponse>("/qai/v1/vision/quality", req)
247            .await?;
248        Ok(resp)
249    }
250}