// car_inference/tasks/grounding.rs
1//! Visual grounding — extract structured object-localization output
2//! from a VL model's text response.
3//!
4//! Qwen2.5-VL is trained to emit bounding boxes inline in its text
5//! using Qwen-specific delimiters:
6//!
7//! ```text
8//! <|object_ref_start|>person<|object_ref_end|><|box_start|>(123,456),(789,1011)<|box_end|>
9//! ```
10//!
11//! The two spans pair 1:1: every `<|object_ref_*|>` label binds to the
12//! following `<|box_*|>` coordinate pair (xyxy, top-left + bottom-right
13//! in pixel space of the *input image*). Some outputs omit the label
14//! span when the user's prompt already named the object.
15//!
16//! This module provides a request/result surface for "ask a VL model
17//! to localize things" and a parser that turns Qwen's text output into
18//! `Vec<BoundingBox>`.
19
20use crate::tasks::generate::ContentBlock;
21use serde::{Deserialize, Serialize};
22
/// A request for structured visual grounding.
///
/// Unlike [`crate::GenerateRequest`], this is the dedicated entry
/// point for localization tasks — callers get back typed
/// [`BoundingBox`]es rather than having to grep them out of a text
/// response. Internally it still runs through the generate path and
/// parses Qwen-style spans from the output; the dedicated type exists
/// so callers can express intent explicitly and so future routing can
/// prefer models with the `Grounding` capability.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GroundRequest {
    /// Image content to ground against. Exactly one image is expected
    /// — multi-image grounding is provider-specific and not yet
    /// standardized across VL models.
    pub image: ContentBlock,
    /// Free-form text prompt describing what to localize
    /// (e.g. "Find all the street signs").
    pub prompt: String,
    /// Optional model override. When unset (the serde default), the
    /// request routes to the preferred model declaring the
    /// `Grounding` capability.
    #[serde(default)]
    pub model: Option<String>,
    /// Optional explicit label list. When provided, callers can
    /// post-filter the returned boxes to these label names. Not
    /// currently sent to the model (Qwen2.5-VL derives labels from
    /// the prompt); reserved for providers that accept a controlled
    /// vocabulary on the request.
    #[serde(default)]
    pub labels: Option<Vec<String>>,
}
53
/// Result of a [`GroundRequest`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GroundResult {
    /// Detected objects with their pixel-space xyxy boxes. Empty when
    /// the model emitted no box spans (e.g. a purely descriptive
    /// answer).
    pub boxes: Vec<BoundingBox>,
    /// Raw model output text, preserved so callers can see any
    /// non-box narration the model produced (e.g. "I can't see any
    /// street signs in this image").
    pub raw_text: String,
    /// Name of the model that produced this result, when known.
    /// Omitted from serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub model_used: Option<String>,
}
67
/// A single detection emitted by a VL model's grounding head.
///
/// Coordinates are in *pixel space of the input image*, using an
/// **inclusive xyxy** convention: both corners belong to the box, so
/// width is `x2 - x1 + 1`, height is `y2 - y1 + 1`, and a box with
/// `x1 == x2` is one pixel wide rather than empty. This matches the
/// `width`/`height` accessors and the unit tests in this module.
/// NOTE(review): confirm the inclusive convention against the
/// emitting model before mixing these boxes with exclusive-corner or
/// xywh (COCO-style) pipelines — off-by-one drift is easy here.
/// `label` is the model-supplied object reference; may be empty when
/// the prompt already named the subject ("find the dog").
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct BoundingBox {
    /// Inclusive left pixel.
    pub x1: u32,
    /// Inclusive top pixel.
    pub y1: u32,
    /// Inclusive right pixel (width = `x2 - x1 + 1`).
    pub x2: u32,
    /// Inclusive bottom pixel (height = `y2 - y1 + 1`).
    pub y2: u32,
    /// Object label the model attached to this box. Empty when no
    /// `<|object_ref_*|>` span preceded the box; skipped during
    /// serialization in that case.
    #[serde(default, skip_serializing_if = "String::is_empty")]
    pub label: String,
    /// Model-reported confidence in `[0.0, 1.0]` when available.
    /// Qwen2.5-VL does not currently emit per-box confidences inline;
    /// this field exists for forward compat with richer backends.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub confidence: Option<f32>,
}
96
97impl BoundingBox {
98 /// Pixel width of this box. Inclusive xyxy convention means a
99 /// single-pixel box has width 1, not 0, so we add 1.
100 pub fn width(&self) -> u32 {
101 self.x2.saturating_sub(self.x1).saturating_add(1)
102 }
103
104 /// Pixel height of this box. Same inclusive convention as [`width`].
105 pub fn height(&self) -> u32 {
106 self.y2.saturating_sub(self.y1).saturating_add(1)
107 }
108
109 /// Pixel area. Convenience for ranking / IoU callers.
110 pub fn area(&self) -> u64 {
111 self.width() as u64 * self.height() as u64
112 }
113}
114
115/// Parse Qwen2.5-VL grounding spans out of a model text response.
116///
117/// Recognizes the upstream delimiter pairs
118/// (`<|object_ref_start|>...<|object_ref_end|>` followed by
119/// `<|box_start|>(x1,y1),(x2,y2)<|box_end|>`) and returns one
120/// [`BoundingBox`] per `<|box_*|>` span. Returns an empty vec when
121/// the output contains no boxes — typical for "just describe"
122/// prompts.
123///
124/// Robust to the common variant where the model emits a bare
125/// `<|box_*|>` span without a preceding label (the box gets an empty
126/// `label`), and to extra whitespace inside the coordinate tuple.
127pub fn parse_boxes(text: &str) -> Vec<BoundingBox> {
128 const LABEL_OPEN: &str = "<|object_ref_start|>";
129 const LABEL_CLOSE: &str = "<|object_ref_end|>";
130 const BOX_OPEN: &str = "<|box_start|>";
131 const BOX_CLOSE: &str = "<|box_end|>";
132
133 // Fast-path: plain text responses (the overwhelming majority of
134 // generate calls) have zero `<|box_start|>` occurrences. Bail
135 // before allocating the output Vec so the ungated parser cost
136 // on the generate hot path is a single `str::find`.
137 if !text.contains(BOX_OPEN) {
138 return Vec::new();
139 }
140
141 let mut out = Vec::new();
142 let mut rest = text;
143 loop {
144 let box_pos = match rest.find(BOX_OPEN) {
145 Some(p) => p,
146 None => break,
147 };
148 // Everything before this box, look backward for a label span.
149 let before = &rest[..box_pos];
150 let label = if let Some(lo) = before.rfind(LABEL_OPEN) {
151 if let Some(lc) = before[lo + LABEL_OPEN.len()..].find(LABEL_CLOSE) {
152 let start = lo + LABEL_OPEN.len();
153 let end = start + lc;
154 before[start..end].trim().to_string()
155 } else {
156 String::new()
157 }
158 } else {
159 String::new()
160 };
161
162 let coord_start = box_pos + BOX_OPEN.len();
163 let close_rel = match rest[coord_start..].find(BOX_CLOSE) {
164 Some(p) => p,
165 None => break, // Unterminated box — bail.
166 };
167 let coord_body = &rest[coord_start..coord_start + close_rel];
168 if let Some(b) = parse_coord_pair(coord_body, label) {
169 out.push(b);
170 }
171 rest = &rest[coord_start + close_rel + BOX_CLOSE.len()..];
172 }
173 out
174}
175
176/// Parse `(x1,y1),(x2,y2)` — tolerant of whitespace inside and around
177/// the pairs. Returns `None` if any coordinate fails to parse.
178fn parse_coord_pair(s: &str, label: String) -> Option<BoundingBox> {
179 // Collect numeric tokens in order; first four are x1,y1,x2,y2.
180 let mut nums = Vec::with_capacity(4);
181 let mut cur = String::new();
182 for ch in s.chars() {
183 if ch.is_ascii_digit() {
184 cur.push(ch);
185 } else if !cur.is_empty() {
186 nums.push(cur.parse::<u32>().ok()?);
187 cur.clear();
188 if nums.len() == 4 {
189 break;
190 }
191 }
192 }
193 if !cur.is_empty() && nums.len() < 4 {
194 nums.push(cur.parse::<u32>().ok()?);
195 }
196 if nums.len() < 4 {
197 return None;
198 }
199 Some(BoundingBox {
200 x1: nums[0],
201 y1: nums[1],
202 x2: nums[2],
203 y2: nums[3],
204 label,
205 confidence: None,
206 })
207}
208
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn empty_text_yields_no_boxes() {
        assert!(parse_boxes("").is_empty());
        assert!(parse_boxes("just a plain description").is_empty());
    }

    #[test]
    fn single_labeled_box_parses() {
        let text =
            "<|object_ref_start|>dog<|object_ref_end|><|box_start|>(10,20),(110,220)<|box_end|>";
        let boxes = parse_boxes(text);
        assert_eq!(boxes.len(), 1);
        assert_eq!(boxes[0].label, "dog");
        assert_eq!(boxes[0].x1, 10);
        assert_eq!(boxes[0].y1, 20);
        assert_eq!(boxes[0].x2, 110);
        assert_eq!(boxes[0].y2, 220);
    }

    #[test]
    fn box_without_label_has_empty_label() {
        let text = "The object is at <|box_start|>(1,2),(3,4)<|box_end|> pixels.";
        let boxes = parse_boxes(text);
        assert_eq!(boxes.len(), 1);
        assert!(boxes[0].label.is_empty());
    }

    #[test]
    fn multiple_boxes_pair_with_preceding_labels() {
        let text = "<|object_ref_start|>a<|object_ref_end|><|box_start|>(0,0),(1,1)<|box_end|> \
            <|object_ref_start|>b<|object_ref_end|><|box_start|>(2,2),(3,3)<|box_end|>";
        let boxes = parse_boxes(text);
        assert_eq!(boxes.len(), 2);
        assert_eq!(boxes[0].label, "a");
        assert_eq!(boxes[1].label, "b");
    }

    #[test]
    fn whitespace_inside_coordinate_tuple_is_tolerated() {
        let text = "<|box_start|>( 10 , 20 ),( 30 , 40 )<|box_end|>";
        let boxes = parse_boxes(text);
        assert_eq!(boxes.len(), 1);
        assert_eq!(boxes[0].x1, 10);
        assert_eq!(boxes[0].x2, 30);
    }

    #[test]
    fn unterminated_box_is_dropped_not_crashing() {
        let text = "<|box_start|>(1,2),(3,4 -- and then no close tag";
        assert!(parse_boxes(text).is_empty());
    }

    #[test]
    fn malformed_coords_yield_no_box() {
        // Too few numbers — drop this one.
        let text = "<|box_start|>(1,2,3)<|box_end|>";
        assert!(parse_boxes(text).is_empty());
    }

    #[test]
    fn inclusive_xyxy_semantics_width_height_area() {
        // Qwen2.5-VL emits inclusive xyxy: a single-pixel box means
        // width=1, not 0. Guards against the systematic -1 drift an
        // exclusive-corner reading would introduce downstream.
        let single = BoundingBox {
            x1: 10,
            y1: 20,
            x2: 10,
            y2: 20,
            label: String::new(),
            confidence: None,
        };
        assert_eq!(single.width(), 1);
        assert_eq!(single.height(), 1);
        assert_eq!(single.area(), 1);

        let wide = BoundingBox {
            x1: 0,
            y1: 0,
            x2: 99,
            y2: 49,
            label: String::new(),
            confidence: None,
        };
        assert_eq!(wide.width(), 100);
        assert_eq!(wide.height(), 50);
        assert_eq!(wide.area(), 5_000);
    }

    #[test]
    fn boxes_mixed_with_prose_parse_cleanly() {
        let text = "Here is a picture of a dog <|object_ref_start|>Labrador<|object_ref_end|>\
            <|box_start|>(100,200),(300,400)<|box_end|> and its toy \
            <|object_ref_start|>ball<|object_ref_end|><|box_start|>(10,20),(40,50)<|box_end|>.";
        let boxes = parse_boxes(text);
        assert_eq!(boxes.len(), 2);
        assert_eq!(boxes[0].label, "Labrador");
        assert_eq!(boxes[1].label, "ball");
    }
}
312}