Skip to main content

car_inference/tasks/
grounding.rs

1//! Visual grounding — extract structured object-localization output
2//! from a VL model's text response.
3//!
4//! Qwen2.5-VL is trained to emit bounding boxes inline in its text
5//! using Qwen-specific delimiters:
6//!
7//! ```text
8//! <|object_ref_start|>person<|object_ref_end|><|box_start|>(123,456),(789,1011)<|box_end|>
9//! ```
10//!
11//! The two spans pair 1:1: every `<|object_ref_*|>` label binds to the
12//! following `<|box_*|>` coordinate pair (xyxy, top-left + bottom-right
13//! in pixel space of the *input image*). Some outputs omit the label
14//! span when the user's prompt already named the object.
15//!
16//! This module provides a request/result surface for "ask a VL model
17//! to localize things" and a parser that turns Qwen's text output into
18//! `Vec<BoundingBox>`.
19
20use crate::tasks::generate::ContentBlock;
21use serde::{Deserialize, Serialize};
22
23/// A request for structured visual grounding.
24///
25/// Unlike [`crate::GenerateRequest`], this is the dedicated entry
26/// point for localization tasks — callers get back typed
27/// [`BoundingBox`]es rather than having to grep them out of a text
28/// response. Internally it still runs through the generate path and
29/// parses Qwen-style spans from the output; the dedicated type exists
30/// so callers can express intent explicitly and so future routing can
31/// prefer models with the `Grounding` capability.
32#[derive(Debug, Clone, Serialize, Deserialize)]
33pub struct GroundRequest {
34    /// Image content to ground against. Exactly one image is expected
35    /// — multi-image grounding is provider-specific and not yet
36    /// standardized across VL models.
37    pub image: ContentBlock,
38    /// Free-form text prompt describing what to localize
39    /// (e.g. "Find all the street signs").
40    pub prompt: String,
41    /// Optional model override. When unset, routes to the preferred
42    /// model declaring the `Grounding` capability.
43    #[serde(default)]
44    pub model: Option<String>,
45    /// Optional explicit label list. When provided, callers can
46    /// post-filter the returned boxes to these label names. Not
47    /// currently sent to the model (Qwen2.5-VL derives labels from
48    /// the prompt); reserved for providers that accept a controlled
49    /// vocabulary on the request.
50    #[serde(default)]
51    pub labels: Option<Vec<String>>,
52}
53
54/// Result of a [`GroundRequest`].
55#[derive(Debug, Clone, Serialize, Deserialize)]
56pub struct GroundResult {
57    /// Detected objects with their pixel-space xyxy boxes.
58    pub boxes: Vec<BoundingBox>,
59    /// Raw model output text, preserved so callers can see any
60    /// non-box narration the model produced (e.g. "I can't see any
61    /// street signs in this image").
62    pub raw_text: String,
63    /// Name of the model that produced this result, when known.
64    #[serde(default, skip_serializing_if = "Option::is_none")]
65    pub model_used: Option<String>,
66}
67
68/// A single detection emitted by a VL model's grounding head.
69///
70/// Coordinates are in *pixel space of the input image*, following
71/// Qwen2.5-VL's **inclusive xyxy** convention — the same as COCO and
72/// standard detection reference implementations. Both corners are
73/// inclusive: width is `x2 - x1 + 1`, height is `y2 - y1 + 1`.
74/// `label` is the model-supplied object reference; may be empty when
75/// the prompt already named the subject ("find the dog").
76#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
77pub struct BoundingBox {
78    /// Inclusive left pixel.
79    pub x1: u32,
80    /// Inclusive top pixel.
81    pub y1: u32,
82    /// Inclusive right pixel (width = `x2 - x1 + 1`).
83    pub x2: u32,
84    /// Inclusive bottom pixel (height = `y2 - y1 + 1`).
85    pub y2: u32,
86    /// Object label the model attached to this box. Empty when no
87    /// `<|object_ref_*|>` span preceded the box.
88    #[serde(default, skip_serializing_if = "String::is_empty")]
89    pub label: String,
90    /// Model-reported confidence in `[0.0, 1.0]` when available.
91    /// Qwen2.5-VL does not currently emit per-box confidences inline;
92    /// this field exists for forward compat with richer backends.
93    #[serde(default, skip_serializing_if = "Option::is_none")]
94    pub confidence: Option<f32>,
95}
96
97impl BoundingBox {
98    /// Pixel width of this box. Inclusive xyxy convention means a
99    /// single-pixel box has width 1, not 0, so we add 1.
100    pub fn width(&self) -> u32 {
101        self.x2.saturating_sub(self.x1).saturating_add(1)
102    }
103
104    /// Pixel height of this box. Same inclusive convention as [`width`].
105    pub fn height(&self) -> u32 {
106        self.y2.saturating_sub(self.y1).saturating_add(1)
107    }
108
109    /// Pixel area. Convenience for ranking / IoU callers.
110    pub fn area(&self) -> u64 {
111        self.width() as u64 * self.height() as u64
112    }
113}
114
115/// Parse Qwen2.5-VL grounding spans out of a model text response.
116///
117/// Recognizes the upstream delimiter pairs
118/// (`<|object_ref_start|>...<|object_ref_end|>` followed by
119/// `<|box_start|>(x1,y1),(x2,y2)<|box_end|>`) and returns one
120/// [`BoundingBox`] per `<|box_*|>` span. Returns an empty vec when
121/// the output contains no boxes — typical for "just describe"
122/// prompts.
123///
124/// Robust to the common variant where the model emits a bare
125/// `<|box_*|>` span without a preceding label (the box gets an empty
126/// `label`), and to extra whitespace inside the coordinate tuple.
127pub fn parse_boxes(text: &str) -> Vec<BoundingBox> {
128    const LABEL_OPEN: &str = "<|object_ref_start|>";
129    const LABEL_CLOSE: &str = "<|object_ref_end|>";
130    const BOX_OPEN: &str = "<|box_start|>";
131    const BOX_CLOSE: &str = "<|box_end|>";
132
133    // Fast-path: plain text responses (the overwhelming majority of
134    // generate calls) have zero `<|box_start|>` occurrences. Bail
135    // before allocating the output Vec so the ungated parser cost
136    // on the generate hot path is a single `str::find`.
137    if !text.contains(BOX_OPEN) {
138        return Vec::new();
139    }
140
141    let mut out = Vec::new();
142    let mut rest = text;
143    loop {
144        let box_pos = match rest.find(BOX_OPEN) {
145            Some(p) => p,
146            None => break,
147        };
148        // Everything before this box, look backward for a label span.
149        let before = &rest[..box_pos];
150        let label = if let Some(lo) = before.rfind(LABEL_OPEN) {
151            if let Some(lc) = before[lo + LABEL_OPEN.len()..].find(LABEL_CLOSE) {
152                let start = lo + LABEL_OPEN.len();
153                let end = start + lc;
154                before[start..end].trim().to_string()
155            } else {
156                String::new()
157            }
158        } else {
159            String::new()
160        };
161
162        let coord_start = box_pos + BOX_OPEN.len();
163        let close_rel = match rest[coord_start..].find(BOX_CLOSE) {
164            Some(p) => p,
165            None => break, // Unterminated box — bail.
166        };
167        let coord_body = &rest[coord_start..coord_start + close_rel];
168        if let Some(b) = parse_coord_pair(coord_body, label) {
169            out.push(b);
170        }
171        rest = &rest[coord_start + close_rel + BOX_CLOSE.len()..];
172    }
173    out
174}
175
176/// Parse `(x1,y1),(x2,y2)` — tolerant of whitespace inside and around
177/// the pairs. Returns `None` if any coordinate fails to parse.
178fn parse_coord_pair(s: &str, label: String) -> Option<BoundingBox> {
179    // Collect numeric tokens in order; first four are x1,y1,x2,y2.
180    let mut nums = Vec::with_capacity(4);
181    let mut cur = String::new();
182    for ch in s.chars() {
183        if ch.is_ascii_digit() {
184            cur.push(ch);
185        } else if !cur.is_empty() {
186            nums.push(cur.parse::<u32>().ok()?);
187            cur.clear();
188            if nums.len() == 4 {
189                break;
190            }
191        }
192    }
193    if !cur.is_empty() && nums.len() < 4 {
194        nums.push(cur.parse::<u32>().ok()?);
195    }
196    if nums.len() < 4 {
197        return None;
198    }
199    Some(BoundingBox {
200        x1: nums[0],
201        y1: nums[1],
202        x2: nums[2],
203        y2: nums[3],
204        label,
205        confidence: None,
206    })
207}
208
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a box with no label and no confidence, for geometry checks.
    fn unlabeled(x1: u32, y1: u32, x2: u32, y2: u32) -> BoundingBox {
        BoundingBox {
            x1,
            y1,
            x2,
            y2,
            label: String::new(),
            confidence: None,
        }
    }

    /// The four corner coordinates of a box as one comparable tuple.
    fn corners(b: &BoundingBox) -> (u32, u32, u32, u32) {
        (b.x1, b.y1, b.x2, b.y2)
    }

    /// The labels of all boxes, in output order.
    fn labels(boxes: &[BoundingBox]) -> Vec<&str> {
        boxes.iter().map(|b| b.label.as_str()).collect()
    }

    #[test]
    fn empty_text_yields_no_boxes() {
        for text in ["", "just a plain description"].iter() {
            assert!(parse_boxes(text).is_empty());
        }
    }

    #[test]
    fn single_labeled_box_parses() {
        let boxes = parse_boxes(
            "<|object_ref_start|>dog<|object_ref_end|><|box_start|>(10,20),(110,220)<|box_end|>",
        );
        assert_eq!(boxes.len(), 1);
        let only = &boxes[0];
        assert_eq!(only.label, "dog");
        assert_eq!(corners(only), (10, 20, 110, 220));
    }

    #[test]
    fn box_without_label_has_empty_label() {
        let boxes = parse_boxes("The object is at <|box_start|>(1,2),(3,4)<|box_end|> pixels.");
        assert_eq!(boxes.len(), 1);
        assert!(boxes[0].label.is_empty());
    }

    #[test]
    fn multiple_boxes_pair_with_preceding_labels() {
        let text = concat!(
            "<|object_ref_start|>a<|object_ref_end|><|box_start|>(0,0),(1,1)<|box_end|> ",
            "<|object_ref_start|>b<|object_ref_end|><|box_start|>(2,2),(3,3)<|box_end|>",
        );
        // Comparing against a two-element array also pins the box count.
        assert_eq!(labels(&parse_boxes(text)), ["a", "b"]);
    }

    #[test]
    fn whitespace_inside_coordinate_tuple_is_tolerated() {
        let boxes = parse_boxes("<|box_start|>( 10 , 20 ),( 30 , 40 )<|box_end|>");
        assert_eq!(boxes.len(), 1);
        assert_eq!((boxes[0].x1, boxes[0].x2), (10, 30));
    }

    #[test]
    fn unterminated_box_is_dropped_not_crashing() {
        let boxes = parse_boxes("<|box_start|>(1,2),(3,4 -- and then no close tag");
        assert!(boxes.is_empty());
    }

    #[test]
    fn malformed_coords_yield_no_box() {
        // Only three numbers inside the span, so it must be dropped.
        let boxes = parse_boxes("<|box_start|>(1,2,3)<|box_end|>");
        assert!(boxes.is_empty());
    }

    #[test]
    fn inclusive_xyxy_semantics_width_height_area() {
        // Qwen2.5-VL emits inclusive xyxy, so a box whose corners
        // coincide is one pixel wide and tall, not zero. (Neo flagged
        // the earlier exclusive-variant docs as a systematic -1 drift
        // on every downstream metric.)
        let single = unlabeled(10, 20, 10, 20);
        assert_eq!(
            (single.width(), single.height(), single.area()),
            (1, 1, 1)
        );

        let wide = unlabeled(0, 0, 99, 49);
        assert_eq!(
            (wide.width(), wide.height(), wide.area()),
            (100, 50, 5_000)
        );
    }

    #[test]
    fn boxes_mixed_with_prose_parse_cleanly() {
        let text = concat!(
            "Here is a picture of a dog <|object_ref_start|>Labrador<|object_ref_end|>",
            "<|box_start|>(100,200),(300,400)<|box_end|> and its toy ",
            "<|object_ref_start|>ball<|object_ref_end|><|box_start|>(10,20),(40,50)<|box_end|>.",
        );
        assert_eq!(labels(&parse_boxes(text)), ["Labrador", "ball"]);
    }
}