// car-inference 0.14.0
//
// Local model inference for CAR — Candle backend with Qwen3 models
// Documentation
//! Visual grounding — extract structured object-localization output
//! from a VL model's text response.
//!
//! Qwen2.5-VL is trained to emit bounding boxes inline in its text
//! using Qwen-specific delimiters:
//!
//! ```text
//! <|object_ref_start|>person<|object_ref_end|><|box_start|>(123,456),(789,1011)<|box_end|>
//! ```
//!
//! The two spans pair 1:1: every `<|object_ref_*|>` label binds to the
//! following `<|box_*|>` coordinate pair (xyxy, top-left + bottom-right
//! in pixel space of the *input image*). Some outputs omit the label
//! span when the user's prompt already named the object.
//!
//! This module provides a request/result surface for "ask a VL model
//! to localize things" and a parser that turns Qwen's text output into
//! `Vec<BoundingBox>`.

use crate::tasks::generate::ContentBlock;
use serde::{Deserialize, Serialize};

/// A request for structured visual grounding.
///
/// Unlike [`crate::GenerateRequest`], this is the dedicated entry
/// point for localization tasks — callers get back typed
/// [`BoundingBox`]es rather than having to grep them out of a text
/// response. Internally it still runs through the generate path and
/// parses Qwen-style spans from the output; the dedicated type exists
/// so callers can express intent explicitly and so future routing can
/// prefer models with the `Grounding` capability.
///
/// `model` and `labels` are marked `#[serde(default)]`, so a
/// serialized request may omit either one and it deserializes to
/// `None`. `image` and `prompt` carry no default and must be present.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GroundRequest {
    /// Image content to ground against. Exactly one image is expected
    /// — multi-image grounding is provider-specific and not yet
    /// standardized across VL models.
    // NOTE(review): the type is a general `ContentBlock`; nothing in
    // this struct restricts it to an image variant — confirm where
    // that is validated.
    pub image: ContentBlock,
    /// Free-form text prompt describing what to localize
    /// (e.g. "Find all the street signs").
    pub prompt: String,
    /// Optional model override. When unset, routes to the preferred
    /// model declaring the `Grounding` capability.
    #[serde(default)]
    pub model: Option<String>,
    /// Optional explicit label list. When provided, callers can
    /// post-filter the returned boxes to these label names. Not
    /// currently sent to the model (Qwen2.5-VL derives labels from
    /// the prompt); reserved for providers that accept a controlled
    /// vocabulary on the request.
    #[serde(default)]
    pub labels: Option<Vec<String>>,
}

/// Result of a [`GroundRequest`].
///
/// Carries the typed detections alongside the unmodified model text,
/// so a caller that receives an empty `boxes` list can still inspect
/// what the model actually said.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GroundResult {
    /// Detected objects with their pixel-space xyxy boxes.
    pub boxes: Vec<BoundingBox>,
    /// Raw model output text, preserved so callers can see any
    /// non-box narration the model produced (e.g. "I can't see any
    /// street signs in this image").
    pub raw_text: String,
    /// Name of the model that produced this result, when known.
    /// Omitted from the serialized form when `None`, and read back as
    /// `None` when the key is absent.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub model_used: Option<String>,
}

/// A single detection emitted by a VL model's grounding head.
///
/// Coordinates are in *pixel space of the input image*, following
/// Qwen2.5-VL's **inclusive xyxy** convention — the same as COCO and
/// standard detection reference implementations. Both corners are
/// inclusive: width is `x2 - x1 + 1`, height is `y2 - y1 + 1`.
/// `label` is the model-supplied object reference; may be empty when
/// the prompt already named the subject ("find the dog").
///
/// The fields are plain `u32`s: the type itself does not enforce
/// `x1 <= x2` or `y1 <= y2`, nor that the box lies inside the image.
// NOTE(review): COCO annotation files store boxes as
// `[x, y, width, height]`, not xyxy — confirm which convention the
// "same as COCO" remark above refers to.
// NOTE(review): Qwen2-VL documents `<|box_start|>` coordinates as
// normalized to a fixed range rather than absolute pixels — confirm
// the values reaching this type are already in image pixels.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct BoundingBox {
    /// Inclusive left pixel.
    pub x1: u32,
    /// Inclusive top pixel.
    pub y1: u32,
    /// Inclusive right pixel (width = `x2 - x1 + 1`).
    pub x2: u32,
    /// Inclusive bottom pixel (height = `y2 - y1 + 1`).
    pub y2: u32,
    /// Object label the model attached to this box. Empty when no
    /// `<|object_ref_*|>` span preceded the box. An empty label is
    /// left out of the serialized form and restored as `""` on read.
    #[serde(default, skip_serializing_if = "String::is_empty")]
    pub label: String,
    /// Model-reported confidence in `[0.0, 1.0]` when available.
    /// Qwen2.5-VL does not currently emit per-box confidences inline;
    /// this field exists for forward compat with richer backends.
    /// Omitted from the serialized form when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub confidence: Option<f32>,
}

impl BoundingBox {
    /// Count of pixels in the inclusive range `lo..=hi`.
    ///
    /// An inverted range (`hi < lo`) clamps the difference to zero and
    /// therefore reports 1; a range spanning all of `u32` saturates at
    /// `u32::MAX` instead of overflowing.
    fn inclusive_extent(lo: u32, hi: u32) -> u32 {
        match hi.checked_sub(lo) {
            Some(delta) => delta.saturating_add(1),
            None => 1,
        }
    }

    /// Horizontal extent in pixels. Both edges are inclusive, so a box
    /// with `x1 == x2` is one pixel wide, not zero.
    pub fn width(&self) -> u32 {
        Self::inclusive_extent(self.x1, self.x2)
    }

    /// Vertical extent in pixels, using the same inclusive rule as
    /// [`Self::width`].
    pub fn height(&self) -> u32 {
        Self::inclusive_extent(self.y1, self.y2)
    }

    /// Pixel area (`width * height`), widened to `u64` so the product
    /// of two `u32` extents cannot overflow. Handy for ranking and IoU.
    pub fn area(&self) -> u64 {
        u64::from(self.width()) * u64::from(self.height())
    }
}

/// Parse Qwen2.5-VL grounding spans out of a model text response.
///
/// Recognizes the upstream delimiter pairs
/// (`<|object_ref_start|>...<|object_ref_end|>` followed by
/// `<|box_start|>(x1,y1),(x2,y2)<|box_end|>`) and returns one
/// [`BoundingBox`] per well-formed `<|box_*|>` span, in order of
/// appearance. Returns an empty vec when the output contains no
/// boxes — typical for "just describe" prompts.
///
/// Tolerated variants:
///
/// * A bare `<|box_*|>` span without a preceding label — the box gets
///   an empty `label`.
/// * Extra whitespace inside the coordinate tuple.
/// * A span whose coordinates do not parse — that span is skipped and
///   scanning continues after its closing tag.
/// * A `<|box_start|>` that is never closed — if a later, complete
///   span follows, the dangling opener is discarded and the complete
///   span is parsed on its own; if nothing closes it, scanning stops.
///
/// A label binds to a box only when its complete
/// `<|object_ref_*|>` span sits between the previous box (or any
/// dangling opener) and the box's own `<|box_start|>`.
pub fn parse_boxes(text: &str) -> Vec<BoundingBox> {
    const LABEL_OPEN: &str = "<|object_ref_start|>";
    const LABEL_CLOSE: &str = "<|object_ref_end|>";
    const BOX_OPEN: &str = "<|box_start|>";
    const BOX_CLOSE: &str = "<|box_end|>";

    // `Vec::new` does not allocate, so plain-text responses (the
    // overwhelming majority on the generate hot path) cost exactly one
    // failed `str::find` and nothing else.
    let mut out = Vec::new();
    let mut rest = text;
    while let Some(first_open) = rest.find(BOX_OPEN) {
        let after_first_open = first_open + BOX_OPEN.len();
        let body_end = match rest[after_first_open..].find(BOX_CLOSE) {
            Some(rel) => after_first_open + rel,
            None => break, // Nothing ever closes this box — bail.
        };

        // Resync: if another opener appears before the closer, every
        // earlier opener was left dangling. Pair the closer with the
        // *last* opener so digits from the truncated span cannot leak
        // into this box's coordinates.
        let body_start = match rest[after_first_open..body_end].rfind(BOX_OPEN) {
            Some(rel) => after_first_open + rel + BOX_OPEN.len(),
            None => after_first_open,
        };
        let open_at = body_start - BOX_OPEN.len();

        // Only text after the nearest dangling opener may supply the
        // label; without a resync there is no earlier opener in `rest`
        // and the window is everything before the box.
        let window_start = rest[..open_at]
            .rfind(BOX_OPEN)
            .map_or(0, |p| p + BOX_OPEN.len());
        let window = &rest[window_start..open_at];
        let label = window
            .rfind(LABEL_OPEN)
            .map(|lo| &window[lo + LABEL_OPEN.len()..])
            .and_then(|tail| {
                tail.find(LABEL_CLOSE)
                    .map(|lc| tail[..lc].trim().to_string())
            })
            .unwrap_or_default();

        if let Some(b) = parse_coord_pair(&rest[body_start..body_end], label) {
            out.push(b);
        }
        // Continue after this closer so a label is never reused by a
        // later box.
        rest = &rest[body_end + BOX_CLOSE.len()..];
    }
    out
}

/// Parse `(x1,y1),(x2,y2)` into a [`BoundingBox`] carrying `label`.
///
/// The body is split into numeric tokens; the first four, in order,
/// become `x1`, `y1`, `x2`, `y2`. Parentheses, commas, whitespace and
/// any other non-numeric characters act purely as separators, so
/// spacing inside and around the pairs is tolerated. Tokens beyond the
/// fourth are ignored. `confidence` is always `None`.
///
/// Returns `None` when fewer than four numbers are present, or when
/// any of the first four is not a plain `u32` — a value that overflows
/// `u32`, a decimal such as `1.5`, or a negative such as `-7`. Those
/// are rejected rather than being split into unrelated digit runs,
/// which would silently produce a box with the wrong coordinates.
fn parse_coord_pair(s: &str, label: String) -> Option<BoundingBox> {
    // Keep sign and decimal-point characters attached to their digits
    // so "1.5" or "-7" stays one token and fails `u32` parsing as a
    // whole. Tokens with no digit at all (a stray "-" or ".") are just
    // punctuation and are skipped.
    let mut tokens = s
        .split(|c: char| !(c.is_ascii_digit() || c == '.' || c == '-' || c == '+'))
        .filter(|tok| tok.bytes().any(|b| b.is_ascii_digit()));

    let mut coords = [0u32; 4];
    for slot in coords.iter_mut() {
        // The first `?` covers "ran out of numbers"; the second covers
        // "token is not a valid u32".
        *slot = tokens.next()?.parse::<u32>().ok()?;
    }

    let [x1, y1, x2, y2] = coords;
    Some(BoundingBox {
        x1,
        y1,
        x2,
        y2,
        label,
        confidence: None,
    })
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Unlabeled box without a confidence, for the geometry checks.
    fn bare_box(x1: u32, y1: u32, x2: u32, y2: u32) -> BoundingBox {
        BoundingBox {
            x1,
            y1,
            x2,
            y2,
            label: String::new(),
            confidence: None,
        }
    }

    /// Labels of `boxes`, in order, borrowed for compact comparison.
    fn labels(boxes: &[BoundingBox]) -> Vec<&str> {
        boxes.iter().map(|b| b.label.as_str()).collect()
    }

    #[test]
    fn empty_text_yields_no_boxes() {
        // Neither an empty string nor box-free prose produces output.
        let inputs = ["", "just a plain description"];
        for text in inputs.iter() {
            assert!(parse_boxes(text).is_empty());
        }
    }

    #[test]
    fn single_labeled_box_parses() {
        let boxes = parse_boxes(
            "<|object_ref_start|>dog<|object_ref_end|><|box_start|>(10,20),(110,220)<|box_end|>",
        );
        assert_eq!(boxes.len(), 1);
        let only = &boxes[0];
        assert_eq!(only.label, "dog");
        assert_eq!((only.x1, only.y1, only.x2, only.y2), (10, 20, 110, 220));
    }

    #[test]
    fn box_without_label_has_empty_label() {
        let boxes = parse_boxes("The object is at <|box_start|>(1,2),(3,4)<|box_end|> pixels.");
        assert_eq!(boxes.len(), 1);
        assert!(boxes[0].label.is_empty());
    }

    #[test]
    fn multiple_boxes_pair_with_preceding_labels() {
        let text = "<|object_ref_start|>a<|object_ref_end|><|box_start|>(0,0),(1,1)<|box_end|> \
                    <|object_ref_start|>b<|object_ref_end|><|box_start|>(2,2),(3,3)<|box_end|>";
        let boxes = parse_boxes(text);
        // Each label must land on the box that directly follows it.
        assert_eq!(labels(&boxes), ["a", "b"]);
    }

    #[test]
    fn whitespace_inside_coordinate_tuple_is_tolerated() {
        let boxes = parse_boxes("<|box_start|>( 10 , 20 ),( 30 , 40 )<|box_end|>");
        assert_eq!(boxes.len(), 1);
        assert_eq!((boxes[0].x1, boxes[0].x2), (10, 30));
    }

    #[test]
    fn unterminated_box_is_dropped_not_crashing() {
        // No `<|box_end|>` anywhere: the parser must give up quietly.
        let boxes = parse_boxes("<|box_start|>(1,2),(3,4 -- and then no close tag");
        assert!(boxes.is_empty());
    }

    #[test]
    fn malformed_coords_yield_no_box() {
        // Only three numbers — not enough for two corners.
        let boxes = parse_boxes("<|box_start|>(1,2,3)<|box_end|>");
        assert!(boxes.is_empty());
    }

    #[test]
    fn inclusive_xyxy_semantics_width_height_area() {
        // Inclusive xyxy: a box whose corners coincide still covers
        // one pixel. Treating the corners as exclusive would shift
        // every downstream metric by −1.
        let single = bare_box(10, 20, 10, 20);
        assert_eq!(
            (single.width(), single.height(), single.area()),
            (1, 1, 1)
        );

        let wide = bare_box(0, 0, 99, 49);
        assert_eq!(
            (wide.width(), wide.height(), wide.area()),
            (100, 50, 5_000)
        );
    }

    #[test]
    fn boxes_mixed_with_prose_parse_cleanly() {
        let text = "Here is a picture of a dog <|object_ref_start|>Labrador<|object_ref_end|>\
                    <|box_start|>(100,200),(300,400)<|box_end|> and its toy \
                    <|object_ref_start|>ball<|object_ref_end|><|box_start|>(10,20),(40,50)<|box_end|>.";
        let boxes = parse_boxes(text);
        // Surrounding narration must not disturb label/box pairing.
        assert_eq!(labels(&boxes), ["Labrador", "ball"]);
    }
}