use crate::tasks::generate::ContentBlock;
use serde::{Deserialize, Serialize};
/// Serializable request type pairing an image with a text prompt, plus two
/// optional hints.
///
/// NOTE(review): how `model` and `labels` are consumed is not visible in this
/// file; the field notes below describe only what the declaration itself shows.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GroundRequest {
/// Image content block this request refers to.
pub image: ContentBlock,
/// Free-form text prompt accompanying the image.
pub prompt: String,
/// Optional model identifier; `None` when the field is absent from the input.
/// Presumably selects which backend model handles the request — TODO confirm.
#[serde(default)]
pub model: Option<String>,
/// Optional list of label strings; `None` when the field is absent from the input.
/// Presumably the object names the caller is interested in — TODO confirm.
#[serde(default)]
pub labels: Option<Vec<String>>,
}
/// Serializable result type: a list of bounding boxes together with a text
/// payload and an optional model name.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GroundResult {
/// Bounding boxes carried by this result.
pub boxes: Vec<BoundingBox>,
/// Text payload. Presumably the unparsed model output that `boxes` was
/// extracted from — TODO confirm against the producer of this struct.
pub raw_text: String,
/// Optional model name. Left out of the serialized form when `None`, and
/// read back as `None` when missing from the input.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub model_used: Option<String>,
}
/// Axis-aligned rectangle given by two corners `(x1, y1)` and `(x2, y2)`, with
/// an optional label and confidence.
///
/// `width`/`height` add 1 to the corner difference, i.e. they treat both
/// corners as inclusive. Nothing in this type enforces `x1 <= x2` or
/// `y1 <= y2`.
///
/// NOTE(review): the coordinate space (absolute pixels vs. a normalized grid)
/// is not established anywhere in this file — confirm with the producer.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct BoundingBox {
/// X coordinate of the first corner.
pub x1: u32,
/// Y coordinate of the first corner.
pub y1: u32,
/// X coordinate of the second corner.
pub x2: u32,
/// Y coordinate of the second corner.
pub y2: u32,
/// Label text for the box. An empty string is omitted from the serialized
/// form and is also what a missing field deserializes to.
#[serde(default, skip_serializing_if = "String::is_empty")]
pub label: String,
/// Optional confidence value; omitted from the serialized form when `None`.
/// `parse_boxes` never fills this in (it always produces `None`).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub confidence: Option<f32>,
}
impl BoundingBox {
    /// Number of positions in the inclusive range `lo..=hi`, computed with
    /// saturating arithmetic so it can neither underflow nor overflow.
    ///
    /// When `hi < lo` the difference saturates to 0, so the result is 1.
    fn inclusive_extent(lo: u32, hi: u32) -> u32 {
        let distance = hi.saturating_sub(lo);
        distance.saturating_add(1)
    }

    /// Horizontal extent with both corners counted (`x2 - x1 + 1`).
    ///
    /// Always at least 1: a box whose `x2` is smaller than `x1` reports 1.
    /// The result is capped at `u32::MAX`.
    pub fn width(&self) -> u32 {
        Self::inclusive_extent(self.x1, self.x2)
    }

    /// Vertical extent with both corners counted (`y2 - y1 + 1`).
    ///
    /// Always at least 1: a box whose `y2` is smaller than `y1` reports 1.
    /// The result is capped at `u32::MAX`.
    pub fn height(&self) -> u32 {
        Self::inclusive_extent(self.y1, self.y2)
    }

    /// Product of [`width`](Self::width) and [`height`](Self::height).
    ///
    /// Both factors are widened to `u64` before multiplying, so the product of
    /// two `u32` values cannot overflow.
    pub fn area(&self) -> u64 {
        let w = u64::from(self.width());
        let h = u64::from(self.height());
        w * h
    }
}
/// Extracts every `<|box_start|>…<|box_end|>` region from `text` as a
/// [`BoundingBox`], in order of appearance.
///
/// For each box, the label is taken from the last
/// `<|object_ref_start|>…<|object_ref_end|>` pair found between the end of the
/// previous box (or the start of `text`) and this box's opening marker; the
/// label text is trimmed. If there is no such pair, the label is empty. As a
/// consequence, a second box that directly follows another box without its own
/// label pair gets an empty label.
///
/// A region whose body does not contain four parseable numbers is skipped. An
/// opening marker with no closing marker after it ends the scan, keeping the
/// boxes collected so far. Text without any box marker yields an empty vector.
pub fn parse_boxes(text: &str) -> Vec<BoundingBox> {
    const LABEL_OPEN: &str = "<|object_ref_start|>";
    const LABEL_CLOSE: &str = "<|object_ref_end|>";
    const BOX_OPEN: &str = "<|box_start|>";
    const BOX_CLOSE: &str = "<|box_end|>";

    let mut boxes = Vec::new();
    let mut remaining = text;

    while let Some((preamble, after_open)) = remaining.split_once(BOX_OPEN) {
        // No closing marker means no later box can be complete either.
        let (coords, tail) = match after_open.split_once(BOX_CLOSE) {
            Some(parts) => parts,
            None => break,
        };

        // Use the last label opener in the preamble, and only if it is closed
        // before the box begins.
        let label = preamble
            .rsplit_once(LABEL_OPEN)
            .and_then(|(_, after_label_open)| after_label_open.split_once(LABEL_CLOSE))
            .map(|(name, _)| name.trim().to_string())
            .unwrap_or_default();

        if let Some(parsed) = parse_coord_pair(coords, label) {
            boxes.push(parsed);
        }

        // Continue after this box; earlier labels are no longer visible.
        remaining = tail;
    }

    boxes
}
/// Builds a [`BoundingBox`] from the first four runs of ASCII digits in `s`,
/// read in order as `x1, y1, x2, y2`.
///
/// Every non-digit character acts as a separator, so parentheses, commas,
/// whitespace, signs and decimal points are all skipped the same way. Digit
/// runs after the fourth are ignored.
///
/// Returns `None` when fewer than four runs are present, or when one of the
/// first four does not fit in a `u32`. The resulting box carries `label` and
/// no confidence.
fn parse_coord_pair(s: &str, label: String) -> Option<BoundingBox> {
    let mut numbers = s
        .split(|c: char| !c.is_ascii_digit())
        .filter(|run| !run.is_empty())
        .map(|run| run.parse::<u32>().ok());

    // `next()` is `None` when the runs are exhausted; the inner `Option` is
    // `None` when a run overflowed `u32`. Either case aborts via `??`.
    let x1 = numbers.next()??;
    let y1 = numbers.next()??;
    let x2 = numbers.next()??;
    let y2 = numbers.next()??;

    Some(BoundingBox {
        x1,
        y1,
        x2,
        y2,
        label,
        confidence: None,
    })
}
// Unit tests for `parse_boxes` and the `BoundingBox` size helpers.
#[cfg(test)]
mod tests {
use super::*;
// Input without any box marker produces an empty result.
#[test]
fn empty_text_yields_no_boxes() {
assert!(parse_boxes("").is_empty());
assert!(parse_boxes("just a plain description").is_empty());
}
// A label pair immediately followed by a box yields one labeled box with
// the four coordinates in x1, y1, x2, y2 order.
#[test]
fn single_labeled_box_parses() {
let text =
"<|object_ref_start|>dog<|object_ref_end|><|box_start|>(10,20),(110,220)<|box_end|>";
let boxes = parse_boxes(text);
assert_eq!(boxes.len(), 1);
assert_eq!(boxes[0].label, "dog");
assert_eq!(boxes[0].x1, 10);
assert_eq!(boxes[0].y1, 20);
assert_eq!(boxes[0].x2, 110);
assert_eq!(boxes[0].y2, 220);
}
// A box with no preceding label pair is still returned, with an empty label.
#[test]
fn box_without_label_has_empty_label() {
let text = "The object is at <|box_start|>(1,2),(3,4)<|box_end|> pixels.";
let boxes = parse_boxes(text);
assert_eq!(boxes.len(), 1);
assert!(boxes[0].label.is_empty());
}
// Each box takes the label that appears before it, not one from an earlier box.
#[test]
fn multiple_boxes_pair_with_preceding_labels() {
let text = "<|object_ref_start|>a<|object_ref_end|><|box_start|>(0,0),(1,1)<|box_end|> \
<|object_ref_start|>b<|object_ref_end|><|box_start|>(2,2),(3,3)<|box_end|>";
let boxes = parse_boxes(text);
assert_eq!(boxes.len(), 2);
assert_eq!(boxes[0].label, "a");
assert_eq!(boxes[1].label, "b");
}
// Spaces around the numbers do not prevent parsing.
#[test]
fn whitespace_inside_coordinate_tuple_is_tolerated() {
let text = "<|box_start|>( 10 , 20 ),( 30 , 40 )<|box_end|>";
let boxes = parse_boxes(text);
assert_eq!(boxes.len(), 1);
assert_eq!(boxes[0].x1, 10);
assert_eq!(boxes[0].x2, 30);
}
// An opening marker without a closing marker yields nothing and does not panic.
#[test]
fn unterminated_box_is_dropped_not_crashing() {
let text = "<|box_start|>(1,2),(3,4 -- and then no close tag";
assert!(parse_boxes(text).is_empty());
}
// Only three numbers inside the box region: fewer than four, so no box.
#[test]
fn malformed_coords_yield_no_box() {
let text = "<|box_start|>(1,2,3)<|box_end|>";
assert!(parse_boxes(text).is_empty());
}
// Corners are inclusive: a box whose two corners coincide is 1 by 1, and
// (0,0)-(99,49) is 100 by 50.
#[test]
fn inclusive_xyxy_semantics_width_height_area() {
let single = BoundingBox {
x1: 10,
y1: 20,
x2: 10,
y2: 20,
label: String::new(),
confidence: None,
};
assert_eq!(single.width(), 1);
assert_eq!(single.height(), 1);
assert_eq!(single.area(), 1);
let wide = BoundingBox {
x1: 0,
y1: 0,
x2: 99,
y2: 49,
label: String::new(),
confidence: None,
};
assert_eq!(wide.width(), 100);
assert_eq!(wide.height(), 50);
assert_eq!(wide.area(), 5_000);
}
// Free-form prose before, between and after the markers does not disturb
// box extraction or label pairing.
#[test]
fn boxes_mixed_with_prose_parse_cleanly() {
let text = "Here is a picture of a dog <|object_ref_start|>Labrador<|object_ref_end|>\
<|box_start|>(100,200),(300,400)<|box_end|> and its toy \
<|object_ref_start|>ball<|object_ref_end|><|box_start|>(10,20),(40,50)<|box_end|>.";
let boxes = parse_boxes(text);
assert_eq!(boxes.len(), 2);
assert_eq!(boxes[0].label, "Labrador");
assert_eq!(boxes[1].label, "ball");
}
}