Skip to main content

rlx_locateanything/
processor_prompt.rs

1// RLX — versatile ML compiler + runtime.
2// Copyright (C) 2026 Eugene Hauptmann, Nataliya Kosmyna.
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, version 3.
7//
8// This program is distributed in the hope that it will be useful,
9// but WITHOUT ANY WARRANTY; without even the implied warranty of
10// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11// GNU General Public License for more details.
12//
13// You should have received a copy of the GNU General Public License
14// along with this program. If not, see <https://www.gnu.org/licenses/>.
15
16//! HuggingFace `LocateAnythingProcessor` chat + `<image-1>` expansion layout.
17//!
18//! Matches `processing_locateanything.py` (`py_apply_chat_template` + `replace_media_placeholder`
19//! + single-pass tokenizer encode). Distinct from [`crate::tokenizer::build_user_prompt_ids`]
20//!   (RLX CLI path: no system message, raw `image_token_index` placeholders).
21
22use crate::config::LocateAnythingConfig;
23use anyhow::{Context, Result};
24use serde::Deserialize;
25use std::path::Path;
26
/// Token strings used to wrap an image span in the processor prompt.
///
/// Deserialized from `processor_config.json` by
/// [`ProcessorPromptConfig::from_model_dir`]; all three fields must be present
/// in that file as strings.
#[derive(Debug, Clone, Deserialize)]
pub struct ProcessorPromptConfig {
    /// Marker placed immediately before the run of image tokens
    /// (shown as `<img>` in the `expand_image_placeholder` docs).
    pub image_start_token: String,
    /// Marker placed immediately after the run of image tokens
    /// (shown as `</img>` in the `expand_image_placeholder` docs).
    pub image_end_token: String,
    /// Placeholder string repeated `n_image_tokens` times between the start
    /// and end markers when `<image-1>` is expanded.
    pub image_token: String,
}
33
34impl ProcessorPromptConfig {
35    pub fn from_model_dir(model_dir: &Path) -> Result<Self> {
36        let path = model_dir.join("processor_config.json");
37        let raw = std::fs::read_to_string(&path).with_context(|| format!("read {path:?}"))?;
38        serde_json::from_str(&raw).with_context(|| format!("parse {path:?}"))
39    }
40
41    /// Replace `<image-1>` with `<image 1><img>{image_token×N}</img>` (HF `replace_media_placeholder`).
42    pub fn expand_image_placeholder(&self, text: &str, n_image_tokens: usize) -> String {
43        let span = format!(
44            "<image 1>{}{}{}",
45            self.image_start_token,
46            self.image_token.repeat(n_image_tokens),
47            self.image_end_token
48        );
49        text.replace("<image-1>", &span)
50    }
51
52    /// Full string passed to the HF processor tokenizer (system + user + assistant prompt).
53    pub fn build_chat_prompt_string(
54        &self,
55        user_body_with_placeholder: &str,
56        n_image_tokens: usize,
57    ) -> String {
58        let user_expanded =
59            self.expand_image_placeholder(user_body_with_placeholder, n_image_tokens);
60        format!(
61            "<|im_start|>system\nYou are a helpful assistant.\n<|im_end|>\n\
62             <|im_start|>user\n\
63             {user_expanded}\
64             <|im_end|>\n\
65             <|im_start|>assistant\n"
66        )
67    }
68}
69
/// Builds the prompt token ids in the HF processor layout.
///
/// The emitted sequence is, in order: the fixed system turn, the user turn
/// header, `<image 1>`, the configured image start token, `n_image_tokens`
/// copies of [`LocateAnythingConfig::image_token_index`], the configured image
/// end token, the user text, and finally the end of the user turn plus the
/// opening of the assistant turn.
///
/// `user_text_with_placeholder` should begin with `<image-1>`. Only that leading
/// placeholder is stripped; the image span is always emitted at the start of the
/// user turn, and any other occurrence of `<image-1>` is encoded as ordinary text.
///
/// Vision placeholders are written directly as token ids (same as HF
/// `replace_media_placeholder`) rather than by encoding a long `<IMG_CONTEXT>`
/// run, because BPE would merge adjacent specials when using
/// `vocab.json`+`merges.txt` alone.
///
/// # Errors
/// Fails if `processor_config.json` cannot be loaded from `model_dir`, or if
/// encoding any text segment fails.
#[cfg(feature = "tokenizer")]
pub fn build_processor_prompt_ids(
    model_dir: &Path,
    cfg: &LocateAnythingConfig,
    tokenizer: &tokenizers::Tokenizer,
    user_text_with_placeholder: &str,
    n_image_tokens: usize,
) -> Result<Vec<u32>> {
    let prompt_cfg = ProcessorPromptConfig::from_model_dir(model_dir)?;

    // The image span is emitted explicitly below, so drop a leading placeholder.
    let body = match user_text_with_placeholder.strip_prefix("<image-1>") {
        Some(rest) => rest,
        None => user_text_with_placeholder,
    };

    let mut out = crate::tokenizer::encode(
        tokenizer,
        "<|im_start|>system\nYou are a helpful assistant.\n<|im_end|>\n",
    )?;

    // Text segments that precede the raw image-token run, in emission order.
    let before_image = [
        "<|im_start|>user\n",
        "<image 1>",
        prompt_cfg.image_start_token.as_str(),
    ];
    for segment in before_image {
        out.extend(crate::tokenizer::encode(tokenizer, segment)?);
    }

    // Image placeholders go in as ids, never through the tokenizer.
    out.extend(std::iter::repeat_n(cfg.image_token_index, n_image_tokens));

    // Text segments that follow the image-token run, in emission order.
    let after_image = [
        prompt_cfg.image_end_token.as_str(),
        body,
        "<|im_end|>\n<|im_start|>assistant\n",
    ];
    for segment in after_image {
        out.extend(crate::tokenizer::encode(tokenizer, segment)?);
    }

    Ok(out)
}
110
/// Returns the single-phrase grounding prompt from [`crate::prompts::ground_single`]
/// with the `<image-1>` placeholder prepended.
#[cfg(feature = "tokenizer")]
pub fn ground_single_with_image_placeholder(phrase: &str) -> String {
    let grounding_prompt = crate::prompts::ground_single(phrase);
    format!("<image-1>{grounding_prompt}")
}