use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::OnceLock;
/// Per-language settings, deserialized from the embedded `languages.json`.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct LangConfig {
/// Display name of the language; `build_prompt` shows it on the
/// `Language:` / `Languages:` line of the prompt.
pub name: String,
/// Optional extra instruction text that `build_prompt` appends to the prompt
/// when this language is requested.
pub hint: Option<String>,
}
/// Returns the language table parsed from the embedded `languages.json`,
/// keyed by language code.
///
/// The JSON is parsed on the first call only; later calls return the cached map.
///
/// # Panics
/// Panics on first use if the embedded JSON does not deserialize into a
/// `HashMap<String, LangConfig>`.
fn lang_configs() -> &'static HashMap<String, LangConfig> {
    static TABLE: OnceLock<HashMap<String, LangConfig>> = OnceLock::new();
    TABLE.get_or_init(|| {
        // The file is baked into the binary at compile time, so a parse failure
        // is a packaging bug rather than a runtime condition.
        serde_json::from_str(include_str!("../languages/languages.json"))
            .expect("invalid languages.json")
    })
}
pub fn supported_languages_json() -> String {
let configs = lang_configs();
let names: HashMap<&str, &str> = configs
.iter()
.map(|(code, cfg)| (code.as_str(), cfg.name.as_str()))
.collect();
serde_json::to_string(&names).unwrap_or_default()
}
/// Prompt fragment listing the "DO NOT" rules for generated phrases.
///
/// Not referenced elsewhere in this file; presumably spliced into prompts by
/// other modules — confirm against callers.
pub const PHRASE_QUALITY_RULES: &str = r#"DO NOT:
- Use the customer's exact message as a seed
- Repeat the same structure with word swaps ("cancel my order" / "cancel my purchase" / "cancel my item")
- Use overly polished corporate language
- Include order numbers, names, dates, or specific products
- Emit phrases whose intent is carried entirely by a generic conversational stem ("how do I do this", "tell me about it", "what's that", "I need help", "can you fix it") — every phrase must contain at least one word specific to THIS intent
- Generate translations of the same phrases across languages — each language should have culturally natural expressions"#;
/// Prompt text instructing a model to extract, for each missed intent, the
/// shortest 2-8 word span of the customer's own words that expresses it.
///
/// Not referenced elsewhere in this file; presumably used by a review/fix
/// flow in another module — confirm against callers.
pub const REVIEW_FIX_GUIDELINES: &str = r#"You are maintaining a keyword-based intent classification engine. A customer query failed to match the correct intent.
For each missed intent, extract the SHORTEST meaningful span from the customer's query that clearly expresses that intent.
Rules:
- Use the customer's ACTUAL WORDS — do not paraphrase or invent new vocabulary
- Extract ONLY the portion relevant to this intent — not the whole message
- Strip filler ("um", "like", "you know", "honestly", "just"), profanity, and personal details (names, order numbers)
- 2-8 words — the intent-bearing core only
- The extracted span must make sense in isolation as something a user would say for this intent
- If the query has no clean overlap with the intent, extract the closest relevant words anyway"#;
/// Prompt template for seed-phrase generation.
///
/// Contains the placeholders `{intent_id}` and `{description}`, which
/// `build_prompt` substitutes before appending language-specific text.
///
/// NOTE(review): the "Command form" example below ("cancel order 12345")
/// contains an order number, while `PHRASE_QUALITY_RULES` forbids including
/// order numbers — confirm whether the two rule sets are meant to agree.
const BASE_GUIDELINES: &str = r#"Generate realistic seed phrases for an intent classification engine. These phrases train a keyword-matching engine (not an LLM), so vocabulary diversity is critical.
Intent ID: {intent_id}
Description: {description}
Generate exactly 10 phrases per language. Each phrase must be something a real human would actually type in a chat box or support ticket. Requirements:
VARIETY IN LENGTH:
- 2-3 short phrases (2-4 words): "cancel order", "refund status"
- 4-5 medium phrases (5-10 words): "I need to cancel the order I placed"
- 2-3 long/conversational phrases (10+ words): "hey I ordered something yesterday and I changed my mind, can you cancel it"
VARIETY IN STYLE:
- Formal: "I would like to request a cancellation"
- Casual: "yo can I cancel this thing"
- Frustrated: "why is it so hard to cancel an order around here"
- Question form: "how do I cancel my recent order"
- Command form: "cancel order 12345"
- Contextual/story: "I found a better price elsewhere so I need to cancel"
VOCABULARY DIVERSITY (most important):
- Use different verbs for the same action (cancel/terminate/revoke/withdraw/undo)
- Use different nouns (order/purchase/transaction/item)
- Include phrases that describe the SITUATION not just the action ("I changed my mind", "ordered by mistake")
- Include emotional/frustrated variants that real users type
DO NOT:
- Repeat the same structure with word swaps ("cancel my order" / "cancel my purchase" / "cancel my item")
- Use overly polished corporate language
- Emit phrases whose intent is carried entirely by a generic conversational stem ("how do I do this", "tell me about it") — every phrase must contain words specific to THIS intent
- Generate translations of the same phrases across languages — each language should have culturally natural expressions"#;
/// Builds the LLM prompt that asks for seed phrases for one intent.
///
/// # Arguments
/// * `intent_id` - Identifier substituted into the template; an empty id is
///   rendered as `(unnamed)`.
/// * `description` - Free-text intent description substituted into the template.
/// * `languages` - Language codes. Codes present in the embedded language table
///   are shown by display name; unknown codes are shown verbatim.
/// * `examples` - Optional anchor phrases. Entries are trimmed, blank ones are
///   dropped, and at most the first five remaining are included.
///
/// # Returns
/// The prompt text. With exactly one language it requests a JSON array of
/// strings; otherwise (including an empty `languages` slice) it requests a JSON
/// object keyed by language code.
pub fn build_prompt(
    intent_id: &str,
    description: &str,
    languages: &[String],
    examples: &[String],
) -> String {
    let configs = lang_configs();

    let id_label = if intent_id.is_empty() {
        "(unnamed)"
    } else {
        intent_id
    };
    // Substitute placeholders only inside fragments of the template itself.
    // Chaining `.replace()` calls would rescan the already-inserted intent id,
    // so an id containing the text "{description}" would be expanded too.
    let guidelines = BASE_GUIDELINES
        .split("{description}")
        .map(|piece| piece.replace("{intent_id}", id_label))
        .collect::<Vec<_>>()
        .join(description);

    let anchors: Vec<String> = examples
        .iter()
        .map(|e| e.trim())
        .filter(|e| !e.is_empty())
        .take(5)
        .map(|e| format!(" - \"{}\"", e))
        .collect();
    // Blank-only example lists must leave the prompt exactly as if no examples
    // had been passed.
    let anchor_block = if anchors.is_empty() {
        String::new()
    } else {
        format!(
            "\n\nThe user provided these anchor examples — generate variations \
             around them, matching the same intent. Keep the same meaning but \
             vary vocabulary, length, and style. Do NOT just word-swap; treat \
             each anchor as one of the ~10 outputs and generate the rest as \
             genuinely different phrasings:\n{}",
            anchors.join("\n")
        )
    };

    if let [lang] = languages {
        // Single language: one table lookup serves both the name and the hint.
        let cfg = configs.get(lang.as_str());
        let lang_name = cfg.map_or(lang.as_str(), |c| c.name.as_str());
        let hint = cfg
            .and_then(|c| c.hint.as_deref())
            .map(|h| format!("\n\n{}", h))
            .unwrap_or_default();
        return format!(
            "{}{}{}\n\nLanguage: {}\n\nReturn ONLY a JSON array of strings. No markdown, no explanation.",
            guidelines, anchor_block, hint, lang_name
        );
    }

    let lang_list = languages
        .iter()
        .map(|l| {
            configs
                .get(l.as_str())
                .map_or(l.as_str(), |c| c.name.as_str())
        })
        .collect::<Vec<_>>()
        .join(", ");

    let hint_items: Vec<String> = languages
        .iter()
        .filter_map(|l| configs.get(l.as_str()))
        .filter_map(|c| c.hint.as_deref())
        .map(|h| format!("- {}", h))
        .collect();
    let hints_block = if hint_items.is_empty() {
        String::new()
    } else {
        format!(
            "\n\nLanguage-specific instructions:\n{}",
            hint_items.join("\n")
        )
    };

    // The `Languages:` line shows display names only, yet the reply must be
    // keyed by code. Spell out the exact keys so the model does not have to
    // guess them (or copy the "en"/"es" keys from the example object).
    let codes_note = if languages.is_empty() {
        String::new()
    } else {
        format!(
            " Use exactly these language codes as the keys: {}.",
            languages.join(", ")
        )
    };

    format!(
        "{}{}\n\nLanguages: {}\nFor non-English languages: write how native speakers actually type in chat, not translations of English phrases. Include slang, colloquialisms, and culturally natural expressions.{}\n\nReturn ONLY a JSON object mapping language codes to arrays.{} No markdown, no explanation. Example:\n{{\"en\": [\"phrase one\", \"long conversational phrase here\"], \"es\": [\"frase natural\", \"frase larga y conversacional aquí\"]}}",
        guidelines, anchor_block, lang_list, hints_block, codes_note
    )
}
pub fn parse_response(response_text: &str, languages: &[String]) -> Result<String, String> {
let phrases_by_lang: HashMap<String, Vec<String>> = if languages.len() == 1 {
let array_str = extract_json_array(response_text)
.ok_or_else(|| "Could not parse response as JSON array".to_string())?;
let parsed: Vec<String> =
serde_json::from_str(&array_str).map_err(|e| format!("JSON parse error: {}", e))?;
let mut map = HashMap::new();
map.insert(languages[0].clone(), parsed);
map
} else {
let obj_str = extract_json_object(response_text)
.ok_or_else(|| "Could not parse response as JSON object".to_string())?;
serde_json::from_str(&obj_str).map_err(|e| format!("JSON parse error: {}", e))?
};
let total: usize = phrases_by_lang.values().map(|v| v.len()).sum();
let result = serde_json::json!({
"phrases_by_lang": phrases_by_lang,
"total": total,
});
serde_json::to_string(&result).map_err(|e| format!("Serialization error: {}", e))
}
/// Returns the substring running from the first `[` to the last `]` in `text`.
///
/// Yields `None` when either bracket is missing or the last `]` comes before
/// the first `[`. The slice is not validated as JSON.
fn extract_json_array(text: &str) -> Option<String> {
    let open = text.find('[')?;
    let close = text.rfind(']')?;
    // Both delimiters are single-byte ASCII, so the inclusive byte range is
    // always on char boundaries.
    (open < close).then(|| text[open..=close].to_string())
}
/// Returns the substring running from the first `{` to the last `}` in `text`.
///
/// Yields `None` when either brace is missing or the last `}` comes before the
/// first `{`. The slice is not validated as JSON.
fn extract_json_object(text: &str) -> Option<String> {
    match (text.find('{'), text.rfind('}')) {
        (Some(open), Some(close)) if close > open => Some(text[open..=close].to_owned()),
        _ => None,
    }
}
// Unit tests for prompt construction and response parsing.
// Several assertions depend on the contents of the embedded `languages.json`
// (language names and hint wording), which is not visible from this file.
#[cfg(test)]
mod tests {
use super::*;
// The language listing must expose known codes with their display names.
#[test]
fn supported_languages_includes_expected() {
let json = supported_languages_json();
let map: HashMap<String, String> = serde_json::from_str(&json).unwrap();
assert_eq!(map.get("en").unwrap(), "English");
assert_eq!(map.get("zh").unwrap(), "Chinese");
assert_eq!(map.get("ta").unwrap(), "Tamil");
assert!(map.len() >= 12);
}
// One language: placeholders are filled and a JSON array is requested.
#[test]
fn build_prompt_single_lang() {
let prompt = build_prompt("cancel", "cancel order", &["en".to_string()], &[]);
assert!(prompt.contains("Intent ID: cancel"));
assert!(prompt.contains("Language: English"));
assert!(prompt.contains("JSON array"));
}
// Several languages: names are listed in request order, per-language hints
// are included, and a JSON object is requested.
#[test]
fn build_prompt_multi_lang_includes_hints() {
let prompt = build_prompt(
"cancel",
"cancel order",
&["en".to_string(), "zh".to_string(), "ta".to_string()],
&[],
);
assert!(prompt.contains("Languages: English, Chinese, Tamil"));
// The next three substrings come from hint text in `languages.json`.
assert!(prompt.contains("simplified Chinese"));
assert!(prompt.contains("traditional Chinese"));
assert!(prompt.contains("pure Tamil script"));
assert!(prompt.contains("JSON object"));
}
// Anchor examples appear quoted inside the anchor block.
#[test]
fn build_prompt_with_examples_includes_anchors() {
let examples = vec![
"cancel my order".to_string(),
"stop the order I placed".to_string(),
];
let prompt = build_prompt("cancel", "cancel order", &["en".to_string()], &examples);
assert!(prompt.contains("anchor examples"));
assert!(prompt.contains("\"cancel my order\""));
assert!(prompt.contains("\"stop the order I placed\""));
}
// Examples that are empty or whitespace-only must not change the prompt.
#[test]
fn build_prompt_empty_examples_is_unchanged() {
let plain = build_prompt("cancel", "cancel order", &["en".to_string()], &[]);
let with_blank = build_prompt(
"cancel",
"cancel order",
&["en".to_string()],
&["".to_string(), " ".to_string()],
);
assert_eq!(plain, with_blank, "blank/empty examples must be a no-op");
}
// Single-language replies are a bare array stored under the requested code.
#[test]
fn parse_response_single_lang() {
let response = r#"["cancel my order", "stop the order"]"#;
let result = parse_response(response, &["en".to_string()]).unwrap();
let parsed: serde_json::Value = serde_json::from_str(&result).unwrap();
assert_eq!(parsed["total"], 2);
assert_eq!(parsed["phrases_by_lang"]["en"].as_array().unwrap().len(), 2);
}
// Multi-language replies are an object; `total` sums across all languages.
#[test]
fn parse_response_multi_lang() {
let response = r#"{"en": ["cancel"], "es": ["cancelar", "anular"]}"#;
let result = parse_response(response, &["en".to_string(), "es".to_string()]).unwrap();
let parsed: serde_json::Value = serde_json::from_str(&result).unwrap();
assert_eq!(parsed["total"], 3);
}
// Prose before and after the JSON payload is ignored.
#[test]
fn parse_response_with_surrounding_text() {
let response = "Here are the seeds:\n[\"phrase one\", \"phrase two\"]\nDone.";
let result = parse_response(response, &["en".to_string()]).unwrap();
let parsed: serde_json::Value = serde_json::from_str(&result).unwrap();
assert_eq!(parsed["total"], 2);
}
// A reply with no JSON array at all is reported as an error.
#[test]
fn parse_response_bad_input() {
let result = parse_response("no json here", &["en".to_string()]);
assert!(result.is_err());
}
}