use std::sync::OnceLock;
use crate::coding::{contains_cjk, contains_devanagari};
use crate::seed::{
self, Slot, WordForm, ROLE_TRANSLATION_INTO_MARKER, ROLE_TRANSLATION_OBJECT_MARKER,
ROLE_TRANSLATION_TARGET_DIRECTION, ROLE_TRANSLATION_UNQUOTED_FRAME,
};
/// Extracts the unquoted text that a translation request asks to translate.
///
/// Trailing `.`, `!`, `?` and `。` are stripped from `prompt`, then the
/// circumfix, Hindi and Chinese extractors are tried in that order; the first
/// one that yields `Some` wins. Returns `None` when none of them matches.
///
/// The extractors locate markers in a lowercased copy of the prompt and then
/// slice the *original* prompt at the same byte offsets. The lowercased copy is
/// therefore built with [`offset_preserving_lowercase`], which guarantees both
/// strings share the same byte layout.
#[must_use]
pub fn extract_unquoted_translation_surface(prompt: &str) -> Option<String> {
    let trimmed = prompt.trim_end_matches(['.', '!', '?', '。']);
    let lower = offset_preserving_lowercase(trimmed);
    extract_circumfix_surface(trimmed, &lower)
        .or_else(|| extract_hindi_unquoted_surface(trimmed, &lower))
        .or_else(|| extract_chinese_unquoted_surface(trimmed, &lower))
}

/// Lowercases `text` without changing the byte offset of any character.
///
/// `str::to_lowercase` may change the encoded length of a character (for
/// example U+212A KELVIN SIGN is three bytes and lowercases to the one-byte
/// `k`), which would make offsets found in the lowercase string invalid for the
/// original. When every character keeps its width the standard lowercase is
/// returned unchanged; otherwise width-changing characters are copied through
/// as-is and the remaining characters are lowercased one by one.
fn offset_preserving_lowercase(text: &str) -> String {
    fn keeps_width(character: char) -> bool {
        character.to_lowercase().map(char::len_utf8).sum::<usize>() == character.len_utf8()
    }

    // Common case: identical to plain `to_lowercase`, including its
    // context-sensitive handling of Greek sigma.
    if text.chars().all(keeps_width) {
        return text.to_lowercase();
    }

    let mut lowered = String::with_capacity(text.len());
    for character in text.chars() {
        if keeps_width(character) {
            lowered.extend(character.to_lowercase());
        } else {
            lowered.push(character);
        }
    }
    lowered
}
/// Tries every circumfix frame from the marker table, in table order, and
/// returns the surface produced by the first frame that matches.
///
/// `original` is the prompt as written and `lower` its lowercased counterpart;
/// both are forwarded unchanged to `extract_between_prefix_and_marker`.
fn extract_circumfix_surface(original: &str, lower: &str) -> Option<String> {
    for (prefix, marker) in &markers().circumfix_frames {
        let surface = extract_between_prefix_and_marker(original, lower, prefix, marker);
        // A frame that fails does not end the search; later frames still get a chance.
        if surface.is_some() {
            return surface;
        }
    }
    None
}
/// Returns the cleaned text that sits between `prefix` and the first `marker`
/// that follows it.
///
/// Matching is done on `lower`; the resulting byte range is then applied to
/// `original` so the surface keeps its original casing. Returns `None` when
/// `lower` does not start with `prefix`, when `marker` does not occur after the
/// prefix, or when the text in between is rejected by `clean_unquoted_surface`.
///
/// `lower` is expected to be byte-aligned with `original`. If it is not and the
/// computed range does not fall on character boundaries of `original` (or runs
/// past its end), `None` is returned instead of panicking.
fn extract_between_prefix_and_marker(
    original: &str,
    lower: &str,
    prefix: &str,
    marker: &str,
) -> Option<String> {
    let rest = lower.strip_prefix(prefix)?;
    let surface_len = rest.find(marker)?;
    let start = prefix.len();
    // Checked slice: the offsets come from `lower`, not from `original`.
    let candidate = original.get(start..start + surface_len)?;
    clean_unquoted_surface(candidate)
}
/// Extracts the surface from a prompt that uses the Hindi marker set.
///
/// The prompt must contain at least one Hindi verb stem. For each target marker
/// (in table order) that occurs in `lower`, the text before its first occurrence
/// is searched for an object marker; the first object marker in table order that
/// is present decides the cut, taken at its last occurrence. Everything in
/// `original` before that cut is cleaned and returned.
///
/// Once a cut point is found the result of cleaning it is final, even if it is
/// `None`; no further markers are tried.
///
/// `lower` is expected to be byte-aligned with `original`. If the cut point is
/// not a character boundary of `original`, `None` is returned instead of
/// panicking.
fn extract_hindi_unquoted_surface(original: &str, lower: &str) -> Option<String> {
    let table = markers();
    let has_verb_stem = table
        .hindi_verb_stems
        .iter()
        .any(|stem| lower.contains(stem));
    if !has_verb_stem {
        return None;
    }

    for target_marker in &table.hindi_target_markers {
        let Some(target_offset) = lower.find(target_marker) else {
            continue;
        };
        let before_target = &lower[..target_offset];
        let surface_end = table
            .hindi_object_markers
            .iter()
            .find_map(|object_marker| before_target.rfind(object_marker));
        if let Some(surface_end) = surface_end {
            // Checked slice: the offset comes from `lower`, not from `original`.
            return clean_unquoted_surface(original.get(..surface_end)?);
        }
    }
    None
}
/// Extracts the surface from a prompt that uses the Chinese marker set.
///
/// Two frame families are tried in order: command prefixes closed by a command
/// marker, then translate prefixes closed by a target marker. Within a family,
/// prefixes are tried in table order; for the first prefix that `lower` starts
/// with *and* that is followed by one of the family's markers, the text between
/// the prefix and the earliest marker is cleaned and returned.
///
/// Once such a span is found the result of cleaning it is final, even if it is
/// `None`; no further prefixes or families are tried.
///
/// `lower` is expected to be byte-aligned with `original`. If the computed range
/// does not fall on character boundaries of `original`, `None` is returned
/// instead of panicking.
fn extract_chinese_unquoted_surface(original: &str, lower: &str) -> Option<String> {
    let table = markers();
    let families = [
        (&table.chinese_command_prefixes, &table.chinese_command_markers),
        (&table.chinese_translate_prefixes, &table.chinese_target_markers),
    ];

    for (prefixes, closing_markers) in families {
        for prefix in prefixes {
            let Some(rest) = lower.strip_prefix(prefix) else {
                continue;
            };
            if let Some(marker_offset) = first_marker(rest, closing_markers) {
                let start = prefix.len();
                // Checked slice: the offsets come from `lower`, not from `original`.
                let candidate = original.get(start..start + marker_offset)?;
                return clean_unquoted_surface(candidate);
            }
        }
    }
    None
}
/// Returns the smallest byte offset at which any of `markers` occurs in `text`,
/// or `None` when none of them occurs.
fn first_marker(text: &str, markers: &[&str]) -> Option<usize> {
    let mut earliest: Option<usize> = None;
    for marker in markers {
        if let Some(position) = text.find(marker) {
            earliest = Some(earliest.map_or(position, |best| best.min(position)));
        }
    }
    earliest
}
/// Trims `candidate` and accepts it as a surface only if something is left and
/// it contains no quotation character.
///
/// Returns `None` for an empty or whitespace-only candidate and for any
/// candidate containing one of `"`, `'`, `«`, `»`, `` ` ``, `“`, `”`, `‘`, `’`;
/// otherwise returns the trimmed text as an owned `String`.
fn clean_unquoted_surface(candidate: &str) -> Option<String> {
    const QUOTE_MARKS: [char; 9] = ['"', '\'', '«', '»', '`', '“', '”', '‘', '’'];

    let surface = candidate.trim();
    if surface.is_empty() {
        return None;
    }
    if surface.contains(QUOTE_MARKS) {
        return None;
    }
    Some(surface.to_owned())
}
/// Marker strings used by the unquoted-surface extractors in this file.
///
/// A single instance is built and cached by `markers()`; every field is filled
/// from the seed lexicon there.
struct TranslationMarkers {
/// `(prefix, marker)` pairs: the prompt must start with the prefix, and the
/// surface is the text between the prefix and the first following marker.
circumfix_frames: Vec<(&'static str, &'static str)>,
/// At least one of these must occur in the prompt before Hindi extraction is
/// attempted.
hindi_verb_stems: Vec<&'static str>,
/// Markers searched for in the prompt; only the text before the first
/// occurrence is examined for an object marker.
hindi_target_markers: Vec<&'static str>,
/// Markers whose last occurrence before the target marker ends the surface.
hindi_object_markers: Vec<&'static str>,
/// Prefixes a prompt must start with for the first Chinese frame family.
chinese_command_prefixes: Vec<&'static str>,
/// Markers that close the surface after a command prefix (earliest one wins).
chinese_command_markers: Vec<&'static str>,
/// Prefixes a prompt must start with for the second Chinese frame family.
chinese_translate_prefixes: Vec<&'static str>,
/// Markers that close the surface after a translate prefix (earliest one wins).
chinese_target_markers: Vec<&'static str>,
}
/// Returns the shared marker table, building it on first use.
///
/// The table is stored in a `OnceLock`, so the lexicon lookups below run at most
/// once and every caller receives the same `'static` reference.
fn markers() -> &'static TranslationMarkers {
    /// Collects every marker list from the seed lexicon.
    fn build_table() -> TranslationMarkers {
        let frames = circumfix_frames(ROLE_TRANSLATION_UNQUOTED_FRAME);

        let hindi_verbs = bare_script_forms(ROLE_TRANSLATION_UNQUOTED_FRAME, contains_devanagari);
        let hindi_targets = script_forms(ROLE_TRANSLATION_INTO_MARKER, contains_devanagari);
        let hindi_objects = script_forms(ROLE_TRANSLATION_OBJECT_MARKER, contains_devanagari);

        let command_prefixes = script_forms(ROLE_TRANSLATION_OBJECT_MARKER, contains_cjk);
        let command_markers = script_forms(ROLE_TRANSLATION_INTO_MARKER, contains_cjk);
        let translate_prefixes = bare_script_forms(ROLE_TRANSLATION_UNQUOTED_FRAME, contains_cjk);
        let target_markers = script_forms(ROLE_TRANSLATION_TARGET_DIRECTION, contains_cjk);

        TranslationMarkers {
            circumfix_frames: frames,
            hindi_verb_stems: hindi_verbs,
            hindi_target_markers: hindi_targets,
            hindi_object_markers: hindi_objects,
            chinese_command_prefixes: command_prefixes,
            chinese_command_markers: command_markers,
            chinese_translate_prefixes: translate_prefixes,
            chinese_target_markers: target_markers,
        }
    }

    static CACHE: OnceLock<TranslationMarkers> = OnceLock::new();
    CACHE.get_or_init(build_table)
}
/// Collects the `(before_slot, after_slot)` pair of every word form registered
/// under `role` whose slot is `Slot::Circumfix`, in lexicon order.
fn circumfix_frames(role: &str) -> Vec<(&'static str, &'static str)> {
    let mut frames = Vec::new();
    for form in seed::lexicon().role_word_forms(role) {
        if form.slot() == Slot::Circumfix {
            frames.push((form.before_slot(), form.after_slot()));
        }
    }
    frames
}
/// Collects the text of every word form registered under `role` for which the
/// `script` predicate returns `true`, in lexicon order.
fn script_forms(role: &str, script: fn(&str) -> bool) -> Vec<&'static str> {
    seed::lexicon()
        .role_word_forms(role)
        .into_iter()
        .filter_map(|form| {
            let text = form.text.as_str();
            script(text).then_some(text)
        })
        .collect()
}
/// Collects `before_slot()` of every word form registered under `role` whose
/// slot is `Slot::Bare` and whose text satisfies the `script` predicate, in
/// lexicon order.
fn bare_script_forms(role: &str, script: fn(&str) -> bool) -> Vec<&'static str> {
    let mut stems = Vec::new();
    for form in seed::lexicon().role_word_forms(role) {
        // The slot check comes first so `script` only runs on bare forms.
        if form.slot() != Slot::Bare {
            continue;
        }
        if script(&form.text) {
            stems.push(WordForm::before_slot(form));
        }
    }
    stems
}