use std::collections::HashSet;
use std::sync::LazyLock;
use std::time::Instant;
#[cfg(feature = "translation")]
use rayon::prelude::*;
use regex::Regex;
/// A piece of extracted page/UI text queued for translation, together with
/// the metadata produced by `TextFilter::analyze_text`.
#[derive(Debug, Clone, PartialEq)]
pub struct TextItem {
/// The raw text content.
pub text: String,
/// Heuristic classification of where the text appears (title, button, ...).
pub text_type: TextType,
/// Scheduling priority used when grouping items into batches.
pub priority: TextPriority,
// NOTE(review): populated with the translatability *score* by
// `create_text_item`, not an independent complexity metric — confirm intent.
pub complexity: f32,
/// Caller-supplied locator (e.g. a DOM path or element id).
pub location: String,
}
/// Heuristic category of a text fragment, inferred from length and
/// punctuation by `TextFilter::infer_text_type`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum TextType {
Title, Content, Link, Button, Label, Alt, Placeholder, Other, }
/// Translation urgency. The derived `Ord` follows declaration order
/// (Low < Normal < High < Critical); batch grouping and sorting rely on it.
/// The explicit discriminants are informational only (no `repr` attribute).
#[derive(Debug, Clone, PartialEq, PartialOrd, Ord, Eq, Hash)]
pub enum TextPriority {
Low = 1,
Normal = 2,
High = 3,
Critical = 4,
}
/// A group of `TextItem`s scheduled for translation together.
#[derive(Debug, Clone)]
pub struct Batch {
/// Unique id handed out by `BatchManager` (starts at 1, atomic counter).
pub id: usize,
/// The items in this batch, in their original grouping order.
pub items: Vec<TextItem>,
/// Highest priority found among `items`.
pub priority: TextPriority,
/// Sum of `text.len()` (bytes) over all items; used as a size estimate.
pub estimated_chars: usize,
/// Creation time; used as a tie-breaker when sorting batches.
pub created_at: Instant,
}
/// Result of running one text through `TextFilter::analyze_text`.
#[derive(Debug, Clone)]
pub struct TextAnalysis {
/// True when `translatability_score` exceeds the 0.5 threshold.
pub should_translate: bool,
/// Score in [0, 1]; starts at 1.0 and is multiplied down per feature hit.
pub translatability_score: f32,
pub text_type: TextType,
pub priority: TextPriority,
/// Tags of the heuristics that fired (e.g. "url", "numeric", "code").
pub features: Vec<String>,
}
// Matches http(s) URLs, www-prefixed hosts, or bare domain-like tokens.
// NOTE(review): the third alternative (`foo.tld`) is unanchored, so any prose
// containing a domain-like token (e.g. "visit example.com today") is treated
// as a URL and excluded from translation — confirm this is intended.
static URL_REGEX: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"https?://[^\s]+|www\.[^\s]+|[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
.expect("URL regex should be valid")
});
// Matches an e-mail address anywhere in the text (unanchored substring match).
static EMAIL_REGEX: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
.expect("Email regex should be valid")
});
// Heuristic for source code: leading punctuation, common operators, or
// programming keywords.
// NOTE(review): the operator class matches a single `!`, `-`, `<`, etc.
// anywhere, so ordinary prose like "Hello!" or hyphenated words also trigger
// the "code" penalty in `analyze_text` — verify this is acceptable.
static CODE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"^\s*[{}\[\]();,]|[=+\-*/%<>!&|^~]|\b(function|var|let|const|if|else|for|while|return|class|def|import|export)\b")
.expect("Code regex should be valid")
});
// Matches one character in the CJK Unified Ideographs block (U+4E00–U+9FFF).
// Extension blocks (e.g. U+3400–U+4DBF) are not covered.
static CHINESE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"[\u4e00-\u9fff]")
.expect("Chinese regex should be valid")
});
/// Common UI/functional keywords (lowercase) that are usually not worth
/// translating on their own; consulted by `analyzers::is_functional_word`
/// after trimming and ASCII-lowercasing the input.
///
/// Fix: the original list contained the literal `"ok"` twice; the duplicate
/// was harmless (HashSet dedups) but redundant, and has been removed.
static FUNCTIONAL_WORDS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    [
        "ok", "yes", "no", "on", "off", "true", "false", "null", "none",
        "home", "back", "next", "prev", "close", "open", "save", "load",
        "new", "edit", "delete", "add", "remove", "clear", "reset",
        "login", "logout", "signup", "signin", "submit", "cancel",
        "confirm", "apply", "done", "finish", "start", "stop",
        "play", "pause", "resume", "skip", "retry", "refresh", "reload",
        "search", "filter", "sort", "view", "show", "hide", "toggle",
        "expand", "collapse", "minimize", "maximize", "restore",
        "copy", "paste", "cut", "undo", "redo", "select", "all",
        "help", "info", "about", "contact", "privacy", "terms",
        "settings", "config", "options", "preferences", "profile",
        "account", "user", "admin", "guest", "public", "private",
        "draft", "published", "archived", "deleted", "active", "inactive",
        "enabled", "disabled", "online", "offline", "available", "busy",
        "free", "premium", "pro", "basic", "standard", "advanced",
        "low", "medium", "high", "max", "min", "auto", "manual",
    ].into_iter().collect()
});
pub mod analyzers {
    //! Stateless text predicates and metrics used by `TextFilter::analyze_text`.
    use super::*;

    /// True when the trimmed text is at least two characters long.
    ///
    /// Fix: the original compared `trimmed.len() >= 2`, i.e. *bytes*, so a
    /// single multi-byte character (one CJK ideograph, one accented letter)
    /// wrongly passed the check. Counting `char`s treats one visible
    /// character as "too short", matching the intent.
    pub fn check_length(text: &str) -> bool {
        text.trim().chars().count() >= 2
    }

    /// True when the text contains nothing but whitespace.
    pub fn is_whitespace_only(text: &str) -> bool {
        text.trim().is_empty()
    }

    /// True when the trimmed text consists only of ASCII digits and the
    /// separators `.`, `,`, and space. Vacuously true for an empty trim,
    /// but callers reject empty text via `check_length` first.
    pub fn is_numeric_only(text: &str) -> bool {
        text.trim().chars().all(|c| c.is_ascii_digit() || c == '.' || c == ',' || c == ' ')
    }

    /// True when the trimmed text contains a URL or bare-domain token.
    pub fn is_url(text: &str) -> bool {
        URL_REGEX.is_match(text.trim())
    }

    /// True when the trimmed text contains an e-mail address.
    pub fn is_email(text: &str) -> bool {
        EMAIL_REGEX.is_match(text.trim())
    }

    /// Heuristic: true when the text looks like source code (keywords,
    /// operator characters, an arrow function, or a `{`...`}` pair).
    pub fn is_code(text: &str) -> bool {
        let trimmed = text.trim();
        CODE_REGEX.is_match(trimmed) ||
        trimmed.starts_with("function") ||
        trimmed.contains("=>") ||
        (trimmed.contains('{') && trimmed.contains('}'))
    }

    /// Fraction (0.0–1.0) of characters in the CJK Unified Ideographs block.
    pub fn chinese_char_ratio(text: &str) -> f32 {
        let total_chars = text.chars().count();
        if total_chars == 0 {
            return 0.0;
        }
        // Each regex match is exactly one character, so the match count is a
        // character count.
        let chinese_chars = CHINESE_REGEX.find_iter(text).count();
        chinese_chars as f32 / total_chars as f32
    }

    /// True when the trimmed, ASCII-lowercased text is a known UI keyword
    /// (see `FUNCTIONAL_WORDS`).
    pub fn is_functional_word(text: &str) -> bool {
        let trimmed = text.trim().to_ascii_lowercase();
        FUNCTIONAL_WORDS.contains(trimmed.as_str())
    }

    /// Fraction (0.0–1.0) of alphabetic characters.
    pub fn alphabetic_ratio(text: &str) -> f32 {
        let total_chars = text.chars().count();
        if total_chars == 0 {
            return 0.0;
        }
        let alphabetic_chars = text.chars().filter(|c| c.is_alphabetic()).count();
        alphabetic_chars as f32 / total_chars as f32
    }

    /// Fraction (0.0–1.0) of characters that are neither alphanumeric nor
    /// whitespace.
    pub fn special_char_density(text: &str) -> f32 {
        let total_chars = text.chars().count();
        if total_chars == 0 {
            return 0.0;
        }
        let special_chars = text.chars()
            .filter(|c| !c.is_alphanumeric() && !c.is_whitespace())
            .count();
        special_chars as f32 / total_chars as f32
    }
}
/// Stateless translatability filter; all decision logic lives in
/// `analyze_text`.
pub struct TextFilter;

impl TextFilter {
    /// Creates a new filter (the type carries no state).
    pub fn new() -> Self {
        TextFilter
    }

    /// Convenience wrapper: true when `analyze_text` decides the text is
    /// worth translating.
    pub fn should_translate(&self, text: &str) -> bool {
        self.analyze_text(text).should_translate
    }

    /// Scores one piece of text for translatability.
    ///
    /// The score starts at 1.0 and each detected feature multiplies in a
    /// penalty (URLs and e-mails zero it outright). The text is worth
    /// translating when the final score exceeds 0.5.
    pub fn analyze_text(&self, text: &str) -> TextAnalysis {
        use analyzers::*;

        // Reject empty, one-character, or whitespace-only strings up front.
        if !check_length(text) || is_whitespace_only(text) {
            return TextAnalysis {
                should_translate: false,
                translatability_score: 0.0,
                text_type: TextType::Other,
                priority: TextPriority::Low,
                features: vec!["too_short".to_string()],
            };
        }

        let mut features = Vec::new();
        let mut score = 1.0f32;
        {
            // Record a feature tag and fold its penalty into the score.
            let mut penalize = |hit: bool, tag: &str, factor: f32| {
                if hit {
                    features.push(tag.to_string());
                    score *= factor;
                }
            };
            // Same predicates, same order, same factors as always.
            penalize(is_numeric_only(text), "numeric", 0.1);
            penalize(is_url(text), "url", 0.0);
            penalize(is_email(text), "email", 0.0);
            penalize(is_code(text), "code", 0.2);
            penalize(is_functional_word(text), "functional", 0.3);
            penalize(chinese_char_ratio(text) > 0.3, "chinese", 0.1);
            penalize(alphabetic_ratio(text) < 0.3, "low_alphabetic", 0.5);
            penalize(special_char_density(text) > 0.5, "high_special_chars", 0.4);
        }

        let text_type = self.infer_text_type(text);
        let priority = self.infer_priority(&text_type, text.len());
        TextAnalysis {
            should_translate: score > 0.5,
            translatability_score: score,
            text_type,
            priority,
            features,
        }
    }

    /// Classifies text by byte length and sentence punctuation.
    // NOTE(review): `text.len()` is a byte count, so CJK text hits the
    // longer buckets quickly — presumably acceptable; confirm.
    fn infer_text_type(&self, text: &str) -> TextType {
        let has_punctuation = text.chars().any(|c| ".!?。!?".contains(c));
        match (has_punctuation, text.len()) {
            (false, 0..=9) => TextType::Label,
            (false, 10..=49) => TextType::Button,
            (true, len) if len > 20 => TextType::Content,
            _ => TextType::Other,
        }
    }

    /// Maps a text type (and, for content, its byte length) to a priority.
    fn infer_priority(&self, text_type: &TextType, length: usize) -> TextPriority {
        match text_type {
            TextType::Title => TextPriority::Critical,
            TextType::Button | TextType::Link => TextPriority::High,
            TextType::Content => {
                if length > 100 {
                    TextPriority::High
                } else {
                    TextPriority::Normal
                }
            }
            TextType::Label | TextType::Alt => TextPriority::Normal,
            TextType::Placeholder | TextType::Other => TextPriority::Low,
        }
    }

    /// Keeps only the texts worth translating, using rayon when the
    /// `translation` feature is enabled.
    #[cfg(feature = "translation")]
    pub fn filter_texts_parallel(&self, texts: Vec<String>) -> Vec<String> {
        texts
            .into_par_iter()
            .filter(|text| self.should_translate(text))
            .collect()
    }

    /// Sequential fallback when rayon is unavailable; preserves input order,
    /// like the parallel version.
    #[cfg(not(feature = "translation"))]
    pub fn filter_texts_parallel(&self, mut texts: Vec<String>) -> Vec<String> {
        texts.retain(|text| self.should_translate(text));
        texts
    }
}
impl Default for TextFilter {
fn default() -> Self {
Self::new()
}
}
/// Assembles `TextItem`s into prioritized `Batch`es and hands out unique,
/// monotonically increasing batch ids.
pub struct BatchManager {
// Next id to assign; incremented atomically, so ids stay unique even if the
// manager is shared across threads.
next_id: std::sync::atomic::AtomicUsize,
}
impl BatchManager {
    /// Creates a manager whose batch ids start at 1.
    pub fn new() -> Self {
        Self {
            next_id: std::sync::atomic::AtomicUsize::new(1),
        }
    }

    /// Drops blank items, groups the rest by priority, splits oversized
    /// groups into batches, and returns the batches ordered highest
    /// priority first.
    pub fn create_batches(&self, items: Vec<TextItem>) -> Vec<Batch> {
        let non_blank: Vec<TextItem> = items
            .into_iter()
            .filter(|item| !item.text.trim().is_empty())
            .collect();
        let grouped = self.group_by_priority(non_blank);
        let sized = self.optimize_batch_sizes(grouped);
        self.sort_by_priority(sized)
    }

    /// Buckets items by priority; buckets come back highest priority first,
    /// with each bucket preserving the items' input order.
    fn group_by_priority(&self, items: Vec<TextItem>) -> Vec<Vec<TextItem>> {
        use std::collections::HashMap;
        let mut buckets: HashMap<TextPriority, Vec<TextItem>> = HashMap::new();
        for item in items {
            buckets.entry(item.priority.clone()).or_default().push(item);
        }
        let mut grouped: Vec<Vec<TextItem>> = buckets.into_values().collect();
        // Buckets are never empty, so the first item's priority stands for
        // the whole group; reversed comparison sorts descending.
        grouped.sort_by(|lhs, rhs| {
            let lhs_key = lhs.first().map(|item| &item.priority);
            let rhs_key = rhs.first().map(|item| &item.priority);
            rhs_key.cmp(&lhs_key)
        });
        grouped
    }

    /// Splits groups larger than `MAX_BATCH_SIZE` into chunks, then drops
    /// chunks below `MIN_BATCH_SIZE` unless they contain at least one
    /// High/Critical item (small low-priority remainders are discarded).
    fn optimize_batch_sizes(&self, groups: Vec<Vec<TextItem>>) -> Vec<Batch> {
        const MAX_BATCH_SIZE: usize = 50;
        const MIN_BATCH_SIZE: usize = 5;
        let mut batches = Vec::new();
        for group in groups {
            let chunks: Vec<Vec<TextItem>> = if group.len() <= MAX_BATCH_SIZE {
                vec![group]
            } else {
                group.chunks(MAX_BATCH_SIZE).map(|c| c.to_vec()).collect()
            };
            for chunk in chunks {
                let keep = chunk.len() >= MIN_BATCH_SIZE
                    || chunk.iter().any(|item| item.priority >= TextPriority::High);
                if keep {
                    batches.push(self.create_batch(chunk));
                }
            }
        }
        batches
    }

    /// Wraps a set of items in a `Batch`, assigning a fresh id and taking
    /// the highest item priority as the batch priority.
    fn create_batch(&self, items: Vec<TextItem>) -> Batch {
        use std::sync::atomic::Ordering;
        let id = self.next_id.fetch_add(1, Ordering::SeqCst);
        let priority = items
            .iter()
            .map(|item| item.priority.clone())
            .max()
            .unwrap_or(TextPriority::Low);
        let estimated_chars: usize = items.iter().map(|item| item.text.len()).sum();
        Batch {
            id,
            items,
            priority,
            estimated_chars,
            created_at: Instant::now(),
        }
    }

    /// Orders batches highest priority first; equal priorities keep the
    /// oldest (earliest-created) batch first.
    fn sort_by_priority(&self, mut batches: Vec<Batch>) -> Vec<Batch> {
        batches.sort_by(|a, b| {
            b.priority
                .cmp(&a.priority)
                .then_with(|| a.created_at.cmp(&b.created_at))
        });
        batches
    }
}
impl Default for BatchManager {
fn default() -> Self {
Self::new()
}
}
// Tap-style helper: `x.pipe(f)` is just `f(x)`, letting free functions chain
// left-to-right (used by `BatchManager::create_batches`). Blanket-implemented
// for every sized type.
trait Pipe: Sized {
fn pipe<F, R>(self, f: F) -> R
where
F: FnOnce(Self) -> R;
}
impl<T> Pipe for T {
fn pipe<F, R>(self, f: F) -> R
where
F: FnOnce(Self) -> R,
{
f(self)
}
}
/// Builds a `TextItem` from raw text by running it through a fresh
/// `TextFilter` analysis.
///
/// Note: `complexity` is populated with the analysis' translatability
/// score — there is no separate complexity computation.
pub fn create_text_item(text: String, location: String) -> TextItem {
    // Analyze before moving `text` into the item.
    let analysis = TextFilter::new().analyze_text(&text);
    TextItem {
        text,
        text_type: analysis.text_type,
        priority: analysis.priority,
        complexity: analysis.translatability_score,
        location,
    }
}
/// Analyzes every text in the slice, in parallel (via rayon) when the
/// `translation` feature is enabled, sequentially otherwise. Output order
/// matches input order in both cases.
pub fn batch_analyze_texts(texts: &[String]) -> Vec<TextAnalysis> {
    let filter = TextFilter::default();
    // Exactly one of the two cfg'd blocks compiles; it is the tail expression.
    #[cfg(feature = "translation")]
    {
        texts.par_iter().map(|text| filter.analyze_text(text)).collect()
    }
    #[cfg(not(feature = "translation"))]
    {
        texts.iter().map(|text| filter.analyze_text(text)).collect()
    }
}
/// Convenience wrapper: batches `items` using a throwaway `BatchManager`
/// (batch ids therefore restart at 1 on every call).
pub fn create_optimized_batches(items: Vec<TextItem>) -> Vec<Batch> {
    BatchManager::new().create_batches(items)
}
#[cfg(test)]
mod tests {
use super::*;
// End-to-end filter decision: real text passes; empty/whitespace/numeric
// strings and URLs/e-mails are rejected.
#[test]
fn test_text_filter_basic() {
let filter = TextFilter::new();
assert!(filter.should_translate("Hello World"));
assert!(!filter.should_translate(""));
assert!(!filter.should_translate(" "));
assert!(!filter.should_translate("123"));
assert!(!filter.should_translate("https://example.com"));
assert!(!filter.should_translate("test@example.com"));
}
// Spot-checks the individual predicate helpers in `analyzers`.
#[test]
fn test_analyzers() {
use analyzers::*;
assert!(check_length("Hello"));
assert!(!check_length("H"));
assert!(!check_length(""));
assert!(is_whitespace_only(" "));
assert!(!is_whitespace_only("Hello"));
assert!(is_numeric_only("123"));
assert!(!is_numeric_only("123abc"));
assert!(is_url("https://example.com"));
assert!(!is_url("hello world"));
assert!(is_email("test@example.com"));
assert!(!is_email("not an email"));
}
// Five same-priority items meet MIN_BATCH_SIZE, so at least one batch
// must come out.
#[test]
fn test_batch_manager() {
let manager = BatchManager::new();
let items = vec![
create_text_item("Hello World, this is a longer text".to_string(), "p1".to_string()),
create_text_item("Another longer text for testing".to_string(), "p2".to_string()),
create_text_item("Third longer text item".to_string(), "p3".to_string()),
create_text_item("Fourth longer text item".to_string(), "p4".to_string()),
create_text_item("Fifth longer text item".to_string(), "p5".to_string()),
];
let batches = manager.create_batches(items);
assert!(!batches.is_empty());
}
// A short, punctuation-free phrase scores above the 0.5 threshold and is
// typed as Button or Label depending on length.
#[test]
fn test_text_analysis() {
let filter = TextFilter::new();
let analysis = filter.analyze_text("Hello World");
assert!(analysis.should_translate);
assert!(analysis.translatability_score > 0.5);
assert!(matches!(analysis.text_type, TextType::Button | TextType::Label));
}
}