use anyhow::{Context, Result};
use serde_json::{Value, json};
/// Check a JSON document for signs of mixed human languages.
///
/// The value is pretty-serialized and the text is scanned for four
/// scripts (ASCII Latin, CJK, Cyrillic, Arabic). If more than two are
/// present, a warning is printed to stderr — the document is still
/// accepted. Keys are then validated via [`validate_json_keys`], whose
/// error (if any) is propagated.
///
/// # Errors
/// Returns an error if serialization fails or any object key contains
/// non-identifier characters.
fn validate_json_language_consistency(json: &Value) -> Result<()> {
    let rendered = serde_json::to_string_pretty(json)
        .context("Failed to serialize JSON for validation")?;

    // One detector per script; count how many scripts appear at all.
    let detectors: [fn(char) -> bool; 4] = [
        |c| c.is_ascii_alphabetic(),
        is_cjk_character,
        is_cyrillic_character,
        is_arabic_character,
    ];
    let script_count = detectors
        .iter()
        .filter(|detect| rendered.chars().any(|c| detect(c)))
        .count();

    // Two scripts can coexist legitimately (e.g. code + one natural
    // language); three or more is suspicious enough to warn about.
    if script_count > 2 {
        eprintln!(
            "Warning: JSON contains {} different scripts - possible language mixing",
            script_count
        );
    }

    validate_json_keys(json)?;
    Ok(())
}
/// Recursively ensure every object key in `value` is a plain ASCII
/// identifier (alphanumerics plus `_` and `-`).
///
/// Walks objects and arrays depth-first; scalar values are ignored. The
/// first offending key aborts the walk with an error that includes a
/// sanitized rename suggestion from [`sanitize_key_name`].
fn validate_json_keys(value: &Value) -> Result<()> {
    let is_identifier_char = |c: char| c.is_ascii_alphanumeric() || c == '_' || c == '-';

    match value {
        Value::Object(map) => {
            for (key, nested) in map {
                if key.chars().any(|c| !is_identifier_char(c)) {
                    anyhow::bail!(
                        "JSON key '{}' contains non-identifier characters - possible language mixing.\n\
                        Keys must be valid identifiers (ASCII alphanumeric + underscore/hyphen).\n\
                        Suggestion: Rename to '{}' or use camelCase.",
                        key,
                        sanitize_key_name(key)
                    );
                }
                validate_json_keys(nested)?;
            }
            Ok(())
        }
        Value::Array(items) => items.iter().try_for_each(validate_json_keys),
        _ => Ok(()),
    }
}
/// Produce an identifier-safe rename suggestion for an invalid JSON key.
///
/// Every character outside the allowed set (ASCII alphanumerics, `_`,
/// `-`) is replaced with an underscore, then leading and trailing
/// underscores are stripped. A key that ends up empty maps to the
/// placeholder `"sanitized_key"`.
fn sanitize_key_name(key: &str) -> String {
    let mut replaced = String::with_capacity(key.len());
    for c in key.chars() {
        match c {
            'a'..='z' | 'A'..='Z' | '0'..='9' | '_' | '-' => replaced.push(c),
            _ => replaced.push('_'),
        }
    }
    // Only underscores are trimmed; hyphens at the edges are kept.
    match replaced.trim_matches('_') {
        "" => "sanitized_key".to_string(),
        cleaned => cleaned.to_string(),
    }
}
/// Check a markdown document for language switches between `#` sections.
///
/// Lines are accumulated per section (a line starting with `#` flushes
/// the previous section); the predominant script of each section is
/// detected, and any section whose script differs from the first
/// section's — and is not [`Script::Mixed`] — produces a warning on
/// stderr. Always returns `Ok(())`; mixing is warned about, not rejected.
///
/// Fix: the original called `detect_predominant_script(¤t_section_chars)`
/// — a mojibake corruption of `&current_section_chars` (the `&curren`
/// prefix rendered as the HTML entity `¤`) that does not compile. The
/// intended borrow is restored here, at both call sites.
fn validate_markdown_language_consistency(markdown: &str) -> Result<()> {
    let mut section_scripts = Vec::new();
    let mut current_section_chars = String::new();
    for line in markdown.lines() {
        // A heading starts a new section; flush the accumulated text first.
        if line.starts_with('#') && !current_section_chars.is_empty() {
            section_scripts.push(detect_predominant_script(&current_section_chars));
            current_section_chars.clear();
        }
        current_section_chars.push_str(line);
    }
    // Flush the trailing section, if any.
    if !current_section_chars.is_empty() {
        section_scripts.push(detect_predominant_script(&current_section_chars));
    }
    if section_scripts.len() > 1 {
        let first_script = section_scripts[0];
        for (idx, &script) in section_scripts.iter().enumerate().skip(1) {
            if script != first_script && script != Script::Mixed {
                eprintln!(
                    "Warning: Markdown section {} changed from {:?} to {:?}",
                    idx, first_script, script
                );
            }
        }
    }
    Ok(())
}
/// Writing-system classification produced by `detect_predominant_script`.
///
/// Fix: removed the `#[allow(non_camel_case_types)]` attribute that sat
/// on the `Cjk` variant — `Cjk` already follows UpperCamelCase, so the
/// allow was dead code with no explanatory comment.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Script {
    /// ASCII Latin letters.
    Latin,
    /// Chinese, Japanese, or Korean characters.
    Cjk,
    /// Basic Cyrillic block characters.
    Cyrillic,
    /// Basic Arabic block characters.
    Arabic,
    /// No single script reaches the dominance threshold.
    Mixed,
    /// No alphabetic characters at all (or nothing classifiable).
    Unknown,
}
/// Classify the dominant writing system of `text`.
///
/// All alphabetic characters are counted, then each of the four known
/// scripts is tallied independently. If the largest tally falls below
/// 70% of the alphabetic total the text is [`Script::Mixed`]. Ties are
/// broken in declaration order: Latin, then CJK, Cyrillic, Arabic. Text
/// with no alphabetic characters is [`Script::Unknown`].
fn detect_predominant_script(text: &str) -> Script {
    let alphabetic_total = text.chars().filter(|c| c.is_alphabetic()).count();
    if alphabetic_total == 0 {
        return Script::Unknown;
    }

    // Tally order doubles as the tie-break priority.
    let mut tallies = [
        (Script::Latin, 0usize),
        (Script::Cjk, 0),
        (Script::Cyrillic, 0),
        (Script::Arabic, 0),
    ];
    for c in text.chars() {
        if c.is_ascii_alphabetic() {
            tallies[0].1 += 1;
        }
        if is_cjk_character(c) {
            tallies[1].1 += 1;
        }
        if is_cyrillic_character(c) {
            tallies[2].1 += 1;
        }
        if is_arabic_character(c) {
            tallies[3].1 += 1;
        }
    }

    let max_count = tallies.iter().map(|&(_, n)| n).max().unwrap_or(0);
    // Without a 70% majority, no script "wins" outright.
    if max_count < alphabetic_total * 70 / 100 {
        return Script::Mixed;
    }
    tallies
        .iter()
        .find(|&&(_, n)| n == max_count)
        .map(|&(script, _)| script)
        .unwrap_or(Script::Unknown)
}
/// True for characters in the common CJK blocks: CJK Unified Ideographs
/// (plus Extensions A and B), Hiragana, Katakana, and Hangul Syllables.
fn is_cjk_character(c: char) -> bool {
    let code = c as u32;
    (0x4E00..=0x9FFF).contains(&code)      // CJK Unified Ideographs
        || (0x3400..=0x4DBF).contains(&code)   // Extension A
        || (0x20000..=0x2A6DF).contains(&code) // Extension B
        || (0x3040..=0x309F).contains(&code)   // Hiragana
        || (0x30A0..=0x30FF).contains(&code)   // Katakana
        || (0xAC00..=0xD7AF).contains(&code)   // Hangul Syllables
}
/// True for characters in the basic Cyrillic block (U+0400..=U+04FF).
fn is_cyrillic_character(c: char) -> bool {
    ('\u{0400}'..='\u{04FF}').contains(&c)
}
/// True for characters in the basic Arabic block (U+0600..=U+06FF).
fn is_arabic_character(c: char) -> bool {
    ('\u{0600}'..='\u{06FF}').contains(&c)
}
// Unit tests for the language-consistency validators. Many string
// literals deliberately mix scripts (CJK, Cyrillic, Arabic) to exercise
// detection; they must be preserved exactly.
#[cfg(test)]
mod unit_tests {
    use super::*;

    // Single-script JSON with identifier keys passes cleanly.
    #[test]
    fn test_valid_json_with_consistent_language() {
        let json = json!({
            "status": "success",
            "message": "Operation completed successfully",
            "data": {
                "count": 42,
                "items": ["apple", "banana", "cherry"]
            }
        });
        assert!(validate_json_language_consistency(&json).is_ok());
    }

    // A CJK key violates the ASCII-identifier rule and is rejected.
    #[test]
    fn test_json_with_invalid_key_characters() {
        let json_str = r#"{"状态": "success", "message": "test"}"#;
        let json: Value = serde_json::from_str(json_str).unwrap();
        let result = validate_json_language_consistency(&json);
        assert!(result.is_err());
        let err_msg = result.unwrap_err().to_string();
        assert!(
            err_msg.contains("non-identifier") || err_msg.contains("language mixing"),
            "Expected error about non-identifier characters, got: {}",
            err_msg
        );
    }

    // Mixed-language *values* are allowed; only keys are constrained.
    #[test]
    fn test_json_with_mixed_language_values() {
        let json = json!({
            "english_text": "Hello world",
            "chinese_text": "你好世界",
            "mixed_text": "Hello 世界"
        });
        assert!(validate_json_language_consistency(&json).is_ok());
    }

    // Consistent English markdown: no warnings, always Ok.
    #[test]
    fn test_markdown_with_consistent_language() {
        let markdown = r#"
# Introduction
This is a test document in English.
It should maintain consistent language throughout.
## Details
More content in the same language.
"#;
        assert!(validate_markdown_language_consistency(markdown).is_ok());
    }

    // Section-level language switching only warns; validation still Ok.
    #[test]
    fn test_markdown_with_section_language_switching() {
        let markdown = r#"
# English Section
This is in English.
# 中文部分
这是中文内容。
"#;
        assert!(validate_markdown_language_consistency(markdown).is_ok());
    }

    #[test]
    fn test_script_detection_latin() {
        let text = "Hello world, this is an English text.";
        assert_eq!(detect_predominant_script(text), Script::Latin);
    }

    #[test]
    fn test_script_detection_cjk() {
        let text = "这是中文文本,包含一些汉字。";
        assert_eq!(detect_predominant_script(text), Script::Cjk);
    }

    // Roughly 50/50 CJK/Latin: the exact outcome depends on the 70%
    // threshold, so any of the three classifications is acceptable.
    #[test]
    fn test_script_detection_mixed() {
        let text = "你好世界这是中文测试内容 Hello world this is English test content";
        let script = detect_predominant_script(text);
        assert!(
            matches!(script, Script::Mixed | Script::Latin | Script::Cjk),
            "Expected Mixed, Latin, or Cjk, got {:?}",
            script
        );
    }

    // 'А'/'Я' below are Cyrillic code points, visually similar to ASCII.
    #[test]
    fn test_cyrillic_detection() {
        assert!(is_cyrillic_character('А'));
        assert!(is_cyrillic_character('Я'));
        assert!(!is_cyrillic_character('A'));
    }

    // Covers Han ideographs and a Hangul syllable.
    #[test]
    fn test_cjk_detection() {
        assert!(is_cjk_character('中'));
        assert!(is_cjk_character('日'));
        assert!(is_cjk_character('한'));
        assert!(!is_cjk_character('A'));
    }

    #[test]
    fn test_arabic_detection() {
        assert!(is_arabic_character('ا'));
        assert!(is_arabic_character('ب'));
        assert!(!is_arabic_character('A'));
    }

    // Key validation must recurse through nested objects.
    #[test]
    fn test_nested_json_validation() {
        let json = json!({
            "level1": {
                "level2": {
                    "level3": {
                        "valid_key": "value"
                    }
                }
            }
        });
        assert!(validate_json_language_consistency(&json).is_ok());
    }

    // Key validation must also recurse into array elements.
    #[test]
    fn test_json_array_validation() {
        let json = json!({
            "items": [
                {"name": "item1", "value": 1},
                {"name": "item2", "value": 2}
            ]
        });
        assert!(validate_json_language_consistency(&json).is_ok());
    }

    // Non-alphabetic symbols (e.g. '✓') in values must not trip the
    // script checks or key validation.
    #[test]
    fn test_emoji_in_json_values() {
        let json = json!({
            "status": "success [DONE]",
            "message": "Operation completed ✓ ",
            "data": {
                "celebration": "[ROCKET][PARTY]"
            }
        });
        assert!(validate_json_language_consistency(&json).is_ok());
    }

    // 15 levels of nesting: the recursive walk must not blow up.
    #[test]
    fn test_deeply_nested_json() {
        let mut nested = json!({"valid_key": "value"});
        for i in 0..15 {
            nested = json!({
                format!("level_{}", i): nested
            });
        }
        assert!(
            validate_json_language_consistency(&nested).is_ok(),
            "Deeply nested JSON should validate successfully"
        );
    }

    // Code snippets embedding another language in values are fine.
    #[test]
    fn test_mixed_content_in_code_snippets() {
        let json = json!({
            "code_example": "const greeting = '你好'; // Chinese hello",
            "description": "This demonstrates internationalization",
            "language": "javascript"
        });
        assert!(validate_json_language_consistency(&json).is_ok());
    }

    // Empty objects/arrays have no keys to reject and no scripts to mix.
    #[test]
    fn test_empty_json_structures() {
        let empty_object = json!({});
        let empty_array = json!([]);
        let mixed = json!({
            "empty_obj": {},
            "empty_arr": [],
            "nested_empty": {
                "inner": {}
            }
        });
        assert!(validate_json_language_consistency(&empty_object).is_ok());
        assert!(validate_json_language_consistency(&empty_array).is_ok());
        assert!(validate_json_language_consistency(&mixed).is_ok());
    }

    // Pin the sanitizer's replacement, trimming, and fallback behavior.
    #[test]
    fn test_sanitize_key_name_suggestions() {
        assert_eq!(sanitize_key_name("状态"), "sanitized_key"); assert_eq!(sanitize_key_name("my key"), "my_key");
        assert_eq!(sanitize_key_name("test-key"), "test-key");
        assert_eq!(sanitize_key_name("valid_key"), "valid_key");
        assert_eq!(sanitize_key_name("_underscore_"), "underscore");
        assert_eq!(sanitize_key_name("___"), "sanitized_key"); }
}
/// Validate every JSON response in a multi-turn conversation.
///
/// Each response is checked with [`validate_json_language_consistency`];
/// the first failure is returned with the (0-based) response index
/// attached as context so the caller can tell which turn was at fault.
///
/// # Errors
/// Propagates the first validation failure, wrapped with the index.
pub fn validate_conversation_language_consistency(responses: &[Value]) -> Result<()> {
    for (idx, response) in responses.iter().enumerate() {
        // `with_context` keeps the underlying error chain intact (and is
        // lazy), unlike the previous `map_err` + `anyhow!` which
        // flattened the cause into a new string eagerly.
        validate_json_language_consistency(response)
            .with_context(|| format!("Response {} failed validation", idx))?;
    }
    Ok(())
}
/// Validate a single tool response for language and shape conventions.
///
/// Keys must be identifier-safe (checked via [`validate_json_keys`],
/// whose error propagates). Additionally, a top-level object is expected
/// to carry at least one of the conventional `success` / `error` /
/// `message` fields; a response missing all three only triggers a
/// stderr warning tagged with `tool_name`, never an error.
pub fn validate_tool_response_language(tool_name: &str, response: &Value) -> Result<()> {
    validate_json_keys(response)?;

    if let Some(obj) = response.as_object() {
        let has_standard_field = ["success", "error", "message"]
            .iter()
            .any(|field| obj.contains_key(*field));
        if !has_standard_field {
            eprintln!(
                "Warning: Tool '{}' response missing standard fields (success/error/message)",
                tool_name
            );
        }
    }
    Ok(())
}
// Integration tests driving real tool executions through `ToolRegistry`
// and validating the responses with the checkers above. These depend on
// external crates (assert_fs, vtcode_core, tokio) and on ToolRegistry
// behavior not visible in this file.
#[cfg(test)]
mod integration_tests {
    use super::*;
    use assert_fs::TempDir;
    use vtcode_core::config::constants::tools;
    use vtcode_core::tools::ToolRegistry;

    // read_file on a freshly written file: response must pass both the
    // tool-shape check and the whole-document consistency check.
    #[tokio::test]
    async fn test_read_file_response_language_consistency() {
        let temp_dir = TempDir::new().unwrap();
        let test_file = temp_dir.path().join("test.txt");
        std::fs::write(&test_file, "Test content").unwrap();
        let registry = ToolRegistry::new(temp_dir.path().to_path_buf()).await;
        // NOTE(review): result deliberately ignored — presumably best-effort
        // permission setup; confirm allow_all_tools cannot fail fatally.
        let _ = registry.allow_all_tools().await;
        let response = registry
            .execute_tool(
                tools::READ_FILE,
                json!({
                    "path": "test.txt"
                }),
            )
            .await
            .unwrap();
        assert!(
            validate_tool_response_language(tools::READ_FILE, &response).is_ok(),
            "read_file response should maintain language consistency"
        );
        assert!(
            validate_json_language_consistency(&response).is_ok(),
            "read_file response JSON should be consistent"
        );
    }

    // Listing via the unified search tool over a dir with two files.
    // NOTE(review): test name says list_files but invokes UNIFIED_SEARCH —
    // presumably the tools were consolidated; confirm against registry.
    #[tokio::test]
    async fn test_list_files_response_language_consistency() {
        let temp_dir = TempDir::new().unwrap();
        std::fs::write(temp_dir.path().join("file1.txt"), "content1").unwrap();
        std::fs::write(temp_dir.path().join("file2.txt"), "content2").unwrap();
        let registry = ToolRegistry::new(temp_dir.path().to_path_buf()).await;
        let _ = registry.allow_all_tools().await;
        let response = registry
            .execute_tool(
                tools::UNIFIED_SEARCH,
                json!({
                    "path": ".",
                    "per_page": 10
                }),
            )
            .await
            .unwrap();
        assert!(
            validate_tool_response_language(tools::UNIFIED_SEARCH, &response).is_ok(),
            "list_files response should maintain language consistency"
        );
        assert!(
            validate_json_language_consistency(&response).is_ok(),
            "list_files response JSON should be consistent"
        );
    }

    // write_file in overwrite mode: response shape and language checks.
    #[tokio::test]
    async fn test_write_file_response_language_consistency() {
        let temp_dir = TempDir::new().unwrap();
        let registry = ToolRegistry::new(temp_dir.path().to_path_buf()).await;
        let _ = registry.allow_all_tools().await;
        let response = registry
            .execute_tool(
                tools::WRITE_FILE,
                json!({
                    "path": "output.txt",
                    "content": "Test output content",
                    "mode": "overwrite"
                }),
            )
            .await
            .unwrap();
        assert!(
            validate_tool_response_language(tools::WRITE_FILE, &response).is_ok(),
            "write_file response should maintain language consistency"
        );
        assert!(
            validate_json_language_consistency(&response).is_ok(),
            "write_file response JSON should be consistent"
        );
    }

    // Full write -> read -> search sequence; all collected responses must
    // pass the conversation-level validator together.
    #[tokio::test]
    async fn test_multi_tool_conversation_consistency() {
        let temp_dir = TempDir::new().unwrap();
        let registry = ToolRegistry::new(temp_dir.path().to_path_buf()).await;
        let _ = registry.allow_all_tools().await;
        let mut responses = Vec::new();
        let write_response = registry
            .execute_tool(
                tools::WRITE_FILE,
                json!({
                    "path": "test.txt",
                    "content": "Initial content",
                    "mode": "overwrite"
                }),
            )
            .await
            .unwrap();
        responses.push(write_response);
        let read_response = registry
            .execute_tool(
                tools::READ_FILE,
                json!({
                    "path": "test.txt"
                }),
            )
            .await
            .unwrap();
        responses.push(read_response);
        let list_response = registry
            .execute_tool(
                tools::UNIFIED_SEARCH,
                json!({
                    "path": ".",
                    "per_page": 10
                }),
            )
            .await
            .unwrap();
        responses.push(list_response);
        assert!(
            validate_conversation_language_consistency(&responses).is_ok(),
            "Multi-turn conversation should maintain language consistency across all tool calls"
        );
    }
}