use crate::engine::RedactionEngine;
use crate::scan::{scan_text_with_engine_with_lang, RedactionResult, ScanError};
/// Provenance of a [`LanguageHint`].
///
/// The provenance decides, via [`LangHintSource::is_trusted`], whether the
/// hinted language may be used at all.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum LangHintSource {
    /// Set by [`LanguageHint::caller_provided`]; trusted.
    CallerProvided,
    /// Set by [`LanguageHint::fixture`]; trusted.
    FixtureExperimental,
    /// Set by [`LanguageHint::heuristic`] and by the built-in detector; never trusted.
    Heuristic,
}

impl LangHintSource {
    /// Returns `true` for sources whose language may be acted upon.
    ///
    /// The match is deliberately exhaustive (no wildcard arm) so that adding a
    /// variant forces an explicit trust decision here.
    pub fn is_trusted(&self) -> bool {
        match *self {
            Self::CallerProvided | Self::FixtureExperimental => true,
            Self::Heuristic => false,
        }
    }
}
/// A language tag together with where it came from and how confident it is.
///
/// Whether the tag is actually used is decided by [`LanguageHint::lang_str`],
/// which looks at both `source` and `confidence`.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct LanguageHint {
/// Language tag. The detector in this file emits two-letter codes such as
/// `"de"` or `"zh"`; the constructors accept any string without validation.
pub lang: String,
/// Provenance of the hint.
pub source: LangHintSource,
/// Confidence score. The constructors in this file set it to `1.0` or to a
/// value clamped to `0.0..=1.0`, but the field is public and is not
/// validated when written directly.
pub confidence: f32,
}
/// Minimum `confidence` a hint needs for [`LanguageHint::lang_str`] to return
/// its language; hints below this value yield `None`.
pub const LANG_HINT_TRUSTED_CONFIDENCE: f32 = 0.5;
impl LanguageHint {
    /// Builds a hint with source [`LangHintSource::CallerProvided`] and confidence `1.0`.
    pub fn caller_provided(lang: impl Into<String>) -> Self {
        Self {
            lang: lang.into(),
            source: LangHintSource::CallerProvided,
            confidence: 1.0,
        }
    }

    /// Builds a hint with source [`LangHintSource::FixtureExperimental`] and confidence `1.0`.
    pub fn fixture(lang: impl Into<String>) -> Self {
        Self {
            lang: lang.into(),
            source: LangHintSource::FixtureExperimental,
            confidence: 1.0,
        }
    }

    /// Builds a hint with source [`LangHintSource::Heuristic`].
    ///
    /// `confidence` is clamped to `0.0..=1.0`; a NaN input is stored as `0.0`.
    /// Because the heuristic source is not trusted, [`Self::lang_str`] returns
    /// `None` for such a hint regardless of the confidence.
    pub fn heuristic(lang: impl Into<String>, confidence: f32) -> Self {
        // `f32::clamp` hands NaN straight back, so normalise it first to keep
        // the stored value inside the documented range.
        let confidence = if confidence.is_nan() {
            0.0
        } else {
            confidence.clamp(0.0, 1.0)
        };
        Self {
            lang: lang.into(),
            source: LangHintSource::Heuristic,
            confidence,
        }
    }

    /// Runs [`detect_lang_heuristic`] on `text`.
    pub fn detect(text: &str) -> Self {
        detect_lang_heuristic(text)
    }

    /// Returns the language only when the hint may be acted upon.
    ///
    /// `Some(lang)` requires both a trusted source
    /// ([`LangHintSource::is_trusted`]) and a confidence of at least
    /// [`LANG_HINT_TRUSTED_CONFIDENCE`]. Everything else, including a NaN
    /// confidence, yields `None`.
    pub fn lang_str(&self) -> Option<&str> {
        // Written as `>=` rather than negating `<`: every comparison with NaN
        // is false, so this form rejects NaN instead of letting it through.
        let confident = self.confidence >= LANG_HINT_TRUSTED_CONFIDENCE;
        if self.source.is_trusted() && confident {
            Some(self.lang.as_str())
        } else {
            None
        }
    }
}
pub fn detect_lang_heuristic(text: &str) -> LanguageHint {
use LangHintSource::Heuristic;
let mut cjk = 0;
let mut hira = 0;
let mut kata = 0;
let mut hangul = 0;
for ch in text.chars() {
let cp = ch as u32;
if (0x4E00..=0x9FFF).contains(&cp) {
cjk += 1;
} else if (0x3040..=0x309F).contains(&cp) {
hira += 1;
} else if (0x30A0..=0x30FF).contains(&cp) {
kata += 1;
} else if (0xAC00..=0xD7AF).contains(&cp) {
hangul += 1;
}
}
if hangul >= 2 {
return LanguageHint {
lang: "ko".to_string(),
source: Heuristic,
confidence: 0.9,
};
}
if hira + kata >= 2 {
return LanguageHint {
lang: "ja".to_string(),
source: Heuristic,
confidence: 0.9,
};
}
if cjk >= 2 {
return LanguageHint {
lang: "zh".to_string(),
source: Heuristic,
confidence: 0.85,
};
}
let low = text.to_lowercase();
let de_hints = [
"herr ",
"frau ",
"straße",
"strasse",
"gmbh",
"münchen",
"berlin",
"hamburg",
"köln",
"müller",
"schmidt",
" und ",
"ich bin",
"guten tag",
"bitte",
"webseite",
"konto",
"verwendet",
"verfügbar",
"geboren am",
];
let fr_hints = [
"monsieur",
"madame",
"bonjour",
"paris",
"lyon",
"marseille",
"merci",
"veuillez",
"envoyer",
"visitez",
"né le ",
"née le ",
"téléphone",
"adresse:",
" et ",
];
let it_hints = [
"signor",
"signora",
"roma",
"milano",
"napoli",
"bologna",
"buongiorno",
"grazie",
"contatta",
"telefono",
"nato il",
"nata il",
"codice fiscale",
"visita ",
"indirizzo:",
" e ",
];
let es_hints = [
"señor",
"señora",
"calle ",
"madrid",
"barcelona",
"valencia",
"sevilla",
"gracias",
"por favor",
"avenida",
];
if de_hints.iter().any(|h| low.contains(h)) {
return LanguageHint {
lang: "de".to_string(),
source: Heuristic,
confidence: 0.7,
};
}
if fr_hints.iter().any(|h| low.contains(h)) {
return LanguageHint {
lang: "fr".to_string(),
source: Heuristic,
confidence: 0.7,
};
}
if it_hints.iter().any(|h| low.contains(h)) {
return LanguageHint {
lang: "it".to_string(),
source: Heuristic,
confidence: 0.7,
};
}
if es_hints.iter().any(|h| low.contains(h)) {
return LanguageHint {
lang: "es".to_string(),
source: Heuristic,
confidence: 0.7,
};
}
let has_de_chars = text
.chars()
.any(|c| matches!(c, 'ä' | 'ö' | 'ü' | 'ß' | 'Ä' | 'Ö' | 'Ü'));
if has_de_chars {
return LanguageHint {
lang: "de".to_string(),
source: Heuristic,
confidence: 0.45, };
}
let has_western_accent = text.chars().any(|c| {
matches!(
c,
'à' | 'â'
| 'ç'
| 'é'
| 'è'
| 'ê'
| 'ë'
| 'î'
| 'ï'
| 'ô'
| 'û'
| 'ù'
| 'À'
| 'É'
| 'È'
| 'Ê'
| 'Ô'
)
});
if has_western_accent {
return LanguageHint {
lang: "fr".to_string(),
source: Heuristic,
confidence: 0.4,
};
}
LanguageHint {
lang: "en".to_string(),
source: Heuristic,
confidence: 0.3,
}
}
pub fn scan_text_with_engine_with_hint(
input: &str,
engine: &dyn RedactionEngine,
hint: Option<&LanguageHint>,
) -> Result<RedactionResult, ScanError> {
let lang = hint.and_then(|h| h.lang_str());
scan_text_with_engine_with_lang(input, engine, lang)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A caller-provided hint has full confidence and is passed through.
    #[test]
    fn caller_provided_high_trust_returns_lang() {
        let h = LanguageHint::caller_provided("de");
        assert_eq!(h.source, LangHintSource::CallerProvided);
        assert!((h.confidence - 1.0).abs() < f32::EPSILON);
        assert_eq!(h.lang_str(), Some("de"));
    }

    /// A fixture hint is passed through as well.
    #[test]
    fn fixture_experimental_returns_lang() {
        let h = LanguageHint::fixture("it");
        assert_eq!(h.source, LangHintSource::FixtureExperimental);
        assert_eq!(h.lang_str(), Some("it"));
    }

    /// A heuristic hint is rejected even at maximum confidence.
    #[test]
    fn heuristic_always_returns_none_even_max_confidence() {
        let h = LanguageHint::heuristic("de", 1.0);
        assert_eq!(h.source, LangHintSource::Heuristic);
        assert_eq!(
            h.lang_str(),
            None,
            "Heuristic source 即使 confidence=1.0 也必须返 None(决策不可信任)"
        );
    }

    /// A trusted source is still rejected once its confidence drops below the threshold.
    #[test]
    fn low_confidence_returns_none_even_caller_provided() {
        let mut h = LanguageHint::caller_provided("de");
        h.confidence = 0.4;
        assert_eq!(
            h.lang_str(),
            None,
            "confidence < 0.5 必须 fail-closed 返 None"
        );
    }

    /// Out-of-range confidences are clamped by the heuristic constructor.
    #[test]
    fn heuristic_confidence_clamp() {
        let h_neg = LanguageHint::heuristic("de", -0.5);
        assert!(h_neg.confidence >= 0.0);
        let h_over = LanguageHint::heuristic("de", 2.0);
        assert!(h_over.confidence <= 1.0);
    }

    /// Shows the match shape a downstream crate has to write for the
    /// `#[non_exhaustive]` enum.
    // Inside the defining crate the wildcard arm is unreachable, hence the allow.
    #[test]
    #[allow(unreachable_patterns)]
    fn lang_hint_source_non_exhaustive_match_compiles() {
        let s = LangHintSource::CallerProvided;
        let trusted = match s {
            LangHintSource::CallerProvided => true,
            LangHintSource::FixtureExperimental => true,
            LangHintSource::Heuristic => false,
            _ => false,
        };
        assert!(trusted);
    }

    /// `is_trusted` and `lang_str` must agree for full-confidence hints.
    #[test]
    fn is_trusted_consistent_with_lang_str_decision() {
        assert!(LangHintSource::CallerProvided.is_trusted());
        assert!(LangHintSource::FixtureExperimental.is_trusted());
        assert!(!LangHintSource::Heuristic.is_trusted());

        // Tie the two decisions together: at confidence 1.0 the only thing
        // that can make `lang_str` return `None` is an untrusted source.
        let hints = [
            LanguageHint::caller_provided("de"),
            LanguageHint::fixture("de"),
            LanguageHint::heuristic("de", 1.0),
        ];
        for hint in hints.iter() {
            assert_eq!(hint.lang_str().is_some(), hint.source.is_trusted());
        }
    }

    /// Empty input is rejected even when a trusted hint is supplied.
    #[test]
    fn scan_with_hint_empty_input_fail_closed() {
        let h = LanguageHint::caller_provided("de");
        let r = scan_text_with_engine_with_hint("", &crate::engine::NoopEngine, Some(&h));
        assert!(matches!(r, Err(ScanError::EmptyInput)));
    }

    /// Scanning without a hint succeeds and yields no findings with the no-op engine.
    #[test]
    fn scan_with_hint_none_equivalent_to_legacy() {
        let r = scan_text_with_engine_with_hint("hello", &crate::engine::NoopEngine, None)
            .expect("non-empty");
        assert!(r.findings.is_empty(), "NoopEngine + 'hello' 无 finding");
    }

    #[test]
    fn detect_lang_zh_chinese() {
        let h = detect_lang_heuristic("请联系王小明处理订单");
        assert_eq!(h.lang, "zh");
        assert_eq!(h.source, LangHintSource::Heuristic);
        assert!(h.confidence >= 0.8);
    }

    #[test]
    fn detect_lang_ja_japanese() {
        let h = detect_lang_heuristic("田中太郎さんが昨日来ました");
        assert_eq!(h.lang, "ja");
        assert!(h.confidence >= 0.85);
    }

    #[test]
    fn detect_lang_ko_korean() {
        let h = detect_lang_heuristic("김민수 씨에게 연락하세요");
        assert_eq!(h.lang, "ko");
        assert!(h.confidence >= 0.85);
    }

    #[test]
    fn detect_lang_de_keyword() {
        let h = detect_lang_heuristic("Herr Schmidt arbeitet hier.");
        assert_eq!(h.lang, "de");
        assert!((h.confidence - 0.7).abs() < 0.01);
    }

    #[test]
    fn detect_lang_fr_keyword() {
        let h = detect_lang_heuristic("Monsieur Dupont travaille ici.");
        assert_eq!(h.lang, "fr");
    }

    #[test]
    fn detect_lang_it_keyword() {
        let h = detect_lang_heuristic("Il signor Rossi lavora qui.");
        assert_eq!(h.lang, "it");
    }

    /// The Spanish keyword table is reached when no earlier table matches.
    #[test]
    fn detect_lang_es_keyword() {
        let h = detect_lang_heuristic("Señor García vive en Madrid.");
        assert_eq!(h.lang, "es");
        assert_eq!(h.source, LangHintSource::Heuristic);
        assert!((h.confidence - 0.7).abs() < 0.01);
    }

    /// Plain English falls through every check and lands on the low-confidence default.
    #[test]
    fn detect_lang_short_text_low_confidence() {
        let h = detect_lang_heuristic("John Smith works here.");
        assert_eq!(h.lang, "en");
        assert!(
            h.confidence < LANG_HINT_TRUSTED_CONFIDENCE,
            "短英文文本 confidence 必须 < 0.5(fail-closed 退化 baseline)"
        );
        assert_eq!(
            h.lang_str(),
            None,
            "无论 lang 是什么,Heuristic source 永返 None"
        );
    }

    /// A confident detection is still not forwarded, because its source is heuristic.
    #[test]
    fn detect_lang_high_confidence_still_not_trusted() {
        let h = detect_lang_heuristic("田中太郎さんが昨日来ました");
        assert!(h.confidence >= 0.85, "ja 应高 confidence");
        assert_eq!(
            h.lang_str(),
            None,
            "Heuristic source 即使 high confidence 也必须返 None(feedback_lang_review_authoritative)"
        );
    }

    /// `LanguageHint::detect` is a thin alias for the free function.
    #[test]
    fn language_hint_detect_method_equivalent_to_function() {
        let text = "Herr Schmidt arbeitet hier.";
        let from_method = LanguageHint::detect(text);
        let from_fn = detect_lang_heuristic(text);
        assert_eq!(from_method.lang, from_fn.lang);
        assert_eq!(from_method.source, from_fn.source);
        assert_eq!(from_method.confidence, from_fn.confidence);
    }

    #[test]
    fn detect_lang_de_accent_fallback() {
        // Contains the keyword "herr ", so this goes through the keyword
        // table rather than the accent fallback, despite the `ß`.
        let h = detect_lang_heuristic("Herr ß test");
        assert_eq!(h.lang, "de");
        assert!(h.confidence >= LANG_HINT_TRUSTED_CONFIDENCE);

        // No keyword here: only the umlaut identifies the text as German.
        let h2 = detect_lang_heuristic("würde");
        assert_eq!(h2.lang, "de");
        assert!(
            h2.confidence < LANG_HINT_TRUSTED_CONFIDENCE,
            "fallback 模糊路径 confidence 必须 < 0.5"
        );
    }

    /// An accented letter without any keyword falls back to French at low confidence.
    #[test]
    fn detect_lang_fr_accent_fallback() {
        let h = detect_lang_heuristic("café");
        assert_eq!(h.lang, "fr");
        assert!(h.confidence < LANG_HINT_TRUSTED_CONFIDENCE);
        assert_eq!(h.lang_str(), None);
    }

    /// Detector output is advisory: its source is never trusted.
    #[test]
    fn detect_lang_documented_as_advisory_not_authoritative() {
        let h = detect_lang_heuristic("John Smith works here.");
        assert_eq!(h.source, LangHintSource::Heuristic);
        assert!(!h.source.is_trusted(), "Heuristic source 永远不可信任");
    }
}