use crate::Quantifier;
use crate::lang::{detect_language, Language};
/// Guesses the language of `text` in the neighbourhood of `char_offset`.
///
/// `char_offset` counts Unicode scalar values (`char`s), not bytes.
///
/// Resolution order:
///
/// 1. If the character at `char_offset` belongs to a script that maps to a
///    single supported language (kana, Hangul, Arabic, Hebrew, Cyrillic), that
///    language is returned immediately.
/// 2. Han ideographs are shared by Chinese and Japanese, so they are
///    disambiguated by the surrounding window: if it contains any kana the
///    result is Japanese, otherwise Chinese.
/// 3. Any other character falls back to `detect_language` on a window of
///    `PRE_CHARS + POST_CHARS` characters that starts up to `PRE_CHARS`
///    characters before the offset. When that window is empty (the offset lies
///    past the end of `text`) the whole text is used instead.
fn detect_language_near(text: &str, char_offset: usize) -> Language {
    const PRE_CHARS: usize = 32;
    const POST_CHARS: usize = 160;

    let window_start = char_offset.saturating_sub(PRE_CHARS);
    let window: String = text
        .chars()
        .skip(window_start)
        .take(PRE_CHARS + POST_CHARS)
        .collect();

    // `window_start <= char_offset`, and the distance is at most `PRE_CHARS`,
    // so the character at the offset (if any) is always inside the window.
    if let Some(c) = window.chars().nth(char_offset - window_start) {
        match c {
            '\u{4e00}'..='\u{9fff}' => {
                // Kanji alone cannot tell Japanese from Chinese; kana nearby can.
                let has_kana = window
                    .chars()
                    .any(|k| ('\u{3040}'..='\u{30ff}').contains(&k));
                return if has_kana {
                    Language::Japanese
                } else {
                    Language::Chinese
                };
            }
            '\u{3040}'..='\u{30ff}' => return Language::Japanese,
            '\u{ac00}'..='\u{d7af}' => return Language::Korean,
            '\u{0600}'..='\u{06ff}' => return Language::Arabic,
            '\u{0590}'..='\u{05ff}' => return Language::Hebrew,
            '\u{0400}'..='\u{04ff}' => return Language::Russian,
            _ => {}
        }
    }

    if window.is_empty() {
        detect_language(text)
    } else {
        detect_language(&window)
    }
}
/// Reports whether one of the three whitespace-delimited words immediately
/// before `entity_start` is a negation cue.
///
/// * `text` - the full input text.
/// * `entity_start` - offset of the entity, counted in `char`s.
/// * `cues` - lower-case cue words; each candidate word is lower-cased and
///   compared for exact equality, so attached punctuation prevents a match.
///
/// Only the 200 characters preceding the entity are inspected.
#[must_use]
pub fn is_negated_with_cues(text: &str, entity_start: usize, cues: &[&str]) -> bool {
    const WINDOW_CHARS: usize = 200;
    const LOOKBACK_WORDS: usize = 3;

    let from = entity_start.saturating_sub(WINDOW_CHARS);
    let context: String = text.chars().skip(from).take(entity_start - from).collect();

    // Walk backwards from the entity so only the nearest words are considered.
    context
        .split_whitespace()
        .rev()
        .take(LOOKBACK_WORDS)
        .any(|token| {
            let folded = token.to_lowercase();
            cues.contains(&folded.as_str())
        })
}
/// Reports whether any cue occurs as a substring of the text just before
/// `entity_start`.
///
/// Intended for scripts without whitespace word boundaries. The comparison is
/// case-sensitive and looks at the 200 characters preceding the entity
/// (`entity_start` is counted in `char`s).
#[must_use]
pub fn is_negated_with_substrings(text: &str, entity_start: usize, cues: &[&str]) -> bool {
    const WINDOW_CHARS: usize = 200;

    let from = entity_start.saturating_sub(WINDOW_CHARS);
    let mut context = String::new();
    context.extend(text.chars().skip(from).take(entity_start - from));

    for &cue in cues {
        if context.contains(cue) {
            return true;
        }
    }
    false
}
/// Classifies the single word immediately before `entity_start` as a
/// quantifier, using the supplied cue lists.
///
/// The word is the last whitespace-delimited token in the 80 characters that
/// precede the entity (`entity_start` is counted in `char`s). It is lower-cased
/// and must equal a cue exactly.
///
/// Lists are consulted in the order `none`, `universal`, `existential`,
/// `definite`; the first list containing the word decides the result. Returns
/// `None` when there is no preceding word or it is in none of the lists.
#[must_use]
pub fn detect_quantifier_with_cues(
    text: &str,
    entity_start: usize,
    universal: &[&str],
    existential: &[&str],
    none: &[&str],
    definite: &[&str],
) -> Option<Quantifier> {
    const WINDOW_CHARS: usize = 80;

    let from = entity_start.saturating_sub(WINDOW_CHARS);
    let context: String = text.chars().skip(from).take(entity_start - from).collect();

    let candidate = context.split_whitespace().next_back()?.to_lowercase();
    let listed_in = |list: &[&str]| list.contains(&candidate.as_str());

    if listed_in(none) {
        return Some(Quantifier::None);
    }
    if listed_in(universal) {
        return Some(Quantifier::Universal);
    }
    if listed_in(existential) {
        return Some(Quantifier::Existential);
    }
    if listed_in(definite) {
        return Some(Quantifier::Definite);
    }
    None
}
/// Classifies the text just before `entity_start` as a quantifier by
/// searching it for cue substrings.
///
/// Intended for scripts without whitespace word boundaries. The 64 characters
/// preceding the entity (`entity_start` is counted in `char`s) are searched
/// case-sensitively.
///
/// Lists are consulted in the order `none`, `universal`, `existential`,
/// `definite`; the first list with any cue present decides the result. Returns
/// `None` when no cue is found.
#[must_use]
pub fn detect_quantifier_with_substrings(
    text: &str,
    entity_start: usize,
    universal: &[&str],
    existential: &[&str],
    none: &[&str],
    definite: &[&str],
) -> Option<Quantifier> {
    const WINDOW_CHARS: usize = 64;

    let from = entity_start.saturating_sub(WINDOW_CHARS);
    let context: String = text.chars().skip(from).take(entity_start - from).collect();

    let mentions_any = |cues: &[&str]| cues.iter().any(|cue| context.contains(*cue));

    if mentions_any(none) {
        return Some(Quantifier::None);
    }
    if mentions_any(universal) {
        return Some(Quantifier::Universal);
    }
    if mentions_any(existential) {
        return Some(Quantifier::Existential);
    }
    if mentions_any(definite) {
        return Some(Quantifier::Definite);
    }
    None
}
/// Per-language cue lists consumed by the negation and quantifier detectors
/// in this file.
///
/// All entries are stored in lower case where the script has case.
///
/// * `*_NEGATION_WORDS` and the quantifier lists for English, German, French,
///   Spanish, Italian, Portuguese, Russian, Arabic and Hebrew are handed to the
///   whole-word matchers.
/// * `*_NEGATION_CUES` and the quantifier lists for Chinese, Japanese and
///   Korean are handed to the substring matchers.
pub mod lexicons {
    // ----- English -----

    /// English negation words, including common contractions.
    pub const EN_NEGATION_WORDS: &[&str] = &[
        "not", "no", "never", "none", "neither", "nor", "without", "negative", "isn't", "aren't",
        "wasn't", "weren't", "don't", "doesn't", "didn't", "won't", "wouldn't", "couldn't",
        "shouldn't",
    ];
    /// English universal quantifiers.
    pub const EN_UNIVERSAL: &[&str] = &["every", "all", "each", "any"];
    /// English existential quantifiers and indefinite articles.
    pub const EN_EXISTENTIAL: &[&str] = &["some", "certain", "a", "an"];
    /// English "none" quantifiers.
    pub const EN_NONE: &[&str] = &["no", "none"];
    /// English definite article and demonstratives.
    pub const EN_DEFINITE: &[&str] = &["the", "this", "that", "these", "those"];
    /// English phrases read by `detect_approximate_quantifier` as an approximate amount.
    pub const EN_APPROXIMATE: &[&str] = &[
        "approximately", "about", "roughly", "nearly", "around", "over", "more than",
        "fewer than", "less than",
    ];
    /// English phrases read by `detect_approximate_quantifier` as a lower bound.
    pub const EN_MIN_BOUND: &[&str] = &["at least", "no fewer than"];
    /// English phrases read by `detect_approximate_quantifier` as an upper bound.
    pub const EN_MAX_BOUND: &[&str] = &["at most", "no more than", "up to"];

    // ----- German -----

    /// German negation words.
    pub const DE_NEGATION_WORDS: &[&str] = &["nicht", "kein", "keine", "keinen", "nie", "ohne"];
    /// German universal quantifiers.
    pub const DE_UNIVERSAL: &[&str] = &["alle", "jeder", "jede", "jedes"];
    /// German existential quantifiers and indefinite articles.
    pub const DE_EXISTENTIAL: &[&str] = &["ein", "eine", "einen", "einige", "manche"];
    /// German "none" quantifiers.
    pub const DE_NONE: &[&str] = &["kein", "keine", "keinen", "keinem", "keiner", "keins"];
    /// German definite articles and demonstratives.
    pub const DE_DEFINITE: &[&str] = &[
        "der", "die", "das", "diese", "dieser", "dieses", "jener", "jene", "jenes",
    ];

    // ----- French -----

    /// French negation words.
    pub const FR_NEGATION_WORDS: &[&str] = &["pas", "jamais", "aucun", "aucune", "sans"];
    /// French universal quantifiers.
    pub const FR_UNIVERSAL: &[&str] = &["tous", "toutes", "chaque"];
    /// French existential quantifiers and indefinite articles.
    pub const FR_EXISTENTIAL: &[&str] =
        &["un", "une", "des", "quelques", "certains", "certaines"];
    /// French "none" quantifiers.
    pub const FR_NONE: &[&str] = &["aucun", "aucune"];
    /// French definite articles and demonstratives.
    pub const FR_DEFINITE: &[&str] = &["le", "la", "les", "ce", "cette", "ces", "cet"];

    // ----- Spanish -----

    /// Spanish negation words (accented and unaccented spellings).
    pub const ES_NEGATION_WORDS: &[&str] =
        &["no", "nunca", "ningun", "ningún", "ninguna", "sin"];
    /// Spanish universal quantifiers.
    pub const ES_UNIVERSAL: &[&str] = &["todos", "todas", "cada", "cualquier"];
    /// Spanish existential quantifiers and indefinite articles.
    pub const ES_EXISTENTIAL: &[&str] = &[
        "un", "una", "unos", "unas", "algún", "alguna", "algunos", "algunas",
    ];
    /// Spanish "none" quantifiers.
    pub const ES_NONE: &[&str] = &[
        "ningún", "ninguna", "ninguno", "ningunos", "ningunas", "ningun",
    ];
    /// Spanish definite articles and demonstratives.
    pub const ES_DEFINITE: &[&str] = &[
        "el", "la", "los", "las", "este", "esta", "estos", "estas", "ese", "esa", "esos", "esas",
    ];

    // ----- Italian -----

    /// Italian negation words.
    pub const IT_NEGATION_WORDS: &[&str] = &["non", "mai", "nessun", "nessuna", "senza"];
    /// Italian universal quantifiers.
    pub const IT_UNIVERSAL: &[&str] = &["tutti", "tutte", "ogni", "qualsiasi"];
    /// Italian existential quantifiers and indefinite articles.
    pub const IT_EXISTENTIAL: &[&str] = &[
        "un", "una", "uno", "alcuni", "alcune", "qualche", "certi", "certe",
    ];
    /// Italian "none" quantifiers.
    pub const IT_NONE: &[&str] = &["nessun", "nessuna", "nessuno"];
    /// Italian definite articles and demonstratives.
    pub const IT_DEFINITE: &[&str] = &[
        "il", "lo", "la", "i", "gli", "le", "questo", "questa", "questi", "queste", "quello",
        "quella",
    ];

    // ----- Portuguese -----

    /// Portuguese negation words (accented and unaccented spellings).
    pub const PT_NEGATION_WORDS: &[&str] =
        &["não", "nao", "nunca", "nenhum", "nenhuma", "sem"];
    /// Portuguese universal quantifiers.
    pub const PT_UNIVERSAL: &[&str] = &["todos", "todas", "cada", "qualquer"];
    /// Portuguese existential quantifiers and indefinite articles.
    pub const PT_EXISTENTIAL: &[&str] = &[
        "um", "uma", "uns", "umas", "algum", "alguma", "alguns", "algumas",
    ];
    /// Portuguese "none" quantifiers.
    pub const PT_NONE: &[&str] = &["nenhum", "nenhuma", "nenhuns", "nenhumas"];
    /// Portuguese definite articles and demonstratives.
    pub const PT_DEFINITE: &[&str] = &[
        "o", "a", "os", "as", "este", "esta", "estes", "estas", "esse", "essa", "esses", "essas",
    ];

    // ----- Russian -----

    /// Russian negation words.
    pub const RU_NEGATION_WORDS: &[&str] = &["не", "нет", "никогда", "без"];
    /// Russian universal quantifiers.
    pub const RU_UNIVERSAL: &[&str] = &["все", "каждый", "каждая", "каждое"];
    /// Russian existential quantifiers.
    pub const RU_EXISTENTIAL: &[&str] = &["некоторые", "один", "одна", "одно"];
    /// Russian "none" quantifiers.
    pub const RU_NONE: &[&str] = &["никакой", "никакая", "никакие", "нет"];
    /// Russian demonstratives.
    pub const RU_DEFINITE: &[&str] = &["этот", "эта", "это", "эти", "тот", "та", "то", "те"];

    // ----- Chinese -----

    /// Chinese negation cues (simplified and traditional forms).
    pub const ZH_NEGATION_CUES: &[&str] = &["不", "没", "沒有", "没有", "無", "无"];
    /// Chinese universal quantifier cues.
    pub const ZH_UNIVERSAL: &[&str] = &["每", "所有", "全部"];
    /// Chinese existential quantifier cues.
    pub const ZH_EXISTENTIAL: &[&str] = &["一些", "某些", "有些"];
    /// Chinese "none" quantifier cues.
    pub const ZH_NONE: &[&str] = &["没有", "沒有", "无", "無", "没"];
    /// Chinese demonstrative cues.
    pub const ZH_DEFINITE: &[&str] = &["这", "那", "这些", "那些", "该"];

    // ----- Japanese -----

    /// Japanese negation cues.
    pub const JA_NEGATION_CUES: &[&str] = &["ない", "ません", "無い", "ず"];
    /// Japanese universal quantifier cues.
    pub const JA_UNIVERSAL: &[&str] = &["全て", "すべて", "毎", "各"];
    /// Japanese existential quantifier cues.
    pub const JA_EXISTENTIAL: &[&str] = &["いくつか", "ある"];
    /// Japanese "none" quantifier cues.
    pub const JA_NONE: &[&str] = &["無い", "無し", "ない"];
    /// Japanese demonstrative cues.
    pub const JA_DEFINITE: &[&str] = &["この", "その", "あの"];

    // ----- Korean -----

    /// Korean negation cues.
    pub const KO_NEGATION_CUES: &[&str] = &["안", "않", "못", "없"];
    /// Korean universal quantifier cues.
    pub const KO_UNIVERSAL: &[&str] = &["모든", "각", "매"];
    /// Korean existential quantifier cues.
    pub const KO_EXISTENTIAL: &[&str] = &["몇몇", "일부", "어떤", "어느"];
    /// Korean "none" quantifier cues.
    pub const KO_NONE: &[&str] = &["없", "아무도", "아무것도"];
    /// Korean demonstrative cues.
    pub const KO_DEFINITE: &[&str] = &["이", "그", "저"];

    // ----- Arabic -----

    /// Arabic universal quantifiers.
    pub const AR_UNIVERSAL: &[&str] = &["كل"];
    /// Arabic existential quantifiers.
    pub const AR_EXISTENTIAL: &[&str] = &["بعض", "أحد", "احد"];
    /// Arabic "none" quantifiers.
    pub const AR_NONE: &[&str] = &["لا", "ليس", "بدون"];
    /// Arabic demonstratives.
    pub const AR_DEFINITE: &[&str] = &["هذا", "هذه", "ذلك", "تلك", "هؤلاء"];

    // ----- Hebrew -----

    /// Hebrew universal quantifiers.
    pub const HE_UNIVERSAL: &[&str] = &["כל"];
    /// Hebrew existential quantifiers.
    pub const HE_EXISTENTIAL: &[&str] = &["כמה"];
    /// Hebrew "none" quantifiers.
    pub const HE_NONE: &[&str] = &["אין", "לא"];
    /// Hebrew demonstratives.
    pub const HE_DEFINITE: &[&str] = &["זה", "זאת", "אלה", "האלו"];
}
/// English negation check: whole-word matching against
/// `lexicons::EN_NEGATION_WORDS` via [`is_negated_with_cues`].
///
/// `entity_start` is counted in `char`s.
#[must_use]
pub fn is_negated_en(text: &str, entity_start: usize) -> bool {
    let english_cues = lexicons::EN_NEGATION_WORDS;
    is_negated_with_cues(text, entity_start, english_cues)
}
#[must_use]
pub fn is_negated_lang(text: &str, entity_start: usize, lang: Language) -> bool {
match lang {
Language::English => is_negated_en(text, entity_start),
Language::German => is_negated_with_cues(text, entity_start, lexicons::DE_NEGATION_WORDS),
Language::French => is_negated_with_cues(text, entity_start, lexicons::FR_NEGATION_WORDS),
Language::Spanish => is_negated_with_cues(text, entity_start, lexicons::ES_NEGATION_WORDS),
Language::Italian => is_negated_with_cues(text, entity_start, lexicons::IT_NEGATION_WORDS),
Language::Portuguese => {
is_negated_with_cues(text, entity_start, lexicons::PT_NEGATION_WORDS)
}
Language::Russian => is_negated_with_cues(text, entity_start, lexicons::RU_NEGATION_WORDS),
Language::Chinese => {
is_negated_with_substrings(text, entity_start, lexicons::ZH_NEGATION_CUES)
}
Language::Japanese => {
is_negated_with_substrings(text, entity_start, lexicons::JA_NEGATION_CUES)
}
Language::Korean => {
is_negated_with_substrings(text, entity_start, lexicons::KO_NEGATION_CUES)
}
Language::Arabic | Language::Hebrew | Language::Other => false,
}
}
/// Negation check that first detects the language around `entity_start`
/// with `detect_language_near` and then defers to [`is_negated_lang`].
///
/// `entity_start` is counted in `char`s.
#[must_use]
pub fn is_negated_auto(text: &str, entity_start: usize) -> bool {
    let local_language = detect_language_near(text, entity_start);
    is_negated_lang(text, entity_start, local_language)
}
/// English quantifier detection.
///
/// The word directly before the entity is first checked against the English
/// none / universal / existential / definite lists. If that gives no answer,
/// [`detect_approximate_quantifier`] is tried.
///
/// `entity_start` is counted in `char`s.
#[must_use]
pub fn detect_quantifier_en(text: &str, entity_start: usize) -> Option<Quantifier> {
    let from_word_lists = detect_quantifier_with_cues(
        text,
        entity_start,
        lexicons::EN_UNIVERSAL,
        lexicons::EN_EXISTENTIAL,
        lexicons::EN_NONE,
        lexicons::EN_DEFINITE,
    );
    if from_word_lists.is_some() {
        return from_word_lists;
    }
    detect_approximate_quantifier(text, entity_start)
}
#[must_use]
pub fn detect_quantifier_lang(
text: &str,
entity_start: usize,
lang: Language,
) -> Option<Quantifier> {
match lang {
Language::English => detect_quantifier_en(text, entity_start),
Language::German => detect_quantifier_with_cues(
text,
entity_start,
lexicons::DE_UNIVERSAL,
lexicons::DE_EXISTENTIAL,
lexicons::DE_NONE,
lexicons::DE_DEFINITE,
),
Language::French => detect_quantifier_with_cues(
text,
entity_start,
lexicons::FR_UNIVERSAL,
lexicons::FR_EXISTENTIAL,
lexicons::FR_NONE,
lexicons::FR_DEFINITE,
),
Language::Spanish => detect_quantifier_with_cues(
text,
entity_start,
lexicons::ES_UNIVERSAL,
lexicons::ES_EXISTENTIAL,
lexicons::ES_NONE,
lexicons::ES_DEFINITE,
),
Language::Italian => detect_quantifier_with_cues(
text,
entity_start,
lexicons::IT_UNIVERSAL,
lexicons::IT_EXISTENTIAL,
lexicons::IT_NONE,
lexicons::IT_DEFINITE,
),
Language::Portuguese => detect_quantifier_with_cues(
text,
entity_start,
lexicons::PT_UNIVERSAL,
lexicons::PT_EXISTENTIAL,
lexicons::PT_NONE,
lexicons::PT_DEFINITE,
),
Language::Russian => detect_quantifier_with_cues(
text,
entity_start,
lexicons::RU_UNIVERSAL,
lexicons::RU_EXISTENTIAL,
lexicons::RU_NONE,
lexicons::RU_DEFINITE,
),
Language::Chinese => detect_quantifier_with_substrings(
text,
entity_start,
lexicons::ZH_UNIVERSAL,
lexicons::ZH_EXISTENTIAL,
lexicons::ZH_NONE,
lexicons::ZH_DEFINITE,
),
Language::Japanese => detect_quantifier_with_substrings(
text,
entity_start,
lexicons::JA_UNIVERSAL,
lexicons::JA_EXISTENTIAL,
lexicons::JA_NONE,
lexicons::JA_DEFINITE,
),
Language::Korean => detect_quantifier_with_substrings(
text,
entity_start,
lexicons::KO_UNIVERSAL,
lexicons::KO_EXISTENTIAL,
lexicons::KO_NONE,
lexicons::KO_DEFINITE,
),
Language::Arabic => {
let q = detect_quantifier_with_cues(
text,
entity_start,
lexicons::AR_UNIVERSAL,
lexicons::AR_EXISTENTIAL,
lexicons::AR_NONE,
lexicons::AR_DEFINITE,
);
if q.is_some() {
return q;
}
let window_start = entity_start.saturating_sub(40);
let prefix: String = text
.chars()
.skip(window_start)
.take(entity_start.saturating_sub(window_start))
.collect();
let last = prefix.split_whitespace().last().unwrap_or("");
if last.starts_with("ال") {
Some(Quantifier::Definite)
} else {
None
}
}
Language::Hebrew => {
let q = detect_quantifier_with_cues(
text,
entity_start,
lexicons::HE_UNIVERSAL,
lexicons::HE_EXISTENTIAL,
lexicons::HE_NONE,
lexicons::HE_DEFINITE,
);
if q.is_some() {
return q;
}
let window_start = entity_start.saturating_sub(40);
let prefix: String = text
.chars()
.skip(window_start)
.take(entity_start.saturating_sub(window_start))
.collect();
let last = prefix.split_whitespace().last().unwrap_or("");
if last.starts_with('ה') {
Some(Quantifier::Definite)
} else {
None
}
}
Language::Other => None,
}
}
/// Detects English bound / approximation phrases in the text just before
/// `entity_start`.
///
/// The 40 characters preceding the entity (`entity_start` is counted in
/// `char`s) are lower-cased and searched, in this order, for:
///
/// 1. `lexicons::EN_MIN_BOUND` → `Quantifier::MinBound`
/// 2. `lexicons::EN_MAX_BOUND` → `Quantifier::MaxBound`
/// 3. `lexicons::EN_APPROXIMATE` → `Quantifier::Approximate`
///
/// The bound lists are checked first so that "no more than" is not read as the
/// approximate cue "more than".
///
/// A cue only counts when it stands on word boundaries, i.e. it is not
/// directly preceded or followed by an alphanumeric character. Without this,
/// "government" would match "over" and "roundabout" would match "about".
///
/// Returns `None` when no cue is present.
#[must_use]
pub fn detect_approximate_quantifier(text: &str, entity_start: usize) -> Option<Quantifier> {
    const WINDOW_CHARS: usize = 40;

    let window_start = entity_start.saturating_sub(WINDOW_CHARS);
    let prefix: String = text
        .chars()
        .skip(window_start)
        .take(entity_start - window_start)
        .collect();
    let lower = prefix.to_lowercase();

    // True when any cue occurs in `lower` as a standalone word or phrase.
    // NOTE: a word cut in half by the start of the window is treated as if it
    // began there; the window edge counts as a boundary.
    let has_cue = |cues: &[&str]| {
        cues.iter().any(|cue| {
            lower.match_indices(*cue).any(|(start, hit)| {
                let end = start + hit.len();
                let boundary_before = lower[..start]
                    .chars()
                    .next_back()
                    .map_or(true, |c| !c.is_alphanumeric());
                let boundary_after = lower[end..]
                    .chars()
                    .next()
                    .map_or(true, |c| !c.is_alphanumeric());
                boundary_before && boundary_after
            })
        })
    };

    if has_cue(lexicons::EN_MIN_BOUND) {
        Some(Quantifier::MinBound)
    } else if has_cue(lexicons::EN_MAX_BOUND) {
        Some(Quantifier::MaxBound)
    } else if has_cue(lexicons::EN_APPROXIMATE) {
        Some(Quantifier::Approximate)
    } else {
        Option::None
    }
}
/// Quantifier detection that first detects the language around
/// `entity_start` with `detect_language_near`.
///
/// The language-specific result from [`detect_quantifier_lang`] wins. When it
/// is `None`, [`detect_approximate_quantifier`] is tried regardless of the
/// detected language.
///
/// `entity_start` is counted in `char`s.
#[must_use]
pub fn detect_quantifier_auto(text: &str, entity_start: usize) -> Option<Quantifier> {
    let local_language = detect_language_near(text, entity_start);
    let lexical = detect_quantifier_lang(text, entity_start, local_language);
    if lexical.is_some() {
        return lexical;
    }
    detect_approximate_quantifier(text, entity_start)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Language detection follows the script around the offset in mixed text.
    #[test]
    fn test_detect_language_near_code_switching() {
        let text = "This is English. 这是中文。Back to English.";
        let english_lead = "This is English. ".chars().count();

        assert_eq!(detect_language_near(text, 5), Language::English);
        assert_eq!(
            detect_language_near(text, english_lead + 1),
            Language::Chinese
        );
    }

    /// Plain English negation words directly before the entity.
    #[test]
    fn test_is_negated_basic() {
        let cases = vec![
            ("He is not a doctor", 12, true),
            ("I never saw John", 11, true),
            ("He is a doctor", 8, false),
            ("The quick brown fox", 4, false),
        ];
        for (text, entity_start, expected) in cases {
            assert_eq!(
                is_negated_en(text, entity_start),
                expected,
                "text: {:?}",
                text
            );
        }
    }

    /// One positive example per matching strategy and language family.
    #[test]
    fn test_is_negated_multilingual_examples() {
        let cases = vec![
            ("Er ist nicht Arzt", 13, Language::German),
            ("pas médecin", 4, Language::French),
            ("no médico", 3, Language::Spanish),
            ("他不是医生", 3, Language::Chinese),
        ];
        for (text, entity_start, lang) in cases {
            assert!(
                is_negated_lang(text, entity_start, lang),
                "text: {:?}",
                text
            );
        }
    }

    /// Each English quantifier class, plus the no-prefix case.
    #[test]
    fn test_detect_quantifier_basic() {
        let cases = vec![
            ("Every student passed", 6, Some(Quantifier::Universal)),
            ("Some students failed", 5, Some(Quantifier::Existential)),
            ("No student failed", 3, Some(Quantifier::None)),
            ("The student passed", 4, Some(Quantifier::Definite)),
            ("Student passed", 0, None),
        ];
        for (text, entity_start, expected) in cases {
            assert_eq!(
                detect_quantifier_en(text, entity_start),
                expected,
                "text: {:?}",
                text
            );
        }
    }

    /// A word present in both lists resolves to the "none" class.
    #[test]
    fn test_detect_quantifier_with_cues_prefers_none_over_universal_on_overlap() {
        let detected = detect_quantifier_with_cues("y z", 2, &["y"], &[], &["y"], &[]);
        assert_eq!(detected, Some(Quantifier::None));
    }

    /// Language-specific lists are applied for non-English input.
    #[test]
    fn test_detect_quantifier_auto_degrades_safely() {
        let cases = vec![
            ("kein Arzt", 5, Language::German, Quantifier::None),
            ("aucun médecin", 6, Language::French, Quantifier::None),
            ("ningún médico", 6, Language::Spanish, Quantifier::None),
            ("每个 医生", 3, Language::Chinese, Quantifier::Universal),
        ];
        for (text, entity_start, lang, expected) in cases {
            assert_eq!(
                detect_quantifier_lang(text, entity_start, lang),
                Some(expected),
                "text: {:?}",
                text
            );
        }
    }

    /// Contractions count as negation words.
    #[test]
    fn test_negation_contraction_forms() {
        let cases = vec![
            ("She doesn't like cats", 18),
            ("They won't attend meetings", 17),
            ("He couldn't find keys", 16),
        ];
        for (text, entity_start) in cases {
            assert!(is_negated_en(text, entity_start), "text: {:?}", text);
        }
    }

    /// Cue matching ignores letter case.
    #[test]
    fn test_negation_case_insensitive() {
        let cases = vec![("He is NOT a doctor", 12), ("I Never saw John", 11)];
        for (text, entity_start) in cases {
            assert!(is_negated_en(text, entity_start), "text: {:?}", text);
        }
    }

    /// A cue more than three words before the entity is ignored.
    #[test]
    fn test_negation_outside_three_word_window() {
        let lead = "not one of the many ";
        let text = "not one of the many doctors";
        assert!(!is_negated_en(text, lead.chars().count()));
    }

    /// An entity at offset 0 has no preceding context.
    #[test]
    fn test_negation_entity_at_start() {
        let negated = is_negated_en("Doctor is here", 0);
        assert!(!negated);
    }

    /// Substring matching for Chinese negation cues.
    #[test]
    fn test_negation_substring_chinese() {
        let cues = lexicons::ZH_NEGATION_CUES;
        assert!(is_negated_with_substrings("他没有钱", 3, cues));
        assert!(!is_negated_with_substrings("他有钱", 2, cues));
    }

    /// One lower-case example for each of the four English classes.
    #[test]
    fn test_quantifier_all_four_classes_en() {
        let cases = vec![
            ("all dogs", 4, Quantifier::Universal),
            ("a dog", 2, Quantifier::Existential),
            ("no dogs", 3, Quantifier::None),
            ("these dogs", 6, Quantifier::Definite),
        ];
        for (text, entity_start, expected) in cases {
            assert_eq!(
                detect_quantifier_en(text, entity_start),
                Some(expected),
                "text: {:?}",
                text
            );
        }
    }

    /// Quantifier matching ignores letter case.
    #[test]
    fn test_quantifier_case_insensitive() {
        let cases = vec![
            ("EVERY student", 6, Quantifier::Universal),
            ("The cat", 4, Quantifier::Definite),
        ];
        for (text, entity_start, expected) in cases {
            assert_eq!(
                detect_quantifier_en(text, entity_start),
                Some(expected),
                "text: {:?}",
                text
            );
        }
    }

    /// No preceding text means no quantifier.
    #[test]
    fn test_quantifier_no_prefix_returns_none() {
        let detected = detect_quantifier_en("dogs run", 0);
        assert_eq!(detected, None);
    }

    /// Substring matching for Japanese quantifier cues.
    #[test]
    fn test_quantifier_substring_japanese() {
        let universal = detect_quantifier_with_substrings(
            "全ての学生",
            3,
            lexicons::JA_UNIVERSAL,
            &[],
            &[],
            &[],
        );
        assert_eq!(universal, Some(Quantifier::Universal));

        let definite =
            detect_quantifier_with_substrings("この本", 2, &[], &[], &[], lexicons::JA_DEFINITE);
        assert_eq!(definite, Some(Quantifier::Definite));
    }

    /// The character at the offset decides the language for CJK scripts.
    #[test]
    fn test_detect_language_near_script_fast_path() {
        let cases = vec![
            ("Hello 你好世界 and more", "Hello ", Language::Chinese),
            ("abc あいう xyz", "abc ", Language::Japanese),
            ("abc 한국어 xyz", "abc ", Language::Korean),
        ];
        for (text, lead, expected) in cases {
            let offset = lead.chars().count();
            assert_eq!(
                detect_language_near(text, offset),
                expected,
                "text: {:?}",
                text
            );
        }
    }

    /// Languages without a negation lexicon never report negation.
    #[test]
    fn test_negation_lang_other_returns_false() {
        for lang in vec![Language::Other, Language::Arabic] {
            assert!(!is_negated_lang("no doctor", 3, lang));
        }
    }

    /// `Language::Other` never reports a quantifier.
    #[test]
    fn test_quantifier_lang_other_returns_none() {
        let detected = detect_quantifier_lang("every dog", 6, Language::Other);
        assert_eq!(detected, None);
    }
}