use std::collections::{HashMap, HashSet};
use std::sync::OnceLock;
/// Rewrites English contractions so later tokenization never sees apostrophe fragments.
///
/// Steps, in order:
/// 1. The typographic apostrophe U+2019 is normalised to ASCII `'`.
/// 2. `won't`, `can't` and `shan't` become `will not`, `can not` and `shall not`;
///    every remaining `n't` becomes ` not`.
/// 3. The clitics `'re`, `'ve`, `'ll`, `'d`, `'m` and `'s` are each replaced by one space.
///
/// A clitic is only removed when the apostrophe directly follows an alphanumeric
/// character and the clitic is not itself followed by one. This keeps opening
/// quotes (`'stop'`) and names (`o'donnell`) intact instead of eating their
/// first letter.
///
/// Matching is case-sensitive, so only lowercase contractions are recognised.
fn expand_contractions(text: &str) -> String {
    // Letters that may follow an apostrophe to form a removable clitic.
    const CLITICS: [&str; 6] = ["re", "ve", "ll", "d", "m", "s"];

    let expanded = text
        .replace('\u{2019}', "'")
        .replace("won't", "will not")
        .replace("can't", "can not")
        .replace("shan't", "shall not")
        .replace("n't", " not");

    let mut out = String::with_capacity(expanded.len());
    // Last character consumed from the input (not the last one written), so
    // stacked clitics such as `they'd've` are both recognised.
    let mut prev: Option<char> = None;
    let mut rest = expanded.as_str();
    while let Some(c) = rest.chars().next() {
        rest = &rest[c.len_utf8()..];
        if c == '\'' && prev.is_some_and(char::is_alphanumeric) {
            let clitic = CLITICS.iter().copied().find(|clitic| {
                rest.strip_prefix(*clitic)
                    .is_some_and(|tail| !tail.chars().next().is_some_and(char::is_alphanumeric))
            });
            if let Some(clitic) = clitic {
                out.push(' ');
                rest = &rest[clitic.len()..];
                prev = clitic.chars().last();
                continue;
            }
        }
        out.push(c);
        prev = Some(c);
    }
    out
}
/// Public entry point for contraction expansion.
///
/// Forwards `text` unchanged to the module-private `expand_contractions` and
/// returns its result.
pub fn expand_contractions_public(text: &str) -> String {
expand_contractions(text)
}
/// Phrases that contain a negation word but must not open a negation scope.
///
/// Entries are space-separated lowercase words; `can not …` is the form that
/// `can't …` takes after contraction expansion.
const PSEUDO_NEGATION: &[&str] = &[
    "no problem",
    "no worries",
    "no wonder",
    "no way",
    "not sure",
    "not bad",
    "not only",
    "not just",
    "can not wait",
    "can not believe",
];

/// Returns the indices of `words` that fall inside a negation scope.
///
/// * If any [`PSEUDO_NEGATION`] phrase occurs as a run of consecutive whole
///   words, the result is empty for the entire slice.
/// * A scope opens at `not` directly preceded by `do`, or at `never`,
///   `without` or `except`; the trigger word itself is not marked.
/// * A scope closes at `and`, `but`, `or` or `then`.
/// * While a scope is open, every word not contained in `stop_set` is marked.
///
/// Phrases are compared word by word rather than by substring, so
/// `not justify` is no longer mistaken for `not just`.
fn find_negated_positions(words: &[&str], stop_set: &HashSet<String>) -> HashSet<usize> {
    let mut negated: HashSet<usize> = HashSet::new();

    let has_pseudo_negation = PSEUDO_NEGATION.iter().any(|phrase| {
        // `split` always yields at least one piece, so the window size is never zero.
        let phrase_len = phrase.split(' ').count();
        words
            .windows(phrase_len)
            .any(|window| window.iter().copied().eq(phrase.split(' ')))
    });
    if has_pseudo_negation {
        return negated;
    }

    let mut negating = false;
    for (i, &word) in words.iter().enumerate() {
        match word {
            "not" if i > 0 && words[i - 1] == "do" => negating = true,
            "never" | "without" | "except" => negating = true,
            "and" | "but" | "or" | "then" => negating = false,
            // Includes a `not` that is not preceded by `do`: it is treated as an ordinary word.
            _ => {
                if negating && !stop_set.contains(word) {
                    negated.insert(i);
                }
            }
        }
    }
    negated
}
/// Stop-word tables held in memory after the embedded JSON has been parsed.
struct StopWordData {
// Whole-word stop list, collected from the JSON `universal` array.
universal: HashSet<String>,
// Every character that occurs in any word of the JSON `unsegmented` lists.
cjk_chars: HashSet<char>,
}
/// Deserialization shape of the embedded `stopwords.json`.
#[derive(serde::Deserialize)]
struct StopWordsJson {
// Stop words stored as whole strings.
universal: Vec<String>,
// Word lists grouped under a string key; presumably a language code — TODO confirm against the JSON file.
unsegmented: HashMap<String, Vec<String>>,
}
/// Returns the process-wide stop-word tables, building them on first use.
///
/// The JSON file is embedded at compile time with `include_str!`. The
/// `universal` list becomes a set of whole words; the `unsegmented` lists are
/// flattened into the set of individual characters they contain.
///
/// # Panics
///
/// Panics on first use if the embedded JSON does not deserialize into
/// `StopWordsJson`.
fn stop_data() -> &'static StopWordData {
    static DATA: OnceLock<StopWordData> = OnceLock::new();
    DATA.get_or_init(|| {
        let parsed: StopWordsJson =
            serde_json::from_str(include_str!("../languages/stopwords.json"))
                .expect("invalid stopwords.json");

        // Only the characters matter here, so the grouping keys are ignored.
        let cjk_chars: HashSet<char> = parsed
            .unsegmented
            .values()
            .flatten()
            .flat_map(|entry| entry.chars())
            .collect();
        let universal: HashSet<String> = parsed.universal.into_iter().collect();

        StopWordData {
            universal,
            cjk_chars,
        }
    })
}
/// Returns the whole-word stop list held by the lazily initialised `stop_data()` tables.
pub(crate) fn universal_stop_set() -> &'static HashSet<String> {
&stop_data().universal
}
/// Returns the per-character stop set held by the lazily initialised `stop_data()` tables.
pub fn cjk_stop_char_set() -> &'static HashSet<char> {
&stop_data().cjk_chars
}
/// Splits `text` into lowercase tokens without stop-word filtering or de-duplication.
///
/// * The text is lowercased and contractions are expanded.
/// * It is then split on every character that is neither alphanumeric nor an
///   apostrophe; leading and trailing apostrophes are trimmed from each piece.
/// * A piece containing any CJK character (per `is_cjk`) contributes one token
///   per CJK character; its non-CJK characters are dropped.
/// * Any other piece is kept when it is at least two bytes long.
///
/// Lowercasing happens before contraction expansion because the expansion is
/// case-sensitive: expanding first turned `Won't` into `wo not` and left
/// `IT'S` unexpanded.
pub fn tokenize_full(text: &str) -> Vec<String> {
    let lower = text.to_lowercase();
    let expanded = expand_contractions(&lower);

    let mut tokens: Vec<String> = Vec::new();
    for raw_word in expanded.split(|c: char| !c.is_alphanumeric() && c != '\'') {
        let word = raw_word.trim_matches('\'');
        if word.is_empty() {
            continue;
        }
        if word.chars().any(is_cjk) {
            tokens.extend(word.chars().filter(|&c| is_cjk(c)).map(String::from));
        } else if word.len() >= 2 {
            // NOTE(review): `len()` counts bytes, so a single non-ASCII letter
            // passes this filter while a single ASCII letter does not — confirm intended.
            tokens.push(word.to_string());
        }
    }
    tokens
}
/// Builds skip-bigrams of the form `left~right` from `tokens`.
///
/// Token `i` is paired with tokens `i + 2 ..= i + 1 + max_gap`, i.e. with the
/// tokens that lie 1 to `max_gap` positions beyond its immediate neighbour.
/// Adjacent pairs are never produced, and `max_gap == 0` yields nothing.
/// Output is ordered by `i`, then by the partner index.
///
/// The upper bound uses saturating arithmetic so that a very large `max_gap`
/// (for example `usize::MAX`, meaning "no limit") cannot overflow.
pub fn generate_skip_bigrams(tokens: &[String], max_gap: usize) -> Vec<String> {
    let last = tokens.len().saturating_sub(1);
    let mut result = Vec::new();
    for (i, left) in tokens.iter().enumerate() {
        let furthest = i.saturating_add(1).saturating_add(max_gap).min(last);
        // Indices `i + 2 ..= furthest`; empty when the window lies past the end.
        for right in tokens.iter().take(furthest + 1).skip(i + 2) {
            result.push(format!("{}~{}", left, right));
        }
    }
    result
}
pub fn tokenize(query: &str) -> Vec<String> {
let lower = query.to_lowercase();
let expanded = expand_contractions(&lower);
let mut all_terms: Vec<String> = Vec::new();
let mut seen: HashSet<String> = HashSet::new();
for segment in expanded.split(['.', '!', '?']) {
let segment = segment.trim();
if segment.is_empty() {
continue;
}
for term in tokenize_segment(segment) {
if seen.insert(term.clone()) {
all_terms.push(term);
}
}
}
all_terms
}
/// Tokenizes one sentence-level segment into unigram and bigram terms.
///
/// 1. The text is split on every character that is neither alphanumeric nor `-`.
/// 2. Pieces without CJK characters become one word each. In a piece that
///    contains CJK, each maximal CJK run is expanded by `expand_cjk_run`, and
///    every other alphanumeric character becomes a one-character word.
/// 3. Negated positions are computed over the resulting word list.
/// 4. Every non-stop word is emitted once, prefixed with `not_` when negated.
/// 5. Bigrams (`"a b"`) are then emitted for consecutive non-stop, non-CJK terms.
///
/// Unigrams come first in word order, followed by bigrams; duplicates are dropped.
fn tokenize_segment(text: &str) -> Vec<String> {
    let stop_set = universal_stop_set();
    let cjk_stop_set = cjk_stop_char_set();

    let mut words: Vec<String> = Vec::new();
    let mut is_word_cjk: Vec<bool> = Vec::new();

    let pieces = text
        .split(|c: char| !(c.is_alphanumeric() || c == '-'))
        .filter(|piece| !piece.is_empty());
    for piece in pieces {
        if !piece.chars().any(is_cjk) {
            words.push(piece.to_string());
            is_word_cjk.push(false);
            continue;
        }

        // Mixed or pure-CJK piece: walk it character by character.
        let mut pending_run = String::new();
        for ch in piece.chars() {
            if is_cjk(ch) {
                pending_run.push(ch);
                continue;
            }
            if !pending_run.is_empty() {
                expand_cjk_run(&pending_run, cjk_stop_set, &mut words, &mut is_word_cjk);
                pending_run.clear();
            }
            // NOTE(review): non-CJK alphanumerics inside a mixed piece are emitted
            // one character at a time (`ab` next to a CJK run becomes `a`, `b`),
            // and `-` is dropped — confirm this is intended.
            if ch.is_alphanumeric() {
                words.push(ch.to_string());
                is_word_cjk.push(false);
            }
        }
        if !pending_run.is_empty() {
            expand_cjk_run(&pending_run, cjk_stop_set, &mut words, &mut is_word_cjk);
        }
    }

    let negated = {
        let borrowed: Vec<&str> = words.iter().map(String::as_str).collect();
        find_negated_positions(&borrowed, stop_set)
    };

    // Surviving (non-stop) terms paired with their CJK flag, in word order.
    let kept: Vec<(String, bool)> = words
        .iter()
        .zip(&is_word_cjk)
        .enumerate()
        .filter(|(_, (word, _))| !stop_set.contains(word.as_str()))
        .map(|(i, (word, &cjk))| {
            let term = if negated.contains(&i) {
                format!("not_{}", word)
            } else {
                word.clone()
            };
            (term, cjk)
        })
        .collect();

    let mut terms: Vec<String> = Vec::new();
    let mut seen: HashSet<String> = HashSet::new();
    for (term, _) in &kept {
        if seen.insert(term.clone()) {
            terms.push(term.clone());
        }
    }

    // Bigrams only join non-CJK terms; CJK terms are skipped over, not treated as gaps.
    let latin_terms: Vec<&str> = kept
        .iter()
        .filter(|(_, cjk)| !*cjk)
        .map(|(term, _)| term.as_str())
        .collect();
    for pair in latin_terms.windows(2) {
        let bigram = format!("{} {}", pair[0], pair[1]);
        if seen.insert(bigram.clone()) {
            terms.push(bigram);
        }
    }
    terms
}
/// Expands one run of CJK characters into words, appending to `words` and `is_cjk` in lockstep.
///
/// Characters contained in `stop_set` are removed first. What remains yields:
/// * nothing, when no character is left;
/// * the single character, when exactly one is left;
/// * otherwise the whole filtered run followed by each pair of neighbouring
///   characters (for a two-character run the run and its only pair are the
///   same string, so it is pushed twice).
///
/// Every pushed word gets a matching `true` in `is_cjk`.
fn expand_cjk_run(
    run: &str,
    stop_set: &HashSet<char>,
    words: &mut Vec<String>,
    is_cjk: &mut Vec<bool>,
) {
    let kept: Vec<char> = run.chars().filter(|c| !stop_set.contains(c)).collect();
    match kept.as_slice() {
        [] => {}
        [only] => {
            words.push(only.to_string());
            is_cjk.push(true);
        }
        _ => {
            words.push(kept.iter().collect());
            is_cjk.push(true);
            for pair in kept.windows(2) {
                words.push(pair.iter().collect());
                is_cjk.push(true);
            }
        }
    }
}
/// Derives weighted terms from a set of training queries.
///
/// Each query is tokenized with `tokenize`, which returns every term at most
/// once per query, so a term's count is the number of queries that produced it.
/// The weight of a term is `0.3 + 0.65 * count / max_count`, capped at `0.95`
/// and rounded to two decimals.
///
/// Returns an empty map when `queries` is empty or no query yields a term.
pub fn training_to_terms(queries: &[String]) -> HashMap<String, f32> {
    let mut occurrences: HashMap<String, u32> = HashMap::new();
    for term in queries.iter().flat_map(|query| tokenize(query)) {
        *occurrences.entry(term).or_default() += 1;
    }

    // No terms at all (including the no-queries case) means nothing to weight.
    let peak = match occurrences.values().max() {
        Some(&highest) => highest as f32,
        None => return HashMap::new(),
    };

    occurrences
        .into_iter()
        .map(|(term, count)| {
            let scaled = 0.3 + 0.65 * (count as f32 / peak);
            let capped = scaled.min(0.95);
            (term, (capped * 100.0).round() / 100.0)
        })
        .collect()
}
/// A term together with the span it was found at.
#[derive(Debug, Clone)]
pub struct PositionedTerm {
/// Term text; `tokenize_positioned` prefixes it with `not_` when the word is negated.
pub term: String,
/// Start of the span. `tokenize_positioned` stores a char index into the char vector it returns.
pub offset: usize,
/// End of the span (exclusive), in the same units as `offset`.
pub end_offset: usize,
/// Script flag. `tokenize_positioned` always sets this to `false`;
/// presumably other producers set it for CJK terms — TODO confirm.
pub is_cjk: bool,
}
/// Tokenizes `query` into non-stop terms that remember where they were found.
///
/// The query is lowercased and contractions are expanded; all offsets are char
/// indices into that processed text, which is returned as the second tuple
/// element. A word is a maximal run of alphanumeric characters and `-`.
/// Stop words are dropped, and negated words get a `not_` prefix. Every
/// returned term has `is_cjk == false`.
pub fn tokenize_positioned(query: &str) -> (Vec<PositionedTerm>, Vec<char>) {
    let chars: Vec<char> = expand_contractions(&query.to_lowercase())
        .chars()
        .collect();
    let is_word_char = |c: char| c.is_alphanumeric() || c == '-';

    // Collect (word, start, end) spans; `open` holds the start of the word in progress.
    let mut spans: Vec<(String, usize, usize)> = Vec::new();
    let mut open: Option<usize> = None;
    for (idx, &ch) in chars.iter().enumerate() {
        match (is_word_char(ch), open) {
            (true, None) => open = Some(idx),
            (false, Some(start)) => {
                spans.push((chars[start..idx].iter().collect(), start, idx));
                open = None;
            }
            _ => {}
        }
    }
    if let Some(start) = open {
        spans.push((chars[start..].iter().collect(), start, chars.len()));
    }

    let stop_set = universal_stop_set();
    let negated = {
        let surface: Vec<&str> = spans.iter().map(|(word, _, _)| word.as_str()).collect();
        find_negated_positions(&surface, stop_set)
    };

    let mut positioned: Vec<PositionedTerm> = Vec::new();
    for (idx, (word, start, end)) in spans.iter().enumerate() {
        if stop_set.contains(word.as_str()) {
            continue;
        }
        let term = if negated.contains(&idx) {
            format!("not_{}", word)
        } else {
            word.clone()
        };
        positioned.push(PositionedTerm {
            term,
            offset: *start,
            end_offset: *end,
            is_cjk: false,
        });
    }
    (positioned, chars)
}
/// Reports whether `c` belongs to one of the scripts this module handles
/// character by character instead of as whitespace-delimited words.
///
/// Despite the name, the set also covers Thai, Lao, Myanmar and Khmer in
/// addition to Chinese, Japanese and Korean.
pub fn is_cjk(c: char) -> bool {
    matches!(
        u32::from(c),
        0x0E00..=0x0E7F     // Thai
        | 0x0E80..=0x0EFF   // Lao
        | 0x1000..=0x109F   // Myanmar
        | 0x1100..=0x11FF   // Hangul Jamo
        | 0x1780..=0x17FF   // Khmer
        | 0x3040..=0x309F   // Hiragana
        | 0x30A0..=0x30FF   // Katakana
        | 0x3130..=0x318F   // Hangul Compatibility Jamo
        | 0x3400..=0x4DBF   // CJK Unified Ideographs Extension A
        | 0x4E00..=0x9FFF   // CJK Unified Ideographs
        | 0xAC00..=0xD7AF   // Hangul Syllables
        | 0xF900..=0xFAFF   // CJK Compatibility Ideographs
    )
}
/// Single-character negation markers; `find_cjk_negated_regions` opens a negated region right after each occurrence.
pub const CJK_NEGATION_MARKERS: &[&str] = &["不", "没", "别", "未"];
/// Negation suffixes matched by `find_cjk_negated_regions`; the region it records starts after the suffix.
pub const JA_NEGATION_SUFFIXES: &[&str] = &["ない", "しない", "できない"];
/// Characters that end a negated region in `find_cjk_negated_regions`.
pub const CJK_CLAUSE_BOUNDARIES: &[char] = &[',', '、', '。', ';'];
/// Conjunctions that also end a region opened by a negation marker (they are not checked for suffix-opened regions).
pub const CJK_CONJUNCTIONS: &[&str] = &["但", "然后", "而且", "或者"];
/// Script class assigned to a run by `split_script_runs`.
#[derive(Debug, Clone, PartialEq)]
pub enum ScriptType {
/// Run driven by alphanumeric characters for which `is_cjk` is false; also used for text with no script characters at all.
Latin,
/// Run driven by characters for which `is_cjk` is true.
Cjk,
}
/// One same-script stretch of text produced by `split_script_runs`.
#[derive(Debug, Clone)]
pub struct ScriptRun {
/// Script class of the run.
pub script: ScriptType,
/// The run's text, including any spaces or punctuation absorbed into it.
pub text: String,
/// Position recorded for the run, counted in chars of the original input.
pub char_offset: usize,
}
pub fn split_script_runs(text: &str) -> Vec<ScriptRun> {
let mut runs = Vec::new();
let mut current_text = String::new();
let mut current_is_cjk: Option<bool> = None;
let mut run_start = 0;
for (char_idx, c) in text.chars().enumerate() {
if is_cjk(c) {
if current_is_cjk == Some(false) {
if !current_text.is_empty() {
runs.push(ScriptRun {
script: ScriptType::Latin,
text: std::mem::take(&mut current_text),
char_offset: run_start,
});
}
run_start = char_idx;
}
if current_is_cjk.is_none() {
run_start = char_idx;
}
current_is_cjk = Some(true);
current_text.push(c);
} else if c.is_alphanumeric() {
if current_is_cjk == Some(true) {
if !current_text.is_empty() {
runs.push(ScriptRun {
script: ScriptType::Cjk,
text: std::mem::take(&mut current_text),
char_offset: run_start,
});
}
run_start = char_idx;
}
if current_is_cjk.is_none() {
run_start = char_idx;
}
current_is_cjk = Some(false);
current_text.push(c);
} else {
current_text.push(c);
}
}
if !current_text.is_empty() {
let script = match current_is_cjk {
Some(true) => ScriptType::Cjk,
_ => ScriptType::Latin,
};
runs.push(ScriptRun {
script,
text: current_text,
char_offset: run_start,
});
}
runs
}
pub fn find_cjk_negated_regions(text: &str) -> Vec<(usize, usize)> {
let chars: Vec<char> = text.chars().collect();
let text_len = chars.len();
let mut regions = Vec::new();
let stop_set: HashSet<char> = CJK_CLAUSE_BOUNDARIES.iter().copied().collect();
for (i, &c) in chars.iter().enumerate() {
let s: String = c.to_string();
if CJK_NEGATION_MARKERS.contains(&s.as_str()) {
let neg_start = i + 1; let mut neg_end = text_len;
for j in neg_start..text_len {
if stop_set.contains(&chars[j]) {
neg_end = j;
break;
}
let remaining: String = chars[j..].iter().collect();
if CJK_CONJUNCTIONS
.iter()
.any(|conj| remaining.starts_with(conj))
{
neg_end = j;
break;
}
}
if neg_start < neg_end {
regions.push((neg_start, neg_end));
}
}
}
let text_str: String = chars.iter().collect();
for suffix in JA_NEGATION_SUFFIXES {
let suffix_chars: Vec<char> = suffix.chars().collect();
let suffix_len = suffix_chars.len();
if text_len >= suffix_len {
for i in 0..=(text_len - suffix_len) {
if chars[i..i + suffix_len] == suffix_chars[..] {
let neg_start = i + suffix_len;
let mut neg_end = text_len;
#[allow(clippy::needless_range_loop)]
for j in neg_start..text_len {
if stop_set.contains(&chars[j]) {
neg_end = j;
break;
}
}
if neg_start < neg_end {
regions.push((neg_start, neg_end));
}
}
}
}
}
let _ = text_str;
regions
}
/// Builds character bigrams from the CJK content of `text`.
///
/// Characters that are not CJK (per `is_cjk`) or that are in the CJK stop
/// character set are removed first; each pair of neighbours in what remains
/// becomes one bigram. Pairs can therefore span characters that were not
/// adjacent in `text`. Fewer than two surviving characters yield no bigrams.
pub fn generate_cjk_residual_bigrams(text: &str) -> Vec<String> {
    let stop_chars = cjk_stop_char_set();
    let survivors: Vec<char> = text
        .chars()
        .filter(|&c| is_cjk(c) && !stop_chars.contains(&c))
        .collect();
    survivors
        .windows(2)
        .map(|pair| pair.iter().collect::<String>())
        .collect()
}
/// Proposes positions at which `query` can be cut into clauses.
///
/// All returned positions are char indices into `query`, sorted ascending.
///
/// Candidates come from three sources:
/// * just after every `.`, `?` or `!`, and at every `;`;
/// * at a `,` whose following text (up to 15 chars, leading whitespace
///   ignored, compared in lowercase) starts with `and `, `but `, `or `, `so `,
///   `because `, `also `, `then ` or `however `;
/// * at the start of a whitespace-delimited word that is `and`, `but`, `or`,
///   `because`, `however` or `also` (trailing non-letters ignored), provided
///   the query has at least five words, at least three words precede it and
///   at least two follow.
///
/// Candidates are then filtered from left to right: one is kept only if the
/// text since the previously kept candidate holds at least six alphanumeric
/// characters, and the last kept candidate is dropped if fewer than six
/// follow it.
///
/// Word-based candidates used to be recorded as byte offsets while every
/// other candidate was a char index, so queries containing multi-byte
/// characters got misplaced breaks. The comma look-ahead also read from a
/// separately lowercased copy whose indices can drift from the original when
/// lowercasing changes the number of chars.
pub fn segment_breaks(query: &str) -> Vec<usize> {
    const MIN_CONTENT_CHARS: usize = 6;
    const COMMA_LOOKAHEAD: usize = 15;
    const COMMA_CONNECTIVES: [&str; 8] = [
        "and ", "but ", "or ", "so ", "because ", "also ", "then ", "however ",
    ];
    const BARE_CONJUNCTIONS: [&str; 6] = ["and", "but", "or", "because", "however", "also"];

    let chars: Vec<char> = query.chars().collect();
    let len = chars.len();
    let mut breaks: Vec<usize> = Vec::new();

    // Punctuation-driven candidates.
    for (i, &c) in chars.iter().enumerate() {
        match c {
            '.' | '?' | '!' => breaks.push(i + 1),
            ';' => breaks.push(i),
            ',' if i + 1 < len => {
                let end = len.min(i + 1 + COMMA_LOOKAHEAD);
                // Lowercase the slice itself so indices stay aligned with `chars`.
                let lookahead: String = chars[i + 1..end]
                    .iter()
                    .flat_map(|ch| ch.to_lowercase())
                    .collect();
                let lookahead = lookahead.trim_start();
                if COMMA_CONNECTIVES
                    .iter()
                    .any(|connective| lookahead.starts_with(*connective))
                {
                    breaks.push(i);
                }
            }
            _ => {}
        }
    }

    // Conjunction-driven candidates, tracked in char units alongside the byte cursor.
    let words: Vec<&str> = query.split_whitespace().collect();
    if words.len() >= 5 {
        let mut byte_pos = 0;
        let mut char_pos = 0;
        for (wi, &word) in words.iter().enumerate() {
            if let Some(gap) = query[byte_pos..].find(word) {
                char_pos += query[byte_pos..byte_pos + gap].chars().count();
                let word_start = char_pos;
                char_pos += word.chars().count();
                byte_pos += gap + word.len();

                let lowered = word.to_lowercase();
                let bare = lowered.trim_end_matches(|ch: char| !ch.is_alphabetic());
                if BARE_CONJUNCTIONS.contains(&bare) && wi >= 3 && wi + 2 < words.len() {
                    breaks.push(word_start);
                }
            }
        }
    }

    breaks.sort_unstable();
    breaks.dedup();
    if breaks.is_empty() {
        return breaks;
    }

    let content_chars =
        |segment: &str| segment.chars().filter(|ch| ch.is_alphanumeric()).count();

    let mut filtered = Vec::new();
    let mut prev = 0usize;
    for &brk in &breaks {
        let segment =
            &query[byte_offset_from_char(&chars, prev)..byte_offset_from_char(&chars, brk)];
        if content_chars(segment) >= MIN_CONTENT_CHARS {
            filtered.push(brk);
            prev = brk;
        }
    }
    // A break that leaves too little text after it is not worth keeping.
    if let Some(&last_brk) = filtered.last() {
        let tail = &query[byte_offset_from_char(&chars, last_brk)..];
        if content_chars(tail) < MIN_CONTENT_CHARS {
            filtered.pop();
        }
    }
    filtered
}

/// Converts a char index into the matching byte offset of the string that
/// `chars` was collected from. Indices past the end map to the total byte length.
fn byte_offset_from_char(chars: &[char], char_pos: usize) -> usize {
    chars[..char_pos.min(chars.len())]
        .iter()
        .map(|c| c.len_utf8())
        .sum()
}
pub fn is_learnable_cjk_bigram(bigram: &str) -> bool {
let stop_set = cjk_stop_char_set();
let neg_chars: HashSet<char> = ['不', '没', '别', '未'].iter().copied().collect();
let chars: Vec<char> = bigram.chars().collect();
if chars.len() < 2 {
return false;
}
chars
.iter()
.any(|c| !stop_set.contains(c) && !neg_chars.contains(c))
}
// Unit tests for tokenization, negation scoping, training weights and the CJK helpers.
#[cfg(test)]
mod tests {
use super::*;
// Stop words are dropped; content words are kept.
#[test]
fn tokenize_simple() {
let terms = tokenize("list my repos");
assert!(terms.contains(&"list".to_string()));
assert!(terms.contains(&"repos".to_string()));
assert!(!terms.contains(&"my".to_string()));
}
// Adjacent content words also produce space-joined bigrams.
#[test]
fn tokenize_bigrams() {
let terms = tokenize("charge credit card");
assert!(terms.contains(&"charge".to_string()));
assert!(terms.contains(&"credit".to_string()));
assert!(terms.contains(&"card".to_string()));
assert!(terms.contains(&"charge credit".to_string()));
assert!(terms.contains(&"credit card".to_string()));
}
// Apostrophes and sentence punctuation do not leak into terms.
#[test]
fn tokenize_strips_punctuation() {
let terms = tokenize("what's my repo?");
assert!(terms.contains(&"repo".to_string()));
}
// Empty input yields no terms.
#[test]
fn tokenize_empty() {
assert!(tokenize("").is_empty());
}
// A query made only of stop words yields no terms.
#[test]
fn tokenize_all_stop_words() {
assert!(tokenize("the a an in on at to of for by").is_empty());
}
// Repeated words are reported once.
#[test]
fn tokenize_deduplication() {
let terms = tokenize("charge charge charge");
assert_eq!(terms.iter().filter(|t| *t == "charge").count(), 1);
}
// Contractions are expanded, leaving no fragments such as "don", "t", "s" or "won".
#[test]
fn tokenize_contractions() {
let terms = tokenize("I don't want to cancel");
assert!(!terms.contains(&"don".to_string()));
assert!(!terms.contains(&"t".to_string()));
assert!(
terms.contains(&"not_cancel".to_string()),
"cancel should be negated"
);
assert!(
!terms.contains(&"cancel".to_string()),
"bare cancel should not appear"
);
let terms = tokenize("I can't log in");
assert!(!terms.contains(&"t".to_string()));
assert!(terms.contains(&"log".to_string()));
let terms = tokenize("what's happening");
assert!(!terms.contains(&"s".to_string()));
assert!(terms.contains(&"happening".to_string()));
let terms = tokenize("it won't work");
assert!(!terms.contains(&"won".to_string()));
assert!(terms.contains(&"work".to_string()));
}
// Words inside a negation scope carry the `not_` prefix; words outside do not.
#[test]
fn tokenize_negation_prefix() {
let terms = tokenize("don't cancel my order");
assert!(
!terms.contains(&"cancel".to_string()),
"bare 'cancel' should not appear"
);
assert!(
terms.contains(&"not_cancel".to_string()),
"should have not_cancel"
);
assert!(
!terms.contains(&"order".to_string()),
"bare 'order' should not appear"
);
assert!(
terms.contains(&"not_order".to_string()),
"order is also in negation scope"
);
let terms = tokenize("don't want to cancel my order");
assert!(
terms.contains(&"not_cancel".to_string()),
"cancel should be negated"
);
assert!(
terms.contains(&"not_order".to_string()),
"order should be negated"
);
assert!(!terms.contains(&"cancel".to_string()));
assert!(!terms.contains(&"order".to_string()));
let terms = tokenize("I don't have my card");
assert!(
!terms.contains(&"card".to_string()),
"card is in negation scope"
);
assert!(terms.contains(&"not_card".to_string()));
let terms = tokenize("I can't log in");
assert!(terms.contains(&"log".to_string()));
assert!(
!terms.contains(&"not_log".to_string()),
"can't is inability, not negation"
);
let terms = tokenize("I never received my card");
assert!(!terms.contains(&"received".to_string()));
assert!(terms.contains(&"not_received".to_string()));
assert!(
!terms.contains(&"card".to_string()),
"card is also in scope after never"
);
assert!(terms.contains(&"not_card".to_string()));
let terms = tokenize("track my order without cancelling");
assert!(terms.contains(&"track".to_string()));
assert!(terms.contains(&"order".to_string()));
assert!(!terms.contains(&"cancelling".to_string()));
assert!(terms.contains(&"not_cancelling".to_string()));
let terms = tokenize("cancel my order");
assert!(terms.contains(&"cancel".to_string()));
assert!(terms.contains(&"order".to_string()));
assert!(!terms.iter().any(|t| t.starts_with("not_")));
}
// Pseudo-negation phrases ("no problem", "can't wait") do not open a negation scope.
#[test]
fn tokenize_pseudo_negation_bypass() {
let terms = tokenize("no problem with my order");
assert!(
!terms.iter().any(|t| t.starts_with("not_")),
"pseudo-negation should not trigger not_ prefix"
);
assert!(terms.contains(&"problem".to_string()));
let terms = tokenize("I can't wait to start");
assert!(terms.contains(&"wait".to_string()));
assert!(terms.contains(&"start".to_string()));
}
// Negation opened after "but" affects only the clause that follows it.
#[test]
fn tokenize_negation_in_multi_intent_context() {
let terms = tokenize("cancel my order but don't refund");
assert!(
terms.contains(&"cancel".to_string()),
"cancel should be bare (not negated)"
);
assert!(terms.contains(&"order".to_string()));
assert!(
terms.contains(&"not_refund".to_string()),
"refund should be negated"
);
assert!(
!terms.contains(&"refund".to_string()),
"bare refund should not appear"
);
}
// Terms seen in more queries get a higher weight; equal counts give equal weights.
#[test]
fn training_basic() {
let terms = training_to_terms(&[
"pause the music".to_string(),
"stop playing".to_string(),
"stop the music".to_string(),
]);
assert!(terms.contains_key("music"));
assert!(terms.contains_key("stop"));
assert!(terms.contains_key("pause"));
assert!(terms["music"] > terms["pause"]);
assert!((terms["music"] - terms["stop"]).abs() < 0.01);
}
// Bigrams take part in training; a stop word between two content words does not break the pair.
#[test]
fn training_includes_bigrams() {
let terms =
training_to_terms(&["stop the music".to_string(), "stop playing now".to_string()]);
assert!(terms.contains_key("stop music"));
assert!(terms.contains_key("stop playing"));
}
// No queries, or queries made only of stop words, give an empty map.
#[test]
fn training_empty() {
assert!(training_to_terms(&[]).is_empty());
assert!(training_to_terms(&["the a an".to_string()]).is_empty());
}
// Every weight stays within [0.3, 0.95].
#[test]
fn training_weight_range() {
let queries: Vec<String> = (0..15)
.map(|i| {
if i < 10 {
"music".to_string()
} else {
"song".to_string()
}
})
.collect();
let terms = training_to_terms(&queries);
for weight in terms.values() {
assert!(*weight >= 0.3);
assert!(*weight <= 0.95);
}
}
// Han, Hiragana, Katakana and Hangul count as CJK; ASCII letters, digits and spaces do not.
#[test]
fn is_cjk_detection() {
assert!(is_cjk('取')); assert!(is_cjk('の')); assert!(is_cjk('カ')); assert!(is_cjk('한')); assert!(!is_cjk('a'));
assert!(!is_cjk('1'));
assert!(!is_cjk(' '));
}
// Pure Latin text, spaces included, forms a single run.
#[test]
fn split_script_runs_latin_only() {
let runs = split_script_runs("cancel my order");
assert_eq!(runs.len(), 1);
assert_eq!(runs[0].script, ScriptType::Latin);
}
// Pure CJK text forms a single run starting at offset 0.
#[test]
fn split_script_runs_cjk_only() {
let runs = split_script_runs("取消订单");
assert_eq!(runs.len(), 1);
assert_eq!(runs[0].script, ScriptType::Cjk);
assert_eq!(runs[0].char_offset, 0);
}
// Alternating scripts produce alternating runs.
#[test]
fn split_script_runs_mixed() {
let runs = split_script_runs("cancel 取消订单 order");
assert_eq!(runs.len(), 3);
assert_eq!(runs[0].script, ScriptType::Latin);
assert_eq!(runs[1].script, ScriptType::Cjk);
assert_eq!(runs[2].script, ScriptType::Latin);
}
// Four CJK characters give three neighbouring pairs.
#[test]
fn cjk_residual_bigrams() {
let bigrams = generate_cjk_residual_bigrams("取消订单");
assert_eq!(bigrams.len(), 3);
assert!(bigrams.contains(&"取消".to_string()));
assert!(bigrams.contains(&"订单".to_string()));
}
// Stop characters are removed before pairing, so they never appear in a bigram.
#[test]
fn cjk_residual_bigrams_filters_stop_chars() {
let bigrams = generate_cjk_residual_bigrams("我的订单");
assert!(bigrams.contains(&"订单".to_string()));
assert!(!bigrams.iter().any(|b| b.contains('的'))); }
// A negation marker opens a region that runs to the end of the text.
#[test]
fn cjk_negation_regions() {
let regions = find_cjk_negated_regions("不取消");
assert_eq!(regions.len(), 1);
assert_eq!(regions[0], (1, 3));
}
// A clause boundary character closes the negated region.
#[test]
fn cjk_negation_stops_at_clause_boundary() {
let regions = find_cjk_negated_regions("不取消,查看订单");
assert_eq!(regions.len(), 1);
assert_eq!(regions[0].0, 1);
assert_eq!(regions[0].1, 3); }
// A conjunction closes the negated region.
#[test]
fn cjk_negation_stops_at_conjunction() {
let regions = find_cjk_negated_regions("不取消然后查看");
assert_eq!(regions.len(), 1);
assert_eq!(regions[0].0, 1);
assert_eq!(regions[0].1, 3); }
// One content character is enough to make a bigram learnable; all-stop or single-char input is not.
#[test]
fn learnable_cjk_bigram_checks() {
assert!(is_learnable_cjk_bigram("取消"));
assert!(is_learnable_cjk_bigram("订单"));
assert!(is_learnable_cjk_bigram("我的")); assert!(!is_learnable_cjk_bigram("的了")); assert!(!is_learnable_cjk_bigram("a")); }
// Offsets are char positions with an exclusive end.
#[test]
fn positioned_terms_have_char_offsets() {
let (terms, _chars) = tokenize_positioned("cancel my order");
let cancel = terms.iter().find(|t| t.term == "cancel").unwrap();
assert_eq!(cancel.offset, 0);
assert_eq!(cancel.end_offset, 6);
let order = terms.iter().find(|t| t.term == "order").unwrap();
assert_eq!(order.offset, 10);
assert_eq!(order.end_offset, 15);
}
}