/// Returns `true` when `c` falls in one of the CJK-related Unicode blocks
/// handled by the tokenizer: CJK Unified Ideographs (+ Extension A and
/// Compatibility Ideographs), CJK punctuation, Hiragana, Katakana, Hangul
/// syllables, and Hangul Jamo.
fn is_cjk(c: char) -> bool {
    const CJK_RANGES: [(char, char); 8] = [
        ('\u{4E00}', '\u{9FFF}'), // CJK Unified Ideographs
        ('\u{3400}', '\u{4DBF}'), // CJK Unified Ideographs Extension A
        ('\u{F900}', '\u{FAFF}'), // CJK Compatibility Ideographs
        ('\u{3000}', '\u{303F}'), // CJK Symbols and Punctuation
        ('\u{3040}', '\u{309F}'), // Hiragana
        ('\u{30A0}', '\u{30FF}'), // Katakana
        ('\u{AC00}', '\u{D7AF}'), // Hangul Syllables
        ('\u{1100}', '\u{11FF}'), // Hangul Jamo
    ];
    CJK_RANGES.iter().any(|&(lo, hi)| (lo..=hi).contains(&c))
}
/// Splits an identifier such as `parseConfigFile` or `get_user_name` into
/// its lowercase component tokens, collected eagerly into a `Vec`.
///
/// Convenience wrapper over [`tokenize_identifier_iter`]; see that function
/// for the splitting rules.
pub fn tokenize_identifier(s: &str) -> Vec<String> {
    tokenize_identifier_iter(s).collect::<Vec<String>>()
}
/// Lazily splits an identifier into lowercase tokens.
///
/// Token boundaries are `_`, `-`, and space separators, camelCase
/// transitions, and CJK characters (each CJK character becomes a
/// single-character token of its own).
pub(super) fn tokenize_identifier_iter(s: &str) -> impl Iterator<Item = String> + '_ {
    let chars = s.chars().peekable();
    TokenizeIdentifierIter {
        chars,
        current: String::new(),
        done: false,
    }
}
/// Streaming tokenizer state backing [`tokenize_identifier_iter`].
struct TokenizeIdentifierIter<'a> {
    // Remaining input characters.
    // NOTE(review): `Peekable` looks unnecessary — `peek()` is never called
    // in the visible code, so plain `Chars<'a>` would suffice; confirm no
    // other code relies on this field's type before simplifying.
    chars: std::iter::Peekable<std::str::Chars<'a>>,
    // Token currently being accumulated; flushed via `mem::take` whenever a
    // boundary is hit in `next()`.
    current: String,
    // Set once the underlying char iterator is exhausted so that subsequent
    // `next()` calls return `None` immediately.
    done: bool,
}
impl<'a> Iterator for TokenizeIdentifierIter<'a> {
    type Item = String;

    /// Yields the next lowercase token, or `None` once the input is exhausted.
    ///
    /// Splitting rules (applied in the order of the match arms below):
    /// - `_`, `-`, and space are separators: they end the pending token and
    ///   are never emitted themselves; runs of separators yield no empties.
    /// - A CJK character is always emitted as its own one-character token.
    /// - An uppercase letter with a pending token marks a camelCase boundary:
    ///   the pending token is emitted and the letter (lowercased) starts the
    ///   next one.
    /// - Any other character (including a leading uppercase letter) is
    ///   lowercased and appended to the pending token.
    fn next(&mut self) -> Option<Self::Item> {
        if self.done {
            return None;
        }
        loop {
            match self.chars.next() {
                // Separator: flush the pending token if there is one,
                // otherwise just skip it.
                Some(c) if c == '_' || c == '-' || c == ' ' => {
                    if !self.current.is_empty() {
                        return Some(std::mem::take(&mut self.current));
                    }
                }
                // CJK char: flush the pending token first, stashing the CJK
                // char in `current` so the next call emits it; with nothing
                // pending, emit it alone immediately.
                Some(c) if is_cjk(c) => {
                    if !self.current.is_empty() {
                        let result = std::mem::take(&mut self.current);
                        self.current.push(c);
                        return Some(result);
                    }
                    return Some(c.to_string());
                }
                // camelCase boundary: emit the pending token and seed the
                // next one with the lowercased letter.
                // NOTE(review): `to_lowercase()` can yield multiple chars for
                // some scripts (e.g. 'İ'); only the first is kept here —
                // confirm that single-char lowering is intentional.
                Some(c) if c.is_uppercase() && !self.current.is_empty() => {
                    let result = std::mem::take(&mut self.current);
                    self.current.push(c.to_lowercase().next().unwrap_or(c));
                    return Some(result);
                }
                // Ordinary character: lowercase and accumulate.
                Some(c) => {
                    self.current.push(c.to_lowercase().next().unwrap_or(c));
                }
                // End of input: flush the final token, then report exhaustion
                // on every later call via the `done` flag.
                None => {
                    self.done = true;
                    if !self.current.is_empty() {
                        return Some(std::mem::take(&mut self.current));
                    }
                    return None;
                }
            }
        }
    }
}
/// Upper bound, in bytes, on the string produced by `normalize_for_fts`.
const MAX_FTS_OUTPUT_LEN: usize = 16384;
/// Normalizes `text` for full-text-search indexing: each run of
/// alphanumeric/underscore characters is split into lowercase tokens via
/// [`tokenize_identifier_iter`] and the tokens are joined with single spaces.
///
/// The output is capped at `MAX_FTS_OUTPUT_LEN` bytes. Truncation prefers
/// the last space so no token is emitted partially, and always cuts on a
/// UTF-8 char boundary (so multi-byte/CJK output cannot cause a panic).
pub fn normalize_for_fts(text: &str) -> String {
    // Largest index <= `idx` that lies on a char boundary of `s`.
    // Stable replacement for the unstable `str::floor_char_boundary`.
    fn floor_boundary(s: &str, idx: usize) -> usize {
        if idx >= s.len() {
            return s.len();
        }
        let mut i = idx;
        while !s.is_char_boundary(i) {
            i -= 1;
        }
        i
    }
    // Truncate `out` to at most MAX_FTS_OUTPUT_LEN bytes, cutting at the
    // last space when one exists so a trailing partial token is dropped.
    fn truncate_output(out: &mut String) {
        let boundary = floor_boundary(out, MAX_FTS_OUTPUT_LEN);
        let cut = out[..boundary].rfind(' ').unwrap_or(boundary);
        out.truncate(cut);
    }
    // Tokenize one word and append its tokens to `result`, space-separated.
    fn flush_word(word: &str, result: &mut String) {
        for token in tokenize_identifier_iter(word) {
            if !result.is_empty() {
                result.push(' ');
            }
            result.push_str(&token);
        }
    }
    let mut result = String::new();
    let mut current_word = String::new();
    for c in text.chars() {
        if c.is_alphanumeric() || c == '_' {
            current_word.push(c);
        } else if !current_word.is_empty() {
            flush_word(&current_word, &mut result);
            current_word.clear();
            // Bail out as soon as the cap is reached so pathological inputs
            // cannot produce unbounded output.
            if result.len() >= MAX_FTS_OUTPUT_LEN {
                truncate_output(&mut result);
                return result;
            }
        }
    }
    // Flush the trailing word (input that doesn't end with a separator).
    if !current_word.is_empty() {
        flush_word(&current_word, &mut result);
    }
    if result.len() > MAX_FTS_OUTPUT_LEN {
        truncate_output(&mut result);
    }
    result
}
#[cfg(test)]
mod tests {
    use super::*;

    /// ASCII camelCase / snake_case splitting, table-driven.
    #[test]
    fn test_tokenize_identifier() {
        let cases: &[(&str, &[&str])] = &[
            ("parseConfigFile", &["parse", "config", "file"]),
            ("get_user_name", &["get", "user", "name"]),
            ("simple", &["simple"]),
            ("", &[]),
        ];
        for &(input, expected) in cases {
            assert_eq!(tokenize_identifier(input), expected);
        }
    }

    /// CJK characters become single-char tokens, mixing freely with ASCII.
    #[test]
    fn test_tokenize_identifier_cjk() {
        let cases: &[(&str, &[&str])] = &[
            ("获取用户名", &["获", "取", "用", "户", "名"]),
            ("get用户Name", &["get", "用", "户", "name"]),
            ("こんにちは", &["こ", "ん", "に", "ち", "は"]),
            ("사용자", &["사", "용", "자"]),
            ("get_用户_name", &["get", "用", "户", "name"]),
        ];
        for &(input, expected) in cases {
            assert_eq!(tokenize_identifier(input), expected);
        }
    }

    /// CJK text is space-separated per character in FTS output.
    #[test]
    fn test_normalize_for_fts_cjk() {
        assert_eq!(normalize_for_fts("获取用户名"), "获 取 用 户 名");
        assert_eq!(normalize_for_fts("fn get_用户()"), "fn get 用 户");
    }

    /// Output must never exceed the byte cap, even for adversarial input.
    #[test]
    fn test_normalize_for_fts_output_bounded() {
        let input = "A".repeat(20000);
        let normalized = normalize_for_fts(&input);
        assert!(
            normalized.len() <= MAX_FTS_OUTPUT_LEN,
            "FTS output should be capped at {} but was {}",
            MAX_FTS_OUTPUT_LEN,
            normalized.len()
        );
    }

    /// Ordinary short inputs pass through (lowercased and tokenized) intact.
    #[test]
    fn test_normalize_for_fts_normal_input_unchanged() {
        let cases = [
            ("hello", "hello"),
            ("HelloWorld", "hello world"),
            ("get_user_name", "get user name"),
        ];
        for (input, expected) in cases {
            assert_eq!(normalize_for_fts(input), expected);
        }
    }

    /// Truncating multi-byte output must land on a char boundary (no panic).
    #[test]
    fn test_normalize_for_fts_cjk_truncation_no_panic() {
        let input: String = "获".repeat(10000);
        let normalized = normalize_for_fts(&input);
        assert!(
            normalized.len() <= MAX_FTS_OUTPUT_LEN,
            "CJK FTS output should be capped but was {}",
            normalized.len()
        );
        assert!(normalized.is_char_boundary(normalized.len()));
    }

    mod fuzz {
        use super::*;
        use proptest::prelude::*;

        proptest! {
            /// Arbitrary printable input must never panic the tokenizer.
            #[test]
            fn fuzz_tokenize_identifier_no_panic(input in "\\PC{0,200}") {
                let _ = tokenize_identifier(&input);
            }

            /// Identifier-like input never yields empty tokens.
            #[test]
            fn fuzz_tokenize_identifier_like(input in "[a-zA-Z_][a-zA-Z0-9_]{0,50}") {
                for token in &tokenize_identifier(&input) {
                    prop_assert!(!token.is_empty(), "Empty token in result");
                }
            }
        }
    }
}