use std::collections::HashMap;
/// Builds a term-frequency map for free-form text.
///
/// Every token produced by `split_and_normalize` is counted; the returned map
/// goes from token to the number of times it was emitted.
pub fn tokenize_text(text: &str) -> HashMap<String, u32> {
    split_and_normalize(text)
        .into_iter()
        .fold(HashMap::new(), |mut counts, token| {
            *counts.entry(token).or_insert(0) += 1;
            counts
        })
}
/// Builds a term-frequency map for source code.
///
/// Words come from `split_code_words`; words shorter than two bytes and noise
/// words are dropped, and each survivor is expanded with `expand_compound`
/// before its tokens are counted.
pub fn tokenize_code(content: &str) -> HashMap<String, u32> {
    let mut counts = HashMap::new();
    let identifiers = split_code_words(content)
        .into_iter()
        .filter(|candidate| candidate.len() >= 2 && !is_noise_word(candidate));
    for identifier in identifiers {
        for token in expand_compound(&identifier) {
            *counts.entry(token).or_insert(0) += 1;
        }
    }
    counts
}
/// Breaks a `/`-separated path into lowercase tokens.
///
/// Each segment is split at its last `.` into a name and an extension (a
/// segment without a dot is used whole). Pieces are lowercased and kept only
/// when at least two bytes long. The lowercased full path is always appended
/// as the final token.
pub fn tokenize_path(path: &str) -> Vec<String> {
    let mut tokens: Vec<String> = path
        .split('/')
        .flat_map(|segment| {
            let (stem, extension) = match segment.rsplit_once('.') {
                Some((stem, extension)) => (stem, Some(extension)),
                None => (segment, None),
            };
            // Name first, then the extension when there is one.
            std::iter::once(stem).chain(extension)
        })
        .map(str::to_lowercase)
        .filter(|piece| piece.len() >= 2)
        .collect();
    tokens.push(path.to_lowercase());
    tokens
}
/// Turns a user query into a de-duplicated list of search tokens.
///
/// The query is split on whitespace. From each word only alphanumeric
/// characters and `_`, `-`, `.` are kept. Cleaned words shorter than two bytes,
/// or whose lowercase form is a query stop word, are skipped. The rest are
/// expanded with `expand_compound`; each distinct token is returned once, in
/// the order it was first produced.
pub fn tokenize_query(query: &str) -> Vec<String> {
    let mut emitted = std::collections::HashSet::new();
    let mut result = Vec::new();
    // Scratch buffer reused across words to avoid one allocation per word.
    let mut scratch = String::new();
    for raw in query.split_whitespace() {
        scratch.clear();
        scratch.reserve(raw.len());
        for c in raw.chars() {
            if c.is_alphanumeric() || matches!(c, '_' | '-' | '.') {
                scratch.push(c);
            }
        }
        if scratch.len() < 2 || is_query_stop_word(&scratch.to_lowercase()) {
            continue;
        }
        let fresh = expand_compound(&scratch)
            .into_iter()
            .filter(|token| emitted.insert(token.clone()));
        result.extend(fresh);
    }
    result
}
/// Splits a camelCase / PascalCase identifier into its component words.
///
/// A new part starts at an uppercase character when either
/// * the previous character is lowercase (`camelCase` -> `camel`, `Case`), or
/// * the next character is lowercase and the part built so far is longer than
///   one character, which closes an acronym run (`HTMLParser` -> `HTML`,
///   `Parser`).
///
/// Original casing is preserved. Input without such a boundary is returned as
/// a single part, and empty input yields an empty vector.
pub fn split_camel_case(s: &str) -> Vec<String> {
    let mut parts = Vec::new();
    let mut current = String::new();
    // Length of `current` in characters. Byte length would treat a single
    // non-ASCII uppercase letter (2+ bytes in UTF-8) as a multi-letter run and
    // split `ÉCole` differently from `ECole`.
    let mut current_chars = 0usize;
    let mut prev: Option<char> = None;
    let mut rest = s.chars().peekable();

    while let Some(ch) = rest.next() {
        if ch.is_uppercase() && !current.is_empty() {
            let prev_lower = matches!(prev, Some(p) if p.is_lowercase());
            let next_lower = matches!(rest.peek(), Some(next) if next.is_lowercase());
            if prev_lower || (next_lower && current_chars > 1) {
                parts.push(std::mem::take(&mut current));
                current_chars = 0;
            }
        }
        current.push(ch);
        current_chars += 1;
        prev = Some(ch);
    }

    if !current.is_empty() {
        parts.push(current);
    }
    parts
}
/// Expands one identifier-like word into lowercase search tokens.
///
/// * If the word contains uppercase characters and `split_camel_case` yields
///   more than one part, the result is each lowercased part that is at least
///   two bytes long and not a noise word, followed by the whole word
///   lowercased.
/// * Otherwise the lowercased word is split on `_`. When more than one piece
///   of at least two bytes remains, the non-noise pieces are emitted first.
///   The full lowercased word is always the last token.
fn expand_compound(word: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    // Unicode-aware check, consistent with `split_camel_case`. Looking only at
    // ASCII bytes would let a word such as `Élan` through without lowercasing.
    let is_already_lower = !word.chars().any(char::is_uppercase);

    if !is_already_lower {
        let camel_parts = split_camel_case(word);
        if camel_parts.len() > 1 {
            tokens.extend(
                camel_parts
                    .iter()
                    .map(|part| part.to_lowercase())
                    .filter(|part| part.len() >= 2 && !is_noise_word(part)),
            );
            tokens.push(word.to_lowercase());
            return tokens;
        }
    }

    // Allocate a lowercased copy only when the word is not lowercase already.
    let lower = if is_already_lower {
        None
    } else {
        Some(word.to_lowercase())
    };
    let lower_ref = lower.as_deref().unwrap_or(word);

    let snake_parts: Vec<&str> = lower_ref.split('_').filter(|p| p.len() >= 2).collect();
    if snake_parts.len() > 1 {
        tokens.extend(
            snake_parts
                .iter()
                .filter(|part| !is_noise_word(part))
                .map(|part| part.to_string()),
        );
    }
    tokens.push(lower_ref.to_string());
    tokens
}
fn split_and_normalize(text: &str) -> Vec<String> {
let mut tokens = Vec::new();
for raw_word in text.split(|c: char| !c.is_alphanumeric() && c != '_' && c != '-') {
let word = raw_word.trim();
if word.len() < 2 {
continue;
}
if is_stop_word(word) {
continue;
}
tokens.extend(expand_compound(word));
}
tokens
}
/// Extracts identifier-like words from source code, skipping quoted literals.
///
/// A word is a maximal run of alphanumeric characters and `_`. Text between a
/// `"` or `'` and the next matching, unescaped quote is ignored. A backslash
/// inside a literal escapes the following character, so `\"` does not end the
/// literal.
///
/// Limitation: any `'` opens a literal, so a lone apostrophe (for example a
/// Rust lifetime such as `'a`) hides everything up to the next `'`.
fn split_code_words(content: &str) -> Vec<String> {
    let mut words = Vec::new();
    let mut current = String::new();
    // `Some(q)` while inside a literal that was opened by quote character `q`.
    let mut open_quote: Option<char> = None;
    // True when the previous character inside a literal was an unescaped `\`.
    let mut escaped = false;

    for ch in content.chars() {
        if let Some(quote) = open_quote {
            if escaped {
                // This character is escaped; it can never close the literal.
                escaped = false;
            } else if ch == '\\' {
                escaped = true;
            } else if ch == quote {
                open_quote = None;
            }
            continue;
        }
        match ch {
            '"' | '\'' => {
                if !current.is_empty() {
                    words.push(std::mem::take(&mut current));
                }
                open_quote = Some(ch);
            }
            c if c.is_alphanumeric() || c == '_' => current.push(c),
            _ => {
                if !current.is_empty() {
                    words.push(std::mem::take(&mut current));
                }
            }
        }
    }

    if !current.is_empty() {
        words.push(current);
    }
    words
}
pub fn is_noise_word(word: &str) -> bool {
if word
.bytes()
.all(|b| b.is_ascii_lowercase() || !b.is_ascii_alphabetic())
{
is_stop_word(word)
|| matches!(
word,
"ok" | "err"
| "const"
| "static"
| "async"
| "await"
| "match"
| "while"
| "loop"
| "break"
| "continue"
| "crate"
| "super"
| "as"
| "in"
| "ref"
| "dyn"
| "if"
| "else"
)
} else {
let w = word.to_lowercase();
is_stop_word(&w)
|| matches!(
w.as_str(),
"ok" | "err"
| "const"
| "static"
| "async"
| "await"
| "match"
| "while"
| "loop"
| "break"
| "continue"
| "crate"
| "super"
| "as"
| "in"
| "ref"
| "dyn"
| "if"
| "else"
)
}
}
/// Reports whether `word` is a general stop word.
///
/// The list mixes common English words with frequent Rust keywords. Words
/// containing ASCII uppercase letters are lowercased before the comparison;
/// all other words are compared as given.
pub fn is_stop_word(word: &str) -> bool {
    // Lowercase only when needed so the common all-lowercase case does not allocate.
    let folded;
    let key = if word.bytes().any(|b| b.is_ascii_uppercase()) {
        folded = word.to_lowercase();
        folded.as_str()
    } else {
        word
    };

    matches!(
        key,
        "the"
            | "and"
            | "for"
            | "with"
            | "from"
            | "this"
            | "that"
            | "self"
            | "mut"
            | "let"
            | "pub"
            | "use"
            | "mod"
            | "fn"
            | "impl"
            | "struct"
            | "enum"
            | "type"
            | "trait"
            | "where"
            | "return"
            | "true"
            | "false"
            | "none"
            | "some"
            | "is"
            | "are"
            | "was"
            | "has"
            | "had"
            | "not"
            | "but"
            | "all"
            | "can"
            | "will"
            | "into"
            | "then"
            | "than"
    )
}
/// Reports whether `word` is a stop word for user queries.
///
/// The comparison is exact and case-sensitive: the function does no
/// lowercasing of its own, so only lowercase input can match.
pub fn is_query_stop_word(word: &str) -> bool {
    const QUERY_STOP_WORDS: &[&str] = &[
        "the", "is", "at", "which", "on", "a", "an", "be", "to", "of", "it", "in", "do", "does",
        "was", "were", "been", "being", "have", "has", "had", "having", "can", "could", "would",
        "should", "will", "shall", "may", "might", "are", "am",
    ];
    QUERY_STOP_WORDS.contains(&word)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// True when `tokens` holds an entry equal to `wanted`.
    fn has_token(tokens: &[String], wanted: &str) -> bool {
        tokens.iter().any(|t| t.as_str() == wanted)
    }

    // Camel-case boundaries, acronym runs and plain words.
    #[test]
    fn test_split_camel_case() {
        let cases = [
            ("camelCase", vec!["camel", "Case"]),
            ("PascalCase", vec!["Pascal", "Case"]),
            ("HTMLParser", vec!["HTML", "Parser"]),
            ("simple", vec!["simple"]),
        ];
        for (input, expected) in cases {
            assert_eq!(split_camel_case(input), expected);
        }
    }

    // snake_case words yield their pieces and the full word.
    #[test]
    fn test_tokenize_text_snake_case() {
        let counts = tokenize_text("shared_knowledge base");
        for key in ["shared", "knowledge", "shared_knowledge", "base"] {
            assert!(counts.contains_key(key));
        }
    }

    // camelCase words yield their lowercased pieces.
    #[test]
    fn test_tokenize_text_camel_case() {
        let counts = tokenize_text("buildContextSummary");
        for key in ["build", "context", "summary"] {
            assert!(counts.contains_key(key));
        }
    }

    // Words inside string literals are not indexed.
    #[test]
    fn test_tokenize_code_skips_strings() {
        let counts = tokenize_code(r#"let x = "hello world";"#);
        for key in ["hello", "world"] {
            assert!(!counts.contains_key(key));
        }
    }

    #[test]
    fn test_tokenize_code_snake_case() {
        let counts = tokenize_code("let my_variable = 42;");
        for key in ["my", "variable", "my_variable"] {
            assert!(counts.contains_key(key));
        }
    }

    #[test]
    fn test_tokenize_code_camel_case() {
        let counts = tokenize_code("myFunctionName()");
        for key in ["my", "function", "name", "myfunctionname"] {
            assert!(counts.contains_key(key));
        }
    }

    // Directory names, file stem and extension all become tokens.
    #[test]
    fn test_tokenize_path_segments() {
        let tokens = tokenize_path("src/agent/context.rs");
        for piece in ["src", "agent", "context", "rs"] {
            assert!(has_token(&tokens, piece));
        }
    }

    #[test]
    fn test_tokenize_query_deduplicates() {
        let tokens = tokenize_query("error error handling");
        let error_count = tokens.iter().filter(|t| t.as_str() == "error").count();
        assert_eq!(error_count, 1, "query tokens should be deduplicated");
        assert!(has_token(&tokens, "handling"));
    }

    #[test]
    fn test_tokenize_query_camel_case() {
        let tokens = tokenize_query("parseURL");
        assert!(
            has_token(&tokens, "parse"),
            "camelCase query should split: got {tokens:?}"
        );
        assert!(has_token(&tokens, "url"));
    }

    #[test]
    fn test_tokenize_query_filters_stop_words() {
        let tokens = tokenize_query("where is the error?");
        for dropped in ["the", "is"] {
            assert!(!has_token(&tokens, dropped));
        }
        assert!(has_token(&tokens, "error"));
    }

    // Noise words include both the extra keywords and the plain stop words.
    #[test]
    fn test_noise_word_superset() {
        for word in ["async", "await", "crate", "the", "pub"] {
            assert!(is_noise_word(word));
        }
    }
}