use std::collections::HashMap;
use graphify_core::graph::KnowledgeGraph;
/// Breaks an identifier-like string into lowercase search tokens.
///
/// The input is first cut on `_`, `.`, `:`, `/`, `\`, `-` and on any whitespace.
/// Each resulting word is then cut again in front of every uppercase character
/// that directly follows a non-uppercase one (`camelCase` -> `camel`, `case`).
/// Runs of consecutive uppercase characters are never split, so `HTTPServer`
/// yields the single token `httpserver`.
///
/// Returns the tokens in input order; empty fragments are never produced.
pub fn tokenize(input: &str) -> Vec<String> {
    const SEPARATORS: [char; 6] = ['_', '.', ':', '/', '\\', '-'];

    let words = input
        .split(|c: char| SEPARATORS.contains(&c))
        .flat_map(|part| part.split_whitespace());

    let mut out: Vec<String> = Vec::new();
    for word in words {
        let mut current = String::new();
        // Whether the character most recently appended to `current` was uppercase.
        let mut prev_upper = false;
        for ch in word.chars() {
            let upper = ch.is_uppercase();
            // A lower -> upper transition starts a new token.
            if upper && !prev_upper && !current.is_empty() {
                out.push(current.to_lowercase());
                current.clear();
            }
            current.push(ch);
            prev_upper = upper;
        }
        if !current.is_empty() {
            out.push(current.to_lowercase());
        }
    }
    out
}
/// Inverted index mapping lowercase tokens to the graph nodes whose text
/// produced them.
pub struct SearchIndex {
    /// Token -> postings list; each posting is a `(node id, weight)` pair.
    index: HashMap<String, Vec<(String, f64)>>,
}
impl SearchIndex {
    /// Builds the index from every node in `graph`.
    ///
    /// For each node, the tokens of its label, id and source file are indexed
    /// with base weights 2.0, 1.0 and 0.5 respectively. Every posting is
    /// additionally boosted by `ln(1 + degree) * 0.1`, where `degree` is
    /// whatever `graph.degree` reports for the node. Node ids for which
    /// `graph.get_node` returns `None` are skipped.
    pub fn build(graph: &KnowledgeGraph) -> Self {
        let mut index: HashMap<String, Vec<(String, f64)>> = HashMap::new();
        for node_id in graph.node_ids() {
            let Some(node) = graph.get_node(&node_id) else {
                continue;
            };
            let degree = graph.degree(&node_id) as f64;
            let degree_boost = degree.ln_1p() * 0.1;
            let mut insert = |text: &str, base: f64| {
                // The weight is the same for every token of one text field.
                let weight = base + degree_boost;
                for tok in tokenize(text) {
                    index
                        .entry(tok)
                        .or_default()
                        .push((node_id.clone(), weight));
                }
            };
            insert(&node.label, 2.0);
            insert(&node.id, 1.0);
            insert(&node.source_file, 0.5);
        }
        SearchIndex { index }
    }

    /// Scores nodes against `terms` and returns `(score, node id)` pairs.
    ///
    /// Each term is tokenized with [`tokenize`]. For every query token, a node
    /// gains the full posting weight for an exact token match and half the
    /// posting weight for every indexed token that merely starts with the
    /// query token. Scores accumulate across all query tokens.
    ///
    /// The result is sorted by descending score. Ties are broken by ascending
    /// node id so that repeated searches on the same index return the same
    /// ordering instead of depending on hash-map iteration order. An empty
    /// `terms` slice, or terms that match nothing, yield an empty vector.
    pub fn search(&self, terms: &[String]) -> Vec<(f64, String)> {
        let mut scores: HashMap<String, f64> = HashMap::new();
        let term_tokens: Vec<String> = terms.iter().flat_map(|t| tokenize(t)).collect();
        for term_tok in &term_tokens {
            // Exact token match: full weight.
            if let Some(entries) = self.index.get(term_tok) {
                for (node_id, weight) in entries {
                    *scores.entry(node_id.clone()).or_default() += weight;
                }
            }
            // Proper prefix match: half weight. The exact token is excluded
            // here because it was already counted above.
            for (token, entries) in &self.index {
                if token != term_tok && token.starts_with(term_tok) {
                    for (node_id, weight) in entries {
                        *scores.entry(node_id.clone()).or_default() += weight * 0.5;
                    }
                }
            }
        }
        let mut results: Vec<(f64, String)> = scores
            .into_iter()
            .map(|(node_id, score)| (score, node_id))
            .collect();
        // `total_cmp` is a total order even for NaN, and node ids are unique
        // (they were map keys), so the comparator never reports two distinct
        // entries as equal and an unstable sort is fully deterministic.
        results.sort_unstable_by(|a, b| b.0.total_cmp(&a.0).then_with(|| a.1.cmp(&b.1)));
        results
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use graphify_core::model::{GraphNode, NodeType};
    use std::collections::HashMap;

    /// Creates a `Class` node with the given id, label and source file and no
    /// location, community or extra data.
    fn make_node(id: &str, label: &str, source_file: &str) -> GraphNode {
        GraphNode {
            id: id.into(),
            label: label.into(),
            source_file: source_file.into(),
            source_location: None,
            node_type: NodeType::Class,
            community: None,
            extra: HashMap::new(),
        }
    }

    /// Four edge-less nodes with snake_case ids and nested source paths.
    fn make_graph() -> KnowledgeGraph {
        let mut g = KnowledgeGraph::new();
        g.add_node(make_node(
            "auth_service",
            "AuthService",
            "src/auth/service.rs",
        ))
        .unwrap();
        g.add_node(make_node(
            "user_manager",
            "UserManager",
            "src/user/manager.rs",
        ))
        .unwrap();
        g.add_node(make_node("database_pool", "DatabasePool", "src/db/pool.rs"))
            .unwrap();
        g.add_node(make_node("cache_layer", "CacheLayer", "src/cache/layer.rs"))
            .unwrap();
        g
    }

    /// Four edge-less nodes with short ids; the baseline for the degree-boost test.
    fn make_small_graph() -> KnowledgeGraph {
        let mut g = KnowledgeGraph::new();
        g.add_node(make_node("auth", "AuthService", "src/auth.rs"))
            .unwrap();
        g.add_node(make_node("user", "UserManager", "src/user.rs"))
            .unwrap();
        g.add_node(make_node("db", "Database", "src/db.rs"))
            .unwrap();
        g.add_node(make_node("cache", "CacheLayer", "src/cache.rs"))
            .unwrap();
        g
    }

    /// Same nodes as [`make_small_graph`] plus a single `auth -> user` edge.
    fn make_graph_with_edges() -> KnowledgeGraph {
        use graphify_core::confidence::Confidence;
        use graphify_core::model::GraphEdge;

        let mut g = make_small_graph();
        let edge = GraphEdge {
            source: "auth".into(),
            target: "user".into(),
            relation: "calls".into(),
            confidence: Confidence::Extracted,
            confidence_score: 1.0,
            source_file: "test.rs".into(),
            source_location: None,
            weight: 1.0,
            provenance: None,
            extra: HashMap::new(),
        };
        g.add_edge(edge).unwrap();
        g
    }

    /// Returns the score of `node_id` in `results`, panicking if the node is
    /// absent so that a missing result can never pass silently as score 0.
    fn score_of(results: &[(f64, String)], node_id: &str) -> f64 {
        results
            .iter()
            .find(|(_, id)| id == node_id)
            .map(|(score, _)| *score)
            .unwrap_or_else(|| panic!("node '{}' missing from results: {:?}", node_id, results))
    }

    #[test]
    fn tokenize_camel_case() {
        let tokens = tokenize("camelCase");
        assert_eq!(tokens, vec!["camel", "case"]);
    }

    #[test]
    fn tokenize_underscore() {
        let tokens = tokenize("foo_bar_baz");
        assert_eq!(tokens, vec!["foo", "bar", "baz"]);
    }

    #[test]
    fn tokenize_dot_separator() {
        let tokens = tokenize("mod.rs");
        assert_eq!(tokens, vec!["mod", "rs"]);
    }

    #[test]
    fn tokenize_double_colon() {
        let tokens = tokenize("std::collections::HashMap");
        assert_eq!(tokens, vec!["std", "collections", "hash", "map"]);
    }

    #[test]
    fn tokenize_slash() {
        let tokens = tokenize("src/main/mod.rs");
        assert_eq!(tokens, vec!["src", "main", "mod", "rs"]);
    }

    #[test]
    fn tokenize_backslash() {
        let tokens = tokenize(r"src\main\mod.rs");
        assert_eq!(tokens, vec!["src", "main", "mod", "rs"]);
    }

    #[test]
    fn tokenize_hyphen() {
        let tokens = tokenize("my-component-name");
        assert_eq!(tokens, vec!["my", "component", "name"]);
    }

    #[test]
    fn tokenize_whitespace() {
        let tokens = tokenize("foo bar\tbaz");
        assert_eq!(tokens, vec!["foo", "bar", "baz"]);
    }

    #[test]
    fn tokenize_mixed() {
        let tokens = tokenize("MyComponent_test.rs");
        assert_eq!(tokens, vec!["my", "component", "test", "rs"]);
    }

    #[test]
    fn tokenize_empty() {
        let tokens = tokenize("");
        assert!(tokens.is_empty());
    }

    #[test]
    fn tokenize_all_lowercase() {
        let tokens = tokenize("AuthService");
        assert!(tokens.iter().all(|t| t == &t.to_lowercase()));
    }

    /// Runs of uppercase letters are intentionally kept as one token.
    #[test]
    fn tokenize_consecutive_uppercase() {
        let tokens = tokenize("HTTPServer");
        assert_eq!(tokens, vec!["httpserver"]);
    }

    #[test]
    fn build_creates_index_entries() {
        let g = make_graph();
        let idx = SearchIndex::build(&g);
        assert!(idx.index.contains_key("auth"));
        let entries = &idx.index["auth"];
        assert!(entries.iter().any(|(id, _)| id == "auth_service"));
    }

    #[test]
    fn build_label_weight_higher_than_id() {
        let g = make_graph();
        let idx = SearchIndex::build(&g);
        let weights: Vec<f64> = idx.index["service"]
            .iter()
            .filter(|(id, _)| id == "auth_service")
            .map(|(_, w)| *w)
            .collect();
        assert!(
            weights.len() >= 2,
            "should have label and id entries for 'service'"
        );
        assert!(
            weights.iter().any(|w| *w >= 2.0),
            "at least one weight >= 2.0 (label), got {:?}",
            weights
        );
    }

    #[test]
    fn build_source_file_tokens() {
        let g = make_graph();
        let idx = SearchIndex::build(&g);
        assert!(idx.index.contains_key("pool"));
    }

    #[test]
    fn search_exact_label_match() {
        let g = make_graph();
        let idx = SearchIndex::build(&g);
        let results = idx.search(&["auth".to_string()]);
        assert!(!results.is_empty());
        assert!(results.iter().any(|(_, id)| id == "auth_service"));
    }

    #[test]
    fn search_exact_id_match() {
        let g = make_graph();
        let idx = SearchIndex::build(&g);
        let results = idx.search(&["database".to_string()]);
        assert!(!results.is_empty());
        assert!(results.iter().any(|(_, id)| id == "database_pool"));
    }

    #[test]
    fn search_source_file_match() {
        let g = make_graph();
        let idx = SearchIndex::build(&g);
        let results = idx.search(&["cache".to_string()]);
        assert!(!results.is_empty());
        assert!(results.iter().any(|(_, id)| id == "cache_layer"));
    }

    #[test]
    fn search_no_match() {
        let g = make_graph();
        let idx = SearchIndex::build(&g);
        let results = idx.search(&["nonexistent_xyz".to_string()]);
        assert!(results.is_empty());
    }

    #[test]
    fn search_prefix_match() {
        let g = make_graph();
        let idx = SearchIndex::build(&g);
        let results = idx.search(&["use".to_string()]);
        assert!(
            results.iter().any(|(_, id)| id == "user_manager"),
            "'use' should prefix-match 'user' from UserManager, got: {:?}",
            results
        );
    }

    #[test]
    fn search_prefix_lower_weight() {
        let g = make_graph();
        let idx = SearchIndex::build(&g);
        let exact = idx.search(&["user".to_string()]);
        let prefix = idx.search(&["use".to_string()]);
        let exact_score = score_of(&exact, "user_manager");
        let prefix_score = score_of(&prefix, "user_manager");
        assert!(
            exact_score > prefix_score,
            "exact match ({}) should score higher than prefix match ({})",
            exact_score,
            prefix_score
        );
    }

    #[test]
    fn search_multiple_terms_aggregate() {
        let g = make_graph();
        let idx = SearchIndex::build(&g);
        let results = idx.search(&["auth".to_string(), "service".to_string()]);
        assert!(!results.is_empty());
        let top = &results[0];
        assert_eq!(top.1, "auth_service");
        let single = idx.search(&["auth".to_string()]);
        let single_score = score_of(&single, "auth_service");
        assert!(
            top.0 > single_score,
            "two-term match ({}) should score higher than single-term ({})",
            top.0,
            single_score
        );
    }

    #[test]
    fn search_sorted_descending() {
        let g = make_graph();
        let idx = SearchIndex::build(&g);
        let results = idx.search(&["service".to_string()]);
        for w in results.windows(2) {
            assert!(
                w[0].0 >= w[1].0,
                "results should be sorted descending by score"
            );
        }
    }

    /// Compares the same node set with and without an edge so that the only
    /// difference in the `auth` node's score is the degree boost.
    #[test]
    fn search_degree_boost() {
        let g_no_edges = make_small_graph();
        let g_with_edges = make_graph_with_edges();
        let idx_no = SearchIndex::build(&g_no_edges);
        let idx_with = SearchIndex::build(&g_with_edges);
        let results_no = idx_no.search(&["auth".to_string()]);
        let results_with = idx_with.search(&["auth".to_string()]);
        let score_no = score_of(&results_no, "auth");
        let score_with = score_of(&results_with, "auth");
        assert!(
            score_with > score_no,
            "node with edges ({}) should score higher than without ({})",
            score_with,
            score_no
        );
    }

    #[test]
    fn search_empty_terms() {
        let g = make_graph();
        let idx = SearchIndex::build(&g);
        let results = idx.search(&[]);
        assert!(results.is_empty());
    }

    #[test]
    fn search_case_insensitive_terms() {
        let g = make_graph();
        let idx = SearchIndex::build(&g);
        let lower = idx.search(&["auth".to_string()]);
        let upper = idx.search(&["AUTH".to_string()]);
        assert_eq!(lower.len(), upper.len());
    }

    /// Despite the name, this checks that a token found only in source files
    /// (`rs`) scores lower than a token that also appears in a label.
    #[test]
    fn search_id_token_lower_than_label() {
        let g = make_graph();
        let idx = SearchIndex::build(&g);
        let results = idx.search(&["rs".to_string()]);
        assert!(!results.is_empty(), "should find 'rs' in source files");
        let rs_score = results[0].0;
        let results_label = idx.search(&["manager".to_string()]);
        assert!(!results_label.is_empty(), "should find 'manager' in labels");
        let label_score = results_label[0].0;
        assert!(
            label_score > rs_score,
            "label token ({}) should score higher than source_file-only token ({})",
            label_score,
            rs_score
        );
    }
}