mod distributed;
pub use distributed::{DistributedHybridConfig, DistributedQueryEngine};
use crate::types::*;
/// Splits `text` into lowercase tokens.
///
/// The input is lowercased and split on whitespace. Every word then has its
/// leading and trailing non-alphanumeric characters trimmed, and the trimmed
/// word is kept only if it is at least three bytes long and is not one of the
/// built-in English stopwords.
///
/// Stopwords are matched *after* trimming, so punctuated forms such as
/// `"the,"` or `"(and)"` are discarded just like their bare counterparts.
/// Punctuation inside a word (for example the apostrophe in `"don't"`) is
/// left untouched.
///
/// Tokens are returned in input order and duplicates are preserved.
pub fn tokenize(text: &str) -> Vec<String> {
    // Words that are never emitted as tokens.
    const STOPWORDS: &[&str] = &[
        "the", "a", "an", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
        "do", "does", "did", "will", "would", "could", "should", "may", "might", "shall", "can",
        "need", "to", "of", "in", "for", "on", "with", "at", "by", "from", "as", "into", "through",
        "during", "before", "after", "above", "below", "between", "out", "off", "over", "under",
        "again", "further", "then", "once", "and", "but", "or", "if", "while", "what", "which",
        "who", "this", "that", "these", "those", "it", "its", "how",
    ];
    // Minimum token length, measured in bytes (`str::len`), not characters.
    const MIN_TOKEN_LEN: usize = 3;

    text.to_lowercase()
        .split_whitespace()
        // Trim first so the checks below see the final token, not "the," or "(and)".
        .map(|word| word.trim_matches(|c: char| !c.is_alphanumeric()))
        .filter(|word| word.len() >= MIN_TOKEN_LEN && !STOPWORDS.contains(word))
        .map(str::to_string)
        .collect()
}
/// Merges several result lists into a single list ordered by descending score.
///
/// All inner lists are flattened, sorted so that the highest `score` comes
/// first, and the result is cut down to at most `max_results` entries.
///
/// The sort is stable: entries with equal scores keep the order in which they
/// appeared in the flattened input. Entries whose score is NaN are ordered
/// after every entry with a real score, so they are the first to be dropped by
/// the truncation and can never outrank a valid result.
pub fn merge_results(results: Vec<Vec<ScoredNode>>, max_results: usize) -> Vec<ScoredNode> {
    use std::cmp::Ordering;

    let mut all: Vec<ScoredNode> = results.into_iter().flatten().collect();
    // `partial_cmp` alone is not a total order once NaN is involved, which
    // `sort_by` requires; handle NaN explicitly and push it to the end.
    all.sort_by(|a, b| match (a.score.is_nan(), b.score.is_nan()) {
        (true, true) => Ordering::Equal,
        (true, false) => Ordering::Greater,
        (false, true) => Ordering::Less,
        // Neither side is NaN here, so `partial_cmp` always yields `Some`.
        (false, false) => b
            .score
            .partial_cmp(&a.score)
            .unwrap_or(Ordering::Equal),
    });
    all.truncate(max_results);
    all
}
#[cfg(test)]
mod tests {
    use super::*;
    use phago_core::types::NodeId;

    /// Returns `true` when `tokens` contains exactly the string `wanted`.
    fn has_token(tokens: &[String], wanted: &str) -> bool {
        tokens.iter().any(|token| token == wanted)
    }

    /// Collects the labels of `nodes` in order, for compact comparisons.
    fn labels_of(nodes: &[ScoredNode]) -> Vec<&str> {
        nodes.iter().map(|node| node.label.as_str()).collect()
    }

    /// Content words survive, the stopword does not.
    #[test]
    fn test_tokenize_basic() {
        let out = tokenize("The cell membrane");
        assert!(has_token(&out, "cell"));
        assert!(has_token(&out, "membrane"));
        assert!(!has_token(&out, "the"));
    }

    /// Input made only of short words / stopwords yields nothing.
    #[test]
    fn test_tokenize_filters_short_words() {
        let out = tokenize("a is the on by");
        assert!(out.is_empty());
    }

    /// Trailing punctuation is stripped from tokens.
    #[test]
    fn test_tokenize_trims_punctuation() {
        let out = tokenize("cell, membrane.");
        assert!(has_token(&out, "cell"));
        assert!(has_token(&out, "membrane"));
    }

    /// Tokens are lowercased regardless of input casing.
    #[test]
    fn test_tokenize_lowercase() {
        let out = tokenize("CELL Membrane");
        assert!(has_token(&out, "cell"));
        assert!(has_token(&out, "membrane"));
    }

    /// Merging no result lists gives an empty result.
    #[test]
    fn test_merge_results_empty() {
        let no_lists: Vec<Vec<ScoredNode>> = Vec::new();
        assert!(merge_results(no_lists, 10).is_empty());
    }

    /// Results from different lists are interleaved by descending score.
    #[test]
    fn test_merge_results_sorting() {
        let low = ScoredNode {
            node_id: NodeId::from_seed(1),
            label: "low".to_string(),
            score: 0.3,
            shard_id: ShardId::new(0),
        };
        let high = ScoredNode {
            node_id: NodeId::from_seed(2),
            label: "high".to_string(),
            score: 0.9,
            shard_id: ShardId::new(1),
        };

        let merged = merge_results(vec![vec![low], vec![high]], 10);

        // Checks both the length (2) and the order in one comparison.
        assert_eq!(labels_of(&merged), ["high", "low"]);
    }

    /// Only the `max_results` best entries are kept.
    #[test]
    fn test_merge_results_truncates() {
        let single_list = vec![
            ScoredNode {
                node_id: NodeId::from_seed(1),
                label: "a".to_string(),
                score: 0.9,
                shard_id: ShardId::new(0),
            },
            ScoredNode {
                node_id: NodeId::from_seed(2),
                label: "b".to_string(),
                score: 0.8,
                shard_id: ShardId::new(0),
            },
            ScoredNode {
                node_id: NodeId::from_seed(3),
                label: "c".to_string(),
                score: 0.7,
                shard_id: ShardId::new(0),
            },
        ];

        let merged = merge_results(vec![single_list], 2);

        // Checks both the length (2) and the order in one comparison.
        assert_eq!(labels_of(&merged), ["a", "b"]);
    }
}