use smallvec::SmallVec;
use std::collections::HashMap;
use crate::error::M1ndResult;
use crate::graph::Graph;
use crate::types::*;
// Character n-gram size used for fuzzy label matching (trigrams).
const NGRAM_SIZE: usize = 3;
// Labels longer than this many BYTES are truncated (at a char boundary)
// before n-gram extraction.
const MAX_TOKEN_LENGTH: usize = 200;
// Random-walk parameters for the co-occurrence index.
const WALKS_PER_NODE: usize = 20;
const WALK_LENGTH: usize = 10;
// Symmetric window radius (in walk positions) for co-occurrence counting.
const WINDOW_SIZE: usize = 4;
// Co-occurrence build cost grows with n * walks * walk_length; skip the
// build entirely for graphs larger than this.
const COOCCURRENCE_MAX_NODES: u32 = 50_000;
// Sparse n-gram vector: 24-bit n-gram hash -> weight (tf or tf-idf).
pub type NgramVector = HashMap<u32, FiniteF32>;
/// TF-IDF weighted character n-gram index over node labels, supporting
/// fuzzy cosine-similarity lookups via an inverted index.
pub struct CharNgramIndex {
// Per-node tf-idf n-gram vectors, indexed by node id (NOT length-normalized).
vectors: Vec<NgramVector>,
// Inverted index: n-gram hash -> postings of (node, L2-normalized tf-idf weight).
inverted: HashMap<u32, Vec<(NodeId, FiniteF32)>>,
// Per-n-gram inverse document frequency: ln(N / df) + 1.
idf: HashMap<u32, f32>,
// N-gram length this index was built with (see NGRAM_SIZE).
ngram_size: usize,
}
impl CharNgramIndex {
/// Builds the index from every node label in `graph`.
///
/// Pipeline: lowercase each label -> raw n-gram term frequencies ->
/// idf = ln(N/df) + 1 -> tf-idf vectors -> inverted index whose posting
/// weights are L2-normalized per document.
pub fn build(graph: &Graph, ngram_size: usize) -> M1ndResult<Self> {
let n = graph.num_nodes() as usize;
let mut raw_vectors: Vec<NgramVector> = Vec::with_capacity(n);
// Document frequency: number of labels containing each n-gram hash.
let mut doc_freq: HashMap<u32, u32> = HashMap::new();
for i in 0..n {
let label = graph.strings.resolve(graph.nodes.label[i]);
let normalized = label.to_lowercase();
let vec = Self::build_ngram_vector(&normalized, ngram_size);
// Each n-gram counts once per document, regardless of its tf.
for &hash in vec.keys() {
*doc_freq.entry(hash).or_insert(0) += 1;
}
raw_vectors.push(vec);
}
// max(1) guards the empty-graph case (avoids ln of 0/df below).
let n_f32 = n.max(1) as f32;
let mut idf: HashMap<u32, f32> = HashMap::new();
for (&hash, &df) in &doc_freq {
// The +1.0 keeps ubiquitous n-grams (df == N) from weighting to zero.
idf.insert(hash, (n_f32 / df as f32).ln() + 1.0);
}
let mut vectors = Vec::with_capacity(n);
let mut inverted: HashMap<u32, Vec<(NodeId, FiniteF32)>> = HashMap::new();
for (i, raw_vec) in raw_vectors.into_iter().enumerate() {
let mut tfidf_vec = NgramVector::new();
for (&hash, &tf) in &raw_vec {
let idf_val = idf.get(&hash).copied().unwrap_or(1.0);
let tfidf = tf.get() * idf_val;
tfidf_vec.insert(hash, FiniteF32::new(tfidf));
}
let norm: f32 = tfidf_vec
.values()
.map(|v| v.get() * v.get())
.sum::<f32>()
.sqrt();
// Postings carry L2-normalized weights so query_top_k only has to
// divide by the query norm. Zero-norm (empty-label) nodes get no postings.
if norm > 0.0 {
for (&hash, &weight) in &tfidf_vec {
let normalized_w = FiniteF32::new(weight.get() / norm);
inverted
.entry(hash)
.or_default()
.push((NodeId::new(i as u32), normalized_w));
}
}
// NOTE: `vectors` stores the UNnormalized tf-idf vector; callers of
// cosine_similarity() normalize on the fly.
vectors.push(tfidf_vec);
}
Ok(Self {
vectors,
inverted,
idf,
ngram_size,
})
}
/// Converts `s` into a sparse vector of character n-gram term frequencies.
///
/// Inputs longer than `MAX_TOKEN_LENGTH` bytes are first truncated at the
/// nearest char boundary. Strings shorter than `ngram_size` characters are
/// hashed whole with weight 1 so they remain searchable.
fn build_ngram_vector(s: &str, ngram_size: usize) -> NgramVector {
    // Truncate overly long labels, backing up to a valid UTF-8 boundary.
    let text = if s.len() > MAX_TOKEN_LENGTH {
        let cut = (0..=MAX_TOKEN_LENGTH)
            .rev()
            .find(|&i| s.is_char_boundary(i))
            .unwrap_or(0);
        &s[..cut]
    } else {
        s
    };
    let chars: Vec<char> = text.chars().collect();
    let mut freqs = NgramVector::new();
    if chars.len() < ngram_size {
        // Too short for even one n-gram: index the whole string as one term.
        freqs.insert(Self::hash_ngram(text), FiniteF32::ONE);
    } else {
        for gram_chars in chars.windows(ngram_size) {
            let gram: String = gram_chars.iter().collect();
            let slot = freqs.entry(Self::hash_ngram(&gram)).or_insert(FiniteF32::ZERO);
            *slot = FiniteF32::new(slot.get() + 1.0);
        }
    }
    freqs
}
/// Hashes an n-gram with 32-bit FNV-1a, then keeps only the low 24 bits
/// to shrink the key space (at the cost of some extra collisions).
fn hash_ngram(ngram: &str) -> u32 {
    // FNV-1a parameters: 32-bit offset basis and prime.
    const FNV_OFFSET: u32 = 2166136261;
    const FNV_PRIME: u32 = 16777619;
    let full = ngram
        .bytes()
        .fold(FNV_OFFSET, |acc, b| (acc ^ u32::from(b)).wrapping_mul(FNV_PRIME));
    full & 0x00FF_FFFF
}
/// Maps a free-text query to a tf-idf n-gram vector using the corpus idf.
/// N-grams unseen at build time fall back to an idf of 1.0.
pub fn query_vector(&self, query: &str) -> NgramVector {
    let lowered = query.to_lowercase();
    Self::build_ngram_vector(&lowered, self.ngram_size)
        .into_iter()
        .map(|(hash, tf)| {
            let idf = self.idf.get(&hash).copied().unwrap_or(1.0);
            (hash, FiniteF32::new(tf.get() * idf))
        })
        .collect()
}
/// Returns up to `top_k` nodes ranked by cosine similarity between the
/// query's tf-idf vector and each node's label vector.
///
/// Postings already carry L2-normalized document weights, so only the
/// query norm is divided out; results with similarity <= 0.01 are dropped.
pub fn query_top_k(&self, query: &str, top_k: usize) -> Vec<(NodeId, FiniteF32)> {
    let qvec = self.query_vector(query);
    if qvec.is_empty() {
        return Vec::new();
    }
    let q_norm = qvec
        .values()
        .map(|w| w.get() * w.get())
        .sum::<f32>()
        .sqrt();
    if q_norm <= 0.0 {
        return Vec::new();
    }
    // Sparse dot products via the inverted index: accumulate
    // query_weight * doc_weight per candidate node.
    let mut dot_products: HashMap<u32, f32> = HashMap::new();
    for (hash, q_weight) in &qvec {
        if let Some(postings) = self.inverted.get(hash) {
            for &(node, doc_weight) in postings {
                *dot_products.entry(node.0).or_insert(0.0) +=
                    q_weight.get() * doc_weight.get();
            }
        }
    }
    let mut ranked: Vec<(NodeId, FiniteF32)> = dot_products
        .into_iter()
        .filter_map(|(raw_id, dot)| {
            // Clamp to 1.0 to absorb floating-point drift.
            let sim = (dot / q_norm).min(1.0);
            if sim > 0.01 {
                Some((NodeId::new(raw_id), FiniteF32::new(sim)))
            } else {
                None
            }
        })
        .collect();
    ranked.sort_by(|l, r| r.1.cmp(&l.1));
    ranked.truncate(top_k);
    ranked
}
/// Cosine similarity between two sparse n-gram vectors.
/// Returns zero when either vector is empty or has zero magnitude;
/// otherwise the result is clamped to at most 1.0.
pub fn cosine_similarity(a: &NgramVector, b: &NgramVector) -> FiniteF32 {
    if a.is_empty() || b.is_empty() {
        return FiniteF32::ZERO;
    }
    // Iterate one map and probe the other for shared keys.
    let dot: f32 = a
        .iter()
        .filter_map(|(key, wa)| b.get(key).map(|wb| wa.get() * wb.get()))
        .sum();
    let sq_norm = |v: &NgramVector| v.values().map(|w| w.get() * w.get()).sum::<f32>();
    let denom = sq_norm(a).sqrt() * sq_norm(b).sqrt();
    if denom > 0.0 {
        FiniteF32::new((dot / denom).min(1.0))
    } else {
        FiniteF32::ZERO
    }
}
}
// Sparse co-occurrence vector: (neighbor node, weight). After build() the
// entries are PPMI weights sorted by node id so intersections can be merged.
pub type CoOccurrenceVector = Vec<(NodeId, FiniteF32)>;
/// Random-walk co-occurrence index with PPMI weighting, used to score
/// structural relatedness between nodes.
pub struct CoOccurrenceIndex {
// Per-node PPMI-weighted co-occurrence vectors, indexed by node id.
vectors: Vec<CoOccurrenceVector>,
// Build parameters, retained on the struct.
walk_length: usize,
walks_per_node: usize,
window_size: usize,
}
impl CoOccurrenceIndex {
/// Builds the index by running fixed-seed random walks from every node,
/// counting windowed co-occurrences, then re-weighting the counts with
/// positive pointwise mutual information (PPMI).
///
/// For graphs above `COOCCURRENCE_MAX_NODES` the expensive build is skipped
/// and all vectors are left empty (queries then return no results).
pub fn build(
graph: &Graph,
walk_length: usize,
walks_per_node: usize,
window_size: usize,
) -> M1ndResult<Self> {
let n = graph.num_nodes() as usize;
if graph.num_nodes() > COOCCURRENCE_MAX_NODES {
return Ok(Self {
vectors: vec![Vec::new(); n],
walk_length,
walks_per_node,
window_size,
});
}
let mut vectors = vec![Vec::new(); n];
// Fixed-seed 64-bit LCG so builds are reproducible; the high bits
// (>> 33) are kept as they are the better-mixed ones.
let mut rng_state = 42u64;
let mut next_rng = || -> u64 {
rng_state = rng_state
.wrapping_mul(6364136223846793005)
.wrapping_add(1442695040888963407);
rng_state >> 33
};
#[allow(clippy::needless_range_loop)]
for start in 0..n {
let mut co_counts: HashMap<u32, f32> = HashMap::new();
let start_node = NodeId::new(start as u32);
for _ in 0..walks_per_node {
let mut walk = Vec::with_capacity(walk_length);
let mut current = start_node;
walk.push(current);
for _ in 1..walk_length {
let range = graph.csr.out_range(current);
let degree = range.end - range.start;
// Dead end: terminate this walk early.
if degree == 0 {
break;
}
let idx = (next_rng() as usize) % degree;
current = graph.csr.targets[range.start + idx];
walk.push(current);
}
// Count pairs inside a symmetric window around each position,
// excluding self-pairs and the start node itself.
for i in 0..walk.len() {
let lo = i.saturating_sub(window_size);
let hi = (i + window_size + 1).min(walk.len());
for w_node in &walk[lo..hi] {
if *w_node != walk[i] && w_node.0 != start as u32 {
*co_counts.entry(w_node.0).or_insert(0.0) += 1.0;
}
}
}
}
if !co_counts.is_empty() {
vectors[start] = co_counts
.into_iter()
.map(|(id, count)| (NodeId::new(id), FiniteF32::new(count)))
.collect();
}
}
// PPMI pass: pmi = ln(count * total / (row_marginal * col_marginal)),
// keeping only positive values.
let mut marginal_j: HashMap<u32, f32> = HashMap::new();
let mut marginal_i: Vec<f32> = Vec::with_capacity(n);
let mut total_all = 0.0f32;
for vec in &vectors {
let row_sum: f32 = vec.iter().map(|(_, w)| w.get()).sum();
marginal_i.push(row_sum);
total_all += row_sum;
for &(node, weight) in vec {
*marginal_j.entry(node.0).or_insert(0.0) += weight.get();
}
}
if total_all > 0.0 {
for (i, vec) in vectors.iter_mut().enumerate() {
let mi = marginal_i[i];
if mi <= 0.0 {
continue;
}
let mut ppmi_vec: CoOccurrenceVector = Vec::with_capacity(vec.len());
for &(node, raw_count) in vec.iter() {
let mj = marginal_j.get(&node.0).copied().unwrap_or(1.0);
let pmi = ((raw_count.get() * total_all) / (mi * mj)).ln();
if pmi > 0.0 {
ppmi_vec.push((node, FiniteF32::new(pmi)));
}
}
// Sort by node id so cosine_similarity can merge-intersect.
ppmi_vec.sort_by_key(|e| e.0);
*vec = ppmi_vec;
}
}
Ok(Self {
vectors,
walk_length,
walks_per_node,
window_size,
})
}
/// Cosine similarity between two co-occurrence vectors.
///
/// Both inputs must be sorted by node id (as produced by `build`); the dot
/// product is computed with a two-pointer merge over the shared ids.
pub fn cosine_similarity(a: &CoOccurrenceVector, b: &CoOccurrenceVector) -> FiniteF32 {
    if a.is_empty() || b.is_empty() {
        return FiniteF32::ZERO;
    }
    let sq_sum = |v: &CoOccurrenceVector| -> f32 {
        v.iter().map(|(_, w)| w.get() * w.get()).sum()
    };
    let norm_a = sq_sum(a);
    let norm_b = sq_sum(b);
    // Merge-intersect the two sorted id lists.
    let mut dot = 0.0f32;
    let (mut ia, mut ib) = (0usize, 0usize);
    while ia < a.len() && ib < b.len() {
        let (na, wa) = a[ia];
        let (nb, wb) = b[ib];
        match na.cmp(&nb) {
            std::cmp::Ordering::Equal => {
                dot += wa.get() * wb.get();
                ia += 1;
                ib += 1;
            }
            std::cmp::Ordering::Less => ia += 1,
            std::cmp::Ordering::Greater => ib += 1,
        }
    }
    let denom = norm_a.sqrt() * norm_b.sqrt();
    if denom > 0.0 {
        FiniteF32::new((dot / denom).min(1.0))
    } else {
        FiniteF32::ZERO
    }
}
/// Ranks all other nodes by co-occurrence cosine similarity to
/// `query_node`, returning at most `top_k` results with similarity > 0.01.
pub fn query_top_k(&self, query_node: NodeId, top_k: usize) -> Vec<(NodeId, FiniteF32)> {
    let idx = query_node.as_usize();
    let query_vec = match self.vectors.get(idx) {
        Some(v) if !v.is_empty() => v,
        _ => return Vec::new(),
    };
    let mut ranked = Vec::new();
    for (other, other_vec) in self.vectors.iter().enumerate() {
        if other == idx || other_vec.is_empty() {
            continue;
        }
        let sim = Self::cosine_similarity(query_vec, other_vec);
        if sim.get() > 0.01 {
            ranked.push((NodeId::new(other as u32), sim));
        }
    }
    ranked.sort_by(|l, r| r.1.cmp(&l.1));
    ranked.truncate(top_k);
    ranked
}
}
/// Bidirectional synonym lookup over a fixed set of term groups.
pub struct SynonymExpander {
// Synonym groups, lowercased, indexed by group id.
groups: Vec<Vec<String>>,
// Lowercased term -> ids of every group containing it. A term may belong
// to several groups; most belong to few, hence the inline SmallVec.
term_to_groups: HashMap<String, SmallVec<[u16; 4]>>,
}
// Built-in synonym groups: Portuguese manufacturing vocabulary plus common
// programming terms. Matching is case-insensitive (terms are lowercased).
const DEFAULT_SYNONYM_GROUPS: &[&[&str]] = &[
&["plastico", "polimero", "resina"],
&["metal", "liga", "aco", "aluminio"],
&["vidro", "ceramica", "cristal"],
&["processo", "etapa", "fase"],
&["material", "materia_prima", "insumo"],
&["custo", "preco", "valor"],
&["fornecedor", "supplier", "vendor"],
&["qualidade", "quality", "qa"],
&["teste", "test", "ensaio"],
&["norma", "regulamento", "padrão"],
&["function", "fn", "method", "func"],
&["class", "struct", "type"],
&["module", "package", "crate"],
&["import", "use", "require"],
&["error", "exception", "panic"],
];
impl SynonymExpander {
/// Builds an expander from the built-in `DEFAULT_SYNONYM_GROUPS`.
pub fn build_default() -> M1ndResult<Self> {
    let default_groups = DEFAULT_SYNONYM_GROUPS
        .iter()
        .map(|group| group.to_vec())
        .collect();
    Self::build(default_groups)
}
/// Builds an expander from explicit synonym groups. All terms are
/// lowercased, and each term maps back to every group it appears in.
pub fn build(groups: Vec<Vec<&str>>) -> M1ndResult<Self> {
    let mut string_groups: Vec<Vec<String>> = Vec::with_capacity(groups.len());
    let mut term_to_groups: HashMap<String, SmallVec<[u16; 4]>> = HashMap::new();
    for (group_idx, group) in groups.into_iter().enumerate() {
        let lowered: Vec<String> = group.into_iter().map(str::to_lowercase).collect();
        for term in &lowered {
            term_to_groups
                .entry(term.clone())
                .or_default()
                .push(group_idx as u16);
        }
        string_groups.push(lowered);
    }
    Ok(Self {
        groups: string_groups,
        term_to_groups,
    })
}
/// Returns `term` (lowercased) followed by every distinct synonym from the
/// groups it belongs to. Unknown terms expand to just themselves.
pub fn expand(&self, term: &str) -> Vec<String> {
    let needle = term.to_lowercase();
    let mut expansion = vec![needle.clone()];
    let group_ids = match self.term_to_groups.get(&needle) {
        Some(ids) => ids,
        None => return expansion,
    };
    for &gid in group_ids {
        let members = match self.groups.get(gid as usize) {
            Some(m) => m,
            None => continue,
        };
        for member in members {
            // Linear dedup is fine: expansions are tiny.
            if *member != needle && !expansion.contains(member) {
                expansion.push(member.clone());
            }
        }
    }
    expansion
}
/// True when the two terms are equal (case-insensitively) or share at
/// least one synonym group.
pub fn are_synonyms(&self, a: &str, b: &str) -> bool {
    let a_lower = a.to_lowercase();
    let b_lower = b.to_lowercase();
    if a_lower == b_lower {
        return true;
    }
    match (
        self.term_to_groups.get(&a_lower),
        self.term_to_groups.get(&b_lower),
    ) {
        (Some(groups_a), Some(groups_b)) => {
            groups_a.iter().any(|ga| groups_b.contains(ga))
        }
        _ => false,
    }
}
}
/// Facade combining the three semantic signals (character n-grams,
/// random-walk co-occurrence, and synonym expansion) under one query API.
pub struct SemanticEngine {
// Fuzzy label matching via tf-idf character n-grams.
pub ngram: CharNgramIndex,
// Structural relatedness via PPMI-weighted random-walk co-occurrence.
pub cooccurrence: CoOccurrenceIndex,
// Curated synonym groups used for query expansion.
pub synonym: SynonymExpander,
// Relative weights for mixing the three signals.
pub weights: SemanticWeights,
}
impl SemanticEngine {
/// Builds every sub-index from `graph` with the module-level default
/// parameters, combining them under the supplied signal weights.
pub fn build(graph: &Graph, weights: SemanticWeights) -> M1ndResult<Self> {
    Ok(Self {
        ngram: CharNgramIndex::build(graph, NGRAM_SIZE)?,
        cooccurrence: CoOccurrenceIndex::build(graph, WALK_LENGTH, WALKS_PER_NODE, WINDOW_SIZE)?,
        synonym: SynonymExpander::build_default()?,
        weights,
    })
}
/// Scores nodes against `query` by combining three signals:
/// 1. n-gram label similarity (weighted by `weights.ngram`),
/// 2. co-occurrence similarity to the single best n-gram hit
///    (weighted by `weights.cooccurrence`),
/// 3. a flat bonus per synonym-expanded token found in a node's label
///    (weighted by `weights.synonym`).
///
/// Returns at most `top_k` results with combined score > 0.01, clamped to
/// 1.0 and sorted descending.
pub fn query(
    &self,
    graph: &Graph,
    query: &str,
    top_k: usize,
) -> M1ndResult<Vec<(NodeId, FiniteF32)>> {
    // Signal 1: fuzzy label match. Over-fetch so later signals can reorder.
    let ngram_scores = self.ngram.query_top_k(query, top_k * 5);
    let mut scores: HashMap<u32, f32> = HashMap::new();
    for &(node, score) in &ngram_scores {
        *scores.entry(node.0).or_insert(0.0) += score.get() * self.weights.ngram.get();
    }
    // Signal 2: nodes structurally related to the top n-gram hit.
    if let Some(&(first_node, _)) = ngram_scores.first() {
        let cooc_scores = self.cooccurrence.query_top_k(first_node, top_k * 3);
        for (node, score) in cooc_scores {
            *scores.entry(node.0).or_insert(0.0) +=
                score.get() * self.weights.cooccurrence.get();
        }
    }
    // Signal 3: synonym expansion. Tokens of <= 2 bytes are too noisy.
    let tokens: Vec<String> = query
        .to_lowercase()
        .split_whitespace()
        .filter(|t| t.len() > 2)
        .map(|t| t.to_string())
        .collect();
    let mut expanded_tokens: Vec<String> = Vec::new();
    for token in &tokens {
        for syn in self.synonym.expand(token) {
            if !expanded_tokens.contains(&syn) {
                expanded_tokens.push(syn);
            }
        }
    }
    // Hoist the loop-invariant filter out of the per-node scan: only tokens
    // that expansion actually ADDED (i.e. not already in the query) earn
    // the synonym bonus. Previously `tokens.contains` was re-evaluated for
    // every node x token pair.
    let novel_tokens: Vec<&String> = expanded_tokens
        .iter()
        .filter(|t| !tokens.contains(*t))
        .collect();
    // Skip the O(nodes) label scan entirely when expansion added nothing.
    if !novel_tokens.is_empty() {
        let synonym_weight = self.weights.synonym.get();
        for i in 0..graph.num_nodes() as usize {
            let label = graph.strings.resolve(graph.nodes.label[i]).to_lowercase();
            for novel in &novel_tokens {
                if label.contains(novel.as_str()) {
                    *scores.entry(i as u32).or_insert(0.0) += synonym_weight;
                }
            }
        }
    }
    let mut results: Vec<(NodeId, FiniteF32)> = scores
        .into_iter()
        .map(|(id, s)| (NodeId::new(id), FiniteF32::new(s.min(1.0))))
        .filter(|(_, s)| s.get() > 0.01)
        .collect();
    results.sort_by(|a, b| b.1.cmp(&a.1));
    results.truncate(top_k);
    Ok(results)
}
/// Cheaper variant of `query`: blends n-gram candidate scores with
/// co-occurrence scores seeded from the top (up to 3) n-gram hits,
/// skipping the synonym pass and the full-graph label scan.
pub fn query_fast(
    &self,
    graph: &Graph,
    query: &str,
    top_k: usize,
) -> M1ndResult<Vec<(NodeId, FiniteF32)>> {
    let candidates = self.ngram.query_top_k(query, top_k * 3);
    if candidates.is_empty() {
        return Ok(Vec::new());
    }
    // Aggregate co-occurrence evidence from the strongest seeds, each
    // seed's contribution scaled by its own n-gram score, then averaged.
    let seed_count = candidates.len().min(3);
    let mut cooc_map: HashMap<u32, f32> = HashMap::new();
    for &(seed, seed_score) in &candidates[..seed_count] {
        for (neighbor, sim) in self.cooccurrence.query_top_k(seed, top_k * 3) {
            *cooc_map.entry(neighbor.0).or_insert(0.0) += sim.get() * seed_score.get();
        }
    }
    let seed_f = seed_count as f32;
    for acc in cooc_map.values_mut() {
        *acc /= seed_f;
    }
    // Renormalize the two signal weights to sum to 1, falling back to a
    // fixed 60/40 split when both weights are zero.
    let total_w = self.weights.ngram.get() + self.weights.cooccurrence.get();
    let (ngram_w, cooc_w) = if total_w > 0.0 {
        (
            self.weights.ngram.get() / total_w,
            self.weights.cooccurrence.get() / total_w,
        )
    } else {
        (0.6, 0.4)
    };
    let mut blended: Vec<(NodeId, FiniteF32)> = candidates
        .iter()
        .map(|&(node, ngram_score)| {
            let cooc_score = cooc_map.get(&node.0).copied().unwrap_or(0.0);
            let combined = ngram_score.get() * ngram_w + cooc_score * cooc_w;
            (node, FiniteF32::new(combined.min(1.0)))
        })
        .collect();
    blended.sort_by(|a, b| b.1.cmp(&a.1));
    blended.truncate(top_k);
    Ok(blended)
}
}
static_assertions::assert_impl_all!(SemanticEngine: Send, Sync);