1use smallvec::SmallVec;
4use std::collections::HashMap;
5
6use crate::error::M1ndResult;
7use crate::graph::Graph;
8use crate::types::*;
9
10const NGRAM_SIZE: usize = 3;
16const MAX_TOKEN_LENGTH: usize = 200;
18const WALKS_PER_NODE: usize = 20;
22const WALK_LENGTH: usize = 10;
23const WINDOW_SIZE: usize = 4;
24const COOCCURRENCE_MAX_NODES: u32 = 50_000;
26
27pub type NgramVector = HashMap<u32, FiniteF32>;
35
36pub struct CharNgramIndex {
40 vectors: Vec<NgramVector>,
42 inverted: HashMap<u32, Vec<(NodeId, FiniteF32)>>,
45 idf: HashMap<u32, f32>,
47 ngram_size: usize,
49}
50
51impl CharNgramIndex {
52 pub fn build(graph: &Graph, ngram_size: usize) -> M1ndResult<Self> {
57 let n = graph.num_nodes() as usize;
58
59 let mut raw_vectors: Vec<NgramVector> = Vec::with_capacity(n);
61 let mut doc_freq: HashMap<u32, u32> = HashMap::new();
62
63 for i in 0..n {
64 let label = graph.strings.resolve(graph.nodes.label[i]);
65 let normalized = label.to_lowercase();
66 let vec = Self::build_ngram_vector(&normalized, ngram_size);
67
68 for &hash in vec.keys() {
70 *doc_freq.entry(hash).or_insert(0) += 1;
71 }
72
73 raw_vectors.push(vec);
74 }
75
76 let n_f32 = n.max(1) as f32;
78 let mut idf: HashMap<u32, f32> = HashMap::new();
79 for (&hash, &df) in &doc_freq {
80 idf.insert(hash, (n_f32 / df as f32).ln() + 1.0);
81 }
82
83 let mut vectors = Vec::with_capacity(n);
84 let mut inverted: HashMap<u32, Vec<(NodeId, FiniteF32)>> = HashMap::new();
85
86 for (i, raw_vec) in raw_vectors.into_iter().enumerate() {
87 let mut tfidf_vec = NgramVector::new();
88
89 for (&hash, &tf) in &raw_vec {
90 let idf_val = idf.get(&hash).copied().unwrap_or(1.0);
91 let tfidf = tf.get() * idf_val;
92 tfidf_vec.insert(hash, FiniteF32::new(tfidf));
93 }
94
95 let norm: f32 = tfidf_vec
97 .values()
98 .map(|v| v.get() * v.get())
99 .sum::<f32>()
100 .sqrt();
101 if norm > 0.0 {
102 for (&hash, &weight) in &tfidf_vec {
103 let normalized_w = FiniteF32::new(weight.get() / norm);
104 inverted
105 .entry(hash)
106 .or_default()
107 .push((NodeId::new(i as u32), normalized_w));
108 }
109 }
110
111 vectors.push(tfidf_vec);
112 }
113
114 Ok(Self {
115 vectors,
116 inverted,
117 idf,
118 ngram_size,
119 })
120 }
121
122 fn build_ngram_vector(s: &str, ngram_size: usize) -> NgramVector {
124 let s = if s.len() > MAX_TOKEN_LENGTH {
125 let mut end = MAX_TOKEN_LENGTH;
126 while end > 0 && !s.is_char_boundary(end) {
127 end -= 1;
128 }
129 &s[..end]
130 } else {
131 s
132 };
133 let chars: Vec<char> = s.chars().collect();
134 let mut vec = NgramVector::new();
135 if chars.len() < ngram_size {
136 let hash = Self::hash_ngram(s);
138 vec.insert(hash, FiniteF32::ONE);
139 return vec;
140 }
141 for window in chars.windows(ngram_size) {
142 let gram: String = window.iter().collect();
143 let hash = Self::hash_ngram(&gram);
144 let entry = vec.entry(hash).or_insert(FiniteF32::ZERO);
145 *entry = FiniteF32::new(entry.get() + 1.0);
146 }
147 vec
148 }
149
150 fn hash_ngram(ngram: &str) -> u32 {
152 let mut hash: u32 = 2166136261;
153 for byte in ngram.bytes() {
154 hash ^= byte as u32;
155 hash = hash.wrapping_mul(16777619);
156 }
157 hash & 0x00FFFFFF }
159
160 pub fn query_vector(&self, query: &str) -> NgramVector {
162 let raw = Self::build_ngram_vector(&query.to_lowercase(), self.ngram_size);
163 let mut tfidf = NgramVector::new();
164 for (&hash, &tf) in &raw {
165 let idf_val = self.idf.get(&hash).copied().unwrap_or(1.0);
166 tfidf.insert(hash, FiniteF32::new(tf.get() * idf_val));
167 }
168 tfidf
169 }
170
171 pub fn query_top_k(&self, query: &str, top_k: usize) -> Vec<(NodeId, FiniteF32)> {
175 let qvec = self.query_vector(query);
176 if qvec.is_empty() {
177 return Vec::new();
178 }
179
180 let q_norm: f32 = qvec.values().map(|v| v.get() * v.get()).sum::<f32>().sqrt();
182 if q_norm <= 0.0 {
183 return Vec::new();
184 }
185
186 let mut scores: HashMap<u32, f32> = HashMap::new();
188 for (&hash, &q_weight) in &qvec {
189 if let Some(postings) = self.inverted.get(&hash) {
190 for &(node, norm_weight) in postings {
191 *scores.entry(node.0).or_insert(0.0) += q_weight.get() * norm_weight.get();
192 }
193 }
194 }
195
196 let mut results: Vec<(NodeId, FiniteF32)> = scores
198 .iter()
199 .map(|(&node_id, &dot)| {
200 let sim = dot / q_norm;
201 (NodeId::new(node_id), FiniteF32::new(sim.min(1.0)))
202 })
203 .filter(|(_, s)| s.get() > 0.01)
204 .collect();
205
206 results.sort_by(|a, b| b.1.cmp(&a.1));
207 results.truncate(top_k);
208 results
209 }
210
211 pub fn cosine_similarity(a: &NgramVector, b: &NgramVector) -> FiniteF32 {
213 if a.is_empty() || b.is_empty() {
214 return FiniteF32::ZERO;
215 }
216 let mut dot = 0.0f32;
217 for (k, va) in a {
218 if let Some(vb) = b.get(k) {
219 dot += va.get() * vb.get();
220 }
221 }
222 let norm_a: f32 = a.values().map(|v| v.get() * v.get()).sum::<f32>().sqrt();
223 let norm_b: f32 = b.values().map(|v| v.get() * v.get()).sum::<f32>().sqrt();
224 let denom = norm_a * norm_b;
225 if denom > 0.0 {
226 FiniteF32::new((dot / denom).min(1.0))
227 } else {
228 FiniteF32::ZERO
229 }
230 }
231}
232
233pub type CoOccurrenceVector = Vec<(NodeId, FiniteF32)>;
241
242pub struct CoOccurrenceIndex {
244 vectors: Vec<CoOccurrenceVector>,
246 walk_length: usize,
248 walks_per_node: usize,
250 window_size: usize,
252}
253
254impl CoOccurrenceIndex {
255 pub fn build(
259 graph: &Graph,
260 walk_length: usize,
261 walks_per_node: usize,
262 window_size: usize,
263 ) -> M1ndResult<Self> {
264 let n = graph.num_nodes() as usize;
265
266 if graph.num_nodes() > COOCCURRENCE_MAX_NODES {
268 return Ok(Self {
269 vectors: vec![Vec::new(); n],
270 walk_length,
271 walks_per_node,
272 window_size,
273 });
274 }
275
276 let mut vectors = vec![Vec::new(); n];
277
278 let mut rng_state = 42u64;
280 let mut next_rng = || -> u64 {
281 rng_state = rng_state
282 .wrapping_mul(6364136223846793005)
283 .wrapping_add(1442695040888963407);
284 rng_state >> 33
285 };
286
287 for start in 0..n {
289 let mut co_counts: HashMap<u32, f32> = HashMap::new();
290 let start_node = NodeId::new(start as u32);
291
292 for _ in 0..walks_per_node {
293 let mut walk = Vec::with_capacity(walk_length);
294 let mut current = start_node;
295 walk.push(current);
296
297 for _ in 1..walk_length {
298 let range = graph.csr.out_range(current);
299 let degree = range.end - range.start;
300 if degree == 0 {
301 break;
302 }
303 let idx = (next_rng() as usize) % degree;
304 current = graph.csr.targets[range.start + idx];
305 walk.push(current);
306 }
307
308 for i in 0..walk.len() {
310 let lo = i.saturating_sub(window_size);
311 let hi = (i + window_size + 1).min(walk.len());
312 for j in lo..hi {
313 if i != j && walk[j].0 != start as u32 {
314 *co_counts.entry(walk[j].0).or_insert(0.0) += 1.0;
315 }
316 }
317 }
318 }
319
320 if !co_counts.is_empty() {
322 vectors[start] = co_counts
323 .into_iter()
324 .map(|(id, count)| (NodeId::new(id), FiniteF32::new(count)))
325 .collect();
326 }
327 }
328
329 let mut marginal_j: HashMap<u32, f32> = HashMap::new();
332 let mut marginal_i: Vec<f32> = Vec::with_capacity(n);
333 let mut total_all = 0.0f32;
334
335 for vec in &vectors {
336 let row_sum: f32 = vec.iter().map(|(_, w)| w.get()).sum();
337 marginal_i.push(row_sum);
338 total_all += row_sum;
339 for &(node, weight) in vec {
340 *marginal_j.entry(node.0).or_insert(0.0) += weight.get();
341 }
342 }
343
344 if total_all > 0.0 {
345 for (i, vec) in vectors.iter_mut().enumerate() {
346 let mi = marginal_i[i];
347 if mi <= 0.0 {
348 continue;
349 }
350 let mut ppmi_vec: CoOccurrenceVector = Vec::with_capacity(vec.len());
351 for &(node, raw_count) in vec.iter() {
352 let mj = marginal_j.get(&node.0).copied().unwrap_or(1.0);
353 let pmi = ((raw_count.get() * total_all) / (mi * mj)).ln();
355 if pmi > 0.0 {
356 ppmi_vec.push((node, FiniteF32::new(pmi)));
357 }
358 }
359 ppmi_vec.sort_by_key(|e| e.0);
360 *vec = ppmi_vec;
361 }
362 }
363
364 Ok(Self {
365 vectors,
366 walk_length,
367 walks_per_node,
368 window_size,
369 })
370 }
371
372 pub fn cosine_similarity(a: &CoOccurrenceVector, b: &CoOccurrenceVector) -> FiniteF32 {
375 if a.is_empty() || b.is_empty() {
376 return FiniteF32::ZERO;
377 }
378
379 let mut dot = 0.0f32;
380 let mut norm_a = 0.0f32;
381 let mut norm_b = 0.0f32;
382
383 for (_, w) in a {
384 norm_a += w.get() * w.get();
385 }
386 for (_, w) in b {
387 norm_b += w.get() * w.get();
388 }
389
390 let mut ia = 0;
392 let mut ib = 0;
393 while ia < a.len() && ib < b.len() {
394 let (na, wa) = a[ia];
395 let (nb, wb) = b[ib];
396 if na == nb {
397 dot += wa.get() * wb.get();
398 ia += 1;
399 ib += 1;
400 } else if na < nb {
401 ia += 1;
402 } else {
403 ib += 1;
404 }
405 }
406
407 let denom = norm_a.sqrt() * norm_b.sqrt();
408 if denom > 0.0 {
409 FiniteF32::new((dot / denom).min(1.0))
410 } else {
411 FiniteF32::ZERO
412 }
413 }
414
415 pub fn query_top_k(&self, query_node: NodeId, top_k: usize) -> Vec<(NodeId, FiniteF32)> {
418 let idx = query_node.as_usize();
419 if idx >= self.vectors.len() || self.vectors[idx].is_empty() {
420 return Vec::new();
421 }
422
423 let query_vec = &self.vectors[idx];
424 let mut results: Vec<(NodeId, FiniteF32)> = self
425 .vectors
426 .iter()
427 .enumerate()
428 .filter(|(i, v)| *i != idx && !v.is_empty())
429 .map(|(i, v)| {
430 let sim = Self::cosine_similarity(query_vec, v);
431 (NodeId::new(i as u32), sim)
432 })
433 .filter(|(_, s)| s.get() > 0.01)
434 .collect();
435
436 results.sort_by(|a, b| b.1.cmp(&a.1));
437 results.truncate(top_k);
438 results
439 }
440}
441
442pub struct SynonymExpander {
451 groups: Vec<Vec<String>>,
453 term_to_groups: HashMap<String, SmallVec<[u16; 4]>>,
455}
456
457const DEFAULT_SYNONYM_GROUPS: &[&[&str]] = &[
459 &["plastico", "polimero", "resina"],
460 &["metal", "liga", "aco", "aluminio"],
461 &["vidro", "ceramica", "cristal"],
462 &["processo", "etapa", "fase"],
463 &["material", "materia_prima", "insumo"],
464 &["custo", "preco", "valor"],
465 &["fornecedor", "supplier", "vendor"],
466 &["qualidade", "quality", "qa"],
467 &["teste", "test", "ensaio"],
468 &["norma", "regulamento", "padrão"],
469 &["function", "fn", "method", "func"],
470 &["class", "struct", "type"],
471 &["module", "package", "crate"],
472 &["import", "use", "require"],
473 &["error", "exception", "panic"],
474];
475
476impl SynonymExpander {
477 pub fn build_default() -> M1ndResult<Self> {
481 let groups: Vec<Vec<&str>> = DEFAULT_SYNONYM_GROUPS.iter().map(|g| g.to_vec()).collect();
482 Self::build(groups)
483 }
484
485 pub fn build(groups: Vec<Vec<&str>>) -> M1ndResult<Self> {
487 let mut string_groups = Vec::with_capacity(groups.len());
488 let mut term_to_groups: HashMap<String, SmallVec<[u16; 4]>> = HashMap::new();
489
490 for (gi, group) in groups.iter().enumerate() {
491 let mut str_group: Vec<String> = Vec::with_capacity(group.len());
492 for &term in group {
493 let lower = term.to_lowercase();
494 term_to_groups
495 .entry(lower.clone())
496 .or_default()
497 .push(gi as u16);
498 str_group.push(lower);
499 }
500 string_groups.push(str_group);
501 }
502
503 Ok(Self {
504 groups: string_groups,
505 term_to_groups,
506 })
507 }
508
509 pub fn expand(&self, term: &str) -> Vec<String> {
512 let lower = term.to_lowercase();
513 let mut result = vec![lower.clone()];
514 if let Some(group_indices) = self.term_to_groups.get(&lower) {
515 for &gi in group_indices {
516 if (gi as usize) < self.groups.len() {
517 for member in &self.groups[gi as usize] {
518 if *member != lower && !result.contains(member) {
519 result.push(member.clone());
520 }
521 }
522 }
523 }
524 }
525 result
526 }
527
528 pub fn are_synonyms(&self, a: &str, b: &str) -> bool {
530 let a_lower = a.to_lowercase();
531 let b_lower = b.to_lowercase();
532 if a_lower == b_lower {
533 return true;
534 }
535 if let Some(groups_a) = self.term_to_groups.get(&a_lower) {
536 if let Some(groups_b) = self.term_to_groups.get(&b_lower) {
537 for &ga in groups_a {
538 for &gb in groups_b {
539 if ga == gb {
540 return true;
541 }
542 }
543 }
544 }
545 }
546 false
547 }
548}
549
550pub struct SemanticEngine {
559 pub ngram: CharNgramIndex,
560 pub cooccurrence: CoOccurrenceIndex,
561 pub synonym: SynonymExpander,
562 pub weights: SemanticWeights,
563}
564
565impl SemanticEngine {
566 pub fn build(graph: &Graph, weights: SemanticWeights) -> M1ndResult<Self> {
569 let ngram = CharNgramIndex::build(graph, NGRAM_SIZE)?;
570 let cooccurrence =
571 CoOccurrenceIndex::build(graph, WALK_LENGTH, WALKS_PER_NODE, WINDOW_SIZE)?;
572 let synonym = SynonymExpander::build_default()?;
573
574 Ok(Self {
575 ngram,
576 cooccurrence,
577 synonym,
578 weights,
579 })
580 }
581
582 pub fn query(
586 &self,
587 graph: &Graph,
588 query: &str,
589 top_k: usize,
590 ) -> M1ndResult<Vec<(NodeId, FiniteF32)>> {
591 let ngram_scores = self.ngram.query_top_k(query, top_k * 5);
593
594 let mut scores: HashMap<u32, f32> = HashMap::new();
596 for &(node, score) in &ngram_scores {
597 *scores.entry(node.0).or_insert(0.0) += score.get() * self.weights.ngram.get();
598 }
599
600 if let Some(&(first_node, _)) = ngram_scores.first() {
603 let cooc_scores = self.cooccurrence.query_top_k(first_node, top_k * 3);
604 for (node, score) in cooc_scores {
605 *scores.entry(node.0).or_insert(0.0) +=
606 score.get() * self.weights.cooccurrence.get();
607 }
608 }
609
610 let tokens: Vec<String> = query
613 .to_lowercase()
614 .split_whitespace()
615 .filter(|t| t.len() > 2)
616 .map(|t| t.to_string())
617 .collect();
618
619 let mut expanded_tokens: Vec<String> = Vec::new();
621 for token in &tokens {
622 for syn in self.synonym.expand(token) {
623 if !expanded_tokens.contains(&syn) {
624 expanded_tokens.push(syn);
625 }
626 }
627 }
628
629 let synonym_weight = self.weights.synonym.get();
631 for i in 0..graph.num_nodes() as usize {
632 let label = graph.strings.resolve(graph.nodes.label[i]).to_lowercase();
633 for expanded in &expanded_tokens {
634 if !tokens.contains(expanded) && label.contains(expanded.as_str()) {
635 *scores.entry(i as u32).or_insert(0.0) += synonym_weight;
636 }
637 }
638 }
639
640 let mut results: Vec<(NodeId, FiniteF32)> = scores
641 .into_iter()
642 .map(|(id, s)| (NodeId::new(id), FiniteF32::new(s.min(1.0))))
643 .filter(|(_, s)| s.get() > 0.01)
644 .collect();
645
646 results.sort_by(|a, b| b.1.cmp(&a.1));
647 results.truncate(top_k);
648 Ok(results)
649 }
650
651 pub fn query_fast(
656 &self,
657 graph: &Graph,
658 query: &str,
659 top_k: usize,
660 ) -> M1ndResult<Vec<(NodeId, FiniteF32)>> {
661 let candidates = self.ngram.query_top_k(query, top_k * 3);
663 if candidates.is_empty() {
664 return Ok(Vec::new());
665 }
666
667 let seed_count = candidates.len().min(3);
670 let mut cooc_map: HashMap<u32, f32> = HashMap::new();
671 for &(node, ngram_score) in &candidates[..seed_count] {
672 let cooc = self.cooccurrence.query_top_k(node, top_k * 3);
673 for (n, s) in cooc {
674 *cooc_map.entry(n.0).or_insert(0.0) += s.get() * ngram_score.get();
675 }
676 }
677 let seed_f = seed_count as f32;
679 for v in cooc_map.values_mut() {
680 *v /= seed_f;
681 }
682
683 let total_w = self.weights.ngram.get() + self.weights.cooccurrence.get();
685 let ngram_w = if total_w > 0.0 {
686 self.weights.ngram.get() / total_w
687 } else {
688 0.6
689 };
690 let cooc_w = if total_w > 0.0 {
691 self.weights.cooccurrence.get() / total_w
692 } else {
693 0.4
694 };
695
696 let mut results: Vec<(NodeId, FiniteF32)> = candidates
697 .iter()
698 .map(|&(node, ngram_score)| {
699 let cooc_score = cooc_map.get(&node.0).copied().unwrap_or(0.0);
700 let combined = ngram_score.get() * ngram_w + cooc_score * cooc_w;
701 (node, FiniteF32::new(combined.min(1.0)))
702 })
703 .collect();
704
705 results.sort_by(|a, b| b.1.cmp(&a.1));
706 results.truncate(top_k);
707 Ok(results)
708 }
709}
710
711static_assertions::assert_impl_all!(SemanticEngine: Send, Sync);