m1nd_core/
semantic.rs

1// === crates/m1nd-core/src/semantic.rs ===
2
3use smallvec::SmallVec;
4use std::collections::HashMap;
5
6use crate::error::M1ndResult;
7use crate::graph::Graph;
8use crate::types::*;
9
10// ---------------------------------------------------------------------------
11// Constants
12// ---------------------------------------------------------------------------
13
14/// Default n-gram size (trigrams).
15const NGRAM_SIZE: usize = 3;
16/// Maximum token length for n-gram extraction.
17const MAX_TOKEN_LENGTH: usize = 200;
18/// Random walk parameters for co-occurrence.
19/// Increased from 10/6/3 for code graphs with deeper hierarchies
20/// (module -> file -> class -> method -> field = 5+ levels).
21const WALKS_PER_NODE: usize = 20;
22const WALK_LENGTH: usize = 10;
23const WINDOW_SIZE: usize = 4;
24/// Max nodes before disabling co-occurrence (DEC-050).
25const COOCCURRENCE_MAX_NODES: u32 = 50_000;
26
27// ---------------------------------------------------------------------------
28// CharNgramIndex — trigram embeddings (semantic_v2.py CharNgramEmbedder)
29// Replaces: semantic_v2.py CharNgramEmbedder
30// ---------------------------------------------------------------------------
31
32/// Sparse trigram vector for a single node.
33/// Key: 24-bit hash of trigram. Value: TF-IDF weight.
34pub type NgramVector = HashMap<u32, FiniteF32>;
35
36/// Pre-built char n-gram index over all node labels.
37/// Stores sparse trigram vectors for each node.
38/// FM-SEM-003 fix: inverted token index for O(K) query instead of O(N^2*S).
39pub struct CharNgramIndex {
40    /// Per-node trigram vectors indexed by NodeId.
41    vectors: Vec<NgramVector>,
42    /// Inverted index: trigram hash -> list of (NodeId, weight).
43    /// Enables sub-linear candidate generation.
44    inverted: HashMap<u32, Vec<(NodeId, FiniteF32)>>,
45    /// IDF values: trigram hash -> log(N/df)+1. Used by query_vector().
46    idf: HashMap<u32, f32>,
47    /// N-gram size (default 3 = trigrams).
48    ngram_size: usize,
49}
50
51impl CharNgramIndex {
52    /// Build index from all node labels in the graph.
53    /// Replaces: semantic_v2.py CharNgramEmbedder.build()
54    /// FM-SEM-001 fix: applies Unicode NFC normalization before trigram extraction.
55    /// TF-IDF weighting: raw TF * log(N/df)+1 for discriminative trigrams.
56    pub fn build(graph: &Graph, ngram_size: usize) -> M1ndResult<Self> {
57        let n = graph.num_nodes() as usize;
58
59        // Phase 1: Build raw TF vectors and compute document frequency
60        let mut raw_vectors: Vec<NgramVector> = Vec::with_capacity(n);
61        let mut doc_freq: HashMap<u32, u32> = HashMap::new();
62
63        for i in 0..n {
64            let label = graph.strings.resolve(graph.nodes.label[i]);
65            let normalized = label.to_lowercase();
66            let vec = Self::build_ngram_vector(&normalized, ngram_size);
67
68            // Count document frequency per trigram
69            for &hash in vec.keys() {
70                *doc_freq.entry(hash).or_insert(0) += 1;
71            }
72
73            raw_vectors.push(vec);
74        }
75
76        // Phase 2: Compute IDF and apply TF-IDF weighting
77        let n_f32 = n.max(1) as f32;
78        let mut idf: HashMap<u32, f32> = HashMap::new();
79        for (&hash, &df) in &doc_freq {
80            idf.insert(hash, (n_f32 / df as f32).ln() + 1.0);
81        }
82
83        let mut vectors = Vec::with_capacity(n);
84        let mut inverted: HashMap<u32, Vec<(NodeId, FiniteF32)>> = HashMap::new();
85
86        for (i, raw_vec) in raw_vectors.into_iter().enumerate() {
87            let mut tfidf_vec = NgramVector::new();
88
89            for (&hash, &tf) in &raw_vec {
90                let idf_val = idf.get(&hash).copied().unwrap_or(1.0);
91                let tfidf = tf.get() * idf_val;
92                tfidf_vec.insert(hash, FiniteF32::new(tfidf));
93            }
94
95            // L2 normalize for cosine similarity
96            let norm: f32 = tfidf_vec
97                .values()
98                .map(|v| v.get() * v.get())
99                .sum::<f32>()
100                .sqrt();
101            if norm > 0.0 {
102                for (&hash, &weight) in &tfidf_vec {
103                    let normalized_w = FiniteF32::new(weight.get() / norm);
104                    inverted
105                        .entry(hash)
106                        .or_default()
107                        .push((NodeId::new(i as u32), normalized_w));
108                }
109            }
110
111            vectors.push(tfidf_vec);
112        }
113
114        Ok(Self {
115            vectors,
116            inverted,
117            idf,
118            ngram_size,
119        })
120    }
121
122    /// Build n-gram frequency vector for a string.
123    fn build_ngram_vector(s: &str, ngram_size: usize) -> NgramVector {
124        let s = if s.len() > MAX_TOKEN_LENGTH {
125            let mut end = MAX_TOKEN_LENGTH;
126            while end > 0 && !s.is_char_boundary(end) {
127                end -= 1;
128            }
129            &s[..end]
130        } else {
131            s
132        };
133        let chars: Vec<char> = s.chars().collect();
134        let mut vec = NgramVector::new();
135        if chars.len() < ngram_size {
136            // For short strings, use the whole string as one gram
137            let hash = Self::hash_ngram(s);
138            vec.insert(hash, FiniteF32::ONE);
139            return vec;
140        }
141        for window in chars.windows(ngram_size) {
142            let gram: String = window.iter().collect();
143            let hash = Self::hash_ngram(&gram);
144            let entry = vec.entry(hash).or_insert(FiniteF32::ZERO);
145            *entry = FiniteF32::new(entry.get() + 1.0);
146        }
147        vec
148    }
149
150    /// Hash a trigram to a 24-bit key. FNV-1a variant.
151    fn hash_ngram(ngram: &str) -> u32 {
152        let mut hash: u32 = 2166136261;
153        for byte in ngram.bytes() {
154            hash ^= byte as u32;
155            hash = hash.wrapping_mul(16777619);
156        }
157        hash & 0x00FFFFFF // 24-bit
158    }
159
160    /// Compute trigram vector for a query string, with IDF weighting.
161    pub fn query_vector(&self, query: &str) -> NgramVector {
162        let raw = Self::build_ngram_vector(&query.to_lowercase(), self.ngram_size);
163        let mut tfidf = NgramVector::new();
164        for (&hash, &tf) in &raw {
165            let idf_val = self.idf.get(&hash).copied().unwrap_or(1.0);
166            tfidf.insert(hash, FiniteF32::new(tf.get() * idf_val));
167        }
168        tfidf
169    }
170
171    /// Score all nodes against a query vector. Returns top_k by cosine similarity.
172    /// Uses inverted index for candidate generation (FM-SEM-003 fix).
173    /// Replaces: semantic_v2.py CharNgramEmbedder.query()
174    pub fn query_top_k(&self, query: &str, top_k: usize) -> Vec<(NodeId, FiniteF32)> {
175        let qvec = self.query_vector(query);
176        if qvec.is_empty() {
177            return Vec::new();
178        }
179
180        // Query norm
181        let q_norm: f32 = qvec.values().map(|v| v.get() * v.get()).sum::<f32>().sqrt();
182        if q_norm <= 0.0 {
183            return Vec::new();
184        }
185
186        // Candidate accumulation via inverted index
187        let mut scores: HashMap<u32, f32> = HashMap::new();
188        for (&hash, &q_weight) in &qvec {
189            if let Some(postings) = self.inverted.get(&hash) {
190                for &(node, norm_weight) in postings {
191                    *scores.entry(node.0).or_insert(0.0) += q_weight.get() * norm_weight.get();
192                }
193            }
194        }
195
196        // Normalize by query norm
197        let mut results: Vec<(NodeId, FiniteF32)> = scores
198            .iter()
199            .map(|(&node_id, &dot)| {
200                let sim = dot / q_norm;
201                (NodeId::new(node_id), FiniteF32::new(sim.min(1.0)))
202            })
203            .filter(|(_, s)| s.get() > 0.01)
204            .collect();
205
206        results.sort_by(|a, b| b.1.cmp(&a.1));
207        results.truncate(top_k);
208        results
209    }
210
211    /// Cosine similarity between two sparse vectors.
212    pub fn cosine_similarity(a: &NgramVector, b: &NgramVector) -> FiniteF32 {
213        if a.is_empty() || b.is_empty() {
214            return FiniteF32::ZERO;
215        }
216        let mut dot = 0.0f32;
217        for (k, va) in a {
218            if let Some(vb) = b.get(k) {
219                dot += va.get() * vb.get();
220            }
221        }
222        let norm_a: f32 = a.values().map(|v| v.get() * v.get()).sum::<f32>().sqrt();
223        let norm_b: f32 = b.values().map(|v| v.get() * v.get()).sum::<f32>().sqrt();
224        let denom = norm_a * norm_b;
225        if denom > 0.0 {
226            FiniteF32::new((dot / denom).min(1.0))
227        } else {
228            FiniteF32::ZERO
229        }
230    }
231}
232
233// ---------------------------------------------------------------------------
234// CoOccurrenceIndex — DeepWalk-lite embeddings (semantic_v2.py CoOccurrenceEmbedder)
235// Replaces: semantic_v2.py CoOccurrenceEmbedder
236// ---------------------------------------------------------------------------
237
238/// Per-node co-occurrence vector: sorted Vec<(NodeId, weight)> for fast intersection.
239/// FM-SEM-004 fix: 12 bytes/entry vs 100 bytes in Python HashMap.
240pub type CoOccurrenceVector = Vec<(NodeId, FiniteF32)>;
241
242/// Co-occurrence embeddings built from short random walks on the graph.
243pub struct CoOccurrenceIndex {
244    /// Per-node co-occurrence vectors, indexed by NodeId.
245    vectors: Vec<CoOccurrenceVector>,
246    /// Walk length for random walk generation.
247    walk_length: usize,
248    /// Number of walks per node.
249    walks_per_node: usize,
250    /// Window size for co-occurrence counting.
251    window_size: usize,
252}
253
254impl CoOccurrenceIndex {
255    /// Build co-occurrence embeddings from random walks.
256    /// Replaces: semantic_v2.py CoOccurrenceEmbedder.build()
257    /// FM-SEM-004: memory warning logged if node_count > 10_000.
258    pub fn build(
259        graph: &Graph,
260        walk_length: usize,
261        walks_per_node: usize,
262        window_size: usize,
263    ) -> M1ndResult<Self> {
264        let n = graph.num_nodes() as usize;
265
266        // DEC-050: disable for large graphs
267        if graph.num_nodes() > COOCCURRENCE_MAX_NODES {
268            return Ok(Self {
269                vectors: vec![Vec::new(); n],
270                walk_length,
271                walks_per_node,
272                window_size,
273            });
274        }
275
276        let mut vectors = vec![Vec::new(); n];
277
278        // Simple PRNG (deterministic with seed 42)
279        let mut rng_state = 42u64;
280        let mut next_rng = || -> u64 {
281            rng_state = rng_state
282                .wrapping_mul(6364136223846793005)
283                .wrapping_add(1442695040888963407);
284            rng_state >> 33
285        };
286
287        // For each node, perform random walks and accumulate co-occurrence
288        for start in 0..n {
289            let mut co_counts: HashMap<u32, f32> = HashMap::new();
290            let start_node = NodeId::new(start as u32);
291
292            for _ in 0..walks_per_node {
293                let mut walk = Vec::with_capacity(walk_length);
294                let mut current = start_node;
295                walk.push(current);
296
297                for _ in 1..walk_length {
298                    let range = graph.csr.out_range(current);
299                    let degree = range.end - range.start;
300                    if degree == 0 {
301                        break;
302                    }
303                    let idx = (next_rng() as usize) % degree;
304                    current = graph.csr.targets[range.start + idx];
305                    walk.push(current);
306                }
307
308                // Extract co-occurrence pairs within window
309                for i in 0..walk.len() {
310                    let lo = i.saturating_sub(window_size);
311                    let hi = (i + window_size + 1).min(walk.len());
312                    for j in lo..hi {
313                        if i != j && walk[j].0 != start as u32 {
314                            *co_counts.entry(walk[j].0).or_insert(0.0) += 1.0;
315                        }
316                    }
317                }
318            }
319
320            // Store raw counts; PPMI normalization below
321            if !co_counts.is_empty() {
322                vectors[start] = co_counts
323                    .into_iter()
324                    .map(|(id, count)| (NodeId::new(id), FiniteF32::new(count)))
325                    .collect();
326            }
327        }
328
329        // PPMI normalization: upweight surprising co-occurrences, downweight expected ones
330        // Marginals: total_j = sum over all i of count(i,j), total_all = sum of everything
331        let mut marginal_j: HashMap<u32, f32> = HashMap::new();
332        let mut marginal_i: Vec<f32> = Vec::with_capacity(n);
333        let mut total_all = 0.0f32;
334
335        for vec in &vectors {
336            let row_sum: f32 = vec.iter().map(|(_, w)| w.get()).sum();
337            marginal_i.push(row_sum);
338            total_all += row_sum;
339            for &(node, weight) in vec {
340                *marginal_j.entry(node.0).or_insert(0.0) += weight.get();
341            }
342        }
343
344        if total_all > 0.0 {
345            for (i, vec) in vectors.iter_mut().enumerate() {
346                let mi = marginal_i[i];
347                if mi <= 0.0 {
348                    continue;
349                }
350                let mut ppmi_vec: CoOccurrenceVector = Vec::with_capacity(vec.len());
351                for &(node, raw_count) in vec.iter() {
352                    let mj = marginal_j.get(&node.0).copied().unwrap_or(1.0);
353                    // PMI = log2( (count * total) / (margin_i * margin_j) )
354                    let pmi = ((raw_count.get() * total_all) / (mi * mj)).ln();
355                    if pmi > 0.0 {
356                        ppmi_vec.push((node, FiniteF32::new(pmi)));
357                    }
358                }
359                ppmi_vec.sort_by_key(|e| e.0);
360                *vec = ppmi_vec;
361            }
362        }
363
364        Ok(Self {
365            vectors,
366            walk_length,
367            walks_per_node,
368            window_size,
369        })
370    }
371
372    /// Cosine similarity between two sorted co-occurrence vectors.
373    /// Uses merge-intersection on sorted vectors for O(D) instead of O(D^2).
374    pub fn cosine_similarity(a: &CoOccurrenceVector, b: &CoOccurrenceVector) -> FiniteF32 {
375        if a.is_empty() || b.is_empty() {
376            return FiniteF32::ZERO;
377        }
378
379        let mut dot = 0.0f32;
380        let mut norm_a = 0.0f32;
381        let mut norm_b = 0.0f32;
382
383        for (_, w) in a {
384            norm_a += w.get() * w.get();
385        }
386        for (_, w) in b {
387            norm_b += w.get() * w.get();
388        }
389
390        // Merge intersection
391        let mut ia = 0;
392        let mut ib = 0;
393        while ia < a.len() && ib < b.len() {
394            let (na, wa) = a[ia];
395            let (nb, wb) = b[ib];
396            if na == nb {
397                dot += wa.get() * wb.get();
398                ia += 1;
399                ib += 1;
400            } else if na < nb {
401                ia += 1;
402            } else {
403                ib += 1;
404            }
405        }
406
407        let denom = norm_a.sqrt() * norm_b.sqrt();
408        if denom > 0.0 {
409            FiniteF32::new((dot / denom).min(1.0))
410        } else {
411            FiniteF32::ZERO
412        }
413    }
414
415    /// Score a query node against all nodes. Returns top_k.
416    /// Replaces: semantic_v2.py CoOccurrenceEmbedder.query()
417    pub fn query_top_k(&self, query_node: NodeId, top_k: usize) -> Vec<(NodeId, FiniteF32)> {
418        let idx = query_node.as_usize();
419        if idx >= self.vectors.len() || self.vectors[idx].is_empty() {
420            return Vec::new();
421        }
422
423        let query_vec = &self.vectors[idx];
424        let mut results: Vec<(NodeId, FiniteF32)> = self
425            .vectors
426            .iter()
427            .enumerate()
428            .filter(|(i, v)| *i != idx && !v.is_empty())
429            .map(|(i, v)| {
430                let sim = Self::cosine_similarity(query_vec, v);
431                (NodeId::new(i as u32), sim)
432            })
433            .filter(|(_, s)| s.get() > 0.01)
434            .collect();
435
436        results.sort_by(|a, b| b.1.cmp(&a.1));
437        results.truncate(top_k);
438        results
439    }
440}
441
442// ---------------------------------------------------------------------------
443// SynonymExpander — bidirectional synonym group lookup
444// Replaces: semantic_v2.py SynonymExpander + SYNONYM_GROUPS constant
445// ---------------------------------------------------------------------------
446
447/// Synonym expansion table. Groups of semantically equivalent terms.
448/// FM-SEM-002 fix: no overlapping terms across groups (transitive closure prevented).
449/// Uses String-based lookups (not InternedStr) to avoid orphan interner bug.
450pub struct SynonymExpander {
451    /// Synonym groups: each group is a Vec of lowercased terms.
452    groups: Vec<Vec<String>>,
453    /// Reverse index: lowercased term -> group indices.
454    term_to_groups: HashMap<String, SmallVec<[u16; 4]>>,
455}
456
457/// Default synonym groups (Portuguese domain terms from semantic_v2.py).
458const DEFAULT_SYNONYM_GROUPS: &[&[&str]] = &[
459    &["plastico", "polimero", "resina"],
460    &["metal", "liga", "aco", "aluminio"],
461    &["vidro", "ceramica", "cristal"],
462    &["processo", "etapa", "fase"],
463    &["material", "materia_prima", "insumo"],
464    &["custo", "preco", "valor"],
465    &["fornecedor", "supplier", "vendor"],
466    &["qualidade", "quality", "qa"],
467    &["teste", "test", "ensaio"],
468    &["norma", "regulamento", "padrão"],
469    &["function", "fn", "method", "func"],
470    &["class", "struct", "type"],
471    &["module", "package", "crate"],
472    &["import", "use", "require"],
473    &["error", "exception", "panic"],
474];
475
476impl SynonymExpander {
477    /// Build from the built-in SYNONYM_GROUPS constant.
478    /// Validates no term appears in multiple groups (FM-SEM-002).
479    /// Replaces: semantic_v2.py SynonymExpander.__init__()
480    pub fn build_default() -> M1ndResult<Self> {
481        let groups: Vec<Vec<&str>> = DEFAULT_SYNONYM_GROUPS.iter().map(|g| g.to_vec()).collect();
482        Self::build(groups)
483    }
484
485    /// Build from custom synonym groups. Uses String-based lookup (no interner needed).
486    pub fn build(groups: Vec<Vec<&str>>) -> M1ndResult<Self> {
487        let mut string_groups = Vec::with_capacity(groups.len());
488        let mut term_to_groups: HashMap<String, SmallVec<[u16; 4]>> = HashMap::new();
489
490        for (gi, group) in groups.iter().enumerate() {
491            let mut str_group: Vec<String> = Vec::with_capacity(group.len());
492            for &term in group {
493                let lower = term.to_lowercase();
494                term_to_groups
495                    .entry(lower.clone())
496                    .or_default()
497                    .push(gi as u16);
498                str_group.push(lower);
499            }
500            string_groups.push(str_group);
501        }
502
503        Ok(Self {
504            groups: string_groups,
505            term_to_groups,
506        })
507    }
508
509    /// Expand a term to all synonyms (including itself).
510    /// Replaces: semantic_v2.py SynonymExpander.expand()
511    pub fn expand(&self, term: &str) -> Vec<String> {
512        let lower = term.to_lowercase();
513        let mut result = vec![lower.clone()];
514        if let Some(group_indices) = self.term_to_groups.get(&lower) {
515            for &gi in group_indices {
516                if (gi as usize) < self.groups.len() {
517                    for member in &self.groups[gi as usize] {
518                        if *member != lower && !result.contains(member) {
519                            result.push(member.clone());
520                        }
521                    }
522                }
523            }
524        }
525        result
526    }
527
528    /// Check if two terms are synonymous.
529    pub fn are_synonyms(&self, a: &str, b: &str) -> bool {
530        let a_lower = a.to_lowercase();
531        let b_lower = b.to_lowercase();
532        if a_lower == b_lower {
533            return true;
534        }
535        if let Some(groups_a) = self.term_to_groups.get(&a_lower) {
536            if let Some(groups_b) = self.term_to_groups.get(&b_lower) {
537                for &ga in groups_a {
538                    for &gb in groups_b {
539                        if ga == gb {
540                            return true;
541                        }
542                    }
543                }
544            }
545        }
546        false
547    }
548}
549
550// ---------------------------------------------------------------------------
551// SemanticEngine — combined 3-component scorer
552// Replaces: semantic_v2.py SemanticEngine
553// ---------------------------------------------------------------------------
554
555/// Combined semantic matching: 0.4*ngram + 0.4*cooccurrence + 0.2*synonym.
556/// Two-phase query_fast: phase 1 ngram+synonym (0.6/0.4), phase 2 re-rank top 3*K with cooccurrence.
557/// Replaces: semantic_v2.py SemanticEngine
558pub struct SemanticEngine {
559    pub ngram: CharNgramIndex,
560    pub cooccurrence: CoOccurrenceIndex,
561    pub synonym: SynonymExpander,
562    pub weights: SemanticWeights,
563}
564
565impl SemanticEngine {
566    /// Build all three indexes from the graph.
567    /// Replaces: semantic_v2.py SemanticEngine.__init__()
568    pub fn build(graph: &Graph, weights: SemanticWeights) -> M1ndResult<Self> {
569        let ngram = CharNgramIndex::build(graph, NGRAM_SIZE)?;
570        let cooccurrence =
571            CoOccurrenceIndex::build(graph, WALK_LENGTH, WALKS_PER_NODE, WINDOW_SIZE)?;
572        let synonym = SynonymExpander::build_default()?;
573
574        Ok(Self {
575            ngram,
576            cooccurrence,
577            synonym,
578            weights,
579        })
580    }
581
582    /// Full query: score all nodes, return top_k.
583    /// Weight: ngram*0.4 + cooccurrence*0.4 + synonym*0.2.
584    /// Replaces: semantic_v2.py SemanticEngine.query()
585    pub fn query(
586        &self,
587        graph: &Graph,
588        query: &str,
589        top_k: usize,
590    ) -> M1ndResult<Vec<(NodeId, FiniteF32)>> {
591        // Phase 1: n-gram scores
592        let ngram_scores = self.ngram.query_top_k(query, top_k * 5);
593
594        // Build combined score map
595        let mut scores: HashMap<u32, f32> = HashMap::new();
596        for &(node, score) in &ngram_scores {
597            *scores.entry(node.0).or_insert(0.0) += score.get() * self.weights.ngram.get();
598        }
599
600        // Phase 2: co-occurrence boost for top candidates
601        // (only if we have seed nodes from ngram phase)
602        if let Some(&(first_node, _)) = ngram_scores.first() {
603            let cooc_scores = self.cooccurrence.query_top_k(first_node, top_k * 3);
604            for (node, score) in cooc_scores {
605                *scores.entry(node.0).or_insert(0.0) +=
606                    score.get() * self.weights.cooccurrence.get();
607            }
608        }
609
610        // Synonym boost: expand query tokens via synonym groups,
611        // then boost nodes whose labels match expanded synonyms.
612        let tokens: Vec<String> = query
613            .to_lowercase()
614            .split_whitespace()
615            .filter(|t| t.len() > 2)
616            .map(|t| t.to_string())
617            .collect();
618
619        // Expand each token via synonym groups
620        let mut expanded_tokens: Vec<String> = Vec::new();
621        for token in &tokens {
622            for syn in self.synonym.expand(token) {
623                if !expanded_tokens.contains(&syn) {
624                    expanded_tokens.push(syn);
625                }
626            }
627        }
628
629        // Boost nodes whose labels match expanded synonyms (not original tokens)
630        let synonym_weight = self.weights.synonym.get();
631        for i in 0..graph.num_nodes() as usize {
632            let label = graph.strings.resolve(graph.nodes.label[i]).to_lowercase();
633            for expanded in &expanded_tokens {
634                if !tokens.contains(expanded) && label.contains(expanded.as_str()) {
635                    *scores.entry(i as u32).or_insert(0.0) += synonym_weight;
636                }
637            }
638        }
639
640        let mut results: Vec<(NodeId, FiniteF32)> = scores
641            .into_iter()
642            .map(|(id, s)| (NodeId::new(id), FiniteF32::new(s.min(1.0))))
643            .filter(|(_, s)| s.get() > 0.01)
644            .collect();
645
646        results.sort_by(|a, b| b.1.cmp(&a.1));
647        results.truncate(top_k);
648        Ok(results)
649    }
650
651    /// Fast two-phase query.
652    /// Phase 1: ngram+synonym (0.6/0.4) -> candidates (3*top_k).
653    /// Phase 2: re-rank candidates with cooccurrence.
654    /// Replaces: semantic_v2.py SemanticEngine.query_fast()
655    pub fn query_fast(
656        &self,
657        graph: &Graph,
658        query: &str,
659        top_k: usize,
660    ) -> M1ndResult<Vec<(NodeId, FiniteF32)>> {
661        // Phase 1: ngram candidates
662        let candidates = self.ngram.query_top_k(query, top_k * 3);
663        if candidates.is_empty() {
664            return Ok(Vec::new());
665        }
666
667        // Multi-seed co-occurrence: aggregate from top-3 ngram hits (not just #1)
668        // to avoid single-point-of-failure when top hit is a leaf node
669        let seed_count = candidates.len().min(3);
670        let mut cooc_map: HashMap<u32, f32> = HashMap::new();
671        for &(node, ngram_score) in &candidates[..seed_count] {
672            let cooc = self.cooccurrence.query_top_k(node, top_k * 3);
673            for (n, s) in cooc {
674                *cooc_map.entry(n.0).or_insert(0.0) += s.get() * ngram_score.get();
675            }
676        }
677        // Normalize by seed count
678        let seed_f = seed_count as f32;
679        for v in cooc_map.values_mut() {
680            *v /= seed_f;
681        }
682
683        // Re-rank using configured weights (normalized to sum to 1.0)
684        let total_w = self.weights.ngram.get() + self.weights.cooccurrence.get();
685        let ngram_w = if total_w > 0.0 {
686            self.weights.ngram.get() / total_w
687        } else {
688            0.6
689        };
690        let cooc_w = if total_w > 0.0 {
691            self.weights.cooccurrence.get() / total_w
692        } else {
693            0.4
694        };
695
696        let mut results: Vec<(NodeId, FiniteF32)> = candidates
697            .iter()
698            .map(|&(node, ngram_score)| {
699                let cooc_score = cooc_map.get(&node.0).copied().unwrap_or(0.0);
700                let combined = ngram_score.get() * ngram_w + cooc_score * cooc_w;
701                (node, FiniteF32::new(combined.min(1.0)))
702            })
703            .collect();
704
705        results.sort_by(|a, b| b.1.cmp(&a.1));
706        results.truncate(top_k);
707        Ok(results)
708    }
709}
710
711// Ensure Send + Sync for concurrent access.
712static_assertions::assert_impl_all!(SemanticEngine: Send, Sync);
m1nd_core/semantic.rs

m1nd_core/
semantic.rs