embeddenator_cli/commands/
query.rs

1//! Query command implementations
2
3use anyhow::Result;
4use embeddenator_fs::embrfs::{
5    DirectorySubEngramStore, EmbrFS, HierarchicalQueryBounds,
6    load_hierarchical_manifest, query_hierarchical_codebook_with_store,
7};
8use embeddenator_vsa::{ReversibleVSAConfig, SparseVec};
9use std::collections::HashMap;
10use std::fs::File;
11use std::io::Read;
12use std::path::PathBuf;
13
14pub fn handle_query(
15    engram: PathBuf,
16    query: PathBuf,
17    hierarchical_manifest: Option<PathBuf>,
18    sub_engrams_dir: Option<PathBuf>,
19    k: usize,
20    verbose: bool,
21) -> Result<()> {
22    if verbose {
23        println!(
24            "Embeddenator v{} - Holographic Query",
25            env!("CARGO_PKG_VERSION")
26        );
27        println!("=================================");
28    }
29
30    let engram_data = EmbrFS::load_engram(&engram)?;
31
32    let mut query_file = File::open(&query)?;
33    let mut query_data = Vec::new();
34    query_file.read_to_end(&mut query_data)?;
35
36    // Chunks are encoded with a path-hash bucket shift; when querying we don't know the
37    // original path, so sweep possible buckets (bounded by config.max_path_depth).
38    let config = ReversibleVSAConfig::default();
39    let base_query = SparseVec::encode_data(&query_data, &config, None);
40
41    // Build the codebook index once and reuse it across the sweep.
42    let codebook_index = engram_data.build_codebook_index();
43
44    let mut best_similarity = f64::MIN;
45    let mut best_shift = 0usize;
46    let mut best_top_cosine = f64::MIN;
47
48    // Merge matches across shifts; keep the best score per chunk.
49    let mut merged: HashMap<usize, (f64, i32)> = HashMap::new();
50
51    // Optionally merge hierarchical hits too.
52    let mut merged_hier: HashMap<(String, usize), (f64, i32)> = HashMap::new();
53
54    let hierarchical_loaded = if let (Some(hier_path), Some(_)) =
55        (hierarchical_manifest.as_ref(), sub_engrams_dir.as_ref())
56    {
57        Some(load_hierarchical_manifest(hier_path)?)
58    } else {
59        None
60    };
61
62    // Increase per-bucket cutoff so global top-k merge is less likely to miss true winners.
63    let k_sweep = (k.saturating_mul(10)).max(100);
64    let candidate_k = (k_sweep.saturating_mul(10)).max(200);
65
66    for depth in 0..config.max_path_depth.max(1) {
67        let shift = depth * config.base_shift;
68        let query_vec = base_query.permute(shift);
69
70        let similarity = query_vec.cosine(&engram_data.root);
71        if similarity > best_similarity {
72            best_similarity = similarity;
73            best_shift = shift;
74        }
75
76        let matches = engram_data.query_codebook_with_index(
77            &codebook_index,
78            &query_vec,
79            candidate_k,
80            k_sweep,
81        );
82
83        if let Some(top) = matches.first() {
84            if top.cosine > best_top_cosine {
85                best_top_cosine = top.cosine;
86                best_shift = shift;
87                best_similarity = similarity;
88            }
89        }
90
91        for m in matches {
92            let entry = merged.entry(m.id).or_insert((m.cosine, m.approx_score));
93            if m.cosine > entry.0 {
94                *entry = (m.cosine, m.approx_score);
95            }
96        }
97    }
98
99    // Hierarchical query can be expensive (sub-engram loads + per-node indexing).
100    // Run it once using the best shift from the sweep.
101    if let (Some(hierarchical), Some(sub_dir)) =
102        (hierarchical_loaded.as_ref(), sub_engrams_dir.as_ref())
103    {
104        let store = DirectorySubEngramStore::new(sub_dir);
105        let bounds = HierarchicalQueryBounds {
106            k,
107            ..HierarchicalQueryBounds::default()
108        };
109        let query_vec = base_query.permute(best_shift);
110        let hier_hits = query_hierarchical_codebook_with_store(
111            hierarchical,
112            &store,
113            &engram_data.codebook,
114            &query_vec,
115            &bounds,
116        );
117        for h in hier_hits {
118            let key = (h.sub_engram_id, h.chunk_id);
119            let entry = merged_hier
120                .entry(key)
121                .or_insert((h.cosine, h.approx_score));
122            if h.cosine > entry.0 {
123                *entry = (h.cosine, h.approx_score);
124            }
125        }
126    }
127
128    println!("Query file: {}", query.display());
129    if verbose {
130        println!(
131            "Best bucket-shift: {} (buckets 0..{})",
132            best_shift,
133            config.max_path_depth.saturating_sub(1)
134        );
135    }
136    println!("Similarity to engram: {:.4}", best_similarity);
137
138    let mut top_matches: Vec<(usize, f64, i32)> = merged
139        .into_iter()
140        .map(|(id, (cosine, approx))| (id, cosine, approx))
141        .collect();
142    top_matches.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
143    top_matches.truncate(k);
144
145    if !top_matches.is_empty() {
146        println!("Top codebook matches:");
147        for (id, cosine, approx) in top_matches {
148            println!("  chunk {}  cosine {:.4}  approx_dot {}", id, cosine, approx);
149        }
150    } else if verbose {
151        println!("Top codebook matches: (none)");
152    }
153
154    let mut top_hier: Vec<(String, usize, f64, i32)> = merged_hier
155        .into_iter()
156        .map(|((sub_id, chunk_id), (cosine, approx))| (sub_id, chunk_id, cosine, approx))
157        .collect();
158    top_hier.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
159    top_hier.truncate(k);
160
161    if !top_hier.is_empty() {
162        println!("Top hierarchical matches:");
163        for (sub_id, chunk_id, cosine, approx) in top_hier {
164            println!(
165                "  sub {}  chunk {}  cosine {:.4}  approx_dot {}",
166                sub_id, chunk_id, cosine, approx
167            );
168        }
169    } else if verbose && hierarchical_manifest.is_some() {
170        println!("Top hierarchical matches: (none)");
171    }
172
173    if best_similarity > 0.75 {
174        println!("Status: STRONG MATCH");
175    } else if best_similarity > 0.3 {
176        println!("Status: Partial match");
177    } else {
178        println!("Status: No significant match");
179    }
180
181    Ok(())
182}
183
184pub fn handle_query_text(
185    engram: PathBuf,
186    text: String,
187    hierarchical_manifest: Option<PathBuf>,
188    sub_engrams_dir: Option<PathBuf>,
189    k: usize,
190    verbose: bool,
191) -> Result<()> {
192    if verbose {
193        println!(
194            "Embeddenator v{} - Holographic Query (Text)",
195            env!("CARGO_PKG_VERSION")
196        );
197        println!("========================================");
198    }
199
200    let engram_data = EmbrFS::load_engram(&engram)?;
201
202    let config = ReversibleVSAConfig::default();
203    let base_query = SparseVec::encode_data(text.as_bytes(), &config, None);
204
205    let codebook_index = engram_data.build_codebook_index();
206
207    let mut best_similarity = f64::MIN;
208    let mut best_shift = 0usize;
209    let mut best_top_cosine = f64::MIN;
210
211    let mut merged: HashMap<usize, (f64, i32)> = HashMap::new();
212    let mut merged_hier: HashMap<(String, usize), (f64, i32)> = HashMap::new();
213
214    let hierarchical_loaded = if let (Some(hier_path), Some(_)) =
215        (hierarchical_manifest.as_ref(), sub_engrams_dir.as_ref())
216    {
217        Some(load_hierarchical_manifest(hier_path)?)
218    } else {
219        None
220    };
221
222    let k_sweep = (k.saturating_mul(10)).max(100);
223    let candidate_k = (k_sweep.saturating_mul(10)).max(200);
224
225    for depth in 0..config.max_path_depth.max(1) {
226        let shift = depth * config.base_shift;
227        let query_vec = base_query.permute(shift);
228
229        let similarity = query_vec.cosine(&engram_data.root);
230        if similarity > best_similarity {
231            best_similarity = similarity;
232            best_shift = shift;
233        }
234
235        let matches = engram_data.query_codebook_with_index(
236            &codebook_index,
237            &query_vec,
238            candidate_k,
239            k_sweep,
240        );
241
242        if let Some(top) = matches.first() {
243            if top.cosine > best_top_cosine {
244                best_top_cosine = top.cosine;
245                best_shift = shift;
246                best_similarity = similarity;
247            }
248        }
249
250        for m in matches {
251            let entry = merged.entry(m.id).or_insert((m.cosine, m.approx_score));
252            if m.cosine > entry.0 {
253                *entry = (m.cosine, m.approx_score);
254            }
255        }
256    }
257
258    if let (Some(hierarchical), Some(sub_dir)) =
259        (hierarchical_loaded.as_ref(), sub_engrams_dir.as_ref())
260    {
261        let store = DirectorySubEngramStore::new(sub_dir);
262        let bounds = HierarchicalQueryBounds {
263            k,
264            ..HierarchicalQueryBounds::default()
265        };
266        let query_vec = base_query.permute(best_shift);
267        let hier_hits = query_hierarchical_codebook_with_store(
268            hierarchical,
269            &store,
270            &engram_data.codebook,
271            &query_vec,
272            &bounds,
273        );
274        for h in hier_hits {
275            let key = (h.sub_engram_id, h.chunk_id);
276            let entry = merged_hier
277                .entry(key)
278                .or_insert((h.cosine, h.approx_score));
279            if h.cosine > entry.0 {
280                *entry = (h.cosine, h.approx_score);
281            }
282        }
283    }
284
285    println!("Query text: {}", text);
286    if verbose {
287        println!(
288            "Best bucket-shift: {} (buckets 0..{})",
289            best_shift,
290            config.max_path_depth.saturating_sub(1)
291        );
292    }
293    println!("Similarity to engram: {:.4}", best_similarity);
294
295    let mut top_matches: Vec<(usize, f64, i32)> = merged
296        .into_iter()
297        .map(|(id, (cosine, approx))| (id, cosine, approx))
298        .collect();
299    top_matches.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
300    top_matches.truncate(k);
301
302    if !top_matches.is_empty() {
303        println!("Top codebook matches:");
304        for (id, cosine, approx) in top_matches {
305            println!("  chunk {}  cosine {:.4}  approx_dot {}", id, cosine, approx);
306        }
307    } else if verbose {
308        println!("Top codebook matches: (none)");
309    }
310
311    let mut top_hier: Vec<(String, usize, f64, i32)> = merged_hier
312        .into_iter()
313        .map(|((sub_id, chunk_id), (cosine, approx))| (sub_id, chunk_id, cosine, approx))
314        .collect();
315    top_hier.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
316    top_hier.truncate(k);
317
318    if !top_hier.is_empty() {
319        println!("Top hierarchical matches:");
320        for (sub_id, chunk_id, cosine, approx) in top_hier {
321            println!(
322                "  sub {}  chunk {}  cosine {:.4}  approx_dot {}",
323                sub_id, chunk_id, cosine, approx
324            );
325        }
326    } else if verbose && hierarchical_manifest.is_some() {
327        println!("Top hierarchical matches: (none)");
328    }
329
330    Ok(())
331}