Skip to main content

embeddenator_cli/commands/
query.rs

1//! Query command implementations
2
3use anyhow::Result;
4use embeddenator_fs::embrfs::{
5    load_hierarchical_manifest, query_hierarchical_codebook_with_store, DirectorySubEngramStore,
6    EmbrFS, HierarchicalQueryBounds,
7};
8use embeddenator_vsa::{ReversibleVSAConfig, SparseVec};
9use std::collections::HashMap;
10use std::fs::File;
11use std::io::Read;
12use std::path::PathBuf;
13
14pub fn handle_query(
15    engram: PathBuf,
16    query: PathBuf,
17    hierarchical_manifest: Option<PathBuf>,
18    sub_engrams_dir: Option<PathBuf>,
19    k: usize,
20    verbose: bool,
21) -> Result<()> {
22    if verbose {
23        println!(
24            "Embeddenator v{} - Holographic Query",
25            env!("CARGO_PKG_VERSION")
26        );
27        println!("=================================");
28    }
29
30    let engram_data = EmbrFS::load_engram(&engram)?;
31
32    let mut query_file = File::open(&query)?;
33    let mut query_data = Vec::new();
34    query_file.read_to_end(&mut query_data)?;
35
36    // Chunks are encoded with a path-hash bucket shift; when querying we don't know the
37    // original path, so sweep possible buckets (bounded by config.max_path_depth).
38    let config = ReversibleVSAConfig::default();
39    let base_query = SparseVec::encode_data(&query_data, &config, None);
40
41    // Build the codebook index once and reuse it across the sweep.
42    let codebook_index = engram_data.build_codebook_index();
43
44    let mut best_similarity = f64::MIN;
45    let mut best_shift = 0usize;
46    let mut best_top_cosine = f64::MIN;
47
48    // Merge matches across shifts; keep the best score per chunk.
49    let mut merged: HashMap<usize, (f64, i32)> = HashMap::new();
50
51    // Optionally merge hierarchical hits too.
52    let mut merged_hier: HashMap<(String, usize), (f64, i32)> = HashMap::new();
53
54    let hierarchical_loaded = if let (Some(hier_path), Some(_)) =
55        (hierarchical_manifest.as_ref(), sub_engrams_dir.as_ref())
56    {
57        Some(load_hierarchical_manifest(hier_path)?)
58    } else {
59        None
60    };
61
62    // Increase per-bucket cutoff so global top-k merge is less likely to miss true winners.
63    let k_sweep = (k.saturating_mul(10)).max(100);
64    let candidate_k = (k_sweep.saturating_mul(10)).max(200);
65
66    for depth in 0..config.max_path_depth.max(1) {
67        let shift = depth * config.base_shift;
68        let query_vec = base_query.permute(shift);
69
70        let similarity = query_vec.cosine(&engram_data.root);
71        if similarity > best_similarity {
72            best_similarity = similarity;
73            best_shift = shift;
74        }
75
76        let matches = engram_data.query_codebook_with_index(
77            &codebook_index,
78            &query_vec,
79            candidate_k,
80            k_sweep,
81        );
82
83        if let Some(top) = matches.first() {
84            if top.cosine > best_top_cosine {
85                best_top_cosine = top.cosine;
86                best_shift = shift;
87                best_similarity = similarity;
88            }
89        }
90
91        for m in matches {
92            let entry = merged.entry(m.id).or_insert((m.cosine, m.approx_score));
93            if m.cosine > entry.0 {
94                *entry = (m.cosine, m.approx_score);
95            }
96        }
97    }
98
99    // Hierarchical query can be expensive (sub-engram loads + per-node indexing).
100    // Run it once using the best shift from the sweep.
101    if let (Some(hierarchical), Some(sub_dir)) =
102        (hierarchical_loaded.as_ref(), sub_engrams_dir.as_ref())
103    {
104        let store = DirectorySubEngramStore::new(sub_dir);
105        let bounds = HierarchicalQueryBounds {
106            k,
107            ..HierarchicalQueryBounds::default()
108        };
109        let query_vec = base_query.permute(best_shift);
110        let hier_hits = query_hierarchical_codebook_with_store(
111            hierarchical,
112            &store,
113            &engram_data.codebook,
114            &query_vec,
115            &bounds,
116        );
117        for h in hier_hits {
118            let key = (h.sub_engram_id, h.chunk_id);
119            let entry = merged_hier.entry(key).or_insert((h.cosine, h.approx_score));
120            if h.cosine > entry.0 {
121                *entry = (h.cosine, h.approx_score);
122            }
123        }
124    }
125
126    println!("Query file: {}", query.display());
127    if verbose {
128        println!(
129            "Best bucket-shift: {} (buckets 0..{})",
130            best_shift,
131            config.max_path_depth.saturating_sub(1)
132        );
133    }
134    println!("Similarity to engram: {:.4}", best_similarity);
135
136    let mut top_matches: Vec<(usize, f64, i32)> = merged
137        .into_iter()
138        .map(|(id, (cosine, approx))| (id, cosine, approx))
139        .collect();
140    top_matches.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
141    top_matches.truncate(k);
142
143    if !top_matches.is_empty() {
144        println!("Top codebook matches:");
145        for (id, cosine, approx) in top_matches {
146            println!(
147                "  chunk {}  cosine {:.4}  approx_dot {}",
148                id, cosine, approx
149            );
150        }
151    } else if verbose {
152        println!("Top codebook matches: (none)");
153    }
154
155    let mut top_hier: Vec<(String, usize, f64, i32)> = merged_hier
156        .into_iter()
157        .map(|((sub_id, chunk_id), (cosine, approx))| (sub_id, chunk_id, cosine, approx))
158        .collect();
159    top_hier.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
160    top_hier.truncate(k);
161
162    if !top_hier.is_empty() {
163        println!("Top hierarchical matches:");
164        for (sub_id, chunk_id, cosine, approx) in top_hier {
165            println!(
166                "  sub {}  chunk {}  cosine {:.4}  approx_dot {}",
167                sub_id, chunk_id, cosine, approx
168            );
169        }
170    } else if verbose && hierarchical_manifest.is_some() {
171        println!("Top hierarchical matches: (none)");
172    }
173
174    if best_similarity > 0.75 {
175        println!("Status: STRONG MATCH");
176    } else if best_similarity > 0.3 {
177        println!("Status: Partial match");
178    } else {
179        println!("Status: No significant match");
180    }
181
182    Ok(())
183}
184
185pub fn handle_query_text(
186    engram: PathBuf,
187    text: String,
188    hierarchical_manifest: Option<PathBuf>,
189    sub_engrams_dir: Option<PathBuf>,
190    k: usize,
191    verbose: bool,
192) -> Result<()> {
193    if verbose {
194        println!(
195            "Embeddenator v{} - Holographic Query (Text)",
196            env!("CARGO_PKG_VERSION")
197        );
198        println!("========================================");
199    }
200
201    let engram_data = EmbrFS::load_engram(&engram)?;
202
203    let config = ReversibleVSAConfig::default();
204    let base_query = SparseVec::encode_data(text.as_bytes(), &config, None);
205
206    let codebook_index = engram_data.build_codebook_index();
207
208    let mut best_similarity = f64::MIN;
209    let mut best_shift = 0usize;
210    let mut best_top_cosine = f64::MIN;
211
212    let mut merged: HashMap<usize, (f64, i32)> = HashMap::new();
213    let mut merged_hier: HashMap<(String, usize), (f64, i32)> = HashMap::new();
214
215    let hierarchical_loaded = if let (Some(hier_path), Some(_)) =
216        (hierarchical_manifest.as_ref(), sub_engrams_dir.as_ref())
217    {
218        Some(load_hierarchical_manifest(hier_path)?)
219    } else {
220        None
221    };
222
223    let k_sweep = (k.saturating_mul(10)).max(100);
224    let candidate_k = (k_sweep.saturating_mul(10)).max(200);
225
226    for depth in 0..config.max_path_depth.max(1) {
227        let shift = depth * config.base_shift;
228        let query_vec = base_query.permute(shift);
229
230        let similarity = query_vec.cosine(&engram_data.root);
231        if similarity > best_similarity {
232            best_similarity = similarity;
233            best_shift = shift;
234        }
235
236        let matches = engram_data.query_codebook_with_index(
237            &codebook_index,
238            &query_vec,
239            candidate_k,
240            k_sweep,
241        );
242
243        if let Some(top) = matches.first() {
244            if top.cosine > best_top_cosine {
245                best_top_cosine = top.cosine;
246                best_shift = shift;
247                best_similarity = similarity;
248            }
249        }
250
251        for m in matches {
252            let entry = merged.entry(m.id).or_insert((m.cosine, m.approx_score));
253            if m.cosine > entry.0 {
254                *entry = (m.cosine, m.approx_score);
255            }
256        }
257    }
258
259    if let (Some(hierarchical), Some(sub_dir)) =
260        (hierarchical_loaded.as_ref(), sub_engrams_dir.as_ref())
261    {
262        let store = DirectorySubEngramStore::new(sub_dir);
263        let bounds = HierarchicalQueryBounds {
264            k,
265            ..HierarchicalQueryBounds::default()
266        };
267        let query_vec = base_query.permute(best_shift);
268        let hier_hits = query_hierarchical_codebook_with_store(
269            hierarchical,
270            &store,
271            &engram_data.codebook,
272            &query_vec,
273            &bounds,
274        );
275        for h in hier_hits {
276            let key = (h.sub_engram_id, h.chunk_id);
277            let entry = merged_hier.entry(key).or_insert((h.cosine, h.approx_score));
278            if h.cosine > entry.0 {
279                *entry = (h.cosine, h.approx_score);
280            }
281        }
282    }
283
284    println!("Query text: {}", text);
285    if verbose {
286        println!(
287            "Best bucket-shift: {} (buckets 0..{})",
288            best_shift,
289            config.max_path_depth.saturating_sub(1)
290        );
291    }
292    println!("Similarity to engram: {:.4}", best_similarity);
293
294    let mut top_matches: Vec<(usize, f64, i32)> = merged
295        .into_iter()
296        .map(|(id, (cosine, approx))| (id, cosine, approx))
297        .collect();
298    top_matches.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
299    top_matches.truncate(k);
300
301    if !top_matches.is_empty() {
302        println!("Top codebook matches:");
303        for (id, cosine, approx) in top_matches {
304            println!(
305                "  chunk {}  cosine {:.4}  approx_dot {}",
306                id, cosine, approx
307            );
308        }
309    } else if verbose {
310        println!("Top codebook matches: (none)");
311    }
312
313    let mut top_hier: Vec<(String, usize, f64, i32)> = merged_hier
314        .into_iter()
315        .map(|((sub_id, chunk_id), (cosine, approx))| (sub_id, chunk_id, cosine, approx))
316        .collect();
317    top_hier.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
318    top_hier.truncate(k);
319
320    if !top_hier.is_empty() {
321        println!("Top hierarchical matches:");
322        for (sub_id, chunk_id, cosine, approx) in top_hier {
323            println!(
324                "  sub {}  chunk {}  cosine {:.4}  approx_dot {}",
325                sub_id, chunk_id, cosine, approx
326            );
327        }
328    } else if verbose && hierarchical_manifest.is_some() {
329        println!("Top hierarchical matches: (none)");
330    }
331
332    Ok(())
333}