Skip to main content

cmdhub_cli/
db.rs

1use crate::config::get_data_dir;
2use anyhow::{Context, Result};
3use cmdhub_shared::{
4    AciCommandContract, DbAciRecord, CREATE_APPS_FTS_TABLE, CREATE_APPS_TABLE,
5    CREATE_ARGUMENTS_TABLE, CREATE_COMMANDS_VEC_TABLE,
6};
7use rusqlite::Connection;
8use std::path::PathBuf;
9
10pub fn resolve_db_path() -> PathBuf {
11    get_data_dir().join("cmdhub.db")
12}
13
14/// Compact starter database (top ~3000 commands by popularity) embedded into the
15/// binary so a fresh install can search common commands offline with zero network.
16/// `cmdh update` overlays the full 103k+ registry from the CDN. Generated by
17/// `cmdh-build-db` over a top-N subset; compressed with zstd.
18static EMBEDDED_STARTER_DB_ZST: &[u8] = include_bytes!("../assets/starter.db.zst");
19
20/// Whether the DB at `path` should be seeded from the embedded starter: true only
21/// for a missing file or a valid-but-empty (schema-only) DB. A file that exists but
22/// is unreadable/corrupt returns false — we must NOT mask corruption by overwriting
23/// it; that case is handled by the open/init recovery path (`update --force`).
24fn db_is_empty(path: &std::path::Path) -> bool {
25    if !path.exists() {
26        return true;
27    }
28    match Connection::open(path) {
29        Ok(c) => match c.query_row("SELECT count(*) FROM apps", [], |r| r.get::<_, i64>(0)) {
30            Ok(n) => n == 0,      // valid schema, no rows → seed
31            Err(_) => false,      // corrupt / unexpected schema → don't overwrite
32        },
33        Err(_) => false,          // can't even open → leave it for recovery
34    }
35}
36
37/// If no usable DB exists yet (fresh install, before `cmdh update`), seed it from
38/// the embedded starter DB so the very first `cmdh search` returns results offline.
39/// Never overwrites a populated DB (e.g. after `cmdh update`). Call before open_db.
40pub fn hydrate_starter_if_empty() -> Result<()> {
41    // Escape hatch for tests/CI that intentionally want an empty DB (boundary
42    // input tests, OOD gating) without the embedded starter masking results.
43    if std::env::var_os("CMDH_NO_STARTER").is_some() {
44        return Ok(());
45    }
46    let db_path = resolve_db_path();
47    if !db_is_empty(&db_path) {
48        return Ok(());
49    }
50    if let Some(parent) = db_path.parent() {
51        std::fs::create_dir_all(parent)
52            .context("Failed to create database parent directories")?;
53    }
54    // Clear any stale WAL/SHM so the replacement file isn't shadowed by old journals.
55    for ext in ["-wal", "-shm"] {
56        let p = db_path.with_extension(format!("db{ext}"));
57        let _ = std::fs::remove_file(&p);
58    }
59    let decompressed = zstd::decode_all(EMBEDDED_STARTER_DB_ZST)
60        .context("Failed to decompress embedded starter database")?;
61    std::fs::write(&db_path, &decompressed)
62        .context("Failed to write embedded starter database")?;
63    eprintln!(
64        "Seeded local registry from the built-in starter set. Run `cmdh update` for the full catalog."
65    );
66    Ok(())
67}
68
69pub fn open_db() -> Result<Connection> {
70    let db_path = resolve_db_path();
71    if let Some(parent) = db_path.parent() {
72        std::fs::create_dir_all(parent).context("Failed to create database parent directories")?;
73    }
74
75    unsafe {
76        type SqliteVecInitFn = unsafe extern "C" fn();
77        let init_fn: SqliteVecInitFn = sqlite_vec::sqlite3_vec_init;
78        #[allow(clippy::missing_transmute_annotations)]
79        let _ = rusqlite::ffi::sqlite3_auto_extension(Some(std::mem::transmute(init_fn)));
80    }
81
82    let conn = Connection::open(&db_path).context("Failed to open SQLite database file")?;
83    let _ = conn.execute("PRAGMA journal_mode = WAL;", []);
84    let _ = conn.execute("PRAGMA synchronous = NORMAL;", []);
85    let _ = conn.execute("PRAGMA foreign_keys = ON;", []);
86    Ok(conn)
87}
88
89pub fn init_db(conn: &Connection) -> Result<()> {
90    conn.execute(CREATE_APPS_TABLE, [])
91        .context("Failed to create apps table")?;
92    conn.execute(CREATE_ARGUMENTS_TABLE, [])
93        .context("Failed to create arguments table")?;
94    conn.execute(CREATE_APPS_FTS_TABLE, [])
95        .context("Failed to create apps_fts table")?;
96
97    // Commands vector table may fail to create if sqlite-vec is not fully supported or active
98    if let Err(e) = conn.execute(CREATE_COMMANDS_VEC_TABLE, []) {
99        eprintln!("Warning: Failed to initialize sqlite-vec commands_vec table: {}. Falling back to FTS5 search.", e);
100    }
101
102    conn.execute(
103        "CREATE TABLE IF NOT EXISTS sync_meta (
104            key TEXT PRIMARY KEY,
105            value TEXT NOT NULL
106        );",
107        [],
108    )
109    .context("Failed to create sync_meta table")?;
110
111    Ok(())
112}
113
114/// Domain concept → concrete terms the tools actually use. Bridges abstract
115/// natural-language queries to real command names (cloud CLIs say "vpc"/"subnet",
116/// not "networking"). Only applied to the OR-fallback query, so it never breaks
117/// precise AND matches — it just widens recall for vague intent queries.
118fn concept_synonyms(token: &str) -> &'static [&'static str] {
119    match token {
120        "networking" | "network" => &["vpc", "subnet", "gateway", "route", "firewall"],
121        "firewall" => &["security", "firewall", "acl"],
122        "storage" => &["bucket", "volume", "disk", "blob"],
123        "database" | "db" => &["database", "sql", "table", "rds"],
124        "serverless" => &["lambda", "function", "faas"],
125        "container" | "containers" => &["container", "image", "pod"],
126        "kubernetes" | "k8s" => &["pod", "deployment", "namespace", "cluster"],
127        "secret" | "secrets" => &["secret", "credential", "key", "vault"],
128        "dns" => &["dns", "domain", "record", "zone"],
129        // Intent verbs: user says "delete" but tools say "remove/unlink", etc.
130        "delete" | "erase" => &["remove", "unlink", "trash"],
131        "remove" => &["delete", "unlink"],
132        // Cleanup intent: "clear/clean unused images" must reach `prune`-style commands.
133        "clear" | "clean" | "cleanup" | "purge" => &["prune", "remove", "rm", "delete", "unused"],
134        "prune" => &["clean", "remove", "delete", "unused"],
135        "view" | "read" => &["show", "display"],
136        "deploy" | "deployment" => &["apply", "install"],
137        "history" => &["log", "commits"],
138        "cat" => &["bat", "less", "pager"],
139        "fuzzy" => &["fzf", "skim", "finder"],
140        "finder" => &["find", "fd"],
141        "download" => &["curl", "wget"],
142        "diff" => &["delta", "difft"],
143        "grep" => &["ripgrep", "rg"],
144        _ => &[],
145    }
146}
147
148/// Concept word → CANONICAL TOOL NAMES only. A strict subset of `concept_synonyms` safe to
149/// OR onto the AND-candidate (Stage-1 widening): every value is a specific binary name, so
150/// appending it surfaces the canonical tool a literal query misses (fzf for "fuzzy", rg for
151/// "grep") without flooding the candidate pool the way generic concept words do. Keep this
152/// list tool-names-only; never add generic verbs/nouns here (use `concept_synonyms` for those).
153fn tool_alias_synonyms(token: &str) -> &'static [&'static str] {
154    match token {
155        "fuzzy" => &["fzf", "skim"],
156        "finder" => &["fzf", "fd"],
157        "download" => &["curl", "wget", "aria2"],
158        "diff" => &["delta", "difft"],
159        "grep" => &["ripgrep", "rg"],
160        _ => &[],
161    }
162}
163
164fn preprocess_query(query: &str, use_and: bool) -> String {
165    let stop_words: std::collections::HashSet<&str> = [
166        "how", "to", "a", "the", "on", "in", "of", "for", "with", "an", "is", "at", "by", "and",
167        "or", "from", "my", "your", "our", "me", "us",
168    ]
169    .iter()
170    .cloned()
171    .collect();
172
173    let base: Vec<String> = query
174        .split(|c: char| !c.is_alphanumeric() && c != '_')
175        .filter(|w| !w.is_empty())
176        .map(|w| w.to_lowercase())
177        .filter(|w| !stop_words.contains(w.as_str()))
178        .collect();
179
180    let mut terms: Vec<String> = base.iter().map(|w| format!("{}*", w)).collect();
181
182    // OR-query only: widen with domain synonyms so e.g. "configure networking" also
183    // matches vpc/subnet/gateway commands. AND-query stays strict (exact intent).
184    if !use_and {
185        let mut seen: std::collections::HashSet<String> = base.iter().cloned().collect();
186        for w in &base {
187            for syn in concept_synonyms(w) {
188                if seen.insert((*syn).to_string()) {
189                    terms.push(format!("{}*", syn));
190                }
191            }
192        }
193    }
194
195    if terms.is_empty() {
196        "*".to_string()
197    } else if use_and {
198        terms.join(" ")
199    } else {
200        terms.join(" OR ")
201    }
202}
203
204/// Extract the embedding dimension declared in the commands_vec DDL, e.g. `float[384]` → 384.
205fn detect_vec_dim(conn: &Connection) -> Option<usize> {
206    let sql: String = conn
207        .query_row(
208            "SELECT sql FROM sqlite_master WHERE type='table' AND name='commands_vec'",
209            [],
210            |row| row.get(0),
211        )
212        .ok()?;
213    let pos = sql.find("float[")?;
214    let rest = &sql[pos + 6..];
215    let end = rest.find(']')?;
216    rest[..end].parse().ok()
217}
218
219/// SQL expression for the provenance column, tolerant of pre-provenance databases.
220/// A new client must keep working against an old DB (schema v1): when the column is
221/// absent we select the literal 'inferred' (unverified until proven) instead.
222pub(crate) fn provenance_expr(conn: &Connection) -> &'static str {
223    let has = conn
224        .query_row(
225            "SELECT 1 FROM pragma_table_info('arguments') WHERE name = 'provenance'",
226            [],
227            |_| Ok(()),
228        )
229        .is_ok();
230    if has {
231        "arg.provenance"
232    } else {
233        "'inferred'"
234    }
235}
236
237pub fn calculate_confidence(lowest_dist: f32, and_match: bool) -> String {
238    let hard = 0.82;
239    let soft = 0.76;
240    if lowest_dist > hard && !and_match {
241        "none".to_string()
242    } else if (soft < lowest_dist && lowest_dist <= hard) || (lowest_dist > hard && and_match) {
243        "low".to_string()
244    } else {
245        "high".to_string()
246    }
247}
248
249pub fn search_cascading(
250    conn: &Connection,
251    query: &str,
252    query_vector: Option<&[f32]>,
253    limit: usize,
254    enable_vector: bool,
255) -> Result<Vec<AciCommandContract>> {
256    let cleaned_query = crate::robustness::preprocess_robustness(query);
257    let and_query = preprocess_query(&cleaned_query, true);
258    let or_query = preprocess_query(&cleaned_query, false);
259    let mut confidence = "high".to_string();
260    let prov = provenance_expr(conn);
261
262    // Adapt query vector to the DB's stored embedding dimension. The ONNX model outputs
263    // 384-dim vectors but an older database may still store float[512] (zero-padded).
264    // When there is a mismatch, pad (or truncate) the query vector so KNN succeeds.
265    // Once the database is rebuilt with 384-dim embeddings this branch is never taken.
266    #[allow(unused_assignments)]
267    let mut adapted_query_vector = None;
268    let query_vector: Option<&[f32]> = if enable_vector {
269        if let (Some(q), Some(db_dim)) = (query_vector, detect_vec_dim(conn)) {
270            if q.len() != db_dim {
271                let mut adapted = vec![0.0f32; db_dim];
272                let copy_len = q.len().min(db_dim);
273                adapted[..copy_len].copy_from_slice(&q[..copy_len]);
274                adapted_query_vector = Some(adapted);
275                adapted_query_vector.as_deref()
276            } else {
277                query_vector
278            }
279        } else {
280            query_vector
281        }
282    } else {
283        query_vector
284    };
285
286    // Compute vec_bytes once; reused for all KNN queries below.
287    let vec_bytes: Option<Vec<u8>> = if enable_vector {
288        query_vector.map(|q_vec| {
289            let mut bytes = Vec::with_capacity(q_vec.len() * 4);
290            for &val in q_vec {
291                bytes.extend_from_slice(&val.to_le_bytes());
292            }
293            bytes
294        })
295    } else {
296        None
297    };
298
299    let mut and_match = false;
300    if and_query != "*" {
301        if let Ok(count) = conn.query_row::<u64, _, _>(
302            "SELECT count(*) FROM apps_fts WHERE apps_fts MATCH :query",
303            rusqlite::named_params! { ":query": &and_query },
304            |row| row.get(0),
305        ) {
306            if count > 0 {
307                and_match = true;
308            }
309        }
310    }
311
312    let processed_query = if and_match {
313        let base_tokens: Vec<String> = cleaned_query
314            .split(|c: char| !c.is_alphanumeric() && c != '_')
315            .filter(|w| !w.is_empty())
316            .map(|w| w.to_lowercase())
317            .collect();
318
319        let mut syn_terms = Vec::new();
320        let mut seen = std::collections::HashSet::new();
321        for w in &base_tokens {
322            seen.insert(w.clone());
323        }
324
325        // Widen the AND candidate with TOOL-NAME synonyms only (fuzzy→fzf, grep→rg). These
326        // are specific binary names — OR-ing them in surfaces a canonical tool the literal
327        // AND query misses (fzf has no "file" token) WITHOUT flooding. Generic concept
328        // synonyms (kubernetes→namespace/cluster, view→show) are deliberately EXCLUDED here:
329        // OR-ing those on the AND path floods competitors and demotes the right tool
330        // (verified: it sank kubectl.logs below stern/kail for "kubernetes pod logs"). Those
331        // stay in the or-fallback path (concept_synonyms) for when there is no AND match.
332        for w in &base_tokens {
333            for syn in tool_alias_synonyms(w) {
334                let syn_str = (*syn).to_string();
335                if seen.insert(syn_str.clone()) {
336                    syn_terms.push(format!("{}*", syn_str));
337                }
338            }
339        }
340
341        if syn_terms.is_empty() {
342            and_query.clone()
343        } else {
344            format!("({}) OR {}", and_query, syn_terms.join(" OR "))
345        }
346    } else {
347        or_query.clone()
348    };
349    let cand_query = processed_query.clone();
350
351    // 1. Fast exact-match check (LOWER check for path/name)
352    let trimmed_query = query.trim().to_lowercase();
353    let mut exact_stmt = conn.prepare(&format!(
354        "SELECT \
355            arg.app_id, \
356            app.name, \
357            arg.cmd_path, \
358            arg.node_type, \
359            arg.description, \
360            arg.risk_level, \
361            arg.example_template, \
362            app.os_aliases, \
363            app.install_instructions, \
364            app.popularity, \
365            arg.docker_image, \
366            arg.script_url, \
367            arg.source_url, \
368            {prov} \
369        FROM arguments arg \
370        JOIN apps app ON arg.app_id = app.app_id \
371        WHERE LOWER(arg.cmd_path) = :query \
372           OR (LOWER(app.name) = :query AND arg.node_type = 'root') \
373        LIMIT :limit_num"
374    ))?;
375
376    let exact_rows = exact_stmt.query_map(
377        rusqlite::named_params! {
378            ":query": trimmed_query,
379            ":limit_num": limit,
380        },
381        |row| {
382            Ok(DbAciRecord {
383                app_id: row.get(0)?,
384                name: row.get(1)?,
385                cmd_path: row.get(2)?,
386                node_type: row.get(3)?,
387                description: row.get(4)?,
388                risk_level: row.get(5)?,
389                example_template: row.get(6)?,
390                os_aliases: row.get(7)?,
391                install_instructions: row.get(8)?,
392                popularity: row.get(9)?,
393                docker_image: row.get(10)?,
394                script_url: row.get(11)?,
395                source_url: row.get(12)?,
396                provenance: row.get(13)?,
397            })
398        },
399    )?;
400
401    let mut exact_results = Vec::new();
402    for record in exact_rows.flatten() {
403        if let Ok(contract) = AciCommandContract::try_from(record) {
404            exact_results.push(contract);
405        }
406    }
407
408    // 2. Stage 1 App Filter: Threshold check
409    // KNN query is isolated in a subquery first; the JOIN on arguments
410    // happens outside so SQLite can push the MATCH constraint correctly.
411    // vec0 uses L2 distance. For unit vectors (BGE-micro-v2):
412    //   cos_sim = 1 - L2_dist² / 2
413    //   cos_sim < 0.35 ↔ L2_dist > sqrt(2 * 0.65) ≈ 1.14
414    if let Some(ref vb) = vec_bytes {
415        let lowest_dist: f32 = conn
416            .query_row(
417                "SELECT v.distance \
418                 FROM ( \
419                     SELECT cmd_path, distance \
420                     FROM commands_vec \
421                     WHERE embedding MATCH :query_vector AND k = 100 \
422                 ) v \
423                 JOIN arguments arg ON v.cmd_path = arg.cmd_path \
424                 ORDER BY v.distance ASC \
425                 LIMIT 1",
426                rusqlite::named_params! { ":query_vector": vb },
427                |row| row.get(0),
428            )
429            .unwrap_or(f32::MAX);
430
431        confidence = calculate_confidence(lowest_dist, and_match);
432    }
433
434    // Stage 1: select top 5 app_ids by FTS-RRF + vector-RRF + a popularity prior added by
435    // VALUE (apps.popularity in [0,1], the cross-ecosystem repo-count from the Repology
436    // dump), with name dedup. This lifts the canonical tool for brand/concept words (az for
437    // "azure", kubectl for "kubernetes") even when its name/path only weakly matches, while
438    // staying gentle on multi-token task queries (see pop_w below). No hardcoded vendor
439    // list; new sources just need their popularity column filled. The FTS candidate limit
440    // is widened (300) so a canonical-but-weak-match tool still enters the pool.
441    // BM25 weights: cmd_path=0 (unindexed), name=5.0, capabilities=2.0.
442    // Raising capabilities weight helps description-based queries (e.g. "knowledge" → obsidian).
443    //
444    // Popularity prior (apps.popularity in [0,1], desaturated CAP=100 so values are distinct).
445    // Popularity form is gated by query type. A single bare token is a brand lookup
446    // ("azure") where no tool is strongly relevant: use a LINEAR, strong weight so the
447    // canonical tool (azure-cli 0.85) clearly beats a namesake (opensearch-azure 0.39).
448    // A multi-token query is a task description where one tool is usually strongly
449    // relevant: use a CUBED, gentle weight so only near-canonical tools (rm/git/bat ~1.0)
450    // get lifted for generic tasks, never burying a correct niche tool (vectomancy 0.24,
451    // 0.24^3 ≈ 0.014). Tuned on scripts/eval_golden.py.
452    let qtok_n = content_tokens(query).len();
453    let (pw_lin, pw_cube): (f64, f64) = if qtok_n <= 1 {
454        (0.05, 0.0)
455    } else {
456        (0.0, 0.015)
457    };
458    // cold_floor scales a low-popularity tool's FTS-RRF contribution down. DISABLED by
459    // default (1.0 = no-op): a penalty <1.0 is data-fragile — Repology popularity is
460    // unreliable for core tools (chmod/tar drop to ~0.15 when their TLDR install JSON is
461    // NULL), so penalizing low-pop name matches demotes the REAL canonical tool (verified:
462    // 0.85 regressed "extract tar.gz" — tar sank below archive-cli). Kept as an env knob
463    // for offline calibration only; never ship < 1.0 without re-clearing the golden gate.
464    let cold_floor = std::env::var("CMDH_COLD_FLOOR")
465        .ok()
466        .and_then(|s| s.parse::<f64>().ok())
467        .unwrap_or(1.0);
468    let mut top_apps = Vec::new();
469    if let Some(ref vb) = vec_bytes {
470        let mut app_stmt = conn.prepare(
471            "WITH fts_matched AS ( \
472                SELECT cmd_path, row_number() OVER (ORDER BY bm25(apps_fts, 0.0, 5.0, 2.0) ASC) as fts_pos \
473                FROM apps_fts WHERE apps_fts MATCH :query LIMIT 300 \
474            ), \
475            fts_ordered AS ( \
476                SELECT arg.app_id, MIN(m.fts_pos) as fts_pos \
477                FROM fts_matched m JOIN arguments arg ON m.cmd_path = arg.cmd_path \
478                GROUP BY arg.app_id \
479            ), \
480            vec_knn AS ( \
481                SELECT cmd_path, distance FROM commands_vec \
482                WHERE embedding MATCH :query_vector AND k = 200 \
483            ), \
484            vec_rank AS ( \
485                SELECT arg.app_id, row_number() OVER (ORDER BY vk.distance ASC) as vec_pos \
486                FROM vec_knn vk JOIN arguments arg ON vk.cmd_path = arg.cmd_path \
487                WHERE arg.node_type = 'root' \
488            ), \
489            pre_scored AS ( \
490                SELECT \
491                    COALESCE(fts.app_id, vec.app_id) as app_id, \
492                    fts.fts_pos as fts_pos, vec.vec_pos as vec_pos \
493                FROM (SELECT app_id FROM fts_ordered UNION SELECT app_id FROM vec_rank) u \
494                LEFT JOIN fts_ordered fts ON u.app_id = fts.app_id \
495                LEFT JOIN vec_rank vec ON u.app_id = vec.app_id \
496            ), \
497            pop_ranked AS ( \
498                SELECT ps.app_id, ps.fts_pos, ps.vec_pos, a.name as nm, \
499                       COALESCE(a.popularity, 0.0) as pop, \
500                       row_number() OVER (ORDER BY COALESCE(a.popularity, 0.0) DESC) as pop_pos \
501                FROM pre_scored ps JOIN apps a ON ps.app_id = a.app_id \
502            ), \
503            scored AS ( \
504                SELECT app_id, nm, \
505                       COALESCE((:cold_floor + (1.0 - :cold_floor) * pop) * 1.0 / (60.0 + fts_pos), 0.0) \
506                       + COALESCE(1.0 / (60.0 + vec_pos), 0.0) \
507                       + :pw_lin * pop + :pw_cube * pop * pop * pop as rrf_score \
508                FROM pop_ranked \
509            ), \
510            name_deduped AS ( \
511                SELECT app_id, rrf_score, \
512                       row_number() OVER (PARTITION BY nm ORDER BY rrf_score DESC) as rn \
513                FROM scored \
514            ) \
515            SELECT app_id FROM name_deduped WHERE rn = 1 ORDER BY rrf_score DESC LIMIT 5"
516        )?;
517
518        let app_rows = app_stmt.query_map(
519            rusqlite::named_params! {
520                ":query": &cand_query,
521                ":query_vector": vb,
522                ":pw_lin": pw_lin,
523                ":pw_cube": pw_cube,
524                ":cold_floor": cold_floor,
525            },
526            |row| row.get::<_, String>(0),
527        )?;
528
529        for app_id in app_rows.flatten() {
530            top_apps.push(app_id);
531        }
532    } else {
533        let mut app_stmt = conn.prepare(
534            "WITH fts_matched AS ( \
535                SELECT cmd_path, row_number() OVER (ORDER BY bm25(apps_fts, 0.0, 5.0, 2.0) ASC) as fts_pos \
536                FROM apps_fts WHERE apps_fts MATCH :query LIMIT 300 \
537            ), \
538            fts_ordered AS ( \
539                SELECT arg.app_id, MIN(m.fts_pos) as fts_pos \
540                FROM fts_matched m JOIN arguments arg ON m.cmd_path = arg.cmd_path \
541                GROUP BY arg.app_id \
542            ), \
543            pop_ranked AS ( \
544                SELECT ftso.app_id, ftso.fts_pos, a.name as nm, \
545                       COALESCE(a.popularity, 0.0) as pop, \
546                       row_number() OVER (ORDER BY COALESCE(a.popularity, 0.0) DESC) as pop_pos \
547                FROM fts_ordered ftso JOIN apps a ON ftso.app_id = a.app_id \
548            ), \
549            scored AS ( \
550                SELECT app_id, nm, \
551                       COALESCE((:cold_floor + (1.0 - :cold_floor) * pop) * 1.0 / (60.0 + fts_pos), 0.0) \
552                       + :pw_lin * pop + :pw_cube * pop * pop * pop as rrf_score \
553                FROM pop_ranked \
554            ), \
555            name_deduped AS ( \
556                SELECT app_id, rrf_score, \
557                       row_number() OVER (PARTITION BY nm ORDER BY rrf_score DESC) as rn \
558                FROM scored \
559            ) \
560            SELECT app_id FROM name_deduped WHERE rn = 1 ORDER BY rrf_score DESC LIMIT 5"
561        )?;
562
563        let app_rows = app_stmt.query_map(
564            rusqlite::named_params! {
565                ":query": &cand_query,
566                ":pw_lin": pw_lin,
567                ":pw_cube": pw_cube,
568                ":cold_floor": cold_floor,
569            },
570            |row| row.get::<_, String>(0),
571        )?;
572
573        for app_id in app_rows.flatten() {
574            top_apps.push(app_id);
575        }
576    }
577
578    if top_apps.is_empty() {
579        return Ok(exact_results);
580    }
581
582    // Guarantee: any app that has an FTS5 text match must appear in top_apps.
583    // The combined RRF can push a weak FTS5 match (e.g. obsidian for "knowledge")
584    // below the top-5 cutoff when vector-boosted github apps dominate.
585    // Fix: collect all FTS5-matching app_ids and inject any that are missing.
586    if processed_query != "*" {
587        let mut fts_only_stmt = conn.prepare(
588            "WITH fts_matched AS ( \
589                SELECT cmd_path FROM apps_fts WHERE apps_fts MATCH :query LIMIT 100 \
590            ) \
591            SELECT DISTINCT arg.app_id \
592            FROM fts_matched m JOIN arguments arg ON m.cmd_path = arg.cmd_path \
593            LIMIT 5",
594        )?;
595        let fts_app_rows = fts_only_stmt
596            .query_map(rusqlite::named_params! { ":query": &cand_query }, |row| {
597                row.get::<_, String>(0)
598            })?;
599        for app_id in fts_app_rows.flatten() {
600            if !top_apps.contains(&app_id) {
601                top_apps.push(app_id);
602            }
603        }
604        top_apps.truncate(8); // cap at 8 to keep Stage 2 bounded
605    }
606
607    // Pad top_apps to length 8 for Stage 2 named params
608    while top_apps.len() < 8 {
609        top_apps.push(top_apps[0].clone());
610    }
611
612    // Popularity prior for the selected apps, reused in the Stage-2 re-rank so the
613    // within-app command ordering also favours canonical tools (data-driven, no prefixes).
614    let mut pop_map: std::collections::HashMap<String, f64> = std::collections::HashMap::new();
615    {
616        let mut uniq: Vec<&String> = top_apps.iter().collect();
617        uniq.sort();
618        uniq.dedup();
619        let placeholders = uniq.iter().map(|_| "?").collect::<Vec<_>>().join(",");
620        if let Ok(mut pstmt) = conn.prepare(&format!(
621            "SELECT app_id, COALESCE(popularity, 0.0) FROM apps WHERE app_id IN ({placeholders})"
622        )) {
623            let params = rusqlite::params_from_iter(uniq.iter().map(|s| s.as_str()));
624            if let Ok(rows) = pstmt.query_map(params, |r| {
625                Ok((r.get::<_, String>(0)?, r.get::<_, f64>(1)?))
626            }) {
627                for kv in rows.flatten() {
628                    pop_map.insert(kv.0, kv.1);
629                }
630            }
631        }
632    }
633
634    // Stage 2: Scoped search.
635    // Over-fetch a candidate pool so the leaf-match re-ranker (below) has room to
636    // promote the command whose name most precisely matches the query intent.
637    let pool = std::cmp::max(limit, 30);
638    let mut results = Vec::new();
639    if let Some(ref vb) = vec_bytes {
640        let mut stmt = conn.prepare(&format!(
641            "WITH fts_rank AS ( \
642                SELECT cmd_path, row_number() OVER (ORDER BY bm25(apps_fts, 0.0, 10.0, 1.0) ASC) as fts_pos \
643                FROM apps_fts WHERE apps_fts MATCH :query \
644                LIMIT 100 \
645            ), \
646            vec_rank AS ( \
647                SELECT cmd_path, row_number() OVER (ORDER BY distance ASC) as vec_pos \
648                FROM commands_vec \
649                WHERE embedding MATCH :query_vector AND k = 100 \
650            ) \
651            SELECT \
652                arg.app_id, \
653                app.name, \
654                arg.cmd_path, \
655                arg.node_type, \
656                arg.description, \
657                arg.risk_level, \
658                arg.example_template, \
659                app.os_aliases, \
660                app.install_instructions, \
661                app.popularity, \
662                arg.docker_image, \
663                arg.script_url, \
664                arg.source_url, \
665                {prov} \
666            FROM arguments arg \
667            JOIN apps app ON arg.app_id = app.app_id \
668            LEFT JOIN fts_rank fts ON arg.cmd_path = fts.cmd_path \
669            LEFT JOIN vec_rank vec ON arg.cmd_path = vec.cmd_path \
670            WHERE (fts.cmd_path IS NOT NULL OR vec.cmd_path IS NOT NULL) \
671              AND arg.app_id IN (:app1, :app2, :app3, :app4, :app5, :app6, :app7, :app8) \
672            ORDER BY COALESCE(1.0 / (60.0 + fts.fts_pos), 0.0) + COALESCE(1.0 / (60.0 + vec.vec_pos), 0.0) DESC \
673            LIMIT :limit_num"
674        ))?;
675
676        let rows = stmt.query_map(
677            rusqlite::named_params! {
678                ":query": &processed_query,
679                ":query_vector": vb,
680                ":app1": &top_apps[0],
681                ":app2": &top_apps[1],
682                ":app3": &top_apps[2],
683                ":app4": &top_apps[3],
684                ":app5": &top_apps[4],
685                ":app6": &top_apps[5],
686                ":app7": &top_apps[6],
687                ":app8": &top_apps[7],
688                ":limit_num": pool,
689            },
690            |row| {
691                Ok(DbAciRecord {
692                    app_id: row.get(0)?,
693                    name: row.get(1)?,
694                    cmd_path: row.get(2)?,
695                    node_type: row.get(3)?,
696                    description: row.get(4)?,
697                    risk_level: row.get(5)?,
698                    example_template: row.get(6)?,
699                    os_aliases: row.get(7)?,
700                    install_instructions: row.get(8)?,
701                    popularity: row.get(9)?,
702                    docker_image: row.get(10)?,
703                    script_url: row.get(11)?,
704                    source_url: row.get(12)?,
705                    provenance: row.get(13)?,
706                })
707            },
708        )?;
709
710        for r in rows {
711            let record = r?;
712            if let Ok(contract) = AciCommandContract::try_from(record) {
713                results.push(contract);
714            }
715        }
716    } else {
717        let mut stmt = conn.prepare(&format!(
718            "SELECT \
719                arg.app_id, \
720                app.name, \
721                arg.cmd_path, \
722                arg.node_type, \
723                arg.description, \
724                arg.risk_level, \
725                arg.example_template, \
726                app.os_aliases, \
727                app.install_instructions, \
728                app.popularity, \
729                arg.docker_image, \
730                arg.script_url, \
731                arg.source_url, \
732                {prov} \
733            FROM arguments arg \
734            JOIN apps app ON arg.app_id = app.app_id \
735            JOIN apps_fts fts ON arg.cmd_path = fts.cmd_path \
736            WHERE apps_fts MATCH :query \
737              AND arg.app_id IN (:app1, :app2, :app3, :app4, :app5, :app6, :app7, :app8) \
738            ORDER BY bm25(apps_fts, 0.0, 5.0, 2.0) ASC \
739            LIMIT :limit_num"
740        ))?;
741
742        let rows = stmt.query_map(
743            rusqlite::named_params! {
744                ":query": &processed_query,
745                ":app1": &top_apps[0],
746                ":app2": &top_apps[1],
747                ":app3": &top_apps[2],
748                ":app4": &top_apps[3],
749                ":app5": &top_apps[4],
750                ":app6": &top_apps[5],
751                ":app7": &top_apps[6],
752                ":app8": &top_apps[7],
753                ":limit_num": pool,
754            },
755            |row| {
756                Ok(DbAciRecord {
757                    app_id: row.get(0)?,
758                    name: row.get(1)?,
759                    cmd_path: row.get(2)?,
760                    node_type: row.get(3)?,
761                    description: row.get(4)?,
762                    risk_level: row.get(5)?,
763                    example_template: row.get(6)?,
764                    os_aliases: row.get(7)?,
765                    install_instructions: row.get(8)?,
766                    popularity: row.get(9)?,
767                    docker_image: row.get(10)?,
768                    script_url: row.get(11)?,
769                    source_url: row.get(12)?,
770                    provenance: row.get(13)?,
771                })
772            },
773        )?;
774
775        for r in rows {
776            let record = r?;
777            if let Ok(contract) = AciCommandContract::try_from(record) {
778                results.push(contract);
779            }
780        }
781    }
782
783    // Path-match re-rank: blend the hybrid-RRF order with how precisely each command's
784    // path (service + leaf, e.g. "ec2 create-vpc") matches the query intent. This lifts
785    // `aws.ec2.create-vpc` above `aws.apigatewayv2.create-vpc-link` for "create a vpc on
786    // aws", and keeps the named service (ec2) in play — without discarding semantic
787    // (vector) ranking, since the RRF position still contributes.
788    let q_tokens = content_tokens(query);
789    // Expanded token set for PATH matching only: intent synonyms + singular forms, so
790    // "clear podman unused images" can match a `podman.image.prune` path (clear→prune,
791    // images→image). Gating decisions below still use the raw token count — they measure
792    // how specific the user's own query is, not the synonym expansion.
793    let q_path_tokens = expand_for_path_match(&q_tokens);
794    if !q_tokens.is_empty() && results.len() > 1 {
795        let n = results.len() as i32;
796        // Path-match disambiguation is decisive for action/resource queries (multiple
797        // tokens, e.g. "create a vpc on aws" → aws.ec2.create-vpc) but harmful for a bare
798        // brand/concept word (e.g. "azure"): there a niche tool with the word literally in
799        // its path (prowler.azure) would bury the canonical CLI (az), whose identity lives
800        // in its popularity/topics, not its command path. So weight path-match strongly only
801        // when the query is specific, and lean on the popularity prior for single tokens.
802        let path_w = if q_tokens.len() >= 2 { 4 } else { 1 };
803        // Popularity bonus mirrors the Stage-1 gating: decisive for a bare brand token,
804        // a gentle nudge for descriptive queries so it can't bury a correct niche tool.
805        let pop_bonus_w = if q_tokens.len() <= 1 { 15.0 } else { 3.0 };
806        let mut scored: Vec<(i32, usize, AciCommandContract)> = results
807            .drain(..)
808            .enumerate()
809            .map(|(i, c)| {
810                let rrf = n - i as i32; // higher = better original rank
811                let pop_bonus =
812                    (pop_bonus_w * pop_map.get(&c.app_id).copied().unwrap_or(0.0)) as i32;
813                // Strong boost for root commands on brand lookups so subcommands don't bury them
814                let root_bonus = if matches!(c.node_type, cmdhub_shared::NodeType::Root)
815                    && q_tokens.len() <= 1
816                {
817                    20
818                } else {
819                    0
820                };
821                // Provenance prior: a probe-verified contract gets a modest boost so it
822                // wins ties and corrects inversions against LLM-inferred fabrications,
823                // without burying a strongly-matching inferred result.
824                let verified_bonus = if c.verified { VERIFIED_BONUS } else { 0 };
825                let composite = rrf
826                    + path_w * path_match_score(&c.cmd_path, &q_path_tokens)
827                    + pop_bonus
828                    + root_bonus
829                    + verified_bonus;
830                (composite, i, c)
831            })
832            .collect();
833        // Sort by composite desc; original position as stable tiebreaker.
834        scored.sort_by(|a, b| b.0.cmp(&a.0).then(a.1.cmp(&b.1)));
835        results = scored.into_iter().map(|(_, _, c)| c).collect();
836    }
837
838    let mut final_results = exact_results.clone();
839    final_results.append(&mut results);
840
841    // Deduplicate by cmd_path (same command from multiple sources, e.g. org.archlinux.nb +
842    // org.tldr.nb), AND cap results per app so one tool's subcommands can't flood the list:
843    // a brand/concept query like "azure" must surface the Azure CLI, not 7 of prowler's
844    // "prowler azure ..." checks. Up to PER_APP_CAP keeps "aws ec2 create-vpc/create-subnet"
845    // working while leaving room for other tools.
846    const PER_APP_CAP: usize = 3;
847    let mut seen_paths = std::collections::HashSet::new();
848    let mut per_app: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
849    final_results.retain(|r| {
850        if !seen_paths.insert(r.cmd_path.clone()) {
851            return false;
852        }
853        let n = per_app.entry(r.app_id.clone()).or_insert(0);
854        *n += 1;
855        *n <= PER_APP_CAP
856    });
857
858    final_results.truncate(limit);
859    for r in &mut final_results {
860        r.confidence = confidence.clone();
861    }
862    Ok(final_results)
863}
864
865/// Provenance prior weight in the composite re-rank: equivalent to jumping a few RRF
866/// ranks. Modest by design — corrects probe-vs-inferred inversions without letting a
867/// weakly-matching verified row bury a strongly-matching inferred one.
868const VERIFIED_BONUS: i32 = 3;
869
870/// Expand query tokens for PATH matching: each token contributes itself, its singular
871/// form (images→image), and its intent synonyms (clear→prune). This lets the path
872/// re-ranker reward `podman.image.prune` for "clear podman unused images" even though
873/// no literal query word appears in the path. FTS recall gets the same widening via
874/// preprocess_query; this applies it to the path-match dimension too.
875fn expand_for_path_match(
876    tokens: &std::collections::HashSet<String>,
877) -> std::collections::HashSet<String> {
878    let mut out: std::collections::HashSet<String> = std::collections::HashSet::new();
879    for t in tokens {
880        out.insert(t.clone());
881        if let Some(stem) = t.strip_suffix('s') {
882            if stem.len() >= 3 {
883                out.insert(stem.to_string());
884            }
885        }
886        for syn in concept_synonyms(t) {
887            out.insert((*syn).to_string());
888        }
889    }
890    out
891}
892
893/// Lowercased content tokens of a query (alphanumerics, stop-words removed).
894fn content_tokens(query: &str) -> std::collections::HashSet<String> {
895    // "show" and "view" are natural-language verbs in intent queries ("show git commit history")
896    // that accidentally match command names (git.show). Treating them as stop words here prevents
897    // path_match from penalising git.log relative to git.show for "show git commit history".
898    let stop: std::collections::HashSet<&str> = [
899        "how", "to", "a", "the", "on", "in", "of", "for", "with", "an", "is", "at", "by", "and",
900        "or", "from", "my", "your", "our", "me", "us", "i", "want", "know", "using", "use", "do",
901        "can", "get", "please", "help", "show", "view",
902    ]
903    .iter()
904    .cloned()
905    .collect();
906    query
907        .split(|c: char| !c.is_alphanumeric() && c != '_')
908        .filter(|w| !w.is_empty())
909        .map(|w| w.to_lowercase())
910        .filter(|w| !stop.contains(w.as_str()))
911        .collect()
912}
913
914/// Score how well a command's path (service + leaf, excluding the binary) matches the
915/// query intent: reward overlap with query tokens, lightly penalise extra path tokens.
916/// e.g. for query {create,vpc,aws}: "aws.ec2.create-vpc" → ec2/create/vpc overlap 2,
917/// extra 1 (ec2) → 5; "aws.apigatewayv2.create-vpc-link" → overlap 2, extra 2 → 4.
918fn path_match_score(cmd_path: &str, q_tokens: &std::collections::HashSet<String>) -> i32 {
919    // Root commands (no subcommand hierarchy) cannot be disambiguated by path matching.
920    // Without this guard, hyphenated roots like "git-standup" would gain +8 for the "git"
921    // token appearing in a git-related query, unfairly burying subcommands like "git.log".
922    if !cmd_path.contains('.') {
923        return 0;
924    }
925    // Drop the first segment (the binary, e.g. "aws") — it's already how we got here.
926    let after_binary = cmd_path.split_once('.').map(|x| x.1).unwrap_or(cmd_path);
927    let tokens: Vec<String> = after_binary
928        .split(|c: char| !c.is_alphanumeric() && c != '_')
929        .filter(|w| !w.is_empty())
930        .map(|w| w.to_lowercase())
931        .collect();
932    if tokens.is_empty() {
933        return 0;
934    }
935    // A path token matches on its literal form or its singular (images ~ image), so
936    // plural/singular mismatches between query and path don't lose the overlap.
937    let overlap = tokens
938        .iter()
939        .filter(|t| {
940            q_tokens.contains(*t)
941                || t.strip_suffix('s')
942                    .is_some_and(|s| s.len() >= 3 && q_tokens.contains(s))
943        })
944        .count() as i32;
945    let extra = tokens.len() as i32 - overlap;
946    // Use max(0, ...) to avoid negative scores: unmatched path tokens should not
947    // actively penalise otherwise-good commands (e.g. "git.log" for "git commit history").
948    (3 * overlap - extra).max(0)
949}
950
951pub fn search_commands(
952    conn: &Connection,
953    query: &str,
954    query_vector: Option<&[f32]>,
955    limit: usize,
956) -> Result<Vec<AciCommandContract>> {
957    let mut has_vector_db = false;
958    if query_vector.is_some() {
959        if let Ok(count) = conn.query_row::<u64, _, _>(
960            "SELECT count(*) FROM sqlite_master WHERE type='table' AND name='commands_vec'",
961            [],
962            |row| row.get(0),
963        ) {
964            if count > 0 {
965                if let Ok(vec_count) =
966                    conn.query_row::<u64, _, _>("SELECT count(*) FROM commands_vec", [], |row| {
967                        row.get(0)
968                    })
969                {
970                    if vec_count > 0 {
971                        has_vector_db = true;
972                    }
973                }
974            }
975        }
976    }
977
978    search_cascading(conn, query, query_vector, limit, has_vector_db)
979}
980
981pub fn search_all(
982    conn: &Connection,
983    query: &str,
984    query_vector: Option<&[f32]>,
985    limit: usize,
986) -> Result<Vec<AciCommandContract>> {
987    let mut results = search_commands(conn, query, query_vector, limit)?;
988
989    let config_dir = crate::config::get_config_dir();
990    let skills_dir = config_dir.join("skills");
991    let local_skill = cmdhub_skills::LocalFileSkill::new(skills_dir);
992
993    let mut registry = cmdhub_skills::SkillRegistry::new();
994    registry.register(Box::new(local_skill));
995
996    if let Ok(mut skill_results) = registry.resolve(query) {
997        results.append(&mut skill_results);
998    }
999
1000    let mut seen = std::collections::HashSet::new();
1001    results.retain(|item| seen.insert(item.cmd_path.clone()));
1002    results.truncate(limit);
1003
1004    Ok(results)
1005}
1006
1007#[cfg(test)]
1008mod tests {
1009    use super::*;
1010
1011    #[test]
1012    fn test_calculate_confidence_mapping() {
1013        assert_eq!(calculate_confidence(0.70, false), "high");
1014        assert_eq!(calculate_confidence(0.75, true), "high");
1015        assert_eq!(calculate_confidence(0.78, false), "low");
1016        assert_eq!(calculate_confidence(0.82, false), "low");
1017        assert_eq!(calculate_confidence(0.85, true), "low");
1018        assert_eq!(calculate_confidence(0.83, false), "none");
1019        assert_eq!(calculate_confidence(0.90, false), "none");
1020    }
1021
1022    #[test]
1023    fn test_exact_match_priority() {
1024        let conn = Connection::open_in_memory().unwrap();
1025        init_db(&conn).unwrap();
1026
1027        conn.execute(
1028            "INSERT INTO apps (app_id, name, install_instructions) VALUES (?, ?, ?)",
1029            ("org.test.git", "git", "{}"),
1030        )
1031        .unwrap();
1032
1033        conn.execute(
1034            "INSERT INTO arguments (cmd_path, app_id, node_name, node_type, description, risk_level, example_template) \
1035             VALUES (?, ?, ?, ?, ?, ?, ?)",
1036            ("git", "org.test.git", "git", "root", "Git version control", "safe", "git"),
1037        )
1038        .unwrap();
1039
1040        conn.execute(
1041            "INSERT INTO apps_fts (cmd_path, name, capabilities) VALUES (?, ?, ?)",
1042            ("git", "git", "Git version control"),
1043        )
1044        .unwrap();
1045
1046        conn.execute(
1047            "INSERT INTO apps (app_id, name, install_instructions) VALUES (?, ?, ?)",
1048            ("org.test.gitleaks", "gitleaks", "{}"),
1049        )
1050        .unwrap();
1051
1052        conn.execute(
1053            "INSERT INTO arguments (cmd_path, app_id, node_name, node_type, description, risk_level, example_template) \
1054             VALUES (?, ?, ?, ?, ?, ?, ?)",
1055            ("gitleaks", "org.test.gitleaks", "gitleaks", "root", "Detect secrets in git", "safe", "gitleaks"),
1056        )
1057        .unwrap();
1058
1059        conn.execute(
1060            "INSERT INTO apps_fts (cmd_path, name, capabilities) VALUES (?, ?, ?)",
1061            ("gitleaks", "gitleaks", "Detect secrets in git"),
1062        )
1063        .unwrap();
1064
1065        let res = search_commands(&conn, "git", None, 10).unwrap();
1066        assert!(!res.is_empty());
1067        assert_eq!(res[0].cmd_path, "git");
1068    }
1069
1070    #[test]
1071    fn test_fts_fallback_and_or() {
1072        let conn = Connection::open_in_memory().unwrap();
1073        init_db(&conn).unwrap();
1074
1075        // Setup test apps and arguments
1076        conn.execute(
1077            "INSERT INTO apps (app_id, name, install_instructions) VALUES (?, ?, ?)",
1078            ("org.test.rm", "rm", "{}"),
1079        )
1080        .unwrap();
1081        conn.execute(
1082            "INSERT INTO arguments (cmd_path, app_id, node_name, node_type, description, risk_level, example_template) \
1083             VALUES (?, ?, ?, ?, ?, ?, ?)",
1084            ("rm", "org.test.rm", "rm", "root", "delete local files", "safe", "rm"),
1085        )
1086        .unwrap();
1087        conn.execute(
1088            "INSERT INTO apps_fts (cmd_path, name, capabilities) VALUES (?, ?, ?)",
1089            ("rm", "rm", "delete local files"),
1090        )
1091        .unwrap();
1092
1093        // 1. Search that matches AND exactly: "delete local files"
1094        let res = search_commands(&conn, "delete local files", None, 10).unwrap();
1095        assert!(!res.is_empty());
1096        assert_eq!(res[0].cmd_path, "rm");
1097
1098        // 2. Search that matches AND with stop words: "delete my local files"
1099        let res = search_commands(&conn, "delete my local files", None, 10).unwrap();
1100        assert!(!res.is_empty());
1101        assert_eq!(res[0].cmd_path, "rm");
1102
1103        // 3. Search that has no complete AND matches: "delete missing files" (FTS AND query = "delete* AND missing* AND files*")
1104        // It must fallback to OR and still match "rm" (matches delete, files)
1105        let res = search_commands(&conn, "delete missing files", None, 10).unwrap();
1106        assert!(!res.is_empty());
1107        assert_eq!(res[0].cmd_path, "rm");
1108    }
1109
1110    #[test]
1111    fn test_hybrid_search_knn_match() {
1112        unsafe {
1113            type SqliteVecInitFn = unsafe extern "C" fn();
1114            let init_fn: SqliteVecInitFn = sqlite_vec::sqlite3_vec_init;
1115            #[allow(clippy::missing_transmute_annotations)]
1116            let _ = rusqlite::ffi::sqlite3_auto_extension(Some(std::mem::transmute(init_fn)));
1117        }
1118
1119        let conn = Connection::open_in_memory().unwrap();
1120        init_db(&conn).unwrap();
1121
1122        conn.execute(
1123            "INSERT INTO apps (app_id, name, install_instructions) VALUES (?, ?, ?)",
1124            ("org.test.knn", "knn", "{}"),
1125        )
1126        .unwrap();
1127        conn.execute(
1128            "INSERT INTO arguments (cmd_path, app_id, node_name, node_type, description, risk_level, example_template) \
1129             VALUES (?, ?, ?, ?, ?, ?, ?)",
1130            ("knn", "org.test.knn", "knn", "root", "vector search helper", "safe", "knn"),
1131        )
1132        .unwrap();
1133        conn.execute(
1134            "INSERT INTO apps_fts (cmd_path, name, capabilities) VALUES (?, ?, ?)",
1135            ("knn", "knn", "vector search helper"),
1136        )
1137        .unwrap();
1138
1139        // Insert vector (float32[384], no zero-padding)
1140        let v = vec![0.1f32; 384];
1141        let mut v_bytes = Vec::with_capacity(384 * 4);
1142        for &val in &v {
1143            v_bytes.extend_from_slice(&val.to_le_bytes());
1144        }
1145
1146        conn.execute(
1147            "INSERT INTO commands_vec (cmd_path, embedding) VALUES (?, ?)",
1148            ("knn", v_bytes),
1149        )
1150        .unwrap();
1151
1152        // Search with query vector (384-dim)
1153        let query_vec = vec![0.12f32; 384];
1154        let res = search_commands(&conn, "missing_term", Some(&query_vec), 10).unwrap();
1155        assert!(!res.is_empty());
1156        assert_eq!(res[0].cmd_path, "knn");
1157    }
1158
1159    #[test]
1160    fn test_clear_maps_to_prune_synonyms() {
1161        assert!(concept_synonyms("clear").contains(&"prune"));
1162        assert!(concept_synonyms("clean").contains(&"prune"));
1163        assert!(concept_synonyms("purge").contains(&"prune"));
1164        assert!(concept_synonyms("prune").contains(&"unused"));
1165        assert!(concept_synonyms("fuzzy").contains(&"fzf"));
1166        assert!(concept_synonyms("finder").contains(&"fd"));
1167        assert!(concept_synonyms("download").contains(&"curl"));
1168        assert!(concept_synonyms("diff").contains(&"delta"));
1169        assert!(concept_synonyms("grep").contains(&"ripgrep"));
1170    }
1171
1172    #[test]
1173    fn test_tool_alias_synonyms_are_tool_names_only_not_generic_concepts() {
1174        // The AND-path widening uses ONLY tool-name aliases (specific binaries) so it
1175        // never floods candidates. Generic concept words must NOT appear here (they
1176        // regressed "kubernetes pod logs" by surfacing stern/kail) — they stay in
1177        // concept_synonyms for the or-fallback path only.
1178        assert!(tool_alias_synonyms("fuzzy").contains(&"fzf"));
1179        assert!(tool_alias_synonyms("grep").contains(&"rg"));
1180        assert!(tool_alias_synonyms("download").contains(&"curl"));
1181        // Generic concept tokens expand in concept_synonyms but NOT in the and-path map:
1182        assert!(tool_alias_synonyms("kubernetes").is_empty());
1183        assert!(tool_alias_synonyms("view").is_empty());
1184        assert!(tool_alias_synonyms("clear").is_empty());
1185    }
1186
1187    #[test]
1188    fn test_expand_for_path_match_adds_synonyms_and_singulars() {
1189        let tokens: std::collections::HashSet<String> =
1190            ["clear", "images"].iter().map(|s| s.to_string()).collect();
1191        let expanded = expand_for_path_match(&tokens);
1192        assert!(expanded.contains("prune")); // clear → prune
1193        assert!(expanded.contains("image")); // images → image
1194        assert!(expanded.contains("clear")); // originals kept
1195    }
1196
1197    #[test]
1198    fn test_old_schema_db_without_provenance_still_works() {
1199        // A pre-provenance (schema v1) database must not crash a new client and must
1200        // report verified=false for everything.
1201        let conn = Connection::open_in_memory().unwrap();
1202        conn.execute_batch(
1203            "CREATE TABLE apps (app_id TEXT PRIMARY KEY, name TEXT NOT NULL, os_aliases TEXT, \
1204             install_instructions TEXT, popularity REAL DEFAULT 0.0); \
1205             CREATE TABLE arguments (cmd_path TEXT PRIMARY KEY, app_id TEXT NOT NULL, \
1206             node_name TEXT NOT NULL, node_type TEXT NOT NULL, description TEXT NOT NULL, \
1207             risk_level TEXT NOT NULL, example_template TEXT, docker_image TEXT, \
1208             script_url TEXT, source_url TEXT); \
1209             CREATE VIRTUAL TABLE apps_fts USING fts5(cmd_path UNINDEXED, name, capabilities); \
1210             INSERT INTO apps (app_id, name) VALUES ('org.test.tar', 'tar'); \
1211             INSERT INTO arguments (cmd_path, app_id, node_name, node_type, description, risk_level) \
1212             VALUES ('tar', 'org.test.tar', 'tar', 'root', 'archive files', 'safe'); \
1213             INSERT INTO apps_fts (cmd_path, name, capabilities) VALUES ('tar', 'tar', 'archive files');",
1214        )
1215        .unwrap();
1216
1217        let res = search_commands(&conn, "tar", None, 5).unwrap();
1218        assert!(!res.is_empty());
1219        assert_eq!(res[0].cmd_path, "tar");
1220        assert!(!res[0].verified); // old DB → unverified by definition
1221    }
1222
1223    #[test]
1224    fn test_probe_verified_outranks_inferred_twin() {
1225        // Two equal-text commands from different sources; the probe-verified one must
1226        // rank first via the provenance prior, and `verified` must surface in output.
1227        let conn = Connection::open_in_memory().unwrap();
1228        init_db(&conn).unwrap();
1229        for (app, prov) in [
1230            ("org.inferred.tool", "inferred"),
1231            ("org.probed.tool", "probe"),
1232        ] {
1233            let name = if prov == "probe" { "toolp" } else { "tooli" };
1234            conn.execute(
1235                "INSERT INTO apps (app_id, name, install_instructions) VALUES (?, ?, '{}')",
1236                (app, name),
1237            )
1238            .unwrap();
1239            let path = format!("{}.image.prune", name);
1240            conn.execute(
1241                "INSERT INTO arguments (cmd_path, app_id, node_name, node_type, description, \
1242                 risk_level, provenance) VALUES (?, ?, 'prune', 'sub', \
1243                 'Remove unused container images to free disk space', 'dangerous', ?)",
1244                (&path, app, prov),
1245            )
1246            .unwrap();
1247            conn.execute(
1248                "INSERT INTO apps_fts (cmd_path, name, capabilities) VALUES (?, ?, \
1249                 'Remove unused container images to free disk space')",
1250                (&path, name),
1251            )
1252            .unwrap();
1253        }
1254
1255        let res = search_cascading(&conn, "clear unused images", None, 5, false).unwrap();
1256        assert!(res.len() >= 2, "expected both twins, got {}", res.len());
1257        assert!(res[0].verified, "probe-verified twin must rank first");
1258        assert!(res[0].cmd_path.starts_with("toolp"));
1259        assert!(!res[1].verified);
1260    }
1261}