Skip to main content

cartog_db/store/
queries.rs

1//! Graph traversal queries: search, refs, callees, impact, hierarchy, deps, stats.
2//!
3//! Part of the [`Database`](super::Database) impl, split out of `lib.rs` for navigability.
4
5use super::*;
6
7/// One hop on a call path returned by [`Database::trace`].
8#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
9pub struct PathHop {
10    /// Id of the calling symbol — the exact source on this hop, so callers can
11    /// hydrate its body without re-resolving an ambiguous name.
12    pub source_id: String,
13    pub source_name: String,
14    pub target_name: String,
15    pub kind: EdgeKind,
16    pub file_path: String,
17    pub line: u32,
18}
19
20/// Internal call-edge record used during BFS in [`Database::trace`].
21#[derive(Debug, Clone)]
22struct CallHop {
23    source_id: String,
24    source_name: String,
25    target_name: String,
26    target_id: Option<String>,
27    kind: EdgeKind,
28    file_path: String,
29    line: u32,
30}
31
32impl Database {
33    // ── Queries ──
34
35    /// Search for symbols by name — case-insensitive, prefix match ranks before substring.
36    ///
37    /// `%` and `_` in `query` are treated as literals, not LIKE wildcards.
38    /// Note: `LOWER()` in SQLite is ASCII-only, which is acceptable for code identifiers.
39    /// Returns an error if `query` is empty or `limit` is zero.
40    pub fn search(
41        &self,
42        query: &str,
43        kind_filter: Option<SymbolKind>,
44        file_filter: Option<&str>,
45        limit: u32,
46    ) -> Result<Vec<Symbol>> {
47        anyhow::ensure!(!query.is_empty(), "search query cannot be empty");
48        anyhow::ensure!(limit > 0, "search limit must be at least 1");
49
50        // Escape LIKE special characters so query is matched literally.
51        let escaped = query
52            .replace('\\', "\\\\")
53            .replace('%', "\\%")
54            .replace('_', "\\_");
55        let kind_str = kind_filter.map(|k| k.as_str());
56        // Ranking: match_tier + kind_penalty.
57        //   match_tier: 0 = exact, 1 = prefix, 2 = substring
58        //   kind_penalty: definitions (function/method/class) = 0, variable = 3, import = 6
59        // Definitions always rank above variables/imports across all match tiers:
60        //   exact class=0, prefix function=1, substring method=2,
61        //   exact variable=3, prefix variable=4, substring variable=5,
62        //   exact import=6, ...
63        // Within the same rank score, secondary sort by kind (fn < method < class)
64        // then by file_path and start_line for determinism.
65        let mut stmt = self.conn.prepare(
66            "SELECT id, name, kind, file_path, start_line, end_line,
67                    start_byte, end_byte, parent_id, signature, visibility,
68                    is_async, docstring, in_degree, content_hash, subtree_hash,
69                    (CASE
70                       WHEN LOWER(name) = LOWER(?1)                    THEN 0
71                       WHEN LOWER(name) LIKE LOWER(?2) || '%' ESCAPE '\\' THEN 1
72                       ELSE                                                  2
73                     END) +
74                    (CASE kind
75                       WHEN 'function' THEN 0
76                       WHEN 'method'   THEN 0
77                       WHEN 'class'    THEN 0
78                       WHEN 'variable' THEN 3
79                       WHEN 'import'   THEN 6
80                       ELSE                 3
81                     END) AS rank
82             FROM symbols
83             WHERE LOWER(name) LIKE '%' || LOWER(?2) || '%' ESCAPE '\\'
84               AND (?3 IS NULL OR kind = ?3)
85               AND (?4 IS NULL OR file_path = ?4)
86             ORDER BY rank,
87                      in_degree DESC,
88                      CASE kind
89                        WHEN 'function' THEN 0
90                        WHEN 'method'   THEN 1
91                        WHEN 'class'    THEN 2
92                        ELSE                 3
93                      END,
94                      file_path, start_line
95             LIMIT ?5",
96        )?;
97        // ?1 = raw query (exact equality), ?2 = escaped query (LIKE patterns), ?3 = kind, ?4 = file, ?5 = limit
98        let rows = stmt
99            .query_map(
100                params![query, escaped, kind_str, file_filter, limit],
101                row_to_symbol,
102            )?
103            .collect::<std::result::Result<Vec<_>, _>>()?;
104        Ok(rows)
105    }
106
107    /// Outline: all symbols in a file, ordered by line.
108    pub fn outline(&self, file_path: &str) -> Result<Vec<Symbol>> {
109        let mut stmt = self.conn.prepare(
110            "SELECT id, name, kind, file_path, start_line, end_line, start_byte, end_byte,
111                    parent_id, signature, visibility, is_async, docstring, in_degree,
112                    content_hash, subtree_hash
113             FROM symbols WHERE file_path = ?1
114             ORDER BY start_line",
115        )?;
116        let rows = stmt
117            .query_map(params![file_path], row_to_symbol)?
118            .collect::<std::result::Result<Vec<_>, _>>()?;
119        Ok(rows)
120    }
121
122    /// Find what a symbol calls (edges originating from symbols matching the name).
123    pub fn callees(&self, name: &str) -> Result<Vec<Edge>> {
124        let mut stmt = self.conn.prepare(
125            "SELECT e.id, e.source_id, e.target_name, e.target_id, e.kind, e.file_path, e.line,
126                    e.resolution_source
127             FROM edges e
128             JOIN symbols s ON e.source_id = s.id
129             WHERE s.name = ?1 AND e.kind = 'calls'",
130        )?;
131        let rows = stmt
132            .query_map(params![name], row_to_edge)?
133            .collect::<std::result::Result<Vec<_>, _>>()?;
134        Ok(rows)
135    }
136
137    /// Resolved symbol ids that the symbol `source_id` calls. Only edges with a
138    /// resolved `target_id` are returned — keyed on the exact source id (not a
139    /// name), so an overloaded source resolves to the right callees.
140    pub fn callee_ids_of(&self, source_id: &str) -> Result<Vec<String>> {
141        let mut stmt = self.conn.prepare_cached(
142            "SELECT DISTINCT e.target_id FROM edges e
143             WHERE e.source_id = ?1 AND e.kind = 'calls' AND e.target_id IS NOT NULL",
144        )?;
145        let ids = stmt
146            .query_map(params![source_id], |row| row.get::<_, String>(0))?
147            .collect::<std::result::Result<Vec<_>, _>>()?;
148        Ok(ids)
149    }
150
151    /// Source symbol ids that call the symbol `target_id` (resolved incoming
152    /// `calls` edges). Keyed on the exact target id, so callers of one overload
153    /// aren't confused with another sharing its name.
154    pub fn caller_ids_of(&self, target_id: &str) -> Result<Vec<String>> {
155        let mut stmt = self.conn.prepare_cached(
156            "SELECT DISTINCT e.source_id FROM edges e
157             WHERE e.target_id = ?1 AND e.kind = 'calls'",
158        )?;
159        let ids = stmt
160            .query_map(params![target_id], |row| row.get::<_, String>(0))?
161            .collect::<std::result::Result<Vec<_>, _>>()?;
162        Ok(ids)
163    }
164
165    /// All references to a name, with the source symbol resolved.
166    /// Optionally filter by edge kind.
167    pub fn refs(
168        &self,
169        name: &str,
170        kind_filter: Option<EdgeKind>,
171    ) -> Result<Vec<(Edge, Option<Symbol>)>> {
172        // An edge references `name` if its literal target_name matches, OR its
173        // resolved target_id points at a symbol named `name`. The second arm is
174        // expressed as `target_id IN (SELECT id ... WHERE name = ?)` rather than
175        // a `LEFT JOIN symbols sym2 ... OR sym2.name = ?`: the OR-across-joined-
176        // tables forces SQLite to full-scan `edges`, whereas the subquery lets
177        // the planner pick a MULTI-INDEX OR over idx_edges_target +
178        // idx_edges_target_id (60-3500× faster on a real repo; the scan was a
179        // flat ~10ms regardless of input — even on a no-match). Cost is bounded
180        // by matched edges, not the popularity of `name`, so it stays flat as
181        // the repo grows. Both forms return the identical edge set (NULL
182        // target_id matches neither arm).
183        let map_row = |row: &rusqlite::Row<'_>| -> rusqlite::Result<(Edge, Option<Symbol>)> {
184            // Edge columns 1..=7 (id is column 0), source symbol from column 8.
185            let edge = edge_from_row(row, 1)?;
186            let sym: Option<Symbol> = if row.get::<_, Option<String>>(8)?.is_some() {
187                Some(row_to_symbol_offset(row, 8)?)
188            } else {
189                None
190            };
191            Ok((edge, sym))
192        };
193
194        let rows = if let Some(kind) = kind_filter {
195            let mut stmt = self.conn.prepare_cached(
196                "SELECT e.id, e.source_id, e.target_name, e.target_id, e.kind, e.file_path, e.line,
197                        e.resolution_source,
198                        s.id, s.name, s.kind, s.file_path, s.start_line, s.end_line,
199                        s.start_byte, s.end_byte, s.parent_id, s.signature, s.visibility,
200                        s.is_async, s.docstring, s.in_degree, s.content_hash, s.subtree_hash
201                 FROM edges e
202                 LEFT JOIN symbols s ON e.source_id = s.id
203                 -- Kind pushed into each OR arm (distributive equiv of `(A OR B)
204                 -- AND k`) so both arms seek a target index even unanalyzed; the
205                 -- outer-AND form scans all edges of that kind without stats.
206                 WHERE (e.target_name = ?1 AND e.kind = ?2)
207                    OR (e.target_id IN (SELECT id FROM symbols WHERE name = ?1)
208                        AND e.kind = ?2)",
209            )?;
210            let rows = stmt
211                .query_map(params![name, kind.as_str()], map_row)?
212                .collect::<std::result::Result<Vec<_>, _>>()?;
213            rows
214        } else {
215            let mut stmt = self.conn.prepare_cached(
216                "SELECT e.id, e.source_id, e.target_name, e.target_id, e.kind, e.file_path, e.line,
217                        e.resolution_source,
218                        s.id, s.name, s.kind, s.file_path, s.start_line, s.end_line,
219                        s.start_byte, s.end_byte, s.parent_id, s.signature, s.visibility,
220                        s.is_async, s.docstring, s.in_degree, s.content_hash, s.subtree_hash
221                 FROM edges e
222                 LEFT JOIN symbols s ON e.source_id = s.id
223                 WHERE e.target_name = ?1
224                    OR e.target_id IN (SELECT id FROM symbols WHERE name = ?1)",
225            )?;
226            let rows = stmt
227                .query_map(params![name], map_row)?
228                .collect::<std::result::Result<Vec<_>, _>>()?;
229            rows
230        };
231        Ok(rows)
232    }
233
234    /// Inheritance hierarchy rooted at a class.
235    ///
236    /// Like [`Self::refs`], matches the resolved target symbol's name too:
237    /// qualified `target_name`s (PHP `App\Auth\BaseService`) never equal the
238    /// class's short name, so children would otherwise be invisible. The
239    /// parent column reports the resolved short name when available.
240    pub fn hierarchy(&self, class_name: &str) -> Result<Vec<(String, String)>> {
241        // Returns (child, parent) pairs
242        let mut stmt = self.conn.prepare(
243            "SELECT s.name, COALESCE(sym2.name, e.target_name)
244             FROM edges e
245             JOIN symbols s ON e.source_id = s.id
246             LEFT JOIN symbols sym2 ON e.target_id = sym2.id
247             WHERE e.kind = 'inherits'
248               AND (s.name = ?1 OR e.target_name = ?1 OR sym2.name = ?1)",
249        )?;
250        let rows = stmt
251            .query_map(params![class_name], |row| Ok((row.get(0)?, row.get(1)?)))?
252            .collect::<std::result::Result<Vec<_>, _>>()?;
253        Ok(rows)
254    }
255
256    /// File-level dependencies (imports from a file).
257    pub fn file_deps(&self, file_path: &str) -> Result<Vec<Edge>> {
258        let mut stmt = self.conn.prepare(
259            "SELECT e.id, e.source_id, e.target_name, e.target_id, e.kind, e.file_path, e.line,
260                    e.resolution_source
261             FROM edges e
262             WHERE e.file_path = ?1 AND e.kind = 'imports'",
263        )?;
264        let rows = stmt
265            .query_map(params![file_path], row_to_edge)?
266            .collect::<std::result::Result<Vec<_>, _>>()?;
267        Ok(rows)
268    }
269
270    /// Transitive impact analysis: everything reachable within `depth` hops.
271    ///
272    /// Evaluated as one recursive CTE rather than iterating `refs()` per
273    /// frontier node — saves N round-trips. The recursive step is split into
274    /// two complementary arms (a literal `target_name` match and a resolved
275    /// `target_id` match) so each seeks an index instead of full-scanning
276    /// edges; **their UNION must equal the old single OR-join edge set** — keep
277    /// the two arms jointly exhaustive when editing. Each unique edge is
278    /// returned once, labeled with the minimum depth at which it was reached.
279    pub fn impact(&self, name: &str, max_depth: u32) -> Result<Vec<(Edge, u32)>> {
280        if max_depth == 0 {
281            return Ok(Vec::new());
282        }
283
284        let sql = "
285            WITH RECURSIVE impacted(
286                edge_id, source_id, target_name, target_id, kind,
287                file_path, line, resolution_source, source_name, depth
288            ) AS (
289                -- Anchor: seed the frontier via the indexed OR-subquery form
290                -- (see refs()) — idx_edges_target + idx_edges_target_id rather
291                -- than a full edges scan.
292                SELECT e.id, e.source_id, e.target_name, e.target_id, e.kind,
293                       e.file_path, e.line, e.resolution_source, s.name, 1
294                FROM edges e
295                LEFT JOIN symbols s ON e.source_id = s.id
296                WHERE e.target_name = ?1
297                   OR e.target_id IN (SELECT id FROM symbols WHERE name = ?1)
298
299                UNION
300
301                -- Recursive step, literal arm: an edge whose target_name equals
302                -- a frontier symbol's name. Indexed via idx_edges_target.
303                SELECT e.id, e.source_id, e.target_name, e.target_id, e.kind,
304                       e.file_path, e.line, e.resolution_source, s.name, i.depth + 1
305                FROM impacted i
306                JOIN edges e ON e.target_name = i.source_name
307                LEFT JOIN symbols s ON e.source_id = s.id
308                WHERE i.source_name IS NOT NULL AND i.depth < ?2
309
310                UNION
311
312                -- Recursive step, resolved arm: an edge whose resolved target_id
313                -- points at a frontier symbol's name. Splitting this out of the
314                -- literal arm (instead of `OR EXISTS(...)`) turns the recursive
315                -- step from a full edges scan + correlated subquery into two
316                -- index seeks (idx_symbols_name_file → idx_edges_target_id). The
317                -- two arms' UNION equals the old single OR-join edge set.
318                SELECT e.id, e.source_id, e.target_name, e.target_id, e.kind,
319                       e.file_path, e.line, e.resolution_source, s.name, i.depth + 1
320                FROM impacted i
321                JOIN symbols t ON t.name = i.source_name
322                JOIN edges e ON e.target_id = t.id
323                LEFT JOIN symbols s ON e.source_id = s.id
324                WHERE i.source_name IS NOT NULL AND i.depth < ?2
325            )
326            SELECT source_id, target_name, target_id, kind, file_path, line,
327                   resolution_source, MIN(depth) AS depth
328            FROM impacted
329            GROUP BY edge_id
330            ORDER BY depth, edge_id
331        ";
332
333        let mut stmt = self.conn.prepare_cached(sql)?;
334        let rows = stmt
335            .query_map(params![name, max_depth], |row| {
336                // Bare projection: edge columns 0..=6, depth at column 7.
337                let edge = edge_from_row(row, 0)?;
338                let depth: u32 = row.get(7)?;
339                Ok((edge, depth))
340            })?
341            .collect::<std::result::Result<Vec<_>, _>>()?;
342        Ok(rows)
343    }
344
345    /// Shortest forward `calls` path from `from` to `to`, or `None` if `to` is
346    /// unreachable within `max_depth` hops. `from == to` yields an empty path.
347    /// Matches the source symbol by name, so an ambiguous `from` follows every
348    /// match and the globally-shortest path wins. Only statically-resolved
349    /// `calls` edges participate (dynamic dispatch is absent).
350    ///
351    /// # Errors
352    /// Returns an error if the SQLite query fails.
353    #[must_use = "the traced path is the result; ignoring it wastes the query"]
354    pub fn trace(&self, from: &str, to: &str, max_depth: u32) -> Result<Option<Vec<PathHop>>> {
355        if from == to {
356            return Ok(Some(Vec::new()));
357        }
358        if max_depth == 0 {
359            return Ok(None);
360        }
361
362        // Breadth-first over `calls` edges, expanding each symbol once. Visited
363        // is keyed on exact symbol id (a HashSet, not a delimited string), so
364        // ids containing commas can't corrupt the cycle break, and the global
365        // visited set bounds the search to O(V+E) — no per-path enumeration.
366        let mut visited: std::collections::HashSet<String> = std::collections::HashSet::new();
367        // parent[id] = (edge that reached it, predecessor symbol id).
368        let mut parent: std::collections::HashMap<String, (CallHop, String)> =
369            std::collections::HashMap::new();
370
371        // Seed: every symbol named `from` starts the frontier at depth 0.
372        let mut frontier: Vec<String> = self.symbol_ids_named(from)?;
373        for id in &frontier {
374            visited.insert(id.clone());
375        }
376
377        for _ in 0..max_depth {
378            if frontier.is_empty() {
379                break;
380            }
381            let mut next: Vec<String> = Vec::new();
382            for src_id in &frontier {
383                for hop in self.outgoing_calls(src_id)? {
384                    // A reached symbol is any symbol whose name matches the
385                    // edge's target (resolved id preferred when present).
386                    for tgt_id in self.resolved_call_targets(&hop)? {
387                        if !visited.insert(tgt_id.clone()) {
388                            continue;
389                        }
390                        parent.insert(tgt_id.clone(), (hop.clone(), src_id.clone()));
391                        if self.symbol_name(&tgt_id)?.as_deref() == Some(to) {
392                            return Ok(Some(self.reconstruct_path(&tgt_id, &parent)));
393                        }
394                        next.push(tgt_id);
395                    }
396                }
397            }
398            frontier = next;
399        }
400        Ok(None)
401    }
402
403    /// Walk `parent` back from `end` to the seed, returning hops in call order.
404    fn reconstruct_path(
405        &self,
406        end: &str,
407        parent: &std::collections::HashMap<String, (CallHop, String)>,
408    ) -> Vec<PathHop> {
409        let mut chain: Vec<&CallHop> = Vec::new();
410        let mut cur = end;
411        while let Some((hop, pred)) = parent.get(cur) {
412            chain.push(hop);
413            cur = pred;
414        }
415        chain.reverse();
416        chain
417            .into_iter()
418            .map(|h| PathHop {
419                source_id: h.source_id.clone(),
420                source_name: h.source_name.clone(),
421                target_name: h.target_name.clone(),
422                kind: h.kind,
423                file_path: h.file_path.clone(),
424                line: h.line,
425            })
426            .collect()
427    }
428
429    /// Symbol ids whose name is exactly `name`.
430    fn symbol_ids_named(&self, name: &str) -> Result<Vec<String>> {
431        let mut stmt = self
432            .conn
433            .prepare_cached("SELECT id FROM symbols WHERE name = ?1")?;
434        let ids = stmt
435            .query_map(params![name], |row| row.get::<_, String>(0))?
436            .collect::<std::result::Result<Vec<_>, _>>()?;
437        Ok(ids)
438    }
439
440    /// Name of the symbol with id `id`, if it exists.
441    fn symbol_name(&self, id: &str) -> Result<Option<String>> {
442        let name = self
443            .conn
444            .prepare_cached("SELECT name FROM symbols WHERE id = ?1")?
445            .query_row(params![id], |row| row.get::<_, String>(0))
446            .optional()?;
447        Ok(name)
448    }
449
450    /// Outgoing `calls` edges from the symbol with id `source_id`.
451    fn outgoing_calls(&self, source_id: &str) -> Result<Vec<CallHop>> {
452        let mut stmt = self.conn.prepare_cached(
453            "SELECT s.name, e.target_name, e.target_id, e.kind, e.file_path, e.line
454             FROM edges e JOIN symbols s ON e.source_id = s.id
455             WHERE e.source_id = ?1 AND e.kind = 'calls'",
456        )?;
457        let hops = stmt
458            .query_map(params![source_id], |row| {
459                let kind_str: String = row.get(3)?;
460                Ok(CallHop {
461                    source_id: source_id.to_string(),
462                    source_name: row.get(0)?,
463                    target_name: row.get(1)?,
464                    target_id: row.get(2)?,
465                    kind: kind_str.parse().unwrap_or(EdgeKind::Calls),
466                    file_path: row.get(4)?,
467                    line: row.get(5)?,
468                })
469            })?
470            .collect::<std::result::Result<Vec<_>, _>>()?;
471        Ok(hops)
472    }
473
474    /// Symbol ids a call hop reaches: the resolved `target_id` if set, else
475    /// every symbol whose name matches the edge's `target_name`.
476    fn resolved_call_targets(&self, hop: &CallHop) -> Result<Vec<String>> {
477        if let Some(id) = &hop.target_id {
478            return Ok(vec![id.clone()]);
479        }
480        self.symbol_ids_named(&hop.target_name)
481    }
482
483    /// True when no symbols have been indexed yet (fresh/empty DB). Cheap —
484    /// a single `EXISTS(SELECT 1 FROM symbols)` that stops at the first row.
485    /// Used by query commands to distinguish "no index yet" from a no-match.
486    pub fn is_empty(&self) -> Result<bool> {
487        let exists: bool =
488            self.conn
489                .query_row("SELECT EXISTS(SELECT 1 FROM symbols)", [], |row| row.get(0))?;
490        Ok(!exists)
491    }
492
493    /// Index statistics.
494    pub fn stats(&self) -> Result<IndexStats> {
495        let num_files: u32 = self
496            .conn
497            .query_row("SELECT COUNT(*) FROM files", [], |row| row.get(0))?;
498        let num_symbols: u32 = self
499            .conn
500            .query_row("SELECT COUNT(*) FROM symbols", [], |row| row.get(0))?;
501        // Single GROUP BY over edges replaces what would otherwise be four
502        // sequential full table scans (one COUNT(*) per bucket). The partial
503        // index `idx_edges_unresolved` only covers state=0, so state=2 and
504        // state=3 counts can't use it — one scan + a 4-row Vec is cheaper.
505        let mut bucket_stmt = self
506            .conn
507            .prepare("SELECT resolution_state, COUNT(*) FROM edges GROUP BY resolution_state")?;
508        let mut num_resolved: u32 = 0;
509        let mut num_unresolvable: u32 = 0;
510        let mut num_external: u32 = 0;
511        let mut num_edges: u32 = 0;
512        let rows = bucket_stmt.query_map([], |row| {
513            let state: i64 = row.get(0)?;
514            let count: u32 = row.get(1)?;
515            Ok((state, count))
516        })?;
517        for row in rows {
518            let (state, count) = row?;
519            num_edges += count;
520            match state {
521                1 => num_resolved = count,
522                2 => num_unresolvable = count,
523                3 => num_external = count,
524                _ => {} // state=0 (unresolved) or any future state
525            }
526        }
527
528        let mut lang_stmt = self.conn.prepare(
529            "SELECT language, COUNT(*) FROM files GROUP BY language ORDER BY COUNT(*) DESC",
530        )?;
531        let languages: Vec<(String, u32)> = lang_stmt
532            .query_map([], |row| Ok((row.get(0)?, row.get(1)?)))?
533            .collect::<std::result::Result<Vec<_>, _>>()?;
534
535        let mut kind_stmt = self
536            .conn
537            .prepare("SELECT kind, COUNT(*) FROM symbols GROUP BY kind ORDER BY COUNT(*) DESC")?;
538        let symbol_kinds: Vec<(String, u32)> = kind_stmt
539            .query_map([], |row| Ok((row.get(0)?, row.get(1)?)))?
540            .collect::<std::result::Result<Vec<_>, _>>()?;
541
542        Ok(IndexStats {
543            num_files,
544            num_symbols,
545            num_edges,
546            num_resolved,
547            num_unresolvable,
548            num_external,
549            languages,
550            symbol_kinds,
551        })
552    }
553
554    /// Record a successful query against the index for the `cartog stats --savings`
555    /// / `cartog savings` retention hook.
556    ///
557    /// Best-effort: errors are swallowed (logged via `warn!`) so a failing
558    /// write never aborts the user's actual query.
559    ///
560    /// **Read-only attach skips the write.** Secondary MCP servers opened via
561    /// [`Self::open_readonly`] cannot write at all. As a result, queries
562    /// served by a secondary are NOT reflected in `query_log` — there is no
563    /// IPC that forwards them to the primary. `cartog stats --savings` on a
564    /// machine that runs multiple MCP servers will therefore *undercount*
565    /// secondary traffic, not overcount it. This is a deliberate trade-off:
566    /// the alternative would be a separate per-process file with its own
567    /// merge logic, which is more complexity than the retention hook needs.
568    ///
569    /// Stored fields: `tool` (e.g. `"search"`, `"refs"`, MCP-side already
570    /// strips the `cartog_` prefix so CLI and MCP rows aggregate), `source`
571    /// (`"cli"` or `"mcp"`), and a unix-seconds timestamp. The query payload
572    /// itself is never recorded — see the privacy banner in README.
573    pub fn log_query(&self, tool: &str, source: &str) {
574        if self.is_read_only() {
575            return;
576        }
577        let ts = std::time::SystemTime::now()
578            .duration_since(std::time::UNIX_EPOCH)
579            .map(|d| d.as_secs() as i64)
580            .unwrap_or(0);
581        if let Err(e) = self.conn.execute(
582            "INSERT INTO query_log (tool, source, ts) VALUES (?1, ?2, ?3)",
583            params![tool, source, ts],
584        ) {
585            // Always warn for the individual failure (debuggable in traces),
586            // and additionally emit a one-shot loud-error on the first
587            // failure so a persistently-broken query_log (e.g. SQLITE_FULL
588            // or a missing table on a manually-tampered DB) is visible even
589            // when warns are filtered.
590            warn!(error = %e, tool, source, "query_log insert failed");
591            if !LOG_QUERY_FAILURE_REPORTED.swap(true, std::sync::atomic::Ordering::Relaxed) {
592                tracing::error!(
593                    error = %e,
594                    "query_log is failing — `cartog stats --savings` will undercount. \
595                     Check disk space and `cartog doctor`."
596                );
597            }
598        }
599    }
600
601    /// Aggregate `query_log` for `cartog stats --savings` / `cartog savings`.
602    /// Safe on read-only attach (it's a read). Returns an empty report when
603    /// the `query_log` table is missing (the read-only attach path skips
604    /// schema bootstrap, so a v5 DB that lost the table — manual drop, partial
605    /// snapshot restore — would otherwise surface a `no such table` error).
606    pub fn savings_breakdown(&self) -> Result<SavingsReport> {
607        // Probe for the table once. Only treat "no such table" as the empty-
608        // report case so real DB faults (corruption, locked, permissions)
609        // still propagate to the caller.
610        if let Err(e) = self.conn.prepare("SELECT 1 FROM query_log LIMIT 0") {
611            if is_no_such_table(&e) {
612                return Ok(empty_savings_report());
613            }
614            return Err(e.into());
615        }
616
617        let mut tool_stmt = self.conn.prepare(
618            "SELECT tool, COUNT(*) FROM query_log GROUP BY tool ORDER BY COUNT(*) DESC, tool",
619        )?;
620        let by_tool: Vec<(String, u64)> = tool_stmt
621            .query_map([], |row| Ok((row.get(0)?, row.get::<_, i64>(1)? as u64)))?
622            .collect::<std::result::Result<Vec<_>, _>>()?;
623
624        let mut src_stmt = self.conn.prepare(
625            "SELECT source, COUNT(*) FROM query_log GROUP BY source ORDER BY COUNT(*) DESC, source",
626        )?;
627        let by_source: Vec<(String, u64)> = src_stmt
628            .query_map([], |row| Ok((row.get(0)?, row.get::<_, i64>(1)? as u64)))?
629            .collect::<std::result::Result<Vec<_>, _>>()?;
630
631        let total_queries: u64 = by_tool.iter().map(|(_, c)| c).sum();
632        let tokens_used_cartog = total_queries.saturating_mul(TOKENS_PER_QUERY_CARTOG as u64);
633        let tokens_used_grep = total_queries.saturating_mul(TOKENS_PER_QUERY_GREP as u64);
634        let estimated_tokens_saved = tokens_used_grep.saturating_sub(tokens_used_cartog);
635        // 0–100, computed with saturating_mul to match the rest of the
636        // arithmetic in this function (everything else uses saturating_* to
637        // stay safe at extreme query counts).
638        let percent_saved = estimated_tokens_saved
639            .saturating_mul(100)
640            .checked_div(tokens_used_grep)
641            .unwrap_or(0)
642            .min(100) as u8;
643
644        Ok(SavingsReport {
645            by_tool,
646            by_source,
647            total_queries,
648            tokens_used_cartog,
649            tokens_used_grep,
650            estimated_tokens_saved,
651            percent_saved,
652            baseline_delta: TOKENS_SAVED_PER_QUERY,
653        })
654    }
655
656    /// Get all non-import symbols ordered by in-degree (highest first), then by file.
657    ///
658    /// Used by `cartog map` to produce a centrality-ranked codebase summary.
659    pub fn top_symbols(&self, limit: u32) -> Result<Vec<Symbol>> {
660        let mut stmt = self.conn.prepare(
661            "SELECT id, name, kind, file_path, start_line, end_line, start_byte, end_byte,
662                    parent_id, signature, visibility, is_async, docstring, in_degree,
663                    content_hash, subtree_hash
664             FROM symbols
665             WHERE kind != 'import' AND kind != 'variable'
666             ORDER BY in_degree DESC, file_path, start_line
667             LIMIT ?1",
668        )?;
669        let rows = stmt
670            .query_map(params![limit], row_to_symbol)?
671            .collect::<std::result::Result<Vec<_>, _>>()?;
672        Ok(rows)
673    }
674
675    /// Returns `true` if at least one file has been indexed.
676    ///
677    /// Cheaper than [`Database::stats`] for the common "is the index empty?" check —
678    /// SQLite can satisfy `LIMIT 1` with a single index seek rather than a full count.
679    pub fn has_indexed_files(&self) -> Result<bool> {
680        Ok(self
681            .conn
682            .query_row("SELECT 1 FROM files LIMIT 1", [], |_| Ok(()))
683            .optional()?
684            .is_some())
685    }
686
687    /// Get symbols for a set of file paths, grouped by file, ordered by line.
688    ///
689    /// Optionally filter by symbol kind. Only returns symbols for files that
690    /// exist in the index. Files with no matching symbols are omitted.
691    /// SQLite variable limit per query. Chunking keeps us well under the default
692    /// `SQLITE_MAX_VARIABLE_NUMBER` (999 in older builds, 32766 in newer).
693    pub(crate) const FILE_CHUNK_SIZE: usize = 500;
694
695    pub fn symbols_for_files(
696        &self,
697        file_paths: &[String],
698        kind_filter: Option<SymbolKind>,
699    ) -> Result<Vec<Symbol>> {
700        if file_paths.is_empty() {
701            return Ok(Vec::new());
702        }
703
704        let kind_str = kind_filter.map(|k| k.as_str().to_string());
705        let mut all_results = Vec::new();
706
707        for chunk in file_paths.chunks(Self::FILE_CHUNK_SIZE) {
708            let placeholders: Vec<_> = (1..=chunk.len()).map(|i| format!("?{i}")).collect();
709            let kind_param_idx = chunk.len() + 1;
710
711            let sql = format!(
712                "SELECT id, name, kind, file_path, start_line, end_line, start_byte, end_byte,
713                        parent_id, signature, visibility, is_async, docstring, in_degree,
714                    content_hash, subtree_hash
715                 FROM symbols
716                 WHERE file_path IN ({})
717                   AND (?{kind_param_idx} IS NULL OR kind = ?{kind_param_idx})
718                 ORDER BY file_path, start_line",
719                placeholders.join(", ")
720            );
721            let mut stmt = self.conn.prepare(&sql)?;
722
723            let mut param_values: Vec<Box<dyn rusqlite::types::ToSql>> = chunk
724                .iter()
725                .map(|p| Box::new(p.clone()) as Box<dyn rusqlite::types::ToSql>)
726                .collect();
727            param_values.push(Box::new(kind_str.clone()));
728
729            let params: Vec<&dyn rusqlite::types::ToSql> =
730                param_values.iter().map(|p| &**p).collect();
731            let rows = stmt
732                .query_map(&*params, row_to_symbol)?
733                .collect::<std::result::Result<Vec<_>, _>>()?;
734            all_results.extend(rows);
735        }
736
737        // Re-sort across chunks to maintain file_path, start_line order
738        if file_paths.len() > Self::FILE_CHUNK_SIZE {
739            all_results.sort_by(|a, b| {
740                a.file_path
741                    .cmp(&b.file_path)
742                    .then(a.start_line.cmp(&b.start_line))
743            });
744        }
745
746        Ok(all_results)
747    }
748
749    /// Get all indexed file paths, sorted alphabetically.
750    pub fn all_files(&self) -> Result<Vec<String>> {
751        let mut stmt = self.conn.prepare("SELECT path FROM files ORDER BY path")?;
752        let rows = stmt
753            .query_map([], |row| row.get(0))?
754            .collect::<std::result::Result<Vec<_>, _>>()?;
755        Ok(rows)
756    }
757
758    /// Load (path, hash) pairs for every indexed file in one query.
759    ///
760    /// Used by the parallel indexer to avoid per-file DB round trips when
761    /// deciding whether a file needs re-parsing.
762    pub fn all_file_hashes(&self) -> Result<std::collections::HashMap<String, String>> {
763        let mut stmt = self.conn.prepare("SELECT path, hash FROM files")?;
764        let rows = stmt
765            .query_map([], |row| {
766                Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?))
767            })?
768            .collect::<std::result::Result<Vec<_>, _>>()?;
769        Ok(rows.into_iter().collect())
770    }
771}