sqlite-graphrag 1.0.70

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/chunking.rs:
    1|       |//! Semantic chunking for embedding inputs (Markdown-aware, 512-token limit).
    2|       |//!
    3|       |//! Splits bodies using [`text_splitter::MarkdownSplitter`] with overlap so
    4|       |//! multi-chunk memories preserve context across chunk boundaries.
    5|       |
    6|       |// src/chunking.rs
    7|       |// Token-based chunking for E5 model (512 token limit)
    8|       |
    9|       |use crate::constants::{CHUNK_OVERLAP_TOKENS, CHUNK_SIZE_TOKENS, EMBEDDING_DIM};
   10|       |use text_splitter::{ChunkConfig, MarkdownSplitter};
   11|       |use tokenizers::Tokenizer;
   12|       |
   13|       |// Conservative heuristic to reduce the risk of underestimating the real token count
   14|       |// in Markdown, code, and multilingual text. The previous value (4 chars/token) allowed
   15|       |// chunks that were too large for some real documents.
   16|       |/// Characters per token heuristic: 2 chars/token reduces the risk of underestimating
   17|       |/// real token counts in Markdown, code, and multilingual text.
   18|       |const CHARS_PER_TOKEN: usize = 2;
   19|       |
   20|       |/// Maximum character length of a single chunk (derived from token limit × chars-per-token).
   21|       |pub const CHUNK_SIZE_CHARS: usize = CHUNK_SIZE_TOKENS * CHARS_PER_TOKEN;
   22|       |
   23|       |/// Character overlap between consecutive chunks to preserve cross-boundary context.
   24|       |pub const CHUNK_OVERLAP_CHARS: usize = CHUNK_OVERLAP_TOKENS * CHARS_PER_TOKEN;
   25|       |
   26|       |/// A contiguous slice of a body string identified by byte offsets.
   27|       |#[derive(Debug, Clone)]
   28|       |pub struct Chunk {
   29|       |    /// Byte offset of the first character (inclusive).
   30|       |    pub start_offset: usize,
   31|       |    /// Byte offset past the last character (exclusive).
   32|       |    pub end_offset: usize,
   33|       |    /// Token count for this chunk: approximate (chars / `CHARS_PER_TOKEN`) for heuristic splits, actual count when built from token offsets.
   34|       |    pub token_count_approx: usize,
   35|       |}
   36|       |
   37|       |/// Returns `true` when the byte length of `body` exceeds `CHUNK_SIZE_CHARS` and it must be split.
   38|      5|pub fn needs_chunking(body: &str) -> bool {
   39|      5|    body.len() > CHUNK_SIZE_CHARS
   40|      5|}
   41|       |
   42|       |/// Splits `body` into overlapping [`Chunk`]s using a character-based heuristic.
   43|       |///
   44|       |/// Short bodies (byte length ≤ `CHUNK_SIZE_CHARS`) are returned as a single chunk.
   45|       |/// Splits prefer paragraph breaks, then sentence-end punctuation, then word boundaries.
   46|       |///
   47|       |/// # Errors
   48|       |/// This function is infallible; it returns a `Vec` directly.
   49|      3|pub fn split_into_chunks(body: &str) -> Vec<Chunk> {
   50|      3|    if !needs_chunking(body) {
   51|      1|        return vec![Chunk {
   52|      1|            token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
   53|      1|            start_offset: 0,
   54|      1|            end_offset: body.len(),
   55|      1|        }];
   56|      2|    }
   57|       |
   58|      2|    let mut chunks = Vec::with_capacity(body.len() / CHUNK_SIZE_CHARS + 1);
   59|      2|    let mut start = 0usize;
   60|       |
   61|     26|    while start < body.len() {
   62|     26|        start = next_char_boundary(body, start);
   63|     26|        let desired_end = previous_char_boundary(body, (start + CHUNK_SIZE_CHARS).min(body.len()));
   64|     26|        let end = if desired_end < body.len() {
   65|     24|            find_split_boundary(body, start, desired_end)
   66|       |        } else {
   67|      2|            desired_end
   68|       |        };
   69|       |
   70|     26|        let end = if end <= start {
   71|      0|            let fallback = previous_char_boundary(body, (start + CHUNK_SIZE_CHARS).min(body.len()));
   72|      0|            if fallback > start {
   73|      0|                fallback
   74|       |            } else {
   75|      0|                body.len()
   76|       |            }
   77|       |        } else {
   78|     26|            end
   79|       |        };
   80|       |
   81|     26|        let token_count_approx = body[start..end].chars().count() / CHARS_PER_TOKEN;
   82|     26|        chunks.push(Chunk {
   83|     26|            start_offset: start,
   84|     26|            end_offset: end,
   85|     26|            token_count_approx,
   86|     26|        });
   87|       |
   88|     26|        if end >= body.len() {
   89|      2|            break;
   90|     24|        }
   91|       |
   92|     24|        let next_start = next_char_boundary(body, end.saturating_sub(CHUNK_OVERLAP_CHARS));
   93|     24|        start = if next_start >= end { end } else { next_start };
                                                     ^0
   94|       |    }
   95|       |
   96|      2|    chunks
   97|      3|}
   98|       |
   99|       |/// Splits `body` into [`Chunk`]s using pre-computed token byte-offsets.
  100|       |///
  101|       |/// Each element of `token_offsets` is a `(start, end)` byte range for one token.
  102|       |/// Respects `CHUNK_SIZE_TOKENS` and `CHUNK_OVERLAP_TOKENS` constants.
  103|       |/// Short bodies (≤ `CHUNK_SIZE_TOKENS` tokens) are returned as a single chunk.
  104|      2|pub fn split_into_chunks_by_token_offsets(
  105|      2|    body: &str,
  106|      2|    token_offsets: &[(usize, usize)],
  107|      2|) -> Vec<Chunk> {
  108|      2|    if token_offsets.len() <= CHUNK_SIZE_TOKENS {
  109|      1|        return vec![Chunk {
  110|      1|            token_count_approx: token_offsets.len(),
  111|      1|            start_offset: 0,
  112|      1|            end_offset: body.len(),
  113|      1|        }];
  114|      1|    }
  115|       |
  116|      1|    let mut chunks = Vec::with_capacity(token_offsets.len() / CHUNK_SIZE_TOKENS + 1);
  117|      1|    let mut start_token = 0usize;
  118|       |
  119|      2|    while start_token < token_offsets.len() {
  120|      2|        let end_token = (start_token + CHUNK_SIZE_TOKENS).min(token_offsets.len());
  121|       |
  122|      2|        chunks.push(Chunk {
  123|      2|            start_offset: if start_token == 0 {
  124|      1|                0
  125|       |            } else {
  126|      1|                token_offsets[start_token].0
  127|       |            },
  128|      2|            end_offset: if end_token == token_offsets.len() {
  129|      1|                body.len()
  130|       |            } else {
  131|      1|                token_offsets[end_token - 1].1
  132|       |            },
  133|      2|            token_count_approx: end_token - start_token,
  134|       |        });
  135|       |
  136|      2|        if end_token == token_offsets.len() {
  137|      1|            break;
  138|      1|        }
  139|       |
  140|      1|        let next_start = end_token.saturating_sub(CHUNK_OVERLAP_TOKENS);
  141|      1|        start_token = if next_start <= start_token {
  142|      0|            end_token
  143|       |        } else {
  144|      1|            next_start
  145|       |        };
  146|       |    }
  147|       |
  148|      1|    chunks
  149|      2|}
  150|       |
  151|       |/// Splits body into chunks using MarkdownSplitter with a real tokenizer.
  152|       |/// Respects Markdown semantic boundaries (H1-H6, paragraphs, blocks).
  153|       |/// For plain text without Markdown markers, falls back to paragraph and sentence breaks.
  154|      0|pub fn split_into_chunks_hierarchical(body: &str, tokenizer: &Tokenizer) -> Vec<Chunk> {
  155|      0|    if body.is_empty() {
  156|      0|        return Vec::new();
  157|      0|    }
  158|       |
  159|      0|    let config = ChunkConfig::new(CHUNK_SIZE_TOKENS)
  160|      0|        .with_sizer(tokenizer)
  161|      0|        .with_overlap(CHUNK_OVERLAP_TOKENS)
  162|      0|        .expect(
  163|      0|            "compile-time invariant: CHUNK_OVERLAP_TOKENS must be smaller than CHUNK_SIZE_TOKENS",
  164|       |        );
  165|       |
  166|      0|    let splitter = MarkdownSplitter::new(config);
  167|       |
  168|      0|    let items: Vec<(usize, &str)> = splitter.chunk_indices(body).collect();
  169|       |
  170|      0|    if items.is_empty() {
  171|      0|        return vec![Chunk {
  172|      0|            start_offset: 0,
  173|      0|            end_offset: body.len(),
  174|      0|            token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
  175|      0|        }];
  176|      0|    }
  177|       |
  178|      0|    items
  179|      0|        .into_iter()
  180|      0|        .map(|(start, text)| {
  181|      0|            let end = start + text.len();
  182|      0|            Chunk {
  183|      0|                start_offset: start,
  184|      0|                end_offset: end,
  185|      0|                token_count_approx: text.chars().count() / CHARS_PER_TOKEN,
  186|      0|            }
  187|      0|        })
  188|      0|        .collect()
  189|      0|}
  190|       |
  191|       |/// Returns the string slice of `body` described by `chunk`'s byte offsets.
  192|    240|pub fn chunk_text<'a>(body: &'a str, chunk: &Chunk) -> &'a str {
  193|    240|    &body[chunk.start_offset..chunk.end_offset]
  194|    240|}
  195|       |
  196|     24|fn find_split_boundary(body: &str, start: usize, desired_end: usize) -> usize {
  197|     24|    let slice = &body[start..desired_end];
  198|     24|    if let Some(pos) = slice.rfind("\n\n") {
                              ^0
  199|      0|        return start + pos + 2;
  200|     24|    }
  201|     24|    if let Some(pos) = slice.rfind(". ") {
                              ^0
  202|      0|        return start + pos + 2;
  203|     24|    }
  204|     24|    if let Some(pos) = slice.rfind(' ') {
  205|     24|        return start + pos + 1;
  206|      0|    }
  207|      0|    desired_end
  208|     24|}
  209|       |
  210|     26|fn previous_char_boundary(body: &str, mut idx: usize) -> usize {
  211|     26|    idx = idx.min(body.len());
  212|     26|    while idx > 0 && !body.is_char_boundary(idx) {
  213|      0|        idx -= 1;
  214|      0|    }
  215|     26|    idx
  216|     26|}
  217|       |
  218|     50|fn next_char_boundary(body: &str, mut idx: usize) -> usize {
  219|     50|    idx = idx.min(body.len());
  220|     59|    while idx < body.len() && !body.is_char_boundary(idx) {
  221|      9|        idx += 1;
  222|      9|    }
  223|     50|    idx
  224|     50|}
  225|       |
  226|       |/// Computes the mean of `chunk_embeddings` and L2-normalizes the result.
  227|       |///
  228|       |/// Returns a zero-vector of length `EMBEDDING_DIM` when the input is empty.
  229|       |/// When a single embedding is provided, a clone of it is returned unchanged (it is not re-normalized).
  230|      1|pub fn aggregate_embeddings(chunk_embeddings: &[Vec<f32>]) -> Vec<f32> {
  231|      1|    if chunk_embeddings.is_empty() {
  232|      0|        return vec![0.0f32; EMBEDDING_DIM];
  233|      1|    }
  234|      1|    if chunk_embeddings.len() == 1 {
  235|      0|        return chunk_embeddings[0].clone();
  236|      1|    }
  237|       |
  238|      1|    let dim = chunk_embeddings[0].len();
  239|      1|    let mut mean = vec![0.0f32; dim];
  240|      3|    for emb in chunk_embeddings {
                      ^2
  241|      4|        for (i, v) in emb.iter().enumerate() {
                                    ^2         ^2
  242|      4|            mean[i] += v;
  243|      4|        }
  244|       |    }
  245|      1|    let n = chunk_embeddings.len() as f32;
  246|      3|    for v in &mut mean {
                      ^2
  247|      2|        *v /= n;
  248|      2|    }
  249|       |
  250|      2|    let norm: f32 = mean.iter().map(|x| x * x).sum::<f32>().sqrt();
                      ^1    ^1    ^1          ^1             ^1           ^1
  251|      1|    if norm > 1e-9 {
  252|      3|        for v in &mut mean {
                          ^2
  253|      2|            *v /= norm;
  254|      2|        }
  255|      0|    }
  256|      1|    mean
  257|      1|}
  258|       |
  259|       |#[cfg(test)]
  260|       |mod tests {
  261|       |    use super::*;
  262|       |
  263|       |    #[test]
  264|      1|    fn test_short_body_no_chunking() {
  265|      1|        let body = "short text";
  266|      1|        assert!(!needs_chunking(body));
  267|      1|        let chunks = split_into_chunks(body);
  268|      1|        assert_eq!(chunks.len(), 1);
  269|      1|        assert_eq!(chunk_text(body, &chunks[0]), body);
  270|      1|    }
  271|       |
  272|       |    #[test]
  273|      1|    fn test_long_body_produces_multiple_chunks() {
  274|      1|        let body = "word ".repeat(1000);
  275|      1|        assert!(needs_chunking(&body));
  276|      1|        let chunks = split_into_chunks(&body);
  277|      1|        assert!(chunks.len() > 1);
  278|      7|        assert!(chunks.iter().all(|c| !chunk_text(&body, c).is_empty()));
                      ^1      ^1            ^1
  279|      1|    }
  280|       |
  281|       |    #[test]
  282|      1|    fn split_by_token_offsets_respeita_limite_e_overlap() {
  283|      1|        let body = "ab".repeat(460);
  284|      1|        let offsets: Vec<(usize, usize)> = (0..460)
  285|    460|            .map(|i| {
                           ^1
  286|    460|                let start = i * 2;
  287|    460|                (start, start + 2)
  288|    460|            })
  289|      1|            .collect();
  290|       |
  291|      1|        let chunks = split_into_chunks_by_token_offsets(&body, &offsets);
  292|      1|        assert_eq!(chunks.len(), 2);
  293|      1|        assert_eq!(chunks[0].token_count_approx, CHUNK_SIZE_TOKENS);
  294|      1|        assert_eq!(chunks[1].token_count_approx, 110);
  295|      1|        assert_eq!(chunks[0].start_offset, 0);
  296|      1|        assert_eq!(
  297|      1|            chunks[1].start_offset,
  298|      1|            offsets[CHUNK_SIZE_TOKENS - CHUNK_OVERLAP_TOKENS].0
  299|       |        );
  300|      1|    }
  301|       |
  302|       |    #[test]
  303|      1|    fn split_by_token_offsets_returns_one_chunk_when_fits() {
  304|      1|        let body = "texto curto";
  305|      1|        let offsets = vec![(0, 5), (6, 11)];
  306|      1|        let chunks = split_into_chunks_by_token_offsets(body, &offsets);
  307|      1|        assert_eq!(chunks.len(), 1);
  308|      1|        assert_eq!(chunks[0].start_offset, 0);
  309|      1|        assert_eq!(chunks[0].end_offset, body.len());
  310|      1|        assert_eq!(chunks[0].token_count_approx, 2);
  311|      1|    }
  312|       |
  313|       |    #[test]
  314|      1|    fn test_multibyte_body_preserves_progress_and_boundaries() {
  315|       |        // Multibyte body intentionally includes 2-byte UTF-8 sequences (Latin-1 supplement)
  316|       |        // expressed as Unicode escapes so this source file remains ASCII-only per the
  317|       |        // language policy. The original PT-BR phrase "a\u{e7}\u{e3}o \u{fa}til " is preserved
  318|       |        // since the test exercises UTF-8 char-boundary handling.
  319|      1|        let body = "a\u{e7}\u{e3}o \u{fa}til ".repeat(1000);
  320|      1|        let chunks = split_into_chunks(&body);
  321|      1|        assert!(chunks.len() > 1);
  322|     20|        for chunk in &chunks {
                          ^19
  323|     19|            assert!(!chunk_text(&body, chunk).is_empty());
  324|     19|            assert!(body.is_char_boundary(chunk.start_offset));
  325|     19|            assert!(body.is_char_boundary(chunk.end_offset));
  326|     19|            assert!(chunk.end_offset > chunk.start_offset);
  327|       |        }
  328|     18|        for pair in chunks.windows(2) {
                                  ^1     ^1
  329|     18|            assert!(pair[1].start_offset >= pair[0].start_offset);
  330|     18|            assert!(pair[1].end_offset > pair[0].start_offset);
  331|       |        }
  332|      1|    }
  333|       |
  334|       |    #[test]
  335|      1|    fn test_aggregate_embeddings_normalizes() {
  336|      1|        let embs = vec![vec![1.0f32, 0.0], vec![0.0f32, 1.0]];
  337|      1|        let agg = aggregate_embeddings(&embs);
  338|      2|        let norm: f32 = agg.iter().map(|x| x * x).sum::<f32>().sqrt();
                          ^1    ^1    ^1         ^1             ^1           ^1
  339|      1|        assert!((norm - 1.0).abs() < 1e-5);
  340|      1|    }
  341|       |
  342|      5|    fn split_hier_chars(body: &str, size: usize) -> Vec<Chunk> {
  343|       |        use text_splitter::{Characters, ChunkConfig, MarkdownSplitter};
  344|      5|        if body.is_empty() {
  345|      0|            return Vec::new();
  346|      5|        }
  347|      5|        let config = ChunkConfig::new(size)
  348|      5|            .with_sizer(Characters)
  349|      5|            .with_overlap(0)
  350|      5|            .expect("overlap must be smaller than size");
  351|      5|        let splitter = MarkdownSplitter::new(config);
  352|      5|        let items: Vec<(usize, &str)> = splitter.chunk_indices(body).collect();
  353|      5|        if items.is_empty() {
  354|      0|            return vec![Chunk {
  355|      0|                start_offset: 0,
  356|      0|                end_offset: body.len(),
  357|      0|                token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
  358|      0|            }];
  359|      5|        }
  360|      5|        items
  361|      5|            .into_iter()
  362|    255|            .map(|(start, text)| {
                           ^5
  363|    255|                let end = start + text.len();
  364|    255|                Chunk {
  365|    255|                    start_offset: start,
  366|    255|                    end_offset: end,
  367|    255|                    token_count_approx: text.chars().count() / CHARS_PER_TOKEN,
  368|    255|                }
  369|    255|            })
  370|      5|            .collect()
  371|      5|    }
  372|       |
  373|       |    #[test]
  374|      1|    fn test_hierarchical_empty_body_returns_empty() {
  375|       |        use text_splitter::{Characters, ChunkConfig, MarkdownSplitter};
  376|      1|        let config = ChunkConfig::new(100)
  377|      1|            .with_sizer(Characters)
  378|      1|            .with_overlap(0)
  379|      1|            .expect("overlap < size");
  380|      1|        let splitter = MarkdownSplitter::new(config);
  381|      1|        let result: Vec<_> = splitter.chunk_indices("").collect();
  382|      1|        assert!(result.is_empty());
  383|      1|    }
  384|       |
  385|       |    #[test]
  386|      1|    fn test_markdown_h1_boundary_yields_two_chunks() {
  387|      1|        let body = "# Title 1\n\nbody1 body1 body1 body1 body1 body1\n\n# Title 2\n\nbody2 body2 body2 body2 body2 body2";
  388|      1|        let chunks = split_hier_chars(body, 30);
  389|      1|        assert!(
  390|      1|            chunks.len() >= 2,
  391|      0|            "expected >=2 chunks, got {}",
  392|      0|            chunks.len()
  393|       |        );
  394|      7|        for c in &chunks {
                          ^6
  395|      6|            assert!(body.is_char_boundary(c.start_offset));
  396|      6|            assert!(body.is_char_boundary(c.end_offset));
  397|       |        }
  398|      1|    }
  399|       |
  400|       |    #[test]
  401|      1|    fn test_markdown_h2_nested_respects_boundaries() {
  402|      1|        let body = "# H1\n\n## H2a\n\nParagraph A with enough text to force a split.\n\n## H2b\n\nParagraph B with enough text to force a split as well.";
  403|      1|        let chunks = split_hier_chars(body, 40);
  404|      1|        assert!(!chunks.is_empty());
  405|      8|        for c in &chunks {
                          ^7
  406|      7|            assert!(body.is_char_boundary(c.start_offset));
  407|      7|            assert!(body.is_char_boundary(c.end_offset));
  408|      7|            assert!(c.end_offset > c.start_offset);
  409|      7|            assert!(c.end_offset <= body.len());
  410|       |        }
  411|      1|    }
  412|       |
  413|       |    #[test]
  414|      1|    fn test_markdown_paragraph_soft_boundary() {
  415|      1|        let para = "Plain text sentence used to fill the paragraph. ";
  416|      1|        let body = format!(
  417|      1|            "{}\n\n{}\n\n{}",
  418|      1|            para.repeat(3),
  419|      1|            para.repeat(3),
  420|      1|            para.repeat(3)
  421|       |        );
  422|      1|        let chunks = split_hier_chars(&body, 80);
  423|      1|        assert!(
  424|      1|            chunks.len() >= 2,
  425|      0|            "expected >=2 chunks with a body of {} chars",
  426|      0|            body.len()
  427|       |        );
  428|     10|        for c in &chunks {
                          ^9
  429|      9|            assert!(body.is_char_boundary(c.start_offset));
  430|      9|            assert!(body.is_char_boundary(c.end_offset));
  431|       |        }
  432|      1|    }
  433|       |
  434|       |    #[test]
  435|      1|    fn test_markdown_60kb_valid_offsets() {
  436|      1|        let block = "# Section\n\nBlock content text. ".repeat(1700);
  437|      1|        assert!(
  438|      1|            block.len() > 50_000,
  439|      0|            "body must be >50KB, has {} bytes",
  440|      0|            block.len()
  441|       |        );
  442|      1|        let chunks = split_hier_chars(&block, 256);
  443|      1|        assert!(chunks.len() > 1);
  444|    214|        for c in &chunks {
                          ^213
  445|    213|            assert!(block.is_char_boundary(c.start_offset));
  446|    213|            assert!(block.is_char_boundary(c.end_offset));
  447|    213|            assert!(c.end_offset > c.start_offset);
  448|    213|            assert!(!chunk_text(&block, c).is_empty());
  449|       |        }
  450|      1|    }
  451|       |
  452|       |    #[test]
  453|      1|    fn test_fallback_plain_text_without_markers() {
  454|      1|        let body = "a ".repeat(1000);
  455|      1|        let chunks = split_hier_chars(&body, 100);
  456|      1|        assert!(!chunks.is_empty());
  457|     21|        for c in &chunks {
                          ^20
  458|     20|            assert!(body.is_char_boundary(c.start_offset));
  459|     20|            assert!(body.is_char_boundary(c.end_offset));
  460|       |        }
  461|      1|    }
  462|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/cli.rs:
    1|       |//! CLI argument structs and command surface (clap-based).
    2|       |//!
    3|       |//! Defines `Cli` and all subcommand enums; contains no business logic.
    4|       |
    5|       |use crate::commands::*;
    6|       |use crate::i18n::{current, Language};
    7|       |use clap::{Parser, Subcommand};
    8|       |
    9|       |/// Common daemon-control options shared across embedding-heavy subcommands.
   10|       |#[derive(clap::Args, Debug, Clone)]
   11|       |pub struct DaemonOpts {
   12|       |    /// Allow the CLI to spawn a background daemon if none is running.
   13|       |    ///
   14|       |    /// Default `true`. Pass `--autostart-daemon=false` to disable.
   15|       |    /// Env var `SQLITE_GRAPHRAG_DAEMON_DISABLE_AUTOSTART=1` is honoured only when this flag is unset.
   16|       |    #[arg(long, default_value_t = true, action = clap::ArgAction::Set)]
   17|       |    pub autostart_daemon: bool,
   18|       |}
   19|       |
   20|       |/// Returns the maximum simultaneous invocations allowed by the CPU heuristic.
   21|      0|fn max_concurrency_ceiling() -> usize {
   22|      0|    std::thread::available_parallelism()
   23|      0|        .map(|n| n.get() * 2)
   24|      0|        .unwrap_or(8)
   25|      0|}
   26|       |
   27|       |#[derive(Copy, Clone, Debug, PartialEq, Eq, clap::ValueEnum)]
   28|       |pub enum GraphExportFormat {
   29|       |    Json,
   30|       |    Dot,
   31|       |    Mermaid,
   32|       |    /// Stream one JSON object per entity, then one per edge, then a summary line.
   33|       |    Ndjson,
   34|       |}
   35|       |
   36|       |#[derive(Parser)]
   37|       |#[command(name = "sqlite-graphrag")]
   38|       |#[command(version)]
   39|       |#[command(about = "Local GraphRAG memory for LLMs in a single SQLite file")]
   40|       |#[command(arg_required_else_help = true)]
   41|       |pub struct Cli {
   42|       |    /// Maximum number of simultaneous CLI invocations allowed (default: 4).
   43|       |    ///
   44|       |    /// Caps the counting semaphore used for CLI concurrency slots. The value must
   45|       |    /// stay within [1, 2×nCPUs]. Values above the ceiling are rejected with exit 2.
   46|       |    #[arg(long, global = true, value_name = "N")]
   47|       |    pub max_concurrency: Option<usize>,
   48|       |
   49|       |    /// Wait up to SECONDS for a free concurrency slot before giving up (exit 75).
   50|       |    ///
   51|       |    /// Useful in retrying agent pipelines: the process polls every 500 ms until a
   52|       |    /// slot opens or the timeout expires. Default: 300s (5 minutes).
   53|       |    #[arg(long, global = true, value_name = "SECONDS")]
   54|       |    pub wait_lock: Option<u64>,
   55|       |
   56|       |    /// Skip the available-memory check before loading the model.
   57|       |    ///
   58|       |    /// Exclusive use in automated tests where real allocation does not occur.
   59|       |    #[arg(long, global = true, hide = true, default_value_t = false)]
   60|       |    pub skip_memory_guard: bool,
   61|       |
   62|       |    /// Language for human-facing stderr messages. Accepts `en` or `pt`.
   63|       |    ///
   64|       |    /// Without the flag, detection falls back to `SQLITE_GRAPHRAG_LANG` and then
   65|       |    /// `LC_ALL`/`LANG`. JSON stdout stays deterministic and identical across
   66|       |    /// languages; only human-facing strings are affected.
   67|       |    #[arg(long, global = true, value_enum, value_name = "LANG")]
   68|       |    pub lang: Option<crate::i18n::Language>,
   69|       |
   70|       |    /// Time zone for `*_iso` fields in JSON output (for example `America/Sao_Paulo`).
   71|       |    ///
   72|       |    /// Accepts any IANA time zone name. Without the flag, it falls back to
   73|       |    /// `SQLITE_GRAPHRAG_DISPLAY_TZ`; if unset, UTC is used. Integer epoch fields
   74|       |    /// are not affected.
   75|       |    #[arg(long, global = true, value_name = "IANA")]
   76|       |    pub tz: Option<chrono_tz::Tz>,
   77|       |
   78|       |    /// Increase logging verbosity (-v=info, -vv=debug, -vvv=trace).
   79|       |    ///
   80|       |    /// Overrides `SQLITE_GRAPHRAG_LOG_LEVEL` env var when present. Logs are emitted
   81|       |    /// to stderr; JSON stdout is unaffected.
   82|       |    #[arg(short = 'v', long, global = true, action = clap::ArgAction::Count)]
   83|       |    pub verbose: u8,
   84|       |
   85|       |    #[command(subcommand)]
   86|       |    pub command: Commands,
   87|       |}
   88|       |
   89|       |#[cfg(test)]
   90|       |mod json_only_format_tests {
   91|       |    use super::Cli;
   92|       |    use clap::Parser;
   93|       |
   94|       |    #[test]
   95|      1|    fn restore_accepts_only_format_json() {
   96|      1|        assert!(Cli::try_parse_from([
   97|      1|            "sqlite-graphrag",
   98|      1|            "restore",
   99|      1|            "--name",
  100|      1|            "mem",
  101|      1|            "--version",
  102|      1|            "1",
  103|      1|            "--format",
  104|      1|            "json",
  105|      1|        ])
  106|      1|        .is_ok());
  107|       |
  108|      1|        assert!(Cli::try_parse_from([
  109|      1|            "sqlite-graphrag",
  110|      1|            "restore",
  111|      1|            "--name",
  112|      1|            "mem",
  113|      1|            "--version",
  114|      1|            "1",
  115|      1|            "--format",
  116|      1|            "text",
  117|      1|        ])
  118|      1|        .is_err());
  119|      1|    }
  120|       |
  121|       |    #[test]
  122|      1|    fn hybrid_search_accepts_only_format_json() {
  123|      1|        assert!(Cli::try_parse_from([
  124|      1|            "sqlite-graphrag",
  125|      1|            "hybrid-search",
  126|      1|            "query",
  127|      1|            "--format",
  128|      1|            "json",
  129|      1|        ])
  130|      1|        .is_ok());
  131|       |
  132|      1|        assert!(Cli::try_parse_from([
  133|      1|            "sqlite-graphrag",
  134|      1|            "hybrid-search",
  135|      1|            "query",
  136|      1|            "--format",
  137|      1|            "markdown",
  138|      1|        ])
  139|      1|        .is_err());
  140|      1|    }
  141|       |
  142|       |    #[test]
  143|      1|    fn remember_recall_rename_vacuum_json_only() {
  144|      1|        assert!(Cli::try_parse_from([
  145|      1|            "sqlite-graphrag",
  146|      1|            "remember",
  147|      1|            "--name",
  148|      1|            "mem",
  149|      1|            "--type",
  150|      1|            "project",
  151|      1|            "--description",
  152|      1|            "desc",
  153|      1|            "--format",
  154|      1|            "json",
  155|      1|        ])
  156|      1|        .is_ok());
  157|      1|        assert!(Cli::try_parse_from([
  158|      1|            "sqlite-graphrag",
  159|      1|            "remember",
  160|      1|            "--name",
  161|      1|            "mem",
  162|      1|            "--type",
  163|      1|            "project",
  164|      1|            "--description",
  165|      1|            "desc",
  166|      1|            "--format",
  167|      1|            "text",
  168|      1|        ])
  169|      1|        .is_err());
  170|       |
  171|      1|        assert!(
  172|      1|            Cli::try_parse_from(["sqlite-graphrag", "recall", "query", "--format", "json",])
  173|      1|                .is_ok()
  174|       |        );
  175|      1|        assert!(
  176|      1|            Cli::try_parse_from(["sqlite-graphrag", "recall", "query", "--format", "text",])
  177|      1|                .is_err()
  178|       |        );
  179|       |
  180|      1|        assert!(Cli::try_parse_from([
  181|      1|            "sqlite-graphrag",
  182|      1|            "rename",
  183|      1|            "--name",
  184|      1|            "old",
  185|      1|            "--new-name",
  186|      1|            "new",
  187|      1|            "--format",
  188|      1|            "json",
  189|      1|        ])
  190|      1|        .is_ok());
  191|      1|        assert!(Cli::try_parse_from([
  192|      1|            "sqlite-graphrag",
  193|      1|            "rename",
  194|      1|            "--name",
  195|      1|            "old",
  196|      1|            "--new-name",
  197|      1|            "new",
  198|      1|            "--format",
  199|      1|            "markdown",
  200|      1|        ])
  201|      1|        .is_err());
  202|       |
  203|      1|        assert!(Cli::try_parse_from(["sqlite-graphrag", "vacuum", "--format", "json",]).is_ok());
  204|      1|        assert!(Cli::try_parse_from(["sqlite-graphrag", "vacuum", "--format", "text",]).is_err());
  205|      1|    }
  206|       |}
  207|       |
  208|       |impl Cli {
  209|       |    /// Validates concurrency flags and returns a localised descriptive error if invalid.
  210|       |    ///
  211|       |    /// Requires that `crate::i18n::init()` has already been called (happens before this
  212|       |    /// function in the `main` flow). In English it emits EN messages; in Portuguese it emits PT.
  213|      0|    pub fn validate_flags(&self) -> Result<(), String> {
  214|      0|        if let Some(n) = self.max_concurrency {
  215|      0|            if n == 0 {
  216|      0|                return Err(match current() {
  217|      0|                    Language::English => "--max-concurrency must be >= 1".to_string(),
  218|      0|                    Language::Portuguese => "--max-concurrency deve ser >= 1".to_string(),
  219|       |                });
  220|      0|            }
  221|      0|            let teto = max_concurrency_ceiling();
  222|      0|            if n > teto {
  223|      0|                return Err(match current() {
  224|      0|                    Language::English => format!(
  225|      0|                        "--max-concurrency {n} exceeds the ceiling of {teto} (2×nCPUs) on this system"
  226|       |                    ),
  227|      0|                    Language::Portuguese => format!(
  228|      0|                        "--max-concurrency {n} excede o teto de {teto} (2×nCPUs) neste sistema"
  229|       |                    ),
  230|       |                });
  231|      0|            }
  232|      0|        }
  233|      0|        Ok(())
  234|      0|    }
  235|       |}
  236|       |
  237|       |impl Commands {
  238|       |    /// Returns true for subcommands that load the ONNX model locally.
  239|      5|    pub fn is_embedding_heavy(&self) -> bool {
  240|      1|        matches!(
  241|      5|            self,
  242|       |            Self::Init(_)
  243|       |                | Self::Remember(_)
  244|       |                | Self::RememberBatch(_)
  245|       |                | Self::Recall(_)
  246|       |                | Self::HybridSearch(_)
  247|       |                | Self::DeepResearch(_)
  248|       |        )
  249|      5|    }
  250|       |
  251|      0|    pub fn uses_cli_slot(&self) -> bool {
  252|      0|        !matches!(self, Self::Daemon(_))
  253|      0|    }
  254|       |}
  255|       |
  256|       |#[derive(Subcommand)]
  257|       |pub enum Commands {
  258|       |    /// Initialize database and download embedding model
  259|       |    #[command(after_long_help = "EXAMPLES:\n  \
  260|       |        # Initialize in current directory (default behavior)\n  \
  261|       |        sqlite-graphrag init\n\n  \
  262|       |        # Initialize at a specific path\n  \
  263|       |        sqlite-graphrag init --db /path/to/graphrag.sqlite\n\n  \
  264|       |        # Initialize using SQLITE_GRAPHRAG_HOME env var\n  \
  265|       |        SQLITE_GRAPHRAG_HOME=/data sqlite-graphrag init\n\n\
  266|       |        NOTES:\n  \
  267|       |        - `init` is OPTIONAL: any subsequent CRUD command auto-initializes graphrag.sqlite if missing.\n  \
  268|       |        - As a side effect, `init` warms a smoke-test embedding which auto-spawns the persistent daemon (~600s idle timeout).")]
  269|       |    Init(init::InitArgs),
  270|       |    /// Run or control the persistent embedding daemon
  271|       |    Daemon(daemon::DaemonArgs),
  272|       |    /// Save a memory with optional entity graph
  273|       |    #[command(after_long_help = "EXAMPLES:\n  \
  274|       |        # Inline body\n  \
  275|       |        sqlite-graphrag remember --name onboarding --type user --description \"intro\" --body \"hello\"\n\n  \
  276|       |        # Body from file\n  \
  277|       |        sqlite-graphrag remember --name doc1 --type document --description \"...\" --body-file ./README.md\n\n  \
  278|       |        # Body from stdin (pipe)\n  \
  279|       |        cat README.md | sqlite-graphrag remember --name doc1 --type document --description \"...\" --body-stdin\n\n  \
  280|       |        # Enable GLiNER entity extraction (disabled by default)\n  \
  281|       |        sqlite-graphrag remember --name rich --type note --description \"...\" --body \"...\" --enable-ner")]
  282|       |    Remember(remember::RememberArgs),
  283|       |    /// Batch-create memories from NDJSON stdin (one invocation, one slot)
  284|       |    #[command(after_long_help = "EXAMPLES:\n  \
  285|       |        # Batch create from NDJSON\n  \
  286|       |        cat memories.ndjson | sqlite-graphrag remember-batch --force-merge --json\n\n  \
  287|       |        # Atomic batch\n  \
  288|       |        cat memories.ndjson | sqlite-graphrag remember-batch --transaction --json")]
  289|       |    RememberBatch(remember_batch::RememberBatchArgs),
  290|       |    /// Bulk-ingest every file under a directory as separate memories (NDJSON output)
  291|       |    Ingest(ingest::IngestArgs),
  292|       |    /// Search memories semantically
  293|       |    #[command(after_long_help = "EXAMPLES:\n  \
  294|       |        # Top 10 semantic matches (default)\n  \
  295|       |        sqlite-graphrag recall \"agent memory\"\n\n  \
  296|       |        # Top 3 only\n  \
  297|       |        sqlite-graphrag recall \"agent memory\" -k 3\n\n  \
  298|       |        # Search across all namespaces\n  \
  299|       |        sqlite-graphrag recall \"agent memory\" --all-namespaces\n\n  \
  300|       |        # Disable graph traversal (vector-only)\n  \
  301|       |        sqlite-graphrag recall \"agent memory\" --no-graph")]
  302|       |    Recall(recall::RecallArgs),
  303|       |    /// Read a memory by exact name
  304|       |    Read(read::ReadArgs),
  305|       |    /// List memories with filters
  306|       |    List(list::ListArgs),
  307|       |    /// Soft-delete a memory
  308|       |    Forget(forget::ForgetArgs),
  309|       |    /// Permanently delete soft-deleted memories
  310|       |    Purge(purge::PurgeArgs),
  311|       |    /// Rename a memory preserving history
  312|       |    Rename(rename::RenameArgs),
  313|       |    /// Edit a memory's body or description
  314|       |    Edit(edit::EditArgs),
  315|       |    /// List all versions of a memory
  316|       |    History(history::HistoryArgs),
  317|       |    /// Restore a memory to a previous version
  318|       |    Restore(restore::RestoreArgs),
  319|       |    /// Search using hybrid vector + full-text search
  320|       |    #[command(after_long_help = "EXAMPLES:\n  \
  321|       |        # Hybrid search combining KNN + FTS5 BM25 with RRF\n  \
  322|       |        sqlite-graphrag hybrid-search \"agent memory architecture\"\n\n  \
  323|       |        # Custom weights for vector vs full-text components\n  \
  324|       |        sqlite-graphrag hybrid-search \"agent\" --weight-vec 0.7 --weight-fts 0.3")]
  325|       |    HybridSearch(hybrid_search::HybridSearchArgs),
  326|       |    /// Show database health
  327|       |    Health(health::HealthArgs),
  328|       |    /// Apply pending schema migrations
  329|       |    Migrate(migrate::MigrateArgs),
  330|       |    /// Resolve namespace precedence for the current invocation
  331|       |    NamespaceDetect(namespace_detect::NamespaceDetectArgs),
  332|       |    /// Run PRAGMA optimize on the database
  333|       |    Optimize(optimize::OptimizeArgs),
  334|       |    /// Show database statistics
  335|       |    Stats(stats::StatsArgs),
  336|       |    /// Create a checkpointed copy safe for file sync
  337|       |    SyncSafeCopy(sync_safe_copy::SyncSafeCopyArgs),
  338|       |    /// Back up the database using the SQLite Online Backup API
  339|       |    Backup(backup::BackupArgs),
  340|       |    /// Run VACUUM after checkpointing the WAL
  341|       |    Vacuum(vacuum::VacuumArgs),
  342|       |    /// Create an explicit relationship between two entities
  343|       |    Link(link::LinkArgs),
  344|       |    /// Remove a specific relationship between two entities
  345|       |    Unlink(unlink::UnlinkArgs),
  346|       |    /// Deep parallel multi-hop GraphRAG research
  347|       |    #[command(name = "deep-research")]
  348|       |    DeepResearch(deep_research::DeepResearchArgs),
  349|       |    /// List memories connected via the entity graph
  350|       |    Related(related::RelatedArgs),
  351|       |    /// Export a graph snapshot in json, dot or mermaid
  352|       |    Graph(graph_export::GraphArgs),
  353|       |    /// Export memories as NDJSON (one JSON line per memory, plus a summary line)
  354|       |    Export(export::ExportArgs),
  355|       |    /// FTS5 full-text search index management (rebuild or check)
  356|       |    Fts(fts::FtsArgs),
  357|       |    /// Vector index maintenance (orphan detection, purge, stats) — G39
  358|       |    Vec(vec::VecArgs),
  359|       |    /// List codex OAuth models accepted by ChatGPT Pro (G33).
  360|       |    #[command(name = "codex-models")]
  361|       |    CodexModels,
  362|       |    /// Bulk-delete all relationships of a given type (e.g. mentions)
  363|       |    PruneRelations(prune_relations::PruneRelationsArgs),
  364|       |    /// Remove NER bindings (memory_entities rows) for an entity or all entities
  365|       |    #[command(name = "prune-ner")]
  366|       |    PruneNer(prune_ner::PruneNerArgs),
  367|       |    /// Remove entities that have no memories and no relationships
  368|       |    CleanupOrphans(cleanup_orphans::CleanupOrphansArgs),
  369|       |    /// List entities linked to a specific memory
  370|       |    MemoryEntities(memory_entities::MemoryEntitiesArgs),
  371|       |    /// Manage cached resources (embedding models, etc.)
  372|       |    Cache(cache::CacheArgs),
  373|       |    /// Delete an entity and all its relationships from the graph
  374|       |    #[command(name = "delete-entity")]
  375|       |    DeleteEntity(delete_entity::DeleteEntityArgs),
  376|       |    /// Reclassify one entity or a batch of entities to a new type
  377|       |    Reclassify(reclassify::ReclassifyArgs),
  378|       |    /// Rename an entity preserving all relationships and memory bindings
  379|       |    #[command(name = "rename-entity")]
  380|       |    RenameEntity(rename_entity::RenameEntityArgs),
  381|       |    /// Merge multiple source entities into a single target entity
  382|       |    #[command(name = "merge-entities")]
  383|       |    MergeEntities(merge_entities::MergeEntitiesArgs),
  384|       |    /// Enrich graph memories and entities using an LLM provider
  385|       |    Enrich(enrich::EnrichArgs),
  386|       |    /// Reclassify relationship types across the graph using rules or LLM judgment
  387|       |    #[command(name = "reclassify-relation")]
  388|       |    ReclassifyRelation(reclassify_relation::ReclassifyRelationArgs),
  389|       |    /// Normalize entity names (deduplicate, kebab-case, merge near-duplicates)
  390|       |    #[command(name = "normalize-entities")]
  391|       |    NormalizeEntities(normalize_entities::NormalizeEntitiesArgs),
  392|       |    /// Generate shell completions for Bash, Zsh, Fish, PowerShell, or Elvish
  393|       |    Completions(completions::CompletionsArgs),
  394|       |    #[command(name = "debug-schema", hide = true)]
  395|       |    DebugSchema(debug_schema::DebugSchemaArgs),
  396|       |}
  397|       |
  398|       |#[derive(Copy, Clone, Debug, Default, clap::ValueEnum)]
  399|       |pub enum MemoryType {
  400|       |    User,
  401|       |    Feedback,
  402|       |    Project,
  403|       |    Reference,
  404|       |    Decision,
  405|       |    Incident,
  406|       |    Skill,
  407|       |    #[default]
  408|       |    Document,
  409|       |    Note,
  410|       |}
  411|       |
  412|       |#[cfg(test)]
  413|       |mod heavy_concurrency_tests {
  414|       |    use super::*;
  415|       |
  416|       |    #[test]
  417|      1|    fn command_heavy_detects_init_and_embeddings() {
  418|      1|        let init = Cli::try_parse_from(["sqlite-graphrag", "init"]).expect("parse init");
  419|      1|        assert!(init.command.is_embedding_heavy());
  420|       |
  421|      1|        let remember = Cli::try_parse_from([
  422|      1|            "sqlite-graphrag",
  423|      1|            "remember",
  424|      1|            "--name",
  425|      1|            "test-memory",
  426|      1|            "--type",
  427|      1|            "project",
  428|      1|            "--description",
  429|      1|            "desc",
  430|      1|        ])
  431|      1|        .expect("parse remember");
  432|      1|        assert!(remember.command.is_embedding_heavy());
  433|       |
  434|      1|        let recall =
  435|      1|            Cli::try_parse_from(["sqlite-graphrag", "recall", "query"]).expect("parse recall");
  436|      1|        assert!(recall.command.is_embedding_heavy());
  437|       |
  438|      1|        let hybrid = Cli::try_parse_from(["sqlite-graphrag", "hybrid-search", "query"])
  439|      1|            .expect("parse hybrid");
  440|      1|        assert!(hybrid.command.is_embedding_heavy());
  441|      1|    }
  442|       |
  443|       |    #[test]
  444|      1|    fn command_light_does_not_mark_stats() {
  445|      1|        let stats = Cli::try_parse_from(["sqlite-graphrag", "stats"]).expect("parse stats");
  446|      1|        assert!(!stats.command.is_embedding_heavy());
  447|      1|    }
  448|       |}
  449|       |
  450|       |impl MemoryType {
  451|      0|    pub fn as_str(&self) -> &'static str {
  452|      0|        match self {
  453|      0|            Self::User => "user",
  454|      0|            Self::Feedback => "feedback",
  455|      0|            Self::Project => "project",
  456|      0|            Self::Reference => "reference",
  457|      0|            Self::Decision => "decision",
  458|      0|            Self::Incident => "incident",
  459|      0|            Self::Skill => "skill",
  460|      0|            Self::Document => "document",
  461|      0|            Self::Note => "note",
  462|       |        }
  463|      0|    }
  464|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/backup.rs:
    1|       |//! Handler for the `backup` CLI subcommand.
    2|       |//!
    3|       |//! Uses the SQLite Online Backup API (via rusqlite) to produce a consistent
    4|       |//! point-in-time copy of the database file even while the database is in use.
    5|       |
    6|       |use crate::errors::AppError;
    7|       |use crate::output;
    8|       |use crate::paths::AppPaths;
    9|       |use crate::storage::connection::open_ro;
   10|       |use serde::Serialize;
   11|       |use std::path::PathBuf;
   12|       |use tempfile::NamedTempFile;
   13|       |
   14|       |/// Default number of pages copied per backup step.
   15|       |///
   16|       |/// G38: the previous default of 100 pages with 50 ms sleep between steps
   17|       |/// was the dominant cost on large databases (4.3 GB took ~9 minutes purely
   18|       |/// on sleep). 1000 pages × 5 ms is ~25× faster on a 4.3 GB database while
   19|       |/// remaining gentle on SSD I/O. Override with `--backup-step-size`.
   20|       |const DEFAULT_BACKUP_STEP_PAGES: usize = 1000;
   21|       |const DEFAULT_BACKUP_STEP_SLEEP_MS: u64 = 5;
   22|       |
   23|       |#[derive(clap::Args)]
   24|       |#[command(after_long_help = "EXAMPLES:\n  \
   25|       |    # Back up the default database to a specific path\n  \
   26|       |    sqlite-graphrag backup --output /backup/graphrag-$(date +%F).sqlite\n\n  \
   27|       |    # Back up a custom source database\n  \
   28|       |    sqlite-graphrag backup --db /data/graphrag.sqlite --output /backup/snapshot.sqlite\n\n  \
   29|       |    # Tuned for a 4.3 GB database on local SSD\n  \
   30|       |    sqlite-graphrag backup --output /backup/snap.sqlite --backup-step-size 2000 --backup-step-sleep-ms 2\n\n  \
   31|       |    # Maximum throughput (no sleep between steps — risks I/O contention)\n  \
   32|       |    sqlite-graphrag backup --output /backup/snap.sqlite --backup-no-sleep\n\n  \
   33|       |NOTES:\n  \
   34|       |    Uses the SQLite Online Backup API: safe to run while the database is in use.\n  \
   35|       |    The destination is written atomically via tempfile-rename in the same directory.\n  \
   36|       |    If the process is interrupted, the previous file (if any) remains intact.\n  \
   37|       |    On Unix the destination is chmod 0600 after the backup completes.")]
   38|       |pub struct BackupArgs {
   39|       |    /// Destination path for the backup file. Required.
   40|       |    #[arg(long, value_name = "PATH")]
   41|       |    pub output: PathBuf,
   42|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   43|       |    pub json: bool,
   44|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   45|       |    pub db: Option<String>,
   46|       |    /// Number of pages copied per backup step. Default: 1000 (was 100 before v1.0.69).
   47|       |    /// Larger values finish faster on local SSD but may contend on NFS.
   48|       |    #[arg(long, value_name = "PAGES", default_value_t = DEFAULT_BACKUP_STEP_PAGES)]
   49|       |    pub backup_step_size: usize,
   50|       |    /// Sleep duration in milliseconds between backup steps. Default: 5 (was 50 before v1.0.69).
   51|       |    /// Ignored when --backup-no-sleep is set.
   52|       |    #[arg(long, value_name = "MILLIS", default_value_t = DEFAULT_BACKUP_STEP_SLEEP_MS)]
   53|       |    pub backup_step_sleep_ms: u64,
   54|       |    /// Disable the inter-step sleep entirely. Maximum throughput, but risks
   55|       |    /// starving concurrent I/O on shared storage.
   56|       |    #[arg(long, default_value_t = false)]
   57|       |    pub backup_no_sleep: bool,
   58|       |    /// Emit a progress line to stderr every N pages (G38 observability).
   59|       |    /// Default: 100 (every 100 pages = ~400 KB). Set to 0 to disable.
   60|       |    #[arg(long, value_name = "PAGES", default_value_t = 100)]
   61|       |    pub backup_progress: i32,
   62|       |}
   63|       |
   64|       |#[derive(Serialize)]
   65|       |struct BackupResponse {
   66|       |    action: String,
   67|       |    source: String,
   68|       |    destination: String,
   69|       |    size_bytes: u64,
   70|       |    elapsed_ms: u64,
   71|       |    pages_copied: Option<i64>,
   72|       |    step_size: usize,
   73|       |}
   74|       |
   75|      0|pub fn run(args: BackupArgs) -> Result<(), AppError> {
   76|      0|    let start = std::time::Instant::now();
   77|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
   78|       |
   79|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
   80|       |
   81|       |    // Validate: destination must differ from source.
   82|      0|    if args.output == paths.db {
   83|      0|        return Err(AppError::Validation(
   84|      0|            "destination path must differ from the source database path".to_string(),
   85|      0|        ));
   86|      0|    }
   87|       |
   88|       |    // Create parent directories if necessary.
   89|      0|    let parent = args.output.parent().unwrap_or(std::path::Path::new("."));
   90|      0|    if !parent.as_os_str().is_empty() {
   91|      0|        std::fs::create_dir_all(parent)?;
   92|      0|    }
   93|       |
   94|       |    // Atomic write: backup to tempfile in the SAME directory, then rename.
   95|      0|    let temp = NamedTempFile::new_in(parent).map_err(AppError::Io)?;
   96|      0|    let temp_path = temp.path().to_path_buf();
   97|       |
   98|      0|    let src_conn = open_ro(&paths.db)?;
   99|      0|    let mut dst_conn = rusqlite::Connection::open(&temp_path)?;
  100|       |
  101|      0|    let step_size = args.backup_step_size.max(1);
  102|      0|    let sleep = if args.backup_no_sleep {
  103|      0|        std::time::Duration::ZERO
  104|       |    } else {
  105|      0|        std::time::Duration::from_millis(args.backup_step_sleep_ms)
  106|       |    };
  107|       |
  108|      0|    let pages_copied: Option<i64> = {
  109|      0|        let backup = rusqlite::backup::Backup::new(&src_conn, &mut dst_conn)?;
  110|       |        // G38: drive the backup in a manual step() loop so we can emit
  111|       |        // per-step progress events without depending on a Copy closure
  112|       |        // (which the rusqlite Progress callback requires). The loop
  113|       |        // mirrors run_to_completion but exposes progress for observability.
  114|      0|        let step_size_i32: i32 = step_size.try_into().unwrap_or(1000);
  115|      0|        let progress_every = args.backup_progress.max(1);
  116|      0|        let mut last_emit_pages: i32 = -1;
  117|       |        loop {
  118|       |            use rusqlite::backup::StepResult;
  119|      0|            match backup.step(step_size_i32) {
  120|       |                Ok(StepResult::More) => {
  121|       |                    // step returned More: backup still in progress.
  122|      0|                    if progress_every > 0 {
  123|      0|                        let p = backup.progress();
  124|      0|                        let copied = p.pagecount - p.remaining;
  125|      0|                        if copied > 0 && copied - last_emit_pages >= progress_every {
  126|      0|                            last_emit_pages = copied;
  127|      0|                            let percent = if p.pagecount > 0 {
  128|      0|                                (copied as f64 / p.pagecount as f64) * 100.0
  129|       |                            } else {
  130|      0|                                100.0
  131|       |                            };
  132|      0|                            eprintln!(
  133|      0|                                "{{\"progress\":{{\"pages_copied\":{copied},\"total_pages\":{pc},\"percent\":{pct:.2}}}}}",
  134|       |                                pc = p.pagecount,
  135|       |                                pct = percent
  136|       |                            );
  137|      0|                        }
  138|      0|                    }
  139|      0|                    if !sleep.is_zero() {
  140|      0|                        std::thread::sleep(sleep);
  141|      0|                    }
  142|       |                }
  143|      0|                Ok(StepResult::Done) => break, // backup complete
  144|      0|                Ok(_) => {
  145|      0|                    // Transient (Busy / Locked on newer rusqlite or any
  146|      0|                    // future non-exhaustive variant): retry after backoff.
  147|      0|                    std::thread::sleep(std::time::Duration::from_millis(50));
  148|      0|                }
  149|      0|                Err(e) => return Err(AppError::Database(e)),
  150|       |            }
  151|       |        }
  152|       |        // `Progress { remaining, pagecount }` (see rusqlite::backup::Progress):
  153|       |        // pages already copied = pagecount - remaining.
  154|      0|        let progress = backup.progress();
  155|      0|        let copied = (progress.pagecount - progress.remaining).max(0);
  156|      0|        Some(copied as i64)
  157|       |    };
  158|      0|    drop(dst_conn);
  159|       |
  160|      0|    temp.persist(&args.output)
  161|      0|        .map_err(|e| AppError::Io(e.error))?;
  162|       |
  163|       |    // Apply 0600 permissions on Unix to prevent leakage in shared directories.
  164|       |    #[cfg(unix)]
  165|       |    {
  166|       |        use std::os::unix::fs::PermissionsExt;
  167|      0|        if let Ok(meta) = std::fs::metadata(&args.output) {
  168|      0|            let mut perms = meta.permissions();
  169|      0|            perms.set_mode(0o600);
  170|      0|            if let Err(e) = std::fs::set_permissions(&args.output, perms) {
  171|      0|                tracing::warn!(target: "backup",
  172|      0|                    path = %args.output.display(),
  173|       |                    error = %e,
  174|      0|                    "failed to set 0600 permissions on backup file"
  175|       |                );
  176|      0|            }
  177|      0|        }
  178|       |    }
  179|       |    #[cfg(windows)]
  180|       |    {
  181|       |        tracing::debug!(target: "backup",
  182|       |            path = %args.output.display(),
  183|       |            "skipping Unix mode 0o600 on Windows; NTFS DACL default is private-to-user"
  184|       |        );
  185|       |    }
  186|       |
  187|      0|    let size_bytes = std::fs::metadata(&args.output)
  188|      0|        .map(|m| m.len())
  189|      0|        .unwrap_or(0);
  190|       |
  191|      0|    output::emit_json(&BackupResponse {
  192|      0|        action: "backed_up".to_string(),
  193|      0|        source: paths.db.display().to_string(),
  194|      0|        destination: args.output.display().to_string(),
  195|      0|        size_bytes,
  196|      0|        elapsed_ms: start.elapsed().as_millis() as u64,
  197|      0|        pages_copied,
  198|      0|        step_size,
  199|      0|    })?;
  200|       |
  201|      0|    Ok(())
  202|      0|}
  203|       |
  204|       |#[cfg(test)]
  205|       |mod tests {
  206|       |    use super::*;
  207|       |
  208|       |    #[test]
  209|      1|    fn backup_response_serializes_all_fields() {
  210|      1|        let resp = BackupResponse {
  211|      1|            action: "backed_up".to_string(),
  212|      1|            source: "/data/graphrag.sqlite".to_string(),
  213|      1|            destination: "/backup/snapshot.sqlite".to_string(),
  214|      1|            size_bytes: 32768,
  215|      1|            elapsed_ms: 42,
  216|      1|            pages_copied: Some(512),
  217|      1|            step_size: 1000,
  218|      1|        };
  219|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  220|      1|        assert_eq!(json["action"], "backed_up");
  221|      1|        assert_eq!(json["source"], "/data/graphrag.sqlite");
  222|      1|        assert_eq!(json["destination"], "/backup/snapshot.sqlite");
  223|      1|        assert_eq!(json["size_bytes"], 32768u64);
  224|      1|        assert_eq!(json["elapsed_ms"], 42u64);
  225|      1|        assert_eq!(json["step_size"], 1000usize);
  226|      1|        assert_eq!(json["pages_copied"], 512i64);
  227|      1|    }
  228|       |
  229|       |    #[test]
  230|      1|    fn backup_response_action_is_backed_up() {
  231|      1|        let resp = BackupResponse {
  232|      1|            action: "backed_up".to_string(),
  233|      1|            source: "/a.sqlite".to_string(),
  234|      1|            destination: "/b.sqlite".to_string(),
  235|      1|            size_bytes: 0,
  236|      1|            elapsed_ms: 0,
  237|      1|            pages_copied: None,
  238|      1|            step_size: 1000,
  239|      1|        };
  240|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  241|      1|        assert_eq!(
  242|      1|            json["action"], "backed_up",
  243|      0|            "action must always be 'backed_up'"
  244|       |        );
  245|      1|    }
  246|       |
  247|       |    #[test]
  248|      1|    fn backup_rejects_destination_equal_to_source() {
  249|       |        // Simulate the guard without a real DB.
  250|      1|        let src = PathBuf::from("/tmp/graphrag.sqlite");
  251|      1|        let dst = PathBuf::from("/tmp/graphrag.sqlite");
  252|      1|        let result: Result<(), AppError> = if dst == src {
  253|      1|            Err(AppError::Validation(
  254|      1|                "destination path must differ from the source database path".to_string(),
  255|      1|            ))
  256|       |        } else {
  257|      0|            Ok(())
  258|       |        };
  259|      1|        assert!(
  260|      1|            result.is_err(),
  261|      0|            "must reject identical source and destination"
  262|       |        );
  263|      1|        if let Err(AppError::Validation(msg)) = result {
  264|      1|            assert!(msg.contains("destination path must differ"));
  265|      0|        }
  266|      1|    }
  267|       |
  268|       |    #[test]
  269|      1|    fn backup_response_size_bytes_zero_is_valid() {
  270|      1|        let resp = BackupResponse {
  271|      1|            action: "backed_up".to_string(),
  272|      1|            source: "/a.sqlite".to_string(),
  273|      1|            destination: "/b.sqlite".to_string(),
  274|      1|            size_bytes: 0,
  275|      1|            elapsed_ms: 1,
  276|      1|            pages_copied: Some(0),
  277|      1|            step_size: 1000,
  278|      1|        };
  279|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  280|      1|        assert!(json["size_bytes"].as_u64().is_some());
  281|      1|    }
  282|       |
  283|       |    #[test]
  284|      1|    fn backup_default_step_size_is_one_thousand() {
  285|       |        // G38: the historical default of 100 pages caused backups of 4.3 GB
  286|       |        // databases to take 9 minutes solely on sleep. The new default of
  287|       |        // 1000 pages with 5 ms sleep gives ~25x speedup.
  288|      1|        assert_eq!(DEFAULT_BACKUP_STEP_PAGES, 1000);
  289|      1|        assert_eq!(DEFAULT_BACKUP_STEP_SLEEP_MS, 5);
  290|      1|    }
  291|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/cache.rs:
    1|       |//! Handler for the `cache` CLI subcommand and its nested operations.
    2|       |//!
    3|       |//! Manages cached resources such as the multilingual-e5-small ONNX model and
    4|       |//! the GLiNER NER classifier downloaded into the XDG cache directory on first
    5|       |//! `init`. Used to reclaim disk space or recover from corrupted cache state.
    6|       |
    7|       |use crate::errors::AppError;
    8|       |use crate::output;
    9|       |use crate::paths::AppPaths;
   10|       |use serde::Serialize;
   11|       |
   12|       |#[derive(clap::Args)]
   13|       |#[command(after_long_help = "EXAMPLES:\n  \
   14|       |    # Remove cached embedding/NER model files (forces re-download on next init)\n  \
   15|       |    sqlite-graphrag cache clear-models\n\n  \
   16|       |    # Skip the confirmation prompt\n  \
   17|       |    sqlite-graphrag cache clear-models --yes\n\n  \
   18|       |    # List cached model files\n  \
   19|       |    sqlite-graphrag cache list\n\n  \
   20|       |    # List cached model files as JSON\n  \
   21|       |    sqlite-graphrag cache list --json")]
   22|       |pub struct CacheArgs {
   23|       |    #[command(subcommand)]
   24|       |    pub command: CacheCommands,
   25|       |}
   26|       |
   27|       |#[derive(clap::Subcommand)]
   28|       |pub enum CacheCommands {
   29|       |    /// Remove cached embedding/NER model files (forces re-download on next `init`).
   30|       |    ClearModels(ClearModelsArgs),
   31|       |    /// List cached embedding/NER model files with sizes and total disk usage.
   32|       |    List(CacheListArgs),
   33|       |}
   34|       |
   35|       |#[derive(clap::Args)]
   36|       |pub struct CacheListArgs {
   37|       |    /// Output as JSON.
   38|       |    #[arg(long)]
   39|       |    pub json: bool,
   40|       |}
   41|       |
   42|       |#[derive(clap::Args)]
   43|       |pub struct ClearModelsArgs {
   44|       |    /// Skip confirmation prompt and proceed with deletion immediately.
   45|       |    #[arg(long, default_value_t = false, help = "Skip confirmation prompt")]
   46|       |    pub yes: bool,
   47|       |    /// Output format: json (default), text, or markdown.
   48|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   49|       |    pub json: bool,
   50|       |}
   51|       |
   52|       |#[derive(Serialize)]
   53|       |struct ClearModelsResponse {
   54|       |    cache_path: String,
   55|       |    existed: bool,
   56|       |    bytes_freed: u64,
   57|       |    files_removed: usize,
   58|       |    /// Total execution time in milliseconds from handler start to serialisation.
   59|       |    elapsed_ms: u64,
   60|       |}
   61|       |
   62|      0|pub fn run(args: CacheArgs) -> Result<(), AppError> {
   63|      0|    match args.command {
   64|      0|        CacheCommands::ClearModels(a) => clear_models(a),
   65|      0|        CacheCommands::List(a) => run_list(a),
   66|       |    }
   67|      0|}
   68|       |
   69|      1|fn clear_models(args: ClearModelsArgs) -> Result<(), AppError> {
   70|      1|    let inicio = std::time::Instant::now();
   71|       |    // Resolve the canonical models directory through AppPaths to honour
   72|       |    // SQLITE_GRAPHRAG_CACHE_DIR overrides used by tests and CI.
   73|      1|    let paths = AppPaths::resolve(None)?;
                                                     ^0
   74|      1|    let models_dir = paths.models.clone();
   75|       |
   76|      1|    if !args.yes {
   77|       |        // For machine consumption stay deterministic: refuse without --yes.
   78|      1|        return Err(AppError::Validation(
   79|      1|            "destructive operation: pass --yes to confirm cache deletion".to_string(),
   80|      1|        ));
   81|      0|    }
   82|       |
   83|      0|    let existed = models_dir.exists();
   84|      0|    let mut bytes_freed: u64 = 0;
   85|      0|    let mut files_removed: usize = 0;
   86|       |
   87|      0|    if existed {
   88|      0|        bytes_freed = dir_size(&models_dir).unwrap_or(0);
   89|      0|        files_removed = count_files(&models_dir).unwrap_or(0);
   90|      0|        std::fs::remove_dir_all(&models_dir)?;
   91|      0|    }
   92|       |
   93|      0|    output::emit_json(&ClearModelsResponse {
   94|      0|        cache_path: models_dir.display().to_string(),
   95|      0|        existed,
   96|      0|        bytes_freed,
   97|      0|        files_removed,
   98|      0|        elapsed_ms: inicio.elapsed().as_millis() as u64,
   99|      0|    })?;
  100|       |
  101|      0|    Ok(())
  102|      1|}
  103|       |
  104|       |#[derive(Serialize)]
  105|       |struct CacheFileEntry {
  106|       |    name: String,
  107|       |    path: String,
  108|       |    size_bytes: u64,
  109|       |    modified_at: String,
  110|       |}
  111|       |
  112|       |#[derive(Serialize)]
  113|       |struct CacheListResponse {
  114|       |    schema_version: u32,
  115|       |    cache_path: String,
  116|       |    files: Vec<CacheFileEntry>,
  117|       |    total_bytes: u64,
  118|       |    total_human: String,
  119|       |}
  120|       |
  121|      0|fn format_bytes_human(bytes: u64) -> String {
  122|       |    const KB: u64 = 1024;
  123|       |    const MB: u64 = KB * 1024;
  124|       |    const GB: u64 = MB * 1024;
  125|      0|    if bytes >= GB {
  126|      0|        format!("{:.1} GB", bytes as f64 / GB as f64)
  127|      0|    } else if bytes >= MB {
  128|      0|        format!("{:.1} MB", bytes as f64 / MB as f64)
  129|      0|    } else if bytes >= KB {
  130|      0|        format!("{:.1} KB", bytes as f64 / KB as f64)
  131|       |    } else {
  132|      0|        format!("{bytes} B")
  133|       |    }
  134|      0|}
  135|       |
  136|      0|fn collect_cache_files(
  137|      0|    dir: &std::path::Path,
  138|      0|    base: &std::path::Path,
  139|      0|    entries: &mut Vec<CacheFileEntry>,
  140|      0|) -> std::io::Result<()> {
  141|      0|    for entry in std::fs::read_dir(dir)? {
  142|      0|        let entry = entry?;
  143|      0|        let meta = entry.metadata()?;
  144|      0|        let path = entry.path();
  145|      0|        if meta.is_dir() {
  146|      0|            collect_cache_files(&path, base, entries)?;
  147|       |        } else {
  148|      0|            let size_bytes = meta.len();
  149|      0|            let relative = path.strip_prefix(base).unwrap_or(&path);
  150|      0|            let name = relative.to_string_lossy().into_owned();
  151|      0|            let modified_at = meta
  152|      0|                .modified()
  153|      0|                .ok()
  154|      0|                .map(|t| {
  155|      0|                    let secs = t
  156|      0|                        .duration_since(std::time::UNIX_EPOCH)
  157|      0|                        .unwrap_or_default()
  158|      0|                        .as_secs();
  159|       |                    // Format as RFC 3339 (UTC) without chrono dependency.
  160|      0|                    let secs_i64 = secs as i64;
  161|      0|                    let (y, mo, d, h, mi, s) = epoch_to_ymd_hms(secs_i64);
  162|      0|                    format!("{y:04}-{mo:02}-{d:02}T{h:02}:{mi:02}:{s:02}Z")
  163|      0|                })
  164|      0|                .unwrap_or_else(|| "unknown".to_string());
  165|      0|            entries.push(CacheFileEntry {
  166|      0|                name,
  167|      0|                path: path.display().to_string(),
  168|      0|                size_bytes,
  169|      0|                modified_at,
  170|      0|            });
  171|       |        }
  172|       |    }
  173|      0|    Ok(())
  174|      0|}
  175|       |
  176|       |/// Converts Unix epoch seconds to (year, month, day, hour, minute, second) UTC.
  177|      0|fn epoch_to_ymd_hms(secs: i64) -> (i32, u8, u8, u8, u8, u8) {
  178|      0|    let s = (secs % 60) as u8;
  179|      0|    let total_min = secs / 60;
  180|      0|    let mi = (total_min % 60) as u8;
  181|      0|    let total_h = total_min / 60;
  182|      0|    let h = (total_h % 24) as u8;
  183|      0|    let mut days = total_h / 24;
  184|       |    // Compute year/month/day from days since epoch (1970-01-01).
  185|      0|    let mut y = 1970i32;
  186|       |    loop {
  187|      0|        let days_in_y = if is_leap(y) { 366 } else { 365 };
  188|      0|        if days < days_in_y {
  189|      0|            break;
  190|      0|        }
  191|      0|        days -= days_in_y;
  192|      0|        y += 1;
  193|       |    }
  194|      0|    let leap = is_leap(y);
  195|      0|    let months = [
  196|       |        31u8,
  197|      0|        if leap { 29 } else { 28 },
  198|       |        31,
  199|       |        30,
  200|       |        31,
  201|       |        30,
  202|       |        31,
  203|       |        31,
  204|       |        30,
  205|       |        31,
  206|       |        30,
  207|       |        31,
  208|       |    ];
  209|      0|    let mut mo = 1u8;
  210|      0|    for &days_in_m in &months {
  211|      0|        if days < days_in_m as i64 {
  212|      0|            break;
  213|      0|        }
  214|      0|        days -= days_in_m as i64;
  215|      0|        mo += 1;
  216|       |    }
  217|      0|    let d = (days + 1) as u8;
  218|      0|    (y, mo, d, h, mi, s)
  219|      0|}
  220|       |
  221|      0|fn is_leap(y: i32) -> bool {
  222|      0|    (y % 4 == 0 && y % 100 != 0) || y % 400 == 0
  223|      0|}
  224|       |
  225|      0|fn run_list(args: CacheListArgs) -> Result<(), AppError> {
  226|      0|    let paths = AppPaths::resolve(None)?;
  227|      0|    let models_dir = &paths.models;
  228|       |
  229|      0|    let mut entries: Vec<CacheFileEntry> = Vec::with_capacity(4);
  230|      0|    if models_dir.exists() {
  231|      0|        collect_cache_files(models_dir, models_dir, &mut entries).map_err(AppError::Io)?;
  232|      0|    }
  233|       |
  234|      0|    entries.sort_unstable_by(|a, b| a.name.cmp(&b.name));
  235|      0|    let total_bytes: u64 = entries.iter().map(|e| e.size_bytes).sum();
  236|      0|    let total_human = format_bytes_human(total_bytes);
  237|      0|    let n_files = entries.len();
  238|       |
  239|      0|    if args.json {
  240|      0|        output::emit_json(&CacheListResponse {
  241|      0|            schema_version: 1,
  242|      0|            cache_path: models_dir.display().to_string(),
  243|      0|            files: entries,
  244|      0|            total_bytes,
  245|      0|            total_human,
  246|      0|        })?;
  247|      0|    } else if entries.is_empty() {
  248|      0|        output::emit_text("(empty)");
  249|      0|    } else {
  250|      0|        for e in &entries {
  251|      0|            output::emit_text(&format!(
  252|      0|                "{:<40} {:>10}  {}",
  253|      0|                e.name,
  254|      0|                format_bytes_human(e.size_bytes),
  255|      0|                e.modified_at
  256|      0|            ));
  257|      0|        }
  258|      0|        output::emit_text(&format!("\nTOTAL: {n_files} files, {total_human}"));
  259|       |    }
  260|       |
  261|      0|    Ok(())
  262|      0|}
  263|       |
  264|      0|fn dir_size(path: &std::path::Path) -> std::io::Result<u64> {
  265|      0|    let mut total = 0u64;
  266|      0|    for entry in std::fs::read_dir(path)? {
  267|      0|        let entry = entry?;
  268|      0|        let meta = entry.metadata()?;
  269|      0|        if meta.is_dir() {
  270|      0|            total = total.saturating_add(dir_size(&entry.path()).unwrap_or(0));
  271|      0|        } else {
  272|      0|            total = total.saturating_add(meta.len());
  273|      0|        }
  274|       |    }
  275|      0|    Ok(total)
  276|      0|}
  277|       |
  278|      0|fn count_files(path: &std::path::Path) -> std::io::Result<usize> {
  279|      0|    let mut count = 0usize;
  280|      0|    for entry in std::fs::read_dir(path)? {
  281|      0|        let entry = entry?;
  282|      0|        let meta = entry.metadata()?;
  283|      0|        if meta.is_dir() {
  284|      0|            count = count.saturating_add(count_files(&entry.path()).unwrap_or(0));
  285|      0|        } else {
  286|      0|            count += 1;
  287|      0|        }
  288|       |    }
  289|      0|    Ok(count)
  290|      0|}
  291|       |
  292|       |#[cfg(test)]
  293|       |mod tests {
  294|       |    use super::*;
  295|       |
  296|       |    #[test]
  297|      1|    fn clear_models_response_serializes_all_fields() {
  298|      1|        let resp = ClearModelsResponse {
  299|      1|            cache_path: "/tmp/sqlite-graphrag/models".to_string(),
  300|      1|            existed: true,
  301|      1|            bytes_freed: 465_000_000,
  302|      1|            files_removed: 14,
  303|      1|            elapsed_ms: 12,
  304|      1|        };
  305|      1|        let json = serde_json::to_value(&resp).expect("serialization");
  306|      1|        assert_eq!(json["existed"], true);
  307|      1|        assert_eq!(json["bytes_freed"], 465_000_000u64);
  308|      1|        assert_eq!(json["files_removed"], 14);
  309|      1|        assert_eq!(json["elapsed_ms"], 12);
  310|      1|    }
  311|       |
  312|       |    #[test]
  313|      1|    fn clear_models_without_yes_returns_validation_error() {
  314|      1|        let args = ClearModelsArgs {
  315|      1|            yes: false,
  316|      1|            json: false,
  317|      1|        };
  318|      1|        let result = clear_models(args);
  319|      1|        assert!(matches!(result, Err(AppError::Validation(_))));
                              ^0
  320|      1|    }
  321|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/claude_runner.rs:
    1|       |//! Shared module for spawning Claude Code (`claude -p`) subprocesses.
    2|       |//!
    3|       |//! Eliminates duplication between `enrich.rs` and `ingest_claude.rs` (G02).
    4|       |//! Detects `terminal_reason: "max_turns"` in the JSON output (G03).
    5|       |
    6|       |use crate::errors::AppError;
    7|       |use std::path::Path;
    8|       |use std::process::{Command, Stdio};
    9|       |
   10|       |/// Minimum Claude Code version required for structured JSON output.
   11|       |const MIN_CLAUDE_VERSION: &str = "2.1.0";
   12|       |
   13|       |/// Environment variables whitelisted for the subprocess.
   14|       |const ENV_WHITELIST: &[&str] = &[
   15|       |    "PATH",
   16|       |    "HOME",
   17|       |    "USER",
   18|       |    "SHELL",
   19|       |    "TERM",
   20|       |    "LANG",
   21|       |    "XDG_CONFIG_HOME",
   22|       |    "XDG_DATA_HOME",
   23|       |    "XDG_RUNTIME_DIR",
   24|       |    // NOTE: `ANTHROPIC_API_KEY` is INTENTIONALLY ABSENT from this whitelist
   25|       |    // (gaps.md:47). The OAuth-only flow uses the session token from
   26|       |    // `~/.claude/.credentials.json` (or the OS keychain), not an env var.
   27|       |    // The OAuth-only guard in `build_claude_command` aborts the spawn if
   28|       |    // `ANTHROPIC_API_KEY` is set in the environment, but defence-in-depth
   29|       |    // also requires the variable to never reach the child process.
   30|       |    "CLAUDE_CONFIG_DIR",
   31|       |    "TMPDIR",
   32|       |    "TMP",
   33|       |    "TEMP",
   34|       |    "DYLD_FALLBACK_LIBRARY_PATH",
   35|       |];
   36|       |
   37|       |/// Windows-only environment variables.
   38|       |#[cfg(windows)]
   39|       |const ENV_WHITELIST_WINDOWS: &[&str] = &[
   40|       |    "LOCALAPPDATA",
   41|       |    "APPDATA",
   42|       |    "USERPROFILE",
   43|       |    "SystemRoot",
   44|       |    "COMSPEC",
   45|       |    "PATHEXT",
   46|       |    "HOMEPATH",
   47|       |    "HOMEDRIVE",
   48|       |];
   49|       |
   50|       |/// Default virtual memory limit for LLM subprocesses (4 GiB).
   51|       |#[cfg(target_os = "linux")]
   52|       |const DEFAULT_SUBPROCESS_MEMORY_LIMIT_MB: u64 = 4096;
   53|       |
   54|       |// G28-C (v1.0.69): process lifecycle. The G28 gap asks for
   55|       |// `tokio::process::Command::kill_on_drop(true)`. This codebase uses
   56|       |// `std::process::Command` (synchronous) so the tokio helper is not
   57|       |// available. Equivalent defence-in-depth is provided by:
   58|       |//
   59|       |// 1. `SIGTERM` via `libc::kill` in the timeout branch of `run_claude`
   60|       |//    and `run_codex` (graceful — gives the child a chance to clean up
   61|       |//    MCP children and write logs).
   62|       |// 2. `child.kill()` (SIGKILL) if SIGTERM was ignored.
   63|       |// 3. `reaper::scan_and_kill_orphans()` at startup, which walks `/proc`
   64|       |//    and reaps any `claude`/`codex` processes that were orphaned by a
   65|       |//    previous crash.
   66|       |//
   67|       |// SIGKILL on drop is intentionally NOT used because (a) the gaps.md
   68|       |// Passo C warning flags it as risky per tokio-rs/tokio#7082, and (b)
   69|       |// the SIGTERM-then-SIGKILL pair covers the same threat model with
   70|       |// better cleanup behaviour.
   71|       |
   72|       |/// Spawns a command with a virtual memory limit via `setrlimit(RLIMIT_AS)`.
   73|       |///
   74|       |/// On Linux, applies the limit in a `pre_exec` hook before the child process
   75|       |/// starts.  On non-Linux platforms, falls back to an unlimited spawn.
   76|       |/// The limit is read from `SQLITE_GRAPHRAG_SUBPROCESS_MEMORY_LIMIT_MB`
   77|       |/// (default: 4096 MiB).
   78|       |#[cfg(target_os = "linux")]
   79|       |pub fn spawn_with_memory_limit(cmd: &mut Command) -> std::io::Result<std::process::Child> {
   80|       |    use std::os::unix::process::CommandExt;
   81|       |    let max_mb: u64 = std::env::var("SQLITE_GRAPHRAG_SUBPROCESS_MEMORY_LIMIT_MB")
   82|       |        .ok()
   83|       |        .and_then(|v| v.parse().ok())
   84|       |        .unwrap_or(DEFAULT_SUBPROCESS_MEMORY_LIMIT_MB);
   85|       |    let max_bytes = max_mb * 1024 * 1024;
   86|       |    // SAFETY: pre_exec closure runs between fork() and exec() in the
   87|       |    // single-threaded child process — no other threads exist.
   88|       |    // libc::setsid and libc::setrlimit are async-signal-safe per POSIX.1-2008 §2.4.3.
   89|       |    // RLIMIT_AS limits virtual address space, not physical RSS.
   90|       |    // setsid failure with EPERM is tolerated (process already a session leader).
   91|       |    // On setrlimit failure, Err(last_os_error()) prevents exec.
   92|       |    unsafe {
   93|       |        cmd.pre_exec(move || {
   94|       |            let sid = libc::setsid();
   95|       |            if sid == -1 {
   96|       |                let err = std::io::Error::last_os_error();
   97|       |                if err.raw_os_error() != Some(libc::EPERM) {
   98|       |                    return Err(err);
   99|       |                }
  100|       |            }
  101|       |            let limit = libc::rlimit {
  102|       |                rlim_cur: max_bytes,
  103|       |                rlim_max: max_bytes,
  104|       |            };
  105|       |            if libc::setrlimit(libc::RLIMIT_AS, &limit) != 0 {
  106|       |                return Err(std::io::Error::last_os_error());
  107|       |            }
  108|       |            Ok(())
  109|       |        });
  110|       |    }
  111|       |    tracing::debug!(
  112|       |        target: "process",
  113|       |        program = ?cmd.get_program(),
  114|       |        args = ?cmd.get_args().collect::<Vec<_>>(),
  115|       |        "spawning external process"
  116|       |    );
  117|       |    cmd.spawn()
  118|       |}
  119|       |
  120|       |/// Spawns a command without memory limits (non-Linux fallback).
  121|       |/// On Unix (macOS, FreeBSD), applies setsid for process group isolation.
  122|       |#[cfg(not(target_os = "linux"))]
  123|      0|pub fn spawn_with_memory_limit(cmd: &mut Command) -> std::io::Result<std::process::Child> {
  124|       |    #[cfg(unix)]
  125|       |    {
  126|       |        use std::os::unix::process::CommandExt;
  127|       |        // SAFETY: setsid() is async-signal-safe per POSIX.1-2008 §2.4.3.
  128|       |        // Creates independent session for cascade termination.
  129|       |        unsafe {
  130|      0|            cmd.pre_exec(|| {
  131|      0|                let sid = libc::setsid();
  132|      0|                if sid == -1 {
  133|      0|                    let err = std::io::Error::last_os_error();
  134|      0|                    if err.raw_os_error() != Some(libc::EPERM) {
  135|      0|                        return Err(err);
  136|      0|                    }
  137|      0|                }
  138|      0|                Ok(())
  139|      0|            });
  140|       |        }
  141|       |    }
  142|      0|    tracing::debug!(
  143|       |        target: "process",
  144|      0|        program = ?cmd.get_program(),
  145|      0|        args = ?cmd.get_args().collect::<Vec<_>>(),
  146|      0|        "spawning external process"
  147|       |    );
  148|      0|    cmd.spawn()
  149|      0|}
  150|       |
  151|       |/// Parsed output element from `claude -p --output-format json`.
  152|       |#[derive(Debug, serde::Deserialize)]
  153|       |pub struct ClaudeOutputElement {
  154|       |    pub r#type: Option<String>,
  155|       |    pub subtype: Option<String>,
  156|       |    #[serde(default)]
  157|       |    pub is_error: bool,
  158|       |    pub structured_output: Option<serde_json::Value>,
  159|       |    pub result: Option<String>,
  160|       |    pub total_cost_usd: Option<f64>,
  161|       |    pub error: Option<String>,
  162|       |    pub terminal_reason: Option<String>,
  163|       |    #[serde(rename = "apiKeySource")]
  164|       |    pub api_key_source: Option<String>,
  165|       |}
  166|       |
  167|       |/// Result of a successful Claude invocation.
  168|       |#[derive(Debug)]
  169|       |pub struct ClaudeResult {
  170|       |    pub value: serde_json::Value,
  171|       |    pub cost_usd: f64,
  172|       |    pub is_oauth: bool,
  173|       |}
  174|       |
  175|       |/// Validates that the Claude binary meets the minimum version requirement.
  176|      0|pub fn validate_claude_version(binary: &Path) -> Result<String, AppError> {
  177|      0|    let resolved = which::which(binary).map_err(|_| {
  178|      0|        AppError::Validation(format!(
  179|      0|            "executable '{}' not found in PATH; ensure it is installed and accessible",
  180|      0|            binary.display()
  181|      0|        ))
  182|      0|    })?;
  183|      0|    let output = Command::new(&resolved)
  184|      0|        .arg("--version")
  185|      0|        .stdin(Stdio::null())
  186|      0|        .stdout(Stdio::piped())
  187|      0|        .stderr(Stdio::piped())
  188|      0|        .output()
  189|      0|        .map_err(AppError::Io)?;
  190|       |
  191|      0|    if !output.status.success() {
  192|      0|        return Err(AppError::Validation(
  193|      0|            "failed to run 'claude --version'".to_string(),
  194|      0|        ));
  195|      0|    }
  196|       |
  197|      0|    let version_str = String::from_utf8(output.stdout)
  198|      0|        .map_err(|_| AppError::Validation("claude --version output is not UTF-8".to_string()))?;
  199|      0|    let version = version_str.trim().to_string();
  200|      0|    let numeric = version.split([' ', '(']).next().unwrap_or("").trim();
  201|       |
  202|      0|    fn parse_semver(s: &str) -> Option<(u64, u64, u64)> {
  203|      0|        let parts: Vec<&str> = s.splitn(3, '.').collect();
  204|      0|        if parts.len() < 2 {
  205|      0|            return None;
  206|      0|        }
  207|      0|        let major = parts[0].parse::<u64>().ok()?;
  208|      0|        let minor = parts[1].parse::<u64>().ok()?;
  209|      0|        let patch = parts
  210|      0|            .get(2)
  211|      0|            .and_then(|p| p.parse::<u64>().ok())
  212|      0|            .unwrap_or(0);
  213|      0|        Some((major, minor, patch))
  214|      0|    }
  215|       |
  216|      0|    if let (Some(actual), Some(min)) = (parse_semver(numeric), parse_semver(MIN_CLAUDE_VERSION)) {
  217|      0|        if actual < min {
  218|      0|            return Err(AppError::Validation(format!(
  219|      0|                "Claude Code version {numeric} is below minimum required {MIN_CLAUDE_VERSION}"
  220|      0|            )));
  221|      0|        }
  222|      0|    }
  223|       |
  224|      0|    Ok(version)
  225|      0|}
  226|       |
  227|       |/// Builds a `Command` for `claude -p` with least-privilege environment.
  228|       |///
  229|       |/// G28-A (v1.0.68) + OAuth-only hardening (v1.0.69, mandated by gaps.md
  230|       |/// lines 41-49): the command ALWAYS uses the OAuth flow. The flag set
  231|       |/// is the canonical one documented in gaps.md Correção A:
  232|       |///
  233|       |/// ```text
  234|       |/// claude -p "TAREFA" \
  235|       |///   --strict-mcp-config \
  236|       |///   --mcp-config '{}' \
  237|       |///   --dangerously-skip-permissions \
  238|       |///   --settings '{"hooks":{}}' \
  239|       |///   --model <X> \
  240|       |///   --max-turns <N> \
  241|       |///   --output-format json \
  242|       |///   --no-session-persistence
  243|       |/// ```
  244|       |///
  245|       |/// The combination cuts the typical 8-10 MCP process tree to zero and
  246|       |/// disables user hooks. The reaper sweep at startup (see `reaper::scan_and_kill_orphans`)
  247|       |/// is the last line of defence for any process that ignored the flags.
  248|       |///
  249|       |/// **`--bare` is FORBIDDEN** (gaps.md:49 and operator policy):
  250|       |/// `--bare` cuts MCPs but disables OAuth and demands `ANTHROPIC_API_KEY`,
  251|       |/// which is PROHIBITED in this project. We also ABORT the spawn if
  252|       |/// `ANTHROPIC_API_KEY` is set in the environment, because that is the
  253|       |/// gateway to the prohibited API-key path.
  254|       |///
  255|       |/// GitHub issue [anthropics/claude-code#10787] documents that earlier
  256|       |/// Claude Code CLI builds sometimes ignored `--strict-mcp-config` and
  257|       |/// fell back to `~/.mcp.json`. We still pass the flags as defence-in-depth
  258|       |/// and ALSO honour `SQLITE_GRAPHRAG_CLAUDE_EMPTY_CONFIG_DIR` so users
  259|       |/// who need belt-and-suspenders isolation can point Claude at an empty
  260|       |/// config directory (no MCP, no hooks, no settings).
  261|       |///
  262|       |/// [anthropics/claude-code#10787]: https://github.com/anthropics/claude-code/issues/10787
  263|      2|pub fn build_claude_command(
  264|      2|    binary: &Path,
  265|      2|    prompt: &str,
  266|      2|    json_schema: &str,
  267|      2|    model: Option<&str>,
  268|      2|    max_turns: u32,
  269|      2|) -> Command {
  270|       |    // OAuth-only guard (gaps.md:47). If `ANTHROPIC_API_KEY` is set in the
  271|       |    // environment we MUST abort — that is the API-key path which is
  272|       |    // explicitly PROHIBITED. Use the OAuth flow exclusively.
  273|      2|    if let Ok(_key) = std::env::var("ANTHROPIC_API_KEY") {
                            ^1
  274|       |        // Return a command that will fail loudly at spawn time. We
  275|       |        // intentionally do NOT pass `--bare` (PROHIBITED) and we do NOT
  276|       |        // allow the API-key path at all.
  277|      1|        let mut cmd = Command::new("false");
  278|      1|        cmd.env_clear();
  279|      1|        cmd.env("PATH", "/nonexistent");
  280|      1|        cmd.arg("--oauth-only-violation-anthropic-api-key-set");
  281|      1|        return cmd;
  282|      1|    }
  283|       |
  284|      1|    let mut cmd = Command::new(binary);
  285|       |
  286|      1|    cmd.env_clear();
  287|     15|    for var in ENV_WHITELIST {
                      ^14
  288|     14|        if let Ok(val) = std::env::var(var) {
                                ^10
  289|     10|            cmd.env(var, val);
  290|     10|        }
                      ^4
  291|       |    }
  292|       |
  293|       |    #[cfg(windows)]
  294|       |    for var in ENV_WHITELIST_WINDOWS {
  295|       |        if let Ok(val) = std::env::var(var) {
  296|       |            cmd.env(var, val);
  297|       |        }
  298|       |    }
  299|       |
  300|       |    // G28-A: if the user has pointed us at an empty config dir, force Claude
  301|       |    // Code to use it (which suppresses user-scoped MCP servers and hooks).
  302|      1|    if let Ok(empty_dir) = std::env::var("SQLITE_GRAPHRAG_CLAUDE_EMPTY_CONFIG_DIR") {
                            ^0
  303|      0|        if std::path::Path::new(&empty_dir).is_dir() {
  304|      0|            cmd.env("CLAUDE_CONFIG_DIR", &empty_dir);
  305|      0|            tracing::debug!(
  306|       |                target: "claude_runner",
  307|      0|                "isolating claude subprocess to CLAUDE_CONFIG_DIR={}",
  308|       |                empty_dir
  309|       |            );
  310|       |        } else {
  311|      0|            tracing::warn!(
  312|       |                target: "claude_runner",
  313|       |                path = %empty_dir,
  314|      0|                "SQLITE_GRAPHRAG_CLAUDE_EMPTY_CONFIG_DIR is set but path is not a directory; \
  315|      0|                 ignoring.  MCP isolation will NOT be applied."
  316|       |            );
  317|       |        }
  318|      1|    }
  319|       |
  320|       |    // Canonical OAuth-only command line (gaps.md:201-208). Every flag is
  321|       |    // mandatory; do NOT pass `--bare` (PROHIBITED, gaps.md:49).
  322|      1|    cmd.arg("-p")
  323|      1|        .arg(prompt)
  324|      1|        .arg("--strict-mcp-config")
  325|      1|        .arg("--mcp-config")
  326|      1|        .arg("{}")
  327|      1|        .arg("--dangerously-skip-permissions")
  328|      1|        .arg("--settings")
  329|      1|        .arg(r#"{"hooks":{}}"#)
  330|      1|        .arg("--output-format")
  331|      1|        .arg("json")
  332|      1|        .arg("--json-schema")
  333|      1|        .arg(json_schema)
  334|      1|        .arg("--max-turns")
  335|      1|        .arg(max_turns.to_string())
  336|      1|        .arg("--no-session-persistence");
  337|       |
  338|      1|    if let Some(m) = model {
  339|      1|        cmd.arg("--model").arg(m);
  340|      1|    }
                  ^0
  341|       |
  342|      1|    cmd.stdin(Stdio::null())
  343|      1|        .stdout(Stdio::piped())
  344|      1|        .stderr(Stdio::piped());
  345|       |
  346|      1|    cmd
  347|      2|}
  348|       |
  349|       |/// Parses `claude -p --output-format json` output array.
  350|       |///
  351|       |/// G03: detects `terminal_reason: "max_turns"` and returns a specific error
  352|       |/// instead of a generic failure message.
  353|      7|pub fn parse_claude_output(stdout: &str) -> Result<ClaudeResult, AppError> {
  354|      7|    let elements: Vec<ClaudeOutputElement> = serde_json::from_str(stdout).map_err(|e| {
                                                                                                    ^0
  355|      0|        AppError::Validation(format!("failed to parse claude output as JSON array: {e}"))
  356|      0|    })?;
  357|       |
  358|      7|    let is_oauth = elements
  359|      7|        .iter()
  360|      7|        .find(|e| e.r#type.as_deref() == Some("system") && e.subtype.as_deref() == Some("init"))
                                                                         ^6
  361|      7|        .and_then(|e| e.api_key_source.as_deref())
                                    ^6               ^6
  362|      7|        .map(|s| s == "none")
                               ^3   ^3
  363|      7|        .unwrap_or(false);
  364|       |
  365|      7|    let result_elem = elements
  366|      7|        .iter()
  367|     13|        .find(|e| e.r#type.as_deref() == Some("result"))
                       ^7
  368|      7|        .ok_or_else(|| {
                                     ^0
  369|      0|            AppError::Validation("claude output missing 'result' element".to_string())
  370|      0|        })?;
  371|       |
  372|       |    // G03: detect max_turns exhaustion before checking is_error
  373|      7|    if result_elem.terminal_reason.as_deref() == Some("max_turns") {
  374|      1|        tracing::warn!(
  375|       |            target: "claude_runner",
  376|      0|            "claude -p hit max_turns limit — hooks may have consumed turns"
  377|       |        );
  378|      1|        return Err(AppError::Validation(
  379|      1|            "claude -p hit max_turns: hooks may be consuming turns; increase --max-turns or disable hooks".to_string(),
  380|      1|        ));
  381|      6|    }
  382|       |
  383|      6|    if result_elem.is_error {
  384|      3|        let err_msg = result_elem
  385|      3|            .error
  386|      3|            .as_deref()
  387|      3|            .or(result_elem.result.as_deref())
  388|      3|            .unwrap_or("unknown error");
  389|      3|        if err_msg.contains("rate_limit") || err_msg.contains("overloaded") {
                                                           ^1      ^1
  390|      2|            return Err(AppError::RateLimited {
  391|      2|                detail: err_msg.to_string(),
  392|      2|            });
  393|      1|        }
  394|      1|        if err_msg.contains("Not logged in") || err_msg.contains("authentication") {
  395|      1|            tracing::warn!(
  396|       |                target: "claude_runner",
  397|      0|                "Claude Code authentication failed. Re-authenticate interactively with: claude"
  398|       |            );
  399|      0|        }
  400|      1|        return Err(AppError::Validation(format!(
  401|      1|            "claude extraction failed: {err_msg}"
  402|      1|        )));
  403|      3|    }
  404|       |
  405|      3|    let value = if let Some(v) = result_elem.structured_output.clone() {
  406|      3|        v
  407|      0|    } else if let Some(text) = &result_elem.result {
  408|      0|        serde_json::from_str(text).map_err(|e| {
  409|      0|            AppError::Validation(format!("failed to parse claude result field as JSON: {e}"))
  410|      0|        })?
  411|       |    } else {
  412|      0|        return Err(AppError::Validation(
  413|      0|            "claude result missing structured_output and result field".into(),
  414|      0|        ));
  415|       |    };
  416|       |
  417|      3|    let cost = result_elem.total_cost_usd.unwrap_or(0.0);
  418|      3|    Ok(ClaudeResult {
  419|      3|        value,
  420|      3|        cost_usd: cost,
  421|      3|        is_oauth,
  422|      3|    })
  423|      7|}
  424|       |
  425|       |/// Calls `claude -p` with prompt and schema, waits with timeout, and parses output.
  426|       |///
  427|       |/// G03: parses stdout even on non-zero exit to detect `terminal_reason: "max_turns"`.
  428|       |/// G28-C (v1.0.69): the child is killed explicitly on timeout to avoid
  429|       |/// leaving a `claude -p` zombie with its MCP children behind.
  430|      0|pub fn run_claude(
  431|      0|    binary: &Path,
  432|      0|    prompt: &str,
  433|      0|    json_schema: &str,
  434|      0|    input_text: &str,
  435|      0|    model: Option<&str>,
  436|      0|    timeout_secs: u64,
  437|      0|    max_turns: u32,
  438|      0|) -> Result<ClaudeResult, AppError> {
  439|       |    use wait_timeout::ChildExt;
  440|       |
  441|      0|    let full_prompt = format!("{prompt}\n\n{input_text}");
  442|      0|    let mut cmd = build_claude_command(binary, &full_prompt, json_schema, model, max_turns);
  443|       |
  444|      0|    let mut child = spawn_with_memory_limit(&mut cmd).map_err(|e| {
  445|      0|        AppError::Io(std::io::Error::new(
  446|      0|            e.kind(),
  447|      0|            format!("failed to spawn claude: {e}"),
  448|      0|        ))
  449|      0|    })?;
  450|       |
  451|      0|    let start = std::time::Instant::now();
  452|      0|    let timeout = std::time::Duration::from_secs(timeout_secs);
  453|      0|    let status = child.wait_timeout(timeout).map_err(AppError::Io)?;
  454|       |
  455|      0|    if status.is_none() {
  456|      0|        // G28-C: timeout hit — send SIGTERM to the child so the MCP
  457|      0|        // children it spawned (and their npm/node tree) are also
  458|      0|        // reaped. SIGTERM gives the child a chance to clean up; the
  459|      0|        // reaper sweep in main.rs is the last line of defence for
  460|      0|        // anything that ignored it.
  461|      0|        #[cfg(unix)]
  462|      0|        unsafe {
  463|      0|            libc::kill(child.id() as i32, libc::SIGTERM);
  464|      0|        }
  465|      0|        let _ = child.kill();
  466|      0|        let _ = child.wait();
  467|      0|    }
  468|       |
  469|      0|    match status {
  470|      0|        Some(exit_status) => {
  471|      0|            tracing::debug!(
  472|       |                target: "process",
  473|      0|                exit_code = ?exit_status.code(),
  474|      0|                elapsed_ms = start.elapsed().as_millis() as u64,
  475|      0|                "external process completed"
  476|       |            );
  477|       |
  478|      0|            let mut stdout_buf = Vec::new();
  479|      0|            let mut stderr_buf = Vec::new();
  480|      0|            if let Some(mut out) = child.stdout.take() {
  481|      0|                std::io::Read::read_to_end(&mut out, &mut stdout_buf).map_err(AppError::Io)?;
  482|      0|            }
  483|      0|            if let Some(mut err) = child.stderr.take() {
  484|      0|                std::io::Read::read_to_end(&mut err, &mut stderr_buf).map_err(AppError::Io)?;
  485|      0|            }
  486|       |
  487|      0|            let stdout_str = String::from_utf8(stdout_buf)
  488|      0|                .map_err(|_| AppError::Validation("claude -p stdout is not valid UTF-8".into()))?;
  489|       |
  490|       |            // G03: parse stdout even on failure to detect terminal_reason
  491|      0|            if !exit_status.success() {
  492|      0|                if let Ok(result) = parse_claude_output(&stdout_str) {
  493|      0|                    return Ok(result);
  494|      0|                }
  495|      0|                let stderr_str = String::from_utf8_lossy(&stderr_buf);
  496|      0|                if stderr_str.contains("auth") || stderr_str.contains("login") {
  497|      0|                    tracing::warn!(
  498|       |                        target: "claude_runner",
  499|      0|                        "Claude Code authentication may have failed. Re-authenticate with: claude"
  500|       |                    );
  501|      0|                }
  502|      0|                return Err(AppError::Validation(format!(
  503|      0|                    "claude -p exited with code {:?}: {}",
  504|      0|                    exit_status.code(),
  505|      0|                    stderr_str.trim()
  506|      0|                )));
  507|      0|            }
  508|       |
  509|      0|            parse_claude_output(&stdout_str)
  510|       |        }
  511|       |        None => {
  512|      0|            tracing::warn!(target: "claude_runner", timeout_secs, "claude -p timed out, terminating");
  513|      0|            terminate_gracefully(&mut child, 3);
  514|      0|            Err(AppError::Validation(format!(
  515|      0|                "claude -p timed out after {timeout_secs} seconds"
  516|      0|            )))
  517|       |        }
  518|       |    }
  519|      0|}
  520|       |
  521|       |/// Terminates a child process gracefully: SIGTERM first, SIGKILL after grace period.
  522|       |#[cfg(unix)]
  523|      0|pub fn terminate_gracefully(child: &mut std::process::Child, grace_secs: u64) {
  524|       |    use wait_timeout::ChildExt;
  525|      0|    unsafe {
  526|      0|        libc::kill(child.id() as i32, libc::SIGTERM);
  527|      0|    }
  528|      0|    match child.wait_timeout(std::time::Duration::from_secs(grace_secs)) {
  529|      0|        Ok(Some(_)) => {}
  530|       |        _ => {
  531|      0|            tracing::warn!(target: "process", pid = child.id(), "child ignored SIGTERM, sending SIGKILL");
  532|      0|            let _ = child.kill();
  533|      0|            let _ = child.wait();
  534|       |        }
  535|       |    }
  536|      0|}
  537|       |
  538|       |/// Non-Unix fallback: kill immediately (Windows TerminateProcess).
  539|       |#[cfg(not(unix))]
  540|       |pub fn terminate_gracefully(child: &mut std::process::Child, _grace_secs: u64) {
  541|       |    let _ = child.kill();
  542|       |    let _ = child.wait();
  543|       |}
  544|       |
  545|       |#[cfg(test)]
  546|       |mod tests {
  547|       |    use super::*;
  548|       |
  549|       |    #[test]
  550|      1|    fn parse_output_detects_max_turns() {
  551|      1|        let stdout = r#"[{"type":"system","subtype":"init","apiKeySource":"none"},{"type":"result","is_error":false,"terminal_reason":"max_turns","structured_output":{"name":"t"}}]"#;
  552|      1|        let err = parse_claude_output(stdout).unwrap_err();
  553|      1|        assert!(
  554|      1|            format!("{err}").contains("max_turns"),
  555|      0|            "must detect max_turns in output"
  556|       |        );
  557|      1|    }
  558|       |
  559|       |    #[test]
  560|      1|    fn parse_output_extracts_structured_value() {
  561|      1|        let stdout = r#"[{"type":"system","subtype":"init","apiKeySource":"none"},{"type":"result","is_error":false,"structured_output":{"key":"val"},"total_cost_usd":0.01}]"#;
  562|      1|        let result = parse_claude_output(stdout).unwrap();
  563|      1|        assert_eq!(result.value["key"], "val");
  564|      1|        assert!((result.cost_usd - 0.01).abs() < f64::EPSILON);
  565|      1|        assert!(result.is_oauth);
  566|      1|    }
  567|       |
  568|       |    #[test]
  569|      1|    fn parse_output_detects_rate_limit() {
  570|      1|        let stdout = r#"[{"type":"result","is_error":true,"error":"rate_limit exceeded"}]"#;
  571|      1|        let err = parse_claude_output(stdout).unwrap_err();
  572|      1|        assert!(
  573|      1|            matches!(err, AppError::RateLimited { .. }),
                          ^0
  574|      0|            "expected AppError::RateLimited, got: {err}"
  575|       |        );
  576|      1|    }
  577|       |
  578|       |    /// OAuth-only conformance test (gaps.md:41-49, v1.0.69 mandate).
  579|       |    /// Verifies that `build_claude_command` always emits the canonical
  580|       |    /// flag set and NEVER emits `--bare` or any API-key path.
  581|       |    #[test]
  582|       |    #[serial_test::serial(env)]
  583|      1|    fn build_command_oauth_only_mandatory_flags() {
  584|       |        // SAFETY: this is a unit test, no concurrent env mutation
  585|      1|        unsafe {
  586|      1|            std::env::remove_var("ANTHROPIC_API_KEY");
  587|      1|        }
  588|      1|        let cmd = build_claude_command(
  589|      1|            std::path::Path::new("/usr/bin/false"),
  590|      1|            "test prompt",
  591|      1|            "{}",
  592|      1|            Some("sonnet"),
  593|       |            4,
  594|       |        );
  595|     17|        let args: Vec<&str> = cmd.get_args().filter_map(|a| a.to_str()).collect();
                          ^1    ^1          ^1  ^1         ^1                         ^1
  596|       |        // Mandatory OAuth-only flags from gaps.md lines 201-208
  597|      1|        assert!(args.contains(&"-p"), "must have -p");
                                                    ^0
  598|      1|        assert!(
  599|      1|            args.contains(&"--strict-mcp-config"),
  600|      0|            "must have --strict-mcp-config (gaps.md:206)"
  601|       |        );
  602|      1|        assert!(
  603|      1|            args.contains(&"--mcp-config"),
  604|      0|            "must have --mcp-config (gaps.md:207)"
  605|       |        );
  606|      1|        assert!(
  607|      1|            args.contains(&"--dangerously-skip-permissions"),
  608|      0|            "must have --dangerously-skip-permissions (gaps.md:208)"
  609|       |        );
  610|      1|        assert!(
  611|      1|            args.contains(&"--settings"),
  612|      0|            "must have --settings (gaps.md:209)"
  613|       |        );
  614|      1|        assert!(
  615|      1|            args.contains(&"--output-format"),
  616|      0|            "must have --output-format json (gaps.md:213)"
  617|       |        );
  618|      1|        assert!(args.contains(&"--json-schema"), "must have --json-schema");
                                                               ^0
  619|      1|        assert!(
  620|      1|            args.contains(&"--max-turns"),
  621|      0|            "must have --max-turns (gaps.md:212)"
  622|       |        );
  623|      1|        assert!(
  624|      1|            args.contains(&"--no-session-persistence"),
  625|      0|            "must have --no-session-persistence"
  626|       |        );
  627|      1|        assert!(
  628|      1|            args.contains(&"--model"),
  629|      0|            "must have --model when model is Some"
  630|       |        );
  631|       |        // PROHIBITED flags (gaps.md:49)
  632|      1|        assert!(
  633|      1|            !args.contains(&"--bare"),
  634|      0|            "--bare is PROHIBITED (gaps.md:49)"
  635|       |        );
  636|       |    }
  637|       |
  638|       |    /// OAuth-only guard: when `ANTHROPIC_API_KEY` is in the environment,
  639|       |    /// `build_claude_command` MUST abort the spawn (return a `false`
  640|       |    /// command), NOT silently fall back to the API-key path.
  641|       |    #[test]
  642|       |    #[serial_test::serial(env)]
  643|      1|    fn build_command_aborts_when_anthropic_api_key_set() {
  644|       |        // SAFETY: unit test
  645|      1|        unsafe {
  646|      1|            std::env::set_var("ANTHROPIC_API_KEY", "sk-test-violation");
  647|      1|        }
  648|      1|        let cmd = build_claude_command(
  649|      1|            std::path::Path::new("/usr/bin/claude"),
  650|      1|            "test prompt",
  651|      1|            "{}",
  652|      1|            Some("sonnet"),
  653|       |            4,
  654|       |        );
  655|      1|        let program = cmd.get_program().to_string_lossy().to_string();
  656|      1|        let args: Vec<&str> = cmd.get_args().filter_map(|a| a.to_str()).collect();
  657|      1|        assert_eq!(
  658|       |            program, "false",
  659|      0|            "when ANTHROPIC_API_KEY is set, build_claude_command must abort"
  660|       |        );
  661|      1|        assert!(
  662|      1|            args.contains(&"--oauth-only-violation-anthropic-api-key-set"),
  663|      0|            "aborted command must carry violation marker"
  664|       |        );
  665|      1|        unsafe {
  666|      1|            std::env::remove_var("ANTHROPIC_API_KEY");
  667|      1|        }
  668|       |    }
  669|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/cleanup_orphans.rs:
    1|       |//! Handler for the `cleanup-orphans` CLI subcommand.
    2|       |
    3|       |use crate::errors::AppError;
    4|       |use crate::output::{self, OutputFormat};
    5|       |use crate::paths::AppPaths;
    6|       |use crate::storage::connection::open_rw;
    7|       |use crate::storage::entities;
    8|       |use serde::Serialize;
    9|       |
// Command-line arguments of the `cleanup-orphans` subcommand.
//
// NOTE: plain `//` comments are used here on purpose. With clap's derive API,
// `///` doc comments on the struct or its fields become about/help text, which
// would change the CLI's visible output.
#[derive(clap::Args)]
#[command(after_long_help = "EXAMPLES:\n  \
    # Remove orphan entities (no memories, no relationships) from the global namespace\n  \
    sqlite-graphrag cleanup-orphans\n\n  \
    # Preview which entities would be removed without deleting\n  \
    sqlite-graphrag cleanup-orphans --dry-run\n\n  \
    # Cleanup within a specific namespace\n  \
    sqlite-graphrag cleanup-orphans --namespace my-project --yes")]
pub struct CleanupOrphansArgs {
    // Optional namespace filter handed to the orphan lookup; rendered as
    // "<all>" in text output when absent.
    #[arg(long)]
    pub namespace: Option<String>,
    // When set, orphans are only counted: nothing is deleted and `deleted` stays 0.
    #[arg(long)]
    pub dry_run: bool,
    // Only suppresses the progress notice printed before deletion; deletion
    // itself happens with or without this flag.
    #[arg(long)]
    pub yes: bool,
    // Output format; Text and Markdown share the same one-line rendering.
    #[arg(long, value_enum, default_value = "json")]
    pub format: OutputFormat,
    // Hidden no-op flag; the handler never reads it.
    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
    pub json: bool,
    // Optional database path override, also read from SQLITE_GRAPHRAG_DB_PATH;
    // passed to `AppPaths::resolve`.
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,
}
   32|       |
/// Outcome of one `cleanup-orphans` run; serialised as the JSON response and
/// also used to render the text/markdown summary line.
#[derive(Serialize)]
struct CleanupResponse {
    /// Number of orphan entity ids returned by the lookup.
    orphan_count: usize,
    /// Number of entities reported as removed; always 0 for a dry run.
    deleted: usize,
    /// Echo of the `--dry-run` flag.
    dry_run: bool,
    /// Namespace filter that was supplied, if any; `None` serialises as `null`.
    namespace: Option<String>,
    /// Total execution time in milliseconds from handler start to serialisation.
    elapsed_ms: u64,
}
   42|       |
   43|      0|pub fn run(args: CleanupOrphansArgs) -> Result<(), AppError> {
   44|      0|    let inicio = std::time::Instant::now();
   45|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
   46|       |
   47|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
   48|       |
   49|      0|    let mut conn = open_rw(&paths.db)?;
   50|       |
   51|      0|    let orphan_ids = entities::find_orphan_entity_ids(&conn, args.namespace.as_deref())?;
   52|      0|    let orphan_count = orphan_ids.len();
   53|       |
   54|      0|    let deleted = if args.dry_run {
   55|      0|        0
   56|       |    } else {
   57|      0|        if orphan_count > 0 && !args.yes {
   58|      0|            output::emit_progress(&format!(
   59|      0|                "removing {orphan_count} orphan entities (use --yes to skip this notice)"
   60|      0|            ));
   61|      0|        }
   62|      0|        let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
   63|      0|        let removed = entities::delete_entities_by_ids(&tx, &orphan_ids)?;
   64|      0|        tx.commit()?;
   65|      0|        conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
   66|      0|        removed
   67|       |    };
   68|       |
   69|      0|    let response = CleanupResponse {
   70|      0|        orphan_count,
   71|      0|        deleted,
   72|      0|        dry_run: args.dry_run,
   73|      0|        namespace: args.namespace.clone(),
   74|      0|        elapsed_ms: inicio.elapsed().as_millis() as u64,
   75|      0|    };
   76|       |
   77|      0|    match args.format {
   78|      0|        OutputFormat::Json => output::emit_json(&response)?,
   79|      0|        OutputFormat::Text | OutputFormat::Markdown => {
   80|      0|            let ns = response.namespace.as_deref().unwrap_or("<all>");
   81|      0|            output::emit_text(&format!(
   82|      0|                "orphans: {} found, {} deleted (dry_run={}) [{}]",
   83|      0|                response.orphan_count, response.deleted, response.dry_run, ns
   84|      0|            ));
   85|      0|        }
   86|       |    }
   87|       |
   88|      0|    Ok(())
   89|      0|}
   90|       |
   91|       |#[cfg(test)]
   92|       |mod tests {
   93|       |    use super::*;
   94|       |
   95|       |    #[test]
   96|      1|    fn cleanup_response_serializes_dry_run_true() {
   97|      1|        let resp = CleanupResponse {
   98|      1|            orphan_count: 5,
   99|      1|            deleted: 0,
  100|      1|            dry_run: true,
  101|      1|            namespace: Some("global".to_string()),
  102|      1|            elapsed_ms: 12,
  103|      1|        };
  104|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  105|      1|        assert_eq!(json["orphan_count"], 5);
  106|      1|        assert_eq!(json["deleted"], 0);
  107|      1|        assert_eq!(json["dry_run"], true);
  108|      1|        assert_eq!(json["namespace"], "global");
  109|      1|        assert!(json["elapsed_ms"].is_number());
  110|      1|    }
  111|       |
  112|       |    #[test]
  113|      1|    fn cleanup_response_deleted_zero_when_dry_run() {
  114|      1|        let resp = CleanupResponse {
  115|      1|            orphan_count: 10,
  116|      1|            deleted: 0,
  117|      1|            dry_run: true,
  118|      1|            namespace: None,
  119|      1|            elapsed_ms: 5,
  120|      1|        };
  121|      1|        assert_eq!(resp.deleted, 0, "dry_run must keep deleted at 0");
                                                  ^0
  122|      1|        assert_eq!(resp.orphan_count, 10);
  123|      1|    }
  124|       |
  125|       |    #[test]
  126|      1|    fn cleanup_response_namespace_none_serializes_null() {
  127|      1|        let resp = CleanupResponse {
  128|      1|            orphan_count: 0,
  129|      1|            deleted: 0,
  130|      1|            dry_run: false,
  131|      1|            namespace: None,
  132|      1|            elapsed_ms: 1,
  133|      1|        };
  134|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  135|      1|        assert!(
  136|      1|            json["namespace"].is_null(),
  137|      0|            "namespace None must serialize as null"
  138|       |        );
  139|      1|    }
  140|       |
  141|       |    #[test]
  142|      1|    fn cleanup_response_deleted_equals_orphan_count_when_executed() {
  143|      1|        let resp = CleanupResponse {
  144|      1|            orphan_count: 3,
  145|      1|            deleted: 3,
  146|      1|            dry_run: false,
  147|      1|            namespace: Some("projeto".to_string()),
  148|      1|            elapsed_ms: 20,
  149|      1|        };
  150|      1|        assert_eq!(
  151|       |            resp.deleted, resp.orphan_count,
  152|      0|            "when running without dry_run, deleted must equal orphan_count"
  153|       |        );
  154|      1|    }
  155|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/codex_spawn.rs:
    1|       |//! Codex CLI spawn + JSONL parsing helper shared by `enrich` and `ingest --mode codex`.
    2|       |//!
    3|       |//! G31 (v1.0.69): `enrich --mode codex` was missing five critical hardening
    4|       |//! flags compared to `ingest --mode codex`. This module extracts the
    5|       |//! spawn pipeline into a single helper that BOTH call-sites consume,
    6|       |//! guaranteeing the same defaults everywhere.
    7|       |//!
    8|       |//! G32 (v1.0.69): `enrich --mode codex` used `serde_json::from_str` on the
    9|       |//! raw stdout, but `codex exec --json` emits JSONL (one event per line).
   10|       |//! [`parse_codex_jsonl`] iterates line-by-line, picking the last
   11|       |//! `item.completed` of type `agent_message` as the assistant text.
   12|       |//!
   13|       |//! G33 (v1.0.69): validate the model against the ChatGPT Pro OAuth whitelist
   14|       |//! stored in `~/.codex/models_cache.json` BEFORE spawning the subprocess.
   15|       |
   16|       |use crate::errors::AppError;
   17|       |use crate::extraction::{ExtractedUrl, ExtractionResult};
   18|       |use crate::storage::entities::{NewEntity, NewRelationship};
   19|       |use serde::{Deserialize, Serialize};
   20|       |use std::path::{Path, PathBuf};
   21|       |use std::process::{Command, Stdio};
   22|       |
/// Token usage reported by Codex on `turn.completed` events.
///
/// Every counter is marked `#[serde(default)]`, so a counter missing from the
/// payload deserialises to 0 instead of failing the whole record.
#[derive(Debug, Clone, Default, Deserialize, Serialize)]
pub struct CodexUsage {
    /// `input_tokens` counter from the usage payload.
    #[serde(default)]
    pub input_tokens: u64,
    /// `cached_input_tokens` counter from the usage payload.
    #[serde(default)]
    pub cached_input_tokens: u64,
    /// `output_tokens` counter from the usage payload.
    #[serde(default)]
    pub output_tokens: u64,
    /// `reasoning_output_tokens` counter from the usage payload.
    #[serde(default)]
    pub reasoning_output_tokens: u64,
}
   35|       |
/// Combined result of one `codex exec` invocation.
///
/// NOTE(review): the code that populates this struct is not shown next to the
/// definition; the field notes below are inferred from names and types and
/// should be confirmed against the JSONL parser.
#[derive(Debug)]
pub struct CodexResult {
    /// Extraction payload produced by the run.
    pub extraction: ExtractionResult,
    /// Token usage, when one was captured (presumably from `turn.completed`).
    pub usage: Option<CodexUsage>,
    /// Presumably set when the run reported a rate limit — TODO confirm.
    pub rate_limited: bool,
    /// Presumably set when the run reported a schema problem — TODO confirm.
    pub schema_error: bool,
    /// Presumably set when a turn-failure event was seen — TODO confirm.
    pub turn_failed: bool,
    /// Presumably the message attached to the failure, empty otherwise — TODO confirm.
    pub failed_message: String,
}
   46|       |
/// Configuration for the codex spawner.
///
/// All string and path inputs are borrowed for `'a`; only `schema_path` is owned.
// NOTE(review): the reason for allowing `rustdoc::broken_intra_doc_links` is
// not recorded here; presumably for the [`trusted_schema_path`] link below — confirm.
#[allow(rustdoc::broken_intra_doc_links)]
pub struct CodexSpawnArgs<'a> {
    /// Path of the codex executable to spawn.
    pub binary: &'a Path,
    /// Instruction prompt for the run.
    pub prompt: &'a str,
    /// JSON schema text (presumably written to `schema_path` before the spawn — confirm).
    pub json_schema: &'a str,
    /// Input text the prompt is applied to.
    pub input_text: &'a str,
    /// Optional model override; `None` leaves the choice to codex.
    pub model: Option<&'a str>,
    /// Timeout for the subprocess, in seconds.
    pub timeout_secs: u64,
    /// Caller-provided schema path (must be inside a trusted directory
    /// that codex recognises as sandbox-safe). Use [`trusted_schema_path`]
    /// to compute one under the cache dir.
    pub schema_path: PathBuf,
}
   61|       |
   62|       |/// Computes a schema path under the cache dir so `codex exec` accepts it
   63|       |/// as part of a trusted directory (rejects `/tmp` on hardened installs).
   64|      0|pub fn trusted_schema_path() -> Result<PathBuf, AppError> {
   65|      0|    let cache = crate::paths::AppPaths::resolve(None)
   66|      0|        .map(|p| p.models.parent().map(|m| m.to_path_buf()))
   67|      0|        .ok()
   68|      0|        .flatten()
   69|      0|        .unwrap_or_else(std::env::temp_dir);
   70|      0|    std::fs::create_dir_all(&cache).map_err(AppError::Io)?;
   71|      0|    Ok(cache.join(format!("enrich-schema-{}.json", std::process::id())))
   72|      0|}
   73|       |
/// Models accepted by Codex CLI when using ChatGPT Pro OAuth.
///
/// Mirrored from `~/.codex/models_cache.json` (which the official CLI
/// refreshes on every login). This list is intentionally narrow; passing
/// a model not in this set with `--mode codex` returns
/// `AppError::Validation` BEFORE any OAuth turn is spent.
///
/// Entries are compared exactly (case-sensitive) by [`validate_codex_model`].
/// This is a static snapshot, so it has to be updated by hand when the
/// upstream model list changes.
pub const CODEX_PRO_OAUTH_MODELS: &[&str] = &[
    "codex-auto-review",
    "gpt-5.3-codex-spark",
    "gpt-5.4",
    "gpt-5.4-mini",
    "gpt-5.5",
];
   87|       |
   88|       |/// Validates the requested model against [`CODEX_PRO_OAUTH_MODELS`].
   89|       |///
   90|       |/// # Errors
   91|       |/// Returns [`AppError::Validation`] listing the accepted models when the
   92|       |/// caller supplied a model outside the whitelist.
   93|      4|pub fn validate_codex_model(model: Option<&str>) -> Result<(), AppError> {
   94|      4|    let Some(m) = model else {
                           ^3
   95|      1|        return Ok(()); // no override; codex picks its default
   96|       |    };
   97|      3|    if CODEX_PRO_OAUTH_MODELS.contains(&m) {
   98|      2|        Ok(())
   99|       |    } else {
  100|      1|        Err(AppError::Validation(format!(
  101|      1|            "--codex-model {m:?} is not supported with ChatGPT Pro OAuth. \
  102|      1|             Accepted: {}",
  103|      1|            CODEX_PRO_OAUTH_MODELS.join(", ")
  104|      1|        )))
  105|       |    }
  106|      4|}
  107|       |
  108|       |/// Returns the list of models accepted by Codex with ChatGPT Pro OAuth.
  109|       |///
  110|       |/// Tries to read `~/.codex/models_cache.json` (which the official CLI
  111|       |/// refreshes on every login) and falls back to the static
  112|       |/// [`CODEX_PRO_OAUTH_MODELS`] constant when the file is missing or
  113|       |/// malformed. The returned `Vec<String>` is the union of both sources,
  114|       |/// de-duplicated.
  115|      5|pub fn list_codex_models() -> Vec<String> {
  116|       |    use std::collections::BTreeSet;
  117|      5|    let mut out: BTreeSet<String> = CODEX_PRO_OAUTH_MODELS
  118|      5|        .iter()
  119|     25|        .map(|s| s.to_string())
                       ^5
  120|      5|        .collect();
  121|       |
  122|      5|    if let Some(home) = std::env::var_os("HOME") {
  123|      5|        let path = std::path::Path::new(&home)
  124|      5|            .join(".codex")
  125|      5|            .join("models_cache.json");
  126|      5|        if let Ok(content) = std::fs::read_to_string(&path) {
  127|       |            // The file is a JSON object whose keys are model ids.
  128|       |            // Use serde_json::Value to traverse safely without depending
  129|       |            // on a precise schema.
  130|      5|            if let Ok(value) = serde_json::from_str::<serde_json::Value>(&content) {
  131|      5|                if let Some(obj) = value.as_object() {
  132|     20|                    for key in obj.keys() {
                                             ^5  ^5
  133|     20|                        out.insert(key.clone());
  134|     20|                    }
  135|      0|                } else if let Some(arr) = value.as_array() {
  136|      0|                    for v in arr {
  137|      0|                        if let Some(s) = v.as_str() {
  138|      0|                            out.insert(s.to_string());
  139|      0|                        }
  140|       |                    }
  141|      0|                }
  142|      0|            }
  143|      0|        }
  144|      0|    }
  145|      5|    out.into_iter().collect()
  146|      5|}
  147|       |
  148|       |/// Suggests the closest codex OAuth model to a user-supplied substring
  149|       |/// (G33). Returns `None` when no candidate is close enough.
  150|       |///
  151|       |/// Match strategy: exact substring containment wins; otherwise Levenshtein
  152|       |/// distance below `max_distance = max(2, query.len() / 3)`.
  153|      3|pub fn suggest_codex_model(query: &str) -> Option<String> {
  154|      3|    let query_lc = query.to_ascii_lowercase();
  155|      3|    let models = list_codex_model_lc();
  156|       |
  157|       |    // Exact substring match wins.
  158|     25|    for m in &models {
                      ^23
  159|     23|        if m.contains(&query_lc) {
  160|      1|            return Some(m.clone());
  161|     22|        }
  162|       |    }
  163|       |
  164|       |    // Levenshtein fallback.
  165|      2|    let max_distance = (query.len() / 3).max(2);
  166|      2|    let mut best: Option<(usize, String)> = None;
  167|     20|    for m in &models {
                      ^18
  168|     18|        let d = levenshtein(query_lc.as_str(), m.as_str());
  169|     18|        if d <= max_distance && best.as_ref().is_none_or(|(bd, _)| d < *bd) {
                                              ^2   ^2       ^2                   ^1  ^1
  170|      2|            best = Some((d, m.clone()));
  171|     16|        }
  172|       |    }
  173|      2|    best.map(|(_, m)| m)
  174|      3|}
  175|       |
  176|      3|fn list_codex_model_lc() -> Vec<String> {
  177|      3|    list_codex_models()
  178|      3|        .into_iter()
  179|     27|        .map(|s| s.to_ascii_lowercase())
                       ^3
  180|      3|        .collect()
  181|      3|}
  182|       |
  183|     18|fn levenshtein(a: &str, b: &str) -> usize {
  184|     18|    let a_chars: Vec<char> = a.chars().collect();
  185|     18|    let b_chars: Vec<char> = b.chars().collect();
  186|     18|    if a_chars.is_empty() {
  187|      0|        return b_chars.len();
  188|     18|    }
  189|     18|    if b_chars.is_empty() {
  190|      0|        return a_chars.len();
  191|     18|    }
  192|     18|    let mut prev: Vec<usize> = (0..=b_chars.len()).collect();
  193|     18|    let mut curr = vec![0; b_chars.len() + 1];
  194|    243|    for (i, &ac) in a_chars.iter().enumerate() {
                                  ^18            ^18
  195|    243|        curr[0] = i + 1;
  196|  2.59k|        for (j, &bc) in b_chars.iter().enumerate() {
                                      ^243           ^243
  197|  2.59k|            let cost = if ac == bc { 0 } else { 1 };
                                                   ^128       ^2.46k
  198|  2.59k|            curr[j + 1] = (curr[j] + 1).min(prev[j + 1] + 1).min(prev[j] + cost);
  199|       |        }
  200|    243|        std::mem::swap(&mut prev, &mut curr);
  201|       |    }
  202|     18|    prev[b_chars.len()]
  203|     18|}
  204|       |
  205|       |/// Builds the `codex exec` command with the canonical hardening flags.
  206|       |///
  207|       |/// G31 + OAuth-only hardening (v1.0.69, mandated by gaps.md lines 41-49):
  208|       |/// the command ALWAYS uses the OAuth `auth.json` flow. The flag set is
  209|       |/// the canonical one documented in gaps.md Correção A:
  210|       |///
  211|       |/// ```text
  212|       |/// codex exec \
  213|       |///   -c mcp_servers='{}' \
  214|       |///   --json --output-schema <SCHEMA> \
  215|       |///   --ephemeral \
  216|       |///   --skip-git-repo-check \
  217|       |///   --sandbox read-only \
  218|       |///   --ignore-user-config \
  219|       |///   --ignore-rules \
  220|       |///   --ask-for-approval never \
  221|       |///   -m <MODEL> \
  222|       |///   -
  223|       |/// ```
  224|       |///
  225|       |/// The combination zeroes MCP servers (via two complementary mechanisms:
  226|       |/// the inline `-c mcp_servers='{}'` override AND `--ignore-user-config`),
  227|       |/// disables user-defined rules, and never asks for interactive approval.
  228|       |///
  229|       |/// **`OPENAI_API_KEY` is FORBIDDEN** in the spawned environment (gaps.md:48).
  230|       |/// OAuth flows via `~/.codex/auth.json` and `CODEX_ACCESS_TOKEN` only.
  231|      3|pub fn build_codex_command(args: &CodexSpawnArgs<'_>) -> Command {
  232|      3|    let full_prompt = format!("{}\n\n{}", args.prompt, args.input_text);
  233|       |
  234|       |    // OAuth-only guard (gaps.md:48). If `OPENAI_API_KEY` is set in the
  235|       |    // environment we MUST abort — that is the API-key path which is
  236|       |    // explicitly PROHIBITED. Use the OAuth `auth.json` flow exclusively.
  237|      3|    if let Ok(_key) = std::env::var("OPENAI_API_KEY") {
                            ^1
  238|      1|        let mut cmd = Command::new("false");
  239|      1|        cmd.env_clear();
  240|      1|        cmd.env("PATH", "/nonexistent");
  241|      1|        cmd.arg("--oauth-only-violation-openai-api-key-set");
  242|      1|        return cmd;
  243|      2|    }
  244|       |
  245|       |    // Write the JSON schema to a path the caller controls. Callers should
  246|       |    // pass a path under the cache dir (see [`trusted_schema_path`]).
  247|      2|    std::fs::write(&args.schema_path, args.json_schema).ok();
  248|       |
  249|      2|    let mut cmd = Command::new(args.binary);
  250|      2|    cmd.env_clear();
  251|       |    // OAuth flow: `CODEX_ACCESS_TOKEN` (Bearer) and `CODEX_HOME` (auth.json
  252|       |    // location) are whitelisted. `OPENAI_API_KEY` is INTENTIONALLY ABSENT.
  253|     34|    for var in &[
                      ^32
  254|     34|        "PATH",
  255|     34|        "HOME",
  256|     34|        "USER",
  257|     34|        "SHELL",
  258|     34|        "TERM",
  259|     34|        "LANG",
  260|     34|        "XDG_CONFIG_HOME",
  261|     34|        "XDG_DATA_HOME",
  262|     34|        "XDG_RUNTIME_DIR",
  263|     34|        "XDG_CACHE_HOME",
  264|     34|        "CODEX_ACCESS_TOKEN",
  265|     34|        "CODEX_HOME",
  266|     34|        "TMPDIR",
  267|     34|        "TMP",
  268|     34|        "TEMP",
  269|     34|        "DYLD_FALLBACK_LIBRARY_PATH",
  270|     34|    ] {
  271|     32|        if let Ok(val) = std::env::var(var) {
                                ^22
  272|     22|            cmd.env(var, val);
  273|     22|        }
                      ^10
  274|       |    }
  275|       |
  276|       |    #[cfg(windows)]
  277|       |    for var in &[
  278|       |        "LOCALAPPDATA",
  279|       |        "APPDATA",
  280|       |        "USERPROFILE",
  281|       |        "SystemRoot",
  282|       |        "COMSPEC",
  283|       |        "PATHEXT",
  284|       |    ] {
  285|       |        if let Ok(val) = std::env::var(var) {
  286|       |            cmd.env(var, val);
  287|       |        }
  288|       |    }
  289|       |
  290|      2|    cmd.arg("exec")
  291|      2|        .arg("-c")
  292|      2|        .arg("mcp_servers='{}'")
  293|      2|        .arg("--json")
  294|      2|        .arg("--output-schema")
  295|      2|        .arg(&args.schema_path)
  296|      2|        .arg("--ephemeral")
  297|      2|        .arg("--skip-git-repo-check")
  298|      2|        .arg("--sandbox")
  299|      2|        .arg("read-only")
  300|      2|        .arg("--ignore-user-config")
  301|      2|        .arg("--ignore-rules")
  302|      2|        .arg("--ask-for-approval")
  303|      2|        .arg("never");
  304|       |
  305|      2|    if let Some(m) = args.model {
  306|      2|        cmd.arg("-m").arg(m);
  307|      2|    }
                  ^0
  308|       |
  309|       |    // `-` means: read the prompt from stdin (Codex Paperclip pattern)
  310|      2|    cmd.arg("-");
  311|       |
  312|      2|    cmd.stdin(Stdio::piped())
  313|      2|        .stdout(Stdio::piped())
  314|      2|        .stderr(Stdio::piped());
  315|       |    // Keep the prompt alive for the stdin thread spawned in `spawn_codex`.
  316|      2|    let _ = full_prompt; // captured by closure below
  317|       |
  318|      2|    cmd
  319|      3|}
  320|       |
  321|       |/// Parses JSONL output from `codex exec --json`.
  322|       |///
  323|       |/// Event format (DOTS notation):
  324|       |/// - `thread.started` — session init
  325|       |/// - `turn.started` — model turn begins
  326|       |/// - `item.completed` — message or tool call; last `agent_message` wins
  327|       |/// - `turn.completed` — includes usage stats
  328|       |/// - `turn.failed` — error with optional rate-limit indicator
  329|       |/// - `error` — schema or validation error
  330|       |///
  331|       |/// G32 (v1.0.69): this function is the single source of truth for JSONL
  332|       |/// parsing. Both `enrich` and `ingest --mode codex` consume it.
  333|      5|pub fn parse_codex_jsonl(stdout: &str) -> Result<CodexResult, AppError> {
  334|      5|    let mut last_agent_text: Option<String> = None;
  335|      5|    let mut usage: Option<CodexUsage> = None;
  336|      5|    let mut rate_limited = false;
  337|      5|    let mut schema_error = false;
  338|      5|    let mut turn_failed = false;
  339|      5|    let mut failed_message = String::new();
  340|       |
  341|     17|    for line in stdout.lines() {
                              ^5     ^5
  342|     17|        let line = line.trim();
  343|     17|        if line.is_empty() {
  344|      0|            continue;
  345|     17|        }
  346|       |
  347|     17|        let event: serde_json::Value = match serde_json::from_str(line) {
                          ^16    ^16
  348|     16|            Ok(v) => v,
  349|       |            Err(_) => {
  350|      1|                tracing::warn!(target: "codex_spawn", line, "skipping malformed JSONL line");
                                                                          ^0
  351|      1|                continue;
  352|       |            }
  353|       |        };
  354|       |
  355|     16|        let event_type = match event.get("type").and_then(|t| t.as_str()) {
  356|     16|            Some(t) => t,
  357|      0|            None => continue,
  358|       |        };
  359|       |
  360|     16|        match event_type {
  361|     16|            "item.completed" => {
  362|      6|                if let Some(item) = event.get("item") {
  363|      6|                    if item.get("type").and_then(|t| t.as_str()) == Some("agent_message") {
  364|      4|                        if let Some(text) = item.get("text").and_then(|t| t.as_str()) {
  365|      4|                            last_agent_text = Some(text.to_string());
  366|      4|                        }
                                      ^0
  367|      2|                    }
  368|      0|                }
  369|       |            }
  370|     10|            "turn.completed" => {
  371|      4|                if let Some(u) = event.get("usage") {
  372|       |                    // Skip events that lack the recognised token fields
  373|       |                    // (e.g. partial broadcasts with `{}`) so the last
  374|       |                    // populated usage wins instead of being overwritten
  375|       |                    // by an empty one.
  376|      4|                    let is_populated = u
  377|      4|                        .get("input_tokens")
  378|      4|                        .and_then(|v| v.as_u64())
                                                    ^2^2
  379|      4|                        .map(|n| n > 0)
                                               ^2
  380|      4|                        .unwrap_or(false)
  381|      2|                        || u.get("output_tokens")
  382|      2|                            .and_then(|v| v.as_u64())
                                                        ^0^0
  383|      2|                            .map(|n| n > 0)
                                                   ^0
  384|      2|                            .unwrap_or(false);
  385|      4|                    if is_populated {
  386|      2|                        if let Ok(parsed) = serde_json::from_value::<CodexUsage>(u.clone()) {
  387|      2|                            usage = Some(parsed);
  388|      2|                        }
                                      ^0
  389|      2|                    }
  390|      0|                }
  391|       |            }
  392|      6|            "turn.failed" => {
  393|      1|                turn_failed = true;
  394|      1|                if let Some(err) = event.get("error") {
  395|      1|                    let msg = err
  396|      1|                        .get("message")
  397|      1|                        .and_then(|m| m.as_str())
  398|      1|                        .unwrap_or("unknown error");
  399|      1|                    failed_message = msg.to_string();
  400|      1|                    if msg.contains("rate_limit")
  401|      0|                        || msg.contains("429")
  402|      0|                        || msg.contains("Too Many Requests")
  403|      1|                    {
  404|      1|                        rate_limited = true;
  405|      1|                    }
                                  ^0
  406|      0|                }
  407|       |            }
  408|      5|            "error" => {
  409|      0|                if let Some(msg) = event.get("message").and_then(|m| m.as_str()) {
  410|      0|                    if msg.contains("invalid_json_schema") || msg.contains("schema") {
  411|      0|                        schema_error = true;
  412|      0|                    }
  413|      0|                }
  414|       |            }
  415|      5|            _ => {}
  416|       |        }
  417|       |    }
  418|       |
  419|      5|    let text = last_agent_text.ok_or_else(|| {
                      ^4                                   ^1
  420|      1|        AppError::Validation(format!(
  421|      1|            "no agent_message in codex JSONL output (rate_limited={rate_limited}, schema_error={schema_error}, turn_failed={turn_failed})"
  422|      1|        ))
  423|      1|    })?;
  424|       |
  425|      4|    if turn_failed {
  426|      1|        return Err(AppError::Validation(format!(
  427|      1|            "codex turn failed: {failed_message}"
  428|      1|        )));
  429|      3|    }
  430|      3|    if schema_error {
  431|      0|        return Err(AppError::Validation(
  432|      0|            "codex reported invalid_json_schema; check the --output-schema file".to_string(),
  433|      0|        ));
  434|      3|    }
  435|      3|    if rate_limited {
  436|      0|        return Err(AppError::Validation(format!(
  437|      0|            "codex rate-limited: {failed_message}"
  438|      0|        )));
  439|      3|    }
  440|       |
  441|      3|    let extraction = parse_extraction_text(&text)?;
                                                               ^0
  442|      3|    Ok(CodexResult {
  443|      3|        extraction,
  444|      3|        usage,
  445|      3|        rate_limited,
  446|      3|        schema_error,
  447|      3|        turn_failed,
  448|      3|        failed_message,
  449|      3|    })
  450|      5|}
  451|       |
  452|       |/// Parses the agent_message text as an `ExtractionResult` JSON payload.
  453|       |///
  454|       |/// The schema is shared by both `enrich` and `ingest --mode codex`; the
  455|       |/// `text` is the JSON value the assistant returned, not a wrapper object.
  456|      3|pub fn parse_extraction_text(text: &str) -> Result<ExtractionResult, AppError> {
  457|      3|    let value: serde_json::Value = serde_json::from_str(text).map_err(|e| {
                                                                                        ^0
  458|      0|        AppError::Validation(format!("failed to parse codex agent_message as JSON: {e}"))
  459|      0|    })?;
  460|      3|    let obj = value.as_object().ok_or_else(|| {
                                                            ^0
  461|      0|        AppError::Validation("codex agent_message is not a JSON object".to_string())
  462|      0|    })?;
  463|       |
  464|      3|    let mut entities: Vec<NewEntity> = Vec::new();
  465|      3|    if let Some(arr) = obj.get("entities").and_then(|v| v.as_array()) {
  466|      5|        for e in arr {
                          ^2
  467|      2|            if let Some(name) = e.get("name").and_then(|v| v.as_str()) {
  468|       |                // Accept either "type" or "entity_type" from the LLM payload
  469|       |                // and fall back to "concept" when the LLM omits it.
  470|      2|                let entity_type_str = e
  471|      2|                    .get("type")
  472|      2|                    .or_else(|| e.get("entity_type"))
                                              ^0^0
  473|      2|                    .and_then(|v| v.as_str())
  474|      2|                    .unwrap_or("concept");
  475|      2|                let entity_type = serde_json::from_value::<crate::entity_type::EntityType>(
  476|      2|                    serde_json::Value::String(entity_type_str.to_string()),
  477|       |                )
  478|      2|                .unwrap_or(crate::entity_type::EntityType::Concept);
  479|      2|                entities.push(NewEntity {
  480|      2|                    name: name.to_string(),
  481|      2|                    entity_type,
  482|      2|                    description: None,
  483|      2|                });
  484|      0|            }
  485|       |        }
  486|      0|    }
  487|       |
  488|      3|    let mut relationships: Vec<NewRelationship> = Vec::new();
  489|      3|    if let Some(arr) = obj.get("relationships").and_then(|v| v.as_array()) {
  490|      5|        for r in arr {
                          ^2
  491|      2|            let from = r.get("source").or_else(|| r.get("from"));
                                                                ^0^0
  492|      2|            let to = r.get("target").or_else(|| r.get("to"));
                                                              ^0^0
  493|      2|            let rel = r.get("relation").and_then(|v| v.as_str());
  494|      2|            if let (Some(from_v), Some(to_v), Some(rel_v)) = (
  495|      2|                from.and_then(|v| v.as_str()),
  496|      2|                to.and_then(|v| v.as_str()),
  497|      2|                rel,
  498|       |            ) {
  499|      2|                relationships.push(NewRelationship {
  500|      2|                    source: from_v.to_string(),
  501|      2|                    target: to_v.to_string(),
  502|      2|                    relation: rel_v.to_string(),
  503|      2|                    strength: r.get("strength").and_then(|v| v.as_f64()).unwrap_or(0.5),
  504|      2|                    description: None,
  505|       |                });
  506|      0|            }
  507|       |        }
  508|      0|    }
  509|       |
  510|      3|    let urls: Vec<ExtractedUrl> = obj
  511|      3|        .get("urls")
  512|      3|        .and_then(|v| v.as_array())
                                    ^2^2
  513|      3|        .map(|arr| {
                                 ^2
  514|      2|            arr.iter()
  515|      2|                .filter_map(|u| {
                                              ^0
  516|       |                    Some(ExtractedUrl {
  517|      0|                        url: u.get("url")?.as_str()?.to_string(),
  518|      0|                        offset: u.get("offset").and_then(|v| v.as_u64()).unwrap_or(0) as usize,
  519|       |                    })
  520|      0|                })
  521|      2|                .collect()
  522|      2|        })
  523|      3|        .unwrap_or_default();
  524|       |
  525|       |    Ok(ExtractionResult {
  526|      3|        entities,
  527|      3|        relationships,
  528|      3|        relationships_truncated: obj
  529|      3|            .get("relationships_truncated")
  530|      3|            .and_then(|v| v.as_bool())
                                        ^0^0
  531|      3|            .unwrap_or(false),
  532|      3|        extraction_method: obj
  533|      3|            .get("extraction_method")
  534|      3|            .and_then(|v| v.as_str())
  535|      3|            .unwrap_or("codex")
  536|      3|            .to_string(),
  537|      3|        urls,
  538|       |    })
  539|      3|}
  540|       |
  541|       |#[cfg(test)]
  542|       |mod tests {
  543|       |    use super::*;
  544|       |
  545|       |    const SAMPLE_JSONL: &str = r#"{"type":"thread.started","thread_id":"abc"}
  546|       |{"type":"turn.started"}
  547|       |{"type":"item.completed","item":{"type":"reasoning","text":"thinking"}}
  548|       |{"type":"item.completed","item":{"type":"agent_message","text":"{\"entities\":[{\"name\":\"alpha\",\"type\":\"concept\"}],\"relationships\":[{\"source\":\"alpha\",\"target\":\"beta\",\"relation\":\"uses\",\"strength\":0.7}],\"extraction_method\":\"codex\",\"urls\":[]}"}}
  549|       |{"type":"turn.completed","usage":{"input_tokens":120,"output_tokens":45}}
  550|       |{"type":"turn.completed","usage":{}}
  551|       |"#;
  552|       |
  553|       |    #[test]
  554|      1|    fn parse_codex_jsonl_extracts_last_agent_message() {
  555|      1|        let result = parse_codex_jsonl(SAMPLE_JSONL).expect("parse must succeed");
  556|      1|        assert_eq!(result.extraction.entities.len(), 1);
  557|      1|        assert_eq!(result.extraction.entities[0].name, "alpha");
  558|      1|        assert_eq!(result.extraction.relationships.len(), 1);
  559|      1|        assert_eq!(result.extraction.relationships[0].relation, "uses");
  560|      1|        assert!((result.extraction.relationships[0].strength - 0.7).abs() < 1e-6);
  561|      1|    }
  562|       |
  563|       |    #[test]
  564|      1|    fn parse_codex_jsonl_collects_usage() {
  565|      1|        let result = parse_codex_jsonl(SAMPLE_JSONL).expect("parse must succeed");
  566|      1|        let usage = result.usage.expect("usage must be populated");
  567|      1|        assert_eq!(usage.input_tokens, 120);
  568|      1|        assert_eq!(usage.output_tokens, 45);
  569|      1|    }
  570|       |
  571|       |    #[test]
  572|      1|    fn parse_codex_jsonl_detects_rate_limit() {
  573|      1|        let r = parse_codex_jsonl(
  574|      1|            "{\"type\":\"turn.failed\",\"error\":{\"message\":\"rate_limit: 429 too many\"}}\n{\"type\":\"item.completed\",\"item\":{\"type\":\"agent_message\",\"text\":\"{}\"}}",
  575|       |        );
  576|      1|        assert!(matches!(r, Err(AppError::Validation(_))));
                              ^0
  577|      1|    }
  578|       |
  579|       |    #[test]
  580|      1|    fn parse_codex_jsonl_handles_no_agent_message() {
  581|      1|        let r = parse_codex_jsonl("{\"type\":\"thread.started\"}");
  582|      1|        assert!(matches!(r, Err(AppError::Validation(_))));
                              ^0
  583|      1|    }
  584|       |
  585|       |    #[test]
  586|      1|    fn parse_codex_jsonl_skips_malformed_lines() {
  587|      1|        let r = parse_codex_jsonl(
  588|      1|            "{not valid json\n{\"type\":\"item.completed\",\"item\":{\"type\":\"agent_message\",\"text\":\"{\\\"entities\\\":[],\\\"relationships\\\":[],\\\"extraction_method\\\":\\\"codex\\\"}\"}}",
  589|       |        );
  590|      1|        assert!(r.is_ok(), "malformed lines must be skipped, got {r:?}");
                                         ^0
  591|      1|    }
  592|       |
  593|       |    #[test]
  594|      1|    fn validate_codex_model_accepts_known() {
  595|      1|        assert!(validate_codex_model(Some("gpt-5.5")).is_ok());
  596|      1|        assert!(validate_codex_model(Some("gpt-5.4")).is_ok());
  597|      1|        assert!(validate_codex_model(None).is_ok()); // no override
  598|      1|    }
  599|       |
  600|       |    #[test]
  601|      1|    fn validate_codex_model_rejects_unknown() {
  602|      1|        let err = validate_codex_model(Some("gpt-4")).unwrap_err();
  603|      1|        let msg = format!("{err}");
  604|      1|        assert!(msg.contains("not supported"));
  605|      1|        assert!(msg.contains("gpt-5.5"));
  606|      1|    }
  607|       |
  608|       |    #[test]
  609|      1|    fn list_codex_models_includes_all_static_whitelist() {
  610|      1|        let models = list_codex_models();
  611|      6|        for m in CODEX_PRO_OAUTH_MODELS {
                          ^5
  612|      5|            assert!(models.contains(&m.to_string()), "missing {m} in {models:?}");
                                                                   ^0
  613|       |        }
  614|      1|    }
  615|       |
  616|       |    #[test]
  617|      1|    fn suggest_codex_model_substring_match() {
  618|      1|        let s = suggest_codex_model("gpt-5");
  619|      1|        assert!(s.is_some(), "must suggest a gpt-5.x model");
                                           ^0
  620|      1|    }
  621|       |
  622|       |    #[test]
  623|      1|    fn suggest_codex_model_fuzzy_match() {
  624|       |        // 'gpt5.5' has no hyphen; should still suggest 'gpt-5.5'.
  625|      1|        let s = suggest_codex_model("gpt5.5");
  626|      1|        assert!(s.is_some(), "fuzzy must suggest gpt-5.5 for 'gpt5.5'");
                                           ^0
  627|      1|        assert_eq!(s.unwrap(), "gpt-5.5");
  628|      1|    }
  629|       |
  630|       |    #[test]
  631|      1|    fn suggest_codex_model_unrelated_returns_none() {
  632|      1|        let s = suggest_codex_model("totally-unrelated-zzz");
  633|      1|        assert!(s.is_none());
  634|      1|    }
  635|       |
  636|       |    #[test]
  637|      1|    fn build_codex_command_includes_hardening_flags() {
  638|      1|        let args = CodexSpawnArgs {
  639|      1|            binary: Path::new("/bin/true"),
  640|      1|            prompt: "p",
  641|      1|            json_schema: "{}",
  642|      1|            input_text: "i",
  643|      1|            model: Some("gpt-5.5"),
  644|      1|            timeout_secs: 60,
  645|      1|            schema_path: std::env::temp_dir().join("test-schema.json"),
  646|      1|        };
  647|      1|        let cmd = build_codex_command(&args);
  648|      1|        let collected: Vec<String> = cmd
  649|      1|            .get_args()
  650|     17|            .filter_map(|a| a.to_str().map(|s| s.to_string()))
                           ^1
  651|      1|            .collect();
  652|     13|        for required in &[
                          ^12
  653|     13|            "exec",
  654|     13|            "--json",
  655|     13|            "--output-schema",
  656|     13|            "--ephemeral",
  657|     13|            "--skip-git-repo-check",
  658|     13|            "--sandbox",
  659|     13|            "read-only",
  660|     13|            "--ignore-user-config",
  661|     13|            "--ignore-rules",
  662|     13|            "-m",
  663|     13|            "gpt-5.5",
  664|     13|            "-",
  665|     13|        ] {
  666|     12|            assert!(
  667|    115|                collected.iter().any(|a| a == required),
                              ^12              ^12
  668|      0|                "missing flag {required} in {collected:?}"
  669|       |            );
  670|       |        }
  671|      1|    }
  672|       |
  673|       |    #[test]
  674|      1|    fn list_codex_models_dedupes_with_cache_file() {
  675|       |        // Ensure the union with the cache file (when present) does not
  676|       |        // produce duplicates. We can't actually write a cache file in
  677|       |        // a test, so we just verify the static path is dedup'd.
  678|      1|        let models = list_codex_models();
  679|      1|        let unique: std::collections::HashSet<_> = models.iter().collect();
  680|      1|        assert_eq!(unique.len(), models.len(), "list_codex_models must dedupe");
                                                             ^0
  681|      1|    }
  682|       |
  683|       |    /// OAuth-only conformance test (gaps.md:41-49, v1.0.69 mandate).
  684|       |    /// Verifies that `build_codex_command` always emits `-c mcp_servers='{}'`,
  685|       |    /// `--ignore-user-config`, `--ask-for-approval never` and does NOT
  686|       |    /// whitelist `OPENAI_API_KEY` in the env_clear whitelist.
  687|       |    #[test]
  688|       |    #[serial_test::serial(env)]
  689|      1|    fn build_command_oauth_only_mandatory_flags() {
  690|       |        // SAFETY: unit test
  691|      1|        unsafe {
  692|      1|            std::env::remove_var("OPENAI_API_KEY");
  693|      1|        }
  694|      1|        let schema = std::env::temp_dir().join("codex-test-schema.json");
  695|      1|        let _ = std::fs::remove_file(&schema);
  696|      1|        let args = CodexSpawnArgs {
  697|      1|            binary: std::path::Path::new("/usr/bin/false"),
  698|      1|            prompt: "p",
  699|      1|            json_schema: "{}",
  700|      1|            input_text: "i",
  701|      1|            model: Some("gpt-5.4-mini"),
  702|      1|            timeout_secs: 60,
  703|      1|            schema_path: schema.clone(),
  704|      1|        };
  705|      1|        let cmd = build_codex_command(&args);
  706|     17|        let argv: Vec<&str> = cmd.get_args().filter_map(|a| a.to_str()).collect();
                          ^1    ^1          ^1  ^1         ^1                         ^1
  707|       |        // Mandatory flags from gaps.md lines 233-238
  708|      1|        assert!(argv.contains(&"-c"), "must have -c (gaps.md:234)");
                                                    ^0
  709|      1|        assert!(
  710|      1|            argv.contains(&"mcp_servers='{}'"),
  711|      0|            "must have mcp_servers override (gaps.md:234)"
  712|       |        );
  713|      1|        assert!(
  714|      1|            argv.contains(&"--ignore-user-config"),
  715|      0|            "must have --ignore-user-config (gaps.md:266)"
  716|       |        );
  717|      1|        assert!(
  718|      1|            argv.contains(&"--ask-for-approval"),
  719|      0|            "must have --ask-for-approval never (gaps.md:237)"
  720|       |        );
  721|      1|        assert!(
  722|      1|            argv.contains(&"--sandbox"),
  723|      0|            "must have --sandbox read-only (G31)"
  724|       |        );
  725|      1|        assert!(argv.contains(&"--ephemeral"), "must have --ephemeral (G31)");
                                                             ^0
  726|      1|        assert!(
  727|      1|            argv.contains(&"--skip-git-repo-check"),
  728|      0|            "must have --skip-git-repo-check (G31)"
  729|       |        );
  730|      1|        assert!(
  731|      1|            argv.contains(&"--ignore-rules"),
  732|      0|            "must have --ignore-rules (G31)"
  733|       |        );
  734|       |    }
  735|       |
  736|       |    /// OAuth-only guard: when `OPENAI_API_KEY` is in the environment,
  737|       |    /// `build_codex_command` MUST abort the spawn (return a `false`
  738|       |    /// command), NOT pass the key through to the child.
  739|       |    #[test]
  740|       |    #[serial_test::serial(env)]
  741|      1|    fn build_command_aborts_when_openai_api_key_set() {
  742|       |        // SAFETY: unit test
  743|      1|        unsafe {
  744|      1|            std::env::set_var("OPENAI_API_KEY", "sk-violation-test");
  745|      1|        }
  746|      1|        let schema = std::env::temp_dir().join("codex-test-schema-abort.json");
  747|      1|        let _ = std::fs::remove_file(&schema);
  748|      1|        let args = CodexSpawnArgs {
  749|      1|            binary: std::path::Path::new("/usr/bin/codex"),
  750|      1|            prompt: "p",
  751|      1|            json_schema: "{}",
  752|      1|            input_text: "i",
  753|      1|            model: Some("gpt-5.4-mini"),
  754|      1|            timeout_secs: 60,
  755|      1|            schema_path: schema.clone(),
  756|      1|        };
  757|      1|        let cmd = build_codex_command(&args);
  758|      1|        let program = cmd.get_program().to_string_lossy().to_string();
  759|      1|        let argv: Vec<&str> = cmd.get_args().filter_map(|a| a.to_str()).collect();
  760|      1|        assert_eq!(
  761|       |            program, "false",
  762|      0|            "when OPENAI_API_KEY is set, build_codex_command must abort"
  763|       |        );
  764|      1|        assert!(
  765|      1|            argv.contains(&"--oauth-only-violation-openai-api-key-set"),
  766|      0|            "aborted command must carry violation marker"
  767|       |        );
  768|      1|        unsafe {
  769|      1|            std::env::remove_var("OPENAI_API_KEY");
  770|      1|        }
  771|       |    }
  772|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/completions.rs:
    1|       |//! Shell completion script generation.
    2|       |
    3|       |use clap::CommandFactory;
    4|       |use clap_complete::{generate, Shell};
    5|       |
    6|       |#[derive(clap::Args, Debug)]
    7|       |pub struct CompletionsArgs {
    8|       |    /// Shell to generate completions for
    9|       |    #[arg(value_enum)]
   10|       |    pub shell: Shell,
   11|       |}
   12|       |
   13|      0|pub fn run(args: CompletionsArgs) -> Result<(), crate::errors::AppError> {
   14|      0|    let mut cmd = crate::cli::Cli::command();
   15|      0|    let bin_name = cmd.get_name().to_string();
   16|      0|    generate(args.shell, &mut cmd, bin_name, &mut std::io::stdout());
   17|      0|    Ok(())
   18|      0|}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/daemon.rs:
    1|       |use crate::constants::DAEMON_IDLE_SHUTDOWN_SECS;
    2|       |use crate::errors::AppError;
    3|       |use crate::output;
    4|       |use crate::paths::AppPaths;
    5|       |
    6|       |#[derive(clap::Args)]
    7|       |#[command(after_long_help = "EXAMPLES:\n  \
    8|       |    # Start the embedding daemon in the foreground (default 600s idle timeout)\n  \
    9|       |    sqlite-graphrag daemon\n\n  \
   10|       |    # Start with a longer idle timeout for batch ingestion\n  \
   11|       |    sqlite-graphrag daemon --idle-shutdown-secs 3600\n\n  \
   12|       |    # Health-check a running daemon (exit 4 if not running)\n  \
   13|       |    sqlite-graphrag daemon --ping\n\n  \
   14|       |    # Request graceful shutdown of a running daemon\n  \
   15|       |    sqlite-graphrag daemon --stop\n\n\
   16|       |AUTO-SPAWN BEHAVIOR:\n  \
   17|       |    recall and hybrid-search spawn a daemon automatically when none is running,\n  \
   18|       |    amortising model warm-up across multiple invocations (idle timeout 600s).\n\n  \
   19|       |    Disable per-invocation:  sqlite-graphrag recall \"query\" --autostart-daemon=false\n  \
   20|       |    Disable globally:        export SQLITE_GRAPHRAG_DAEMON_DISABLE_AUTOSTART=1\n\n  \
   21|       |    The --autostart-daemon flag takes precedence over the env var.")]
   22|       |pub struct DaemonArgs {
   23|       |    /// Idle timeout in seconds before the daemon auto-shuts down to release the embedding model.
   24|       |    /// Default 600s; raise for long-running batch ingestion to avoid cold-start overhead.
   25|       |    #[arg(long, default_value_t = DAEMON_IDLE_SHUTDOWN_SECS)]
   26|       |    pub idle_shutdown_secs: u64,
   27|       |    /// Send a health-check ping to a running daemon and exit. Returns NotFound (exit 4) if no daemon.
   28|       |    #[arg(long)]
   29|       |    pub ping: bool,
   30|       |    /// Request graceful shutdown of a running daemon. Returns NotFound (exit 4) if no daemon.
   31|       |    #[arg(long)]
   32|       |    pub stop: bool,
   33|       |    /// Timeout in seconds for graceful shutdown drain of active requests.
   34|       |    #[arg(
   35|       |        long,
   36|       |        env = "SQLITE_GRAPHRAG_SHUTDOWN_TIMEOUT_SECS",
   37|       |        default_value_t = 10
   38|       |    )]
   39|       |    pub shutdown_timeout_secs: u64,
   40|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   41|       |    pub json: bool,
   42|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   43|       |    pub db: Option<String>,
   44|       |}
   45|       |
   46|      0|pub fn run(args: DaemonArgs) -> Result<(), AppError> {
   47|      0|    let _ = args.json;
   48|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
   49|      0|    paths.ensure_dirs()?;
   50|       |
   51|      0|    if args.ping {
   52|      0|        let response = crate::daemon::try_ping(&paths.models)?
   53|      0|            .ok_or_else(|| AppError::NotFound("daemon not running".to_string()))?;
   54|      0|        if let crate::daemon::DaemonResponse::Ok { ref version, .. } = response {
   55|      0|            if version != crate::constants::SQLITE_GRAPHRAG_VERSION {
   56|      0|                tracing::warn!(target: "daemon_cmd",
   57|       |                    daemon_version = %version,
   58|       |                    cli_version = crate::constants::SQLITE_GRAPHRAG_VERSION,
   59|      0|                    "daemon version mismatch; auto-restart will occur on the next embedding request"
   60|       |                );
   61|      0|            }
   62|      0|        }
   63|      0|        output::emit_json(&response)?;
   64|      0|        return Ok(());
   65|      0|    }
   66|       |
   67|      0|    if args.stop {
   68|      0|        let response = crate::daemon::try_shutdown(&paths.models)?
   69|      0|            .ok_or_else(|| AppError::NotFound("daemon not running".to_string()))?;
   70|      0|        output::emit_json(&response)?;
   71|      0|        return Ok(());
   72|      0|    }
   73|       |
   74|      0|    crate::daemon::run(
   75|      0|        &paths.models,
   76|      0|        args.idle_shutdown_secs,
   77|      0|        args.shutdown_timeout_secs,
   78|       |    )
   79|      0|}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/debug_schema.rs:
    1|       |//! Handler for the `debug-schema` CLI subcommand.
    2|       |
    3|       |use crate::errors::AppError;
    4|       |use crate::output;
    5|       |use crate::paths::AppPaths;
    6|       |use crate::storage::connection::open_ro;
    7|       |use serde::Serialize;
    8|       |use std::time::Instant;
    9|       |
   10|       |#[derive(clap::Args)]
   11|       |#[command(after_long_help = "EXAMPLES:\n  \
   12|       |    # Dump the SQLite schema (tables, indices, triggers) as JSON\n  \
   13|       |    sqlite-graphrag __debug_schema\n\n  \
   14|       |    # Dump schema of a database at a custom path\n  \
   15|       |    sqlite-graphrag __debug_schema --db /path/to/graphrag.sqlite\n\n  \
   16|       |    # Use SQLITE_GRAPHRAG_DB_PATH env var\n  \
   17|       |    SQLITE_GRAPHRAG_DB_PATH=/data/graphrag.sqlite sqlite-graphrag __debug_schema")]
   18|       |pub struct DebugSchemaArgs {
   19|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   20|       |    pub json: bool,
   21|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   22|       |    pub db: Option<String>,
   23|       |}
   24|       |
   25|       |#[derive(Serialize)]
   26|       |struct SchemaObject {
   27|       |    name: String,
   28|       |    #[serde(rename = "type")]
   29|       |    object_type: String,
   30|       |}
   31|       |
   32|       |#[derive(Serialize)]
   33|       |struct MigrationRecord {
   34|       |    version: i64,
   35|       |    name: String,
   36|       |    applied_on: String,
   37|       |}
   38|       |
   39|       |#[derive(Serialize)]
   40|       |struct DebugSchemaResponse {
   41|       |    /// Internal SQLite counter incremented on each DDL (PRAGMA schema_version).
   42|       |    /// Distinct from `user_version`: this one is managed automatically by SQLite.
   43|       |    schema_version: i64,
   44|       |    /// Canonical SCHEMA_USER_VERSION value set explicitly by migrations
   45|       |    /// (PRAGMA user_version). Distinct from `schema_version` (SQLite DDL counter)
   46|       |    /// and from `health.schema_version` (MAX version in refinery_schema_history).
   47|       |    user_version: i64,
   48|       |    objects: Vec<SchemaObject>,
   49|       |    migrations: Vec<MigrationRecord>,
   50|       |    elapsed_ms: u64,
   51|       |}
   52|       |
   53|      0|pub fn run(args: DebugSchemaArgs) -> Result<(), AppError> {
   54|      0|    let inicio = Instant::now();
   55|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
   56|       |
   57|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
   58|       |
   59|      0|    let conn = open_ro(&paths.db)?;
   60|       |
   61|      0|    let schema_version: i64 = conn
   62|      0|        .query_row("PRAGMA schema_version", [], |r| r.get(0))
   63|      0|        .unwrap_or(0);
   64|       |
   65|       |    // PRAGMA user_version is set explicitly after migrations (canonical value SCHEMA_USER_VERSION).
   66|      0|    let user_version: i64 = conn
   67|      0|        .query_row("PRAGMA user_version", [], |r| r.get(0))
   68|      0|        .unwrap_or(0);
   69|       |
   70|      0|    let mut stmt = conn.prepare_cached(
   71|      0|        "SELECT name, type FROM sqlite_master \
   72|      0|         WHERE type IN ('table','view','trigger','index') \
   73|      0|         ORDER BY type, name",
   74|      0|    )?;
   75|      0|    let objects: Vec<SchemaObject> = stmt
   76|      0|        .query_map([], |r| {
   77|       |            Ok(SchemaObject {
   78|      0|                name: r.get(0)?,
   79|      0|                object_type: r.get(1)?,
   80|       |            })
   81|      0|        })?
   82|      0|        .collect::<Result<Vec<_>, _>>()?;
   83|       |
   84|      0|    let existe_hist: i64 = conn
   85|      0|        .query_row(
   86|      0|            "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='refinery_schema_history'",
   87|      0|            [],
   88|      0|            |r| r.get(0),
   89|       |        )
   90|      0|        .unwrap_or(0);
   91|       |
   92|      0|    let migrations: Vec<MigrationRecord> = if existe_hist > 0 {
   93|      0|        let mut stmt_mig = conn.prepare_cached(
   94|      0|            "SELECT version, name, applied_on \
   95|      0|             FROM refinery_schema_history \
   96|      0|             ORDER BY version",
   97|      0|        )?;
   98|      0|        let rows: Vec<MigrationRecord> = stmt_mig
   99|      0|            .query_map([], |r| {
  100|       |                Ok(MigrationRecord {
  101|      0|                    version: r.get(0)?,
  102|      0|                    name: r.get(1)?,
  103|      0|                    applied_on: r.get(2)?,
  104|       |                })
  105|      0|            })?
  106|      0|            .collect::<Result<Vec<_>, _>>()?;
  107|      0|        rows
  108|       |    } else {
  109|      0|        Vec::new()
  110|       |    };
  111|       |
  112|      0|    let elapsed_ms = inicio.elapsed().as_millis() as u64;
  113|       |
  114|      0|    output::emit_json(&DebugSchemaResponse {
  115|      0|        schema_version,
  116|      0|        user_version,
  117|      0|        objects,
  118|      0|        migrations,
  119|      0|        elapsed_ms,
  120|      0|    })?;
  121|       |
  122|      0|    Ok(())
  123|      0|}
  124|       |
  125|       |#[cfg(test)]
  126|       |mod tests {
  127|       |    use super::*;
  128|       |    use serde_json::Value;
  129|       |
  130|       |    #[test]
  131|      1|    fn debug_schema_response_serializes_required_fields() {
  132|      1|        let resp = DebugSchemaResponse {
  133|      1|            schema_version: 42,
  134|      1|            user_version: 49,
  135|      1|            objects: vec![SchemaObject {
  136|      1|                name: "memories".to_string(),
  137|      1|                object_type: "table".to_string(),
  138|      1|            }],
  139|      1|            migrations: vec![MigrationRecord {
  140|      1|                version: 1,
  141|      1|                name: "V001__init".to_string(),
  142|      1|                applied_on: "2026-01-01T00:00:00Z".to_string(),
  143|      1|            }],
  144|      1|            elapsed_ms: 7,
  145|      1|        };
  146|      1|        let json: Value = serde_json::to_value(&resp).unwrap();
  147|      1|        assert_eq!(json["schema_version"], 42);
  148|      1|        assert_eq!(json["user_version"], 49);
  149|      1|        assert!(json["objects"].is_array());
  150|      1|        assert_eq!(json["objects"][0]["name"], "memories");
  151|      1|        assert_eq!(json["objects"][0]["type"], "table");
  152|      1|        assert!(json["migrations"].is_array());
  153|      1|        assert_eq!(json["migrations"][0]["version"], 1);
  154|      1|        assert_eq!(json["elapsed_ms"], 7);
  155|      1|    }
  156|       |
  157|       |    #[test]
  158|      1|    fn schema_object_renomeia_campo_type() {
  159|      1|        let obj = SchemaObject {
  160|      1|            name: "entities".to_string(),
  161|      1|            object_type: "table".to_string(),
  162|      1|        };
  163|      1|        let json: Value = serde_json::to_value(&obj).unwrap();
  164|      1|        assert!(json.get("object_type").is_none());
  165|      1|        assert_eq!(json["type"], "table");
  166|      1|    }
  167|       |
  168|       |    #[test]
  169|      1|    fn migration_record_serializes_all_fields() {
  170|      1|        let rec = MigrationRecord {
  171|      1|            version: 3,
  172|      1|            name: "V003__indexes".to_string(),
  173|      1|            applied_on: "2026-04-19T12:00:00Z".to_string(),
  174|      1|        };
  175|      1|        let json: Value = serde_json::to_value(&rec).unwrap();
  176|      1|        assert_eq!(json["version"], 3);
  177|      1|        assert_eq!(json["name"], "V003__indexes");
  178|      1|        assert_eq!(json["applied_on"], "2026-04-19T12:00:00Z");
  179|      1|    }
  180|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/deep_research.rs:
    1|       |//! Handler for the `deep-research` CLI subcommand.
    2|       |//!
    3|       |//! Orchestrates parallel multi-hop GraphRAG search via query decomposition.
    4|       |//! The workload is I/O-bound (SQLite WAL reads), so tokio is used instead of
    5|       |//! rayon. Each sub-query opens its own read-only connection.
    6|       |
    7|       |use crate::errors::AppError;
    8|       |use crate::graph::{
    9|       |    bfs_with_predecessors, traverse_from_memories_with_hops_capped, PredecessorMap,
   10|       |};
   11|       |use crate::output;
   12|       |use crate::paths::AppPaths;
   13|       |use crate::storage::connection::open_ro;
   14|       |use crate::storage::fusion::{rrf_fuse, rrf_max_possible};
   15|       |use crate::storage::{entities, memories};
   16|       |
   17|       |use serde::Serialize;
   18|       |use std::collections::HashSet;
   19|       |use std::sync::Arc;
   20|       |use tokio::sync::Semaphore;
   21|       |use tokio::task::JoinSet;
   22|       |
   23|       |/// Arguments for the `deep-research` subcommand.
   24|       |#[derive(clap::Args)]
   25|       |#[command(
   26|       |    about = "Deep parallel multi-hop GraphRAG research via query decomposition",
   27|       |    after_long_help = "EXAMPLES:\n  \
   28|       |        # Basic deep research\n  \
   29|       |        sqlite-graphrag deep-research \"auth architecture decisions\"\n\n  \
   30|       |        # With custom parameters\n  \
   31|       |        sqlite-graphrag deep-research \"auth\" --k 20 --max-hops 3 --max-sub-queries 7\n\n  \
   32|       |        # Include full memory bodies in output\n  \
   33|       |        sqlite-graphrag deep-research \"auth\" --with-bodies\n\n  \
   34|       |        # Tune RRF and graph scoring\n  \
   35|       |        sqlite-graphrag deep-research \"auth and deployment\" --rrf-k 60 --graph-decay 0.7"
   36|       |)]
   37|       |pub struct DeepResearchArgs {
   38|       |    /// Research query to decompose and search.
   39|       |    #[arg(
   40|       |        value_name = "QUERY",
   41|       |        allow_hyphen_values = true,
   42|       |        help = "Research query to decompose and search"
   43|       |    )]
   44|       |    pub query: String,
   45|       |    /// Results per sub-query (Recall@20 captures 95%+ relevant hits).
   46|       |    #[arg(
   47|       |        long,
   48|       |        short,
   49|       |        aliases = ["limit", "top-k"],
   50|       |        default_value_t = 20,
   51|       |        help = "Results per sub-query (Recall@20 captures 95%+ relevant hits)"
   52|       |    )]
   53|       |    pub k: usize,
   54|       |    /// Maximum sub-queries from decomposition (covers complex multi-hop queries).
   55|       |    #[arg(
   56|       |        long,
   57|       |        default_value_t = 7,
   58|       |        help = "Maximum sub-queries (covers complex multi-hop queries)"
   59|       |    )]
   60|       |    pub max_sub_queries: usize,
   61|       |    /// Multi-hop graph traversal depth (sweet spot: 2-3 hops).
   62|       |    #[arg(
   63|       |        long,
   64|       |        default_value_t = 3,
   65|       |        help = "Multi-hop graph traversal depth (sweet spot: 2-3 hops)"
   66|       |    )]
   67|       |    pub max_hops: usize,
   68|       |    /// Minimum edge weight for graph traversal.
   69|       |    #[arg(
   70|       |        long,
   71|       |        default_value_t = 0.3,
   72|       |        help = "Minimum edge weight for graph traversal"
   73|       |    )]
   74|       |    pub min_weight: f64,
   75|       |    /// Maximum concurrent sub-queries (default: min(cpus, 8)).
   76|       |    #[arg(long, help = "Maximum concurrent sub-queries (default: min(cpus, 8))")]
   77|       |    pub max_concurrency: Option<usize>,
   78|       |    /// Timeout per sub-query in seconds.
   79|       |    #[arg(long, default_value_t = 30, help = "Timeout per sub-query in seconds")]
   80|       |    pub timeout: u64,
   81|       |    /// Include full memory bodies in results.
   82|       |    #[arg(
   83|       |        long,
   84|       |        default_value_t = false,
   85|       |        help = "Include full memory bodies in results"
   86|       |    )]
   87|       |    pub with_bodies: bool,
   88|       |    /// Maximum results after deduplication.
   89|       |    #[arg(
   90|       |        long,
   91|       |        default_value_t = 50,
   92|       |        help = "Maximum results after deduplication"
   93|       |    )]
   94|       |    pub max_results: usize,
   95|       |    /// RRF k parameter controlling score smoothing (higher = less weight on top ranks).
   96|       |    #[arg(
   97|       |        long,
   98|       |        default_value_t = 60.0,
   99|       |        help = "RRF k parameter (higher = less weight on top ranks)"
  100|       |    )]
  101|       |    pub rrf_k: f64,
  102|       |    /// Decay factor applied to graph scores per hop (score = seed_score * decay^hop).
  103|       |    #[arg(
  104|       |        long,
  105|       |        default_value_t = 0.7,
  106|       |        help = "Graph score decay factor per hop (0.0-1.0)"
  107|       |    )]
  108|       |    pub graph_decay: f64,
  109|       |    /// Minimum score threshold for graph-expanded results (filters noise).
  110|       |    #[arg(
  111|       |        long,
  112|       |        default_value_t = 0.05,
  113|       |        help = "Minimum score threshold for graph-expanded results"
  114|       |    )]
  115|       |    pub graph_min_score: f64,
  116|       |    /// Limit top-k neighbours followed per entity per hop (None = unlimited).
  117|       |    #[arg(
  118|       |        long,
  119|       |        help = "Limit neighbours per entity per hop for graph traversal (default: unlimited)"
  120|       |    )]
  121|       |    pub max_neighbors_per_hop: Option<usize>,
  122|       |    /// Namespace (env: SQLITE_GRAPHRAG_NAMESPACE, default: global).
  123|       |    #[arg(
  124|       |        long,
  125|       |        help = "Namespace (env: SQLITE_GRAPHRAG_NAMESPACE, default: global)"
  126|       |    )]
  127|       |    pub namespace: Option<String>,
  128|       |    /// Research mode: `none` (local heuristic, default), `claude-code`, `codex` (v1.1.0).
  129|       |    #[arg(long, default_value = "none", value_parser = ["none"], hide = true)]
  130|       |    pub mode: String,
  131|       |    /// Maximum LLM cost in USD (effective with --mode claude-code/codex, reserved for v1.1.0).
  132|       |    #[arg(
  133|       |        long,
  134|       |        value_name = "USD",
  135|       |        help = "Max LLM cost in USD (effective with --mode claude-code/codex)"
  136|       |    )]
  137|       |    pub max_cost_usd: Option<f64>,
  138|       |    /// JSON output (always on, kept for consistency).
  139|       |    #[arg(long, hide = true)]
  140|       |    pub json: bool,
  141|       |    /// Database path.
  142|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
  143|       |    pub db: Option<String>,
  144|       |    #[command(flatten)]
  145|       |    pub daemon: crate::cli::DaemonOpts,
  146|       |}
  147|       |
  148|       |#[derive(Serialize)]
  149|       |struct SubQuery {
  150|       |    id: usize,
  151|       |    text: String,
  152|       |    source: &'static str,
  153|       |}
  154|       |
  155|       |#[derive(Serialize)]
  156|       |struct DeepResult {
  157|       |    name: String,
  158|       |    score: f64,
  159|       |    source: String,
  160|       |    sub_query_ids: Vec<usize>,
  161|       |    snippet: String,
  162|       |    #[serde(skip_serializing_if = "Option::is_none")]
  163|       |    body: Option<String>,
  164|       |    hop_distance: Option<usize>,
  165|       |}
  166|       |
  167|       |/// A node in a reconstructed evidence path.
  168|       |#[derive(Serialize, Clone)]
  169|       |struct EvidenceNode {
  170|       |    entity: String,
  171|       |    #[serde(skip_serializing_if = "Option::is_none")]
  172|       |    relation: Option<String>,
  173|       |    #[serde(skip_serializing_if = "Option::is_none")]
  174|       |    weight: Option<f64>,
  175|       |}
  176|       |
  177|       |/// A directed evidence chain reconstructed from BFS predecessors.
  178|       |///
  179|       |/// Fields:
  180|       |/// - `from`: name of the seed (source) entity.
  181|       |/// - `to`: name of the terminal (target) entity.
  182|       |/// - `path`: ordered list of intermediate nodes from `from` to `to`.
  183|       |/// - `total_weight`: product of edge weights along the path.
  184|       |/// - `sub_query_ids`: which sub-queries produced this chain.
  185|       |#[derive(Serialize)]
  186|       |struct EvidenceChain {
  187|       |    from: String,
  188|       |    to: String,
  189|       |    path: Vec<EvidenceNode>,
  190|       |    total_weight: f64,
  191|       |    depth: usize,
  192|       |    sub_query_ids: Vec<usize>,
  193|       |}
  194|       |
  195|       |#[derive(Serialize)]
  196|       |struct ResearchStats {
  197|       |    sub_queries_total: usize,
  198|       |    sub_queries_completed: usize,
  199|       |    sub_queries_failed: usize,
  200|       |    sub_queries_timed_out: usize,
  201|       |    unique_memories_found: usize,
  202|       |    evidence_chains_found: usize,
  203|       |    elapsed_ms: u64,
  204|       |}
  205|       |
  206|       |#[derive(Serialize)]
  207|       |struct GraphContextEntity {
  208|       |    name: String,
  209|       |    entity_type: String,
  210|       |    degree: u32,
  211|       |}
  212|       |
  213|       |#[derive(Serialize)]
  214|       |struct GraphContextRel {
  215|       |    from: String,
  216|       |    to: String,
  217|       |    relation: String,
  218|       |    weight: f64,
  219|       |}
  220|       |
  221|       |#[derive(Serialize)]
  222|       |struct GraphContext {
  223|       |    entities: Vec<GraphContextEntity>,
  224|       |    relationships: Vec<GraphContextRel>,
  225|       |}
  226|       |
  227|       |#[derive(Serialize)]
  228|       |struct DeepResearchResponse {
  229|       |    query: String,
  230|       |    sub_queries: Vec<SubQuery>,
  231|       |    results: Vec<DeepResult>,
  232|       |    evidence_chains: Vec<EvidenceChain>,
  233|       |    #[serde(skip_serializing_if = "Option::is_none")]
  234|       |    graph_context: Option<GraphContext>,
  235|       |    stats: ResearchStats,
  236|       |}
  237|       |
  238|       |/// Aggregated hit data: (score, source_label, snippet, body, hop_distance, sub_query_ids).
  239|       |type MergedHit = (f64, String, String, String, Option<usize>, Vec<usize>);
  240|       |
  241|       |/// Intermediate result from a single sub-query execution.
  242|       |struct SubQueryResult {
  243|       |    sub_query_id: usize,
  244|       |    /// (memory_id, score, source_label, snippet, body, hop_distance)
  245|       |    hits: Vec<(i64, f64, String, String, String, Option<usize>)>,
  246|       |    /// Evidence chains reconstructed from BFS.
  247|       |    chains: Vec<EvidenceChain>,
  248|       |}
  249|       |
  250|       |/// Sync entry point — builds a tokio runtime for the async fan-out.
  251|       |#[tracing::instrument(skip_all, level = "debug", name = "deep_research")]
  252|      0|pub fn run(args: DeepResearchArgs) -> Result<(), AppError> {
  253|      0|    tracing::debug!(target: "deep_research", query = %args.query, k = args.k, "starting deep research");
  254|      0|    let rt = tokio::runtime::Builder::new_multi_thread()
  255|      0|        .worker_threads(2)
  256|      0|        .enable_all()
  257|      0|        .build()
  258|      0|        .map_err(|e| AppError::Internal(anyhow::anyhow!("failed to build tokio runtime: {e}")))?;
  259|      0|    rt.block_on(run_async(args))
  260|      0|}
  261|       |
  262|       |/// Main async logic: decompose, fan-out, assemble, emit JSON.
  263|      0|async fn run_async(args: DeepResearchArgs) -> Result<(), AppError> {
  264|      0|    let start = std::time::Instant::now();
  265|       |
  266|      0|    if args.query.trim().is_empty() {
  267|      0|        return Err(AppError::Validation(crate::i18n::validation::empty_query()));
  268|      0|    }
  269|       |
  270|      0|    if args.max_cost_usd.is_some() && args.mode == "none" {
  271|      0|        tracing::warn!(target: "deep_research", "--max-cost-usd has no effect without --mode claude-code/codex");
  272|      0|    }
  273|       |
  274|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
  275|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
  276|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
  277|       |
  278|       |    // Phase 1: Query decomposition (sync, pure logic).
  279|      0|    let sub_query_texts = decompose_query(&args.query, args.max_sub_queries);
  280|      0|    let sub_queries: Vec<SubQuery> = sub_query_texts
  281|      0|        .iter()
  282|      0|        .enumerate()
  283|      0|        .map(|(i, text)| SubQuery {
  284|      0|            id: i,
  285|      0|            text: text.clone(),
  286|      0|            source: if sub_query_texts.len() == 1 {
  287|      0|                "original"
  288|       |            } else {
  289|      0|                "decomposed"
  290|       |            },
  291|      0|        })
  292|      0|        .collect();
  293|       |
  294|       |    // GAP-07 FIX: compute ONE embedding PER sub-query text (sequential — daemon serialises).
  295|       |    // The previous code used a single embedding for args.query shared across all sub-queries,
  296|       |    // making decomposition cosmetic.  We now build a Vec<Arc<Vec<f32>>> indexed by sub-query.
  297|      0|    output::emit_progress_i18n(
  298|      0|        "Computing per-sub-query embeddings...",
  299|      0|        "Calculando embeddings por sub-consulta...",
  300|       |    );
  301|      0|    let mut sub_embeddings: Vec<Arc<Vec<f32>>> = Vec::with_capacity(sub_query_texts.len());
  302|      0|    for sq_text in &sub_query_texts {
  303|      0|        let emb = crate::daemon::embed_query_or_local(
  304|      0|            &paths.models,
  305|      0|            sq_text,
  306|      0|            args.daemon.autostart_daemon,
  307|      0|        )?;
  308|      0|        sub_embeddings.push(Arc::new(emb));
  309|       |    }
  310|       |
  311|       |    // Phase 2: Fan-out — parallel sub-query execution.
  312|      0|    let cpu_count = std::thread::available_parallelism()
  313|      0|        .map(|n| n.get())
  314|      0|        .unwrap_or(4);
  315|      0|    let permits = args
  316|      0|        .max_concurrency
  317|      0|        .unwrap_or_else(|| cpu_count.min(8))
  318|      0|        .min(sub_queries.len())
  319|      0|        .max(1);
  320|      0|    let semaphore = Arc::new(Semaphore::new(permits));
  321|      0|    let timeout_dur = std::time::Duration::from_secs(args.timeout);
  322|       |
  323|      0|    let mut join_set: JoinSet<Result<SubQueryResult, (usize, String)>> = JoinSet::new();
  324|       |
  325|      0|    for (idx, sq_text) in sub_query_texts.iter().enumerate() {
  326|      0|        let sem = Arc::clone(&semaphore);
  327|       |        // GAP-07 FIX: pass embedding for THIS specific sub-query.
  328|      0|        let emb = Arc::clone(&sub_embeddings[idx]);
  329|      0|        let ns = namespace.clone();
  330|      0|        let db_path = paths.db.clone();
  331|      0|        let query_text = sq_text.clone();
  332|      0|        let k = args.k;
  333|      0|        let max_hops = args.max_hops;
  334|      0|        let min_weight = args.min_weight;
  335|      0|        let rrf_k = args.rrf_k;
  336|      0|        let graph_decay = args.graph_decay;
  337|      0|        let graph_min_score = args.graph_min_score;
  338|      0|        let max_neighbors_per_hop = args.max_neighbors_per_hop;
  339|       |
  340|      0|        join_set.spawn(async move {
  341|      0|            let _permit = sem
  342|      0|                .acquire_owned()
  343|      0|                .await
  344|      0|                .map_err(|e| (idx, format!("semaphore closed: {e}")))?;
  345|       |
  346|       |            // Dereference the Arc to obtain a &[f32] slice for the sync function.
  347|      0|            let result = tokio::time::timeout(timeout_dur, async move {
  348|      0|                execute_sub_query(
  349|      0|                    idx,
  350|      0|                    &query_text,
  351|      0|                    emb.as_slice(),
  352|      0|                    &ns,
  353|      0|                    &db_path,
  354|      0|                    k,
  355|      0|                    max_hops,
  356|      0|                    min_weight,
  357|      0|                    rrf_k,
  358|      0|                    graph_decay,
  359|      0|                    graph_min_score,
  360|      0|                    max_neighbors_per_hop,
  361|       |                )
  362|      0|            })
  363|      0|            .await;
  364|       |
  365|      0|            match result {
  366|      0|                Ok(inner) => inner.map_err(|e| (idx, e)),
  367|      0|                Err(_) => Err((idx, "timeout".to_string())),
  368|       |            }
  369|      0|        });
  370|       |    }
  371|       |
  372|       |    // Collect results incrementally.
  373|      0|    let mut sub_query_results: Vec<SubQueryResult> = Vec::with_capacity(sub_queries.len());
  374|      0|    let mut failed_count = 0usize;
  375|      0|    let mut timed_out_count = 0usize;
  376|       |
  377|      0|    while let Some(join_result) = join_set.join_next().await {
  378|      0|        match join_result {
  379|      0|            Ok(Ok(sqr)) => sub_query_results.push(sqr),
  380|      0|            Ok(Err((_idx, reason))) => {
  381|      0|                if reason == "timeout" {
  382|      0|                    timed_out_count += 1;
  383|      0|                } else {
  384|      0|                    failed_count += 1;
  385|      0|                }
  386|      0|                tracing::warn!(target: "deep_research", sub_query_id = _idx, reason = %reason, "sub-query failed");
  387|       |            }
  388|      0|            Err(join_err) => {
  389|      0|                failed_count += 1;
  390|      0|                if join_err.is_panic() {
  391|      0|                    tracing::error!(target: "deep_research", error = %join_err, "sub-query task panicked");
  392|       |                } else {
  393|      0|                    tracing::warn!(target: "deep_research", error = %join_err, "sub-query task cancelled");
  394|       |                }
  395|       |            }
  396|       |        }
  397|       |    }
  398|       |
  399|       |    // Phase 3: Evidence assembly — merge, dedup, rank.
  400|       |    // Aggregate hits: memory_id -> (best_score, source, snippet, body, hop_distance, sub_query_ids)
  401|      0|    let mut merged: crate::hash::AHashMap<i64, MergedHit> =
  402|      0|        crate::hash::AHashMap::with_capacity_and_hasher(
  403|      0|            sub_query_results.len() * args.k,
  404|      0|            Default::default(),
  405|       |        );
  406|       |
  407|      0|    for sqr in &sub_query_results {
  408|      0|        for (mem_id, score, source, snippet, body, hop) in &sqr.hits {
  409|      0|            let entry = merged.entry(*mem_id).or_insert_with(|| {
  410|      0|                (
  411|      0|                    *score,
  412|      0|                    source.clone(),
  413|      0|                    snippet.clone(),
  414|      0|                    body.clone(),
  415|      0|                    *hop,
  416|      0|                    Vec::new(),
  417|      0|                )
  418|      0|            });
  419|       |            // Keep best score.
  420|      0|            if *score > entry.0 {
  421|      0|                entry.0 = *score;
  422|      0|                entry.1 = source.clone();
  423|      0|                entry.2 = snippet.clone();
  424|      0|                entry.3 = body.clone();
  425|      0|                entry.4 = *hop;
  426|      0|            }
  427|      0|            if !entry.5.contains(&sqr.sub_query_id) {
  428|      0|                entry.5.push(sqr.sub_query_id);
  429|      0|            }
  430|       |        }
  431|       |    }
  432|       |
  433|       |    // Resolve memory names for merged results.
  434|      0|    let conn = open_ro(&paths.db)?;
  435|      0|    let mut results: Vec<DeepResult> = Vec::with_capacity(merged.len().min(args.max_results));
  436|       |
  437|       |    // Sort by score descending.
  438|      0|    let mut ranked: Vec<(i64, MergedHit)> = merged.into_iter().collect();
  439|      0|    ranked.sort_by(|a, b| {
  440|      0|        b.1 .0
  441|      0|            .partial_cmp(&a.1 .0)
  442|      0|            .unwrap_or(std::cmp::Ordering::Equal)
  443|      0|    });
  444|      0|    ranked.truncate(args.max_results);
  445|       |
  446|      0|    for (mem_id, (score, source, snippet, body, hop, sq_ids)) in ranked {
  447|      0|        let name = match memories::read_full(&conn, mem_id)? {
  448|      0|            Some(row) => row.name,
  449|      0|            None => continue,
  450|       |        };
  451|      0|        results.push(DeepResult {
  452|      0|            name,
  453|      0|            score,
  454|      0|            source,
  455|      0|            sub_query_ids: sq_ids,
  456|      0|            snippet,
  457|      0|            body: if args.with_bodies { Some(body) } else { None },
  458|      0|            hop_distance: hop,
  459|       |        });
  460|       |    }
  461|       |
  462|       |    // GAP-09/10 FIX: Collect evidence chains from reconstructed BFS paths.
  463|       |    // The old code appended flat node pairs from a global SELECT; now each
  464|       |    // sub-query returns directed EvidenceChain structs (from, to, path).
  465|      0|    let completed_count = sub_query_results.len();
  466|      0|    let mut evidence_chains: Vec<EvidenceChain> = Vec::with_capacity(completed_count * 2);
  467|      0|    let mut seen_chain_keys: HashSet<String> = HashSet::with_capacity(completed_count * 2);
  468|       |
  469|      0|    for sqr in sub_query_results {
  470|      0|        for chain in sqr.chains {
  471|       |            // Deduplicate chains by (from, to) pair.
  472|      0|            let key = format!("{}->{}", chain.from, chain.to);
  473|      0|            if seen_chain_keys.insert(key) {
  474|      0|                evidence_chains.push(chain);
  475|      0|            }
  476|       |        }
  477|       |    }
  478|       |
  479|       |    // Sort evidence chains by total_weight descending, discard single-hop trivial chains.
  480|      0|    evidence_chains.retain(|c| c.depth >= 2);
  481|      0|    evidence_chains.sort_by(|a, b| {
  482|      0|        b.total_weight
  483|      0|            .partial_cmp(&a.total_weight)
  484|      0|            .unwrap_or(std::cmp::Ordering::Equal)
  485|      0|    });
  486|       |
  487|      0|    let unique_memories = results.len();
  488|      0|    let evidence_count = evidence_chains.len();
  489|       |
  490|       |    // MEDIUM-01b: Build graph_context with entities and relationships from result memories.
  491|      0|    let graph_context = if !results.is_empty() {
  492|      0|        let result_names: Vec<&str> = results.iter().map(|r| r.name.as_str()).collect();
  493|      0|        let mut ctx_entities: Vec<GraphContextEntity> = Vec::with_capacity(results.len());
  494|      0|        let mut ctx_rels: Vec<GraphContextRel> = Vec::with_capacity(results.len() * 2);
  495|      0|        let mut seen_entity_ids: crate::hash::AHashSet<i64> =
  496|      0|            crate::hash::AHashSet::with_capacity_and_hasher(results.len(), Default::default());
  497|       |
  498|      0|        for name in &result_names {
  499|      0|            if let Ok(Some(eid)) = entities::find_entity_id(&conn, &namespace, name) {
  500|      0|                if seen_entity_ids.insert(eid) {
  501|      0|                    let etype: String = conn
  502|      0|                        .query_row(
  503|      0|                            "SELECT COALESCE(type,'concept') FROM entities WHERE id = ?1",
  504|      0|                            rusqlite::params![eid],
  505|      0|                            |r| r.get(0),
  506|       |                        )
  507|      0|                        .unwrap_or_else(|_| "concept".to_string());
  508|      0|                    let degree: u32 = conn
  509|      0|                        .query_row(
  510|      0|                            "SELECT COUNT(*) FROM relationships WHERE source_id = ?1 OR target_id = ?1",
  511|      0|                            rusqlite::params![eid],
  512|      0|                            |r| r.get(0),
  513|       |                        )
  514|      0|                        .unwrap_or(0);
  515|      0|                    ctx_entities.push(GraphContextEntity {
  516|      0|                        name: name.to_string(),
  517|      0|                        entity_type: etype,
  518|      0|                        degree,
  519|      0|                    });
  520|      0|                }
  521|      0|            }
  522|       |        }
  523|       |
  524|      0|        let entity_ids: Vec<i64> = seen_entity_ids.iter().copied().collect();
  525|      0|        if entity_ids.len() >= 2 {
  526|      0|            let placeholders: String = entity_ids.iter().map(|_| "?").collect::<Vec<_>>().join(",");
  527|      0|            let sql = format!(
  528|      0|                "SELECT s.name, t.name, r.relation, r.weight \
  529|      0|                 FROM relationships r \
  530|      0|                 JOIN entities s ON s.id = r.source_id \
  531|      0|                 JOIN entities t ON t.id = r.target_id \
  532|      0|                 WHERE r.source_id IN ({placeholders}) AND r.target_id IN ({placeholders}) \
  533|      0|                 LIMIT 50"
  534|       |            );
  535|      0|            if let Ok(mut stmt) = conn.prepare(&sql) {
  536|      0|                let mut params: Vec<Box<dyn rusqlite::types::ToSql>> =
  537|      0|                    Vec::with_capacity(entity_ids.len() * 2);
  538|      0|                for id in &entity_ids {
  539|      0|                    params.push(Box::new(*id));
  540|      0|                }
  541|      0|                for id in &entity_ids {
  542|      0|                    params.push(Box::new(*id));
  543|      0|                }
  544|      0|                let param_refs: Vec<&dyn rusqlite::types::ToSql> =
  545|      0|                    params.iter().map(|p| p.as_ref()).collect();
  546|      0|                if let Ok(rows) = stmt.query_map(param_refs.as_slice(), |r| {
  547|       |                    Ok((
  548|      0|                        r.get::<_, String>(0)?,
  549|      0|                        r.get::<_, String>(1)?,
  550|      0|                        r.get::<_, String>(2)?,
  551|      0|                        r.get::<_, f64>(3)?,
  552|       |                    ))
  553|      0|                }) {
  554|      0|                    for row in rows.flatten() {
  555|      0|                        ctx_rels.push(GraphContextRel {
  556|      0|                            from: row.0,
  557|      0|                            to: row.1,
  558|      0|                            relation: row.2,
  559|      0|                            weight: row.3,
  560|      0|                        });
  561|      0|                    }
  562|      0|                }
  563|      0|            }
  564|      0|        }
  565|       |
  566|      0|        if ctx_entities.is_empty() {
  567|      0|            None
  568|       |        } else {
  569|      0|            Some(GraphContext {
  570|      0|                entities: ctx_entities,
  571|      0|                relationships: ctx_rels,
  572|      0|            })
  573|       |        }
  574|       |    } else {
  575|      0|        None
  576|       |    };
  577|       |
  578|      0|    tracing::debug!(target: "deep_research",
  579|      0|        total_results = results.len(),
  580|      0|        total_chains = evidence_chains.len(),
  581|      0|        "assembly complete"
  582|       |    );
  583|       |
  584|       |    // Phase 4: JSON output.
  585|      0|    output::emit_json(&DeepResearchResponse {
  586|      0|        query: args.query,
  587|      0|        sub_queries,
  588|      0|        results,
  589|      0|        evidence_chains,
  590|      0|        graph_context,
  591|      0|        stats: ResearchStats {
  592|      0|            sub_queries_total: sub_query_texts.len(),
  593|      0|            sub_queries_completed: completed_count,
  594|      0|            sub_queries_failed: failed_count,
  595|      0|            sub_queries_timed_out: timed_out_count,
  596|      0|            unique_memories_found: unique_memories,
  597|      0|            evidence_chains_found: evidence_count,
  598|      0|            elapsed_ms: start.elapsed().as_millis() as u64,
  599|      0|        },
  600|      0|    })?;
  601|       |
  602|      0|    Ok(())
  603|      0|}
  604|       |
  605|       |/// Heuristic query decomposition: splits by conjunctions, commas, semicolons,
  606|       |/// relational phrases, and extracts explicit entities (kebab-case or quoted).
  607|      9|fn decompose_query(query: &str, max: usize) -> Vec<String> {
  608|      9|    if query.is_empty() {
  609|      1|        return vec![query.to_string()];
  610|      8|    }
  611|       |
  612|      8|    let mut parts: Vec<String> = Vec::with_capacity(max);
  613|       |
  614|       |    // Split by relational phrases first (most specific).
  615|      8|    let relational = [
  616|      8|        " that caused ",
  617|      8|        " depending on ",
  618|      8|        " related to ",
  619|      8|        " connected to ",
  620|      8|        " linked to ",
  621|      8|        " caused by ",
  622|      8|        " followed by ",
  623|      8|    ];
  624|      8|    let mut text = query.to_string();
  625|      8|    let mut did_relational_split = false;
  626|     64|    for phrase in &relational {
                      ^56
  627|     56|        if text.to_lowercase().contains(phrase) {
  628|      1|            let lower = text.to_lowercase();
  629|      1|            if let Some(pos) = lower.find(phrase) {
  630|      1|                let left = text[..pos].trim().to_string();
  631|      1|                let right = text[pos + phrase.len()..].trim().to_string();
  632|      1|                if !left.is_empty() {
  633|      1|                    parts.push(left);
  634|      1|                }
                              ^0
  635|      1|                if !right.is_empty() {
  636|      1|                    text = right;
  637|      1|                }
                              ^0
  638|      1|                did_relational_split = true;
  639|      0|            }
  640|     55|        }
  641|       |    }
  642|      8|    if did_relational_split && !text.is_empty() {
                                             ^1
  643|      1|        parts.push(text.clone());
  644|      7|    }
  645|       |
  646|       |    // If no relational split, try conjunctions and delimiters.
  647|      8|    if parts.is_empty() {
  648|       |        // Split by semicolons first.
  649|      7|        let semi_parts: Vec<&str> = query.split(';').collect();
  650|      7|        if semi_parts.len() > 1 {
  651|      7|            for p in &semi_parts {
                              ^5
  652|      5|                let trimmed = p.trim();
  653|      5|                if !trimmed.is_empty() {
  654|      5|                    parts.push(trimmed.to_string());
  655|      5|                }
                              ^0
  656|       |            }
  657|       |        } else {
  658|       |            // Split by commas and conjunctions.
  659|       |            // Replace " and " and " e " (Portuguese) with comma, then split.
  660|      5|            let normalized = query
  661|      5|                .replace(" and ", ", ")
  662|      5|                .replace(" AND ", ", ")
  663|      5|                .replace(" e ", ", ")
  664|      5|                .replace(" E ", ", ");
  665|      5|            let comma_parts: Vec<&str> = normalized.split(',').collect();
  666|      5|            if comma_parts.len() > 1 {
  667|     21|                for p in &comma_parts {
                                  ^17
  668|     17|                    let trimmed = p.trim();
  669|     17|                    if !trimmed.is_empty() {
  670|     17|                        parts.push(trimmed.to_string());
  671|     17|                    }
                                  ^0
  672|       |                }
  673|      1|            }
  674|       |        }
  675|      1|    }
  676|       |
  677|       |    // If still no split, try word-pair decomposition for multi-word queries.
  678|      8|    if parts.is_empty() {
  679|      2|        let words: Vec<&str> = query.split_whitespace().filter(|w| w.len() > 2).collect();
                          ^1     ^1          ^1    ^1                 ^1                      ^1
  680|      1|        if words.len() >= 3 {
  681|      0|            parts.push(query.to_string());
  682|      0|            parts.push(format!("{} {}", words[0], words[1]));
  683|      0|            parts.push(format!(
  684|      0|                "{} {}",
  685|      0|                words[words.len() - 2],
  686|      0|                words[words.len() - 1]
  687|      0|            ));
  688|      1|        }
  689|      7|    }
  690|       |
  691|      8|    if parts.is_empty() {
  692|      1|        return vec![query.to_string()];
  693|      7|    }
  694|       |
  695|       |    // Cap at max.
  696|      7|    parts.truncate(max);
  697|      7|    parts
  698|      9|}
  699|       |
  700|       |/// Reconstruct a directed path from `target_entity_id` back to a seed using the
  701|       |/// predecessor map built by BFS.  Returns the path nodes from root to target
  702|       |/// plus the accumulated edge weights.
  703|      1|fn reconstruct_path(
  704|      1|    target_id: i64,
  705|      1|    seed_entity_ids: &HashSet<i64>,
  706|      1|    predecessor: &PredecessorMap,
  707|      1|    entity_names: &crate::hash::AHashMap<i64, String>,
  708|      1|) -> Option<(Vec<EvidenceNode>, f64)> {
  709|      1|    let mut path_ids: Vec<(i64, Option<String>, Option<f64>)> = Vec::with_capacity(8);
  710|      1|    let mut total_weight = 1.0_f64;
  711|      1|    let mut current = target_id;
  712|       |
  713|       |    loop {
  714|      3|        if seed_entity_ids.contains(&current) {
  715|      1|            break;
  716|      2|        }
  717|      2|        let (parent, relation, weight) = predecessor.get(&current)?;
                                                                                ^0
  718|      2|        total_weight *= weight;
  719|      2|        path_ids.push((current, Some(relation.clone()), Some(*weight)));
  720|      2|        current = *parent;
  721|       |    }
  722|       |    // Push the seed entity (root).
  723|      1|    path_ids.push((current, None, None));
  724|       |
  725|       |    // Reverse so path goes from seed → target.
  726|      1|    path_ids.reverse();
  727|       |
  728|      1|    let nodes: Vec<EvidenceNode> = path_ids
  729|      1|        .into_iter()
  730|      1|        .map(|(id, relation, weight)| EvidenceNode {
  731|      3|            entity: entity_names
  732|      3|                .get(&id)
  733|      3|                .cloned()
  734|      3|                .unwrap_or_else(|| format!("entity-{id}")),
                                                         ^0
  735|      3|            relation,
  736|      3|            weight,
  737|      3|        })
  738|      1|        .collect();
  739|       |
  740|      1|    Some((nodes, total_weight))
  741|      1|}
  742|       |
  743|       |/// Execute a single sub-query: hybrid search (KNN + FTS fused via RRF) + graph traversal.
  744|       |///
  745|       |/// GAP-07 fix: receives the embedding for THIS sub-query (not the shared original).
  746|       |/// GAP-08/11 fix: uses rrf_fuse() for proper score fusion instead of hardcoded 0.5.
  747|       |/// GAP-09/10 fix: builds directed evidence chains filtered to discovered entities.
  748|       |/// GAP-17: respects max_neighbors_per_hop cap in BFS.
  749|       |///
  750|       |/// Runs synchronously on a blocking thread (called from a tokio spawn context).
  751|       |/// Each call opens its own read-only SQLite connection to leverage WAL concurrency.
  752|       |#[allow(clippy::too_many_arguments)]
  753|      0|fn execute_sub_query(
  754|      0|    sub_query_id: usize,
  755|      0|    query_text: &str,
  756|      0|    embedding: &[f32],
  757|      0|    namespace: &str,
  758|      0|    db_path: &std::path::Path,
  759|      0|    k: usize,
  760|      0|    max_hops: usize,
  761|      0|    min_weight: f64,
  762|      0|    rrf_k: f64,
  763|      0|    graph_decay: f64,
  764|      0|    graph_min_score: f64,
  765|      0|    max_neighbors_per_hop: Option<usize>,
  766|      0|) -> Result<SubQueryResult, String> {
  767|      0|    let conn = open_ro(db_path).map_err(|e| format!("failed to open db: {e}"))?;
  768|       |
  769|      0|    let mut hits: Vec<(i64, f64, String, String, String, Option<usize>)> =
  770|      0|        Vec::with_capacity(k * 2);
  771|      0|    let mut seen_ids: crate::hash::AHashSet<i64> =
  772|      0|        crate::hash::AHashSet::with_capacity_and_hasher(k * 2, Default::default());
  773|       |
  774|       |    // --- GAP-08/11 FIX: Use RRF fusion for KNN + FTS instead of hardcoded 0.5 ---
  775|       |
  776|       |    // 1. KNN vector search — collect ranked IDs.
  777|      0|    let knn_results = memories::knn_search(&conn, embedding, &[namespace.to_string()], None, k)
  778|      0|        .map_err(|e| format!("knn_search failed: {e}"))?;
  779|      0|    let knn_ids: Vec<i64> = knn_results.iter().map(|(id, _)| *id).collect();
  780|      0|    tracing::debug!(target: "deep_research", sub_query_id, knn_count = knn_ids.len(), "KNN complete");
  781|       |
  782|       |    // Build distance map for score computation.
  783|      0|    let knn_distance_map: crate::hash::AHashMap<i64, f64> = knn_results
  784|      0|        .iter()
  785|      0|        .map(|(id, dist)| (*id, *dist as f64))
  786|      0|        .collect();
  787|       |
  788|       |    // 2. FTS5 search — collect ranked IDs.
  789|      0|    let fts_results = match memories::fts_search(&conn, query_text, namespace, None, k) {
  790|      0|        Ok(rows) => rows,
  791|      0|        Err(e) => {
  792|      0|            tracing::warn!(target: "deep_research",
  793|       |                sub_query_id,
  794|      0|                "FTS5 search failed, continuing with KNN only: {e}"
  795|       |            );
  796|      0|            vec![]
  797|       |        }
  798|       |    };
  799|      0|    let fts_ids: Vec<i64> = fts_results.iter().map(|r| r.id).collect();
  800|      0|    tracing::debug!(target: "deep_research", sub_query_id, fts_count = fts_ids.len(), "FTS complete");
  801|       |
  802|       |    // 3. Fuse via RRF.
  803|      0|    let rrf_scores = rrf_fuse(&[(1.0, &knn_ids), (1.0, &fts_ids)], rrf_k);
  804|      0|    let max_possible = rrf_max_possible(&[1.0, 1.0], rrf_k);
  805|       |
  806|       |    // 4. Sort fused results and build hits.
  807|      0|    let mut fused: Vec<(i64, f64)> = rrf_scores.into_iter().collect();
  808|      0|    fused.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
  809|      0|    fused.truncate(k * 2);
  810|      0|    tracing::debug!(target: "deep_research",
  811|       |        sub_query_id,
  812|      0|        fused_count = fused.len(),
  813|      0|        "RRF fusion complete"
  814|       |    );
  815|       |
  816|      0|    if fused.is_empty() && !knn_ids.is_empty() {
  817|      0|        tracing::warn!(target: "deep_research", sub_query_id, knn_count = knn_ids.len(), fts_count = fts_ids.len(),
  818|      0|            "RRF fusion returned 0 results despite KNN/FTS hits; consider lowering --graph-min-score");
  819|      0|    }
  820|       |
  821|      0|    for (memory_id, combined_score) in &fused {
  822|      0|        if seen_ids.insert(*memory_id) {
  823|      0|            let normalized = if max_possible > 0.0 {
  824|      0|                combined_score / max_possible
  825|       |            } else {
  826|      0|                0.0
  827|       |            };
  828|      0|            let score = normalized.clamp(0.0, 1.0);
  829|      0|            let in_knn = knn_distance_map.contains_key(memory_id);
  830|      0|            let in_fts = fts_ids.contains(memory_id);
  831|      0|            let source = match (in_knn, in_fts) {
  832|      0|                (true, true) => "hybrid",
  833|      0|                (true, false) => "knn",
  834|      0|                (false, true) => "fts",
  835|      0|                (false, false) => "graph",
  836|       |            };
  837|      0|            if let Ok(Some(row)) = memories::read_full(&conn, *memory_id) {
  838|      0|                let snippet: String = row.body.chars().take(300).collect();
  839|      0|                hits.push((
  840|      0|                    *memory_id,
  841|      0|                    score,
  842|      0|                    source.to_string(),
  843|      0|                    snippet,
  844|      0|                    row.body,
  845|      0|                    None,
  846|      0|                ));
  847|      0|            }
  848|      0|        }
  849|       |    }
  850|       |
  851|       |    // 5. Graph traversal from discovered memories.
  852|       |    // GAP-09/10 FIX: entity KNN also uses this sub-query's embedding.
  853|      0|    let memory_ids: Vec<i64> = hits.iter().map(|(id, ..)| *id).collect();
  854|      0|    let mut chains: Vec<EvidenceChain> = Vec::with_capacity(memory_ids.len());
  855|       |
  856|      0|    if !memory_ids.is_empty() && max_hops > 0 {
  857|       |        // Seed entities from KNN on entity vectors using THIS sub-query's embedding.
  858|      0|        let entity_knn = entities::knn_search(&conn, embedding, namespace, 5)
  859|      0|            .inspect_err(|e| tracing::warn!(target: "deep_research", error = %e, "entity KNN search failed, skipping graph seed"))
  860|      0|            .unwrap_or_default();
  861|      0|        let entity_ids: Vec<i64> = entity_knn.iter().map(|(id, _)| *id).collect();
  862|       |
  863|       |        // HIGH-01 FIX: limit seeds to top-5 memories by score to prevent
  864|       |        // BFS from starting at every node when k >= total memories.
  865|      0|        let top_seed_count = 5.min(memory_ids.len());
  866|      0|        let top_memory_ids = &memory_ids[..top_seed_count];
  867|      0|        let mut seed_entity_ids: Vec<i64> = entity_ids.clone();
  868|      0|        for &mem_id in top_memory_ids {
  869|      0|            let mut stmt = conn
  870|      0|                .prepare_cached("SELECT entity_id FROM memory_entities WHERE memory_id = ?1")
  871|      0|                .map_err(|e| format!("prepare failed: {e}"))?;
  872|      0|            let ids: Vec<i64> = stmt
  873|      0|                .query_map(rusqlite::params![mem_id], |r| r.get(0))
  874|      0|                .map_err(|e| format!("query failed: {e}"))?
  875|      0|                .filter_map(|r| r.ok())
  876|      0|                .collect();
  877|      0|            seed_entity_ids.extend(ids);
  878|       |        }
  879|      0|        seed_entity_ids.sort_unstable();
  880|      0|        seed_entity_ids.dedup();
  881|      0|        tracing::debug!(target: "deep_research",
  882|       |            sub_query_id,
  883|      0|            seed_count = seed_entity_ids.len(),
  884|      0|            "seed entities collected"
  885|       |        );
  886|       |
  887|      0|        let all_seed_ids: Vec<i64> = memory_ids
  888|      0|            .iter()
  889|      0|            .chain(entity_ids.iter())
  890|      0|            .copied()
  891|      0|            .collect();
  892|       |
  893|       |        // Graph traversal with hop scores.
  894|      0|        if let Ok(graph_results) = traverse_from_memories_with_hops_capped(
  895|      0|            &conn,
  896|      0|            &all_seed_ids,
  897|      0|            namespace,
  898|      0|            min_weight,
  899|      0|            max_hops as u32,
  900|      0|            max_neighbors_per_hop,
  901|      0|        ) {
  902|       |            // Build seed score map from RRF-fused scores for graph decay computation.
  903|      0|            let seed_score_map: crate::hash::AHashMap<i64, f64> = fused
  904|      0|                .iter()
  905|      0|                .map(|(id, s)| {
  906|      0|                    let normalized = if max_possible > 0.0 {
  907|      0|                        s / max_possible
  908|       |                    } else {
  909|      0|                        0.0
  910|       |                    };
  911|      0|                    (*id, normalized.clamp(0.0, 1.0))
  912|      0|                })
  913|      0|                .collect();
  914|       |
  915|      0|            for (graph_mem_id, hop) in graph_results {
  916|      0|                if seen_ids.insert(graph_mem_id) {
  917|       |                    // GAP-08/11 FIX: graph score = seed_score * decay^hop * edge_weight.
  918|       |                    // For the seed score, use the best score among the seed memories that
  919|       |                    // transitively reached this graph memory (approximate with the average
  920|       |                    // seed score since we don't track the exact path yet).
  921|      0|                    let avg_seed_score: f64 = if seed_score_map.is_empty() {
  922|      0|                        0.5
  923|       |                    } else {
  924|      0|                        let sum: f64 = seed_score_map.values().sum();
  925|      0|                        sum / seed_score_map.len() as f64
  926|       |                    };
  927|      0|                    let graph_score =
  928|      0|                        (avg_seed_score * graph_decay.powi(hop as i32)).clamp(0.0, 1.0);
  929|       |
  930|      0|                    if graph_score < graph_min_score {
  931|      0|                        continue;
  932|      0|                    }
  933|       |
  934|      0|                    if let Ok(Some(row)) = memories::read_full(&conn, graph_mem_id) {
  935|      0|                        let snippet: String = row.body.chars().take(300).collect();
  936|      0|                        hits.push((
  937|      0|                            graph_mem_id,
  938|      0|                            graph_score,
  939|      0|                            "graph".to_string(),
  940|      0|                            snippet,
  941|      0|                            row.body,
  942|      0|                            Some(hop as usize),
  943|      0|                        ));
  944|      0|                    }
  945|      0|                }
  946|       |            }
  947|      0|        }
  948|       |
  949|       |        // GAP-09/10 FIX: Build directed evidence chains using BFS with predecessor map,
  950|       |        // filtered to entities discovered in this sub-query.
  951|      0|        if !seed_entity_ids.is_empty() {
  952|      0|            let (entity_depth, predecessor) = bfs_with_predecessors(
  953|      0|                &conn,
  954|      0|                &seed_entity_ids,
  955|      0|                namespace,
  956|      0|                min_weight,
  957|      0|                max_hops as u32,
  958|      0|                max_neighbors_per_hop,
  959|      0|            )
  960|      0|            .unwrap_or_default();
  961|       |
  962|      0|            tracing::debug!(target: "deep_research",
  963|       |                sub_query_id,
  964|      0|                bfs_nodes = entity_depth.len(),
  965|      0|                predecessors = predecessor.len(),
  966|      0|                "BFS complete"
  967|       |            );
  968|       |
  969|      0|            let seed_entity_set: HashSet<i64> = seed_entity_ids.iter().copied().collect();
  970|       |
  971|       |            // Collect entity IDs we need names for.
  972|      0|            let all_entity_ids: Vec<i64> = entity_depth.keys().copied().collect();
  973|      0|            let mut entity_names: crate::hash::AHashMap<i64, String> =
  974|      0|                crate::hash::AHashMap::with_capacity_and_hasher(
  975|      0|                    all_entity_ids.len(),
  976|      0|                    ahash::RandomState::default(),
  977|       |                );
  978|      0|            for &eid in &all_entity_ids {
  979|      0|                let name_res: rusqlite::Result<String> = conn.query_row(
  980|      0|                    "SELECT name FROM entities WHERE id = ?1",
  981|      0|                    rusqlite::params![eid],
  982|      0|                    |r| r.get(0),
  983|       |                );
  984|      0|                if let Ok(name) = name_res {
  985|      0|                    entity_names.insert(eid, name);
  986|      0|                }
  987|       |            }
  988|       |
  989|       |            // Reconstruct a path for each non-seed entity that has a predecessor.
  990|      0|            for (&target_id, &_hop) in &entity_depth {
  991|      0|                if seed_entity_set.contains(&target_id) {
  992|      0|                    continue;
  993|      0|                }
  994|      0|                if !predecessor.contains_key(&target_id) {
  995|      0|                    continue;
  996|      0|                }
  997|      0|                if let Some((path_nodes, total_weight)) =
  998|      0|                    reconstruct_path(target_id, &seed_entity_set, &predecessor, &entity_names)
  999|       |                {
 1000|      0|                    if path_nodes.len() < 2 {
 1001|      0|                        continue;
 1002|      0|                    }
 1003|      0|                    let from = path_nodes
 1004|      0|                        .first()
 1005|      0|                        .map(|n| n.entity.clone())
 1006|      0|                        .unwrap_or_default();
 1007|      0|                    let to = path_nodes
 1008|      0|                        .last()
 1009|      0|                        .map(|n| n.entity.clone())
 1010|      0|                        .unwrap_or_default();
 1011|      0|                    let depth = path_nodes.len();
 1012|      0|                    chains.push(EvidenceChain {
 1013|      0|                        from,
 1014|      0|                        to,
 1015|      0|                        path: path_nodes,
 1016|      0|                        total_weight,
 1017|      0|                        depth,
 1018|      0|                        sub_query_ids: vec![sub_query_id],
 1019|      0|                    });
 1020|      0|                }
 1021|       |            }
 1022|       |
 1023|       |            // Sort chains by total_weight descending and cap to avoid huge output.
 1024|      0|            chains.sort_by(|a, b| {
 1025|      0|                b.total_weight
 1026|      0|                    .partial_cmp(&a.total_weight)
 1027|      0|                    .unwrap_or(std::cmp::Ordering::Equal)
 1028|      0|            });
 1029|      0|            chains.truncate(20);
 1030|      0|            tracing::debug!(target: "deep_research",
 1031|       |                sub_query_id,
 1032|      0|                chains_count = chains.len(),
 1033|      0|                "evidence chains built"
 1034|       |            );
 1035|      0|        }
 1036|      0|    }
 1037|       |
 1038|      0|    Ok(SubQueryResult {
 1039|      0|        sub_query_id,
 1040|      0|        hits,
 1041|      0|        chains,
 1042|      0|    })
 1043|      0|}
 1044|       |
 1045|       |// ────────────────────────────────────────────────────────────────────────────
 1046|       |// Re-export sub_query_results field initialisation for the stats counter.
 1047|       |// The field is moved out of run_async after the join loop; we need to shadow it.
 1048|       |// ────────────────────────────────────────────────────────────────────────────
 1049|       |
 1050|       |#[cfg(test)]
 1051|       |mod tests {
 1052|       |    use super::*;
 1053|       |
 1054|       |    #[test]
 1055|      1|    fn test_decompose_and_conjunction() {
 1056|      1|        let result = decompose_query("A and B", 7);
 1057|      1|        assert_eq!(result, vec!["A", "B"]);
 1058|      1|    }
 1059|       |
 1060|       |    #[test]
 1061|      1|    fn test_decompose_no_split() {
 1062|      1|        let result = decompose_query("simple query", 7);
 1063|      1|        assert_eq!(result, vec!["simple query"]);
 1064|      1|    }
 1065|       |
 1066|       |    #[test]
 1067|      1|    fn test_decompose_three_parts() {
 1068|      1|        let result = decompose_query("A, B and C", 7);
 1069|      1|        assert_eq!(result, vec!["A", "B", "C"]);
 1070|      1|    }
 1071|       |
 1072|       |    #[test]
 1073|      1|    fn test_decompose_portuguese_conjunctions() {
 1074|      1|        let result = decompose_query("A e B", 7);
 1075|      1|        assert_eq!(result, vec!["A", "B"]);
 1076|      1|    }
 1077|       |
 1078|       |    #[test]
 1079|      1|    fn test_decompose_max_cap() {
 1080|     10|        let parts: Vec<String> = (0..10).map(|i| format!("part{i}")).collect();
                          ^1     ^1            ^1      ^1                          ^1
 1081|      1|        let query = parts.join(", ");
 1082|      1|        let result = decompose_query(&query, 7);
 1083|      1|        assert!(
 1084|      1|            result.len() <= 7,
 1085|      0|            "expected at most 7 sub-queries, got {}",
 1086|      0|            result.len()
 1087|       |        );
 1088|      1|    }
 1089|       |
 1090|       |    #[test]
 1091|      1|    fn test_decompose_empty_preserves_original() {
 1092|      1|        let result = decompose_query("", 7);
 1093|      1|        assert_eq!(result, vec![""]);
 1094|      1|    }
 1095|       |
 1096|       |    #[test]
 1097|      1|    fn test_decompose_semicolons() {
 1098|      1|        let result = decompose_query("auth design; deployment config; logging", 7);
 1099|      1|        assert_eq!(result, vec!["auth design", "deployment config", "logging"]);
 1100|      1|    }
 1101|       |
 1102|       |    #[test]
 1103|      1|    fn test_decompose_relational_phrase() {
 1104|      1|        let result = decompose_query("auth that caused deployment failure", 7);
 1105|      1|        assert_eq!(result, vec!["auth", "deployment failure"]);
 1106|      1|    }
 1107|       |
 1108|       |    #[test]
 1109|      1|    fn test_sub_query_serialization() {
 1110|      1|        let sq = SubQuery {
 1111|      1|            id: 0,
 1112|      1|            text: "test query".to_string(),
 1113|      1|            source: "original",
 1114|      1|        };
 1115|      1|        let json = serde_json::to_value(&sq).expect("serialization failed");
 1116|      1|        assert_eq!(json["id"], 0);
 1117|      1|        assert_eq!(json["text"], "test query");
 1118|      1|        assert_eq!(json["source"], "original");
 1119|      1|    }
 1120|       |
 1121|       |    #[test]
 1122|      1|    fn test_deep_result_omits_body_when_none() {
 1123|      1|        let result = DeepResult {
 1124|      1|            name: "test".to_string(),
 1125|      1|            score: 0.9,
 1126|      1|            source: "knn".to_string(),
 1127|      1|            sub_query_ids: vec![0],
 1128|      1|            snippet: "snippet".to_string(),
 1129|      1|            body: None,
 1130|      1|            hop_distance: None,
 1131|      1|        };
 1132|      1|        let json = serde_json::to_string(&result).expect("serialization failed");
 1133|      1|        assert!(!json.contains("\"body\""), "body must be omitted when None");
                                                          ^0
 1134|      1|    }
 1135|       |
 1136|       |    #[test]
 1137|      1|    fn test_deep_result_includes_body_when_some() {
 1138|      1|        let result = DeepResult {
 1139|      1|            name: "test".to_string(),
 1140|      1|            score: 0.9,
 1141|      1|            source: "knn".to_string(),
 1142|      1|            sub_query_ids: vec![0, 1],
 1143|      1|            snippet: "snippet".to_string(),
 1144|      1|            body: Some("full body content".to_string()),
 1145|      1|            hop_distance: Some(2),
 1146|      1|        };
 1147|      1|        let json = serde_json::to_string(&result).expect("serialization failed");
 1148|      1|        assert!(json.contains("\"body\""), "body must be present when Some");
                                                         ^0
 1149|      1|        assert!(json.contains("full body content"));
 1150|      1|    }
 1151|       |
 1152|       |    #[test]
 1153|      1|    fn test_evidence_node_omits_none_fields() {
 1154|      1|        let node = EvidenceNode {
 1155|      1|            entity: "auth-module".to_string(),
 1156|      1|            relation: None,
 1157|      1|            weight: None,
 1158|      1|        };
 1159|      1|        let json = serde_json::to_string(&node).expect("serialization failed");
 1160|      1|        assert!(
 1161|      1|            !json.contains("\"relation\""),
 1162|      0|            "relation must be omitted when None"
 1163|       |        );
 1164|      1|        assert!(
 1165|      1|            !json.contains("\"weight\""),
 1166|      0|            "weight must be omitted when None"
 1167|       |        );
 1168|      1|    }
 1169|       |
 1170|       |    #[test]
 1171|      1|    fn test_research_stats_serialization() {
 1172|      1|        let stats = ResearchStats {
 1173|      1|            sub_queries_total: 3,
 1174|      1|            sub_queries_completed: 2,
 1175|      1|            sub_queries_failed: 1,
 1176|      1|            sub_queries_timed_out: 0,
 1177|      1|            unique_memories_found: 10,
 1178|      1|            evidence_chains_found: 2,
 1179|      1|            elapsed_ms: 1234,
 1180|      1|        };
 1181|      1|        let json = serde_json::to_value(&stats).expect("serialization failed");
 1182|      1|        assert_eq!(json["sub_queries_total"], 3);
 1183|      1|        assert_eq!(json["sub_queries_completed"], 2);
 1184|      1|        assert_eq!(json["sub_queries_failed"], 1);
 1185|      1|        assert_eq!(json["elapsed_ms"], 1234);
 1186|      1|    }
 1187|       |
 1188|       |    #[test]
 1189|      1|    fn test_deep_research_response_serialization() {
 1190|      1|        let resp = DeepResearchResponse {
 1191|      1|            query: "test query".to_string(),
 1192|      1|            sub_queries: vec![SubQuery {
 1193|      1|                id: 0,
 1194|      1|                text: "test query".to_string(),
 1195|      1|                source: "original",
 1196|      1|            }],
 1197|      1|            results: vec![],
 1198|      1|            evidence_chains: vec![],
 1199|      1|            graph_context: None,
 1200|      1|            stats: ResearchStats {
 1201|      1|                sub_queries_total: 1,
 1202|      1|                sub_queries_completed: 1,
 1203|      1|                sub_queries_failed: 0,
 1204|      1|                sub_queries_timed_out: 0,
 1205|      1|                unique_memories_found: 0,
 1206|      1|                evidence_chains_found: 0,
 1207|      1|                elapsed_ms: 42,
 1208|      1|            },
 1209|      1|        };
 1210|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
 1211|      1|        assert_eq!(json["query"], "test query");
 1212|      1|        assert!(json["sub_queries"].is_array());
 1213|      1|        assert!(json["results"].is_array());
 1214|      1|        assert!(json["evidence_chains"].is_array());
 1215|      1|        assert_eq!(json["stats"]["elapsed_ms"], 42);
 1216|      1|    }
 1217|       |
 1218|       |    // ---- GAP-07 regression: different sub-queries produce distinct embeddings ----
 1219|       |    // We test decompose_query returns texts that *would* produce distinct embeddings
 1220|       |    // (different text inputs → different embedding inputs → different search results).
 1221|       |    #[test]
 1222|      1|    fn test_distinct_sub_queries_produce_distinct_texts() {
 1223|      1|        let queries = [
 1224|      1|            "authentication design decisions",
 1225|      1|            "deployment configuration and infrastructure",
 1226|      1|        ];
 1227|       |        // These two texts must be different strings (prerequisite for distinct embeddings).
 1228|      1|        assert_ne!(queries[0], queries[1]);
 1229|       |
 1230|       |        // decompose_query with semicolons must preserve distinct texts.
 1231|      1|        let decomposed = decompose_query(
 1232|      1|            "authentication design decisions; deployment configuration and infrastructure",
 1233|       |            7,
 1234|       |        );
 1235|      1|        assert_eq!(decomposed.len(), 2);
 1236|      1|        assert_ne!(decomposed[0], decomposed[1]);
 1237|      1|    }
 1238|       |
 1239|       |    // ---- GAP-08/11 regression: rrf_fuse integration via fusion module ----
 1240|       |    #[test]
 1241|      1|    fn test_rrf_fuse_via_fusion_module() {
 1242|       |        use crate::storage::fusion::rrf_fuse;
 1243|       |
 1244|      1|        let knn_ids: Vec<i64> = vec![1, 2, 3];
 1245|      1|        let fts_ids: Vec<i64> = vec![2, 1, 4];
 1246|      1|        let scores = rrf_fuse(&[(1.0, &knn_ids), (1.0, &fts_ids)], 60.0);
 1247|       |
 1248|       |        // Items appearing in both lists must score higher than items in only one list.
 1249|      1|        let score_1 = scores[&1];
 1250|      1|        let score_2 = scores[&2];
 1251|      1|        let score_3 = scores[&3]; // knn only, rank 3
 1252|      1|        let score_4 = scores[&4]; // fts only, rank 3
 1253|       |
 1254|      1|        assert!(
 1255|      1|            score_1 > score_3,
 1256|      0|            "id 1 (both lists) must beat id 3 (knn-only rank 3)"
 1257|       |        );
 1258|      1|        assert!(
 1259|      1|            score_2 > score_4,
 1260|      0|            "id 2 (both lists) must beat id 4 (fts-only rank 3)"
 1261|       |        );
 1262|      1|    }
 1263|       |
 1264|       |    // ---- GAP-09/10 regression: evidence chains must be directed paths ----
 1265|       |    #[test]
 1266|      1|    fn test_evidence_chain_has_from_to_and_path() {
 1267|      1|        let chain = EvidenceChain {
 1268|      1|            from: "auth-module".to_string(),
 1269|      1|            to: "jwt-service".to_string(),
 1270|      1|            path: vec![
 1271|      1|                EvidenceNode {
 1272|      1|                    entity: "auth-module".to_string(),
 1273|      1|                    relation: None,
 1274|      1|                    weight: None,
 1275|      1|                },
 1276|      1|                EvidenceNode {
 1277|      1|                    entity: "token-validator".to_string(),
 1278|      1|                    relation: Some("depends-on".to_string()),
 1279|      1|                    weight: Some(0.9),
 1280|      1|                },
 1281|      1|                EvidenceNode {
 1282|      1|                    entity: "jwt-service".to_string(),
 1283|      1|                    relation: Some("uses".to_string()),
 1284|      1|                    weight: Some(0.8),
 1285|      1|                },
 1286|      1|            ],
 1287|      1|            total_weight: 0.72,
 1288|      1|            depth: 3,
 1289|      1|            sub_query_ids: vec![0],
 1290|      1|        };
 1291|       |
 1292|      1|        let json = serde_json::to_value(&chain).expect("serialization failed");
 1293|      1|        assert!(
 1294|      1|            json["from"].is_string(),
 1295|      0|            "evidence chain must have 'from' field"
 1296|       |        );
 1297|      1|        assert!(
 1298|      1|            json["to"].is_string(),
 1299|      0|            "evidence chain must have 'to' field"
 1300|       |        );
 1301|      1|        assert!(
 1302|      1|            json["path"].is_array(),
 1303|      0|            "evidence chain must have 'path' array"
 1304|       |        );
 1305|      1|        assert_eq!(json["path"].as_array().unwrap().len(), 3);
 1306|      1|        assert!(json["total_weight"].is_number(), "must have total_weight");
                                                                ^0
 1307|      1|        assert_eq!(json["depth"], 3);
 1308|      1|    }
 1309|       |
 1310|       |    // ---- GAP-10 regression: reconstruct_path returns correct node order ----
 1311|       |    #[test]
 1312|      1|    fn test_reconstruct_path_root_to_target_order() {
 1313|       |        // Build a simple chain: entity 10 (seed) -> entity 20 -> entity 30 (target)
 1314|      1|        let seed_set: HashSet<i64> = [10i64].into_iter().collect();
 1315|      1|        let mut predecessor: PredecessorMap = std::collections::HashMap::new();
 1316|      1|        predecessor.insert(20, (10, "depends-on".to_string(), 0.9));
 1317|      1|        predecessor.insert(30, (20, "uses".to_string(), 0.8));
 1318|      1|        let mut entity_names: crate::hash::AHashMap<i64, String> = crate::hash::AHashMap::default();
 1319|      1|        entity_names.insert(10, "seed-entity".to_string());
 1320|      1|        entity_names.insert(20, "middle-entity".to_string());
 1321|      1|        entity_names.insert(30, "target-entity".to_string());
 1322|       |
 1323|      1|        let result = reconstruct_path(30, &seed_set, &predecessor, &entity_names);
 1324|      1|        assert!(result.is_some(), "path must be reconstructed");
                                                ^0
 1325|      1|        let (nodes, weight) = result.unwrap();
 1326|       |        // Path must be [seed, middle, target]
 1327|      1|        assert_eq!(nodes.len(), 3);
 1328|      1|        assert_eq!(nodes[0].entity, "seed-entity");
 1329|      1|        assert_eq!(nodes[1].entity, "middle-entity");
 1330|      1|        assert_eq!(nodes[2].entity, "target-entity");
 1331|       |        // total_weight = 0.9 * 0.8
 1332|      1|        assert!((weight - 0.72).abs() < 1e-6);
 1333|      1|    }
 1334|       |
 1335|       |    // ---- GAP-09 regression: evidence chains must NOT be present for 1-hop trivial pairs ----
 1336|       |    #[test]
 1337|      1|    fn test_evidence_chains_single_hop_filtered_out() {
 1338|       |        // A chain of depth 1 (only root node) should be discarded.
 1339|      1|        let chain = EvidenceChain {
 1340|      1|            from: "a".to_string(),
 1341|      1|            to: "a".to_string(),
 1342|      1|            path: vec![EvidenceNode {
 1343|      1|                entity: "a".to_string(),
 1344|      1|                relation: None,
 1345|      1|                weight: None,
 1346|      1|            }],
 1347|      1|            total_weight: 1.0,
 1348|      1|            depth: 1,
 1349|      1|            sub_query_ids: vec![0],
 1350|      1|        };
 1351|       |        // Simulate the filter: retain chains with depth >= 2.
 1352|      1|        let chains = vec![chain];
 1353|      1|        let retained: Vec<_> = chains.into_iter().filter(|c| c.depth >= 2).collect();
 1354|      1|        assert!(retained.is_empty(), "depth-1 chains must be filtered out");
                                                   ^0
 1355|      1|    }
 1356|       |
 1357|       |    // ---- GAP-17 regression: bfs_with_predecessors honours max_neighbors_per_hop ----
 1358|       |    #[test]
 1359|      1|    fn test_bfs_with_predecessors_respects_neighbor_cap() {
 1360|       |        use crate::graph::bfs_with_predecessors;
 1361|       |        use rusqlite::Connection;
 1362|       |
 1363|      1|        let conn = Connection::open_in_memory().unwrap();
 1364|      1|        conn.execute_batch(
 1365|      1|            "CREATE TABLE relationships (
 1366|      1|                source_id INTEGER NOT NULL,
 1367|      1|                target_id INTEGER NOT NULL,
 1368|      1|                weight REAL NOT NULL,
 1369|      1|                namespace TEXT NOT NULL,
 1370|      1|                relation TEXT NOT NULL DEFAULT 'related'
 1371|      1|             );",
 1372|       |        )
 1373|      1|        .unwrap();
 1374|       |
 1375|       |        // Seed entity 1 has 5 neighbours.
 1376|      6|        for target in 2i64..=6 {
                          ^5
 1377|      5|            conn.execute(
 1378|      5|                "INSERT INTO relationships (source_id, target_id, weight, namespace) VALUES (?1, ?2, ?3, 'ns')",
 1379|      5|                rusqlite::params![1i64, target, 1.0f64],
 1380|      5|            )
 1381|      5|            .unwrap();
 1382|      5|        }
 1383|       |
 1384|       |        // Without cap: all 5 neighbours reached.
 1385|      1|        let (depth_uncapped, _) = bfs_with_predecessors(&conn, &[1], "ns", 0.0, 1, None).unwrap();
 1386|      1|        assert_eq!(
 1387|      1|            depth_uncapped.len() - 1,
 1388|       |            5,
 1389|      0|            "uncapped must discover all 5 neighbours (plus seed)"
 1390|       |        );
 1391|       |
 1392|       |        // With cap=2: only top-2 neighbours (by weight; all equal here so first 2 returned).
 1393|      1|        let (depth_capped, _) = bfs_with_predecessors(&conn, &[1], "ns", 0.0, 1, Some(2)).unwrap();
 1394|       |        // seed + 2 neighbours = 3 entries.
 1395|      1|        assert_eq!(
 1396|      1|            depth_capped.len(),
 1397|       |            3,
 1398|      0|            "capped to 2 must yield seed + 2 neighbours"
 1399|       |        );
 1400|      1|    }
 1401|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/delete_entity.rs:
    1|       |//! Handler for the `delete-entity` CLI subcommand (GAP-17).
    2|       |//!
    3|       |//! Deletes an entity and, with `--cascade`, all of its relationships and
    4|       |//! memory bindings. Without `--cascade` the command refuses to proceed, which
    5|       |//! prevents accidental data loss.
    6|       |
    7|       |use crate::errors::AppError;
    8|       |use crate::i18n::errors_msg;
    9|       |use crate::output::{self, OutputFormat};
   10|       |use crate::paths::AppPaths;
   11|       |use crate::storage::connection::open_rw;
   12|       |use crate::storage::entities;
   13|       |use rusqlite::params;
   14|       |use serde::Serialize;
   15|       |
   16|       |#[derive(clap::Args)]
   17|       |#[command(after_long_help = "EXAMPLES:\n  \
   18|       |    # Delete an entity and all its relationships (cascade required)\n  \
   19|       |    sqlite-graphrag delete-entity --name auth-module --cascade\n\n  \
   20|       |    # Delete an entity in a specific namespace\n  \
   21|       |    sqlite-graphrag delete-entity --name legacy-service --cascade --namespace my-project\n\n  \
   22|       |    # Without --cascade the command exits with an error:\n  \
   23|       |    sqlite-graphrag delete-entity --name auth-module\n  \
   24|       |    # => Error: use --cascade to confirm deletion of entity and all its relationships\n\n\
   25|       |NOTE:\n  \
   26|       |    --cascade is required and acts as an explicit confirmation gate.\n  \
   27|       |    All relationships where this entity is source or target are removed.\n  \
   28|       |    All memory-entity bindings (memory_entities rows) are also removed.\n  \
   29|       |    Run `sqlite-graphrag cleanup-orphans` afterwards to remove any newly orphaned entities.")]
   30|       |pub struct DeleteEntityArgs {
   31|       |    /// Entity name to delete (graph node, not memory name).
   32|       |    #[arg(long)]
   33|       |    pub name: String,
   34|       |    /// Required confirmation flag. Without it the command exits with an error.
   35|       |    ///
   36|       |    /// Deletes all relationships and memory bindings attached to this entity.
   37|       |    #[arg(long, default_value_t = false)]
   38|       |    pub cascade: bool,
   39|       |    #[arg(long)]
   40|       |    pub namespace: Option<String>,
   41|       |    #[arg(long, value_enum, default_value = "json")]
   42|       |    pub format: OutputFormat,
   43|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   44|       |    pub json: bool,
   45|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   46|       |    pub db: Option<String>,
   47|       |}
   48|       |
   49|       |#[derive(Serialize)]
   50|       |struct DeleteEntityResponse {
   51|       |    action: String,
   52|       |    entity_name: String,
   53|       |    namespace: String,
   54|       |    relationships_removed: usize,
   55|       |    bindings_removed: usize,
   56|       |    /// Total execution time in milliseconds from handler start to serialisation.
   57|       |    elapsed_ms: u64,
   58|       |}
   59|       |
   60|      0|pub fn run(args: DeleteEntityArgs) -> Result<(), AppError> {
   61|      0|    let inicio = std::time::Instant::now();
   62|       |
   63|      0|    if !args.cascade {
   64|      0|        return Err(AppError::Validation(
   65|      0|            "use --cascade to confirm deletion of entity and all its relationships".to_string(),
   66|      0|        ));
   67|      0|    }
   68|       |
   69|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
   70|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
   71|       |
   72|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
   73|       |
   74|      0|    let mut conn = open_rw(&paths.db)?;
   75|       |
   76|      0|    let entity_id = entities::find_entity_id(&conn, &namespace, &args.name)?
   77|      0|        .ok_or_else(|| AppError::NotFound(errors_msg::entity_not_found(&args.name, &namespace)))?;
   78|       |
   79|      0|    let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
   80|       |
   81|       |    // Step 0: collect adjacent entity IDs BEFORE deleting relationships.
   82|      0|    let adjacent_ids: Vec<i64> = {
   83|      0|        let mut stmt = tx.prepare(
   84|      0|            "SELECT DISTINCT CASE WHEN source_id = ?1 THEN target_id ELSE source_id END
   85|      0|             FROM relationships WHERE source_id = ?1 OR target_id = ?1",
   86|      0|        )?;
   87|      0|        let ids: Vec<i64> = stmt
   88|      0|            .query_map(params![entity_id], |r| r.get(0))?
   89|      0|            .collect::<Result<Vec<_>, _>>()?;
   90|      0|        ids
   91|       |    };
   92|       |
   93|       |    // Step 1: collect relationship IDs for this entity (source or target).
   94|      0|    let rel_ids: Vec<i64> = {
   95|      0|        let mut stmt =
   96|      0|            tx.prepare("SELECT id FROM relationships WHERE source_id = ?1 OR target_id = ?1")?;
   97|      0|        let ids: Vec<i64> = stmt
   98|      0|            .query_map(params![entity_id], |r| r.get::<_, i64>(0))?
   99|      0|            .collect::<Result<Vec<_>, _>>()?;
  100|      0|        ids
  101|       |    };
  102|       |
  103|       |    // Step 2: delete memory_relationships for each collected relationship id.
  104|      0|    for &rel_id in &rel_ids {
  105|      0|        tx.execute(
  106|      0|            "DELETE FROM memory_relationships WHERE relationship_id = ?1",
  107|      0|            params![rel_id],
  108|      0|        )?;
  109|       |    }
  110|       |
  111|       |    // Step 3: delete the relationships themselves.
  112|      0|    let relationships_removed = tx.execute(
  113|      0|        "DELETE FROM relationships WHERE source_id = ?1 OR target_id = ?1",
  114|      0|        params![entity_id],
  115|      0|    )?;
  116|       |
  117|       |    // Step 4: delete memory_entities bindings.
  118|      0|    let bindings_removed = tx.execute(
  119|      0|        "DELETE FROM memory_entities WHERE entity_id = ?1",
  120|      0|        params![entity_id],
  121|      0|    )?;
  122|       |
  123|       |    // Step 5: delete vec_entities row (ignore error — row may not exist).
  124|      0|    let _ = tx.execute(
  125|      0|        "DELETE FROM vec_entities WHERE entity_id = ?1",
  126|      0|        params![entity_id],
  127|      0|    );
  128|       |
  129|       |    // Step 6: delete the entity itself.
  130|      0|    tx.execute("DELETE FROM entities WHERE id = ?1", params![entity_id])?;
  131|       |
  132|       |    // Step 7: recalculate degree for adjacent entities that lost relationships.
  133|      0|    for &adj_id in &adjacent_ids {
  134|      0|        if adj_id != entity_id {
  135|      0|            entities::recalculate_degree(&tx, adj_id)?;
  136|      0|        }
  137|       |    }
  138|       |
  139|      0|    tx.commit()?;
  140|       |
  141|      0|    conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
  142|       |
  143|      0|    let response = DeleteEntityResponse {
  144|      0|        action: "deleted".to_string(),
  145|      0|        entity_name: args.name.clone(),
  146|      0|        namespace: namespace.clone(),
  147|      0|        relationships_removed,
  148|      0|        bindings_removed,
  149|      0|        elapsed_ms: inicio.elapsed().as_millis() as u64,
  150|      0|    };
  151|       |
  152|      0|    match args.format {
  153|      0|        OutputFormat::Json => output::emit_json(&response)?,
  154|      0|        OutputFormat::Text | OutputFormat::Markdown => {
  155|      0|            output::emit_text(&format!(
  156|      0|                "deleted: {} (relationships_removed={}, bindings_removed={}) [{}]",
  157|      0|                response.entity_name,
  158|      0|                response.relationships_removed,
  159|      0|                response.bindings_removed,
  160|      0|                response.namespace
  161|      0|            ));
  162|      0|        }
  163|       |    }
  164|       |
  165|      0|    Ok(())
  166|      0|}
  167|       |
  168|       |#[cfg(test)]
  169|       |mod tests {
  170|       |    use super::*;
  171|       |
  172|       |    #[test]
  173|      1|    fn delete_entity_response_serializes_all_fields() {
  174|      1|        let resp = DeleteEntityResponse {
  175|      1|            action: "deleted".to_string(),
  176|      1|            entity_name: "auth-module".to_string(),
  177|      1|            namespace: "global".to_string(),
  178|      1|            relationships_removed: 3,
  179|      1|            bindings_removed: 2,
  180|      1|            elapsed_ms: 7,
  181|      1|        };
  182|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  183|      1|        assert_eq!(json["action"], "deleted");
  184|      1|        assert_eq!(json["entity_name"], "auth-module");
  185|      1|        assert_eq!(json["namespace"], "global");
  186|      1|        assert_eq!(json["relationships_removed"], 3);
  187|      1|        assert_eq!(json["bindings_removed"], 2);
  188|      1|        assert!(json["elapsed_ms"].is_number());
  189|      1|    }
  190|       |
  191|       |    #[test]
  192|      1|    fn delete_entity_response_action_is_deleted() {
  193|      1|        let resp = DeleteEntityResponse {
  194|      1|            action: "deleted".to_string(),
  195|      1|            entity_name: "x".to_string(),
  196|      1|            namespace: "ns".to_string(),
  197|      1|            relationships_removed: 0,
  198|      1|            bindings_removed: 0,
  199|      1|            elapsed_ms: 0,
  200|      1|        };
  201|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  202|      1|        assert_eq!(json["action"], "deleted");
  203|      1|    }
  204|       |
  205|       |    #[test]
  206|      1|    fn delete_entity_response_zero_counts_allowed() {
  207|      1|        let resp = DeleteEntityResponse {
  208|      1|            action: "deleted".to_string(),
  209|      1|            entity_name: "orphan-entity".to_string(),
  210|      1|            namespace: "global".to_string(),
  211|      1|            relationships_removed: 0,
  212|      1|            bindings_removed: 0,
  213|      1|            elapsed_ms: 1,
  214|      1|        };
  215|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  216|      1|        assert_eq!(json["relationships_removed"], 0);
  217|      1|        assert_eq!(json["bindings_removed"], 0);
  218|      1|    }
  219|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/edit.rs:
    1|       |//! Handler for the `edit` CLI subcommand.
    2|       |
    3|       |use crate::errors::AppError;
    4|       |use crate::i18n::errors_msg;
    5|       |use crate::output;
    6|       |use crate::paths::AppPaths;
    7|       |use crate::storage::connection::open_rw;
    8|       |use crate::storage::{memories, versions};
    9|       |use serde::Serialize;
   10|       |// Command-line arguments of the `edit` subcommand. Plain `//` comments are used in this struct on purpose: clap turns `///` comments on fields into help text.
   11|       |#[derive(clap::Args)]
   12|       |#[command(after_long_help = "EXAMPLES:\n  \
   13|       |    # Edit body inline\n  \
   14|       |    sqlite-graphrag edit onboarding --body \"updated content\"\n\n  \
   15|       |    # Edit body from a file\n  \
   16|       |    sqlite-graphrag edit onboarding --body-file ./updated.md\n\n  \
   17|       |    # Edit body from stdin (pipe)\n  \
   18|       |    cat updated.md | sqlite-graphrag edit onboarding --body-stdin\n\n  \
   19|       |    # Update only the description\n  \
   20|       |    sqlite-graphrag edit onboarding --description \"new short description\"")]
   21|       |pub struct EditArgs {
   22|       |    /// Memory name as a positional argument. Alternative to `--name`.
   23|       |    #[arg(
   24|       |        value_name = "NAME",
   25|       |        conflicts_with = "name",
   26|       |        help = "Memory name to edit; alternative to --name"
   27|       |    )]
   28|       |    pub name_positional: Option<String>,
   29|       |    /// Memory name to edit. Soft-deleted memories are not editable; use `restore` first.
   30|       |    #[arg(long)]
   31|       |    pub name: Option<String>,
   32|       |    /// New inline body content. Mutually exclusive with --body-file and --body-stdin.
   33|       |    #[arg(long, conflicts_with_all = ["body_file", "body_stdin"])]
   34|       |    pub body: Option<String>,
   35|       |    /// Read new body from a file. Mutually exclusive with --body and --body-stdin.
   36|       |    #[arg(long, conflicts_with_all = ["body", "body_stdin"])]
   37|       |    pub body_file: Option<std::path::PathBuf>,
   38|       |    /// Read new body from stdin until EOF. Mutually exclusive with --body and --body-file.
   39|       |    #[arg(long, conflicts_with_all = ["body", "body_file"])]
   40|       |    pub body_stdin: bool,
   41|       |    /// New description (≤500 chars) replacing the existing one.
   42|       |    #[arg(long)]
   43|       |    pub description: Option<String>, // NOTE(review): `run` enforces the limit with `desc.len()` (bytes), not chars as the help text says
   44|       |    /// Change the memory type (e.g. note, skill, decision).
   45|       |    #[arg(long, value_enum, help = "Change memory type")]
   46|       |    pub memory_type: Option<crate::cli::MemoryType>,
   47|       |    #[arg(
   48|       |        long,
   49|       |        value_name = "EPOCH_OR_RFC3339",
   50|       |        value_parser = crate::parsers::parse_expected_updated_at,
   51|       |        long_help = "Optimistic lock: reject if updated_at does not match. \
   52|       |Accepts Unix epoch (e.g. 1700000000) or RFC 3339 (e.g. 2026-04-19T12:00:00Z)."
   53|       |    )]
   54|       |    pub expected_updated_at: Option<i64>, // `run` compares this for equality with the stored `updated_at`
   55|       |    #[arg(
   56|       |        long,
   57|       |        help = "Namespace (env: SQLITE_GRAPHRAG_NAMESPACE, default: global)"
   58|       |    )]
   59|       |    pub namespace: Option<String>, // forwarded to `crate::namespace::resolve_namespace` in `run`
   60|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   61|       |    pub json: bool, // accepted but never read by `run`
   62|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   63|       |    pub db: Option<String>, // optional database path; forwarded to `AppPaths::resolve` in `run`
   64|       |}
   65|       |/// JSON document that `run` passes to `output::emit_json` after a successful edit.
   66|       |#[derive(Serialize)]
   67|       |struct EditResponse {
   68|       |    memory_id: i64, // id returned by `memories::find_by_name`
   69|       |    name: String, // resolved memory name (positional argument or --name)
   70|       |    action: String, // `run` always sets this to "updated"
   71|       |    version: i64, // value returned by `versions::next_version` for this edit
   72|       |    /// Total execution time in milliseconds from handler start to serialisation.
   73|       |    elapsed_ms: u64,
   74|       |}
   75|       |/// Handles `edit`: validates the input, updates the memory row in one IMMEDIATE transaction (vector row, new version, FTS sync), then emits an `EditResponse`; errors are returned as `AppError`.
   76|      0|pub fn run(args: EditArgs) -> Result<(), AppError> {
   77|       |    use crate::constants::*; // MAX_MEMORY_BODY_LEN and MAX_MEMORY_DESCRIPTION_LEN are used unqualified below
   78|       |
   79|      0|    let inicio = std::time::Instant::now(); // "inicio" is Portuguese for "start"; basis of `elapsed_ms`
   80|      0|    tracing::debug!(target: "edit", name = ?args.name_positional.as_deref().or(args.name.as_deref()), "updating memory");
   81|       |    // Resolve name from positional or --name flag; both are optional, at least one is required.
   82|      0|    let name = args.name_positional.or(args.name).ok_or_else(|| {
   83|      0|        AppError::Validation("name required: pass as positional argument or via --name".to_string())
   84|      0|    })?;
   85|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
   86|       |
   87|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
   88|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
   89|      0|    let mut conn = open_rw(&paths.db)?;
   90|       |    // Look the memory up first, so a missing name fails with NotFound before any body input is read.
   91|      0|    let (memory_id, current_updated_at, _current_version) =
   92|      0|        memories::find_by_name(&conn, &namespace, &name)?
   93|      0|            .ok_or_else(|| AppError::NotFound(errors_msg::memory_not_found(&name, &namespace)))?;
   94|       |    // Optimistic-lock pre-check; the UPDATE further down repeats the comparison inside the transaction.
   95|      0|    if let Some(expected) = args.expected_updated_at {
   96|      0|        if expected != current_updated_at {
   97|      0|            return Err(AppError::Conflict(errors_msg::optimistic_lock_conflict(
   98|      0|                expected,
   99|      0|                current_updated_at,
  100|      0|            )));
  101|      0|        }
  102|      0|    }
  103|       |    // Pick the new body from --body, --body-file or stdin; `raw_body` stays None when none of them was given.
  104|      0|    let mut raw_body: Option<String> = None;
  105|      0|    if args.body.is_some() || args.body_file.is_some() || args.body_stdin {
  106|      0|        let b = if let Some(b) = args.body {
  107|      0|            b
  108|      0|        } else if let Some(path) = &args.body_file {
  109|      0|            let file_size = std::fs::metadata(path).map_err(AppError::Io)?.len(); // size check happens before the file is read
  110|      0|            if file_size > MAX_MEMORY_BODY_LEN as u64 {
  111|      0|                return Err(AppError::LimitExceeded(
  112|      0|                    crate::i18n::validation::body_exceeds(MAX_MEMORY_BODY_LEN),
  113|      0|                ));
  114|      0|            }
  115|      0|            std::fs::read_to_string(path).map_err(AppError::Io)?
  116|       |        } else {
  117|      0|            crate::stdin_helper::read_stdin_with_timeout(60)? // 60 is presumably a timeout in seconds — TODO confirm
  118|       |        };
  119|      0|        if b.len() > MAX_MEMORY_BODY_LEN { // byte length; this check also covers the inline and stdin sources
  120|      0|            return Err(AppError::LimitExceeded(
  121|      0|                crate::i18n::validation::body_exceeds(MAX_MEMORY_BODY_LEN),
  122|      0|            ));
  123|      0|        }
  124|      0|        raw_body = Some(b);
  125|      0|    }
  126|       |    // NOTE(review): `desc.len()` counts bytes while the --description help text states the limit in chars; this check also returns Validation where the body checks return LimitExceeded.
  127|      0|    if let Some(ref desc) = args.description {
  128|      0|        if desc.len() > MAX_MEMORY_DESCRIPTION_LEN {
  129|      0|            return Err(AppError::Validation(
  130|      0|                crate::i18n::validation::description_exceeds(MAX_MEMORY_DESCRIPTION_LEN),
  131|      0|            ));
  132|      0|        }
  133|      0|    }
  134|       |    // Load the stored row; its values fill every field the caller did not override.
  135|      0|    let row = memories::read_by_name(&conn, &namespace, &name)?
  136|      0|        .ok_or_else(|| AppError::Internal(anyhow::anyhow!("memory row not found after check")))?;
  137|       |    // NOTE(review): `unwrap_or(x.clone())` evaluates the clone even when a replacement value was supplied.
  138|      0|    let body_changed = raw_body.is_some();
  139|      0|    let new_body = raw_body.unwrap_or(row.body.clone());
  140|      0|    let new_description = args.description.unwrap_or(row.description.clone());
  141|      0|    let new_hash = blake3::hash(new_body.as_bytes()).to_hex().to_string();
  142|       |    // Skip re-embedding when body content is identical to the stored version.
  143|      0|    let body_changed = body_changed && new_hash != row.body_hash;
  144|      0|    let memory_type = args
  145|      0|        .memory_type
  146|      0|        .map(|t| t.as_str().to_string())
  147|      0|        .unwrap_or_else(|| row.memory_type.clone());
  148|      0|    let type_changed = memory_type != row.memory_type;
  149|      0|    let metadata = row.metadata.clone(); // metadata is carried over unchanged into the new version record
  150|       |    // NOTE(review): this IMMEDIATE transaction stays open across the embedding call below — confirm that holding the write transaction that long is intended.
  151|      0|    let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
  152|       |    // NOTE(review): neither UPDATE sets `updated_at`; presumably a trigger or another layer maintains it — confirm.
  153|      0|    let affected = if let Some(ts) = args.expected_updated_at {
  154|      0|        tx.execute(
  155|      0|            "UPDATE memories SET description=?2, body=?3, body_hash=?4, type=?5
  156|      0|             WHERE id=?1 AND updated_at=?6 AND deleted_at IS NULL",
  157|      0|            rusqlite::params![
  158|      0|                memory_id,
  159|      0|                new_description,
  160|      0|                new_body,
  161|      0|                new_hash,
  162|      0|                memory_type,
  163|      0|                ts
  164|      0|            ],
  165|      0|        )?
  166|       |    } else {
  167|      0|        tx.execute(
  168|      0|            "UPDATE memories SET description=?2, body=?3, body_hash=?4, type=?5
  169|      0|             WHERE id=?1 AND deleted_at IS NULL",
  170|      0|            rusqlite::params![memory_id, new_description, new_body, new_hash, memory_type],
  171|      0|        )?
  172|       |    };
  173|       |    // Zero rows: the WHERE clause stopped matching (`updated_at` differs, `deleted_at` is set, or the id is gone). NOTE(review): the message below only mentions concurrent modification.
  174|      0|    if affected == 0 {
  175|      0|        return Err(AppError::Conflict(
  176|      0|            "optimistic lock conflict: memory was modified by another process".to_string(),
  177|      0|        ));
  178|      0|    }
  179|       |    // A type-only change also re-embeds the unchanged body; presumably because `upsert_vec` stores `memory_type` with the vector — TODO confirm.
  180|      0|    if body_changed || type_changed {
  181|      0|        output::emit_progress_i18n(
  182|      0|            "Re-computing embedding for edited body...",
  183|      0|            crate::i18n::validation::runtime_pt::edit_recomputing_embedding(),
  184|       |        );
  185|      0|        let embedding = crate::daemon::embed_passage_or_local(&paths.models, &new_body)?;
  186|      0|        let snippet: String = new_body.chars().take(300).collect(); // first 300 chars (not bytes) of the body
  187|      0|        memories::upsert_vec(
  188|      0|            &tx,
  189|      0|            memory_id,
  190|      0|            &namespace,
  191|      0|            &memory_type,
  192|      0|            &embedding,
  193|      0|            &name,
  194|      0|            &snippet,
  195|      0|        )?;
  196|      0|    }
  197|       |
  198|      0|    let next_v = versions::next_version(&tx, memory_id)?;
  199|       |    // Record the post-edit state as a new version tagged "edit".
  200|      0|    versions::insert_version(
  201|      0|        &tx,
  202|      0|        memory_id,
  203|      0|        next_v,
  204|      0|        &name,
  205|      0|        &memory_type,
  206|      0|        &new_description,
  207|      0|        &new_body,
  208|      0|        &metadata,
  209|      0|        None,
  210|      0|        "edit",
  211|      0|    )?;
  212|       |    // Arguments are presumably (old name, old description, old body, new name, new description, new body); `row.name` appears twice because this command never renames.
  213|      0|    memories::sync_fts_after_update(
  214|      0|        &tx,
  215|      0|        memory_id,
  216|      0|        &row.name,
  217|      0|        &row.description,
  218|      0|        &row.body,
  219|      0|        &row.name,
  220|      0|        &new_description,
  221|      0|        &new_body,
  222|      0|    )?;
  223|       |
  224|      0|    tx.commit()?;
  225|       |    // Checkpoint the WAL after the commit; a failure here returns an error even though the edit is already committed.
  226|      0|    conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
  227|       |
  228|      0|    output::emit_json(&EditResponse {
  229|      0|        memory_id,
  230|      0|        name,
  231|      0|        action: "updated".to_string(),
  232|      0|        version: next_v,
  233|      0|        elapsed_ms: inicio.elapsed().as_millis() as u64, // u128 -> u64 narrowing cast
  234|      0|    })?;
  235|       |
  236|      0|    Ok(())
  237|      0|}
  238|       |// Unit tests for `EditResponse` serialisation and for the size-limit constants; none of them calls `run`.
  239|       |#[cfg(test)]
  240|       |mod tests {
  241|       |    use super::*;
  242|       |    // Every field of `EditResponse` must be present in the serialised JSON.
  243|       |    #[test]
  244|      1|    fn edit_response_serializes_all_fields() {
  245|      1|        let resp = EditResponse {
  246|      1|            memory_id: 42,
  247|      1|            name: "my-memory".to_string(),
  248|      1|            action: "updated".to_string(),
  249|      1|            version: 3,
  250|      1|            elapsed_ms: 7,
  251|      1|        };
  252|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  253|      1|        assert_eq!(json["memory_id"], 42i64);
  254|      1|        assert_eq!(json["name"], "my-memory");
  255|      1|        assert_eq!(json["action"], "updated");
  256|      1|        assert_eq!(json["version"], 3i64);
  257|      1|        assert!(json["elapsed_ms"].is_number());
  258|      1|    }
  259|       |    // NOTE(review): this only reads back the literal assigned in the test itself; it does not exercise `run`.
  260|       |    #[test]
  261|      1|    fn edit_response_action_contains_updated() {
  262|      1|        let resp = EditResponse {
  263|      1|            memory_id: 1,
  264|      1|            name: "n".to_string(),
  265|      1|            action: "updated".to_string(),
  266|      1|            version: 1,
  267|      1|            elapsed_ms: 0,
  268|      1|        };
  269|      1|        assert_eq!(
  270|       |            resp.action, "updated",
  271|      0|            "action must be 'updated' for successful edits"
  272|       |        );
  273|      1|    }
  274|       |    // NOTE(review): despite its name, this test never calls `run` and checks no error; it only asserts that the constructed string is longer than the limit.
  275|       |    #[test]
  276|      1|    fn edit_body_exceeds_limit_returns_error() {
  277|      1|        let limit = crate::constants::MAX_MEMORY_BODY_LEN;
  278|      1|        let large_body: String = "a".repeat(limit + 1);
  279|      1|        assert!(
  280|      1|            large_body.len() > limit,
  281|      0|            "body above limit must have length > MAX_MEMORY_BODY_LEN"
  282|       |        );
  283|      1|    }
  284|       |    // NOTE(review): same as above — no call to `run`, only a length comparison on a locally built string.
  285|       |    #[test]
  286|      1|    fn edit_description_exceeds_limit_returns_error() {
  287|      1|        let limit = crate::constants::MAX_MEMORY_DESCRIPTION_LEN;
  288|      1|        let large_desc: String = "d".repeat(limit + 1);
  289|      1|        assert!(
  290|      1|            large_desc.len() > limit,
  291|      0|            "description above limit must have length > MAX_MEMORY_DESCRIPTION_LEN"
  292|       |        );
  293|      1|    }
  294|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/enrich.rs:
    1|       |//! Handler for the `enrich` CLI subcommand (GAP-14 + GAP-18).
    2|       |//!
    3|       |//! Enriches the knowledge graph by running LLM-powered analysis over memories
    4|       |//! and entities that are missing key structural data. Operations are:
    5|       |//!
    6|       |//! - `memory-bindings`: memories without `memory_entities` rows get entity extraction
    7|       |//! - `entity-descriptions`: entities with NULL/empty descriptions get LLM descriptions
    8|       |//! - `body-enrich`: memories with short bodies get expanded by the LLM (GAP-18)
    9|       |//! - all others: scan + structured NDJSON output (not-yet-implemented dispatch)
   10|       |//!
   11|       |//! Architecture mirrors `ingest_claude.rs`: SCAN → JUDGE (LLM) → PERSIST, with a
   12|       |//! SQLite queue DB (`.enrich-queue.sqlite`) for resume/retry support.
   13|       |// Workload: Subprocess I/O-bound (claude/codex API calls with network wait)
   14|       |//!
   15|       |//! # DRY opportunity
   16|       |//!
   17|       |//! `extract_with_claude`, `parse_claude_output`, `emit_json`, and the `open_queue_db`
   18|       |//! queue schema in `ingest_claude.rs` are private functions that duplicate patterns used
   19|       |//! here verbatim. A future refactoring could extract them into a shared
   20|       |//! `src/commands/llm_runner.rs` module (or `src/llm_runner.rs`) without changing any
   21|       |//! public APIs. That extraction requires editing `ingest_claude.rs`, which is outside
   22|       |//! this stream's boundary — flagged here for the Integration stream to evaluate.
   23|       |
   24|       |use crate::commands::ingest_claude::find_claude_binary;
   25|       |use crate::constants::MAX_MEMORY_BODY_LEN;
   26|       |use crate::entity_type::EntityType;
   27|       |use crate::errors::AppError;
   28|       |use crate::paths::AppPaths;
   29|       |use crate::storage::connection::{ensure_db_ready, open_rw};
   30|       |use crate::storage::entities::{self, NewEntity, NewRelationship};
   31|       |use crate::storage::memories;
   32|       |
   33|       |use rusqlite::Connection;
   34|       |use serde::{Deserialize, Serialize};
   35|       |use std::io::Write;
   36|       |use std::path::{Path, PathBuf};
   37|       |use std::time::Instant;
   38|       |
   39|       |// ---------------------------------------------------------------------------
   40|       |// Constants
   41|       |// ---------------------------------------------------------------------------
   42|       |
   43|       |const DEFAULT_QUEUE_DB: &str = ".enrich-queue.sqlite"; // queue DB file name, as named in the module docs (resume/retry support)
   44|       |const DEFAULT_RATE_LIMIT_WAIT: u64 = 60; // presumably seconds to wait after a rate limit — TODO confirm at the use site
   45|       |const DEFAULT_BODY_ENRICH_MIN_CHARS: usize = 500; // presumably the default lower length bound for body-enrich — use site not visible here
   46|       |const DEFAULT_BODY_ENRICH_MAX_CHARS: usize = 2000; // presumably the default upper length bound for body-enrich — use site not visible here
   47|       |
   48|       |// ---------------------------------------------------------------------------
   49|       |// JSON schema used for memory-bindings and body-enrich extraction
   50|       |// ---------------------------------------------------------------------------
   51|       |/// JSON Schema for extraction output: `entities` (name + enum `entity_type`) and `relationships` (source, target, enum `relation`, `strength` in [0, 1]); extra properties are rejected at every level.
   52|       |const BINDINGS_SCHEMA: &str = r#"{
   53|       |  "type": "object",
   54|       |  "properties": {
   55|       |    "entities": {
   56|       |      "type": "array",
   57|       |      "items": {
   58|       |        "type": "object",
   59|       |        "properties": {
   60|       |          "name": { "type": "string" },
   61|       |          "entity_type": {
   62|       |            "type": "string",
   63|       |            "enum": ["project","tool","person","file","concept","incident","decision","organization","location","date"]
   64|       |          }
   65|       |        },
   66|       |        "required": ["name", "entity_type"],
   67|       |        "additionalProperties": false
   68|       |      }
   69|       |    },
   70|       |    "relationships": {
   71|       |      "type": "array",
   72|       |      "items": {
   73|       |        "type": "object",
   74|       |        "properties": {
   75|       |          "source": { "type": "string" },
   76|       |          "target": { "type": "string" },
   77|       |          "relation": {
   78|       |            "type": "string",
   79|       |            "enum": ["applies-to","uses","depends-on","causes","fixes","contradicts","supports","follows","related","replaces","tracked-in"]
   80|       |          },
   81|       |          "strength": { "type": "number", "minimum": 0, "maximum": 1 }
   82|       |        },
   83|       |        "required": ["source","target","relation","strength"],
   84|       |        "additionalProperties": false
   85|       |      }
   86|       |    }
   87|       |  },
   88|       |  "required": ["entities","relationships"],
   89|       |  "additionalProperties": false
   90|       |}"#;
   91|       |/// JSON Schema for an entity-description response: a single required string field `description`.
   92|       |const ENTITY_DESCRIPTION_SCHEMA: &str = r#"{
   93|       |  "type": "object",
   94|       |  "properties": {
   95|       |    "description": { "type": "string" }
   96|       |  },
   97|       |  "required": ["description"],
   98|       |  "additionalProperties": false
   99|       |}"#;
  100|       |/// JSON Schema for a body-enrich response: a single required string field `enriched_body`.
  101|       |const BODY_ENRICH_SCHEMA: &str = r#"{
  102|       |  "type": "object",
  103|       |  "properties": {
  104|       |    "enriched_body": { "type": "string" }
  105|       |  },
  106|       |  "required": ["enriched_body"],
  107|       |  "additionalProperties": false
  108|       |}"#;
  109|       |
  110|       |// G27 P1: weight-calibrate — prompt asking the LLM to re-score one relationship weight against a fixed 0.3/0.5/0.7/0.9 scale.
  111|       |const WEIGHT_CALIBRATE_PROMPT: &str = "You are a knowledge graph quality auditor. Evaluate whether this relationship weight is correctly calibrated.\n\n\
  112|       |Scale:\n\
  113|       |- 0.9 = vital hard dependency (A cannot function without B)\n\
  114|       |- 0.7 = important design relationship (A strongly supports/enables B)\n\
  115|       |- 0.5 = useful contextual link (A and B share relevant context)\n\
  116|       |- 0.3 = weak reference (A mentions B without strong coupling)\n\n\
  117|       |Respond with the calibrated weight and brief reasoning.";
  118|       |// Expected response: `calibrated_weight` in [0, 1] plus free-text `reasoning`; the schema does not restrict the weight to the four scale points.
  119|       |const WEIGHT_CALIBRATE_SCHEMA: &str = r#"{
  120|       |  "type": "object",
  121|       |  "properties": {
  122|       |    "calibrated_weight": { "type": "number", "minimum": 0.0, "maximum": 1.0 },
  123|       |    "reasoning": { "type": "string" }
  124|       |  },
  125|       |  "required": ["calibrated_weight", "reasoning"],
  126|       |  "additionalProperties": false
  127|       |}"#;
  128|       |
  129|       |// G27 P1: relation-reclassify — prompt asking the LLM to replace a generic relationship type with exactly one of the eleven canonical relations.
  130|       |const RELATION_RECLASSIFY_PROMPT: &str = "You are a knowledge graph quality auditor. The relationship between these entities uses a generic type. Determine the REAL semantic relationship.\n\n\
  131|       |Valid canonical relations (pick exactly one):\n\
  132|       |- depends-on: A cannot function without B\n\
  133|       |- uses: A utilizes B but could substitute it\n\
  134|       |- supports: A reinforces or enables B\n\
  135|       |- causes: A triggers or produces B\n\
  136|       |- fixes: A resolves a problem in B\n\
  137|       |- contradicts: A conflicts with or invalidates B\n\
  138|       |- applies-to: A is relevant to or scoped within B\n\
  139|       |- follows: A comes after B in sequence\n\
  140|       |- replaces: A substitutes B\n\
  141|       |- tracked-in: A is monitored in B\n\
  142|       |- related: A and B share context (use sparingly)\n\n\
  143|       |Respond with the correct relation, strength, and reasoning.";
  144|       |// NOTE(review): `relation` is an unconstrained string here, whereas BINDINGS_SCHEMA restricts it with an enum; the canonical list is enforced only by the prompt text.
  145|       |const RELATION_RECLASSIFY_SCHEMA: &str = r#"{
  146|       |  "type": "object",
  147|       |  "properties": {
  148|       |    "relation": { "type": "string" },
  149|       |    "strength": { "type": "number", "minimum": 0.0, "maximum": 1.0 },
  150|       |    "reasoning": { "type": "string" }
  151|       |  },
  152|       |  "required": ["relation", "strength", "reasoning"],
  153|       |  "additionalProperties": false
  154|       |}"#;
  155|       |
  156|       |// G27 P2: entity-connect — suggest a relationship between two entities that have none; the prompt allows the sentinel answer "none".
  157|       |const ENTITY_CONNECT_PROMPT: &str = "You are a knowledge graph quality auditor. Two entities exist in the same graph but have no relationship between them. Determine if a meaningful relationship exists.\n\n\
  158|       |Valid canonical relations: depends-on, uses, supports, causes, fixes, contradicts, applies-to, follows, replaces, tracked-in, related.\n\n\
  159|       |If NO meaningful relationship exists, set relation to \"none\".\n\
  160|       |Respond with the relation (or \"none\"), strength, and reasoning.";
  161|       |// `relation` is a free string (it must be able to carry "none"); `strength` is still required even when the answer is "none".
  162|       |const ENTITY_CONNECT_SCHEMA: &str = r#"{
  163|       |  "type": "object",
  164|       |  "properties": {
  165|       |    "relation": { "type": "string" },
  166|       |    "strength": { "type": "number", "minimum": 0.0, "maximum": 1.0 },
  167|       |    "reasoning": { "type": "string" }
  168|       |  },
  169|       |  "required": ["relation", "strength", "reasoning"],
  170|       |  "additionalProperties": false
  171|       |}"#;
  172|       |
  173|       |// G27 P2: entity-type-validate — ask the LLM to confirm or correct an entity's type against the ten valid entity types.
  174|       |const ENTITY_TYPE_VALIDATE_PROMPT: &str = "You are a knowledge graph quality auditor. Verify whether this entity's type is correct.\n\n\
  175|       |Valid entity types: project, tool, person, file, concept, incident, decision, organization, location, date.\n\n\
  176|       |If the current type is correct, keep it. If wrong, suggest the correct type.\n\
  177|       |Respond with the validated type and reasoning.";
  178|       |// NOTE(review): `validated_type` is an unconstrained string, whereas BINDINGS_SCHEMA restricts `entity_type` with an enum.
  179|       |const ENTITY_TYPE_VALIDATE_SCHEMA: &str = r#"{
  180|       |  "type": "object",
  181|       |  "properties": {
  182|       |    "validated_type": { "type": "string" },
  183|       |    "was_correct": { "type": "boolean" },
  184|       |    "reasoning": { "type": "string" }
  185|       |  },
  186|       |  "required": ["validated_type", "was_correct", "reasoning"],
  187|       |  "additionalProperties": false
  188|       |}"#;
  189|       |
  190|       |// G27 P2: description-enrich — rewrite a generic or auto-generated memory description as a 10-20 word semantic summary.
  191|       |const DESCRIPTION_ENRICH_PROMPT: &str = "You are a knowledge graph quality auditor. This memory has a generic or auto-generated description. Write a concise, semantic description (10-20 words) that captures WHAT this memory is about and WHY it matters.\n\n\
  192|       |BAD: 'ingested from docs/auth.md'\n\
  193|       |GOOD: 'JWT token rotation strategy with 15-min expiry and refresh flow'\n\n\
  194|       |Respond with the improved description and reasoning.";
  195|       |// Expected response: `description` and `reasoning`, both required strings; the 10-20 word target is not enforced by the schema.
  196|       |const DESCRIPTION_ENRICH_SCHEMA: &str = r#"{
  197|       |  "type": "object",
  198|       |  "properties": {
  199|       |    "description": { "type": "string" },
  200|       |    "reasoning": { "type": "string" }
  201|       |  },
  202|       |  "required": ["description", "reasoning"],
  203|       |  "additionalProperties": false
  204|       |}"#;
  205|       |
  206|       |// G27 P2: domain-classify — classify a memory into one primary domain, named in kebab-case (2-4 words).
  207|       |const DOMAIN_CLASSIFY_PROMPT: &str = "You are a knowledge graph quality auditor. Classify this memory into its primary domain category.\n\n\
  208|       |Respond with the domain name (kebab-case, 2-4 words) and reasoning.";
  209|       |// NOTE(review): the schema requires `confidence` in [0, 1], but the prompt only asks for the domain name and reasoning.
  210|       |const DOMAIN_CLASSIFY_SCHEMA: &str = r#"{
  211|       |  "type": "object",
  212|       |  "properties": {
  213|       |    "domain": { "type": "string" },
  214|       |    "confidence": { "type": "number", "minimum": 0.0, "maximum": 1.0 },
  215|       |    "reasoning": { "type": "string" }
  216|       |  },
  217|       |  "required": ["domain", "confidence", "reasoning"],
  218|       |  "additionalProperties": false
  219|       |}"#;
  220|       |
  221|       |// G27 P2: graph-audit — review one memory and its entity bindings for quality issues and return an overall score.
  222|       |const GRAPH_AUDIT_PROMPT: &str = "You are a knowledge graph quality auditor. Analyze this memory and its entity bindings for quality issues.\n\n\
  223|       |Check for: missing entities, wrong entity types, redundant relationships, orphaned entities, generic descriptions, low-signal relationships.\n\n\
  224|       |Respond with a list of issues found (or empty if none) and an overall quality score.";
  225|       |// NOTE(review): the nested `issues` item object has no "additionalProperties": false, unlike the nested objects in BINDINGS_SCHEMA.
  226|       |const GRAPH_AUDIT_SCHEMA: &str = r#"{
  227|       |  "type": "object",
  228|       |  "properties": {
  229|       |    "quality_score": { "type": "number", "minimum": 0.0, "maximum": 1.0 },
  230|       |    "issues": { "type": "array", "items": { "type": "object", "properties": { "kind": { "type": "string" }, "detail": { "type": "string" } }, "required": ["kind", "detail"] } },
  231|       |    "reasoning": { "type": "string" }
  232|       |  },
  233|       |  "required": ["quality_score", "issues", "reasoning"],
  234|       |  "additionalProperties": false
  235|       |}"#;
  236|       |
  237|       |// G27 P2: deep-research-synth — extract key findings from a memory body as entities, relationships and a synthesis summary.
  238|       |const DEEP_RESEARCH_SYNTH_PROMPT: &str = "You are a knowledge graph synthesizer. Given this memory body, extract key findings and synthesize them into structured entities and relationships.\n\n\
  239|       |Entity names: lowercase kebab-case, domain-specific.\n\
  240|       |Relations: depends-on, uses, supports, causes, fixes, contradicts, applies-to, follows, related, replaces, tracked-in.\n\n\
  241|       |Respond with extracted entities, relationships, and a synthesis summary.";
  242|       |// NOTE(review): looser than BINDINGS_SCHEMA — `entity_type` and `relation` have no enum, `strength` has no bounds, and the nested objects lack "additionalProperties": false.
  243|       |const DEEP_RESEARCH_SYNTH_SCHEMA: &str = r#"{
  244|       |  "type": "object",
  245|       |  "properties": {
  246|       |    "entities": { "type": "array", "items": { "type": "object", "properties": { "name": { "type": "string" }, "entity_type": { "type": "string" } }, "required": ["name", "entity_type"] } },
  247|       |    "relationships": { "type": "array", "items": { "type": "object", "properties": { "source": { "type": "string" }, "target": { "type": "string" }, "relation": { "type": "string" }, "strength": { "type": "number" } }, "required": ["source", "target", "relation", "strength"] } },
  248|       |    "summary": { "type": "string" }
  249|       |  },
  250|       |  "required": ["entities", "relationships", "summary"],
  251|       |  "additionalProperties": false
  252|       |}"#;
  253|       |
  254|       |// G27 P2: body-extract — restructure unstructured text (raw notes, transcripts) into clean markdown while preserving all factual content.
  255|       |const BODY_EXTRACT_PROMPT: &str = "You are a structured data extractor. Given this memory body (which may be unstructured text, raw notes, or a transcript), extract and restructure the content into a clean, well-organized markdown body.\n\n\
  256|       |Preserve all factual content. Remove noise, fix formatting, add section headers where appropriate.\n\
  257|       |Respond with the restructured body and a brief summary of changes.";
  258|       |// Expected response: `restructured_body` and `changes_summary`, both required strings.
  259|       |const BODY_EXTRACT_SCHEMA: &str = r#"{
  260|       |  "type": "object",
  261|       |  "properties": {
  262|       |    "restructured_body": { "type": "string" },
  263|       |    "changes_summary": { "type": "string" }
  264|       |  },
  265|       |  "required": ["restructured_body", "changes_summary"],
  266|       |  "additionalProperties": false
  267|       |}"#;
  268|       |
  269|       |// ---------------------------------------------------------------------------
  270|       |// Prompts
  271|       |// ---------------------------------------------------------------------------
  272|       |/// Instruction text for entity/relationship extraction; its relation list and strength scale mirror the enum and range in `BINDINGS_SCHEMA`.
  273|       |const BINDINGS_PROMPT: &str = "You are a knowledge graph entity extractor. Given a memory body, extract:\n\
  274|       |1. Domain-specific entities (concepts, tools, people, decisions, projects, files)\n\
  275|       |2. Typed relationships between entities with strength scores\n\n\
  276|       |Rules:\n\
  277|       |- Entity names: lowercase kebab-case, 2+ chars, domain-specific only\n\
  278|       |- NEVER extract generic terms, stop words, numbers, UUIDs, or single characters\n\
  279|       |- Relationship types MUST be one of: applies-to, uses, depends-on, causes, fixes, contradicts, supports, follows, related, replaces, tracked-in\n\
  280|       |- NEVER use 'mentions' as relationship type\n\
  281|       |- Strength: 0.9 for hard dependencies, 0.7 for design relationships, 0.5 for contextual links, 0.3 for weak references\n\
  282|       |- Prefer fewer high-quality entities over many low-quality ones";
  283|       |/// Prompt prefix for entity descriptions; it ends with "Entity name: ", so the caller is presumably expected to append the name (and type) — confirm at the use site.
  284|       |const ENTITY_DESCRIPTION_PROMPT_PREFIX: &str = "You are a knowledge graph annotator. Given an entity name and type, write a concise one-sentence description (10-20 words) that explains what this entity IS and WHY it matters in the context of software/system design.\n\nEntity name: ";
  285|       |/// Prompt prefix for body enrichment; the memory body is meant to follow it. NOTE(review): the text says the target length comes from "the system context" — confirm the caller supplies it.
  286|       |const BODY_ENRICH_PROMPT_PREFIX: &str = "You are a knowledge assistant. Given a short or sparse memory body, expand it into a richer, more complete and useful description. Preserve all existing facts. Add context, implications, and relationships that would be valuable for knowledge retrieval.\n\nConstraints:\n- Output only the enriched body text (no metadata, no headers)\n- Preserve the original meaning exactly\n- Target length is provided in the system context\n\nMemory body to enrich:\n\n";
  287|       |
  288|       |// ---------------------------------------------------------------------------
  289|       |// CLI args
  290|       |// ---------------------------------------------------------------------------
  291|       |// NOTE(review): this enum derives clap::ValueEnum, so the `///` comments on its variants presumably double as CLI help text — treat wording changes as user-visible.
  292|       |/// Operation to perform in the `enrich` command.
  293|       |#[derive(Debug, Clone, PartialEq, Eq, clap::ValueEnum, Serialize, Deserialize)]
  294|       |#[serde(rename_all = "kebab-case")]
  295|       |pub enum EnrichOperation {
  296|       |    /// Add missing entity/relationship bindings to memories (fully implemented).
  297|       |    MemoryBindings,
  298|       |    /// Fill NULL/empty entity descriptions with LLM-generated summaries (fully implemented).
  299|       |    EntityDescriptions,
  300|       |    /// Expand short memory bodies into richer content (fully implemented, GAP-18).
  301|       |    BodyEnrich,
  302|       |    /// Calibrate relationship weights using LLM analysis (scan only).
  303|       |    WeightCalibrate,
  304|       |    /// Reclassify relationship types using LLM judgment (scan only).
  305|       |    RelationReclassify,
  306|       |    /// Connect isolated entities by suggesting new relationships (scan only).
  307|       |    EntityConnect,
  308|       |    /// Validate entity type assignments using LLM judgment (scan only).
  309|       |    EntityTypeValidate,
  310|       |    /// Enrich memory descriptions that are generic/auto-generated (scan only).
  311|       |    DescriptionEnrich,
  312|       |    /// Identify cross-domain bridges between disconnected subgraphs (scan only).
  313|       |    CrossDomainBridges, // no prompt/schema constant for this operation appears among the G27 constants above
  314|       |    /// Classify memories into domain categories (scan only).
  315|       |    DomainClassify,
  316|       |    /// Audit the graph for quality issues (scan only).
  317|       |    GraphAudit,
  318|       |    /// Synthesize deep-research findings into graph memories (scan only).
  319|       |    DeepResearchSynth,
  320|       |    /// Extract structured body from unstructured text (scan only).
  321|       |    BodyExtract,
  322|       |}
  323|       |
  324|       |/// LLM provider for enrichment.
  325|       |#[derive(Debug, Clone, PartialEq, Eq, clap::ValueEnum)]
  326|       |pub enum EnrichMode {
  327|       |    /// Use locally installed Claude Code CLI (OAuth-first).
  328|       |    ClaudeCode,
  329|       |    /// Use locally installed OpenAI Codex CLI.
  330|       |    Codex,
  331|       |}
  332|       |
  333|       |impl std::fmt::Display for EnrichMode {
  334|      0|    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
  335|      0|        match self {
  336|      0|            EnrichMode::ClaudeCode => write!(f, "claude-code"),
  337|      0|            EnrichMode::Codex => write!(f, "codex"),
  338|       |        }
  339|      0|    }
  340|       |}
  341|       |
  342|       |/// Arguments for the `enrich` subcommand.
  343|       |#[derive(clap::Args)]
  344|       |#[command(
  345|       |    about = "Enrich graph memories and entities using an LLM provider",
  346|       |    after_long_help = "EXAMPLES:\n  \
  347|       |    # Add missing entity bindings to all unbound memories\n  \
  348|       |    sqlite-graphrag enrich --operation memory-bindings --mode claude-code\n\n  \
  349|       |    # Fill entity descriptions (dry-run preview, no tokens spent)\n  \
  350|       |    sqlite-graphrag enrich --operation entity-descriptions --dry-run --json\n\n  \
  351|       |    # Expand short memory bodies (GAP-18)\n  \
  352|       |    sqlite-graphrag enrich --operation body-enrich --min-output-chars 600\n\n  \
  353|       |    # Resume an interrupted body-enrich run\n  \
  354|       |    sqlite-graphrag enrich --operation body-enrich --resume --json\n\n  \
  355|       |    # Retry only failed items from a previous run\n  \
  356|       |    sqlite-graphrag enrich --operation memory-bindings --retry-failed --json\n\n\
  357|       |    EXIT CODES:\n  \
  358|       |    0  success\n  \
  359|       |    1  validation error (bad args, binary not found)\n  \
  360|       |    14 I/O error"
  361|       |)]
  362|       |pub struct EnrichArgs {
  363|       |    /// Enrichment operation to run.
  364|       |    #[arg(long, short = 'o', value_enum, value_name = "OPERATION")]
  365|       |    pub operation: EnrichOperation,
  366|       |
  367|       |    /// LLM provider to use. Default: claude-code (OAuth-first).
  368|       |    #[arg(long, value_enum, default_value = "claude-code")]
  369|       |    pub mode: EnrichMode,
  370|       |
  371|       |    /// Maximum number of items to process in this run. Omit for all.
  372|       |    #[arg(long, value_name = "N")]
  373|       |    pub limit: Option<usize>,
  374|       |
  375|       |    /// Preview items without calling the LLM (zero tokens consumed).
  376|       |    #[arg(long)]
  377|       |    pub dry_run: bool,
  378|       |
  379|       |    /// Namespace to operate on. Default: global.
  380|       |    #[arg(long, env = "SQLITE_GRAPHRAG_NAMESPACE")]
  381|       |    pub namespace: Option<String>,
  382|       |
  383|       |    // -- Provider flags (Claude) --
  384|       |    /// Path to the Claude Code binary. Default: auto-detect from PATH.
  385|       |    #[arg(long, value_name = "PATH")]
  386|       |    pub claude_binary: Option<PathBuf>,
  387|       |
  388|       |    /// Claude model to use (e.g. claude-sonnet-4-6).
  389|       |    #[arg(long, value_name = "MODEL")]
  390|       |    pub claude_model: Option<String>,
  391|       |
  392|       |    /// Timeout per item in seconds when using Claude Code. Default: 300.
  393|       |    #[arg(long, value_name = "SECONDS", default_value_t = 300)]
  394|       |    pub claude_timeout: u64,
  395|       |
  396|       |    // -- Provider flags (Codex) --
  397|       |    /// Path to the Codex CLI binary. Default: auto-detect from PATH.
  398|       |    #[arg(long, value_name = "PATH")]
  399|       |    pub codex_binary: Option<PathBuf>,
  400|       |
  401|       |    /// Codex model to use (e.g. o4-mini).
  402|       |    #[arg(long, value_name = "MODEL")]
  403|       |    pub codex_model: Option<String>,
  404|       |
  405|       |    /// Timeout per item in seconds when using Codex. Default: 300.
  406|       |    #[arg(long, value_name = "SECONDS", default_value_t = 300)]
  407|       |    pub codex_timeout: u64,
  408|       |
  409|       |    // -- Cost controls --
  410|       |    /// Abort when cumulative cost exceeds this USD budget (API key only; ignored for OAuth).
  411|       |    #[arg(long, value_name = "USD")]
  412|       |    pub max_cost_usd: Option<f64>,
  413|       |
  414|       |    // -- Queue controls --
  415|       |    /// Resume a previously interrupted run (skip already-done items).
  416|       |    #[arg(long)]
  417|       |    pub resume: bool,
  418|       |
  419|       |    /// Retry only items that failed in a previous run.
  420|       |    #[arg(long)]
  421|       |    pub retry_failed: bool,
  422|       |
  423|       |    // -- body-enrich specific flags (GAP-18) --
  424|       |    /// Minimum output character count for body-enrich. Default: 500.
  425|       |    #[arg(long, value_name = "CHARS", default_value_t = DEFAULT_BODY_ENRICH_MIN_CHARS)]
  426|       |    pub min_output_chars: usize,
  427|       |
  428|       |    /// Maximum output character count for body-enrich. Default: 2000.
  429|       |    #[arg(long, value_name = "CHARS", default_value_t = DEFAULT_BODY_ENRICH_MAX_CHARS)]
  430|       |    pub max_output_chars: usize,
  431|       |
  432|       |    /// Check that enriched body preserves all facts from the original (LLM judge). Default: true.
  433|       |    #[arg(long, default_value_t = true)]
  434|       |    pub preserve_check: bool,
  435|       |
  436|       |    /// Path to a custom prompt template file for body-enrich.
  437|       |    #[arg(long, value_name = "PATH")]
  438|       |    pub prompt_template: Option<PathBuf>,
  439|       |
  440|       |    /// Number of parallel LLM workers (default 1 = serial).
  441|       |    /// Each worker claims items atomically from the queue DB via UPDATE...RETURNING.
  442|       |    /// Range: 1–32. For 2321 entities, --llm-parallelism 4 reduces wall time ~4×.
  443|       |    #[arg(long, default_value_t = 1, value_name = "N", value_parser = clap::value_parser!(u32).range(1..=32))]
  444|       |    pub llm_parallelism: u32,
  445|       |
  446|       |    // -- Output / infra --
  447|       |    /// Emit NDJSON output. Always true; flag accepted for compatibility.
  448|       |    #[arg(long)]
  449|       |    pub json: bool,
  450|       |
  451|       |    /// Database path override.
  452|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
  453|       |    pub db: Option<String>,
  454|       |
  455|       |    /// G30: poll for the job singleton every second for up to N seconds
  456|       |    /// when another invocation holds the lock. Default: 0 (fail fast).
  457|       |    #[arg(long, value_name = "SECONDS")]
  458|       |    pub wait_job_singleton: Option<u64>,
  459|       |
  460|       |    /// G30: force acquisition of the singleton lock by removing a stale
  461|       |    /// lock file from a previously crashed invocation. Use only when you
  462|       |    /// are certain no other `enrich`/`ingest` is running.
  463|       |    #[arg(long, default_value_t = false)]
  464|       |    pub force_job_singleton: bool,
  465|       |
  466|       |    /// G37: select a specific subset of memory names to enrich instead of
  467|       |    /// the full candidate set. Comma-separated, e.g. `--names a,b,c`.
  468|       |    /// Empty when omitted (processes all candidates).
  469|       |    #[arg(long, value_name = "NAMES", value_delimiter = ',')]
  470|       |    pub names: Vec<String>,
  471|       |
  472|       |    /// G37: read the subset of memory names from a file (one per line).
  473|       |    /// Lines starting with `#` and empty lines are ignored. Combined with
  474|       |    /// `--names` (union) when both are set.
  475|       |    #[arg(long, value_name = "PATH")]
  476|       |    pub names_file: Option<PathBuf>,
  477|       |
  478|       |    /// G35: probe the LLM provider with a 1-turn ping before processing
  479|       |    /// the batch. Aborts with a clear error if the rate-limit window is
  480|       |    /// closed (avoids burning N turns only to fail on item 1).
  481|       |    #[arg(long, default_value_t = false)]
  482|       |    pub preflight_check: bool,
  483|       |
  484|       |    /// G35: if a preflight probe or in-flight call hits the Claude rate
  485|       |    /// limit, fall back to `--fallback-mode` (typically `codex`) instead
  486|       |    /// of failing the batch. Ignored when `--mode` is already `codex`.
  487|       |    #[arg(long, value_enum)]
  488|       |    pub fallback_mode: Option<EnrichMode>,
  489|       |
  490|       |    /// G35: number of seconds before the OAuth rate-limit reset at which
  491|       |    /// the preflight probe should refuse to start. Default 300 (5 min).
  492|       |    #[arg(long, value_name = "SECONDS", default_value_t = 300)]
  493|       |    pub rate_limit_buffer: u64,
  494|       |
  495|       |    /// G28-D: refuse to start when the 1-minute load average exceeds
  496|       |    /// `2 × ncpus` (or `SQLITE_GRAPHRAG_MAX_LOAD_PER_NCPU` if set).
  497|       |    /// Set to false to skip the check on contended CI runners.
  498|       |    #[arg(long, default_value_t = true)]
  499|       |    pub max_load_check: bool,
  500|       |
  501|       |    /// G28-D: when the system is saturated, abort the job after this
  502|       |    /// many consecutive HardFailure outcomes. Default 5.
  503|       |    #[arg(long, value_name = "N", default_value_t = 5)]
  504|       |    pub circuit_breaker_threshold: u32,
  505|       |
  506|       |    /// G29 Passo 4: minimum trigram-Jaccard similarity between the
  507|       |    /// original body and the LLM-rewritten body for the rewrite to be
  508|       |    /// accepted. Scores below the threshold are rejected and emitted as
  509|       |    /// `EnrichItemResult::PreservationFailed`. Default 0.7 (per the G29
  510|       |    /// gap specification). Ignored when `--operation` is not
  511|       |    /// `body-enrich`.
  512|       |    #[arg(long, value_name = "FLOAT", default_value_t = 0.7)]
  513|       |    pub preserve_threshold: f64,
  514|       |
  515|       |    /// G33 Passo 3: when set, validate `--codex-model` against the
  516|       |    /// ChatGPT Pro OAuth accepted-model list and abort with a
  517|       |    /// suggestion when the value is unknown. Default true (fail fast
  518|       |    /// to avoid burning OAuth turns). Set to false to opt out.
  519|       |    #[arg(long, default_value_t = true)]
  520|       |    pub codex_model_validate: bool,
  521|       |
  522|       |    /// G33 Passo 3: when set together with an invalid `--codex-model`,
  523|       |    /// automatically substitute the supplied default (e.g. `gpt-5.5`)
  524|       |    /// instead of aborting. The substitution is recorded in the NDJSON
  525|       |    /// stream as `provider_substituted: true` for traceability.
  526|       |    #[arg(long, value_name = "MODEL")]
  527|       |    pub codex_model_fallback: Option<String>,
  528|       |}
  529|       |
  530|       |// ---------------------------------------------------------------------------
  531|       |// Internal types — raw LLM output structs
  532|       |// ---------------------------------------------------------------------------
  533|       |
  534|       |// ---------------------------------------------------------------------------
  535|       |// NDJSON event types emitted to stdout
  536|       |// ---------------------------------------------------------------------------
  537|       |
  538|       |#[derive(Debug, Serialize)]
  539|       |struct PhaseEvent<'a> {
  540|       |    phase: &'a str,
  541|       |    #[serde(skip_serializing_if = "Option::is_none")]
  542|       |    binary_path: Option<&'a str>,
  543|       |    #[serde(skip_serializing_if = "Option::is_none")]
  544|       |    version: Option<&'a str>,
  545|       |    #[serde(skip_serializing_if = "Option::is_none")]
  546|       |    items_total: Option<usize>,
  547|       |    #[serde(skip_serializing_if = "Option::is_none")]
  548|       |    items_pending: Option<usize>,
  549|       |    /// Active parallel LLM worker count (1 = serial). Present only on the "scan" phase event.
  550|       |    #[serde(skip_serializing_if = "Option::is_none")]
  551|       |    llm_parallelism: Option<u32>,
  552|       |}
  553|       |
  554|       |#[derive(Debug, Serialize)]
  555|       |struct ItemEvent<'a> {
  556|       |    /// Item identifier (memory name or entity name).
  557|       |    item: &'a str,
  558|       |    status: &'a str,
  559|       |    #[serde(skip_serializing_if = "Option::is_none")]
  560|       |    memory_id: Option<i64>,
  561|       |    #[serde(skip_serializing_if = "Option::is_none")]
  562|       |    entity_id: Option<i64>,
  563|       |    #[serde(skip_serializing_if = "Option::is_none")]
  564|       |    entities: Option<usize>,
  565|       |    #[serde(skip_serializing_if = "Option::is_none")]
  566|       |    rels: Option<usize>,
  567|       |    #[serde(skip_serializing_if = "Option::is_none")]
  568|       |    chars_before: Option<usize>,
  569|       |    #[serde(skip_serializing_if = "Option::is_none")]
  570|       |    chars_after: Option<usize>,
  571|       |    #[serde(skip_serializing_if = "Option::is_none")]
  572|       |    cost_usd: Option<f64>,
  573|       |    #[serde(skip_serializing_if = "Option::is_none")]
  574|       |    elapsed_ms: Option<u64>,
  575|       |    #[serde(skip_serializing_if = "Option::is_none")]
  576|       |    error: Option<String>,
  577|       |    index: usize,
  578|       |    total: usize,
  579|       |}
  580|       |
  581|       |#[derive(Debug, Serialize)]
  582|       |struct EnrichSummary {
  583|       |    summary: bool,
  584|       |    operation: String,
  585|       |    items_total: usize,
  586|       |    completed: usize,
  587|       |    failed: usize,
  588|       |    skipped: usize,
  589|       |    cost_usd: f64,
  590|       |    elapsed_ms: u64,
  591|       |}
  592|       |
  593|       |use crate::output::emit_json_line as emit_json;
  594|       |
  595|       |// ---------------------------------------------------------------------------
  596|       |// Queue DB
  597|       |// ---------------------------------------------------------------------------
  598|       |
  599|       |/// Opens or creates the enrichment queue database.
  600|       |///
  601|       |/// The queue schema mirrors `ingest_claude` for resume/retry parity.
  602|       |/// Uses a different filename (`.enrich-queue.sqlite`) to avoid collision.
  603|       |///
  604|       |/// # DRY note
  605|       |///
  606|       |/// This is a near-verbatim copy of `open_queue_db` in `ingest_claude.rs`.
  607|       |/// Both should be unified in a shared `llm_runner.rs` module by the
  608|       |/// Integration stream.
  609|      1|fn open_queue_db(path: &str) -> Result<Connection, AppError> {
  610|      1|    let conn = Connection::open(path)?;
                                                   ^0
  611|      1|    conn.pragma_update(None, "journal_mode", "wal")?;
                                                                 ^0
  612|      1|    conn.execute_batch(
  613|      1|        "CREATE TABLE IF NOT EXISTS queue (
  614|      1|            id          INTEGER PRIMARY KEY AUTOINCREMENT,
  615|      1|            item_key    TEXT NOT NULL UNIQUE,
  616|      1|            item_type   TEXT NOT NULL DEFAULT 'memory',
  617|      1|            status      TEXT NOT NULL DEFAULT 'pending',
  618|      1|            memory_id   INTEGER,
  619|      1|            entity_id   INTEGER,
  620|      1|            entities    INTEGER DEFAULT 0,
  621|      1|            rels        INTEGER DEFAULT 0,
  622|      1|            error       TEXT,
  623|      1|            cost_usd    REAL DEFAULT 0.0,
  624|      1|            attempt     INTEGER DEFAULT 0,
  625|      1|            elapsed_ms  INTEGER,
  626|      1|            created_at  TEXT DEFAULT (datetime('now')),
  627|      1|            done_at     TEXT
  628|      1|        );
  629|      1|        CREATE INDEX IF NOT EXISTS idx_enrich_queue_status ON queue(status);",
  630|      0|    )?;
  631|      1|    Ok(conn)
  632|      1|}
  633|       |
  634|       |// ---------------------------------------------------------------------------
  635|       |// LLM invocation — Claude Code
  636|       |// ---------------------------------------------------------------------------
  637|       |
  638|       |/// Calls `claude -p` via the shared `claude_runner` module (G02).
  639|       |///
  640|       |/// Returns `(output_value, cost_usd, is_oauth)`.
  641|      0|fn call_claude(
  642|      0|    binary: &Path,
  643|      0|    prompt: &str,
  644|      0|    json_schema: &str,
  645|      0|    input_text: &str,
  646|      0|    model: Option<&str>,
  647|      0|    timeout_secs: u64,
  648|      0|) -> Result<(serde_json::Value, f64, bool), AppError> {
  649|      0|    let result = crate::commands::claude_runner::run_claude(
  650|      0|        binary,
  651|      0|        prompt,
  652|      0|        json_schema,
  653|      0|        input_text,
  654|      0|        model,
  655|      0|        timeout_secs,
  656|       |        7,
  657|      0|    )?;
  658|      0|    Ok((result.value, result.cost_usd, result.is_oauth))
  659|      0|}
  660|       |
  661|       |// ---------------------------------------------------------------------------
  662|       |// Preflight probe (G35) — single-turn ping to verify the LLM provider
  663|       |// ---------------------------------------------------------------------------
  664|       |
  665|       |/// Result of a single preflight ping (G35).
  666|       |enum PreflightOutcome {
  667|       |    /// The provider accepted the ping without rate-limit or other errors.
  668|       |    Healthy,
  669|       |    /// The provider rejected the ping due to OAuth rate limit. The
  670|       |    /// `suggestion` field is a human hint that callers can embed in the
  671|       |    /// user-facing error.
  672|       |    RateLimited {
  673|       |        reason: String,
  674|       |        suggestion: &'static str,
  675|       |    },
  676|       |    /// Any other provider error (binary missing, auth failure, etc.).
  677|       |    Error(AppError),
  678|       |}
  679|       |
  680|       |/// Probes the configured LLM provider with a 1-turn ping.
  681|       |///
  682|       |/// - Claude: `claude -p "ping" --max-turns 1 --strict-mcp-config --mcp-config '{}'`
  683|       |/// - Codex:  `codex exec -c mcp_servers='{}' "ping" --json`
  684|       |///
  685|       |/// The probe intentionally avoids spawning any MCP server children (G28-A)
  686|       |/// to keep its own process footprint at the minimum.
  687|      0|fn run_preflight_probe(args: &EnrichArgs) -> PreflightOutcome {
  688|      0|    let timeout = std::time::Duration::from_secs(args.rate_limit_buffer.max(60));
  689|       |
  690|      0|    match args.mode {
  691|       |        EnrichMode::ClaudeCode => {
  692|      0|            let bin = match find_claude_binary(args.claude_binary.as_deref()) {
  693|      0|                Ok(b) => b,
  694|      0|                Err(e) => return PreflightOutcome::Error(e),
  695|       |            };
  696|      0|            let mut cmd = std::process::Command::new(&bin);
  697|      0|            cmd.env_clear();
  698|      0|            for var in &["PATH", "HOME", "USER"] {
  699|      0|                if let Ok(val) = std::env::var(var) {
  700|      0|                    cmd.env(var, val);
  701|      0|                }
  702|       |            }
  703|      0|            cmd.arg("-p")
  704|      0|                .arg("ping")
  705|      0|                .arg("--max-turns")
  706|      0|                .arg("1")
  707|      0|                .arg("--strict-mcp-config")
  708|      0|                .arg("--mcp-config")
  709|      0|                .arg("{}")
  710|      0|                .arg("--dangerously-skip-permissions")
  711|      0|                .arg("--settings")
  712|      0|                .arg("{\"hooks\":{}}")
  713|      0|                .arg("--output-format")
  714|      0|                .arg("json")
  715|      0|                .stdin(std::process::Stdio::null())
  716|      0|                .stdout(std::process::Stdio::piped())
  717|      0|                .stderr(std::process::Stdio::piped());
  718|       |
  719|      0|            let child = match super::claude_runner::spawn_with_memory_limit(&mut cmd) {
  720|      0|                Ok(c) => c,
  721|      0|                Err(e) => {
  722|      0|                    return PreflightOutcome::Error(AppError::Io(e));
  723|       |                }
  724|       |            };
  725|      0|            let output = match wait_with_timeout(child, timeout) {
  726|      0|                Ok(out) => out,
  727|      0|                Err(e) => return PreflightOutcome::Error(e),
  728|       |            };
  729|      0|            if !output.status.success() {
  730|      0|                let stderr = String::from_utf8_lossy(&output.stderr);
  731|      0|                if stderr.contains("hit your session limit")
  732|      0|                    || stderr.contains("rate_limit")
  733|      0|                    || stderr.contains("429")
  734|       |                {
  735|      0|                    return PreflightOutcome::RateLimited {
  736|      0|                        reason: stderr.trim().to_string(),
  737|      0|                        suggestion:
  738|      0|                            "wait for the OAuth window to reset or use --fallback-mode codex",
  739|      0|                    };
  740|      0|                }
  741|      0|                return PreflightOutcome::Error(AppError::Validation(format!(
  742|      0|                    "preflight probe failed: {stderr}",
  743|      0|                    stderr = stderr.trim()
  744|      0|                )));
  745|      0|            }
  746|      0|            PreflightOutcome::Healthy
  747|       |        }
  748|       |        EnrichMode::Codex => {
  749|      0|            let bin = match find_codex_binary(args.codex_binary.as_deref()) {
  750|      0|                Ok(b) => b,
  751|      0|                Err(e) => return PreflightOutcome::Error(e),
  752|       |            };
  753|      0|            super::codex_spawn::validate_codex_model(args.codex_model.as_deref())
  754|      0|                .map_err(PreflightOutcome::Error)
  755|      0|                .ok();
  756|      0|            let schema = "{}";
  757|      0|            let schema_path = match super::codex_spawn::trusted_schema_path() {
  758|      0|                Ok(p) => p,
  759|      0|                Err(e) => return PreflightOutcome::Error(e),
  760|       |            };
  761|      0|            let spawn_args = super::codex_spawn::CodexSpawnArgs {
  762|      0|                binary: &bin,
  763|      0|                prompt: "ping",
  764|      0|                json_schema: schema,
  765|      0|                input_text: "",
  766|      0|                model: args.codex_model.as_deref(),
  767|      0|                timeout_secs: args.rate_limit_buffer.max(60),
  768|      0|                schema_path: schema_path.clone(),
  769|      0|            };
  770|      0|            let mut cmd = super::codex_spawn::build_codex_command(&spawn_args);
  771|      0|            let child = match super::claude_runner::spawn_with_memory_limit(&mut cmd) {
  772|      0|                Ok(c) => c,
  773|      0|                Err(e) => return PreflightOutcome::Error(AppError::Io(e)),
  774|       |            };
  775|      0|            let output = match wait_with_timeout(child, timeout) {
  776|      0|                Ok(out) => out,
  777|      0|                Err(e) => return PreflightOutcome::Error(e),
  778|       |            };
  779|      0|            let _ = std::fs::remove_file(&schema_path);
  780|      0|            if !output.status.success() {
  781|      0|                let stderr = String::from_utf8_lossy(&output.stderr);
  782|      0|                if stderr.contains("rate_limit")
  783|      0|                    || stderr.contains("429")
  784|      0|                    || stderr.contains("Too Many Requests")
  785|       |                {
  786|      0|                    return PreflightOutcome::RateLimited {
  787|      0|                        reason: stderr.trim().to_string(),
  788|      0|                        suggestion: "wait for the rate-limit window to reset",
  789|      0|                    };
  790|      0|                }
  791|      0|                return PreflightOutcome::Error(AppError::Validation(format!(
  792|      0|                    "preflight probe failed: {stderr}",
  793|      0|                    stderr = stderr.trim()
  794|      0|                )));
  795|      0|            }
  796|      0|            PreflightOutcome::Healthy
  797|       |        }
  798|       |    }
  799|      0|}
  800|       |
  801|       |/// Cross-platform wait with timeout (no extra crate dependency).
  802|      0|fn wait_with_timeout(
  803|      0|    mut child: std::process::Child,
  804|      0|    timeout: std::time::Duration,
  805|      0|) -> Result<std::process::Output, AppError> {
  806|       |    use wait_timeout::ChildExt;
  807|      0|    let start = std::time::Instant::now();
  808|      0|    let status = child.wait_timeout(timeout).map_err(AppError::Io)?;
  809|      0|    if status.is_none() {
  810|      0|        let _ = child.kill();
  811|      0|        let _ = child.wait();
  812|      0|        return Err(AppError::Validation(format!(
  813|      0|            "preflight probe timed out after {}s",
  814|      0|            start.elapsed().as_secs()
  815|      0|        )));
  816|      0|    }
  817|      0|    let mut stdout = Vec::new();
  818|      0|    if let Some(mut out) = child.stdout.take() {
  819|      0|        std::io::Read::read_to_end(&mut out, &mut stdout).map_err(AppError::Io)?;
  820|      0|    }
  821|      0|    let mut stderr = Vec::new();
  822|      0|    if let Some(mut err) = child.stderr.take() {
  823|      0|        std::io::Read::read_to_end(&mut err, &mut stderr).map_err(AppError::Io)?;
  824|      0|    }
  825|      0|    let exit = status.unwrap();
  826|      0|    Ok(std::process::Output {
  827|      0|        status: exit,
  828|      0|        stdout,
  829|      0|        stderr,
  830|      0|    })
  831|      0|}
  832|       |
  833|       |// ---------------------------------------------------------------------------
  834|       |// SCAN helpers — SQL queries that find items needing enrichment
  835|       |// ---------------------------------------------------------------------------
  836|       |
  837|       |/// Returns memories without any `memory_entities` binding.
  838|       |///
  839|       |/// These are the targets for `memory-bindings` enrichment. When `name_filter`
  840|       |/// is non-empty, restricts the scan to the given names (G37); unknown names
  841|       |/// are silently skipped (the caller can detect them by comparing
  842|       |/// requested vs. returned).
  843|      2|fn scan_unbound_memories(
  844|      2|    conn: &Connection,
  845|      2|    namespace: &str,
  846|      2|    limit: Option<usize>,
  847|      2|    name_filter: &[String],
  848|      2|) -> Result<Vec<(i64, String, String)>, AppError> {
  849|      2|    let limit_clause = limit.map(|n| format!("LIMIT {n}")).unwrap_or_default();
                                                           ^0
  850|       |
  851|      2|    if name_filter.is_empty() {
  852|      2|        let sql = format!(
  853|      2|            "SELECT m.id, m.name, m.body
  854|      2|             FROM memories m
  855|      2|             WHERE m.namespace = ?1
  856|      2|               AND m.deleted_at IS NULL
  857|      2|               AND NOT EXISTS (
  858|      2|                   SELECT 1 FROM memory_entities me WHERE me.memory_id = m.id
  859|      2|               )
  860|      2|             ORDER BY m.id
  861|      2|             {limit_clause}"
  862|       |        );
  863|      2|        let mut stmt = conn.prepare(&sql)?;
                                                       ^0
  864|      2|        let rows = stmt
  865|      2|            .query_map(rusqlite::params![namespace], |r| {
                                                                       ^1
  866|       |                Ok((
  867|      1|                    r.get::<_, i64>(0)?,
                                                    ^0
  868|      1|                    r.get::<_, String>(1)?,
                                                       ^0
  869|      1|                    r.get::<_, String>(2)?,
                                                       ^0
  870|       |                ))
  871|      1|            })?
                            ^0
  872|      2|            .collect::<Result<Vec<_>, _>>()?;
                                                         ^0
  873|      2|        Ok(rows)
  874|       |    } else {
  875|       |        // Build a parameterised IN clause: ?2, ?3, ..., ?{1+n}
  876|      0|        let placeholders: Vec<String> = (2..=name_filter.len() + 1)
  877|      0|            .map(|i| format!("?{i}"))
  878|      0|            .collect();
  879|      0|        let in_clause = placeholders.join(", ");
  880|      0|        let sql = format!(
  881|      0|            "SELECT m.id, m.name, m.body
  882|      0|             FROM memories m
  883|      0|             WHERE m.namespace = ?1
  884|      0|               AND m.deleted_at IS NULL
  885|      0|               AND m.name IN ({in_clause})
  886|      0|               AND NOT EXISTS (
  887|      0|                   SELECT 1 FROM memory_entities me WHERE me.memory_id = m.id
  888|      0|               )
  889|      0|             ORDER BY m.id
  890|      0|             {limit_clause}"
  891|       |        );
  892|      0|        let mut params_vec: Vec<&dyn rusqlite::ToSql> = Vec::with_capacity(1 + name_filter.len());
  893|      0|        params_vec.push(&namespace);
  894|      0|        for n in name_filter {
  895|      0|            params_vec.push(n);
  896|      0|        }
  897|      0|        let mut stmt = conn.prepare(&sql)?;
  898|      0|        let rows = stmt
  899|      0|            .query_map(
  900|      0|                rusqlite::params_from_iter(params_vec.iter().copied()),
  901|      0|                |r| {
  902|       |                    Ok((
  903|      0|                        r.get::<_, i64>(0)?,
  904|      0|                        r.get::<_, String>(1)?,
  905|      0|                        r.get::<_, String>(2)?,
  906|       |                    ))
  907|      0|                },
  908|      0|            )?
  909|      0|            .collect::<Result<Vec<_>, _>>()?;
  910|      0|        Ok(rows)
  911|       |    }
  912|      2|}
  913|       |
  914|       |/// Reads a list of memory names from a UTF-8 text file (G37).
  915|       |///
  916|       |/// Empty lines and lines beginning with `#` are skipped. Returns a
  917|       |/// de-duplicated, order-preserving list of trimmed names.
  918|      0|fn read_names_file(path: &Path) -> Result<Vec<String>, AppError> {
  919|      0|    let content = std::fs::read_to_string(path).map_err(|e| {
  920|      0|        AppError::Validation(format!("failed to read names file {}: {e}", path.display()))
  921|      0|    })?;
  922|      0|    let mut seen = std::collections::HashSet::new();
  923|      0|    let mut out = Vec::new();
  924|      0|    for line in content.lines() {
  925|      0|        let trimmed = line.trim();
  926|      0|        if trimmed.is_empty() || trimmed.starts_with('#') {
  927|      0|            continue;
  928|      0|        }
  929|      0|        if seen.insert(trimmed.to_string()) {
  930|      0|            out.push(trimmed.to_string());
  931|      0|        }
  932|       |    }
  933|      0|    Ok(out)
  934|      0|}
  935|       |
  936|       |/// Resolves the union of `--names` and `--names-file` (G37).
  937|      0|fn resolve_name_filter(args: &EnrichArgs) -> Result<Vec<String>, AppError> {
  938|      0|    let mut combined: Vec<String> = args.names.clone();
  939|      0|    if let Some(p) = &args.names_file {
  940|      0|        let from_file = read_names_file(p)?;
  941|      0|        for n in from_file {
  942|      0|            if !combined.contains(&n) {
  943|      0|                combined.push(n);
  944|      0|            }
  945|       |        }
  946|      0|    }
  947|      0|    Ok(combined)
  948|      0|}
  949|       |
  950|       |/// Returns entities with NULL or empty description.
  951|       |///
  952|       |/// These are the targets for `entity-descriptions` enrichment.
  953|      2|fn scan_entities_without_description(
  954|      2|    conn: &Connection,
  955|      2|    namespace: &str,
  956|      2|    limit: Option<usize>,
  957|      2|) -> Result<Vec<(i64, String, String)>, AppError> {
  958|      2|    let limit_clause = limit.map(|n| format!("LIMIT {n}")).unwrap_or_default();
                                                           ^0
  959|      2|    let sql = format!(
  960|      2|        "SELECT id, name, type
  961|      2|         FROM entities
  962|      2|         WHERE namespace = ?1
  963|      2|           AND (description IS NULL OR description = '')
  964|      2|         ORDER BY id
  965|      2|         {limit_clause}"
  966|       |    );
  967|      2|    let mut stmt = conn.prepare(&sql)?;
                                                   ^0
  968|      2|    let rows = stmt
  969|      2|        .query_map(rusqlite::params![namespace], |r| {
                                                                   ^1
  970|       |            Ok((
  971|      1|                r.get::<_, i64>(0)?,
                                                ^0
  972|      1|                r.get::<_, String>(1)?,
                                                   ^0
  973|      1|                r.get::<_, String>(2)?,
                                                   ^0
  974|       |            ))
  975|      1|        })?
                        ^0
  976|      2|        .collect::<Result<Vec<_>, _>>()?;
                                                     ^0
  977|      2|    Ok(rows)
  978|      2|}
  979|       |
  980|       |/// Returns memories whose body length is below the configured minimum.
  981|       |///
  982|       |/// These are the targets for `body-enrich` (GAP-18).
  983|      4|fn scan_short_body_memories(
  984|      4|    conn: &Connection,
  985|      4|    namespace: &str,
  986|      4|    min_chars: usize,
  987|      4|    limit: Option<usize>,
  988|      4|) -> Result<Vec<(i64, String, String)>, AppError> {
  989|      4|    let limit_clause = limit.map(|n| format!("LIMIT {n}")).unwrap_or_default();
                                                           ^1
  990|      4|    let sql = format!(
  991|      4|        "SELECT m.id, m.name, m.body
  992|      4|         FROM memories m
  993|      4|         WHERE m.namespace = ?1
  994|      4|           AND m.deleted_at IS NULL
  995|      4|           AND LENGTH(COALESCE(m.body,'')) < ?2
  996|      4|         ORDER BY m.id
  997|      4|         {limit_clause}"
  998|       |    );
  999|      4|    let mut stmt = conn.prepare(&sql)?;
                                                   ^0
 1000|      4|    let rows = stmt
 1001|      5|        .query_map(rusqlite::params![namespace, min_chars as i64], |r| {
                       ^4        ^4                           ^4
 1002|       |            Ok((
 1003|      5|                r.get::<_, i64>(0)?,
                                                ^0
 1004|      5|                r.get::<_, String>(1)?,
                                                   ^0
 1005|      5|                r.get::<_, String>(2)?,
                                                   ^0
 1006|       |            ))
 1007|      5|        })?
                        ^0
 1008|      4|        .collect::<Result<Vec<_>, _>>()?;
                                                     ^0
 1009|      4|    Ok(rows)
 1010|      4|}
 1011|       |
 1012|       |/// G27: Returns relationships with weight >= 0.7 that may need recalibration.
 1013|       |#[allow(clippy::type_complexity)]
 1014|      0|fn scan_weight_candidates(
 1015|      0|    conn: &Connection,
 1016|      0|    namespace: &str,
 1017|      0|    limit: Option<usize>,
 1018|      0|) -> Result<Vec<(i64, String, String, String, f64)>, AppError> {
 1019|      0|    let limit_clause = limit.map(|n| format!("LIMIT {n}")).unwrap_or_default();
 1020|      0|    let sql = format!(
 1021|      0|        "SELECT r.id, e1.name, e2.name, r.relation, r.weight \
 1022|      0|         FROM relationships r \
 1023|      0|         JOIN entities e1 ON e1.id = r.source_id \
 1024|      0|         JOIN entities e2 ON e2.id = r.target_id \
 1025|      0|         WHERE r.weight >= 0.7 AND e1.namespace = ?1 \
 1026|      0|         ORDER BY r.weight DESC {limit_clause}"
 1027|       |    );
 1028|      0|    let mut stmt = conn.prepare(&sql)?;
 1029|      0|    let rows = stmt
 1030|      0|        .query_map(rusqlite::params![namespace], |r| {
 1031|       |            Ok((
 1032|      0|                r.get::<_, i64>(0)?,
 1033|      0|                r.get::<_, String>(1)?,
 1034|      0|                r.get::<_, String>(2)?,
 1035|      0|                r.get::<_, String>(3)?,
 1036|      0|                r.get::<_, f64>(4)?,
 1037|       |            ))
 1038|      0|        })?
 1039|      0|        .collect::<Result<Vec<_>, _>>()?;
 1040|      0|    Ok(rows)
 1041|      0|}
 1042|       |
 1043|       |/// G27: Returns relationships with generic relation types (applies_to).
 1044|      0|fn scan_generic_relations(
 1045|      0|    conn: &Connection,
 1046|      0|    namespace: &str,
 1047|      0|    limit: Option<usize>,
 1048|      0|) -> Result<Vec<(i64, String, String, String)>, AppError> {
 1049|      0|    let limit_clause = limit.map(|n| format!("LIMIT {n}")).unwrap_or_default();
 1050|      0|    let sql = format!(
 1051|      0|        "SELECT r.id, e1.name, e2.name, r.relation \
 1052|      0|         FROM relationships r \
 1053|      0|         JOIN entities e1 ON e1.id = r.source_id \
 1054|      0|         JOIN entities e2 ON e2.id = r.target_id \
 1055|      0|         WHERE r.relation = 'applies_to' AND e1.namespace = ?1 \
 1056|      0|         ORDER BY r.id {limit_clause}"
 1057|       |    );
 1058|      0|    let mut stmt = conn.prepare(&sql)?;
 1059|      0|    let rows = stmt
 1060|      0|        .query_map(rusqlite::params![namespace], |r| {
 1061|       |            Ok((
 1062|      0|                r.get::<_, i64>(0)?,
 1063|      0|                r.get::<_, String>(1)?,
 1064|      0|                r.get::<_, String>(2)?,
 1065|      0|                r.get::<_, String>(3)?,
 1066|       |            ))
 1067|      0|        })?
 1068|      0|        .collect::<Result<Vec<_>, _>>()?;
 1069|      0|    Ok(rows)
 1070|      0|}
 1071|       |
 1072|       |// ---------------------------------------------------------------------------
 1073|       |// PERSIST helpers for fully-implemented operations
 1074|       |// ---------------------------------------------------------------------------
 1075|       |
 1076|       |/// Persists entity bindings extracted by the LLM for a memory.
 1077|       |///
 1078|       |/// Creates entities via `upsert_entity`, links them to the memory via
 1079|       |/// `link_memory_entity`, and upserts relationships found between entities.
 1080|      0|fn persist_memory_bindings(
 1081|      0|    conn: &Connection,
 1082|      0|    namespace: &str,
 1083|      0|    memory_id: i64,
 1084|      0|    entities_json: &serde_json::Value,
 1085|      0|    rels_json: &serde_json::Value,
 1086|      0|) -> Result<(usize, usize), AppError> {
 1087|       |    #[derive(Deserialize)]
 1088|       |    struct EntityItem {
 1089|       |        name: String,
 1090|       |        entity_type: String,
 1091|       |    }
 1092|       |    #[derive(Deserialize)]
 1093|       |    struct RelItem {
 1094|       |        source: String,
 1095|       |        target: String,
 1096|       |        relation: String,
 1097|       |        strength: f64,
 1098|       |    }
 1099|       |
 1100|      0|    let extracted_entities: Vec<EntityItem> = serde_json::from_value(entities_json.clone())
 1101|      0|        .map_err(|e| AppError::Validation(format!("failed to parse entities array: {e}")))?;
 1102|       |
 1103|      0|    let extracted_rels: Vec<RelItem> = serde_json::from_value(rels_json.clone())
 1104|      0|        .map_err(|e| AppError::Validation(format!("failed to parse relationships array: {e}")))?;
 1105|       |
 1106|      0|    let mut ent_count = 0usize;
 1107|      0|    let mut rel_count = 0usize;
 1108|       |
 1109|      0|    for item in &extracted_entities {
 1110|      0|        let entity_type = match item.entity_type.parse::<EntityType>() {
 1111|      0|            Ok(et) => et,
 1112|       |            Err(_) => {
 1113|      0|                tracing::warn!(
 1114|       |                    target: "enrich",
 1115|       |                    entity = %item.name,
 1116|       |                    entity_type = %item.entity_type,
 1117|      0|                    "entity type not recognized, skipping"
 1118|       |                );
 1119|      0|                continue;
 1120|       |            }
 1121|       |        };
 1122|      0|        match entities::upsert_entity(
 1123|      0|            conn,
 1124|      0|            namespace,
 1125|      0|            &NewEntity {
 1126|      0|                name: item.name.clone(),
 1127|      0|                entity_type,
 1128|      0|                description: None,
 1129|      0|            },
 1130|      0|        ) {
 1131|      0|            Ok(eid) => {
 1132|      0|                let _ = entities::link_memory_entity(conn, memory_id, eid);
 1133|      0|                ent_count += 1;
 1134|      0|            }
 1135|      0|            Err(e) => {
 1136|      0|                tracing::warn!(
 1137|       |                    target: "enrich",
 1138|       |                    entity = %item.name,
 1139|       |                    error = %e,
 1140|      0|                    "entity upsert skipped"
 1141|       |                );
 1142|       |            }
 1143|       |        }
 1144|       |    }
 1145|       |
 1146|      0|    for rel in &extracted_rels {
 1147|      0|        let normalized = crate::parsers::normalize_relation(&rel.relation);
 1148|      0|        crate::parsers::warn_if_non_canonical(&normalized);
 1149|       |
 1150|       |        // Normalize entity names before lookup: upsert_entity normalizes on write,
 1151|       |        // so the lookup must use the same normalized form to find the row.
 1152|      0|        let src_name = crate::parsers::normalize_entity_name(&rel.source);
 1153|      0|        let tgt_name = crate::parsers::normalize_entity_name(&rel.target);
 1154|      0|        let src_id = entities::find_entity_id(conn, namespace, &src_name);
 1155|      0|        let tgt_id = entities::find_entity_id(conn, namespace, &tgt_name);
 1156|      0|        if let (Ok(Some(sid)), Ok(Some(tid))) = (src_id, tgt_id) {
 1157|      0|            let new_rel = NewRelationship {
 1158|      0|                source: rel.source.clone(),
 1159|      0|                target: rel.target.clone(),
 1160|      0|                relation: normalized,
 1161|      0|                strength: rel.strength,
 1162|      0|                description: None,
 1163|      0|            };
 1164|      0|            if entities::upsert_relationship(conn, namespace, sid, tid, &new_rel).is_ok() {
 1165|      0|                rel_count += 1;
 1166|      0|            }
 1167|      0|        }
 1168|       |    }
 1169|       |
 1170|      0|    Ok((ent_count, rel_count))
 1171|      0|}
 1172|       |
 1173|       |/// Updates an entity's description directly in the `entities` table.
 1174|      1|fn persist_entity_description(
 1175|      1|    conn: &Connection,
 1176|      1|    entity_id: i64,
 1177|      1|    description: &str,
 1178|      1|) -> Result<(), AppError> {
 1179|      1|    conn.execute(
 1180|      1|        "UPDATE entities SET description = ?1, updated_at = unixepoch() WHERE id = ?2",
 1181|      1|        rusqlite::params![description, entity_id],
 1182|      0|    )?;
 1183|      1|    Ok(())
 1184|      1|}
 1185|       |
 1186|       |/// Persists an enriched memory body (body-enrich, GAP-18).
 1187|       |///
 1188|       |/// Uses `memories::update` to set the new body and `sync_fts_after_update`
 1189|       |/// to keep FTS5 in sync. Also re-embeds the memory for recall accuracy.
 1190|      0|fn persist_enriched_body(
 1191|      0|    conn: &Connection,
 1192|      0|    namespace: &str,
 1193|      0|    memory_id: i64,
 1194|      0|    memory_name: &str,
 1195|      0|    new_body: &str,
 1196|      0|    paths: &crate::paths::AppPaths,
 1197|      0|) -> Result<(), AppError> {
 1198|       |    // Read current values for FTS sync
 1199|      0|    let (old_name, old_desc, old_body): (String, String, String) = conn.query_row(
 1200|      0|        "SELECT name, COALESCE(description,''), COALESCE(body,'') FROM memories WHERE id=?1",
 1201|      0|        rusqlite::params![memory_id],
 1202|      0|        |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?)),
 1203|      0|    )?;
 1204|       |
 1205|      0|    let memory_type: String = conn.query_row(
 1206|      0|        "SELECT type FROM memories WHERE id=?1",
 1207|      0|        rusqlite::params![memory_id],
 1208|      0|        |r| r.get(0),
 1209|      0|    )?;
 1210|       |
 1211|      0|    let description: String = conn.query_row(
 1212|      0|        "SELECT COALESCE(description,'') FROM memories WHERE id=?1",
 1213|      0|        rusqlite::params![memory_id],
 1214|      0|        |r| r.get(0),
 1215|      0|    )?;
 1216|       |
 1217|      0|    let body_hash = blake3::hash(new_body.as_bytes()).to_hex().to_string();
 1218|       |
 1219|      0|    let new_memory = memories::NewMemory {
 1220|      0|        namespace: namespace.to_string(),
 1221|      0|        name: memory_name.to_string(),
 1222|      0|        memory_type: memory_type.clone(),
 1223|      0|        description: description.clone(),
 1224|      0|        body: new_body.to_string(),
 1225|      0|        body_hash,
 1226|      0|        session_id: None,
 1227|      0|        source: "agent".to_string(),
 1228|      0|        metadata: serde_json::json!({
 1229|      0|            "operation": "body-enrich",
 1230|      0|            "orig_chars": old_body.chars().count(),
 1231|      0|            "new_chars": new_body.chars().count(),
 1232|      0|        }),
 1233|      0|    };
 1234|       |
 1235|       |    // G29 audit: insert a new immutable version BEFORE the update so the
 1236|       |    // enriched body is reachable through `history --name <X>` and
 1237|       |    // `restore --version N` can roll back to the pre-enrich state.
 1238|      0|    let next_version = crate::storage::versions::next_version(conn, memory_id)?;
 1239|      0|    let version_metadata = serde_json::json!({
 1240|      0|        "operation": "body-enrich",
 1241|      0|        "orig_chars": old_body.chars().count(),
 1242|      0|        "new_chars": new_body.chars().count(),
 1243|       |    })
 1244|      0|    .to_string();
 1245|      0|    crate::storage::versions::insert_version(
 1246|      0|        conn,
 1247|      0|        memory_id,
 1248|      0|        next_version,
 1249|      0|        memory_name,
 1250|      0|        &memory_type,
 1251|      0|        &description,
 1252|      0|        new_body,
 1253|      0|        &version_metadata,
 1254|      0|        Some("enrich"),
 1255|      0|        "edit",
 1256|      0|    )?;
 1257|       |
 1258|      0|    memories::update(conn, memory_id, &new_memory, None)?;
 1259|      0|    memories::sync_fts_after_update(
 1260|      0|        conn,
 1261|      0|        memory_id,
 1262|      0|        &old_name,
 1263|      0|        &old_desc,
 1264|      0|        &old_body,
 1265|      0|        &new_memory.name,
 1266|      0|        &new_memory.description,
 1267|      0|        &new_memory.body,
 1268|      0|    )?;
 1269|       |
 1270|       |    // Re-embed for recall accuracy
 1271|      0|    let snippet: String = new_body.chars().take(200).collect();
 1272|      0|    let tokenizer = crate::tokenizer::get_tokenizer(&paths.models)?;
 1273|      0|    let chunks_info = crate::chunking::split_into_chunks_hierarchical(new_body, tokenizer);
 1274|      0|    let embedding_result = if chunks_info.len() <= 1 {
 1275|      0|        crate::daemon::embed_passage_or_local(&paths.models, new_body)
 1276|       |    } else {
 1277|      0|        let mut chunk_embeddings: Vec<Vec<f32>> = Vec::with_capacity(chunks_info.len());
 1278|      0|        let mut ok = true;
 1279|      0|        for chunk in &chunks_info {
 1280|      0|            let text = crate::chunking::chunk_text(new_body, chunk);
 1281|      0|            match crate::daemon::embed_passage_or_local(&paths.models, text) {
 1282|      0|                Ok(emb) => chunk_embeddings.push(emb),
 1283|      0|                Err(e) => {
 1284|      0|                    tracing::warn!(target: "enrich", error = %e, "chunk embedding failed");
 1285|      0|                    ok = false;
 1286|      0|                    break;
 1287|       |                }
 1288|       |            }
 1289|       |        }
 1290|      0|        if ok {
 1291|      0|            Ok(crate::chunking::aggregate_embeddings(&chunk_embeddings))
 1292|       |        } else {
 1293|      0|            crate::daemon::embed_passage_or_local(&paths.models, new_body)
 1294|       |        }
 1295|       |    };
 1296|       |
 1297|      0|    if let Ok(embedding) = embedding_result {
 1298|      0|        if let Err(e) = memories::upsert_vec(
 1299|      0|            conn,
 1300|      0|            memory_id,
 1301|      0|            namespace,
 1302|      0|            &memory_type,
 1303|      0|            &embedding,
 1304|      0|            memory_name,
 1305|      0|            &snippet,
 1306|      0|        ) {
 1307|      0|            tracing::warn!(target: "enrich", memory = %memory_name, error = %e, "vec upsert failed after body-enrich");
 1308|      0|        }
 1309|      0|    }
 1310|       |
 1311|      0|    Ok(())
 1312|      0|}
 1313|       |
 1314|       |// ---------------------------------------------------------------------------
 1315|       |// Main entry point
 1316|       |// ---------------------------------------------------------------------------
 1317|       |
 1318|       |/// Main entry point for the `enrich` command.
 1319|      0|pub fn run(args: &EnrichArgs) -> Result<(), AppError> {
 1320|       |    // TODO(G20): add mode-conditional flag validation before DB access.
 1321|       |    // Flags that are silently discarded when the wrong mode is active:
 1322|       |    //   --mode claude-code: codex_binary, codex_model, codex_timeout
 1323|       |    //   --mode codex:       claude_binary, claude_model, claude_timeout,
 1324|       |    //                       max_cost_usd, rate_limit_wait
 1325|       |    // Approach: check each non-default flag value early and return
 1326|       |    // Err(AppError::Validation(...)) for incompatible mode+flag combinations.
 1327|      0|    let started = Instant::now();
 1328|       |
 1329|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
 1330|      0|    ensure_db_ready(&paths)?;
 1331|      0|    let conn = open_rw(&paths.db)?;
 1332|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
 1333|       |
 1334|       |    // G28-B (v1.0.68) + G30 (v1.0.69): enforce singleton per
 1335|       |    // (job_type, namespace, db_hash) so two parallel `enrich` invocations
 1336|       |    // on the same DB cannot co-exist, but concurrent enrich on different
 1337|       |    // databases works as expected. The force flag (--force) breaks a
 1338|       |    // stale lock from a previously crashed invocation.
 1339|      0|    let wait_secs = args.wait_job_singleton;
 1340|      0|    let force_flag = args.force_job_singleton;
 1341|      0|    let _singleton = crate::lock::acquire_job_singleton(
 1342|      0|        crate::lock::JobType::Enrich,
 1343|      0|        &namespace,
 1344|      0|        &paths.db,
 1345|      0|        wait_secs,
 1346|      0|        force_flag,
 1347|      0|    )?;
 1348|       |
 1349|       |    // Validate provider binary upfront
 1350|      0|    let _effective_mode: EnrichMode = args.mode.clone();
 1351|      0|    let provider_binary = match args.mode {
 1352|       |        EnrichMode::ClaudeCode => {
 1353|      0|            let bin = find_claude_binary(args.claude_binary.as_deref())?;
 1354|      0|            let version = super::claude_runner::validate_claude_version(&bin)?;
 1355|      0|            tracing::info!(target: "enrich", binary = %bin.display(), version = %version, "Claude Code binary validated");
 1356|      0|            emit_json(&PhaseEvent {
 1357|      0|                phase: "validate",
 1358|      0|                binary_path: bin.to_str(),
 1359|      0|                version: Some(&version),
 1360|      0|                items_total: None,
 1361|      0|                items_pending: None,
 1362|      0|                llm_parallelism: None,
 1363|      0|            });
 1364|      0|            bin
 1365|       |        }
 1366|       |        EnrichMode::Codex => {
 1367|       |            // Codex provider: locate binary using env or PATH
 1368|      0|            let bin = find_codex_binary(args.codex_binary.as_deref())?;
 1369|      0|            emit_json(&PhaseEvent {
 1370|      0|                phase: "validate",
 1371|      0|                binary_path: bin.to_str(),
 1372|      0|                version: None,
 1373|      0|                items_total: None,
 1374|      0|                items_pending: None,
 1375|      0|                llm_parallelism: None,
 1376|      0|            });
 1377|      0|            bin
 1378|       |        }
 1379|       |    };
 1380|       |
 1381|       |    // G28-D: refuse to start when the system is saturated. This check
 1382|       |    // is BEFORE preflight so we never spend an OAuth turn on a host
 1383|       |    // that is already at the limit.
 1384|      0|    if args.max_load_check && !args.dry_run && crate::system_load::is_system_saturated() {
 1385|      0|        let load = crate::system_load::load_average_one();
 1386|      0|        let n = crate::system_load::ncpus();
 1387|      0|        return Err(AppError::Validation(format!(
 1388|      0|            "system load average {load:.2} exceeds 2x ncpus ({n}); \
 1389|      0|             pass --no-max-load-check to override (not recommended)"
 1390|      0|        )));
 1391|      0|    }
 1392|       |
 1393|       |    // G35: preflight probe — issue a single ping turn to verify the
 1394|       |    // provider is healthy before scanning N candidates. If the probe
 1395|       |    // fails with a rate-limit error, optionally fall back to a
 1396|       |    // different mode (typically codex) instead of failing the entire
 1397|       |    // batch. The probe itself consumes 1 OAuth turn, so it stays
 1398|       |    // opt-in (default off) to keep --dry-run and CI flows zero-cost.
 1399|      0|    if args.preflight_check && !args.dry_run {
 1400|      0|        let preflight_result = run_preflight_probe(args);
 1401|      0|        match preflight_result {
 1402|       |            PreflightOutcome::Healthy => {
 1403|      0|                tracing::info!(target: "enrich", mode = ?args.mode, "preflight probe healthy");
 1404|       |            }
 1405|      0|            PreflightOutcome::RateLimited { reason, suggestion } => {
 1406|      0|                if let Some(fallback) = args.fallback_mode.clone() {
 1407|      0|                    if fallback != args.mode {
 1408|       |                        // G35 (v1.0.69): the mid-batch mode switch is
 1409|       |                        // intentionally NOT applied because it would
 1410|       |                        // desynchronise the per-item rate-limit wait
 1411|       |                        // state (rate-limited items in the worker are
 1412|       |                        // timed against the original provider). Instead
 1413|       |                        // we abort cleanly so the operator can re-invoke
 1414|       |                        // with `--mode {fallback:?}`. This guarantees no
 1415|       |                        // OAuth window is wasted and no partial state
 1416|       |                        // is left in the queue.
 1417|      0|                        return Err(AppError::Validation(format!(
 1418|      0|                            "preflight detected rate limit on {mode:?}: {reason}; \
 1419|      0|                             re-invoke with `--mode {fallback:?}` to use the fallback provider",
 1420|      0|                            mode = args.mode
 1421|      0|                        )));
 1422|      0|                    }
 1423|      0|                    return Err(AppError::Validation(format!(
 1424|      0|                        "preflight detected rate limit on {mode:?}: {reason}; \
 1425|      0|                         --fallback-mode matches --mode, no recovery possible",
 1426|      0|                        mode = args.mode
 1427|      0|                    )));
 1428|      0|                }
 1429|      0|                return Err(AppError::Validation(format!(
 1430|      0|                    "preflight detected rate limit on {mode:?}: {reason}; \
 1431|      0|                     {suggestion}; pass --fallback-mode codex to recover",
 1432|      0|                    mode = args.mode
 1433|      0|                )));
 1434|       |            }
 1435|      0|            PreflightOutcome::Error(e) => {
 1436|      0|                return Err(e);
 1437|       |            }
 1438|       |        }
 1439|      0|    }
 1440|       |
 1441|       |    // SCAN phase
 1442|      0|    let scan_result = scan_operation(&conn, &namespace, args)?;
 1443|      0|    let total = scan_result.len();
 1444|       |
 1445|      0|    emit_json(&PhaseEvent {
 1446|      0|        phase: "scan",
 1447|      0|        binary_path: None,
 1448|      0|        version: None,
 1449|      0|        items_total: Some(total),
 1450|      0|        items_pending: Some(total),
 1451|      0|        llm_parallelism: Some(args.llm_parallelism),
 1452|      0|    });
 1453|       |
 1454|       |    // Dry-run: emit preview events and summary without calling LLM
 1455|      0|    if args.dry_run {
 1456|      0|        for (idx, key) in scan_result.iter().enumerate() {
 1457|      0|            emit_json(&ItemEvent {
 1458|      0|                item: key,
 1459|      0|                status: "preview",
 1460|      0|                memory_id: None,
 1461|      0|                entity_id: None,
 1462|      0|                entities: None,
 1463|      0|                rels: None,
 1464|      0|                chars_before: None,
 1465|      0|                chars_after: None,
 1466|      0|                cost_usd: None,
 1467|      0|                elapsed_ms: None,
 1468|      0|                error: None,
 1469|      0|                index: idx,
 1470|      0|                total,
 1471|      0|            });
 1472|      0|        }
 1473|      0|        emit_json(&EnrichSummary {
 1474|      0|            summary: true,
 1475|      0|            operation: format!("{:?}", args.operation),
 1476|      0|            items_total: total,
 1477|      0|            completed: 0,
 1478|      0|            failed: 0,
 1479|      0|            skipped: 0,
 1480|      0|            cost_usd: 0.0,
 1481|      0|            elapsed_ms: started.elapsed().as_millis() as u64,
 1482|      0|        });
 1483|      0|        return Ok(());
 1484|      0|    }
 1485|       |
 1486|       |    // All 13 operations are now implemented (G27 complete).
 1487|       |
 1488|       |    // Queue setup for resume/retry
 1489|      0|    let queue_conn = open_queue_db(DEFAULT_QUEUE_DB)?;
 1490|       |
 1491|      0|    if args.resume {
 1492|      0|        let reset = queue_conn
 1493|      0|            .execute(
 1494|      0|                "UPDATE queue SET status='pending' WHERE status='processing'",
 1495|      0|                [],
 1496|       |            )
 1497|      0|            .map_err(|e| AppError::Validation(format!("queue resume failed: {e}")))?;
 1498|      0|        if reset > 0 {
 1499|      0|            tracing::info!(target: "enrich", count = reset, "reset stuck processing items to pending");
 1500|      0|        }
 1501|      0|    }
 1502|       |
 1503|      0|    if args.retry_failed {
 1504|      0|        let count = queue_conn
 1505|      0|            .execute(
 1506|      0|                "UPDATE queue SET status='pending', attempt=0 WHERE status='failed'",
 1507|      0|                [],
 1508|       |            )
 1509|      0|            .map_err(|e| AppError::Validation(format!("queue retry-failed reset failed: {e}")))?;
 1510|      0|        tracing::info!(target: "enrich", count, "retrying failed items");
 1511|      0|    }
 1512|       |
 1513|      0|    if !args.resume && !args.retry_failed {
 1514|      0|        queue_conn
 1515|      0|            .execute("DELETE FROM queue", [])
 1516|      0|            .map_err(|e| AppError::Validation(format!("queue clear failed: {e}")))?;
 1517|      0|    }
 1518|       |
 1519|       |    // Populate queue
 1520|      0|    for (idx, key) in scan_result.iter().enumerate() {
 1521|      0|        let item_type = match args.operation {
 1522|      0|            EnrichOperation::EntityDescriptions => "entity",
 1523|      0|            _ => "memory",
 1524|       |        };
 1525|      0|        if let Err(e) = queue_conn.execute(
 1526|      0|            "INSERT OR IGNORE INTO queue (item_key, item_type, status) VALUES (?1, ?2, 'pending')",
 1527|      0|            rusqlite::params![key, item_type],
 1528|      0|        ) {
 1529|      0|            tracing::warn!(target: "enrich", error = %e, "queue insert failed");
 1530|      0|        }
 1531|      0|        let _ = idx; // suppress unused warning
 1532|       |    }
 1533|       |
 1534|       |    // G19: parallel LLM processing via std::thread::scope when parallelism > 1.
 1535|       |    // Clamp enforces the range even if the caller bypasses clap validation.
 1536|      0|    let parallelism = args.llm_parallelism.clamp(1, 32) as usize;
 1537|      0|    if parallelism > 1 {
 1538|      0|        tracing::info!(
 1539|       |            target: "enrich",
 1540|       |            llm_parallelism = parallelism,
 1541|      0|            "parallel LLM processing with bounded thread pool"
 1542|       |        );
 1543|      0|    }
 1544|       |    // G28-D (v1.0.68) + G34 (v1.0.69): warn above the recommended parallelism
 1545|       |    // ceiling. The threshold and message depend on the LLM mode because
 1546|       |    // Claude Code spawns MCP children (G28-A) while Codex does not.
 1547|      0|    if parallelism > 4 {
 1548|      0|        match args.mode {
 1549|       |            EnrichMode::ClaudeCode => {
 1550|      0|                tracing::warn!(
 1551|       |                    target: "enrich",
 1552|       |                    llm_parallelism = parallelism,
 1553|       |                    recommended_max = 4,
 1554|       |                    mode = "claude-code",
 1555|      0|                    "llm_parallelism above 4 multiplies Claude Code subprocess fan-out; \
 1556|      0|                     consider combining with SQLITE_GRAPHRAG_CLAUDE_EMPTY_CONFIG_DIR \
 1557|      0|                     to cut MCP children (G28-A)"
 1558|       |                );
 1559|       |            }
 1560|      0|            EnrichMode::Codex if parallelism > 16 => {
 1561|      0|                tracing::warn!(
 1562|       |                    target: "enrich",
 1563|       |                    llm_parallelism = parallelism,
 1564|       |                    recommended_max = 16,
 1565|       |                    mode = "codex",
 1566|      0|                    "llm_parallelism above 16 risks OAuth rate-limit on Codex; \
 1567|      0|                     consider --llm-parallelism 8 for safer concurrency"
 1568|       |                );
 1569|       |            }
 1570|      0|            EnrichMode::Codex => {
 1571|      0|                // No warning: codex does not spawn MCP children and was
 1572|      0|                // validated at parallelism 8 in production (1161 items,
 1573|      0|                // 0 failures) per the 2026-06-04 session audit.
 1574|      0|            }
 1575|       |        }
 1576|      0|    }
 1577|       |
 1578|      0|    let mut completed = 0usize;
 1579|      0|    let mut failed = 0usize;
 1580|      0|    let mut skipped = 0usize;
 1581|      0|    let mut cost_total = 0.0f64;
 1582|      0|    let mut oauth_detected = false;
 1583|      0|    let mut backoff_secs = DEFAULT_RATE_LIMIT_WAIT;
 1584|      0|    let rate_limit_deadline = std::time::Instant::now() + std::time::Duration::from_secs(3600);
 1585|      0|    let enrich_started = std::time::Instant::now();
 1586|       |
 1587|      0|    let provider_timeout = match args.mode {
 1588|      0|        EnrichMode::ClaudeCode => args.claude_timeout,
 1589|      0|        EnrichMode::Codex => args.codex_timeout,
 1590|       |    };
 1591|       |
 1592|      0|    let provider_model: Option<&str> = match args.mode {
 1593|      0|        EnrichMode::ClaudeCode => args.claude_model.as_deref(),
 1594|      0|        EnrichMode::Codex => args.codex_model.as_deref(),
 1595|       |    };
 1596|       |
 1597|       |    // G19: when parallelism > 1, spawn bounded worker threads.
 1598|       |    // Each worker opens its own DB connections (WAL supports concurrent readers + serialized writers).
 1599|       |    // The queue DB claim is atomic via UPDATE...RETURNING — no external lock needed.
 1600|      0|    if parallelism > 1 {
 1601|      0|        let stdout_mu = parking_lot::Mutex::new(());
 1602|      0|        let budget = args.max_cost_usd;
 1603|      0|        let operation = args.operation.clone();
 1604|      0|        let mode = args.mode.clone();
 1605|      0|        let min_oc = args.min_output_chars;
 1606|      0|        let max_oc = args.max_output_chars;
 1607|      0|        let prompt_tpl = args.prompt_template.as_deref().map(|p| p.to_path_buf());
 1608|       |
 1609|       |        struct WorkerResult {
 1610|       |            completed: usize,
 1611|       |            failed: usize,
 1612|       |            skipped: usize,
 1613|       |            cost: f64,
 1614|       |            oauth: bool,
 1615|       |        }
 1616|       |
 1617|      0|        let results: Vec<WorkerResult> = std::thread::scope(|s| {
 1618|      0|            let handles: Vec<_> = (0..parallelism)
 1619|      0|                .map(|worker_id| {
 1620|      0|                    let stdout_mu = &stdout_mu;
 1621|      0|                    let paths = &paths;
 1622|      0|                    let namespace = &namespace;
 1623|      0|                    let provider_binary = &provider_binary;
 1624|      0|                    let operation = &operation;
 1625|      0|                    let mode = &mode;
 1626|      0|                    let prompt_tpl = prompt_tpl.as_deref();
 1627|      0|                    s.spawn(move || {
 1628|      0|                        let w_conn = match open_rw(&paths.db) {
 1629|      0|                            Ok(c) => c,
 1630|      0|                            Err(e) => {
 1631|      0|                                tracing::error!(target: "enrich", worker = worker_id, error = %e, "worker failed to open DB");
 1632|      0|                                return WorkerResult { completed: 0, failed: 0, skipped: 0, cost: 0.0, oauth: false };
 1633|       |                            }
 1634|       |                        };
 1635|      0|                        let w_queue = match open_queue_db(DEFAULT_QUEUE_DB) {
 1636|      0|                            Ok(c) => c,
 1637|      0|                            Err(e) => {
 1638|      0|                                tracing::error!(target: "enrich", worker = worker_id, error = %e, "worker failed to open queue DB");
 1639|      0|                                return WorkerResult { completed: 0, failed: 0, skipped: 0, cost: 0.0, oauth: false };
 1640|       |                            }
 1641|       |                        };
 1642|      0|                        let mut w_completed = 0usize;
 1643|      0|                        let mut w_failed = 0usize;
 1644|      0|                        let mut w_skipped = 0usize;
 1645|      0|                        let mut w_cost = 0.0f64;
 1646|      0|                        let mut w_oauth = false;
 1647|      0|                        let mut w_backoff = DEFAULT_RATE_LIMIT_WAIT;
 1648|      0|                        let w_deadline = std::time::Instant::now() + std::time::Duration::from_secs(3600);
 1649|       |                        // G28-D: per-worker circuit breaker that aborts the
 1650|       |                        // loop after `circuit_breaker_threshold` consecutive
 1651|       |                        // HardFailure outcomes (transient/rate-limited errors
 1652|       |                        // do NOT count, so a recovering provider is not
 1653|       |                        // penalised).
 1654|      0|                        let mut w_breaker = crate::retry::CircuitBreaker::new(
 1655|      0|                            args.circuit_breaker_threshold.max(1),
 1656|      0|                            std::time::Duration::from_secs(60),
 1657|       |                        );
 1658|       |
 1659|       |                        loop {
 1660|      0|                            if crate::shutdown_requested() {
 1661|      0|                                tracing::info!(target: "enrich", "shutdown requested, worker stopping");
 1662|      0|                                break;
 1663|      0|                            }
 1664|      0|                            if let Some(b) = budget {
 1665|      0|                                if !w_oauth && w_cost >= b {
 1666|      0|                                    break;
 1667|      0|                                }
 1668|      0|                            }
 1669|      0|                            let pending: Option<(i64, String, String)> = w_queue
 1670|      0|                                .query_row(
 1671|      0|                                    "UPDATE queue SET status='processing', attempt=attempt+1 \
 1672|      0|                                     WHERE id = (SELECT id FROM queue WHERE status='pending' ORDER BY id LIMIT 1) \
 1673|      0|                                     RETURNING id, item_key, item_type",
 1674|      0|                                    [],
 1675|      0|                                    |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)),
 1676|       |                                )
 1677|      0|                                .ok();
 1678|      0|                            let (queue_id, item_key, _item_type) = match pending {
 1679|      0|                                Some(p) => p,
 1680|      0|                                None => break,
 1681|       |                            };
 1682|      0|                            let item_started = Instant::now();
 1683|      0|                            let current_index = w_completed + w_failed + w_skipped;
 1684|       |
 1685|      0|                            let call_result = match operation {
 1686|      0|                                EnrichOperation::MemoryBindings => call_memory_bindings(&w_conn, namespace, &item_key, provider_binary, provider_model, provider_timeout, mode),
 1687|      0|                                EnrichOperation::EntityDescriptions => call_entity_description(&w_conn, namespace, &item_key, provider_binary, provider_model, provider_timeout, mode),
 1688|      0|                                EnrichOperation::BodyEnrich => call_body_enrich(&w_conn, namespace, &item_key, provider_binary, provider_model, provider_timeout, mode, min_oc, max_oc, prompt_tpl, args.preserve_threshold, paths),
 1689|      0|                                EnrichOperation::WeightCalibrate => call_weight_calibrate(&w_conn, namespace, &item_key, provider_binary, provider_model, provider_timeout, mode),
 1690|      0|                                EnrichOperation::RelationReclassify => call_relation_reclassify(&w_conn, namespace, &item_key, provider_binary, provider_model, provider_timeout, mode),
 1691|      0|                                EnrichOperation::EntityConnect | EnrichOperation::CrossDomainBridges => call_entity_connect(&w_conn, namespace, &item_key, provider_binary, provider_model, provider_timeout, mode),
 1692|      0|                                EnrichOperation::EntityTypeValidate => call_entity_type_validate(&w_conn, namespace, &item_key, provider_binary, provider_model, provider_timeout, mode),
 1693|      0|                                EnrichOperation::DescriptionEnrich => call_description_enrich(&w_conn, namespace, &item_key, provider_binary, provider_model, provider_timeout, mode),
 1694|      0|                                EnrichOperation::DomainClassify => call_domain_classify(&w_conn, namespace, &item_key, provider_binary, provider_model, provider_timeout, mode),
 1695|      0|                                EnrichOperation::GraphAudit => call_graph_audit(&w_conn, namespace, &item_key, provider_binary, provider_model, provider_timeout, mode),
 1696|      0|                                EnrichOperation::DeepResearchSynth => call_deep_research_synth(&w_conn, namespace, &item_key, provider_binary, provider_model, provider_timeout, mode),
 1697|      0|                                EnrichOperation::BodyExtract => call_body_extract(&w_conn, namespace, &item_key, provider_binary, provider_model, provider_timeout, mode),
 1698|       |                            };
 1699|       |
 1700|      0|                            match call_result {
 1701|      0|                                Ok(EnrichItemResult::Done { cost, is_oauth, memory_id, entity_id, entities, rels, chars_before, chars_after }) => {
 1702|      0|                                    if is_oauth { w_oauth = true; }
 1703|      0|                                    w_backoff = DEFAULT_RATE_LIMIT_WAIT;
 1704|      0|                                    let _ = w_queue.execute(
 1705|      0|                                        "UPDATE queue SET status='done', memory_id=?1, entity_id=?2, entities=?3, rels=?4, cost_usd=?5, elapsed_ms=?6, done_at=datetime('now') WHERE id=?7",
 1706|      0|                                        rusqlite::params![memory_id, entity_id, entities as i64, rels as i64, cost, item_started.elapsed().as_millis() as i64, queue_id],
 1707|      0|                                    );
 1708|      0|                                    w_completed += 1;
 1709|      0|                                    if !is_oauth { w_cost += cost; }
 1710|       |                                    // G28-D: count success; resets breaker.
 1711|      0|                                    let _ = w_breaker
 1712|      0|                                        .record(crate::retry::AttemptOutcome::Success);
 1713|      0|                                    let _guard = stdout_mu.lock();
 1714|      0|                                    emit_json(&ItemEvent { item: &item_key, status: "done", memory_id, entity_id, entities: Some(entities), rels: Some(rels), chars_before, chars_after, cost_usd: if is_oauth { None } else { Some(cost) }, elapsed_ms: Some(item_started.elapsed().as_millis() as u64), error: None, index: current_index, total });
 1715|       |                                }
 1716|      0|                                Ok(EnrichItemResult::Skipped { reason }) => {
 1717|      0|                                    w_skipped += 1;
 1718|      0|                                    let _ = w_queue.execute("UPDATE queue SET status='skipped', error=?1, done_at=datetime('now') WHERE id=?2", rusqlite::params![reason, queue_id]);
 1719|      0|                                    let _guard = stdout_mu.lock();
 1720|      0|                                    emit_json(&ItemEvent { item: &item_key, status: "skipped", memory_id: None, entity_id: None, entities: None, rels: None, chars_before: None, chars_after: None, cost_usd: None, elapsed_ms: Some(item_started.elapsed().as_millis() as u64), error: None, index: current_index, total });
 1721|      0|                                }
 1722|      0|                                Ok(EnrichItemResult::PreservationFailed { score, threshold, chars_before, chars_after }) => {
 1723|      0|                                    // G29 Passo 4: worker mirror of the
 1724|      0|                                    // serial path. Counted as a soft
 1725|      0|                                    // skip so the queue surface shows
 1726|      0|                                    // a quality issue rather than a
 1727|      0|                                    // transport failure.
 1728|      0|                                    w_skipped += 1;
 1729|      0|                                    let reason = format!(
 1730|      0|                                        "preservation_failed: jaccard={score:.3} threshold={threshold:.3} (orig={chars_before} chars, new={chars_after} chars)"
 1731|      0|                                    );
 1732|      0|                                    let _ = w_queue.execute(
 1733|      0|                                        "UPDATE queue SET status='skipped', error=?1, done_at=datetime('now') WHERE id=?2",
 1734|      0|                                        rusqlite::params![reason, queue_id],
 1735|      0|                                    );
 1736|      0|                                    let _guard = stdout_mu.lock();
 1737|      0|                                    emit_json(&ItemEvent {
 1738|      0|                                        item: &item_key,
 1739|      0|                                        status: "preservation_failed",
 1740|      0|                                        memory_id: None,
 1741|      0|                                        entity_id: None,
 1742|      0|                                        entities: None,
 1743|      0|                                        rels: None,
 1744|      0|                                        chars_before: Some(chars_before),
 1745|      0|                                        chars_after: Some(chars_after),
 1746|      0|                                        cost_usd: None,
 1747|      0|                                        elapsed_ms: Some(item_started.elapsed().as_millis() as u64),
 1748|      0|                                        error: Some(reason),
 1749|      0|                                        index: current_index,
 1750|      0|                                        total,
 1751|      0|                                    });
 1752|      0|                                }
 1753|      0|                                Err(e) => {
 1754|      0|                                    let err_str = format!("{e}");
 1755|      0|                                    if matches!(e, AppError::RateLimited { .. }) {
 1756|      0|                                        if crate::retry::is_kill_switch_active() {
 1757|      0|                                            tracing::warn!(target: "enrich", "SQLITE_GRAPHRAG_DISABLE_RETRY=1, skipping rate-limit retry");
 1758|      0|                                        } else if std::time::Instant::now() >= w_deadline {
 1759|      0|                                            tracing::error!(target: "enrich", "rate-limit retry deadline (1h) exhausted in worker");
 1760|       |                                        } else {
 1761|      0|                                            let half = w_backoff / 2;
 1762|      0|                                            let jitter = if half == 0 { 0 } else { fastrand::u64(0..half) };
 1763|      0|                                            let actual_wait = half + jitter;
 1764|      0|                                            tracing::warn!(target: "enrich", delay_secs = actual_wait, error_kind = "rate_limited", "rate limited in worker, backing off");
 1765|      0|                                            let _ = w_queue.execute("UPDATE queue SET status='pending' WHERE id=?1", rusqlite::params![queue_id]);
 1766|      0|                                            std::thread::sleep(std::time::Duration::from_secs(actual_wait));
 1767|      0|                                            w_backoff = (w_backoff * 2).min(900);
 1768|      0|                                            continue;
 1769|       |                                        }
 1770|      0|                                    }
 1771|      0|                                    w_failed += 1;
 1772|      0|                                    let _ = w_queue.execute("UPDATE queue SET status='failed', error=?1, done_at=datetime('now') WHERE id=?2", rusqlite::params![err_str, queue_id]);
 1773|      0|                                    let _guard = stdout_mu.lock();
 1774|      0|                                    emit_json(&ItemEvent { item: &item_key, status: "failed", memory_id: None, entity_id: None, entities: None, rels: None, chars_before: None, chars_after: None, cost_usd: None, elapsed_ms: Some(item_started.elapsed().as_millis() as u64), error: Some(err_str), index: current_index, total });
 1775|       |                                    // G28-D: count hard failure against breaker.
 1776|      0|                                    let breaker_opened = w_breaker
 1777|      0|                                        .record(crate::retry::AttemptOutcome::HardFailure);
 1778|      0|                                    if breaker_opened {
 1779|      0|                                        tracing::error!(target: "enrich",
 1780|      0|                                            consecutive_failures = w_breaker.consecutive_failures(),
 1781|      0|                                            "circuit breaker opened — aborting worker"
 1782|       |                                        );
 1783|      0|                                        break;
 1784|      0|                                    }
 1785|       |                                }
 1786|       |                            }
 1787|       |                        }
 1788|      0|                        WorkerResult { completed: w_completed, failed: w_failed, skipped: w_skipped, cost: w_cost, oauth: w_oauth }
 1789|      0|                    })
 1790|      0|                })
 1791|      0|                .collect();
 1792|      0|            handles
 1793|      0|                .into_iter()
 1794|      0|                .map(|h| {
 1795|      0|                    h.join().unwrap_or(WorkerResult {
 1796|      0|                        completed: 0,
 1797|      0|                        failed: 0,
 1798|      0|                        skipped: 0,
 1799|      0|                        cost: 0.0,
 1800|      0|                        oauth: false,
 1801|      0|                    })
 1802|      0|                })
 1803|      0|                .collect()
 1804|      0|        });
 1805|       |
 1806|      0|        for r in &results {
 1807|      0|            completed += r.completed;
 1808|      0|            failed += r.failed;
 1809|      0|            skipped += r.skipped;
 1810|      0|            cost_total += r.cost;
 1811|      0|            oauth_detected |= r.oauth;
 1812|      0|        }
 1813|       |    } else {
 1814|       |        // Serial path (parallelism == 1) — original loop
 1815|       |        loop {
 1816|      0|            if crate::shutdown_requested() {
 1817|      0|                tracing::info!(target: "enrich", "shutdown requested, stopping enrichment");
 1818|      0|                break;
 1819|      0|            }
 1820|       |
 1821|       |            // Budget check
 1822|      0|            if let Some(budget) = args.max_cost_usd {
 1823|      0|                if !oauth_detected && cost_total >= budget {
 1824|      0|                    tracing::warn!(target: "enrich", spent = cost_total, budget, "budget exceeded, stopping");
 1825|      0|                    break;
 1826|      0|                }
 1827|      0|            }
 1828|       |
 1829|       |            // Dequeue next pending item
 1830|      0|            let pending: Option<(i64, String, String)> = queue_conn
 1831|      0|                .query_row(
 1832|      0|                    "UPDATE queue SET status='processing', attempt=attempt+1 \
 1833|      0|                 WHERE id = (SELECT id FROM queue WHERE status='pending' ORDER BY id LIMIT 1) \
 1834|      0|                 RETURNING id, item_key, item_type",
 1835|      0|                    [],
 1836|      0|                    |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)),
 1837|       |                )
 1838|      0|                .ok();
 1839|       |
 1840|      0|            let (queue_id, item_key, item_type) = match pending {
 1841|      0|                Some(p) => p,
 1842|      0|                None => break,
 1843|       |            };
 1844|       |
 1845|      0|            let item_started = Instant::now();
 1846|      0|            let current_index = completed + failed + skipped;
 1847|       |
 1848|      0|            let call_result = match args.operation {
 1849|      0|                EnrichOperation::MemoryBindings => call_memory_bindings(
 1850|      0|                    &conn,
 1851|      0|                    &namespace,
 1852|      0|                    &item_key,
 1853|      0|                    &provider_binary,
 1854|      0|                    provider_model,
 1855|      0|                    provider_timeout,
 1856|      0|                    &args.mode,
 1857|       |                ),
 1858|      0|                EnrichOperation::EntityDescriptions => call_entity_description(
 1859|      0|                    &conn,
 1860|      0|                    &namespace,
 1861|      0|                    &item_key,
 1862|      0|                    &provider_binary,
 1863|      0|                    provider_model,
 1864|      0|                    provider_timeout,
 1865|      0|                    &args.mode,
 1866|       |                ),
 1867|      0|                EnrichOperation::BodyEnrich => call_body_enrich(
 1868|      0|                    &conn,
 1869|      0|                    &namespace,
 1870|      0|                    &item_key,
 1871|      0|                    &provider_binary,
 1872|      0|                    provider_model,
 1873|      0|                    provider_timeout,
 1874|      0|                    &args.mode,
 1875|      0|                    args.min_output_chars,
 1876|      0|                    args.max_output_chars,
 1877|      0|                    args.prompt_template.as_deref(),
 1878|      0|                    args.preserve_threshold,
 1879|      0|                    &paths,
 1880|       |                ),
 1881|      0|                EnrichOperation::WeightCalibrate => call_weight_calibrate(
 1882|      0|                    &conn,
 1883|      0|                    &namespace,
 1884|      0|                    &item_key,
 1885|      0|                    &provider_binary,
 1886|      0|                    provider_model,
 1887|      0|                    provider_timeout,
 1888|      0|                    &args.mode,
 1889|       |                ),
 1890|      0|                EnrichOperation::RelationReclassify => call_relation_reclassify(
 1891|      0|                    &conn,
 1892|      0|                    &namespace,
 1893|      0|                    &item_key,
 1894|      0|                    &provider_binary,
 1895|      0|                    provider_model,
 1896|      0|                    provider_timeout,
 1897|      0|                    &args.mode,
 1898|       |                ),
 1899|       |                EnrichOperation::EntityConnect | EnrichOperation::CrossDomainBridges => {
 1900|      0|                    call_entity_connect(
 1901|      0|                        &conn,
 1902|      0|                        &namespace,
 1903|      0|                        &item_key,
 1904|      0|                        &provider_binary,
 1905|      0|                        provider_model,
 1906|      0|                        provider_timeout,
 1907|      0|                        &args.mode,
 1908|       |                    )
 1909|       |                }
 1910|      0|                EnrichOperation::EntityTypeValidate => call_entity_type_validate(
 1911|      0|                    &conn,
 1912|      0|                    &namespace,
 1913|      0|                    &item_key,
 1914|      0|                    &provider_binary,
 1915|      0|                    provider_model,
 1916|      0|                    provider_timeout,
 1917|      0|                    &args.mode,
 1918|       |                ),
 1919|      0|                EnrichOperation::DescriptionEnrich => call_description_enrich(
 1920|      0|                    &conn,
 1921|      0|                    &namespace,
 1922|      0|                    &item_key,
 1923|      0|                    &provider_binary,
 1924|      0|                    provider_model,
 1925|      0|                    provider_timeout,
 1926|      0|                    &args.mode,
 1927|       |                ),
 1928|      0|                EnrichOperation::DomainClassify => call_domain_classify(
 1929|      0|                    &conn,
 1930|      0|                    &namespace,
 1931|      0|                    &item_key,
 1932|      0|                    &provider_binary,
 1933|      0|                    provider_model,
 1934|      0|                    provider_timeout,
 1935|      0|                    &args.mode,
 1936|       |                ),
 1937|      0|                EnrichOperation::GraphAudit => call_graph_audit(
 1938|      0|                    &conn,
 1939|      0|                    &namespace,
 1940|      0|                    &item_key,
 1941|      0|                    &provider_binary,
 1942|      0|                    provider_model,
 1943|      0|                    provider_timeout,
 1944|      0|                    &args.mode,
 1945|       |                ),
 1946|      0|                EnrichOperation::DeepResearchSynth => call_deep_research_synth(
 1947|      0|                    &conn,
 1948|      0|                    &namespace,
 1949|      0|                    &item_key,
 1950|      0|                    &provider_binary,
 1951|      0|                    provider_model,
 1952|      0|                    provider_timeout,
 1953|      0|                    &args.mode,
 1954|       |                ),
 1955|      0|                EnrichOperation::BodyExtract => call_body_extract(
 1956|      0|                    &conn,
 1957|      0|                    &namespace,
 1958|      0|                    &item_key,
 1959|      0|                    &provider_binary,
 1960|      0|                    provider_model,
 1961|      0|                    provider_timeout,
 1962|      0|                    &args.mode,
 1963|       |                ),
 1964|       |            };
 1965|       |
 1966|      0|            match call_result {
 1967|       |                Ok(EnrichItemResult::Done {
 1968|      0|                    memory_id,
 1969|      0|                    entity_id,
 1970|      0|                    entities,
 1971|      0|                    rels,
 1972|      0|                    chars_before,
 1973|      0|                    chars_after,
 1974|      0|                    cost,
 1975|      0|                    is_oauth,
 1976|       |                }) => {
 1977|      0|                    if is_oauth && !oauth_detected {
 1978|      0|                        oauth_detected = true;
 1979|      0|                        tracing::info!(target: "enrich", "OAuth subscription detected — cost_usd omitted from output");
 1980|      0|                    }
 1981|      0|                    backoff_secs = DEFAULT_RATE_LIMIT_WAIT;
 1982|       |
 1983|       |                    // Persist depends on the operation
 1984|      0|                    let persist_err: Option<String> = match args.operation {
 1985|       |                        EnrichOperation::MemoryBindings => {
 1986|       |                            // Bindings already persisted inside call_memory_bindings
 1987|      0|                            None
 1988|       |                        }
 1989|       |                        EnrichOperation::EntityDescriptions => {
 1990|       |                            // Description already persisted inside call_entity_description
 1991|      0|                            None
 1992|       |                        }
 1993|       |                        EnrichOperation::BodyEnrich => {
 1994|       |                            // Body already persisted inside call_body_enrich
 1995|      0|                            None
 1996|       |                        }
 1997|       |                        _ => {
 1998|       |                            // All G27 operations persist inside their call_* function
 1999|      0|                            None
 2000|       |                        }
 2001|       |                    };
 2002|       |
 2003|      0|                    if let Err(e) = queue_conn.execute(
 2004|      0|                    "UPDATE queue SET status='done', memory_id=?1, entity_id=?2, entities=?3, rels=?4, cost_usd=?5, elapsed_ms=?6, done_at=datetime('now') WHERE id=?7",
 2005|      0|                    rusqlite::params![
 2006|      0|                        memory_id,
 2007|      0|                        entity_id,
 2008|      0|                        entities as i64,
 2009|      0|                        rels as i64,
 2010|      0|                        cost,
 2011|      0|                        item_started.elapsed().as_millis() as i64,
 2012|      0|                        queue_id
 2013|      0|                    ],
 2014|      0|                ) {
 2015|      0|                        tracing::warn!(target: "enrich", error = %e, "queue done update failed");
 2016|      0|                    }
 2017|       |
 2018|      0|                    if persist_err.is_none() {
 2019|      0|                        completed += 1;
 2020|      0|                        if !is_oauth {
 2021|      0|                            cost_total += cost;
 2022|      0|                        }
 2023|      0|                        emit_json(&ItemEvent {
 2024|      0|                            item: &item_key,
 2025|      0|                            status: "done",
 2026|      0|                            memory_id,
 2027|      0|                            entity_id,
 2028|      0|                            entities: Some(entities),
 2029|      0|                            rels: Some(rels),
 2030|      0|                            chars_before,
 2031|      0|                            chars_after,
 2032|      0|                            cost_usd: if is_oauth { None } else { Some(cost) },
 2033|      0|                            elapsed_ms: Some(item_started.elapsed().as_millis() as u64),
 2034|      0|                            error: None,
 2035|      0|                            index: current_index,
 2036|      0|                            total,
 2037|       |                        });
 2038|      0|                    } else {
 2039|      0|                        failed += 1;
 2040|      0|                        emit_json(&ItemEvent {
 2041|      0|                            item: &item_key,
 2042|      0|                            status: "failed",
 2043|      0|                            memory_id: None,
 2044|      0|                            entity_id: None,
 2045|      0|                            entities: None,
 2046|      0|                            rels: None,
 2047|      0|                            chars_before: None,
 2048|      0|                            chars_after: None,
 2049|      0|                            cost_usd: None,
 2050|      0|                            elapsed_ms: Some(item_started.elapsed().as_millis() as u64),
 2051|      0|                            error: persist_err,
 2052|      0|                            index: current_index,
 2053|      0|                            total,
 2054|      0|                        });
 2055|      0|                    }
 2056|       |                }
 2057|      0|                Ok(EnrichItemResult::Skipped { reason }) => {
 2058|      0|                    skipped += 1;
 2059|      0|                    if let Err(e) = queue_conn.execute(
 2060|      0|                    "UPDATE queue SET status='skipped', error=?1, done_at=datetime('now') WHERE id=?2",
 2061|      0|                    rusqlite::params![reason, queue_id],
 2062|      0|                ) {
 2063|      0|                        tracing::warn!(target: "enrich", error = %e, "queue skipped update failed");
 2064|      0|                    }
 2065|      0|                    emit_json(&ItemEvent {
 2066|      0|                        item: &item_key,
 2067|      0|                        status: "skipped",
 2068|      0|                        memory_id: None,
 2069|      0|                        entity_id: None,
 2070|      0|                        entities: None,
 2071|      0|                        rels: None,
 2072|      0|                        chars_before: None,
 2073|      0|                        chars_after: None,
 2074|      0|                        cost_usd: None,
 2075|      0|                        elapsed_ms: Some(item_started.elapsed().as_millis() as u64),
 2076|      0|                        error: None,
 2077|      0|                        index: current_index,
 2078|      0|                        total,
 2079|      0|                    });
 2080|       |                }
 2081|       |                Ok(EnrichItemResult::PreservationFailed {
 2082|      0|                    score,
 2083|      0|                    threshold,
 2084|      0|                    chars_before,
 2085|      0|                    chars_after,
 2086|       |                }) => {
 2087|       |                    // G29 Passo 4: the LLM rewrite diverged too far from
 2088|       |                    // the original body. Count as a soft failure (not
 2089|       |                    // `failed`) so the queue surfaces it as a quality
 2090|       |                    // issue, not a transport error. The reason is
 2091|       |                    // structured so the operator can audit why a body
 2092|       |                    // was rejected.
 2093|      0|                    skipped += 1;
 2094|      0|                    let reason = format!(
 2095|      0|                        "preservation_failed: jaccard={score:.3} threshold={threshold:.3} (orig={chars_before} chars, new={chars_after} chars)"
 2096|       |                    );
 2097|      0|                    if let Err(qe) = queue_conn.execute(
 2098|      0|                        "UPDATE queue SET status='skipped', error=?1, done_at=datetime('now') WHERE id=?2",
 2099|      0|                        rusqlite::params![reason, queue_id],
 2100|      0|                    ) {
 2101|      0|                        tracing::warn!(target: "enrich", error = %qe, "queue preservation_failed update failed");
 2102|      0|                    }
 2103|      0|                    emit_json(&ItemEvent {
 2104|      0|                        item: &item_key,
 2105|      0|                        status: "preservation_failed",
 2106|      0|                        memory_id: None,
 2107|      0|                        entity_id: None,
 2108|      0|                        entities: None,
 2109|      0|                        rels: None,
 2110|      0|                        chars_before: Some(chars_before),
 2111|      0|                        chars_after: Some(chars_after),
 2112|      0|                        cost_usd: None,
 2113|      0|                        elapsed_ms: Some(item_started.elapsed().as_millis() as u64),
 2114|      0|                        error: Some(reason),
 2115|      0|                        index: current_index,
 2116|      0|                        total,
 2117|      0|                    });
 2118|       |                }
 2119|      0|                Err(e) => {
 2120|      0|                    let err_str = format!("{e}");
 2121|      0|                    if matches!(e, AppError::RateLimited { .. }) {
 2122|      0|                        if crate::retry::is_kill_switch_active() {
 2123|      0|                            tracing::warn!(target: "enrich", "SQLITE_GRAPHRAG_DISABLE_RETRY=1, skipping rate-limit retry");
 2124|      0|                        } else if std::time::Instant::now() >= rate_limit_deadline {
 2125|      0|                            tracing::error!(target: "enrich", total_elapsed_secs = enrich_started.elapsed().as_secs(), "rate-limit retry deadline (1h) exhausted");
 2126|       |                        } else {
 2127|      0|                            let half = backoff_secs / 2;
 2128|      0|                            let jitter = if half == 0 { 0 } else { fastrand::u64(0..half) };
 2129|      0|                            let actual_wait = half + jitter;
 2130|      0|                            tracing::warn!(target: "enrich", delay_secs = actual_wait, error_kind = "rate_limited", "rate limited, backing off");
 2131|      0|                            if let Err(qe) = queue_conn.execute(
 2132|      0|                                "UPDATE queue SET status='pending' WHERE id=?1",
 2133|      0|                                rusqlite::params![queue_id],
 2134|      0|                            ) {
 2135|      0|                                tracing::warn!(target: "enrich", error = %qe, "queue pending update failed");
 2136|      0|                            }
 2137|      0|                            std::thread::sleep(std::time::Duration::from_secs(actual_wait));
 2138|      0|                            backoff_secs = (backoff_secs * 2).min(900);
 2139|      0|                            continue;
 2140|       |                        }
 2141|      0|                    }
 2142|       |
 2143|      0|                    failed += 1;
 2144|      0|                    if let Err(qe) = queue_conn.execute(
 2145|      0|                    "UPDATE queue SET status='failed', error=?1, done_at=datetime('now') WHERE id=?2",
 2146|      0|                    rusqlite::params![err_str, queue_id],
 2147|      0|                ) {
 2148|      0|                        tracing::warn!(target: "enrich", error = %qe, "queue failed update failed");
 2149|      0|                    }
 2150|      0|                    emit_json(&ItemEvent {
 2151|      0|                        item: &item_key,
 2152|      0|                        status: "failed",
 2153|      0|                        memory_id: None,
 2154|      0|                        entity_id: None,
 2155|      0|                        entities: None,
 2156|      0|                        rels: None,
 2157|      0|                        chars_before: None,
 2158|      0|                        chars_after: None,
 2159|      0|                        cost_usd: None,
 2160|      0|                        elapsed_ms: Some(item_started.elapsed().as_millis() as u64),
 2161|      0|                        error: Some(err_str),
 2162|      0|                        index: current_index,
 2163|      0|                        total,
 2164|      0|                    });
 2165|       |                }
 2166|       |            }
 2167|       |
 2168|      0|            let _ = item_type; // used via queue schema only
 2169|       |        }
 2170|       |    } // end else (serial path)
 2171|       |
 2172|      0|    let _ = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);");
 2173|      0|    let _ = queue_conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);");
 2174|       |
 2175|      0|    emit_json(&EnrichSummary {
 2176|      0|        summary: true,
 2177|      0|        operation: format!("{:?}", args.operation),
 2178|      0|        items_total: total,
 2179|      0|        completed,
 2180|      0|        failed,
 2181|      0|        skipped,
 2182|      0|        cost_usd: cost_total,
 2183|      0|        elapsed_ms: started.elapsed().as_millis() as u64,
 2184|      0|    });
 2185|       |
 2186|      0|    if failed == 0 {
 2187|      0|        let _ = std::fs::remove_file(DEFAULT_QUEUE_DB);
 2188|      0|    }
 2189|       |
 2190|      0|    Ok(())
 2191|      0|}
 2192|       |
 2193|       |// ---------------------------------------------------------------------------
 2194|       |// Internal result type for a single item call
 2195|       |// ---------------------------------------------------------------------------
 2196|       |
 2197|       |enum EnrichItemResult {
 2198|       |    Done {
 2199|       |        memory_id: Option<i64>,
 2200|       |        entity_id: Option<i64>,
 2201|       |        entities: usize,
 2202|       |        rels: usize,
 2203|       |        chars_before: Option<usize>,
 2204|       |        chars_after: Option<usize>,
 2205|       |        cost: f64,
 2206|       |        is_oauth: bool,
 2207|       |    },
 2208|       |    Skipped {
 2209|       |        reason: String,
 2210|       |    },
 2211|       |    /// G29 Passo 4 (v1.0.69): the LLM rewrite diverged from the original
 2212|       |    /// body beyond the configured `--preserve-threshold` and was rejected
 2213|       |    /// before persistence. The trigram-Jaccard score and threshold are
 2214|       |    /// emitted in the NDJSON stream for operator audit.
 2215|       |    PreservationFailed {
 2216|       |        score: f64,
 2217|       |        threshold: f64,
 2218|       |        chars_before: usize,
 2219|       |        chars_after: usize,
 2220|       |    },
 2221|       |}
 2222|       |
 2223|       |// ---------------------------------------------------------------------------
 2224|       |// Per-operation call helpers (SCAN + JUDGE + PERSIST in one unit)
 2225|       |// ---------------------------------------------------------------------------
 2226|       |
 2227|      0|fn call_memory_bindings(
 2228|      0|    conn: &Connection,
 2229|      0|    namespace: &str,
 2230|      0|    memory_name: &str,
 2231|      0|    binary: &Path,
 2232|      0|    model: Option<&str>,
 2233|      0|    timeout: u64,
 2234|      0|    mode: &EnrichMode,
 2235|      0|) -> Result<EnrichItemResult, AppError> {
 2236|       |    // Look up the memory
 2237|      0|    let (memory_id, body): (i64, String) = conn.query_row(
 2238|      0|        "SELECT id, COALESCE(body,'') FROM memories WHERE namespace=?1 AND name=?2 AND deleted_at IS NULL",
 2239|      0|        rusqlite::params![namespace, memory_name],
 2240|      0|        |r| Ok((r.get(0)?, r.get(1)?)),
 2241|      0|    ).map_err(|e| match e {
 2242|      0|        rusqlite::Error::QueryReturnedNoRows => AppError::NotFound(format!("memory '{memory_name}' not found")),
 2243|      0|        other => AppError::Database(other),
 2244|      0|    })?;
 2245|       |
 2246|      0|    if body.trim().is_empty() {
 2247|      0|        return Ok(EnrichItemResult::Skipped {
 2248|      0|            reason: "body is empty".to_string(),
 2249|      0|        });
 2250|      0|    }
 2251|       |
 2252|      0|    let (value, cost, is_oauth) = match mode {
 2253|      0|        EnrichMode::ClaudeCode => call_claude(
 2254|      0|            binary,
 2255|      0|            BINDINGS_PROMPT,
 2256|      0|            BINDINGS_SCHEMA,
 2257|      0|            &body,
 2258|      0|            model,
 2259|      0|            timeout,
 2260|      0|        )?,
 2261|      0|        EnrichMode::Codex => call_codex(
 2262|      0|            binary,
 2263|      0|            BINDINGS_PROMPT,
 2264|      0|            BINDINGS_SCHEMA,
 2265|      0|            &body,
 2266|      0|            model,
 2267|      0|            timeout,
 2268|      0|        )?,
 2269|       |    };
 2270|       |
 2271|      0|    let empty_arr = serde_json::Value::Array(vec![]);
 2272|      0|    let entities_val = value.get("entities").unwrap_or(&empty_arr);
 2273|      0|    let rels_val = value.get("relationships").unwrap_or(&empty_arr);
 2274|       |
 2275|      0|    let (ent_count, rel_count) =
 2276|      0|        persist_memory_bindings(conn, namespace, memory_id, entities_val, rels_val)?;
 2277|       |
 2278|      0|    Ok(EnrichItemResult::Done {
 2279|      0|        memory_id: Some(memory_id),
 2280|      0|        entity_id: None,
 2281|      0|        entities: ent_count,
 2282|      0|        rels: rel_count,
 2283|      0|        chars_before: None,
 2284|      0|        chars_after: None,
 2285|      0|        cost,
 2286|      0|        is_oauth,
 2287|      0|    })
 2288|      0|}
 2289|       |
 2290|      0|fn call_entity_description(
 2291|      0|    conn: &Connection,
 2292|      0|    namespace: &str,
 2293|      0|    entity_name: &str,
 2294|      0|    binary: &Path,
 2295|      0|    model: Option<&str>,
 2296|      0|    timeout: u64,
 2297|      0|    mode: &EnrichMode,
 2298|      0|) -> Result<EnrichItemResult, AppError> {
 2299|      0|    let (entity_id, entity_type): (i64, String) = conn
 2300|      0|        .query_row(
 2301|      0|            "SELECT id, type FROM entities WHERE namespace=?1 AND name=?2",
 2302|      0|            rusqlite::params![namespace, entity_name],
 2303|      0|            |r| Ok((r.get(0)?, r.get(1)?)),
 2304|       |        )
 2305|      0|        .map_err(|e| match e {
 2306|       |            rusqlite::Error::QueryReturnedNoRows => {
 2307|      0|                AppError::NotFound(format!("entity '{entity_name}' not found"))
 2308|       |            }
 2309|      0|            other => AppError::Database(other),
 2310|      0|        })?;
 2311|       |
 2312|      0|    let prompt = format!(
 2313|      0|        "{ENTITY_DESCRIPTION_PROMPT_PREFIX}{entity_name}\nEntity type: {entity_type}\n\nGenerate a description:"
 2314|       |    );
 2315|       |
 2316|      0|    let (value, cost, is_oauth) = match mode {
 2317|      0|        EnrichMode::ClaudeCode => call_claude(
 2318|      0|            binary,
 2319|      0|            &prompt,
 2320|      0|            ENTITY_DESCRIPTION_SCHEMA,
 2321|      0|            "",
 2322|      0|            model,
 2323|      0|            timeout,
 2324|      0|        )?,
 2325|      0|        EnrichMode::Codex => call_codex(
 2326|      0|            binary,
 2327|      0|            &prompt,
 2328|      0|            ENTITY_DESCRIPTION_SCHEMA,
 2329|      0|            "",
 2330|      0|            model,
 2331|      0|            timeout,
 2332|      0|        )?,
 2333|       |    };
 2334|       |
 2335|      0|    let description = value
 2336|      0|        .get("description")
 2337|      0|        .and_then(|v| v.as_str())
 2338|      0|        .ok_or_else(|| AppError::Validation("LLM result missing 'description' field".into()))?;
 2339|       |
 2340|      0|    persist_entity_description(conn, entity_id, description)?;
 2341|       |
 2342|      0|    Ok(EnrichItemResult::Done {
 2343|      0|        memory_id: None,
 2344|      0|        entity_id: Some(entity_id),
 2345|      0|        entities: 0,
 2346|      0|        rels: 0,
 2347|      0|        chars_before: None,
 2348|      0|        chars_after: None,
 2349|      0|        cost,
 2350|      0|        is_oauth,
 2351|      0|    })
 2352|      0|}
 2353|       |
 2354|       |#[allow(clippy::too_many_arguments)]
 2355|      0|fn call_body_enrich(
 2356|      0|    conn: &Connection,
 2357|      0|    namespace: &str,
 2358|      0|    memory_name: &str,
 2359|      0|    binary: &Path,
 2360|      0|    model: Option<&str>,
 2361|      0|    timeout: u64,
 2362|      0|    mode: &EnrichMode,
 2363|      0|    min_output_chars: usize,
 2364|      0|    max_output_chars: usize,
 2365|      0|    prompt_template: Option<&Path>,
 2366|      0|    preserve_threshold: f64,
 2367|      0|    paths: &crate::paths::AppPaths,
 2368|      0|) -> Result<EnrichItemResult, AppError> {
 2369|      0|    let (memory_id, body, description, memory_type): (i64, String, String, String) = conn
 2370|      0|        .query_row(
 2371|      0|            "SELECT id, COALESCE(body,''), COALESCE(description,''), COALESCE(type,'note') \
 2372|      0|         FROM memories WHERE namespace=?1 AND name=?2 AND deleted_at IS NULL",
 2373|      0|            rusqlite::params![namespace, memory_name],
 2374|      0|            |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?, r.get(3)?)),
 2375|       |        )
 2376|      0|        .map_err(|e| match e {
 2377|       |            rusqlite::Error::QueryReturnedNoRows => {
 2378|      0|                AppError::NotFound(format!("memory '{memory_name}' not found"))
 2379|       |            }
 2380|      0|            other => AppError::Database(other),
 2381|      0|        })?;
 2382|       |
 2383|      0|    let chars_before = body.chars().count();
 2384|       |
 2385|       |    // G26: gather graph context for contextualized enrichment
 2386|      0|    let linked_entities: Vec<String> = {
 2387|      0|        let mut stmt = conn.prepare_cached(
 2388|      0|            "SELECT e.name FROM memory_entities me \
 2389|      0|             JOIN entities e ON e.id = me.entity_id \
 2390|      0|             WHERE me.memory_id = ?1 LIMIT 10",
 2391|      0|        )?;
 2392|      0|        let result: Vec<String> = stmt
 2393|      0|            .query_map(rusqlite::params![memory_id], |r| r.get::<_, String>(0))?
 2394|      0|            .filter_map(|r| r.ok())
 2395|      0|            .collect();
 2396|      0|        drop(stmt);
 2397|      0|        result
 2398|       |    };
 2399|       |
 2400|       |    // Load custom prompt template if provided
 2401|      0|    let prompt_prefix = if let Some(tmpl_path) = prompt_template {
 2402|      0|        let file_size = std::fs::metadata(tmpl_path)
 2403|      0|            .map_err(|e| {
 2404|      0|                AppError::Io(std::io::Error::new(
 2405|      0|                    e.kind(),
 2406|      0|                    format!("failed to stat prompt template: {e}"),
 2407|      0|                ))
 2408|      0|            })?
 2409|      0|            .len();
 2410|      0|        if file_size > MAX_MEMORY_BODY_LEN as u64 {
 2411|      0|            return Err(AppError::LimitExceeded(
 2412|      0|                crate::i18n::validation::body_exceeds(MAX_MEMORY_BODY_LEN),
 2413|      0|            ));
 2414|      0|        }
 2415|      0|        std::fs::read_to_string(tmpl_path).map_err(|e| {
 2416|      0|            AppError::Io(std::io::Error::new(
 2417|      0|                e.kind(),
 2418|      0|                format!("failed to read prompt template: {e}"),
 2419|      0|            ))
 2420|      0|        })?
 2421|       |    } else {
 2422|      0|        BODY_ENRICH_PROMPT_PREFIX.to_string()
 2423|       |    };
 2424|       |
 2425|       |    // G26: build contextualized prompt with graph data
 2426|      0|    let context_section = if !linked_entities.is_empty() || !description.is_empty() {
 2427|      0|        let mut ctx = String::new();
 2428|      0|        ctx.push_str(&format!(
 2429|      0|            "\nContext:\n- Memory name: {memory_name}\n- Type: {memory_type}\n"
 2430|      0|        ));
 2431|      0|        if !description.is_empty() {
 2432|      0|            ctx.push_str(&format!("- Description: {description}\n"));
 2433|      0|        }
 2434|      0|        ctx.push_str(&format!("- Domain: {namespace}\n"));
 2435|      0|        if !linked_entities.is_empty() {
 2436|      0|            ctx.push_str(&format!(
 2437|      0|                "- Linked entities: {}\n",
 2438|      0|                linked_entities.join(", ")
 2439|      0|            ));
 2440|      0|        }
 2441|      0|        ctx
 2442|       |    } else {
 2443|      0|        String::new()
 2444|       |    };
 2445|       |
 2446|      0|    let prompt = format!(
 2447|      0|        "{prompt_prefix}{context_section}\nTarget minimum length: {min_output_chars} characters. Maximum: {max_output_chars} characters."
 2448|       |    );
 2449|       |
 2450|       |    // The body schema uses a free-form enriched_body field
 2451|      0|    let (value, cost, is_oauth) = match mode {
 2452|       |        EnrichMode::ClaudeCode => {
 2453|      0|            call_claude(binary, &prompt, BODY_ENRICH_SCHEMA, &body, model, timeout)?
 2454|       |        }
 2455|       |        EnrichMode::Codex => {
 2456|      0|            call_codex(binary, &prompt, BODY_ENRICH_SCHEMA, &body, model, timeout)?
 2457|       |        }
 2458|       |    };
 2459|       |
 2460|      0|    let enriched_body = value
 2461|      0|        .get("enriched_body")
 2462|      0|        .and_then(|v| v.as_str())
 2463|      0|        .ok_or_else(|| AppError::Validation("LLM result missing 'enriched_body' field".into()))?;
 2464|       |
 2465|      0|    let chars_after = enriched_body.chars().count();
 2466|       |
 2467|       |    // G29 Passo 4 (v1.0.69): preservation check. Before persisting, run
 2468|       |    // a trigram-Jaccard similarity between the original body and the
 2469|       |    // LLM-rewritten body. When the score falls below
 2470|       |    // `args.preserve_threshold` (default 0.7 per the G29 gap), reject the
 2471|       |    // rewrite as a likely hallucination. The result is recorded in the
 2472|       |    // NDJSON stream so operators can audit what the LLM tried to do.
 2473|      0|    let threshold = preserve_threshold;
 2474|      0|    let verdict =
 2475|      0|        crate::preservation::PreservationVerdict::evaluate(&body, enriched_body, threshold);
 2476|      0|    if !verdict.is_accepted() {
 2477|       |        return Ok(EnrichItemResult::PreservationFailed {
 2478|      0|            score: match verdict {
 2479|      0|                crate::preservation::PreservationVerdict::Preserved { score, .. } => score,
 2480|      0|                crate::preservation::PreservationVerdict::Rejected { score, .. } => score,
 2481|      0|                crate::preservation::PreservationVerdict::Unchanged { .. } => 1.0,
 2482|       |            },
 2483|      0|            threshold,
 2484|      0|            chars_before,
 2485|      0|            chars_after,
 2486|       |        });
 2487|      0|    }
 2488|       |
 2489|       |    // G29 Passo 5 (v1.0.69): idempotency via blake3 hash. Before persisting,
 2490|       |    // compare the hash of the original body against the hash of the enriched
 2491|       |    // body. Identical hashes mean the LLM produced a byte-for-byte identical
 2492|       |    // body (rare but possible) — treat as `Skipped` so re-running the batch
 2493|       |    // is safe and the queue does not get re-persisted entries.
 2494|      0|    let old_hash = blake3::hash(body.as_bytes()).to_hex().to_string();
 2495|      0|    let new_hash = blake3::hash(enriched_body.as_bytes()).to_hex().to_string();
 2496|      0|    if old_hash == new_hash {
 2497|      0|        return Ok(EnrichItemResult::Skipped {
 2498|      0|            reason: format!(
 2499|      0|                "enriched body hash matches original (blake3:{old_hash}); idempotency skip"
 2500|      0|            ),
 2501|      0|        });
 2502|      0|    }
 2503|       |
 2504|       |    // Only persist if the enriched body is genuinely longer
 2505|      0|    if chars_after <= chars_before {
 2506|      0|        return Ok(EnrichItemResult::Skipped {
 2507|      0|            reason: format!(
 2508|      0|                "enriched body ({chars_after} chars) not longer than original ({chars_before} chars)"
 2509|      0|            ),
 2510|      0|        });
 2511|      0|    }
 2512|       |
 2513|      0|    persist_enriched_body(
 2514|      0|        conn,
 2515|      0|        namespace,
 2516|      0|        memory_id,
 2517|      0|        memory_name,
 2518|      0|        enriched_body,
 2519|      0|        paths,
 2520|      0|    )?;
 2521|       |
 2522|      0|    Ok(EnrichItemResult::Done {
 2523|      0|        memory_id: Some(memory_id),
 2524|      0|        entity_id: None,
 2525|      0|        entities: 0,
 2526|      0|        rels: 0,
 2527|      0|        chars_before: Some(chars_before),
 2528|      0|        chars_after: Some(chars_after),
 2529|      0|        cost,
 2530|      0|        is_oauth,
 2531|      0|    })
 2532|      0|}
 2533|       |
 2534|       |// ---------------------------------------------------------------------------
 2535|       |// Scan dispatcher — maps operation to scan query result (item keys)
 2536|       |// ---------------------------------------------------------------------------
 2537|       |
 2538|      0|fn scan_operation(
 2539|      0|    conn: &Connection,
 2540|      0|    namespace: &str,
 2541|      0|    args: &EnrichArgs,
 2542|      0|) -> Result<Vec<String>, AppError> {
 2543|       |    // G37: resolve --names + --names-file once and apply to every scan path.
 2544|      0|    let name_filter = resolve_name_filter(args)?;
 2545|      0|    match args.operation {
 2546|       |        EnrichOperation::MemoryBindings => {
 2547|      0|            let rows = scan_unbound_memories(conn, namespace, args.limit, &name_filter)?;
 2548|      0|            Ok(rows.into_iter().map(|(_, name, _)| name).collect())
 2549|       |        }
 2550|       |        EnrichOperation::EntityDescriptions => {
 2551|      0|            let rows = scan_entities_without_description(conn, namespace, args.limit)?;
 2552|      0|            Ok(rows.into_iter().map(|(_, name, _)| name).collect())
 2553|       |        }
 2554|       |        EnrichOperation::BodyEnrich => {
 2555|      0|            let rows =
 2556|      0|                scan_short_body_memories(conn, namespace, args.min_output_chars, args.limit)?;
 2557|      0|            Ok(rows.into_iter().map(|(_, name, _)| name).collect())
 2558|       |        }
 2559|       |        EnrichOperation::WeightCalibrate => {
 2560|      0|            let rows = scan_weight_candidates(conn, namespace, args.limit)?;
 2561|      0|            Ok(rows
 2562|      0|                .into_iter()
 2563|      0|                .map(|(id, _, _, _, _)| id.to_string())
 2564|      0|                .collect())
 2565|       |        }
 2566|       |        EnrichOperation::RelationReclassify => {
 2567|      0|            let rows = scan_generic_relations(conn, namespace, args.limit)?;
 2568|      0|            Ok(rows
 2569|      0|                .into_iter()
 2570|      0|                .map(|(id, _, _, _)| id.to_string())
 2571|      0|                .collect())
 2572|       |        }
 2573|       |        EnrichOperation::EntityConnect | EnrichOperation::CrossDomainBridges => {
 2574|      0|            let pairs = scan_isolated_entity_pairs(conn, namespace, args.limit)?;
 2575|      0|            Ok(pairs.into_iter().map(|(_, name, _, _)| name).collect())
 2576|       |        }
 2577|       |        EnrichOperation::EntityTypeValidate => {
 2578|      0|            let rows = scan_entities_for_type_validation(conn, namespace, args.limit)?;
 2579|      0|            Ok(rows.into_iter().map(|(_, name, _)| name).collect())
 2580|       |        }
 2581|       |        EnrichOperation::DescriptionEnrich => {
 2582|      0|            let rows = scan_generic_descriptions(conn, namespace, args.limit)?;
 2583|      0|            Ok(rows.into_iter().map(|(_, name, _)| name).collect())
 2584|       |        }
 2585|       |        EnrichOperation::DomainClassify
 2586|       |        | EnrichOperation::GraphAudit
 2587|       |        | EnrichOperation::DeepResearchSynth
 2588|       |        | EnrichOperation::BodyExtract => {
 2589|      0|            let limit_clause = args.limit.map(|n| format!("LIMIT {n}")).unwrap_or_default();
 2590|      0|            let sql = format!(
 2591|      0|                "SELECT name FROM memories WHERE namespace=?1 AND deleted_at IS NULL ORDER BY id {limit_clause}"
 2592|       |            );
 2593|      0|            let mut stmt = conn.prepare(&sql)?;
 2594|      0|            let names = stmt
 2595|      0|                .query_map(rusqlite::params![namespace], |r| r.get::<_, String>(0))?
 2596|      0|                .collect::<Result<Vec<_>, _>>()?;
 2597|      0|            Ok(names)
 2598|       |        }
 2599|       |    }
 2600|      0|}
 2601|       |
 2602|       |// ---------------------------------------------------------------------------
 2603|       |// Codex stub provider
 2604|       |// ---------------------------------------------------------------------------
 2605|       |
 2606|       |/// Locates the Codex CLI binary.
 2607|      0|fn find_codex_binary(explicit: Option<&Path>) -> Result<PathBuf, AppError> {
 2608|      0|    if let Some(p) = explicit {
 2609|      0|        if p.exists() {
 2610|      0|            return Ok(p.to_path_buf());
 2611|      0|        }
 2612|      0|        return Err(AppError::Validation(format!(
 2613|      0|            "Codex binary not found at explicit path: {}",
 2614|      0|            p.display()
 2615|      0|        )));
 2616|      0|    }
 2617|       |
 2618|      0|    if let Ok(env_path) = std::env::var("SQLITE_GRAPHRAG_CODEX_BINARY") {
 2619|      0|        let p = PathBuf::from(&env_path);
 2620|      0|        if p.exists() {
 2621|      0|            return Ok(p);
 2622|      0|        }
 2623|      0|    }
 2624|       |
 2625|      0|    let name = if cfg!(windows) { "codex.exe" } else { "codex" };
 2626|      0|    if let Some(path_var) = std::env::var_os("PATH") {
 2627|      0|        for dir in std::env::split_paths(&path_var) {
 2628|      0|            let candidate = dir.join(name);
 2629|      0|            if candidate.exists() {
 2630|      0|                return Ok(candidate);
 2631|      0|            }
 2632|       |        }
 2633|      0|    }
 2634|       |
 2635|      0|    Err(AppError::Validation(
 2636|      0|        "Codex CLI binary not found in PATH. Install it or specify --codex-binary".to_string(),
 2637|      0|    ))
 2638|      0|}
 2639|       |
 2640|       |/// G27: Calibrate weight of a single relationship via LLM.
 2641|      0|fn call_weight_calibrate(
 2642|      0|    conn: &Connection,
 2643|      0|    _namespace: &str,
 2644|      0|    item_key: &str,
 2645|      0|    binary: &Path,
 2646|      0|    model: Option<&str>,
 2647|      0|    timeout: u64,
 2648|      0|    mode: &EnrichMode,
 2649|      0|) -> Result<EnrichItemResult, AppError> {
 2650|      0|    let rel_id: i64 = item_key
 2651|      0|        .parse()
 2652|      0|        .map_err(|_| AppError::Validation(format!("invalid relationship id: {item_key}")))?;
 2653|      0|    let (source_name, target_name, relation, current_weight): (String, String, String, f64) = conn
 2654|      0|        .query_row(
 2655|      0|            "SELECT e1.name, e2.name, r.relation, r.weight \
 2656|      0|             FROM relationships r \
 2657|      0|             JOIN entities e1 ON e1.id = r.source_id \
 2658|      0|             JOIN entities e2 ON e2.id = r.target_id \
 2659|      0|             WHERE r.id = ?1",
 2660|      0|            rusqlite::params![rel_id],
 2661|      0|            |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?, r.get(3)?)),
 2662|       |        )
 2663|      0|        .map_err(|_| AppError::NotFound(format!("relationship {rel_id} not found")))?;
 2664|       |
 2665|      0|    let input_text = format!(
 2666|      0|        "Source: {source_name}\nTarget: {target_name}\nRelation: {relation}\nCurrent weight: {current_weight}"
 2667|       |    );
 2668|      0|    let (value, cost, is_oauth) = match mode {
 2669|      0|        EnrichMode::ClaudeCode => call_claude(
 2670|      0|            binary,
 2671|      0|            WEIGHT_CALIBRATE_PROMPT,
 2672|      0|            WEIGHT_CALIBRATE_SCHEMA,
 2673|      0|            &input_text,
 2674|      0|            model,
 2675|      0|            timeout,
 2676|      0|        )?,
 2677|      0|        EnrichMode::Codex => call_codex(
 2678|      0|            binary,
 2679|      0|            WEIGHT_CALIBRATE_PROMPT,
 2680|      0|            WEIGHT_CALIBRATE_SCHEMA,
 2681|      0|            &input_text,
 2682|      0|            model,
 2683|      0|            timeout,
 2684|      0|        )?,
 2685|       |    };
 2686|       |
 2687|      0|    let calibrated = value
 2688|      0|        .get("calibrated_weight")
 2689|      0|        .and_then(|v| v.as_f64())
 2690|      0|        .ok_or_else(|| AppError::Validation("LLM result missing 'calibrated_weight'".into()))?;
 2691|       |
 2692|      0|    conn.execute(
 2693|      0|        "UPDATE relationships SET weight = ?1 WHERE id = ?2",
 2694|      0|        rusqlite::params![calibrated, rel_id],
 2695|      0|    )?;
 2696|       |
 2697|      0|    Ok(EnrichItemResult::Done {
 2698|      0|        memory_id: None,
 2699|      0|        entity_id: None,
 2700|      0|        entities: 0,
 2701|      0|        rels: 1,
 2702|      0|        chars_before: None,
 2703|      0|        chars_after: None,
 2704|      0|        cost,
 2705|      0|        is_oauth,
 2706|      0|    })
 2707|      0|}
 2708|       |
 2709|       |/// G27: Reclassify a generic relationship type via LLM.
 2710|      0|fn call_relation_reclassify(
 2711|      0|    conn: &Connection,
 2712|      0|    _namespace: &str,
 2713|      0|    item_key: &str,
 2714|      0|    binary: &Path,
 2715|      0|    model: Option<&str>,
 2716|      0|    timeout: u64,
 2717|      0|    mode: &EnrichMode,
 2718|      0|) -> Result<EnrichItemResult, AppError> {
 2719|      0|    let rel_id: i64 = item_key
 2720|      0|        .parse()
 2721|      0|        .map_err(|_| AppError::Validation(format!("invalid relationship id: {item_key}")))?;
 2722|      0|    let (source_name, target_name, current_relation): (String, String, String) = conn
 2723|      0|        .query_row(
 2724|      0|            "SELECT e1.name, e2.name, r.relation \
 2725|      0|             FROM relationships r \
 2726|      0|             JOIN entities e1 ON e1.id = r.source_id \
 2727|      0|             JOIN entities e2 ON e2.id = r.target_id \
 2728|      0|             WHERE r.id = ?1",
 2729|      0|            rusqlite::params![rel_id],
 2730|      0|            |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?)),
 2731|       |        )
 2732|      0|        .map_err(|_| AppError::NotFound(format!("relationship {rel_id} not found")))?;
 2733|       |
 2734|      0|    let input_text = format!(
 2735|      0|        "Source entity: {source_name}\nTarget entity: {target_name}\nCurrent relation: {current_relation}"
 2736|       |    );
 2737|      0|    let (value, cost, is_oauth) = match mode {
 2738|      0|        EnrichMode::ClaudeCode => call_claude(
 2739|      0|            binary,
 2740|      0|            RELATION_RECLASSIFY_PROMPT,
 2741|      0|            RELATION_RECLASSIFY_SCHEMA,
 2742|      0|            &input_text,
 2743|      0|            model,
 2744|      0|            timeout,
 2745|      0|        )?,
 2746|      0|        EnrichMode::Codex => call_codex(
 2747|      0|            binary,
 2748|      0|            RELATION_RECLASSIFY_PROMPT,
 2749|      0|            RELATION_RECLASSIFY_SCHEMA,
 2750|      0|            &input_text,
 2751|      0|            model,
 2752|      0|            timeout,
 2753|      0|        )?,
 2754|       |    };
 2755|       |
 2756|      0|    let new_relation = value
 2757|      0|        .get("relation")
 2758|      0|        .and_then(|v| v.as_str())
 2759|      0|        .ok_or_else(|| AppError::Validation("LLM result missing 'relation'".into()))?;
 2760|      0|    let new_strength = value
 2761|      0|        .get("strength")
 2762|      0|        .and_then(|v| v.as_f64())
 2763|      0|        .unwrap_or(0.5);
 2764|       |
 2765|      0|    conn.execute(
 2766|      0|        "UPDATE relationships SET relation = ?1, weight = ?2 WHERE id = ?3",
 2767|      0|        rusqlite::params![new_relation, new_strength, rel_id],
 2768|      0|    )?;
 2769|       |
 2770|      0|    Ok(EnrichItemResult::Done {
 2771|      0|        memory_id: None,
 2772|      0|        entity_id: None,
 2773|      0|        entities: 0,
 2774|      0|        rels: 1,
 2775|      0|        chars_before: None,
 2776|      0|        chars_after: None,
 2777|      0|        cost,
 2778|      0|        is_oauth,
 2779|      0|    })
 2780|      0|}
 2781|       |
 2782|       |/// G27 P2: Connect isolated entities via LLM-suggested relationship.
 2783|      0|fn call_entity_connect(
 2784|      0|    conn: &Connection,
 2785|      0|    namespace: &str,
 2786|      0|    item_key: &str,
 2787|      0|    binary: &Path,
 2788|      0|    model: Option<&str>,
 2789|      0|    timeout: u64,
 2790|      0|    mode: &EnrichMode,
 2791|      0|) -> Result<EnrichItemResult, AppError> {
 2792|      0|    let pairs = scan_isolated_entity_pairs(conn, namespace, Some(1))?;
 2793|      0|    let (e1_id, e1_name, e2_id, e2_name) =
 2794|      0|        match pairs.into_iter().find(|(_, n, _, _)| n == item_key) {
 2795|      0|            Some(p) => p,
 2796|       |            None => {
 2797|      0|                return Ok(EnrichItemResult::Skipped {
 2798|      0|                    reason: "pair no longer isolated".into(),
 2799|      0|                })
 2800|       |            }
 2801|       |        };
 2802|      0|    let input_text = format!("Entity A: {e1_name}\nEntity B: {e2_name}");
 2803|      0|    let (value, cost, is_oauth) = match mode {
 2804|      0|        EnrichMode::ClaudeCode => call_claude(
 2805|      0|            binary,
 2806|      0|            ENTITY_CONNECT_PROMPT,
 2807|      0|            ENTITY_CONNECT_SCHEMA,
 2808|      0|            &input_text,
 2809|      0|            model,
 2810|      0|            timeout,
 2811|      0|        )?,
 2812|      0|        EnrichMode::Codex => call_codex(
 2813|      0|            binary,
 2814|      0|            ENTITY_CONNECT_PROMPT,
 2815|      0|            ENTITY_CONNECT_SCHEMA,
 2816|      0|            &input_text,
 2817|      0|            model,
 2818|      0|            timeout,
 2819|      0|        )?,
 2820|       |    };
 2821|      0|    let relation = value
 2822|      0|        .get("relation")
 2823|      0|        .and_then(|v| v.as_str())
 2824|      0|        .unwrap_or("none");
 2825|      0|    if relation == "none" {
 2826|      0|        return Ok(EnrichItemResult::Skipped {
 2827|      0|            reason: "LLM determined no relationship".into(),
 2828|      0|        });
 2829|      0|    }
 2830|      0|    let strength = value
 2831|      0|        .get("strength")
 2832|      0|        .and_then(|v| v.as_f64())
 2833|      0|        .unwrap_or(0.5);
 2834|      0|    conn.execute(
 2835|      0|        "INSERT OR IGNORE INTO relationships (namespace, source_id, target_id, relation, weight) VALUES (?1, ?2, ?3, ?4, ?5)",
 2836|      0|        rusqlite::params![namespace, e1_id, e2_id, relation, strength],
 2837|      0|    )?;
 2838|      0|    Ok(EnrichItemResult::Done {
 2839|      0|        memory_id: None,
 2840|      0|        entity_id: None,
 2841|      0|        entities: 0,
 2842|      0|        rels: 1,
 2843|      0|        chars_before: None,
 2844|      0|        chars_after: None,
 2845|      0|        cost,
 2846|      0|        is_oauth,
 2847|      0|    })
 2848|      0|}
 2849|       |
 2850|       |/// G27 P2: Validate entity type assignment via LLM.
 2851|      0|fn call_entity_type_validate(
 2852|      0|    conn: &Connection,
 2853|      0|    _namespace: &str,
 2854|      0|    item_key: &str,
 2855|      0|    binary: &Path,
 2856|      0|    model: Option<&str>,
 2857|      0|    timeout: u64,
 2858|      0|    mode: &EnrichMode,
 2859|      0|) -> Result<EnrichItemResult, AppError> {
 2860|      0|    let (ent_id, ent_name, ent_type): (i64, String, String) = conn
 2861|      0|        .query_row(
 2862|      0|            "SELECT id, name, type FROM entities WHERE name = ?1",
 2863|      0|            rusqlite::params![item_key],
 2864|      0|            |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?)),
 2865|       |        )
 2866|      0|        .map_err(|_| AppError::NotFound(format!("entity '{item_key}' not found")))?;
 2867|      0|    let input_text = format!("Entity: {ent_name}\nCurrent type: {ent_type}");
 2868|      0|    let (value, cost, is_oauth) = match mode {
 2869|      0|        EnrichMode::ClaudeCode => call_claude(
 2870|      0|            binary,
 2871|      0|            ENTITY_TYPE_VALIDATE_PROMPT,
 2872|      0|            ENTITY_TYPE_VALIDATE_SCHEMA,
 2873|      0|            &input_text,
 2874|      0|            model,
 2875|      0|            timeout,
 2876|      0|        )?,
 2877|      0|        EnrichMode::Codex => call_codex(
 2878|      0|            binary,
 2879|      0|            ENTITY_TYPE_VALIDATE_PROMPT,
 2880|      0|            ENTITY_TYPE_VALIDATE_SCHEMA,
 2881|      0|            &input_text,
 2882|      0|            model,
 2883|      0|            timeout,
 2884|      0|        )?,
 2885|       |    };
 2886|      0|    let validated_type = value
 2887|      0|        .get("validated_type")
 2888|      0|        .and_then(|v| v.as_str())
 2889|      0|        .unwrap_or(&ent_type);
 2890|      0|    let was_correct = value
 2891|      0|        .get("was_correct")
 2892|      0|        .and_then(|v| v.as_bool())
 2893|      0|        .unwrap_or(true);
 2894|      0|    if !was_correct {
 2895|      0|        conn.execute(
 2896|      0|            "UPDATE entities SET type = ?1 WHERE id = ?2",
 2897|      0|            rusqlite::params![validated_type, ent_id],
 2898|      0|        )?;
 2899|      0|    }
 2900|      0|    Ok(EnrichItemResult::Done {
 2901|      0|        memory_id: None,
 2902|      0|        entity_id: Some(ent_id),
 2903|      0|        entities: 1,
 2904|      0|        rels: 0,
 2905|      0|        chars_before: None,
 2906|      0|        chars_after: None,
 2907|      0|        cost,
 2908|      0|        is_oauth,
 2909|      0|    })
 2910|      0|}
 2911|       |
 2912|       |/// G27 P2: Enrich generic memory description via LLM.
 2913|      0|fn call_description_enrich(
 2914|      0|    conn: &Connection,
 2915|      0|    _namespace: &str,
 2916|      0|    item_key: &str,
 2917|      0|    binary: &Path,
 2918|      0|    model: Option<&str>,
 2919|      0|    timeout: u64,
 2920|      0|    mode: &EnrichMode,
 2921|      0|) -> Result<EnrichItemResult, AppError> {
 2922|      0|    let (mem_id, body, old_desc): (i64, String, String) = conn
 2923|      0|        .query_row(
 2924|      0|            "SELECT id, body, description FROM memories WHERE name = ?1 AND deleted_at IS NULL",
 2925|      0|            rusqlite::params![item_key],
 2926|      0|            |r| Ok((r.get(0)?, r.get::<_, String>(1)?, r.get::<_, String>(2)?)),
 2927|       |        )
 2928|      0|        .map_err(|_| AppError::NotFound(format!("memory '{item_key}' not found")))?;
 2929|      0|    let snippet: String = body.chars().take(500).collect();
 2930|      0|    let input_text = format!(
 2931|      0|        "Memory name: {item_key}\nCurrent description: {old_desc}\nBody preview: {snippet}"
 2932|       |    );
 2933|      0|    let (value, cost, is_oauth) = match mode {
 2934|      0|        EnrichMode::ClaudeCode => call_claude(
 2935|      0|            binary,
 2936|      0|            DESCRIPTION_ENRICH_PROMPT,
 2937|      0|            DESCRIPTION_ENRICH_SCHEMA,
 2938|      0|            &input_text,
 2939|      0|            model,
 2940|      0|            timeout,
 2941|      0|        )?,
 2942|      0|        EnrichMode::Codex => call_codex(
 2943|      0|            binary,
 2944|      0|            DESCRIPTION_ENRICH_PROMPT,
 2945|      0|            DESCRIPTION_ENRICH_SCHEMA,
 2946|      0|            &input_text,
 2947|      0|            model,
 2948|      0|            timeout,
 2949|      0|        )?,
 2950|       |    };
 2951|      0|    let new_desc = value
 2952|      0|        .get("description")
 2953|      0|        .and_then(|v| v.as_str())
 2954|      0|        .unwrap_or(&old_desc);
 2955|      0|    conn.execute(
 2956|      0|        "UPDATE memories SET description = ?1 WHERE id = ?2",
 2957|      0|        rusqlite::params![new_desc, mem_id],
 2958|      0|    )?;
 2959|      0|    Ok(EnrichItemResult::Done {
 2960|      0|        memory_id: Some(mem_id),
 2961|      0|        entity_id: None,
 2962|      0|        entities: 0,
 2963|      0|        rels: 0,
 2964|      0|        chars_before: Some(old_desc.len()),
 2965|      0|        chars_after: Some(new_desc.len()),
 2966|      0|        cost,
 2967|      0|        is_oauth,
 2968|      0|    })
 2969|      0|}
 2970|       |
 2971|       |/// G27 P2: Classify memory into domain category via LLM.
 2972|      0|fn call_domain_classify(
 2973|      0|    conn: &Connection,
 2974|      0|    _namespace: &str,
 2975|      0|    item_key: &str,
 2976|      0|    binary: &Path,
 2977|      0|    model: Option<&str>,
 2978|      0|    timeout: u64,
 2979|      0|    mode: &EnrichMode,
 2980|      0|) -> Result<EnrichItemResult, AppError> {
 2981|      0|    let (mem_id, body, desc): (i64, String, String) = conn
 2982|      0|        .query_row(
 2983|      0|            "SELECT id, body, description FROM memories WHERE name = ?1 AND deleted_at IS NULL",
 2984|      0|            rusqlite::params![item_key],
 2985|      0|            |r| Ok((r.get(0)?, r.get::<_, String>(1)?, r.get::<_, String>(2)?)),
 2986|       |        )
 2987|      0|        .map_err(|_| AppError::NotFound(format!("memory '{item_key}' not found")))?;
 2988|      0|    let snippet: String = body.chars().take(500).collect();
 2989|      0|    let input_text = format!("Memory: {item_key}\nDescription: {desc}\nBody preview: {snippet}");
 2990|      0|    let (value, cost, is_oauth) = match mode {
 2991|      0|        EnrichMode::ClaudeCode => call_claude(
 2992|      0|            binary,
 2993|      0|            DOMAIN_CLASSIFY_PROMPT,
 2994|      0|            DOMAIN_CLASSIFY_SCHEMA,
 2995|      0|            &input_text,
 2996|      0|            model,
 2997|      0|            timeout,
 2998|      0|        )?,
 2999|      0|        EnrichMode::Codex => call_codex(
 3000|      0|            binary,
 3001|      0|            DOMAIN_CLASSIFY_PROMPT,
 3002|      0|            DOMAIN_CLASSIFY_SCHEMA,
 3003|      0|            &input_text,
 3004|      0|            model,
 3005|      0|            timeout,
 3006|      0|        )?,
 3007|       |    };
 3008|      0|    let domain = value
 3009|      0|        .get("domain")
 3010|      0|        .and_then(|v| v.as_str())
 3011|      0|        .unwrap_or("uncategorized");
 3012|      0|    let metadata = format!(r#"{{"domain":"{}"}}"#, domain.replace('"', "\\\""));
 3013|      0|    conn.execute(
 3014|      0|        "UPDATE memories SET metadata = ?1 WHERE id = ?2",
 3015|      0|        rusqlite::params![metadata, mem_id],
 3016|      0|    )?;
 3017|      0|    Ok(EnrichItemResult::Done {
 3018|      0|        memory_id: Some(mem_id),
 3019|      0|        entity_id: None,
 3020|      0|        entities: 0,
 3021|      0|        rels: 0,
 3022|      0|        chars_before: None,
 3023|      0|        chars_after: None,
 3024|      0|        cost,
 3025|      0|        is_oauth,
 3026|      0|    })
 3027|      0|}
 3028|       |
 3029|       |/// G27 P2: Audit memory graph quality via LLM.
 3030|      0|fn call_graph_audit(
 3031|      0|    conn: &Connection,
 3032|      0|    _namespace: &str,
 3033|      0|    item_key: &str,
 3034|      0|    binary: &Path,
 3035|      0|    model: Option<&str>,
 3036|      0|    timeout: u64,
 3037|      0|    mode: &EnrichMode,
 3038|      0|) -> Result<EnrichItemResult, AppError> {
 3039|      0|    let (mem_id, body, desc): (i64, String, String) = conn
 3040|      0|        .query_row(
 3041|      0|            "SELECT id, body, description FROM memories WHERE name = ?1 AND deleted_at IS NULL",
 3042|      0|            rusqlite::params![item_key],
 3043|      0|            |r| Ok((r.get(0)?, r.get::<_, String>(1)?, r.get::<_, String>(2)?)),
 3044|       |        )
 3045|      0|        .map_err(|_| AppError::NotFound(format!("memory '{item_key}' not found")))?;
 3046|      0|    let snippet: String = body.chars().take(500).collect();
 3047|      0|    let ent_count: i64 = conn
 3048|      0|        .query_row(
 3049|      0|            "SELECT COUNT(*) FROM memory_entities WHERE memory_id = ?1",
 3050|      0|            rusqlite::params![mem_id],
 3051|      0|            |r| r.get(0),
 3052|       |        )
 3053|      0|        .unwrap_or(0);
 3054|      0|    let input_text = format!("Memory: {item_key}\nDescription: {desc}\nEntity bindings: {ent_count}\nBody preview: {snippet}");
 3055|      0|    let (value, cost, is_oauth) = match mode {
 3056|      0|        EnrichMode::ClaudeCode => call_claude(
 3057|      0|            binary,
 3058|      0|            GRAPH_AUDIT_PROMPT,
 3059|      0|            GRAPH_AUDIT_SCHEMA,
 3060|      0|            &input_text,
 3061|      0|            model,
 3062|      0|            timeout,
 3063|      0|        )?,
 3064|      0|        EnrichMode::Codex => call_codex(
 3065|      0|            binary,
 3066|      0|            GRAPH_AUDIT_PROMPT,
 3067|      0|            GRAPH_AUDIT_SCHEMA,
 3068|      0|            &input_text,
 3069|      0|            model,
 3070|      0|            timeout,
 3071|      0|        )?,
 3072|       |    };
 3073|      0|    let issues = value
 3074|      0|        .get("issues")
 3075|      0|        .and_then(|v| v.as_array())
 3076|      0|        .map(|a| a.len())
 3077|      0|        .unwrap_or(0);
 3078|      0|    Ok(EnrichItemResult::Done {
 3079|      0|        memory_id: Some(mem_id),
 3080|      0|        entity_id: None,
 3081|      0|        entities: 0,
 3082|      0|        rels: issues,
 3083|      0|        chars_before: None,
 3084|      0|        chars_after: None,
 3085|      0|        cost,
 3086|      0|        is_oauth,
 3087|      0|    })
 3088|      0|}
 3089|       |
 3090|       |/// G27 P2: Synthesize research findings into graph entities/relationships via LLM.
 3091|      0|fn call_deep_research_synth(
 3092|      0|    conn: &Connection,
 3093|      0|    namespace: &str,
 3094|      0|    item_key: &str,
 3095|      0|    binary: &Path,
 3096|      0|    model: Option<&str>,
 3097|      0|    timeout: u64,
 3098|      0|    mode: &EnrichMode,
 3099|      0|) -> Result<EnrichItemResult, AppError> {
 3100|      0|    let (mem_id, body): (i64, String) = conn
 3101|      0|        .query_row(
 3102|      0|            "SELECT id, body FROM memories WHERE name = ?1 AND deleted_at IS NULL",
 3103|      0|            rusqlite::params![item_key],
 3104|      0|            |r| Ok((r.get(0)?, r.get::<_, String>(1)?)),
 3105|       |        )
 3106|      0|        .map_err(|_| AppError::NotFound(format!("memory '{item_key}' not found")))?;
 3107|      0|    let snippet: String = body.chars().take(2000).collect();
 3108|      0|    let input_text = format!("Memory: {item_key}\nBody:\n{snippet}");
 3109|      0|    let (value, cost, is_oauth) = match mode {
 3110|      0|        EnrichMode::ClaudeCode => call_claude(
 3111|      0|            binary,
 3112|      0|            DEEP_RESEARCH_SYNTH_PROMPT,
 3113|      0|            DEEP_RESEARCH_SYNTH_SCHEMA,
 3114|      0|            &input_text,
 3115|      0|            model,
 3116|      0|            timeout,
 3117|      0|        )?,
 3118|      0|        EnrichMode::Codex => call_codex(
 3119|      0|            binary,
 3120|      0|            DEEP_RESEARCH_SYNTH_PROMPT,
 3121|      0|            DEEP_RESEARCH_SYNTH_SCHEMA,
 3122|      0|            &input_text,
 3123|      0|            model,
 3124|      0|            timeout,
 3125|      0|        )?,
 3126|       |    };
 3127|      0|    let mut ent_count = 0usize;
 3128|      0|    let mut rel_count = 0usize;
 3129|      0|    if let Some(ents) = value.get("entities").and_then(|v| v.as_array()) {
 3130|      0|        for e in ents {
 3131|      0|            let name = e.get("name").and_then(|v| v.as_str()).unwrap_or_default();
 3132|      0|            let etype_str = e
 3133|      0|                .get("entity_type")
 3134|      0|                .and_then(|v| v.as_str())
 3135|      0|                .unwrap_or("concept");
 3136|      0|            let etype: EntityType = etype_str.parse().unwrap_or(EntityType::Concept);
 3137|      0|            if name.len() >= 2 {
 3138|      0|                let ne = NewEntity {
 3139|      0|                    name: name.to_string(),
 3140|      0|                    entity_type: etype,
 3141|      0|                    description: None,
 3142|      0|                };
 3143|      0|                let _ = entities::upsert_entity(conn, namespace, &ne);
 3144|      0|                ent_count += 1;
 3145|      0|            }
 3146|       |        }
 3147|      0|    }
 3148|      0|    if let Some(rels) = value.get("relationships").and_then(|v| v.as_array()) {
 3149|      0|        for r in rels {
 3150|      0|            let src = r.get("source").and_then(|v| v.as_str()).unwrap_or_default();
 3151|      0|            let tgt = r.get("target").and_then(|v| v.as_str()).unwrap_or_default();
 3152|      0|            if src.is_empty() || tgt.is_empty() {
 3153|      0|                continue;
 3154|      0|            }
 3155|      0|            let rel = r
 3156|      0|                .get("relation")
 3157|      0|                .and_then(|v| v.as_str())
 3158|      0|                .unwrap_or("related");
 3159|      0|            let str_ = r.get("strength").and_then(|v| v.as_f64()).unwrap_or(0.5);
 3160|      0|            if let (Some(sid), Some(tid)) = (
 3161|      0|                entities::find_entity_id(conn, namespace, src)?,
 3162|      0|                entities::find_entity_id(conn, namespace, tgt)?,
 3163|      0|            ) {
 3164|      0|                let _ = entities::create_or_fetch_relationship(
 3165|      0|                    conn, namespace, sid, tid, rel, str_, None,
 3166|      0|                );
 3167|      0|                rel_count += 1;
 3168|      0|            }
 3169|       |        }
 3170|      0|    }
 3171|      0|    Ok(EnrichItemResult::Done {
 3172|      0|        memory_id: Some(mem_id),
 3173|      0|        entity_id: None,
 3174|      0|        entities: ent_count,
 3175|      0|        rels: rel_count,
 3176|      0|        chars_before: None,
 3177|      0|        chars_after: None,
 3178|      0|        cost,
 3179|      0|        is_oauth,
 3180|      0|    })
 3181|      0|}
 3182|       |
 3183|       |/// G27 P2: Extract structured body from unstructured text via LLM.
 3184|      0|fn call_body_extract(
 3185|      0|    conn: &Connection,
 3186|      0|    _namespace: &str,
 3187|      0|    item_key: &str,
 3188|      0|    binary: &Path,
 3189|      0|    model: Option<&str>,
 3190|      0|    timeout: u64,
 3191|      0|    mode: &EnrichMode,
 3192|      0|) -> Result<EnrichItemResult, AppError> {
 3193|      0|    let (mem_id, body): (i64, String) = conn
 3194|      0|        .query_row(
 3195|      0|            "SELECT id, body FROM memories WHERE name = ?1 AND deleted_at IS NULL",
 3196|      0|            rusqlite::params![item_key],
 3197|      0|            |r| Ok((r.get(0)?, r.get::<_, String>(1)?)),
 3198|       |        )
 3199|      0|        .map_err(|_| AppError::NotFound(format!("memory '{item_key}' not found")))?;
 3200|      0|    let input_text = format!("Memory: {item_key}\nBody:\n{body}");
 3201|      0|    let (value, cost, is_oauth) = match mode {
 3202|      0|        EnrichMode::ClaudeCode => call_claude(
 3203|      0|            binary,
 3204|      0|            BODY_EXTRACT_PROMPT,
 3205|      0|            BODY_EXTRACT_SCHEMA,
 3206|      0|            &input_text,
 3207|      0|            model,
 3208|      0|            timeout,
 3209|      0|        )?,
 3210|      0|        EnrichMode::Codex => call_codex(
 3211|      0|            binary,
 3212|      0|            BODY_EXTRACT_PROMPT,
 3213|      0|            BODY_EXTRACT_SCHEMA,
 3214|      0|            &input_text,
 3215|      0|            model,
 3216|      0|            timeout,
 3217|      0|        )?,
 3218|       |    };
 3219|      0|    let restructured = value
 3220|      0|        .get("restructured_body")
 3221|      0|        .and_then(|v| v.as_str())
 3222|      0|        .unwrap_or(&body);
 3223|      0|    let chars_before = body.len();
 3224|      0|    let chars_after = restructured.len();
 3225|      0|    let new_hash = blake3::hash(restructured.as_bytes()).to_hex().to_string();
 3226|      0|    conn.execute(
 3227|      0|        "UPDATE memories SET body = ?1, body_hash = ?2, updated_at = unixepoch() WHERE id = ?3",
 3228|      0|        rusqlite::params![restructured, new_hash, mem_id],
 3229|      0|    )?;
 3230|      0|    Ok(EnrichItemResult::Done {
 3231|      0|        memory_id: Some(mem_id),
 3232|      0|        entity_id: None,
 3233|      0|        entities: 0,
 3234|      0|        rels: 0,
 3235|      0|        chars_before: Some(chars_before),
 3236|      0|        chars_after: Some(chars_after),
 3237|      0|        cost,
 3238|      0|        is_oauth,
 3239|      0|    })
 3240|      0|}
 3241|       |
 3242|       |/// Scan for pairs of entities that share no direct relationship.
 3243|       |#[allow(clippy::type_complexity)]
 3244|      0|fn scan_isolated_entity_pairs(
 3245|      0|    conn: &Connection,
 3246|      0|    namespace: &str,
 3247|      0|    limit: Option<usize>,
 3248|      0|) -> Result<Vec<(i64, String, i64, String)>, AppError> {
 3249|      0|    let limit_val = limit.unwrap_or(50) as i64;
 3250|      0|    let mut stmt = conn.prepare_cached(
 3251|      0|        "SELECT e1.id, e1.name, e2.id, e2.name FROM entities e1, entities e2 \
 3252|      0|         WHERE e1.namespace = ?1 AND e2.namespace = ?1 AND e1.id < e2.id \
 3253|      0|         AND NOT EXISTS (SELECT 1 FROM relationships r WHERE \
 3254|      0|           (r.source_id = e1.id AND r.target_id = e2.id) OR \
 3255|      0|           (r.source_id = e2.id AND r.target_id = e1.id)) \
 3256|      0|         LIMIT ?2",
 3257|      0|    )?;
 3258|      0|    let rows = stmt
 3259|      0|        .query_map(rusqlite::params![namespace, limit_val], |r| {
 3260|      0|            Ok((r.get(0)?, r.get(1)?, r.get(2)?, r.get(3)?))
 3261|      0|        })?
 3262|      0|        .collect::<Result<Vec<_>, _>>()?;
 3263|      0|    Ok(rows)
 3264|      0|}
 3265|       |
 3266|       |/// Scan for entities with non-validated types (all entities for type audit).
 3267|      0|fn scan_entities_for_type_validation(
 3268|      0|    conn: &Connection,
 3269|      0|    namespace: &str,
 3270|      0|    limit: Option<usize>,
 3271|      0|) -> Result<Vec<(i64, String, String)>, AppError> {
 3272|      0|    let limit_clause = limit.map(|n| format!("LIMIT {n}")).unwrap_or_default();
 3273|      0|    let sql = format!(
 3274|      0|        "SELECT id, name, type FROM entities WHERE namespace = ?1 ORDER BY id {limit_clause}"
 3275|       |    );
 3276|      0|    let mut stmt = conn.prepare(&sql)?;
 3277|      0|    let rows = stmt
 3278|      0|        .query_map(rusqlite::params![namespace], |r| {
 3279|      0|            Ok((r.get(0)?, r.get(1)?, r.get(2)?))
 3280|      0|        })?
 3281|      0|        .collect::<Result<Vec<_>, _>>()?;
 3282|      0|    Ok(rows)
 3283|      0|}
 3284|       |
 3285|       |/// Scan for memories with generic descriptions (ingested, imported, etc).
 3286|      0|fn scan_generic_descriptions(
 3287|      0|    conn: &Connection,
 3288|      0|    namespace: &str,
 3289|      0|    limit: Option<usize>,
 3290|      0|) -> Result<Vec<(i64, String, String)>, AppError> {
 3291|      0|    let limit_clause = limit.map(|n| format!("LIMIT {n}")).unwrap_or_default();
 3292|      0|    let sql = format!(
 3293|      0|        "SELECT id, name, description FROM memories WHERE namespace = ?1 AND deleted_at IS NULL \
 3294|      0|         AND (description LIKE '%ingested%' OR description LIKE '%imported%' OR description LIKE '%added%' OR length(description) < 30) \
 3295|      0|         ORDER BY id {limit_clause}"
 3296|       |    );
 3297|      0|    let mut stmt = conn.prepare(&sql)?;
 3298|      0|    let rows = stmt
 3299|      0|        .query_map(rusqlite::params![namespace], |r| {
 3300|      0|            Ok((r.get(0)?, r.get(1)?, r.get(2)?))
 3301|      0|        })?
 3302|      0|        .collect::<Result<Vec<_>, _>>()?;
 3303|      0|    Ok(rows)
 3304|      0|}
 3305|       |
 3306|       |/// Calls the Codex CLI for a single enrichment item.
 3307|       |///
 3308|       |/// Follows the same contract as `call_claude`: returns `(value, cost_usd, is_oauth=false)`.
 3309|      0|fn call_codex(
 3310|      0|    binary: &Path,
 3311|      0|    prompt: &str,
 3312|      0|    json_schema: &str,
 3313|      0|    input_text: &str,
 3314|      0|    model: Option<&str>,
 3315|      0|    timeout_secs: u64,
 3316|      0|) -> Result<(serde_json::Value, f64, bool), AppError> {
 3317|       |    use wait_timeout::ChildExt;
 3318|       |
 3319|       |    // G31+G32+G33 (v1.0.69): validate the model BEFORE spawn, write the
 3320|       |    // schema to a trusted cache path (not /tmp), and reuse the
 3321|       |    // consolidated JSONL parser. See `codex_spawn.rs` for the canonical
 3322|       |    // hardening rationale.
 3323|      0|    super::codex_spawn::validate_codex_model(model)?;
 3324|      0|    let schema_file = super::codex_spawn::trusted_schema_path()?;
 3325|       |
 3326|      0|    let args = super::codex_spawn::CodexSpawnArgs {
 3327|      0|        binary,
 3328|      0|        prompt,
 3329|      0|        json_schema,
 3330|      0|        input_text,
 3331|      0|        model,
 3332|      0|        timeout_secs,
 3333|      0|        schema_path: schema_file.clone(),
 3334|      0|    };
 3335|      0|    let mut cmd = super::codex_spawn::build_codex_command(&args);
 3336|       |
 3337|      0|    let mut child = super::claude_runner::spawn_with_memory_limit(&mut cmd).map_err(|e| {
 3338|      0|        AppError::Io(std::io::Error::new(
 3339|      0|            e.kind(),
 3340|      0|            format!("failed to spawn codex: {e}"),
 3341|      0|        ))
 3342|      0|    })?;
 3343|       |
 3344|      0|    let full_prompt = format!("{prompt}\n\n{input_text}");
 3345|      0|    let stdin_bytes = full_prompt.into_bytes();
 3346|      0|    let mut child_stdin = child
 3347|      0|        .stdin
 3348|      0|        .take()
 3349|      0|        .ok_or_else(|| AppError::Validation("failed to open codex stdin".into()))?;
 3350|      0|    let stdin_thread = std::thread::spawn(move || -> Result<(), std::io::Error> {
 3351|      0|        child_stdin.write_all(&stdin_bytes)?;
 3352|      0|        drop(child_stdin);
 3353|      0|        Ok(())
 3354|      0|    });
 3355|       |
 3356|      0|    let start = std::time::Instant::now();
 3357|      0|    let timeout = std::time::Duration::from_secs(timeout_secs);
 3358|      0|    let status = child.wait_timeout(timeout).map_err(AppError::Io)?;
 3359|      0|    let _ = std::fs::remove_file(&schema_file);
 3360|       |
 3361|      0|    match status {
 3362|      0|        Some(exit_status) => {
 3363|      0|            stdin_thread
 3364|      0|                .join()
 3365|      0|                .map_err(|_| AppError::Validation("stdin thread panicked".into()))?
 3366|      0|                .map_err(AppError::Io)?;
 3367|       |
 3368|      0|            tracing::debug!(
 3369|       |                target: "process",
 3370|      0|                exit_code = ?exit_status.code(),
 3371|      0|                elapsed_ms = start.elapsed().as_millis() as u64,
 3372|      0|                "external process completed"
 3373|       |            );
 3374|       |
 3375|      0|            let mut stdout_buf = Vec::new();
 3376|      0|            if let Some(mut out) = child.stdout.take() {
 3377|      0|                std::io::Read::read_to_end(&mut out, &mut stdout_buf).map_err(AppError::Io)?;
 3378|      0|            }
 3379|      0|            if !exit_status.success() {
 3380|      0|                let mut stderr_buf = Vec::new();
 3381|      0|                if let Some(mut err) = child.stderr.take() {
 3382|      0|                    std::io::Read::read_to_end(&mut err, &mut stderr_buf).map_err(AppError::Io)?;
 3383|      0|                }
 3384|      0|                let stderr_str = String::from_utf8_lossy(&stderr_buf);
 3385|      0|                tracing::warn!(
 3386|       |                    target: "enrich",
 3387|      0|                    exit_code = ?exit_status.code(),
 3388|      0|                    stderr = %stderr_str.trim(),
 3389|      0|                    "codex process failed"
 3390|       |                );
 3391|      0|                return Err(AppError::Validation(format!(
 3392|      0|                    "codex exited with code {:?}: {}",
 3393|      0|                    exit_status.code(),
 3394|      0|                    stderr_str.trim()
 3395|      0|                )));
 3396|      0|            }
 3397|      0|            let stdout_str = String::from_utf8(stdout_buf)
 3398|      0|                .map_err(|_| AppError::Validation("codex stdout is not valid UTF-8".into()))?;
 3399|       |            // G32: use the JSONL parser, NOT serde_json::from_str on the
 3400|       |            // entire stdout (codex emits one event per line).
 3401|      0|            let result = super::codex_spawn::parse_codex_jsonl(&stdout_str)?;
 3402|       |            // Wrap the extraction as a JSON object so downstream code
 3403|       |            // (which expects a single `serde_json::Value`) keeps working.
 3404|       |            // `ExtractedUrl` lacks `Serialize` so we project to a
 3405|       |            // serde-friendly vector.
 3406|      0|            let urls: Vec<serde_json::Value> = result
 3407|      0|                .extraction
 3408|      0|                .urls
 3409|      0|                .iter()
 3410|      0|                .map(|u| serde_json::json!({"url": u.url, "offset": u.offset}))
 3411|      0|                .collect();
 3412|      0|            let value = serde_json::json!({
 3413|      0|                "entities": result.extraction.entities,
 3414|      0|                "relationships": result.extraction.relationships,
 3415|      0|                "urls": urls,
 3416|      0|                "extraction_method": result.extraction.extraction_method,
 3417|       |            });
 3418|      0|            Ok((value, 0.0, false))
 3419|       |        }
 3420|       |        None => {
 3421|      0|            let _ = child.kill();
 3422|      0|            let _ = child.wait();
 3423|      0|            let _ = stdin_thread.join();
 3424|      0|            Err(AppError::Validation(format!(
 3425|      0|                "codex timed out after {timeout_secs} seconds"
 3426|      0|            )))
 3427|       |        }
 3428|       |    }
 3429|      0|}
 3430|       |
 3431|       |// ---------------------------------------------------------------------------
 3432|       |// Tests
 3433|       |// ---------------------------------------------------------------------------
 3434|       |
 3435|       |#[cfg(test)]
 3436|       |mod tests {
 3437|       |    use super::*;
 3438|       |    use rusqlite::Connection;
 3439|       |
 3440|       |    /// Opens an in-memory SQLite database with a minimal schema for unit tests.
 3441|      9|    fn open_test_db() -> Connection {
 3442|      9|        let conn = Connection::open_in_memory().expect("in-memory db");
 3443|      9|        conn.execute_batch(
 3444|      9|            "CREATE TABLE memories (
 3445|      9|                id          INTEGER PRIMARY KEY AUTOINCREMENT,
 3446|      9|                namespace   TEXT NOT NULL DEFAULT 'global',
 3447|      9|                name        TEXT NOT NULL,
 3448|      9|                type        TEXT NOT NULL DEFAULT 'note',
 3449|      9|                description TEXT NOT NULL DEFAULT '',
 3450|      9|                body        TEXT NOT NULL DEFAULT '',
 3451|      9|                body_hash   TEXT NOT NULL DEFAULT '',
 3452|      9|                session_id  TEXT,
 3453|      9|                source      TEXT NOT NULL DEFAULT 'agent',
 3454|      9|                metadata    TEXT NOT NULL DEFAULT '{}',
 3455|      9|                created_at  INTEGER NOT NULL DEFAULT (unixepoch()),
 3456|      9|                updated_at  INTEGER NOT NULL DEFAULT (unixepoch()),
 3457|      9|                deleted_at  INTEGER,
 3458|      9|                UNIQUE(namespace, name)
 3459|      9|            );
 3460|      9|            CREATE TABLE entities (
 3461|      9|                id          INTEGER PRIMARY KEY AUTOINCREMENT,
 3462|      9|                namespace   TEXT NOT NULL DEFAULT 'global',
 3463|      9|                name        TEXT NOT NULL,
 3464|      9|                type        TEXT NOT NULL DEFAULT 'concept',
 3465|      9|                description TEXT,
 3466|      9|                degree      INTEGER NOT NULL DEFAULT 0,
 3467|      9|                created_at  INTEGER NOT NULL DEFAULT (unixepoch()),
 3468|      9|                updated_at  INTEGER NOT NULL DEFAULT (unixepoch()),
 3469|      9|                UNIQUE(namespace, name)
 3470|      9|            );
 3471|      9|            CREATE TABLE memory_entities (
 3472|      9|                memory_id  INTEGER NOT NULL,
 3473|      9|                entity_id  INTEGER NOT NULL,
 3474|      9|                PRIMARY KEY (memory_id, entity_id)
 3475|      9|            );
 3476|      9|            CREATE TABLE relationships (
 3477|      9|                id         INTEGER PRIMARY KEY AUTOINCREMENT,
 3478|      9|                namespace  TEXT NOT NULL DEFAULT 'global',
 3479|      9|                source_id  INTEGER NOT NULL,
 3480|      9|                target_id  INTEGER NOT NULL,
 3481|      9|                relation   TEXT NOT NULL,
 3482|      9|                weight     REAL NOT NULL DEFAULT 0.5,
 3483|      9|                description TEXT,
 3484|      9|                UNIQUE(source_id, target_id, relation)
 3485|      9|            );",
 3486|       |        )
 3487|      9|        .expect("schema creation must succeed");
 3488|      9|        conn
 3489|      9|    }
 3490|       |
 3491|       |    #[test]
 3492|      1|    fn scan_unbound_memories_finds_memories_without_bindings() {
 3493|      1|        let conn = open_test_db();
 3494|      1|        conn.execute(
 3495|      1|            "INSERT INTO memories (namespace, name, body) VALUES ('global', 'test-mem', 'some body content')",
 3496|      1|            [],
 3497|       |        )
 3498|      1|        .unwrap();
 3499|       |
 3500|      1|        let results = scan_unbound_memories(&conn, "global", None, &[]).unwrap();
 3501|      1|        assert_eq!(results.len(), 1);
 3502|      1|        assert_eq!(results[0].1, "test-mem");
 3503|      1|    }
 3504|       |
 3505|       |    #[test]
 3506|      1|    fn scan_unbound_memories_excludes_bound_memories() {
 3507|      1|        let conn = open_test_db();
 3508|      1|        conn.execute(
 3509|      1|            "INSERT INTO memories (namespace, name, body) VALUES ('global', 'bound-mem', 'body')",
 3510|      1|            [],
 3511|       |        )
 3512|      1|        .unwrap();
 3513|      1|        let mem_id: i64 = conn
 3514|      1|            .query_row("SELECT id FROM memories WHERE name='bound-mem'", [], |r| {
 3515|      1|                r.get(0)
 3516|      1|            })
 3517|      1|            .unwrap();
 3518|      1|        conn.execute(
 3519|      1|            "INSERT INTO entities (namespace, name) VALUES ('global', 'some-entity')",
 3520|      1|            [],
 3521|       |        )
 3522|      1|        .unwrap();
 3523|      1|        let ent_id: i64 = conn
 3524|      1|            .query_row(
 3525|      1|                "SELECT id FROM entities WHERE name='some-entity'",
 3526|      1|                [],
 3527|      1|                |r| r.get(0),
 3528|       |            )
 3529|      1|            .unwrap();
 3530|      1|        conn.execute(
 3531|      1|            "INSERT INTO memory_entities (memory_id, entity_id) VALUES (?1, ?2)",
 3532|      1|            rusqlite::params![mem_id, ent_id],
 3533|       |        )
 3534|      1|        .unwrap();
 3535|       |
 3536|      1|        let results = scan_unbound_memories(&conn, "global", None, &[]).unwrap();
 3537|      1|        assert!(results.is_empty(), "bound memory must not appear in scan");
                                                  ^0
 3538|      1|    }
 3539|       |
 3540|       |    #[test]
 3541|      1|    fn scan_entities_without_description_finds_null_description() {
 3542|      1|        let conn = open_test_db();
 3543|      1|        conn.execute(
 3544|      1|            "INSERT INTO entities (namespace, name, type, description) VALUES ('global', 'my-tool', 'tool', NULL)",
 3545|      1|            [],
 3546|       |        )
 3547|      1|        .unwrap();
 3548|       |
 3549|      1|        let results = scan_entities_without_description(&conn, "global", None).unwrap();
 3550|      1|        assert_eq!(results.len(), 1);
 3551|      1|        assert_eq!(results[0].1, "my-tool");
 3552|      1|    }
 3553|       |
 3554|       |    #[test]
 3555|      1|    fn scan_entities_without_description_excludes_entities_with_description() {
 3556|      1|        let conn = open_test_db();
 3557|      1|        conn.execute(
 3558|      1|            "INSERT INTO entities (namespace, name, type, description) VALUES ('global', 'described-tool', 'tool', 'Has a description already')",
 3559|      1|            [],
 3560|       |        )
 3561|      1|        .unwrap();
 3562|       |
 3563|      1|        let results = scan_entities_without_description(&conn, "global", None).unwrap();
 3564|      1|        assert!(
 3565|      1|            results.is_empty(),
 3566|      0|            "entity with description must not appear"
 3567|       |        );
 3568|      1|    }
 3569|       |
 3570|       |    #[test]
 3571|      1|    fn scan_short_body_memories_finds_short_bodies() {
 3572|      1|        let conn = open_test_db();
 3573|      1|        conn.execute(
 3574|      1|            "INSERT INTO memories (namespace, name, body) VALUES ('global', 'short-mem', 'hi')",
 3575|      1|            [],
 3576|       |        )
 3577|      1|        .unwrap();
 3578|       |
 3579|      1|        let results = scan_short_body_memories(&conn, "global", 100, None).unwrap();
 3580|      1|        assert_eq!(results.len(), 1);
 3581|      1|        assert_eq!(results[0].1, "short-mem");
 3582|      1|    }
 3583|       |
 3584|       |    #[test]
 3585|      1|    fn scan_short_body_memories_excludes_long_bodies() {
 3586|      1|        let conn = open_test_db();
 3587|      1|        let long_body = "a".repeat(1000);
 3588|      1|        conn.execute(
 3589|      1|            "INSERT INTO memories (namespace, name, body) VALUES ('global', 'long-mem', ?1)",
 3590|      1|            rusqlite::params![long_body],
 3591|       |        )
 3592|      1|        .unwrap();
 3593|       |
 3594|      1|        let results = scan_short_body_memories(&conn, "global", 100, None).unwrap();
 3595|      1|        assert!(results.is_empty(), "long memory must not appear in scan");
                                                  ^0
 3596|      1|    }
 3597|       |
 3598|       |    #[test]
 3599|      1|    fn scan_respects_limit() {
 3600|      1|        let conn = open_test_db();
 3601|      6|        for i in 0..5 {
                          ^5
 3602|      5|            conn.execute(
 3603|      5|                &format!("INSERT INTO memories (namespace, name, body) VALUES ('global', 'mem-{i}', 'short')"),
 3604|      5|                [],
 3605|      5|            )
 3606|      5|            .unwrap();
 3607|      5|        }
 3608|       |
 3609|      1|        let results = scan_short_body_memories(&conn, "global", 1000, Some(3)).unwrap();
 3610|      1|        assert_eq!(results.len(), 3, "limit must be respected");
                                                   ^0
 3611|      1|    }
 3612|       |
 3613|       |    #[test]
 3614|      1|    fn queue_db_schema_creates_correctly() {
 3615|      1|        let tmp_path = format!("/tmp/test-enrich-queue-{}.sqlite", std::process::id());
 3616|      1|        let conn = open_queue_db(&tmp_path).expect("queue db must open");
 3617|      1|        let count: i64 = conn
 3618|      1|            .query_row("SELECT COUNT(*) FROM queue", [], |r| r.get(0))
 3619|      1|            .unwrap();
 3620|      1|        assert_eq!(count, 0);
 3621|      1|        let _ = std::fs::remove_file(&tmp_path);
 3622|      1|    }
 3623|       |
 3624|       |    #[test]
 3625|      1|    fn parse_claude_output_valid_bindings() {
 3626|      1|        let output = r#"[
 3627|      1|            {"type":"system","subtype":"init"},
 3628|      1|            {"type":"result","is_error":false,"total_cost_usd":0.01,
 3629|      1|             "structured_output":{"entities":[{"name":"rust-lang","entity_type":"tool"}],"relationships":[]}}
 3630|      1|        ]"#;
 3631|      1|        let result = crate::commands::claude_runner::parse_claude_output(output)
 3632|      1|            .expect("must parse successfully");
 3633|      1|        assert!(result.value.get("entities").is_some());
 3634|      1|        assert!((result.cost_usd - 0.01).abs() < f64::EPSILON);
 3635|      1|        assert!(!result.is_oauth);
 3636|      1|    }
 3637|       |
 3638|       |    #[test]
 3639|      1|    fn parse_claude_output_detects_oauth() {
 3640|      1|        let output = r#"[
 3641|      1|            {"type":"system","subtype":"init","apiKeySource":"none"},
 3642|      1|            {"type":"result","is_error":false,"total_cost_usd":0.0,
 3643|      1|             "structured_output":{"entities":[],"relationships":[]}}
 3644|      1|        ]"#;
 3645|      1|        let result = crate::commands::claude_runner::parse_claude_output(output).unwrap();
 3646|      1|        assert!(result.is_oauth);
 3647|      1|    }
 3648|       |
 3649|       |    #[test]
 3650|      1|    fn parse_claude_output_rate_limit_returns_error() {
 3651|      1|        let output = r#"[
 3652|      1|            {"type":"system","subtype":"init"},
 3653|      1|            {"type":"result","is_error":true,"error":"rate_limit exceeded"}
 3654|      1|        ]"#;
 3655|      1|        let err = crate::commands::claude_runner::parse_claude_output(output).unwrap_err();
 3656|      1|        assert!(matches!(err, AppError::RateLimited { .. }));
                              ^0
 3657|      1|    }
 3658|       |
 3659|       |    #[test]
 3660|      1|    fn parse_claude_output_auth_error() {
 3661|      1|        let output = r#"[
 3662|      1|            {"type":"system","subtype":"init"},
 3663|      1|            {"type":"result","is_error":true,"error":"authentication failed"}
 3664|      1|        ]"#;
 3665|      1|        let err = crate::commands::claude_runner::parse_claude_output(output).unwrap_err();
 3666|      1|        assert!(format!("{err}").contains("authentication failed"));
 3667|      1|    }
 3668|       |
 3669|       |    #[test]
 3670|      1|    fn dry_run_emits_preview_without_calling_llm() {
 3671|       |        // This test validates the dry-run NDJSON contract without spawning any process.
 3672|       |        // The scan_operation function requires a DB; we build one in-memory but cannot
 3673|       |        // call run() directly because it needs AppPaths (disk). Instead we test the
 3674|       |        // lower-level helpers that the dry-run path relies on.
 3675|      1|        let conn = open_test_db();
 3676|      1|        conn.execute(
 3677|      1|            "INSERT INTO memories (namespace, name, body) VALUES ('global', 'dry-mem', 'tiny')",
 3678|      1|            [],
 3679|       |        )
 3680|      1|        .unwrap();
 3681|       |
 3682|      1|        let results = scan_short_body_memories(&conn, "global", 1000, None).unwrap();
 3683|      1|        assert_eq!(results.len(), 1);
 3684|      1|        assert_eq!(results[0].1, "dry-mem");
 3685|       |        // If scan finds the item and dry_run is set, no LLM would be called.
 3686|       |        // The NDJSON emission is tested via integration tests with a fake binary.
 3687|      1|    }
 3688|       |
 3689|       |    #[test]
 3690|      1|    fn persist_entity_description_updates_db() {
 3691|      1|        let conn = open_test_db();
 3692|      1|        conn.execute(
 3693|      1|            "INSERT INTO entities (namespace, name, type) VALUES ('global', 'tokio-runtime', 'tool')",
 3694|      1|            [],
 3695|       |        )
 3696|      1|        .unwrap();
 3697|      1|        let eid: i64 = conn
 3698|      1|            .query_row(
 3699|      1|                "SELECT id FROM entities WHERE name='tokio-runtime'",
 3700|      1|                [],
 3701|      1|                |r| r.get(0),
 3702|       |            )
 3703|      1|            .unwrap();
 3704|       |
 3705|      1|        persist_entity_description(&conn, eid, "Async runtime for Rust applications").unwrap();
 3706|       |
 3707|      1|        let desc: String = conn
 3708|      1|            .query_row(
 3709|      1|                "SELECT description FROM entities WHERE id=?1",
 3710|      1|                rusqlite::params![eid],
 3711|      1|                |r| r.get(0),
 3712|       |            )
 3713|      1|            .unwrap();
 3714|      1|        assert_eq!(desc, "Async runtime for Rust applications");
 3715|      1|    }
 3716|       |
 3717|       |    #[test]
 3718|      1|    fn bindings_schema_is_valid_json() {
 3719|      1|        let _: serde_json::Value =
 3720|      1|            serde_json::from_str(BINDINGS_SCHEMA).expect("BINDINGS_SCHEMA must be valid JSON");
 3721|      1|    }
 3722|       |
 3723|       |    #[test]
 3724|      1|    fn entity_description_schema_is_valid_json() {
 3725|      1|        let _: serde_json::Value = serde_json::from_str(ENTITY_DESCRIPTION_SCHEMA)
 3726|      1|            .expect("ENTITY_DESCRIPTION_SCHEMA must be valid JSON");
 3727|      1|    }
 3728|       |
 3729|       |    #[test]
 3730|      1|    fn body_enrich_schema_is_valid_json() {
 3731|      1|        let _: serde_json::Value = serde_json::from_str(BODY_ENRICH_SCHEMA)
 3732|      1|            .expect("BODY_ENRICH_SCHEMA must be valid JSON");
 3733|      1|    }
 3734|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/export.rs:
    1|       |//! Handler for the `export` CLI subcommand.
    2|       |
    3|       |use crate::cli::MemoryType;
    4|       |use crate::errors::AppError;
    5|       |use crate::output;
    6|       |use crate::paths::AppPaths;
    7|       |use crate::storage::connection::open_ro;
    8|       |use serde::Serialize;
    9|       |
   10|       |#[derive(clap::Args)]
   11|       |#[command(after_long_help = "EXAMPLES:\n  \
   12|       |    # Export all memories as NDJSON\n  \
   13|       |    sqlite-graphrag export\n\n  \
   14|       |    # Export only decision memories from a namespace\n  \
   15|       |    sqlite-graphrag export --type decision --namespace my-project\n\n  \
   16|       |    # Export including soft-deleted memories\n  \
   17|       |    sqlite-graphrag export --include-deleted\n\n  \
   18|       |    # Pipe to file for backup\n  \
   19|       |    sqlite-graphrag export > backup.ndjson")]
   20|       |pub struct ExportArgs {
   21|       |    /// Namespace (env: SQLITE_GRAPHRAG_NAMESPACE, default: global).
   22|       |    #[arg(
   23|       |        long,
   24|       |        help = "Namespace (env: SQLITE_GRAPHRAG_NAMESPACE, default: global)"
   25|       |    )]
   26|       |    pub namespace: Option<String>,
   27|       |    /// Filter by memory type.
   28|       |    #[arg(long, value_enum)]
   29|       |    pub r#type: Option<MemoryType>,
   30|       |    /// Include soft-deleted memories in the export.
   31|       |    #[arg(long, default_value_t = false)]
   32|       |    pub include_deleted: bool,
   33|       |    /// Maximum number of memories to export (default: 100000).
   34|       |    #[arg(long, default_value_t = 100_000)]
   35|       |    pub limit: usize,
   36|       |    /// Offset for pagination.
   37|       |    #[arg(long, default_value_t = 0)]
   38|       |    pub offset: usize,
   39|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   40|       |    pub json: bool,
   41|       |    /// Path to graphrag.sqlite (overrides SQLITE_GRAPHRAG_DB_PATH and default CWD).
   42|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   43|       |    pub db: Option<String>,
   44|       |}
   45|       |
   46|       |#[derive(Serialize)]
   47|       |struct ExportMemoryLine {
   48|       |    name: String,
   49|       |    r#type: String,
   50|       |    memory_type: String,
   51|       |    description: String,
   52|       |    body: String,
   53|       |    namespace: String,
   54|       |    created_at_iso: String,
   55|       |    updated_at_iso: String,
   56|       |    #[serde(skip_serializing_if = "Option::is_none")]
   57|       |    deleted_at_iso: Option<String>,
   58|       |}
   59|       |
   60|       |#[derive(Serialize)]
   61|       |struct ExportSummary {
   62|       |    summary: bool,
   63|       |    exported: usize,
   64|       |    namespace: String,
   65|       |    elapsed_ms: u64,
   66|       |}
   67|       |
   68|       |/// Exports memories as NDJSON (one JSON line per memory, followed by a summary line).
   69|      0|pub fn run(args: ExportArgs) -> Result<(), AppError> {
   70|      0|    let start = std::time::Instant::now();
   71|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
   72|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
   73|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
   74|      0|    let conn = open_ro(&paths.db)?;
   75|       |
   76|      0|    let deleted_filter = if args.include_deleted {
   77|      0|        ""
   78|       |    } else {
   79|      0|        "AND m.deleted_at IS NULL"
   80|       |    };
   81|       |
   82|      0|    let limit_i64 = args.limit as i64;
   83|      0|    let offset_i64 = args.offset as i64;
   84|      0|    let type_str: Option<String> = args.r#type.map(|t| t.as_str().to_string());
   85|       |
   86|      0|    let rows = fetch_rows(
   87|      0|        &conn,
   88|      0|        &namespace,
   89|      0|        &type_str,
   90|      0|        deleted_filter,
   91|      0|        limit_i64,
   92|      0|        offset_i64,
   93|      0|    )?;
   94|       |
   95|      0|    let exported = rows.len();
   96|      0|    for line in &rows {
   97|      0|        output::emit_json_compact(line)?;
   98|       |    }
   99|       |
  100|      0|    output::emit_json_compact(&ExportSummary {
  101|      0|        summary: true,
  102|      0|        exported,
  103|      0|        namespace: namespace.clone(),
  104|      0|        elapsed_ms: start.elapsed().as_millis() as u64,
  105|      0|    })?;
  106|       |
  107|      0|    Ok(())
  108|      0|}
  109|       |
  110|      0|fn fetch_rows(
  111|      0|    conn: &rusqlite::Connection,
  112|      0|    namespace: &str,
  113|      0|    type_str: &Option<String>,
  114|      0|    deleted_filter: &str,
  115|      0|    limit: i64,
  116|      0|    offset: i64,
  117|      0|) -> Result<Vec<ExportMemoryLine>, AppError> {
  118|      0|    let rows = if let Some(t) = type_str {
  119|      0|        let sql = format!(
  120|      0|            "SELECT m.name, m.type, m.description, m.body, m.namespace, \
  121|      0|                    m.created_at, m.updated_at, m.deleted_at \
  122|      0|             FROM memories m \
  123|      0|             WHERE m.namespace = ?1 {deleted_filter} AND m.type = ?2 \
  124|      0|             ORDER BY m.name \
  125|      0|             LIMIT ?3 OFFSET ?4"
  126|       |        );
  127|      0|        let mut stmt = conn.prepare(&sql)?;
  128|      0|        let result = stmt
  129|      0|            .query_map(rusqlite::params![namespace, t, limit, offset], map_row)?
  130|      0|            .collect::<Result<Vec<_>, _>>()?;
  131|      0|        result
  132|       |    } else {
  133|      0|        let sql = format!(
  134|      0|            "SELECT m.name, m.type, m.description, m.body, m.namespace, \
  135|      0|                    m.created_at, m.updated_at, m.deleted_at \
  136|      0|             FROM memories m \
  137|      0|             WHERE m.namespace = ?1 {deleted_filter} \
  138|      0|             ORDER BY m.name \
  139|      0|             LIMIT ?2 OFFSET ?3"
  140|       |        );
  141|      0|        let mut stmt = conn.prepare(&sql)?;
  142|      0|        let result = stmt
  143|      0|            .query_map(rusqlite::params![namespace, limit, offset], map_row)?
  144|      0|            .collect::<Result<Vec<_>, _>>()?;
  145|      0|        result
  146|       |    };
  147|      0|    Ok(rows)
  148|      0|}
  149|       |
  150|      0|fn map_row(row: &rusqlite::Row<'_>) -> rusqlite::Result<ExportMemoryLine> {
  151|      0|    let memory_type_val: String = row.get(1)?;
  152|       |    Ok(ExportMemoryLine {
  153|      0|        name: row.get(0)?,
  154|      0|        r#type: memory_type_val.clone(),
  155|      0|        memory_type: memory_type_val,
  156|      0|        description: row.get(2)?,
  157|      0|        body: row.get(3)?,
  158|      0|        namespace: row.get(4)?,
  159|      0|        created_at_iso: crate::tz::epoch_to_iso(row.get::<_, i64>(5)?),
  160|      0|        updated_at_iso: crate::tz::epoch_to_iso(row.get::<_, i64>(6)?),
  161|      0|        deleted_at_iso: row.get::<_, Option<i64>>(7)?.map(crate::tz::epoch_to_iso),
  162|       |    })
  163|      0|}
  164|       |
  165|       |#[cfg(test)]
  166|       |mod tests {
  167|       |    use super::*;
  168|       |
  169|       |    #[test]
  170|      1|    fn export_line_emits_both_type_and_memory_type() {
  171|      1|        let line = ExportMemoryLine {
  172|      1|            name: "test".to_string(),
  173|      1|            r#type: "document".to_string(),
  174|      1|            memory_type: "document".to_string(),
  175|      1|            description: "desc".to_string(),
  176|      1|            body: "body".to_string(),
  177|      1|            namespace: "global".to_string(),
  178|      1|            created_at_iso: "2025-01-01T00:00:00Z".to_string(),
  179|      1|            updated_at_iso: "2025-01-01T00:00:00Z".to_string(),
  180|      1|            deleted_at_iso: None,
  181|      1|        };
  182|      1|        let json = serde_json::to_value(&line).unwrap();
  183|      1|        assert_eq!(json["type"], "document");
  184|      1|        assert_eq!(json["memory_type"], "document");
  185|      1|    }
  186|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/forget.rs:
    1|       |//! Handler for the `forget` CLI subcommand.
    2|       |
    3|       |use crate::errors::AppError;
    4|       |use crate::i18n::errors_msg;
    5|       |use crate::output;
    6|       |use crate::paths::AppPaths;
    7|       |use crate::storage::connection::open_rw;
    8|       |use crate::storage::memories;
    9|       |use rusqlite::{params, OptionalExtension};
   10|       |use serde::Serialize;
   11|       |
   12|       |#[derive(clap::Args)]
   13|       |#[command(after_long_help = "EXAMPLES:\n  \
   14|       |    # Soft-delete a memory by name (positional form)\n  \
   15|       |    sqlite-graphrag forget onboarding\n\n  \
   16|       |    # Soft-delete using the named flag form\n  \
   17|       |    sqlite-graphrag forget --name onboarding\n\n  \
   18|       |    # Soft-delete from a specific namespace\n  \
   19|       |    sqlite-graphrag forget onboarding --namespace my-project")]
   20|       |pub struct ForgetArgs {
   21|       |    /// Memory name as a positional argument. Alternative to `--name`.
   22|       |    #[arg(
   23|       |        value_name = "NAME",
   24|       |        conflicts_with = "name",
   25|       |        help = "Memory name to soft-delete; alternative to --name"
   26|       |    )]
   27|       |    pub name_positional: Option<String>,
   28|       |    /// Memory name to soft-delete. The row is preserved with `deleted_at` set, recoverable via `restore`.
   29|       |    /// Use `purge` to permanently remove soft-deleted memories.
   30|       |    #[arg(long)]
   31|       |    pub name: Option<String>,
   32|       |    #[arg(
   33|       |        long,
   34|       |        help = "Namespace (env: SQLITE_GRAPHRAG_NAMESPACE, default: global)"
   35|       |    )]
   36|       |    pub namespace: Option<String>,
   37|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   38|       |    pub json: bool,
   39|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   40|       |    pub db: Option<String>,
   41|       |}
   42|       |
   43|       |#[derive(Serialize)]
   44|       |struct ForgetResponse {
   45|       |    /// Outcome of the forget operation: `soft_deleted`, `already_deleted`, or `not_found`.
   46|       |    action: String,
   47|       |    /// True only when this invocation actively transitioned the memory from live to soft-deleted.
   48|       |    forgotten: bool,
   49|       |    name: String,
   50|       |    namespace: String,
   51|       |    /// Unix epoch seconds when the memory was soft-deleted; `None` when `action="not_found"`.
   52|       |    #[serde(skip_serializing_if = "Option::is_none")]
   53|       |    deleted_at: Option<i64>,
   54|       |    /// RFC 3339 UTC timestamp parallel to `deleted_at` for ISO 8601 parsers.
   55|       |    #[serde(skip_serializing_if = "Option::is_none")]
   56|       |    deleted_at_iso: Option<String>,
   57|       |    /// Total execution time in milliseconds from handler start to serialisation.
   58|       |    elapsed_ms: u64,
   59|       |}
   60|       |
   61|      0|pub fn run(args: ForgetArgs) -> Result<(), AppError> {
   62|      0|    let start = std::time::Instant::now();
   63|      0|    tracing::debug!(target: "forget", name = ?args.name_positional.as_deref().or(args.name.as_deref()), "soft-deleting memory");
   64|       |    // Resolve name from positional or --name flag; both are optional, at least one is required.
   65|      0|    let name = args.name_positional.or(args.name).ok_or_else(|| {
   66|      0|        AppError::Validation("name required: pass as positional argument or via --name".to_string())
   67|      0|    })?;
   68|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
   69|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
   70|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
   71|       |
   72|      0|    let conn = open_rw(&paths.db)?;
   73|       |
   74|       |    // Probe state without filtering on `deleted_at` so we can distinguish
   75|       |    // `not_found` (no row) from `already_deleted` (row with deleted_at set)
   76|       |    // from the live case (deleted_at IS NULL) handled by `soft_delete`.
   77|      0|    let probe: Option<(i64, Option<i64>)> = conn
   78|      0|        .query_row(
   79|      0|            "SELECT id, deleted_at FROM memories WHERE namespace = ?1 AND name = ?2",
   80|      0|            params![namespace, name],
   81|      0|            |r| Ok((r.get::<_, i64>(0)?, r.get::<_, Option<i64>>(1)?)),
   82|       |        )
   83|      0|        .optional()?;
   84|       |
   85|      0|    let (action, forgotten, deleted_at, memory_id) = match probe {
   86|      0|        None => ("not_found", false, None, None),
   87|      0|        Some((id, Some(existing))) => ("already_deleted", false, Some(existing), Some(id)),
   88|      0|        Some((id, None)) => {
   89|       |            // G39 Passo 4 (v1.0.69): remove the embedding vector BEFORE the
   90|       |            // soft-delete so we do not leave a `vec_memories` row that will
   91|       |            // show up as `vec_memories_orphaned` in `health --json`. The
   92|       |            // operation is best-effort: a failure is logged but does not
   93|       |            // abort the soft-delete (the user-visible action is the same).
   94|      0|            if let Err(e) = memories::delete_vec(&conn, id) {
   95|      0|                tracing::warn!(
   96|       |                    target: "forget",
   97|       |                    memory_id = id,
   98|       |                    error = %e,
   99|      0|                    "vec cleanup before soft-delete failed — orphan vector may be left",
  100|       |                );
  101|      0|            }
  102|      0|            let ok = memories::soft_delete(&conn, &namespace, &name)?;
  103|      0|            if !ok {
  104|       |                // Race: row was concurrently soft-deleted between probe and update.
  105|       |                // Re-read to get the current `deleted_at`.
  106|      0|                let current: Option<i64> = conn
  107|      0|                    .query_row(
  108|      0|                        "SELECT deleted_at FROM memories WHERE id = ?1",
  109|      0|                        params![id],
  110|      0|                        |r| r.get::<_, Option<i64>>(0),
  111|       |                    )
  112|      0|                    .optional()?
  113|      0|                    .flatten();
  114|      0|                ("already_deleted", false, current, Some(id))
  115|       |            } else {
  116|      0|                let ts: Option<i64> = conn
  117|      0|                    .query_row(
  118|      0|                        "SELECT deleted_at FROM memories WHERE id = ?1",
  119|      0|                        params![id],
  120|      0|                        |r| r.get::<_, Option<i64>>(0),
  121|       |                    )
  122|      0|                    .optional()?
  123|      0|                    .flatten();
  124|      0|                ("soft_deleted", true, ts, Some(id))
  125|       |            }
  126|       |        }
  127|       |    };
  128|       |
  129|      0|    if forgotten {
  130|      0|        if let Some(id) = memory_id {
  131|       |            // FTS5 external-content: manual `DELETE FROM fts_memories WHERE rowid=?`
  132|       |            // corrupts the index. The correct cleanup happens via the `trg_fts_ad` trigger
  133|       |            // when `purge` physically removes the row from `memories`. Between soft-delete
  134|       |            // and purge, FTS queries filter `m.deleted_at IS NULL` in the JOIN.
  135|      0|            if let Err(e) = memories::delete_vec(&conn, id) {
  136|      0|                tracing::warn!(target: "forget", memory_id = id, error = %e, "vec cleanup failed — orphan vector left");
  137|      0|            }
  138|      0|        }
  139|      0|    }
  140|       |
  141|      0|    conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
  142|       |
  143|      0|    if action == "not_found" {
  144|      0|        return Err(AppError::NotFound(errors_msg::memory_not_found(
  145|      0|            &name, &namespace,
  146|      0|        )));
  147|      0|    }
  148|       |
  149|      0|    let deleted_at_iso = deleted_at.map(crate::tz::epoch_to_iso);
  150|      0|    let response = ForgetResponse {
  151|      0|        action: action.to_string(),
  152|      0|        forgotten,
  153|      0|        name: name.clone(),
  154|      0|        namespace: namespace.clone(),
  155|      0|        deleted_at,
  156|      0|        deleted_at_iso,
  157|      0|        elapsed_ms: start.elapsed().as_millis() as u64,
  158|      0|    };
  159|      0|    output::emit_json(&response)?;
  160|       |
  161|      0|    Ok(())
  162|      0|}
  163|       |
  164|       |#[cfg(test)]
  165|       |mod tests {
  166|       |    use super::*;
  167|       |
  168|       |    #[test]
  169|      1|    fn forget_response_serializes_basic_fields() {
  170|      1|        let resp = ForgetResponse {
  171|      1|            action: "soft_deleted".to_string(),
  172|      1|            forgotten: true,
  173|      1|            name: "my-memory".to_string(),
  174|      1|            namespace: "global".to_string(),
  175|      1|            deleted_at: Some(1_700_000_000),
  176|      1|            deleted_at_iso: Some("2023-11-14T22:13:20+00:00".to_string()),
  177|      1|            elapsed_ms: 5,
  178|      1|        };
  179|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  180|      1|        assert_eq!(json["action"], "soft_deleted");
  181|      1|        assert_eq!(json["forgotten"], true);
  182|      1|        assert_eq!(json["name"], "my-memory");
  183|      1|        assert_eq!(json["namespace"], "global");
  184|      1|        assert_eq!(json["deleted_at"], 1_700_000_000i64);
  185|      1|        assert!(json["deleted_at_iso"].is_string());
  186|      1|        assert!(json["elapsed_ms"].is_number());
  187|      1|    }
  188|       |
  189|       |    #[test]
  190|      1|    fn forget_response_action_soft_deleted_implies_forgotten_true() {
  191|      1|        let resp = ForgetResponse {
  192|      1|            action: "soft_deleted".to_string(),
  193|      1|            forgotten: true,
  194|      1|            name: "test".to_string(),
  195|      1|            namespace: "ns".to_string(),
  196|      1|            deleted_at: Some(42),
  197|      1|            deleted_at_iso: Some(crate::tz::epoch_to_iso(42)),
  198|      1|            elapsed_ms: 1,
  199|      1|        };
  200|      1|        assert_eq!(resp.action, "soft_deleted");
  201|      1|        assert!(resp.forgotten);
  202|      1|        assert_eq!(resp.deleted_at, Some(42));
  203|      1|        assert!(resp.deleted_at_iso.is_some());
  204|      1|    }
  205|       |
  206|       |    #[test]
  207|      1|    fn forget_response_already_deleted_preserves_timestamp() {
  208|      1|        let resp = ForgetResponse {
  209|      1|            action: "already_deleted".to_string(),
  210|      1|            forgotten: false,
  211|      1|            name: "abc".to_string(),
  212|      1|            namespace: "my-project".to_string(),
  213|      1|            deleted_at: Some(1_650_000_000),
  214|      1|            deleted_at_iso: Some(crate::tz::epoch_to_iso(1_650_000_000)),
  215|      1|            elapsed_ms: 2,
  216|      1|        };
  217|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  218|      1|        assert_eq!(json["action"], "already_deleted");
  219|      1|        assert_eq!(json["forgotten"], false);
  220|      1|        assert_eq!(json["deleted_at"], 1_650_000_000i64);
  221|      1|        assert!(json["deleted_at_iso"].is_string());
  222|      1|    }
  223|       |
  224|       |    #[test]
  225|      1|    fn forget_response_not_found_omits_deleted_at_fields() {
  226|      1|        let resp = ForgetResponse {
  227|      1|            action: "not_found".to_string(),
  228|      1|            forgotten: false,
  229|      1|            name: "phantom".to_string(),
  230|      1|            namespace: "global".to_string(),
  231|      1|            deleted_at: None,
  232|      1|            deleted_at_iso: None,
  233|      1|            elapsed_ms: 0,
  234|      1|        };
  235|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  236|      1|        assert_eq!(json["action"], "not_found");
  237|      1|        assert_eq!(json["forgotten"], false);
  238|       |        // skip_serializing_if = "Option::is_none" means both fields are absent
  239|      1|        assert!(json.get("deleted_at").is_none());
  240|      1|        assert!(json.get("deleted_at_iso").is_none());
  241|      1|        assert_eq!(json["elapsed_ms"], 0u64);
  242|      1|    }
  243|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/fts.rs:
    1|       |//! Handler for the `fts` CLI subcommand family.
    2|       |//!
    3|       |//! Provides two maintenance operations for the FTS5 full-text search index:
    4|       |//! - `rebuild`: drops and reconstructs the index from the `memories` table.
    5|       |//! - `check`: runs the FTS5 integrity-check without modifying the index.
    6|       |
    7|       |use crate::errors::AppError;
    8|       |use crate::output;
    9|       |use crate::paths::AppPaths;
   10|       |use crate::storage::connection::{open_ro, open_rw};
   11|       |use serde::Serialize;
   12|       |
   13|       |/// Arguments for the `fts` subcommand family.
   14|       |#[derive(clap::Args)]
   15|       |#[command(
   16|       |    about = "FTS5 full-text search index management",
   17|       |    after_long_help = "EXAMPLES:\n  \
   18|       |        # Rebuild the full-text search index from memories table\n  \
   19|       |        sqlite-graphrag fts rebuild\n\n  \
   20|       |        # Check FTS5 index integrity\n  \
   21|       |        sqlite-graphrag fts check --json\n\n  \
   22|       |        # Show FTS5 index statistics\n  \
   23|       |        sqlite-graphrag fts stats --json"
   24|       |)]
   25|       |pub struct FtsArgs {
   26|       |    #[command(subcommand)]
   27|       |    pub command: FtsSubcommand,
   28|       |}
   29|       |
   30|       |/// Subcommands nested under `fts`.
   31|       |#[derive(clap::Subcommand)]
   32|       |pub enum FtsSubcommand {
   33|       |    /// Rebuild the FTS5 index from the memories table.
   34|       |    #[command(after_long_help = "EXAMPLES:\n  \
   35|       |        # Rebuild the full-text search index\n  \
   36|       |        sqlite-graphrag fts rebuild\n\n  \
   37|       |        # Rebuild with custom database path\n  \
   38|       |        sqlite-graphrag fts rebuild --db /path/to/graphrag.sqlite")]
   39|       |    Rebuild(FtsRebuildArgs),
   40|       |    /// Run FTS5 integrity-check without modifying the index.
   41|       |    #[command(after_long_help = "EXAMPLES:\n  \
   42|       |        # Check FTS5 index integrity\n  \
   43|       |        sqlite-graphrag fts check\n\n  \
   44|       |        # Check with custom database path\n  \
   45|       |        sqlite-graphrag fts check --db /path/to/graphrag.sqlite")]
   46|       |    Check(FtsCheckArgs),
   47|       |    /// Show FTS5 index statistics (row count, shadow pages, functional status).
   48|       |    #[command(after_long_help = "EXAMPLES:\n  \
   49|       |        # Show FTS5 index statistics\n  \
   50|       |        sqlite-graphrag fts stats\n\n  \
   51|       |        # Stats with custom database path\n  \
   52|       |        sqlite-graphrag fts stats --db /path/to/graphrag.sqlite")]
   53|       |    Stats(FtsStatsArgs),
   54|       |}
   55|       |
   56|       |/// Arguments for `fts rebuild`.
   57|       |#[derive(clap::Args)]
   58|       |pub struct FtsRebuildArgs {
   59|       |    /// No-op; JSON is always emitted on stdout.
   60|       |    #[arg(long, hide = true)]
   61|       |    pub json: bool,
   62|       |    /// Path to the SQLite database file.
   63|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   64|       |    pub db: Option<String>,
   65|       |}
   66|       |
   67|       |/// Arguments for `fts check`.
   68|       |#[derive(clap::Args)]
   69|       |pub struct FtsCheckArgs {
   70|       |    /// No-op; JSON is always emitted on stdout.
   71|       |    #[arg(long, hide = true)]
   72|       |    pub json: bool,
   73|       |    /// Path to the SQLite database file.
   74|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   75|       |    pub db: Option<String>,
   76|       |}
   77|       |
   78|       |/// Arguments for `fts stats`.
   79|       |#[derive(clap::Args)]
   80|       |pub struct FtsStatsArgs {
   81|       |    /// No-op; JSON is always emitted on stdout.
   82|       |    #[arg(long, hide = true)]
   83|       |    pub json: bool,
   84|       |    /// Path to the SQLite database file.
   85|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   86|       |    pub db: Option<String>,
   87|       |}
   88|       |
   89|       |#[derive(Serialize)]
   90|       |struct FtsRebuildResponse {
   91|       |    action: String,
   92|       |    rows_indexed: i64,
   93|       |    elapsed_ms: u64,
   94|       |}
   95|       |
   96|       |#[derive(Serialize)]
   97|       |struct FtsCheckResponse {
   98|       |    action: String,
   99|       |    integrity_ok: bool,
  100|       |    #[serde(skip_serializing_if = "Option::is_none")]
  101|       |    detail: Option<String>,
  102|       |    elapsed_ms: u64,
  103|       |}
  104|       |
  105|       |#[derive(Serialize)]
  106|       |struct FtsStatsResponse {
  107|       |    total_rows: i64,
  108|       |    #[serde(skip_serializing_if = "Option::is_none")]
  109|       |    shadow_pages: Option<i64>,
  110|       |    fts_functional: bool,
  111|       |    elapsed_ms: u64,
  112|       |}
  113|       |
  114|       |/// Dispatch entry point called from `main`.
  115|       |///
  116|       |/// # Errors
  117|       |/// Propagates any [`AppError`] raised by the underlying subcommand.
  118|      0|pub fn run(args: FtsArgs) -> Result<(), AppError> {
  119|      0|    match args.command {
  120|      0|        FtsSubcommand::Rebuild(a) => run_rebuild(a),
  121|      0|        FtsSubcommand::Check(a) => run_check(a),
  122|      0|        FtsSubcommand::Stats(a) => run_stats(a),
  123|       |    }
  124|      0|}
  125|       |
  126|       |/// Rebuilds the FTS5 index by issuing the `'rebuild'` special command.
  127|       |///
  128|       |/// The FTS5 `INSERT INTO fts_memories(fts_memories) VALUES('rebuild')` statement
  129|       |/// drops all index data and re-populates it from the content table in a single
  130|       |/// transaction. Use this after bulk imports or when `fts check` reports a failure.
  131|       |///
  132|       |/// # Errors
  133|       |/// Returns [`AppError::Database`] on any SQLite failure.
  134|      0|fn run_rebuild(args: FtsRebuildArgs) -> Result<(), AppError> {
  135|      0|    let start = std::time::Instant::now();
  136|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
  137|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
  138|      0|    let conn = open_rw(&paths.db)?;
  139|       |
  140|      0|    let table_exists: bool = conn.query_row(
  141|      0|        "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='fts_memories'",
  142|      0|        [],
  143|      0|        |r| r.get::<_, i64>(0).map(|v| v > 0),
  144|      0|    )?;
  145|      0|    if !table_exists {
  146|      0|        return Err(AppError::Validation(
  147|      0|            "FTS5 table 'fts_memories' does not exist — run 'sqlite-graphrag init' first"
  148|      0|                .to_string(),
  149|      0|        ));
  150|      0|    }
  151|       |
  152|      0|    conn.execute_batch("INSERT INTO fts_memories(fts_memories) VALUES('rebuild');")?;
  153|       |
  154|      0|    let rows: i64 = conn.query_row("SELECT COUNT(*) FROM fts_memories", [], |r| r.get(0))?;
  155|       |
  156|      0|    conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
  157|       |
  158|      0|    output::emit_json(&FtsRebuildResponse {
  159|      0|        action: "rebuilt".to_string(),
  160|      0|        rows_indexed: rows,
  161|      0|        elapsed_ms: start.elapsed().as_millis() as u64,
  162|      0|    })?;
  163|       |
  164|      0|    Ok(())
  165|      0|}
  166|       |
  167|       |/// Runs the FTS5 integrity-check without modifying the index.
  168|       |///
  169|       |/// The FTS5 integrity-check is triggered by:
  170|       |/// ```sql
  171|       |/// INSERT INTO fts_memories(fts_memories, rank) VALUES('integrity-check', 1);
  172|       |/// ```
  173|       |/// SQLite raises an error if the index is corrupt, so a successful `execute_batch`
  174|       |/// means the index is healthy. On failure, `integrity_ok` is `false` and the
  175|       |/// `detail` field carries an actionable hint.
  176|       |///
  177|       |/// # Errors
  178|       |/// Returns [`AppError`] only on unexpected I/O or path resolution failures;
  179|       |/// an FTS5 corruption is reported as `integrity_ok: false`, not as a Rust error.
  180|      0|fn run_check(args: FtsCheckArgs) -> Result<(), AppError> {
  181|      0|    let start = std::time::Instant::now();
  182|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
  183|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
  184|      0|    let conn = open_rw(&paths.db)?;
  185|       |
  186|      0|    let integrity_ok = conn
  187|      0|        .execute_batch("INSERT INTO fts_memories(fts_memories, rank) VALUES('integrity-check', 1);")
  188|      0|        .is_ok();
  189|       |
  190|      0|    conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);").ok();
  191|       |
  192|      0|    output::emit_json(&FtsCheckResponse {
  193|      0|        action: "checked".to_string(),
  194|      0|        integrity_ok,
  195|      0|        detail: if integrity_ok {
  196|      0|            None
  197|       |        } else {
  198|      0|            Some("FTS5 integrity-check failed — run 'sqlite-graphrag fts rebuild'".to_string())
  199|       |        },
  200|      0|        elapsed_ms: start.elapsed().as_millis() as u64,
  201|      0|    })?;
  202|       |
  203|      0|    Ok(())
  204|      0|}
  205|       |
  206|       |/// Returns FTS5 index statistics: total indexed rows, shadow table page count (best-effort),
  207|       |/// and a functional liveness check.
  208|       |///
  209|       |/// # Errors
  210|       |/// Returns [`AppError`] only on unexpected I/O or path resolution failures.
  211|      0|fn run_stats(args: FtsStatsArgs) -> Result<(), AppError> {
  212|      0|    let start = std::time::Instant::now();
  213|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
  214|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
  215|      0|    let conn = open_ro(&paths.db)?;
  216|       |
  217|       |    // 1. Total indexed rows in the FTS5 content table.
  218|      0|    let total_rows: i64 = conn.query_row("SELECT COUNT(*) FROM fts_memories", [], |r| r.get(0))?;
  219|       |
  220|       |    // 2. Shadow pages — queries the internal `_data` shadow table.
  221|       |    //    This may not exist on all SQLite builds; treat any failure as None.
  222|      0|    let shadow_pages: Option<i64> = conn
  223|      0|        .query_row("SELECT COUNT(*) FROM fts_memories_data", [], |r| r.get(0))
  224|      0|        .ok();
  225|       |
  226|       |    // 3. Functional liveness: SELECT with FTS5 match syntax against a wildcard.
  227|       |    //    A successful LIMIT 0 query confirms the FTS5 module is operational.
  228|      0|    let fts_functional = conn
  229|      0|        .execute_batch("SELECT * FROM fts_memories('*') LIMIT 0;")
  230|      0|        .is_ok();
  231|       |
  232|      0|    output::emit_json(&FtsStatsResponse {
  233|      0|        total_rows,
  234|      0|        shadow_pages,
  235|      0|        fts_functional,
  236|      0|        elapsed_ms: start.elapsed().as_millis() as u64,
  237|      0|    })?;
  238|       |
  239|      0|    Ok(())
  240|      0|}
  241|       |
  242|       |/// Public helper: returns `true` when the FTS5 module is loadable AND the
  243|       |/// `fts_memories` virtual table exists AND a wildcard MATCH query succeeds.
  244|       |///
  245|       |/// Used by [`crate::commands::optimize`] to skip the (potentially minute-long)
  246|       |/// FTS5 rebuild when the index is already healthy. Also used by `health` and
  247|       |/// by future `vec check` implementations.
  248|       |///
  249|       |/// # Errors
  250|       |/// Returns `Err(AppError::Database)` only when the connection cannot be opened
  251|       |/// for reasons unrelated to FTS5 itself (permission denied, corrupted file).
  252|       |/// A missing FTS5 module or table is reported as `Ok(false)`.
  253|      1|pub fn check_fts_functional(conn: &rusqlite::Connection) -> Result<bool, AppError> {
  254|      1|    let table_exists: bool = conn
  255|      1|        .query_row(
  256|      1|            "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='fts_memories'",
  257|      1|            [],
  258|      1|            |r| r.get::<_, i64>(0).map(|v| v > 0),
  259|       |        )
  260|      1|        .unwrap_or(false);
  261|      1|    if !table_exists {
  262|      0|        return Ok(false);
  263|      1|    }
  264|      1|    let liveness = conn
  265|      1|        .execute_batch("SELECT * FROM fts_memories('*') LIMIT 0;")
  266|      1|        .is_ok();
  267|      1|    Ok(liveness)
  268|      1|}
  269|       |
  270|       |#[cfg(test)]
  271|       |mod tests {
  272|       |    use super::*;
  273|       |
  274|       |    #[test]
  275|      1|    fn fts_rebuild_response_serializes_all_fields() {
  276|      1|        let resp = FtsRebuildResponse {
  277|      1|            action: "rebuilt".to_string(),
  278|      1|            rows_indexed: 42,
  279|      1|            elapsed_ms: 10,
  280|      1|        };
  281|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  282|      1|        assert_eq!(json["action"], "rebuilt");
  283|      1|        assert_eq!(json["rows_indexed"], 42i64);
  284|      1|        assert_eq!(json["elapsed_ms"], 10u64);
  285|      1|    }
  286|       |
  287|       |    #[test]
  288|      1|    fn fts_check_response_integrity_ok_omits_detail() {
  289|      1|        let resp = FtsCheckResponse {
  290|      1|            action: "checked".to_string(),
  291|      1|            integrity_ok: true,
  292|      1|            detail: None,
  293|      1|            elapsed_ms: 5,
  294|      1|        };
  295|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  296|      1|        assert_eq!(json["action"], "checked");
  297|      1|        assert_eq!(json["integrity_ok"], true);
  298|      1|        assert!(
  299|      1|            json.get("detail").is_none(),
  300|      0|            "detail must be absent when integrity_ok is true"
  301|       |        );
  302|      1|        assert_eq!(json["elapsed_ms"], 5u64);
  303|      1|    }
  304|       |
  305|       |    #[test]
  306|      1|    fn fts_check_response_corruption_includes_detail() {
  307|      1|        let resp = FtsCheckResponse {
  308|      1|            action: "checked".to_string(),
  309|      1|            integrity_ok: false,
  310|      1|            detail: Some(
  311|      1|                "FTS5 integrity-check failed — run 'sqlite-graphrag fts rebuild'".to_string(),
  312|      1|            ),
  313|      1|            elapsed_ms: 3,
  314|      1|        };
  315|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  316|      1|        assert_eq!(json["integrity_ok"], false);
  317|      1|        assert!(
  318|      1|            json["detail"].as_str().unwrap().contains("fts rebuild"),
  319|      0|            "detail must mention the remediation command"
  320|       |        );
  321|      1|    }
  322|       |
  323|       |    #[test]
  324|      1|    fn fts_rebuild_response_elapsed_ms_non_negative() {
  325|      1|        let resp = FtsRebuildResponse {
  326|      1|            action: "rebuilt".to_string(),
  327|      1|            rows_indexed: 0,
  328|      1|            elapsed_ms: 0,
  329|      1|        };
  330|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  331|      1|        assert!(json["elapsed_ms"].as_u64().is_some());
  332|      1|    }
  333|       |
  334|       |    #[test]
  335|      1|    fn fts_check_response_elapsed_ms_non_negative() {
  336|      1|        let resp = FtsCheckResponse {
  337|      1|            action: "checked".to_string(),
  338|      1|            integrity_ok: true,
  339|      1|            detail: None,
  340|      1|            elapsed_ms: 0,
  341|      1|        };
  342|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  343|      1|        assert!(json["elapsed_ms"].as_u64().is_some());
  344|      1|    }
  345|       |
  346|       |    #[test]
  347|      1|    fn fts_stats_response_serializes_all_fields() {
  348|      1|        let resp = FtsStatsResponse {
  349|      1|            total_rows: 150,
  350|      1|            shadow_pages: Some(12),
  351|      1|            fts_functional: true,
  352|      1|            elapsed_ms: 8,
  353|      1|        };
  354|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  355|      1|        assert_eq!(json["total_rows"], 150i64);
  356|      1|        assert_eq!(json["shadow_pages"], 12i64);
  357|      1|        assert_eq!(json["fts_functional"], true);
  358|      1|        assert_eq!(json["elapsed_ms"], 8u64);
  359|      1|    }
  360|       |
  361|       |    #[test]
  362|      1|    fn fts_stats_response_omits_shadow_pages_when_none() {
  363|      1|        let resp = FtsStatsResponse {
  364|      1|            total_rows: 0,
  365|      1|            shadow_pages: None,
  366|      1|            fts_functional: false,
  367|      1|            elapsed_ms: 2,
  368|      1|        };
  369|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  370|      1|        assert!(
  371|      1|            json.get("shadow_pages").is_none(),
  372|      0|            "shadow_pages must be absent when None"
  373|       |        );
  374|      1|        assert_eq!(json["fts_functional"], false);
  375|      1|    }
  376|       |
  377|       |    #[test]
  378|      1|    fn fts_stats_response_fts_not_functional() {
  379|      1|        let resp = FtsStatsResponse {
  380|      1|            total_rows: 5,
  381|      1|            shadow_pages: None,
  382|      1|            fts_functional: false,
  383|      1|            elapsed_ms: 1,
  384|      1|        };
  385|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  386|      1|        assert_eq!(json["fts_functional"], false);
  387|      1|        assert_eq!(json["total_rows"], 5i64);
  388|      1|    }
  389|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/graph_export.rs:
    1|       |//! Handler for the `graph-export` CLI subcommand.
    2|       |
    3|       |use crate::cli::GraphExportFormat;
    4|       |use crate::entity_type::EntityType;
    5|       |use crate::errors::AppError;
    6|       |use crate::output;
    7|       |use crate::paths::AppPaths;
    8|       |use crate::storage::connection::open_ro;
    9|       |use crate::storage::entities;
   10|       |use serde::Serialize;
   11|       |use std::collections::HashMap;
   12|       |use std::fs;
   13|       |use std::path::PathBuf;
   14|       |use std::time::Instant;
   15|       |
   16|       |/// Optional nested subcommands. When absent, the default behavior exports
   17|       |/// the full entity snapshot for backward compatibility.
   18|       |#[derive(clap::Subcommand)]
   19|       |pub enum GraphSubcommand {
   20|       |    /// Traverse relationships from a starting entity using BFS
   21|       |    Traverse(GraphTraverseArgs),
   22|       |    /// Show graph statistics (node/edge counts, degree distribution)
   23|       |    Stats(GraphStatsArgs),
   24|       |    /// List entities stored in the graph with optional filters
   25|       |    Entities(GraphEntitiesArgs),
   26|       |}
   27|       |
   28|       |#[derive(clap::ValueEnum, Clone, Copy, Debug, PartialEq, Eq)]
   29|       |pub enum GraphTraverseFormat {
   30|       |    Json,
   31|       |}
   32|       |
   33|       |#[derive(clap::ValueEnum, Clone, Copy, Debug, PartialEq, Eq)]
   34|       |pub enum GraphStatsFormat {
   35|       |    Json,
   36|       |    Text,
   37|       |}
   38|       |
   39|       |#[derive(clap::Args)]
   40|       |#[command(after_long_help = "EXAMPLES:\n  \
   41|       |    # Export full entity snapshot as JSON (default)\n  \
   42|       |    sqlite-graphrag graph\n\n  \
   43|       |    # Traverse relationships from a starting entity\n  \
   44|       |    sqlite-graphrag graph traverse --from acme-corp --depth 2\n\n  \
   45|       |    # Show graph statistics as structured JSON\n  \
   46|       |    sqlite-graphrag graph stats --format json\n\n  \
   47|       |    # List entities filtered by type\n  \
   48|       |    sqlite-graphrag graph entities --entity-type person\n\n  \
   49|       |    # Export full snapshot in DOT format for Graphviz\n  \
   50|       |    sqlite-graphrag graph --format dot --output graph.dot\n\n  \
   51|       |NOTES:\n  \
   52|       |    Without a subcommand, exports the full entity+edge snapshot.\n  \
   53|       |    Use `traverse`, `stats`, or `entities` for targeted queries.")]
   54|       |pub struct GraphArgs {
   55|       |    /// Optional subcommand; without one, export the full entity snapshot.
   56|       |    #[command(subcommand)]
   57|       |    pub subcommand: Option<GraphSubcommand>,
   58|       |    /// Filter by namespace. Defaults to all namespaces.
   59|       |    #[arg(long)]
   60|       |    pub namespace: Option<String>,
   61|       |    /// Snapshot output format.
   62|       |    #[arg(long, value_enum, default_value = "json")]
   63|       |    pub format: GraphExportFormat,
   64|       |    /// File path to write output instead of stdout.
   65|       |    #[arg(long)]
   66|       |    pub output: Option<PathBuf>,
   67|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   68|       |    pub json: bool,
   69|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   70|       |    pub db: Option<String>,
   71|       |}
   72|       |
   73|       |#[derive(clap::Args)]
   74|       |#[command(after_long_help = "EXAMPLES:\n  \
   75|       |    # Traverse relationships from an entity with default depth (2)\n  \
   76|       |    sqlite-graphrag graph traverse --from acme-corp\n\n  \
   77|       |    # Increase traversal depth to 3 hops\n  \
   78|       |    sqlite-graphrag graph traverse --from acme-corp --depth 3\n\n  \
   79|       |    # Traverse within a specific namespace\n  \
   80|       |    sqlite-graphrag graph traverse --from acme-corp --namespace project-x\n\n  \
   81|       |NOTES:\n  \
   82|       |    Output is always JSON. The `hops` array contains each reachable entity\n  \
   83|       |    with its relation, direction (inbound/outbound), weight, and depth level.")]
   84|       |pub struct GraphTraverseArgs {
   85|       |    /// Root entity name for the traversal.
   86|       |    #[arg(long)]
   87|       |    pub from: String,
   88|       |    /// Maximum traversal depth.
   89|       |    #[arg(long, default_value_t = 2u32)]
   90|       |    pub depth: u32,
   91|       |    #[arg(long)]
   92|       |    pub namespace: Option<String>,
   93|       |    #[arg(long, value_enum, default_value = "json")]
   94|       |    pub format: GraphTraverseFormat,
   95|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   96|       |    pub json: bool,
   97|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   98|       |    pub db: Option<String>,
   99|       |}
  100|       |
  101|       |#[derive(clap::Args)]
  102|       |#[command(after_long_help = "EXAMPLES:\n  \
  103|       |    # Show stats for all namespaces (human-readable text)\n  \
  104|       |    sqlite-graphrag graph stats --format text\n\n  \
  105|       |    # Show stats as structured JSON\n  \
  106|       |    sqlite-graphrag graph stats --format json\n\n  \
  107|       |    # Show stats for a specific namespace\n  \
  108|       |    sqlite-graphrag graph stats --namespace project-x --format text\n\n  \
  109|       |NOTES:\n  \
  110|       |    Reports node_count, edge_count, avg_degree, and max_degree.\n  \
  111|       |    Default format is JSON. Use `--format text` for a compact single-line summary.")]
  112|       |pub struct GraphStatsArgs {
  113|       |    #[arg(long)]
  114|       |    pub namespace: Option<String>,
  115|       |    /// Output format for the stats response.
  116|       |    #[arg(long, value_enum, default_value = "json")]
  117|       |    pub format: GraphStatsFormat,
  118|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
  119|       |    pub json: bool,
  120|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
  121|       |    pub db: Option<String>,
  122|       |}
  123|       |
  124|       |/// Field to sort entities by in `graph entities`.
  125|       |#[derive(Debug, Clone, Copy, clap::ValueEnum)]
  126|       |pub enum EntitySortField {
  127|       |    /// Sort alphabetically by entity name.
  128|       |    Name,
  129|       |    /// Sort by degree (total number of relationships, descending by default).
  130|       |    Degree,
  131|       |    /// Sort by entity creation timestamp.
  132|       |    CreatedAt,
  133|       |}
  134|       |
  135|       |/// Sort direction for `graph entities`.
  136|       |#[derive(Debug, Clone, Copy, Default, clap::ValueEnum)]
  137|       |pub enum SortOrder {
  138|       |    #[default]
  139|       |    Asc,
  140|       |    Desc,
  141|       |}
  142|       |
  143|       |#[derive(clap::Args)]
  144|       |#[command(after_long_help = "EXAMPLES:\n  \
  145|       |    # List all entities (default limit applies)\n  \
  146|       |    sqlite-graphrag graph entities\n\n  \
  147|       |    # Filter by entity type\n  \
  148|       |    sqlite-graphrag graph entities --entity-type person\n\n  \
  149|       |    # Filter by namespace and type\n  \
  150|       |    sqlite-graphrag graph entities --namespace project-x --entity-type concept\n\n  \
  151|       |    # Paginate results (skip first 20, return next 10)\n  \
  152|       |    sqlite-graphrag graph entities --offset 20 --limit 10\n\n  \
  153|       |    # Sort by degree descending (most connected first)\n  \
  154|       |    sqlite-graphrag graph entities --sort-by degree --order desc\n\n  \
  155|       |    # Sort by creation date ascending\n  \
  156|       |    sqlite-graphrag graph entities --sort-by created-at --order asc\n\n  \
  157|       |NOTES:\n  \
  158|       |    Output is always JSON with `entities`, `total_count`, `limit`, and `offset` fields.\n  \
  159|       |    Entity types are strings extracted by GLiNER NER (e.g. `person`, `organization`, `location`).")]
  160|       |pub struct GraphEntitiesArgs {
  161|       |    #[arg(long)]
  162|       |    pub namespace: Option<String>,
  163|       |    /// Filter by entity type (one of the 13 canonical types).
  164|       |    #[arg(long, value_enum)]
  165|       |    pub entity_type: Option<EntityType>,
  166|       |    /// Maximum number of results to return.
  167|       |    #[arg(long, default_value_t = crate::constants::K_GRAPH_ENTITIES_DEFAULT_LIMIT)]
  168|       |    pub limit: usize,
  169|       |    /// Number of results to skip for pagination.
  170|       |    #[arg(long, default_value_t = 0usize)]
  171|       |    pub offset: usize,
  172|       |    /// Sort entities by this field. When omitted, the default order is by name ascending.
  173|       |    #[arg(long, value_enum, help = "Sort entities by field")]
  174|       |    pub sort_by: Option<EntitySortField>,
  175|       |    /// Sort direction: `asc` (default) or `desc`.
  176|       |    #[arg(long, value_enum, default_value_t = SortOrder::Asc, help = "Sort order")]
  177|       |    pub order: SortOrder,
  178|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
  179|       |    pub json: bool,
  180|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
  181|       |    pub db: Option<String>,
  182|       |}
  183|       |
  184|       |#[derive(Serialize, Clone)]
  185|       |struct NodeOut {
  186|       |    id: i64,
  187|       |    name: String,
  188|       |    namespace: String,
  189|       |    /// Deprecated alias of `type` kept for backward-compat with pre-v1.0.35 clients.
  190|       |    /// New consumers MUST read `type` instead. Will be removed in a future major release.
  191|       |    kind: String,
  192|       |    /// Canonical entity classification (organization, concept, person, etc.).
  193|       |    /// Mirrors `kind` while the deprecation window is active.
  194|       |    #[serde(rename = "type")]
  195|       |    r#type: String,
  196|       |}
  197|       |
  198|       |#[derive(Serialize)]
  199|       |struct EdgeOut {
  200|       |    from: String,
  201|       |    to: String,
  202|       |    relation: String,
  203|       |    weight: f64,
  204|       |}
  205|       |
  206|       |#[derive(Serialize)]
  207|       |struct GraphSnapshot {
  208|       |    nodes: Vec<NodeOut>,
  209|       |    entities: Vec<NodeOut>,
  210|       |    edges: Vec<EdgeOut>,
  211|       |    elapsed_ms: u64,
  212|       |}
  213|       |
  214|       |#[derive(Serialize)]
  215|       |struct TraverseHop {
  216|       |    entity: String,
  217|       |    relation: String,
  218|       |    direction: String,
  219|       |    weight: f64,
  220|       |    depth: u32,
  221|       |}
  222|       |
  223|       |#[derive(Serialize)]
  224|       |struct GraphTraverseResponse {
  225|       |    from: String,
  226|       |    namespace: String,
  227|       |    depth: u32,
  228|       |    hops: Vec<TraverseHop>,
  229|       |    elapsed_ms: u64,
  230|       |}
  231|       |
  232|       |#[derive(Serialize)]
  233|       |struct GraphStatsResponse {
  234|       |    namespace: Option<String>,
  235|       |    node_count: i64,
  236|       |    edge_count: i64,
  237|       |    avg_degree: f64,
  238|       |    max_degree: i64,
  239|       |    elapsed_ms: u64,
  240|       |}
  241|       |
  242|       |#[derive(Serialize)]
  243|       |struct EntityItem {
  244|       |    id: i64,
  245|       |    name: String,
  246|       |    entity_type: String,
  247|       |    namespace: String,
  248|       |    created_at: String,
  249|       |    /// Total number of relationships (inbound + outbound) for this entity.
  250|       |    degree: u32,
  251|       |    #[serde(skip_serializing_if = "Option::is_none")]
  252|       |    description: Option<String>,
  253|       |}
  254|       |
  255|       |#[derive(Serialize)]
  256|       |struct GraphEntitiesResponse {
  257|       |    entities: Vec<EntityItem>,
  258|       |    total_count: i64,
  259|       |    limit: usize,
  260|       |    offset: usize,
  261|       |    namespace: Option<String>,
  262|       |    elapsed_ms: u64,
  263|       |}
  264|       |
  265|      0|pub fn run(args: GraphArgs) -> Result<(), AppError> {
  266|      0|    match args.subcommand {
  267|      0|        None => run_entities_snapshot(
  268|      0|            args.db.as_deref(),
  269|      0|            args.namespace.as_deref(),
  270|      0|            args.format,
  271|      0|            args.json,
  272|      0|            args.output.as_deref(),
  273|       |        ),
  274|      0|        Some(GraphSubcommand::Traverse(a)) => run_traverse(a),
  275|      0|        Some(GraphSubcommand::Stats(a)) => run_stats(a),
  276|      0|        Some(GraphSubcommand::Entities(a)) => run_entities(a),
  277|       |    }
  278|      0|}
  279|       |
  280|      0|fn run_entities_snapshot(
  281|      0|    db: Option<&str>,
  282|      0|    namespace: Option<&str>,
  283|      0|    format: GraphExportFormat,
  284|      0|    json: bool,
  285|      0|    output_path: Option<&std::path::Path>,
  286|      0|) -> Result<(), AppError> {
  287|      0|    let inicio = Instant::now();
  288|      0|    let paths = AppPaths::resolve(db)?;
  289|       |
  290|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
  291|       |
  292|      0|    let conn = open_ro(&paths.db)?;
  293|       |
  294|      0|    let nodes_raw = entities::list_entities(&conn, namespace)?;
  295|      0|    let edges_raw = entities::list_relationships_by_namespace(&conn, namespace)?;
  296|       |
  297|      0|    let id_to_name: HashMap<i64, String> =
  298|      0|        nodes_raw.iter().map(|n| (n.id, n.name.clone())).collect();
  299|       |
  300|      0|    let nodes: Vec<NodeOut> = nodes_raw
  301|      0|        .into_iter()
  302|      0|        .map(|n| NodeOut {
  303|      0|            id: n.id,
  304|      0|            name: n.name,
  305|      0|            namespace: n.namespace,
  306|      0|            r#type: n.kind.clone(),
  307|      0|            kind: n.kind,
  308|      0|        })
  309|      0|        .collect();
  310|       |
  311|      0|    let mut edges: Vec<EdgeOut> = Vec::with_capacity(edges_raw.len());
  312|      0|    let mut orphan_edges: usize = 0;
  313|      0|    for r in edges_raw {
  314|      0|        let from = match id_to_name.get(&r.source_id) {
  315|      0|            Some(n) => n.clone(),
  316|       |            None => {
  317|      0|                orphan_edges += 1;
  318|      0|                tracing::warn!(target: "graph_export", source_id = r.source_id, relation = %r.relation, "edge skipped: source entity not found in id_to_name map");
  319|      0|                continue;
  320|       |            }
  321|       |        };
  322|      0|        let to = match id_to_name.get(&r.target_id) {
  323|      0|            Some(n) => n.clone(),
  324|       |            None => {
  325|      0|                orphan_edges += 1;
  326|      0|                tracing::warn!(target: "graph_export", target_id = r.target_id, relation = %r.relation, "edge skipped: target entity not found in id_to_name map");
  327|      0|                continue;
  328|       |            }
  329|       |        };
  330|      0|        edges.push(EdgeOut {
  331|      0|            from,
  332|      0|            to,
  333|      0|            relation: r.relation,
  334|      0|            weight: r.weight,
  335|      0|        });
  336|       |    }
  337|      0|    if orphan_edges > 0 {
  338|      0|        tracing::warn!(target: "graph_export",
  339|       |            count = orphan_edges,
  340|      0|            "edges skipped due to orphaned entity references"
  341|       |        );
  342|      0|    }
  343|       |
  344|      0|    let effective_format = if json {
  345|      0|        GraphExportFormat::Json
  346|       |    } else {
  347|      0|        format
  348|       |    };
  349|       |
  350|      0|    if effective_format == GraphExportFormat::Ndjson {
  351|      0|        let elapsed_ms = inicio.elapsed().as_millis() as u64;
  352|      0|        render_ndjson_streaming(&nodes, &edges, elapsed_ms, output_path)?;
  353|      0|        return Ok(());
  354|      0|    }
  355|       |
  356|      0|    let rendered = match effective_format {
  357|       |        GraphExportFormat::Json => {
  358|      0|            let entities = nodes.clone();
  359|      0|            render_json(&GraphSnapshot {
  360|      0|                nodes,
  361|      0|                entities,
  362|      0|                edges,
  363|      0|                elapsed_ms: inicio.elapsed().as_millis() as u64,
  364|      0|            })?
  365|       |        }
  366|      0|        GraphExportFormat::Dot => render_dot(&nodes, &edges),
  367|      0|        GraphExportFormat::Mermaid => render_mermaid(&nodes, &edges),
  368|      0|        GraphExportFormat::Ndjson => unreachable!("ndjson handled above"),
  369|       |    };
  370|       |
  371|      0|    if let Some(path) = output_path.filter(|_| !json) {
  372|      0|        fs::write(path, &rendered)?;
  373|      0|        output::emit_progress(&format!("wrote {}", path.display()));
  374|      0|    } else {
  375|      0|        output::emit_text(&rendered);
  376|      0|    }
  377|       |
  378|      0|    Ok(())
  379|      0|}
  380|       |
  381|      0|fn run_traverse(args: GraphTraverseArgs) -> Result<(), AppError> {
  382|      0|    let inicio = Instant::now();
  383|      0|    let _ = args.format;
  384|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
  385|       |
  386|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
  387|       |
  388|      0|    let conn = open_ro(&paths.db)?;
  389|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
  390|       |
  391|      0|    let from_id = entities::find_entity_id(&conn, &namespace, &args.from)?
  392|      0|        .ok_or_else(|| AppError::NotFound(format!("entity '{}' not found", args.from)))?;
  393|       |
  394|      0|    let all_rels = entities::list_relationships_by_namespace(&conn, Some(&namespace))?;
  395|      0|    let all_entities = entities::list_entities(&conn, Some(&namespace))?;
  396|      0|    let id_to_name: HashMap<i64, String> = all_entities
  397|      0|        .iter()
  398|      0|        .map(|e| (e.id, e.name.clone()))
  399|      0|        .collect();
  400|       |
  401|      0|    let mut hops: Vec<TraverseHop> = Vec::with_capacity(16);
  402|      0|    let mut visited: std::collections::HashSet<i64> =
  403|      0|        std::collections::HashSet::with_capacity(args.depth as usize * 10);
  404|      0|    let mut frontier: Vec<(i64, u32)> = vec![(from_id, 0)];
  405|       |
  406|      0|    while let Some((current_id, current_depth)) = frontier.pop() {
  407|      0|        if current_depth >= args.depth || visited.contains(&current_id) {
  408|      0|            continue;
  409|      0|        }
  410|      0|        visited.insert(current_id);
  411|       |
  412|      0|        for rel in &all_rels {
  413|      0|            if rel.source_id == current_id {
  414|      0|                if let Some(target_name) = id_to_name.get(&rel.target_id) {
  415|      0|                    hops.push(TraverseHop {
  416|      0|                        entity: target_name.clone(),
  417|      0|                        relation: rel.relation.clone(),
  418|      0|                        direction: "outbound".to_string(),
  419|      0|                        weight: rel.weight,
  420|      0|                        depth: current_depth + 1,
  421|      0|                    });
  422|      0|                    frontier.push((rel.target_id, current_depth + 1));
  423|      0|                }
  424|      0|            } else if rel.target_id == current_id {
  425|      0|                if let Some(source_name) = id_to_name.get(&rel.source_id) {
  426|      0|                    hops.push(TraverseHop {
  427|      0|                        entity: source_name.clone(),
  428|      0|                        relation: rel.relation.clone(),
  429|      0|                        direction: "inbound".to_string(),
  430|      0|                        weight: rel.weight,
  431|      0|                        depth: current_depth + 1,
  432|      0|                    });
  433|      0|                    frontier.push((rel.source_id, current_depth + 1));
  434|      0|                }
  435|      0|            }
  436|       |        }
  437|       |    }
  438|       |
  439|      0|    output::emit_json(&GraphTraverseResponse {
  440|      0|        from: args.from,
  441|      0|        namespace,
  442|      0|        depth: args.depth,
  443|      0|        hops,
  444|      0|        elapsed_ms: inicio.elapsed().as_millis() as u64,
  445|      0|    })?;
  446|       |
  447|      0|    Ok(())
  448|      0|}
  449|       |
  450|      0|fn run_stats(args: GraphStatsArgs) -> Result<(), AppError> {
  451|      0|    let inicio = Instant::now();
  452|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
  453|       |
  454|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
  455|       |
  456|      0|    let conn = open_ro(&paths.db)?;
  457|      0|    let ns = args.namespace.as_deref();
  458|       |
  459|      0|    let node_count: i64 = if let Some(n) = ns {
  460|      0|        conn.query_row(
  461|      0|            "SELECT COUNT(*) FROM entities WHERE namespace = ?1",
  462|      0|            rusqlite::params![n],
  463|      0|            |r| r.get(0),
  464|      0|        )?
  465|       |    } else {
  466|      0|        conn.query_row("SELECT COUNT(*) FROM entities", [], |r| r.get(0))?
  467|       |    };
  468|       |
  469|      0|    let edge_count: i64 = if let Some(n) = ns {
  470|      0|        conn.query_row(
  471|      0|            "SELECT COUNT(*) FROM relationships r
  472|      0|             JOIN entities s ON s.id = r.source_id
  473|      0|             WHERE s.namespace = ?1",
  474|      0|            rusqlite::params![n],
  475|      0|            |r| r.get(0),
  476|      0|        )?
  477|       |    } else {
  478|      0|        conn.query_row("SELECT COUNT(*) FROM relationships", [], |r| r.get(0))?
  479|       |    };
  480|       |
  481|      0|    let max_degree: i64 = if let Some(n) = ns {
  482|      0|        conn.query_row(
  483|      0|            "SELECT COALESCE(MAX(degree), 0) FROM entities WHERE namespace = ?1",
  484|      0|            rusqlite::params![n],
  485|      0|            |r| r.get(0),
  486|      0|        )?
  487|       |    } else {
  488|      0|        conn.query_row("SELECT COALESCE(MAX(degree), 0) FROM entities", [], |r| {
  489|      0|            r.get(0)
  490|      0|        })?
  491|       |    };
  492|       |
  493|       |    // avg_degree = 2 * edge_count / node_count (each edge contributes 2 to total degree sum).
  494|      0|    let avg_degree = if node_count > 0 {
  495|      0|        2.0 * (edge_count as f64) / (node_count as f64)
  496|       |    } else {
  497|      0|        0.0
  498|       |    };
  499|       |
  500|      0|    let resp = GraphStatsResponse {
  501|      0|        namespace: args.namespace,
  502|      0|        node_count,
  503|      0|        edge_count,
  504|      0|        avg_degree,
  505|      0|        max_degree,
  506|      0|        elapsed_ms: inicio.elapsed().as_millis() as u64,
  507|      0|    };
  508|       |
  509|      0|    let effective_format = if args.json {
  510|      0|        GraphStatsFormat::Json
  511|       |    } else {
  512|      0|        args.format
  513|       |    };
  514|       |
  515|      0|    match effective_format {
  516|      0|        GraphStatsFormat::Json => output::emit_json(&resp)?,
  517|      0|        GraphStatsFormat::Text => {
  518|      0|            output::emit_text(&format!(
  519|      0|                "nodes={} edges={} avg_degree={:.2} max_degree={} namespace={}",
  520|      0|                resp.node_count,
  521|      0|                resp.edge_count,
  522|      0|                resp.avg_degree,
  523|      0|                resp.max_degree,
  524|      0|                resp.namespace.as_deref().unwrap_or("all"),
  525|      0|            ));
  526|      0|        }
  527|       |    }
  528|       |
  529|      0|    Ok(())
  530|      0|}
  531|       |
  532|       |/// Builds the `ORDER BY` clause fragment from sort options.
  533|       |///
  534|       |/// Returns a static SQL fragment such as `ORDER BY e.name ASC`.
  535|      6|fn build_order_by(sort_by: Option<EntitySortField>, order: SortOrder) -> &'static str {
  536|       |    // The combinations are enumerated as static strings to avoid
  537|       |    // format!() allocations in the hot path and satisfy the borrow checker
  538|       |    // when the string is used inside conn.prepare().
  539|      6|    match (sort_by, order) {
  540|       |        (None, SortOrder::Asc) | (Some(EntitySortField::Name), SortOrder::Asc) => {
  541|      1|            "ORDER BY e.name ASC"
  542|       |        }
  543|      1|        (Some(EntitySortField::Name), SortOrder::Desc) => "ORDER BY e.name DESC",
  544|      1|        (Some(EntitySortField::Degree), SortOrder::Asc) => "ORDER BY degree ASC",
  545|      1|        (Some(EntitySortField::Degree), SortOrder::Desc) => "ORDER BY degree DESC",
  546|      1|        (Some(EntitySortField::CreatedAt), SortOrder::Asc) => "ORDER BY e.created_at ASC",
  547|      1|        (Some(EntitySortField::CreatedAt), SortOrder::Desc) => "ORDER BY e.created_at DESC",
  548|       |        // Fallback: None/Desc → sort by name desc (consistent with dir variable).
  549|      0|        (None, SortOrder::Desc) => "ORDER BY e.name DESC",
  550|       |    }
  551|      6|}
  552|       |
  553|      0|fn run_entities(args: GraphEntitiesArgs) -> Result<(), AppError> {
  554|      0|    let inicio = Instant::now();
  555|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
  556|       |
  557|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
  558|       |
  559|      0|    let conn = open_ro(&paths.db)?;
  560|       |
  561|      0|    let row_to_item = |r: &rusqlite::Row<'_>| -> rusqlite::Result<EntityItem> {
  562|      0|        let ts: i64 = r.get(4)?;
  563|      0|        let created_at = chrono::DateTime::from_timestamp(ts, 0)
  564|      0|            .unwrap_or_default()
  565|      0|            .format("%Y-%m-%dT%H:%M:%SZ")
  566|      0|            .to_string();
  567|       |        Ok(EntityItem {
  568|      0|            id: r.get(0)?,
  569|      0|            name: r.get(1)?,
  570|      0|            entity_type: r.get(2)?,
  571|      0|            namespace: r.get(3)?,
  572|      0|            created_at,
  573|      0|            degree: r.get(5)?,
  574|      0|            description: r.get(6)?,
  575|       |        })
  576|      0|    };
  577|       |
  578|      0|    let limit_i = args.limit as i64;
  579|      0|    let offset_i = args.offset as i64;
  580|      0|    let order_clause = build_order_by(args.sort_by, args.order);
  581|       |
  582|      0|    let base_select = "SELECT e.id, e.name, COALESCE(e.type, ''), e.namespace, e.created_at,
  583|      0|                        (SELECT COUNT(*) FROM relationships r
  584|      0|                         WHERE r.source_id = e.id OR r.target_id = e.id) AS degree,
  585|      0|                        e.description
  586|      0|                 FROM entities e";
  587|       |
  588|      0|    let (total_count, items) = match (
  589|      0|        args.namespace.as_deref(),
  590|      0|        args.entity_type.map(|et| et.as_str()),
  591|       |    ) {
  592|      0|        (Some(ns), Some(et)) => {
  593|      0|            let count: i64 = conn.query_row(
  594|      0|                "SELECT COUNT(*) FROM entities WHERE namespace = ?1 AND type = ?2",
  595|      0|                rusqlite::params![ns, et],
  596|      0|                |r| r.get(0),
  597|      0|            )?;
  598|      0|            let sql = format!(
  599|      0|                "{base_select} WHERE e.namespace = ?1 AND e.type = ?2 {order_clause} LIMIT ?3 OFFSET ?4"
  600|       |            );
  601|      0|            let mut stmt = conn.prepare(&sql)?;
  602|      0|            let rows = stmt
  603|      0|                .query_map(rusqlite::params![ns, et, limit_i, offset_i], row_to_item)?
  604|      0|                .collect::<rusqlite::Result<Vec<_>>>()?;
  605|      0|            (count, rows)
  606|       |        }
  607|      0|        (Some(ns), None) => {
  608|      0|            let count: i64 = conn.query_row(
  609|      0|                "SELECT COUNT(*) FROM entities WHERE namespace = ?1",
  610|      0|                rusqlite::params![ns],
  611|      0|                |r| r.get(0),
  612|      0|            )?;
  613|      0|            let sql =
  614|      0|                format!("{base_select} WHERE e.namespace = ?1 {order_clause} LIMIT ?2 OFFSET ?3");
  615|      0|            let mut stmt = conn.prepare(&sql)?;
  616|      0|            let rows = stmt
  617|      0|                .query_map(rusqlite::params![ns, limit_i, offset_i], row_to_item)?
  618|      0|                .collect::<rusqlite::Result<Vec<_>>>()?;
  619|      0|            (count, rows)
  620|       |        }
  621|      0|        (None, Some(et)) => {
  622|      0|            let count: i64 = conn.query_row(
  623|      0|                "SELECT COUNT(*) FROM entities WHERE type = ?1",
  624|      0|                rusqlite::params![et],
  625|      0|                |r| r.get(0),
  626|      0|            )?;
  627|      0|            let sql = format!("{base_select} WHERE e.type = ?1 {order_clause} LIMIT ?2 OFFSET ?3");
  628|      0|            let mut stmt = conn.prepare(&sql)?;
  629|      0|            let rows = stmt
  630|      0|                .query_map(rusqlite::params![et, limit_i, offset_i], row_to_item)?
  631|      0|                .collect::<rusqlite::Result<Vec<_>>>()?;
  632|      0|            (count, rows)
  633|       |        }
  634|       |        (None, None) => {
  635|      0|            let count: i64 = conn.query_row("SELECT COUNT(*) FROM entities", [], |r| r.get(0))?;
  636|      0|            let sql = format!("{base_select} {order_clause} LIMIT ?1 OFFSET ?2");
  637|      0|            let mut stmt = conn.prepare(&sql)?;
  638|      0|            let rows = stmt
  639|      0|                .query_map(rusqlite::params![limit_i, offset_i], row_to_item)?
  640|      0|                .collect::<rusqlite::Result<Vec<_>>>()?;
  641|      0|            (count, rows)
  642|       |        }
  643|       |    };
  644|       |
  645|      0|    output::emit_json(&GraphEntitiesResponse {
  646|      0|        entities: items,
  647|      0|        total_count,
  648|      0|        limit: args.limit,
  649|      0|        offset: args.offset,
  650|      0|        namespace: args.namespace,
  651|      0|        elapsed_ms: inicio.elapsed().as_millis() as u64,
  652|      0|    })
  653|      0|}
  654|       |
  655|      1|fn render_json(snapshot: &GraphSnapshot) -> Result<String, AppError> {
  656|      1|    Ok(serde_json::to_string_pretty(snapshot)?)
                                                           ^0
  657|      1|}
  658|       |
  659|       |/// Streams the graph as NDJSON: one object per node, one per edge, then a summary.
  660|       |///
  661|       |/// Each line is flushed immediately so consumers can process incrementally.
  662|       |/// When `output_path` is `Some`, lines are written to the file; otherwise to stdout.
  663|      0|fn render_ndjson_streaming(
  664|      0|    nodes: &[NodeOut],
  665|      0|    edges: &[EdgeOut],
  666|      0|    elapsed_ms: u64,
  667|      0|    output_path: Option<&std::path::Path>,
  668|      0|) -> Result<(), AppError> {
  669|       |    #[derive(serde::Serialize)]
  670|       |    struct NdjsonNode<'a> {
  671|       |        kind: &'static str,
  672|       |        id: i64,
  673|       |        name: &'a str,
  674|       |        namespace: &'a str,
  675|       |        #[serde(rename = "type")]
  676|       |        r#type: &'a str,
  677|       |    }
  678|       |    #[derive(serde::Serialize)]
  679|       |    struct NdjsonEdge<'a> {
  680|       |        kind: &'static str,
  681|       |        from: &'a str,
  682|       |        to: &'a str,
  683|       |        relation: &'a str,
  684|       |        weight: f64,
  685|       |    }
  686|       |    #[derive(serde::Serialize)]
  687|       |    struct NdjsonSummary {
  688|       |        kind: &'static str,
  689|       |        nodes: usize,
  690|       |        edges: usize,
  691|       |        elapsed_ms: u64,
  692|       |    }
  693|       |
  694|       |    use std::io::Write as IoWrite;
  695|       |
  696|      0|    let mut buf: Vec<u8> = Vec::with_capacity(4096);
  697|       |
  698|      0|    let emit_line =
  699|      0|        |buf: &mut Vec<u8>, line: &str, path: Option<&std::path::Path>| -> Result<(), AppError> {
  700|      0|            buf.clear();
  701|      0|            buf.extend_from_slice(line.as_bytes());
  702|      0|            buf.push(b'\n');
  703|      0|            if let Some(p) = path {
  704|      0|                let mut f = std::fs::OpenOptions::new()
  705|      0|                    .create(true)
  706|      0|                    .append(true)
  707|      0|                    .open(p)
  708|      0|                    .map_err(AppError::Io)?;
  709|      0|                f.write_all(buf).map_err(AppError::Io)?;
  710|      0|            } else {
  711|      0|                output::emit_text(line);
  712|      0|            }
  713|      0|            Ok(())
  714|      0|        };
  715|       |
  716|       |    // Truncate the output file once before starting (avoids re-opening with append for every line).
  717|      0|    if let Some(p) = output_path {
  718|      0|        fs::write(p, b"")?;
  719|      0|    }
  720|       |
  721|      0|    for node in nodes {
  722|      0|        let obj = NdjsonNode {
  723|      0|            kind: "node",
  724|      0|            id: node.id,
  725|      0|            name: &node.name,
  726|      0|            namespace: &node.namespace,
  727|      0|            r#type: &node.r#type,
  728|      0|        };
  729|      0|        let line = serde_json::to_string(&obj)?;
  730|      0|        emit_line(&mut buf, &line, output_path)?;
  731|       |    }
  732|       |
  733|      0|    for edge in edges {
  734|      0|        let obj = NdjsonEdge {
  735|      0|            kind: "edge",
  736|      0|            from: &edge.from,
  737|      0|            to: &edge.to,
  738|      0|            relation: &edge.relation,
  739|      0|            weight: edge.weight,
  740|      0|        };
  741|      0|        let line = serde_json::to_string(&obj)?;
  742|      0|        emit_line(&mut buf, &line, output_path)?;
  743|       |    }
  744|       |
  745|      0|    let summary = NdjsonSummary {
  746|      0|        kind: "summary",
  747|      0|        nodes: nodes.len(),
  748|      0|        edges: edges.len(),
  749|      0|        elapsed_ms,
  750|      0|    };
  751|      0|    let line = serde_json::to_string(&summary)?;
  752|      0|    emit_line(&mut buf, &line, output_path)?;
  753|       |
  754|      0|    Ok(())
  755|      0|}
  756|       |
  757|      0|fn sanitize_dot_id(raw: &str) -> String {
  758|      0|    raw.chars()
  759|      0|        .map(|c| {
  760|      0|            if c.is_ascii_alphanumeric() || c == '_' {
  761|      0|                c
  762|       |            } else {
  763|      0|                '_'
  764|       |            }
  765|      0|        })
  766|      0|        .collect()
  767|      0|}
  768|       |
  769|      0|fn render_dot(nodes: &[NodeOut], edges: &[EdgeOut]) -> String {
  770|       |    use std::fmt::Write;
  771|      0|    let mut out = String::with_capacity(nodes.len() * 80 + edges.len() * 60 + 300);
  772|      0|    out.push_str("digraph sqlite_graphrag {\n");
  773|      0|    out.push_str("  graph [bgcolor=\"white\", fontname=\"Helvetica Neue\", fontsize=12, rankdir=LR, nodesep=0.8, ranksep=1.2];\n");
  774|      0|    out.push_str("  node [shape=box, style=\"filled,rounded\", fillcolor=\"#F2F2F7\", fontname=\"Helvetica Neue\", fontsize=11, color=\"#C7C7CC\"];\n");
  775|      0|    out.push_str("  edge [fontname=\"Helvetica Neue\", fontsize=9, color=\"#8E8E93\"];\n");
  776|      0|    for node in nodes {
  777|      0|        let node_id = sanitize_dot_id(&node.name);
  778|      0|        let escaped = node.name.replace('"', "\\\"");
  779|      0|        let _ = writeln!(out, "  {node_id} [label=\"{escaped}\"];");
  780|      0|    }
  781|      0|    for edge in edges {
  782|      0|        let from = sanitize_dot_id(&edge.from);
  783|      0|        let to = sanitize_dot_id(&edge.to);
  784|      0|        let label = edge.relation.replace('"', "\\\"");
  785|      0|        let _ = writeln!(out, "  {from} -> {to} [label=\"{label}\"];");
  786|      0|    }
  787|      0|    out.push_str("}\n");
  788|      0|    out
  789|      0|}
  790|       |
  791|      0|fn sanitize_mermaid_id(raw: &str) -> String {
  792|      0|    raw.chars()
  793|      0|        .map(|c| {
  794|      0|            if c.is_ascii_alphanumeric() || c == '_' {
  795|      0|                c
  796|       |            } else {
  797|      0|                '_'
  798|       |            }
  799|      0|        })
  800|      0|        .collect()
  801|      0|}
  802|       |
  803|      0|fn render_mermaid(nodes: &[NodeOut], edges: &[EdgeOut]) -> String {
  804|       |    use std::fmt::Write;
  805|      0|    let mut out = String::with_capacity(nodes.len() * 50 + edges.len() * 40 + 200);
  806|      0|    out.push_str("%%{init: {'theme': 'neutral', 'themeVariables': {'primaryColor': '#F2F2F7', 'primaryTextColor': '#1C1C1E', 'primaryBorderColor': '#C7C7CC', 'lineColor': '#8E8E93'}}}%%\n");
  807|      0|    out.push_str("graph LR\n");
  808|      0|    for node in nodes {
  809|      0|        let id = sanitize_mermaid_id(&node.name);
  810|      0|        let escaped = node.name.replace('"', "\\\"");
  811|      0|        let _ = writeln!(out, "  {id}[\"{escaped}\"]");
  812|      0|    }
  813|      0|    for edge in edges {
  814|      0|        let from = sanitize_mermaid_id(&edge.from);
  815|      0|        let to = sanitize_mermaid_id(&edge.to);
  816|      0|        let label = edge.relation.replace('|', "\\|");
  817|      0|        let _ = writeln!(out, "  {from} -->|{label}| {to}");
  818|      0|    }
  819|      0|    out
  820|      0|}
  821|       |
  822|       |#[cfg(test)]
  823|       |mod tests {
  824|       |    use super::*;
  825|       |    use crate::cli::{Cli, Commands};
  826|       |    use clap::Parser;
  827|       |
  828|      4|    fn make_node(kind: &str) -> NodeOut {
  829|      4|        NodeOut {
  830|      4|            id: 1,
  831|      4|            name: "test-entity".to_string(),
  832|      4|            namespace: "default".to_string(),
  833|      4|            kind: kind.to_string(),
  834|      4|            r#type: kind.to_string(),
  835|      4|        }
  836|      4|    }
  837|       |
  838|       |    #[test]
  839|      1|    fn node_out_type_duplicates_kind() {
  840|      1|        let node = make_node("agent");
  841|      1|        let json = serde_json::to_value(&node).expect("serialization must work");
  842|      1|        assert_eq!(json["kind"], json["type"]);
  843|      1|        assert_eq!(json["kind"], "agent");
  844|      1|        assert_eq!(json["type"], "agent");
  845|      1|    }
  846|       |
  847|       |    #[test]
  848|      1|    fn node_out_serializes_all_fields() {
  849|      1|        let node = make_node("document");
  850|      1|        let json = serde_json::to_value(&node).expect("serialization must work");
  851|      1|        assert!(json.get("id").is_some());
  852|      1|        assert!(json.get("name").is_some());
  853|      1|        assert!(json.get("namespace").is_some());
  854|      1|        assert!(json.get("kind").is_some());
  855|      1|        assert!(json.get("type").is_some());
  856|      1|    }
  857|       |
  858|       |    #[test]
  859|      1|    fn graph_snapshot_serializes_nodes_with_type() {
  860|      1|        let node = make_node("concept");
  861|      1|        let entities = vec![make_node("concept")];
  862|      1|        let snapshot = GraphSnapshot {
  863|      1|            nodes: vec![node],
  864|      1|            entities,
  865|      1|            edges: vec![],
  866|      1|            elapsed_ms: 0,
  867|      1|        };
  868|      1|        let json_str = render_json(&snapshot).expect("rendering must work");
  869|      1|        let json: serde_json::Value = serde_json::from_str(&json_str).expect("valid json");
  870|      1|        let first_node = &json["nodes"][0];
  871|      1|        assert_eq!(first_node["kind"], first_node["type"]);
  872|      1|        assert_eq!(first_node["type"], "concept");
  873|      1|    }
  874|       |
  875|       |    #[test]
  876|      1|    fn graph_traverse_response_serializes_correctly() {
  877|      1|        let resp = GraphTraverseResponse {
  878|      1|            from: "entity-a".to_string(),
  879|      1|            namespace: "global".to_string(),
  880|      1|            depth: 2,
  881|      1|            hops: vec![TraverseHop {
  882|      1|                entity: "entity-b".to_string(),
  883|      1|                relation: "uses".to_string(),
  884|      1|                direction: "outbound".to_string(),
  885|      1|                weight: 1.0,
  886|      1|                depth: 1,
  887|      1|            }],
  888|      1|            elapsed_ms: 5,
  889|      1|        };
  890|      1|        let json = serde_json::to_value(&resp).unwrap();
  891|      1|        assert_eq!(json["from"], "entity-a");
  892|      1|        assert_eq!(json["depth"], 2);
  893|      1|        assert!(json["hops"].is_array());
  894|      1|        assert_eq!(json["hops"][0]["direction"], "outbound");
  895|      1|    }
  896|       |
  897|       |    #[test]
  898|      1|    fn graph_stats_response_serializes_correctly() {
  899|      1|        let resp = GraphStatsResponse {
  900|      1|            namespace: Some("global".to_string()),
  901|      1|            node_count: 10,
  902|      1|            edge_count: 15,
  903|      1|            avg_degree: 3.0,
  904|      1|            max_degree: 7,
  905|      1|            elapsed_ms: 2,
  906|      1|        };
  907|      1|        let json = serde_json::to_value(&resp).unwrap();
  908|      1|        assert_eq!(json["node_count"], 10);
  909|      1|        assert_eq!(json["edge_count"], 15);
  910|      1|        assert_eq!(json["avg_degree"], 3.0);
  911|      1|        assert_eq!(json["max_degree"], 7);
  912|      1|    }
  913|       |
  914|      3|    fn compute_avg_degree(node_count: i64, edge_count: i64) -> f64 {
  915|      3|        if node_count > 0 {
  916|      2|            2.0 * (edge_count as f64) / (node_count as f64)
  917|       |        } else {
  918|      1|            0.0
  919|       |        }
  920|      3|    }
  921|       |
  922|       |    #[test]
  923|      1|    fn avg_degree_is_zero_when_no_nodes() {
  924|      1|        assert_eq!(compute_avg_degree(0, 0), 0.0);
  925|      1|    }
  926|       |
  927|       |    #[test]
  928|      1|    fn avg_degree_is_zero_when_nodes_but_no_edges() {
  929|       |        // Reproduces L1 bug: previously returned 1.0 instead of 0.0.
  930|      1|        assert_eq!(compute_avg_degree(2, 0), 0.0);
  931|      1|    }
  932|       |
  933|       |    #[test]
  934|      1|    fn avg_degree_is_two_when_triangle() {
  935|       |        // 3 nodes, 3 edges: 2 * 3 / 3 = 2.0
  936|      1|        assert_eq!(compute_avg_degree(3, 3), 2.0);
  937|      1|    }
  938|       |
  939|       |    #[test]
  940|      1|    fn graph_entities_response_serializes_required_fields() {
  941|      1|        let resp = GraphEntitiesResponse {
  942|      1|            entities: vec![EntityItem {
  943|      1|                id: 1,
  944|      1|                name: "claude-code".to_string(),
  945|      1|                entity_type: "agent".to_string(),
  946|      1|                namespace: "global".to_string(),
  947|      1|                created_at: "2026-01-01T00:00:00Z".to_string(),
  948|      1|                degree: 0,
  949|      1|                description: None,
  950|      1|            }],
  951|      1|            total_count: 1,
  952|      1|            limit: 50,
  953|      1|            offset: 0,
  954|      1|            namespace: Some("global".to_string()),
  955|      1|            elapsed_ms: 3,
  956|      1|        };
  957|      1|        let json = serde_json::to_value(&resp).unwrap();
  958|      1|        assert!(json["entities"].is_array());
  959|      1|        assert_eq!(json["entities"][0]["name"], "claude-code");
  960|      1|        assert_eq!(json["entities"][0]["entity_type"], "agent");
  961|      1|        assert_eq!(json["total_count"], 1);
  962|      1|        assert_eq!(json["limit"], 50);
  963|      1|        assert_eq!(json["offset"], 0);
  964|      1|        assert_eq!(json["namespace"], "global");
  965|      1|    }
  966|       |
  967|       |    #[test]
  968|      1|    fn entity_item_serializes_all_fields() {
  969|      1|        let item = EntityItem {
  970|      1|            id: 42,
  971|      1|            name: "test-entity".to_string(),
  972|      1|            entity_type: "concept".to_string(),
  973|      1|            namespace: "project-a".to_string(),
  974|      1|            created_at: "2026-04-19T12:00:00Z".to_string(),
  975|      1|            degree: 3,
  976|      1|            description: Some("test description".to_string()),
  977|      1|        };
  978|      1|        let json = serde_json::to_value(&item).unwrap();
  979|      1|        assert_eq!(json["id"], 42);
  980|      1|        assert_eq!(json["name"], "test-entity");
  981|      1|        assert_eq!(json["entity_type"], "concept");
  982|      1|        assert_eq!(json["namespace"], "project-a");
  983|      1|        assert_eq!(json["created_at"], "2026-04-19T12:00:00Z");
  984|      1|    }
  985|       |
  986|       |    #[test]
  987|      1|    fn entity_item_entity_type_is_never_null() {
  988|       |        // P2-C: entity_type must never be null, even when DB column is empty.
  989|      1|        let item = EntityItem {
  990|      1|            id: 1,
  991|      1|            name: "sem-tipo".to_string(),
  992|      1|            entity_type: String::new(),
  993|      1|            namespace: "ns".to_string(),
  994|      1|            created_at: "2026-01-01T00:00:00Z".to_string(),
  995|      1|            degree: 0,
  996|      1|            description: None,
  997|      1|        };
  998|      1|        let json = serde_json::to_value(&item).unwrap();
  999|      1|        assert!(
 1000|      1|            !json["entity_type"].is_null(),
 1001|      0|            "entity_type must not be null"
 1002|       |        );
 1003|      1|        assert!(json["entity_type"].is_string());
 1004|      1|    }
 1005|       |
 1006|       |    #[test]
 1007|      1|    fn graph_traverse_cli_rejects_format_dot() {
 1008|      1|        let parsed = Cli::try_parse_from([
 1009|      1|            "sqlite-graphrag",
 1010|      1|            "graph",
 1011|      1|            "traverse",
 1012|      1|            "--from",
 1013|      1|            "AuthDecision",
 1014|      1|            "--format",
 1015|      1|            "dot",
 1016|      1|        ]);
 1017|      1|        assert!(parsed.is_err(), "graph traverse must reject format=dot");
                                               ^0
 1018|      1|    }
 1019|       |
 1020|       |    #[test]
 1021|      1|    fn graph_stats_cli_accepts_format_text() {
 1022|      1|        let parsed = Cli::try_parse_from(["sqlite-graphrag", "graph", "stats", "--format", "text"])
 1023|      1|            .expect("graph stats --format text must be accepted");
 1024|       |
 1025|      1|        match parsed.command {
 1026|      1|            Commands::Graph(args) => match args.subcommand {
 1027|      1|                Some(GraphSubcommand::Stats(stats)) => {
 1028|      1|                    assert_eq!(stats.format, GraphStatsFormat::Text);
 1029|       |                }
 1030|      0|                _ => unreachable!("unexpected subcommand"),
 1031|       |            },
 1032|      0|            _ => unreachable!("unexpected command"),
 1033|       |        }
 1034|      1|    }
 1035|       |
 1036|       |    #[test]
 1037|      1|    fn graph_stats_cli_rejects_format_mermaid() {
 1038|      1|        let parsed =
 1039|      1|            Cli::try_parse_from(["sqlite-graphrag", "graph", "stats", "--format", "mermaid"]);
 1040|      1|        assert!(parsed.is_err(), "graph stats must reject format=mermaid");
                                               ^0
 1041|      1|    }
 1042|       |
 1043|       |    #[test]
 1044|      1|    fn graph_entities_response_has_no_items_key() {
 1045|      1|        let resp = GraphEntitiesResponse {
 1046|      1|            entities: vec![],
 1047|      1|            total_count: 0,
 1048|      1|            limit: 50,
 1049|      1|            offset: 0,
 1050|      1|            namespace: None,
 1051|      1|            elapsed_ms: 0,
 1052|      1|        };
 1053|      1|        let json = serde_json::to_value(&resp).unwrap();
 1054|      1|        assert!(
 1055|      1|            json.get("items").is_none(),
 1056|      0|            "legacy 'items' key must not appear"
 1057|       |        );
 1058|      1|        assert!(
 1059|      1|            json.get("entities").is_some(),
 1060|      0|            "'entities' key must be present"
 1061|       |        );
 1062|      1|    }
 1063|       |
 1064|       |    #[test]
 1065|      1|    fn build_order_by_defaults_to_name_asc() {
 1066|      1|        let clause = build_order_by(None, SortOrder::Asc);
 1067|      1|        assert_eq!(clause, "ORDER BY e.name ASC");
 1068|      1|    }
 1069|       |
 1070|       |    #[test]
 1071|      1|    fn build_order_by_name_desc() {
 1072|      1|        let clause = build_order_by(Some(EntitySortField::Name), SortOrder::Desc);
 1073|      1|        assert_eq!(clause, "ORDER BY e.name DESC");
 1074|      1|    }
 1075|       |
 1076|       |    #[test]
 1077|      1|    fn build_order_by_degree_desc() {
 1078|      1|        let clause = build_order_by(Some(EntitySortField::Degree), SortOrder::Desc);
 1079|      1|        assert_eq!(clause, "ORDER BY degree DESC");
 1080|      1|    }
 1081|       |
 1082|       |    #[test]
 1083|      1|    fn build_order_by_degree_asc() {
 1084|      1|        let clause = build_order_by(Some(EntitySortField::Degree), SortOrder::Asc);
 1085|      1|        assert_eq!(clause, "ORDER BY degree ASC");
 1086|      1|    }
 1087|       |
 1088|       |    #[test]
 1089|      1|    fn build_order_by_created_at_asc() {
 1090|      1|        let clause = build_order_by(Some(EntitySortField::CreatedAt), SortOrder::Asc);
 1091|      1|        assert_eq!(clause, "ORDER BY e.created_at ASC");
 1092|      1|    }
 1093|       |
 1094|       |    #[test]
 1095|      1|    fn build_order_by_created_at_desc() {
 1096|      1|        let clause = build_order_by(Some(EntitySortField::CreatedAt), SortOrder::Desc);
 1097|      1|        assert_eq!(clause, "ORDER BY e.created_at DESC");
 1098|      1|    }
 1099|       |
 1100|       |    #[test]
 1101|      1|    fn graph_entities_cli_accepts_sort_by_degree_desc() {
 1102|      1|        let parsed = Cli::try_parse_from([
 1103|      1|            "sqlite-graphrag",
 1104|      1|            "graph",
 1105|      1|            "entities",
 1106|      1|            "--sort-by",
 1107|      1|            "degree",
 1108|      1|            "--order",
 1109|      1|            "desc",
 1110|      1|        ])
 1111|      1|        .expect("graph entities --sort-by degree --order desc must parse");
 1112|      1|        match parsed.command {
 1113|      1|            Commands::Graph(args) => match args.subcommand {
 1114|      1|                Some(GraphSubcommand::Entities(e)) => {
 1115|      1|                    assert!(matches!(e.sort_by, Some(EntitySortField::Degree)));
                                          ^0
 1116|      1|                    assert!(matches!(e.order, SortOrder::Desc));
                                          ^0
 1117|       |                }
 1118|      0|                _ => unreachable!("unexpected subcommand"),
 1119|       |            },
 1120|      0|            _ => unreachable!("unexpected command"),
 1121|       |        }
 1122|      1|    }
 1123|       |
 1124|       |    #[test]
 1125|      1|    fn graph_entities_cli_accepts_sort_by_created_at_asc() {
 1126|      1|        let parsed = Cli::try_parse_from([
 1127|      1|            "sqlite-graphrag",
 1128|      1|            "graph",
 1129|      1|            "entities",
 1130|      1|            "--sort-by",
 1131|      1|            "created-at",
 1132|      1|        ])
 1133|      1|        .expect("graph entities --sort-by created-at must parse");
 1134|      1|        match parsed.command {
 1135|      1|            Commands::Graph(args) => match args.subcommand {
 1136|      1|                Some(GraphSubcommand::Entities(e)) => {
 1137|      1|                    assert!(matches!(e.sort_by, Some(EntitySortField::CreatedAt)));
                                          ^0
 1138|      1|                    assert!(matches!(e.order, SortOrder::Asc));
                                          ^0
 1139|       |                }
 1140|      0|                _ => unreachable!("unexpected subcommand"),
 1141|       |            },
 1142|      0|            _ => unreachable!("unexpected command"),
 1143|       |        }
 1144|      1|    }
 1145|       |
 1146|       |    #[test]
 1147|      1|    fn graph_entities_cli_defaults_to_no_sort_by() {
 1148|      1|        let parsed = Cli::try_parse_from(["sqlite-graphrag", "graph", "entities"])
 1149|      1|            .expect("graph entities must parse without sort flags");
 1150|      1|        match parsed.command {
 1151|      1|            Commands::Graph(args) => match args.subcommand {
 1152|      1|                Some(GraphSubcommand::Entities(e)) => {
 1153|      1|                    assert!(e.sort_by.is_none(), "sort_by must default to None");
                                                               ^0
 1154|      1|                    assert!(
 1155|      1|                        matches!(e.order, SortOrder::Asc),
                                      ^0
 1156|      0|                        "order must default to Asc"
 1157|       |                    );
 1158|       |                }
 1159|      0|                _ => unreachable!("unexpected subcommand"),
 1160|       |            },
 1161|      0|            _ => unreachable!("unexpected command"),
 1162|       |        }
 1163|      1|    }
 1164|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/health.rs:
    1|       |//! Handler for the `health` CLI subcommand.
    2|       |
    3|       |use crate::errors::AppError;
    4|       |use crate::output;
    5|       |use crate::paths::AppPaths;
    6|       |use crate::storage::connection::open_ro;
    7|       |use serde::Serialize;
    8|       |use std::fs;
    9|       |use std::time::Instant;
   10|       |
   11|       |#[derive(clap::Args)]
   12|       |#[command(after_long_help = "EXAMPLES:\n  \
   13|       |    # Check database health (connectivity, integrity, vector index)\n  \
   14|       |    sqlite-graphrag health\n\n  \
   15|       |    # Check health of a database at a custom path\n  \
   16|       |    sqlite-graphrag health --db /path/to/graphrag.sqlite\n\n  \
   17|       |    # Use SQLITE_GRAPHRAG_DB_PATH env var\n  \
   18|       |    SQLITE_GRAPHRAG_DB_PATH=/data/graphrag.sqlite sqlite-graphrag health")]
   19|       |pub struct HealthArgs {
   20|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   21|       |    pub db: Option<String>,
   22|       |    /// Explicit JSON flag. Accepted as a no-op because output is already JSON by default.
   23|       |    #[arg(long, default_value_t = false)]
   24|       |    pub json: bool,
   25|       |    /// Output format: `json` or `text`. JSON is always emitted on stdout regardless of the value.
   26|       |    #[arg(long, value_parser = ["json", "text"], hide = true)]
   27|       |    pub format: Option<String>,
   28|       |}
   29|       |
   30|       |#[derive(Serialize)]
   31|       |struct HealthCounts {
   32|       |    memories: i64,
   33|       |    /// Alias of `memories` for the documented contract in SKILL.md.
   34|       |    memories_total: i64,
   35|       |    entities: i64,
   36|       |    relationships: i64,
   37|       |    vec_memories: i64,
   38|       |}
   39|       |
   40|       |#[derive(Serialize)]
   41|       |struct HealthCheck {
   42|       |    name: String,
   43|       |    ok: bool,
   44|       |    #[serde(skip_serializing_if = "Option::is_none")]
   45|       |    detail: Option<String>,
   46|       |}
   47|       |
   48|       |#[derive(Serialize)]
   49|       |struct HealthResponse {
   50|       |    status: String,
   51|       |    integrity: String,
   52|       |    integrity_ok: bool,
   53|       |    schema_ok: bool,
   54|       |    vec_memories_ok: bool,
   55|       |    vec_memories_missing: i64,
   56|       |    vec_memories_orphaned: i64,
   57|       |    vec_entities_ok: bool,
   58|       |    vec_chunks_ok: bool,
   59|       |    fts_ok: bool,
   60|       |    /// Whether a live FTS5 MATCH query against fts_memories succeeded.
   61|       |    fts_query_ok: bool,
   62|       |    model_ok: bool,
   63|       |    counts: HealthCounts,
   64|       |    db_path: String,
   65|       |    db_size_bytes: u64,
   66|       |    /// MAX(version) from refinery_schema_history — number of the last applied migration.
   67|       |    /// Distinct from PRAGMA schema_version (SQLite DDL counter) and PRAGMA user_version
   68|       |    /// (canonical SCHEMA_USER_VERSION from __debug_schema).
   69|       |    schema_version: u32,
   70|       |    /// List of entities referenced by memories but absent from the entities table.
   71|       |    /// Empty in a healthy DB. Per the contract documented in SKILL.md.
   72|       |    missing_entities: Vec<String>,
   73|       |    /// WAL file size in MB (0.0 if WAL does not exist or journal_mode != wal).
   74|       |    wal_size_mb: f64,
   75|       |    /// SQLite journaling mode (wal, delete, truncate, persist, memory, off).
   76|       |    journal_mode: String,
   77|       |    /// SQLite version string, e.g. `"3.46.0"`.
   78|       |    sqlite_version: String,
   79|       |    /// Fraction of relationships that use the `mentions` relation type (0.0–1.0).
   80|       |    /// Omitted when there are no relationships in the database.
   81|       |    #[serde(skip_serializing_if = "Option::is_none")]
   82|       |    mentions_ratio: Option<f64>,
   83|       |    /// Human-readable warning when `mentions` relationships dominate the graph (ratio > 0.5).
   84|       |    /// Omitted when the ratio is within acceptable bounds or there are no relationships.
   85|       |    #[serde(skip_serializing_if = "Option::is_none")]
   86|       |    mentions_warning: Option<String>,
   87|       |    /// The relation type with the highest edge count in the namespace.
   88|       |    /// Omitted when there are no relationships in the database.
   89|       |    #[serde(skip_serializing_if = "Option::is_none")]
   90|       |    top_relation: Option<String>,
   91|       |    /// Fraction of all edges occupied by `top_relation` (0.0–1.0).
   92|       |    /// Omitted when there are no relationships in the database.
   93|       |    #[serde(skip_serializing_if = "Option::is_none")]
   94|       |    top_relation_ratio: Option<f64>,
   95|       |    /// Fraction of relationships that use the `applies_to` relation type (0.0–1.0).
   96|       |    /// Omitted when there are no relationships or when `applies_to` is absent.
   97|       |    #[serde(skip_serializing_if = "Option::is_none")]
   98|       |    applies_to_ratio: Option<f64>,
   99|       |    /// Human-readable warning when a single relation type occupies more than 40 % of edges.
  100|       |    /// Omitted when concentration is within acceptable bounds or there are no relationships.
  101|       |    #[serde(skip_serializing_if = "Option::is_none")]
  102|       |    relation_concentration_warning: Option<String>,
  103|       |    /// Number of entities whose name differs from its normalized kebab-case form.
  104|       |    #[serde(skip_serializing_if = "Option::is_none")]
  105|       |    non_normalized_count: Option<i64>,
  106|       |    /// Warning when non-normalized entities are detected.
  107|       |    #[serde(skip_serializing_if = "Option::is_none")]
  108|       |    normalization_warning: Option<String>,
  109|       |    /// Number of entities with degree exceeding the super-hub threshold (default 50).
  110|       |    #[serde(skip_serializing_if = "Option::is_none")]
  111|       |    super_hub_count: Option<i64>,
  112|       |    /// Warning listing top super-hub entity names.
  113|       |    #[serde(skip_serializing_if = "Option::is_none")]
  114|       |    super_hub_warning: Option<String>,
  115|       |    /// Name of the entity with the highest connection count in the namespace.
  116|       |    /// Omitted when there are no entities in the database.
  117|       |    #[serde(skip_serializing_if = "Option::is_none")]
  118|       |    top_hub_entity: Option<String>,
  119|       |    /// Number of connections (degree) of `top_hub_entity`.
  120|       |    /// Omitted when there are no entities in the database.
  121|       |    #[serde(skip_serializing_if = "Option::is_none")]
  122|       |    top_hub_degree: Option<i64>,
  123|       |    /// Human-readable warning when `top_hub_entity` exceeds 50 connections.
  124|       |    /// Omitted when degree is within acceptable bounds or there are no entities.
  125|       |    #[serde(skip_serializing_if = "Option::is_none")]
  126|       |    hub_warning: Option<String>,
  127|       |    checks: Vec<HealthCheck>,
  128|       |    elapsed_ms: u64,
  129|       |}
  130|       |
  131|       |/// Checks whether a table (including virtual ones) exists in sqlite_master.
  132|      0|fn table_exists(conn: &rusqlite::Connection, table_name: &str) -> bool {
  133|      0|    conn.query_row(
  134|      0|        "SELECT COUNT(*) FROM sqlite_master WHERE type IN ('table', 'shadow') AND name = ?1",
  135|      0|        rusqlite::params![table_name],
  136|      0|        |r| r.get::<_, i64>(0),
  137|       |    )
  138|      0|    .unwrap_or(0)
  139|       |        > 0
  140|      0|}
  141|       |
  142|      0|pub fn run(args: HealthArgs) -> Result<(), AppError> {
  143|      0|    let start = Instant::now();
  144|      0|    let _ = args.json; // --json is a no-op because output is already JSON by default
  145|      0|    let _ = args.format; // --format is a no-op; JSON is always emitted on stdout
  146|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
  147|       |
  148|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
  149|       |
  150|      0|    let conn = open_ro(&paths.db)?;
  151|       |
  152|      0|    let integrity: String = conn.query_row("PRAGMA integrity_check;", [], |r| r.get(0))?;
  153|      0|    let integrity_ok = integrity == "ok";
  154|      0|    tracing::info!(target: "health", integrity_ok = %integrity_ok, "PRAGMA integrity_check complete");
  155|       |
  156|      0|    if !integrity_ok {
  157|      0|        let db_size_bytes = fs::metadata(&paths.db).map(|m| m.len()).unwrap_or(0);
  158|      0|        output::emit_json(&HealthResponse {
  159|      0|            status: "degraded".to_string(),
  160|      0|            integrity: integrity.clone(),
  161|      0|            integrity_ok: false,
  162|      0|            schema_ok: false,
  163|      0|            vec_memories_ok: false,
  164|      0|            vec_memories_missing: 0,
  165|      0|            vec_memories_orphaned: 0,
  166|      0|            vec_entities_ok: false,
  167|      0|            vec_chunks_ok: false,
  168|      0|            fts_ok: false,
  169|      0|            fts_query_ok: false,
  170|      0|            model_ok: false,
  171|      0|            counts: HealthCounts {
  172|      0|                memories: 0,
  173|      0|                memories_total: 0,
  174|      0|                entities: 0,
  175|      0|                relationships: 0,
  176|      0|                vec_memories: 0,
  177|      0|            },
  178|      0|            db_path: paths.db.display().to_string(),
  179|      0|            db_size_bytes,
  180|      0|            schema_version: 0,
  181|      0|            sqlite_version: "unknown".to_string(),
  182|      0|            missing_entities: vec![],
  183|      0|            wal_size_mb: 0.0,
  184|      0|            journal_mode: "unknown".to_string(),
  185|      0|            mentions_ratio: None,
  186|      0|            mentions_warning: None,
  187|      0|            top_relation: None,
  188|      0|            top_relation_ratio: None,
  189|      0|            applies_to_ratio: None,
  190|      0|            relation_concentration_warning: None,
  191|      0|            non_normalized_count: None,
  192|      0|            normalization_warning: None,
  193|      0|            super_hub_count: None,
  194|      0|            super_hub_warning: None,
  195|      0|            top_hub_entity: None,
  196|      0|            top_hub_degree: None,
  197|      0|            hub_warning: None,
  198|      0|            checks: vec![HealthCheck {
  199|      0|                name: "integrity".to_string(),
  200|      0|                ok: false,
  201|      0|                detail: Some(integrity),
  202|      0|            }],
  203|      0|            elapsed_ms: start.elapsed().as_millis() as u64,
  204|      0|        })?;
  205|      0|        return Err(AppError::Database(rusqlite::Error::SqliteFailure(
  206|      0|            rusqlite::ffi::Error::new(rusqlite::ffi::SQLITE_CORRUPT),
  207|      0|            Some("integrity check failed".to_string()),
  208|      0|        )));
  209|      0|    }
  210|       |
  211|      0|    let memories_count: i64 = conn.query_row(
  212|      0|        "SELECT COUNT(*) FROM memories WHERE deleted_at IS NULL",
  213|      0|        [],
  214|      0|        |r| r.get(0),
  215|      0|    )?;
  216|      0|    let entities_count: i64 = conn.query_row("SELECT COUNT(*) FROM entities", [], |r| r.get(0))?;
  217|      0|    let relationships_count: i64 =
  218|      0|        conn.query_row("SELECT COUNT(*) FROM relationships", [], |r| r.get(0))?;
  219|      0|    let vec_memories_count: i64 =
  220|      0|        conn.query_row("SELECT COUNT(*) FROM vec_memories", [], |r| r.get(0))?;
  221|       |
  222|      0|    let mentions_count: i64 = conn.query_row(
  223|      0|        "SELECT COUNT(*) FROM relationships WHERE relation = 'mentions'",
  224|      0|        [],
  225|      0|        |r| r.get(0),
  226|      0|    )?;
  227|      0|    let (mentions_ratio, mentions_warning) = if relationships_count > 0 {
  228|      0|        let ratio = mentions_count as f64 / relationships_count as f64;
  229|      0|        let warning = if ratio > 0.5 {
  230|      0|            Some(format!(
  231|      0|                "mentions relationships dominate graph at {:.1}% ({}/{} total); consider running prune-relations --relation mentions --dry-run",
  232|      0|                ratio * 100.0,
  233|      0|                mentions_count,
  234|      0|                relationships_count
  235|      0|            ))
  236|       |        } else {
  237|      0|            None
  238|       |        };
  239|      0|        (Some(ratio), warning)
  240|       |    } else {
  241|      0|        (None, None)
  242|       |    };
  243|       |
  244|       |    // Relation concentration: find the most frequent relation type and check threshold.
  245|      0|    let (top_relation, top_relation_ratio, applies_to_ratio, relation_concentration_warning) =
  246|      0|        if relationships_count > 0 {
  247|       |            // Identify the relation with the highest edge count.
  248|      0|            let (top_rel, top_count): (String, i64) = conn
  249|      0|                .query_row(
  250|      0|                    "SELECT relation, COUNT(*) AS cnt
  251|      0|                     FROM relationships
  252|      0|                     GROUP BY relation
  253|      0|                     ORDER BY cnt DESC
  254|      0|                     LIMIT 1",
  255|      0|                    [],
  256|      0|                    |r| Ok((r.get::<_, String>(0)?, r.get::<_, i64>(1)?)),
  257|       |                )
  258|      0|                .unwrap_or_else(|_| ("unknown".to_string(), 0));
  259|       |
  260|      0|            let top_ratio = top_count as f64 / relationships_count as f64;
  261|       |
  262|       |            // Compute applies_to ratio separately (may be 0 if absent).
  263|      0|            let applies_count: i64 = conn
  264|      0|                .query_row(
  265|      0|                    "SELECT COUNT(*) FROM relationships WHERE relation = 'applies_to'",
  266|      0|                    [],
  267|      0|                    |r| r.get(0),
  268|       |                )
  269|      0|                .unwrap_or(0);
  270|      0|            let at_ratio = if applies_count > 0 {
  271|      0|                Some(applies_count as f64 / relationships_count as f64)
  272|       |            } else {
  273|      0|                None
  274|       |            };
  275|       |
  276|      0|            let concentration_warning = if top_ratio > 0.40 {
  277|      0|                Some(format!(
  278|      0|                    "relation '{}' dominates graph at {:.1}% ({}/{} total); consider running prune-relations --relation {} --dry-run",
  279|      0|                    top_rel,
  280|      0|                    top_ratio * 100.0,
  281|      0|                    top_count,
  282|      0|                    relationships_count,
  283|      0|                    top_rel,
  284|      0|                ))
  285|       |            } else {
  286|      0|                None
  287|       |            };
  288|       |
  289|      0|            (
  290|      0|                Some(top_rel),
  291|      0|                Some(top_ratio),
  292|      0|                at_ratio,
  293|      0|                concentration_warning,
  294|      0|            )
  295|       |        } else {
  296|      0|            (None, None, None, None)
  297|       |        };
  298|       |
  299|      0|    let status = "ok";
  300|       |
  301|      0|    let schema_version: u32 = conn
  302|      0|        .query_row(
  303|      0|            "SELECT COALESCE(MAX(version), 0) FROM refinery_schema_history",
  304|      0|            [],
  305|      0|            |r| r.get::<_, i64>(0),
  306|       |        )
  307|      0|        .unwrap_or(0) as u32;
  308|       |
  309|      0|    let schema_ok = schema_version > 0;
  310|       |
  311|       |    // Checks vector tables via sqlite_master
  312|      0|    let vec_memories_ok = table_exists(&conn, "vec_memories");
  313|      0|    let vec_entities_ok = table_exists(&conn, "vec_entities");
  314|      0|    let vec_chunks_ok = table_exists(&conn, "vec_chunks");
  315|       |
  316|      0|    let vec_memories_missing: i64 = if vec_memories_ok {
  317|      0|        conn.query_row(
  318|      0|            "SELECT COUNT(*) FROM memories m LEFT JOIN vec_memories v ON v.memory_id = m.id WHERE v.memory_id IS NULL AND m.deleted_at IS NULL",
  319|      0|            [], |r| r.get(0),
  320|      0|        ).unwrap_or(0)
  321|       |    } else {
  322|      0|        0
  323|       |    };
  324|       |
  325|      0|    let vec_memories_orphaned: i64 = if vec_memories_ok {
  326|      0|        conn.query_row(
  327|      0|            "SELECT COUNT(*) FROM vec_memories v LEFT JOIN memories m ON m.id = v.memory_id WHERE m.id IS NULL",
  328|      0|            [], |r| r.get(0),
  329|      0|        ).unwrap_or(0)
  330|       |    } else {
  331|      0|        0
  332|       |    };
  333|       |
  334|      0|    tracing::info!(target: "health", vec_memories_ok = %vec_memories_ok, vec_entities_ok = %vec_entities_ok, vec_missing = vec_memories_missing, vec_orphaned = vec_memories_orphaned, "vector table checks complete");
  335|      0|    let fts_ok = table_exists(&conn, "fts_memories");
  336|       |
  337|       |    // Verifies that FTS5 can execute a MATCH query (catches index corruption distinct from table absence).
  338|      0|    let fts_query_ok = if fts_ok {
  339|      0|        conn.query_row(
  340|      0|            "SELECT COUNT(*) FROM fts_memories WHERE fts_memories MATCH 'a' LIMIT 1",
  341|      0|            [],
  342|      0|            |r| r.get::<_, i64>(0),
  343|       |        )
  344|      0|        .is_ok()
  345|       |    } else {
  346|      0|        false
  347|       |    };
  348|       |
  349|      0|    tracing::info!(target: "health", fts_ok = %fts_ok, fts_query_ok = %fts_query_ok, "FTS5 checks complete");
  350|       |
  351|       |    // Captures the SQLite runtime version for observability.
  352|      0|    let sqlite_version: String = conn
  353|      0|        .query_row("SELECT sqlite_version()", [], |r| r.get(0))
  354|      0|        .unwrap_or_else(|_| "unknown".to_string());
  355|       |
  356|       |    // Detects orphan entities referenced by memories but absent from the entities table.
  357|      0|    let mut missing_entities: Vec<String> = Vec::with_capacity(4);
  358|      0|    let mut stmt = conn.prepare_cached(
  359|      0|        "SELECT DISTINCT me.entity_id
  360|      0|         FROM memory_entities me
  361|      0|         LEFT JOIN entities e ON e.id = me.entity_id
  362|      0|         WHERE e.id IS NULL",
  363|      0|    )?;
  364|      0|    let orphans: Vec<i64> = stmt
  365|      0|        .query_map([], |r| r.get(0))?
  366|      0|        .collect::<Result<Vec<_>, _>>()?;
  367|      0|    for id in orphans {
  368|      0|        missing_entities.push(format!("entity_id={id}"));
  369|      0|    }
  370|       |
  371|      0|    let journal_mode: String = conn
  372|      0|        .query_row("PRAGMA journal_mode", [], |row| row.get::<_, String>(0))
  373|      0|        .unwrap_or_else(|_| "unknown".to_string());
  374|       |
  375|      0|    let wal_size_mb = fs::metadata(format!("{}-wal", paths.db.display()))
  376|      0|        .map(|m| m.len() as f64 / 1024.0 / 1024.0)
  377|      0|        .unwrap_or(0.0);
  378|       |
  379|       |    // Database file size in bytes
  380|      0|    let db_size_bytes = fs::metadata(&paths.db).map(|m| m.len()).unwrap_or(0);
  381|       |
  382|       |    // Checks whether the ONNX model is present in the cache
  383|      0|    let model_dir = paths.models.join("models--intfloat--multilingual-e5-small");
  384|      0|    let model_ok = model_dir.exists();
  385|      0|    tracing::info!(target: "health", model_ok = %model_ok, "embedding model check complete");
  386|       |
  387|       |    // Builds the checks array for detailed diagnostics
  388|      0|    let mut checks: Vec<HealthCheck> = Vec::with_capacity(8);
  389|       |
  390|       |    // At this point integrity_ok is always true (corrupt DB returned early above).
  391|      0|    checks.push(HealthCheck {
  392|      0|        name: "integrity".to_string(),
  393|      0|        ok: true,
  394|      0|        detail: None,
  395|      0|    });
  396|       |
  397|      0|    checks.push(HealthCheck {
  398|      0|        name: "schema_version".to_string(),
  399|      0|        ok: schema_ok,
  400|      0|        detail: if schema_ok {
  401|      0|            None
  402|       |        } else {
  403|      0|            Some(format!("schema_version={schema_version} (expected >0)"))
  404|       |        },
  405|       |    });
  406|       |
  407|      0|    checks.push(HealthCheck {
  408|      0|        name: "vec_memories".to_string(),
  409|      0|        ok: vec_memories_ok,
  410|      0|        detail: if vec_memories_ok {
  411|      0|            None
  412|       |        } else {
  413|      0|            Some("vec_memories table missing from sqlite_master".to_string())
  414|       |        },
  415|       |    });
  416|       |
  417|      0|    checks.push(HealthCheck {
  418|      0|        name: "vec_entities".to_string(),
  419|      0|        ok: vec_entities_ok,
  420|      0|        detail: if vec_entities_ok {
  421|      0|            None
  422|       |        } else {
  423|      0|            Some("vec_entities table missing from sqlite_master".to_string())
  424|       |        },
  425|       |    });
  426|       |
  427|      0|    checks.push(HealthCheck {
  428|      0|        name: "vec_chunks".to_string(),
  429|      0|        ok: vec_chunks_ok,
  430|      0|        detail: if vec_chunks_ok {
  431|      0|            None
  432|       |        } else {
  433|      0|            Some("vec_chunks table missing from sqlite_master".to_string())
  434|       |        },
  435|       |    });
  436|       |
  437|      0|    checks.push(HealthCheck {
  438|      0|        name: "fts_memories".to_string(),
  439|      0|        ok: fts_ok,
  440|      0|        detail: if fts_ok {
  441|      0|            None
  442|       |        } else {
  443|      0|            Some("fts_memories table missing from sqlite_master".to_string())
  444|       |        },
  445|       |    });
  446|       |
  447|      0|    checks.push(HealthCheck {
  448|      0|        name: "fts_query".to_string(),
  449|      0|        ok: fts_query_ok,
  450|      0|        detail: if fts_query_ok {
  451|      0|            None
  452|       |        } else {
  453|      0|            Some("FTS5 MATCH query failed — run 'sqlite-graphrag fts rebuild'".to_string())
  454|       |        },
  455|       |    });
  456|       |
  457|      0|    checks.push(HealthCheck {
  458|      0|        name: "model_onnx".to_string(),
  459|      0|        ok: model_ok,
  460|      0|        detail: if model_ok {
  461|      0|            None
  462|       |        } else {
  463|      0|            Some(format!(
  464|      0|                "model missing at {}; run 'sqlite-graphrag models download'",
  465|      0|                model_dir.display()
  466|      0|            ))
  467|       |        },
  468|       |    });
  469|       |
  470|       |    // G24: detect non-normalized entity names
  471|      0|    let (non_normalized_count, normalization_warning) = {
  472|      0|        let mut stmt = conn.prepare_cached("SELECT name FROM entities")?;
  473|      0|        let names: Vec<String> = stmt
  474|      0|            .query_map([], |r| r.get(0))?
  475|      0|            .filter_map(|r| r.ok())
  476|      0|            .collect();
  477|      0|        let count = names
  478|      0|            .iter()
  479|      0|            .filter(|n| crate::parsers::normalize_entity_name(n) != **n)
  480|      0|            .count() as i64;
  481|      0|        let warning = if count > 0 {
  482|      0|            Some(format!(
  483|      0|                "run 'normalize-entities --yes' to fix {count} non-normalized entities"
  484|      0|            ))
  485|       |        } else {
  486|      0|            None
  487|       |        };
  488|      0|        (Some(count), warning)
  489|       |    };
  490|       |
  491|       |    // G25: detect super-hub entities (degree > 50)
  492|      0|    let (super_hub_count, super_hub_warning) = {
  493|      0|        let mut stmt = conn.prepare_cached(
  494|      0|            "SELECT e.name, COUNT(r.id) as deg FROM entities e \
  495|      0|             LEFT JOIN relationships r ON e.id = r.source_id OR e.id = r.target_id \
  496|      0|             GROUP BY e.id HAVING deg > 50 ORDER BY deg DESC LIMIT 5",
  497|      0|        )?;
  498|      0|        let hubs: Vec<(String, i64)> = stmt
  499|      0|            .query_map([], |r| Ok((r.get(0)?, r.get(1)?)))?
  500|      0|            .filter_map(|r| r.ok())
  501|      0|            .collect();
  502|      0|        let count = hubs.len() as i64;
  503|      0|        let warning = if count > 0 {
  504|      0|            let names: Vec<String> = hubs
  505|      0|                .iter()
  506|      0|                .map(|(n, d)| format!("{n} (degree {d})"))
  507|      0|                .collect();
  508|      0|            Some(format!("super-hubs detected: {}", names.join(", ")))
  509|       |        } else {
  510|      0|            None
  511|       |        };
  512|      0|        (Some(count), warning)
  513|       |    };
  514|       |
  515|       |    // G25 (extended): identify the single highest-degree entity for programmatic use.
  516|      0|    let (top_hub_entity, top_hub_degree, hub_warning) = {
  517|      0|        let result: Option<(String, i64)> = conn
  518|      0|            .query_row(
  519|      0|                "SELECT e.name, COUNT(r.id) AS degree
  520|      0|                 FROM entities e
  521|      0|                 LEFT JOIN relationships r ON e.id = r.source_id OR e.id = r.target_id
  522|      0|                 GROUP BY e.id
  523|      0|                 ORDER BY degree DESC
  524|      0|                 LIMIT 1",
  525|      0|                [],
  526|      0|                |r| Ok((r.get::<_, String>(0)?, r.get::<_, i64>(1)?)),
  527|       |            )
  528|      0|            .ok();
  529|      0|        match result {
  530|      0|            Some((name, degree)) => {
  531|      0|                let warning = if degree > 50 {
  532|      0|                    Some(format!(
  533|      0|                        "entity '{name}' has {degree} connections; consider splitting or using --max-neighbors-per-hop"
  534|      0|                    ))
  535|       |                } else {
  536|      0|                    None
  537|       |                };
  538|      0|                (Some(name), Some(degree), warning)
  539|       |            }
  540|      0|            None => (None, None, None),
  541|       |        }
  542|       |    };
  543|       |
  544|      0|    let response = HealthResponse {
  545|      0|        status: status.to_string(),
  546|      0|        integrity,
  547|      0|        integrity_ok,
  548|      0|        schema_ok,
  549|      0|        vec_memories_ok,
  550|      0|        vec_memories_missing,
  551|      0|        vec_memories_orphaned,
  552|      0|        vec_entities_ok,
  553|      0|        vec_chunks_ok,
  554|      0|        fts_ok,
  555|      0|        fts_query_ok,
  556|      0|        model_ok,
  557|      0|        counts: HealthCounts {
  558|      0|            memories: memories_count,
  559|      0|            memories_total: memories_count,
  560|      0|            entities: entities_count,
  561|      0|            relationships: relationships_count,
  562|      0|            vec_memories: vec_memories_count,
  563|      0|        },
  564|      0|        db_path: paths.db.display().to_string(),
  565|      0|        db_size_bytes,
  566|      0|        schema_version,
  567|      0|        sqlite_version,
  568|      0|        missing_entities,
  569|      0|        wal_size_mb,
  570|      0|        journal_mode,
  571|      0|        mentions_ratio,
  572|      0|        mentions_warning,
  573|      0|        top_relation,
  574|      0|        top_relation_ratio,
  575|      0|        applies_to_ratio,
  576|      0|        relation_concentration_warning,
  577|      0|        non_normalized_count,
  578|      0|        normalization_warning,
  579|      0|        super_hub_count,
  580|      0|        super_hub_warning,
  581|      0|        top_hub_entity,
  582|      0|        top_hub_degree,
  583|      0|        hub_warning,
  584|      0|        checks,
  585|      0|        elapsed_ms: start.elapsed().as_millis() as u64,
  586|      0|    };
  587|       |
  588|      0|    output::emit_json(&response)?;
  589|       |
  590|      0|    Ok(())
  591|      0|}
  592|       |
  593|       |#[cfg(test)]
  594|       |mod tests {
  595|       |    use super::*;
  596|       |
  597|       |    #[test]
  598|      1|    fn health_check_serializes_all_new_fields() {
  599|      1|        let response = HealthResponse {
  600|      1|            status: "ok".to_string(),
  601|      1|            integrity: "ok".to_string(),
  602|      1|            integrity_ok: true,
  603|      1|            schema_ok: true,
  604|      1|            vec_memories_ok: true,
  605|      1|            vec_memories_missing: 0,
  606|      1|            vec_memories_orphaned: 0,
  607|      1|            vec_entities_ok: true,
  608|      1|            vec_chunks_ok: true,
  609|      1|            fts_ok: true,
  610|      1|            fts_query_ok: true,
  611|      1|            model_ok: false,
  612|      1|            counts: HealthCounts {
  613|      1|                memories: 5,
  614|      1|                memories_total: 5,
  615|      1|                entities: 3,
  616|      1|                relationships: 2,
  617|      1|                vec_memories: 5,
  618|      1|            },
  619|      1|            db_path: "/tmp/test.sqlite".to_string(),
  620|      1|            db_size_bytes: 4096,
  621|      1|            schema_version: 6,
  622|      1|            sqlite_version: "3.46.0".to_string(),
  623|      1|            elapsed_ms: 0,
  624|      1|            missing_entities: vec![],
  625|      1|            wal_size_mb: 0.0,
  626|      1|            journal_mode: "wal".to_string(),
  627|      1|            mentions_ratio: None,
  628|      1|            mentions_warning: None,
  629|      1|            top_relation: None,
  630|      1|            top_relation_ratio: None,
  631|      1|            applies_to_ratio: None,
  632|      1|            relation_concentration_warning: None,
  633|      1|            non_normalized_count: None,
  634|      1|            normalization_warning: None,
  635|      1|            super_hub_count: None,
  636|      1|            super_hub_warning: None,
  637|      1|            top_hub_entity: None,
  638|      1|            top_hub_degree: None,
  639|      1|            hub_warning: None,
  640|      1|            checks: vec![
  641|      1|                HealthCheck {
  642|      1|                    name: "integrity".to_string(),
  643|      1|                    ok: true,
  644|      1|                    detail: None,
  645|      1|                },
  646|      1|                HealthCheck {
  647|      1|                    name: "model_onnx".to_string(),
  648|      1|                    ok: false,
  649|      1|                    detail: Some("model missing".to_string()),
  650|      1|                },
  651|      1|            ],
  652|      1|        };
  653|       |
  654|      1|        let json = serde_json::to_value(&response).unwrap();
  655|      1|        assert_eq!(json["status"], "ok");
  656|      1|        assert_eq!(json["integrity_ok"], true);
  657|      1|        assert_eq!(json["schema_ok"], true);
  658|      1|        assert_eq!(json["vec_memories_ok"], true);
  659|      1|        assert_eq!(json["vec_entities_ok"], true);
  660|      1|        assert_eq!(json["vec_chunks_ok"], true);
  661|      1|        assert_eq!(json["fts_ok"], true);
  662|      1|        assert_eq!(json["model_ok"], false);
  663|      1|        assert_eq!(json["db_size_bytes"], 4096u64);
  664|      1|        assert!(json["checks"].is_array());
  665|      1|        assert_eq!(json["checks"].as_array().unwrap().len(), 2);
  666|       |
  667|       |        // Verifies that detail is absent when ok=true (skip_serializing_if)
  668|      1|        let integrity_check = &json["checks"][0];
  669|      1|        assert_eq!(integrity_check["name"], "integrity");
  670|      1|        assert_eq!(integrity_check["ok"], true);
  671|      1|        assert!(integrity_check.get("detail").is_none());
  672|       |
  673|       |        // Verifies that detail is present when ok=false
  674|      1|        let model_check = &json["checks"][1];
  675|      1|        assert_eq!(model_check["name"], "model_onnx");
  676|      1|        assert_eq!(model_check["ok"], false);
  677|      1|        assert_eq!(model_check["detail"], "model missing");
  678|      1|    }
  679|       |
  680|       |    #[test]
  681|      1|    fn health_check_without_detail_omits_field() {
  682|      1|        let check = HealthCheck {
  683|      1|            name: "vec_memories".to_string(),
  684|      1|            ok: true,
  685|      1|            detail: None,
  686|      1|        };
  687|      1|        let json = serde_json::to_value(&check).unwrap();
  688|      1|        assert!(
  689|      1|            json.get("detail").is_none(),
  690|      0|            "detail field must be omitted when None"
  691|       |        );
  692|      1|    }
  693|       |
  694|       |    #[test]
  695|      1|    fn health_check_with_detail_serializes_field() {
  696|      1|        let check = HealthCheck {
  697|      1|            name: "fts_memories".to_string(),
  698|      1|            ok: false,
  699|      1|            detail: Some("fts_memories table missing from sqlite_master".to_string()),
  700|      1|        };
  701|      1|        let json = serde_json::to_value(&check).unwrap();
  702|      1|        assert_eq!(
  703|      1|            json["detail"],
  704|       |            "fts_memories table missing from sqlite_master"
  705|       |        );
  706|      1|    }
  707|       |
  708|       |    #[test]
  709|      1|    fn health_response_fts_query_ok_and_sqlite_version_serialize() {
  710|       |        // Verifies that fts_query_ok and sqlite_version appear in the serialized JSON
  711|       |        // with the expected keys and values.
  712|      1|        let response = HealthResponse {
  713|      1|            status: "ok".to_string(),
  714|      1|            integrity: "ok".to_string(),
  715|      1|            integrity_ok: true,
  716|      1|            schema_ok: true,
  717|      1|            vec_memories_ok: true,
  718|      1|            vec_memories_missing: 0,
  719|      1|            vec_memories_orphaned: 0,
  720|      1|            vec_entities_ok: true,
  721|      1|            vec_chunks_ok: true,
  722|      1|            fts_ok: true,
  723|      1|            fts_query_ok: true,
  724|      1|            model_ok: true,
  725|      1|            counts: HealthCounts {
  726|      1|                memories: 0,
  727|      1|                memories_total: 0,
  728|      1|                entities: 0,
  729|      1|                relationships: 0,
  730|      1|                vec_memories: 0,
  731|      1|            },
  732|      1|            db_path: "/tmp/test.sqlite".to_string(),
  733|      1|            db_size_bytes: 0,
  734|      1|            schema_version: 1,
  735|      1|            sqlite_version: "3.45.1".to_string(),
  736|      1|            elapsed_ms: 0,
  737|      1|            missing_entities: vec![],
  738|      1|            wal_size_mb: 0.0,
  739|      1|            journal_mode: "wal".to_string(),
  740|      1|            mentions_ratio: None,
  741|      1|            mentions_warning: None,
  742|      1|            top_relation: None,
  743|      1|            top_relation_ratio: None,
  744|      1|            applies_to_ratio: None,
  745|      1|            relation_concentration_warning: None,
  746|      1|            non_normalized_count: None,
  747|      1|            normalization_warning: None,
  748|      1|            super_hub_count: None,
  749|      1|            super_hub_warning: None,
  750|      1|            top_hub_entity: None,
  751|      1|            top_hub_degree: None,
  752|      1|            hub_warning: None,
  753|      1|            checks: vec![],
  754|      1|        };
  755|       |
  756|      1|        let json = serde_json::to_value(&response).unwrap();
  757|       |
  758|       |        // fts_query_ok must appear at the top level
  759|      1|        assert_eq!(
  760|      1|            json["fts_query_ok"], true,
  761|      0|            "fts_query_ok must be present and true in serialized JSON"
  762|       |        );
  763|       |
  764|       |        // sqlite_version must appear at the top level with the exact string
  765|      1|        assert_eq!(
  766|      1|            json["sqlite_version"], "3.45.1",
  767|      0|            "sqlite_version must be present and match the provided string"
  768|       |        );
  769|       |
  770|       |        // Verify fts_query_ok=false path includes the expected detail message
  771|      1|        let check_fail = HealthCheck {
  772|      1|            name: "fts_query".to_string(),
  773|      1|            ok: false,
  774|      1|            detail: Some("FTS5 MATCH query failed — run 'sqlite-graphrag fts rebuild'".to_string()),
  775|      1|        };
  776|      1|        let check_json = serde_json::to_value(&check_fail).unwrap();
  777|      1|        assert_eq!(check_json["name"], "fts_query");
  778|      1|        assert_eq!(check_json["ok"], false);
  779|      1|        assert_eq!(
  780|      1|            check_json["detail"],
  781|       |            "FTS5 MATCH query failed — run 'sqlite-graphrag fts rebuild'"
  782|       |        );
  783|      1|    }
  784|       |
  785|      5|    fn make_full_response(
  786|      5|        top_relation: Option<String>,
  787|      5|        top_relation_ratio: Option<f64>,
  788|      5|        applies_to_ratio: Option<f64>,
  789|      5|        relation_concentration_warning: Option<String>,
  790|      5|    ) -> HealthResponse {
  791|      5|        HealthResponse {
  792|      5|            status: "ok".to_string(),
  793|      5|            integrity: "ok".to_string(),
  794|      5|            integrity_ok: true,
  795|      5|            schema_ok: true,
  796|      5|            vec_memories_ok: true,
  797|      5|            vec_memories_missing: 0,
  798|      5|            vec_memories_orphaned: 0,
  799|      5|            vec_entities_ok: true,
  800|      5|            vec_chunks_ok: true,
  801|      5|            fts_ok: true,
  802|      5|            fts_query_ok: true,
  803|      5|            model_ok: true,
  804|      5|            counts: HealthCounts {
  805|      5|                memories: 10,
  806|      5|                memories_total: 10,
  807|      5|                entities: 5,
  808|      5|                relationships: 20,
  809|      5|                vec_memories: 10,
  810|      5|            },
  811|      5|            db_path: "/tmp/test.sqlite".to_string(),
  812|      5|            db_size_bytes: 8192,
  813|      5|            schema_version: 3,
  814|      5|            sqlite_version: "3.46.0".to_string(),
  815|      5|            elapsed_ms: 1,
  816|      5|            missing_entities: vec![],
  817|      5|            wal_size_mb: 0.0,
  818|      5|            journal_mode: "wal".to_string(),
  819|      5|            mentions_ratio: None,
  820|      5|            mentions_warning: None,
  821|      5|            top_relation,
  822|      5|            top_relation_ratio,
  823|      5|            applies_to_ratio,
  824|      5|            relation_concentration_warning,
  825|      5|            non_normalized_count: None,
  826|      5|            normalization_warning: None,
  827|      5|            super_hub_count: None,
  828|      5|            super_hub_warning: None,
  829|      5|            top_hub_entity: None,
  830|      5|            top_hub_degree: None,
  831|      5|            hub_warning: None,
  832|      5|            checks: vec![],
  833|      5|        }
  834|      5|    }
  835|       |
  836|       |    #[test]
  837|      1|    fn health_concentration_fields_omitted_when_no_relationships() {
  838|       |        // Represents a DB with zero relationships.
  839|      1|        let resp = make_full_response(None, None, None, None);
  840|      1|        let json = serde_json::to_value(&resp).unwrap();
  841|      1|        assert!(
  842|      1|            json.get("top_relation").is_none(),
  843|      0|            "top_relation must be omitted when None"
  844|       |        );
  845|      1|        assert!(
  846|      1|            json.get("top_relation_ratio").is_none(),
  847|      0|            "top_relation_ratio must be omitted when None"
  848|       |        );
  849|      1|        assert!(
  850|      1|            json.get("applies_to_ratio").is_none(),
  851|      0|            "applies_to_ratio must be omitted when None"
  852|       |        );
  853|      1|        assert!(
  854|      1|            json.get("relation_concentration_warning").is_none(),
  855|      0|            "relation_concentration_warning must be omitted when None"
  856|       |        );
  857|      1|    }
  858|       |
  859|       |    #[test]
  860|      1|    fn health_concentration_fields_present_with_data() {
  861|      1|        let resp = make_full_response(
  862|      1|            Some("mentions".to_string()),
  863|      1|            Some(0.60),
  864|      1|            Some(0.10),
  865|      1|            Some("relation 'mentions' dominates graph at 60.0%".to_string()),
  866|       |        );
  867|      1|        let json = serde_json::to_value(&resp).unwrap();
  868|      1|        assert_eq!(json["top_relation"], "mentions");
  869|      1|        assert!((json["top_relation_ratio"].as_f64().unwrap() - 0.60).abs() < 1e-9);
  870|      1|        assert!((json["applies_to_ratio"].as_f64().unwrap() - 0.10).abs() < 1e-9);
  871|      1|        assert!(json["relation_concentration_warning"]
  872|      1|            .as_str()
  873|      1|            .unwrap()
  874|      1|            .contains("60.0%"));
  875|      1|    }
  876|       |
   877|       |    #[test]
   878|      1|    fn health_concentration_warning_absent_when_ratio_below_threshold() {
   879|       |        // Fixture mirrors a ratio of 0.39 (below 0.40) with no warning set; only serialization is checked.
   880|      1|        let resp = make_full_response(Some("uses".to_string()), Some(0.39), None, None);
   881|      1|        let json = serde_json::to_value(&resp).unwrap();
   882|      1|        assert_eq!(json["top_relation"], "uses");
   883|      1|        assert!(
   884|      1|            json.get("relation_concentration_warning").is_none(),
   885|      0|            "warning must be absent when ratio <= 0.40"
   886|       |        );
   887|      1|    }
  888|       |
   889|       |    #[test]
   890|      1|    fn health_concentration_warning_present_at_threshold() {
   891|       |        // Fixture mirrors a ratio of 0.41 (just above 0.40) with the warning pre-set; it must survive serialization.
   892|      1|        let resp = make_full_response(
   893|      1|            Some("depends_on".to_string()),
   894|      1|            Some(0.41),
   895|      1|            None,
   896|      1|            Some("relation 'depends_on' dominates graph at 41.0%".to_string()),
   897|       |        );
   898|      1|        let json = serde_json::to_value(&resp).unwrap();
   899|      1|        assert!(
   900|      1|            json["relation_concentration_warning"].is_string(),
   901|      0|            "warning must be present when top_relation_ratio > 0.40"
   902|       |        );
   903|      1|    }
  904|       |
  905|       |    #[test]
  906|      1|    fn health_applies_to_ratio_omitted_when_none() {
  907|       |        // applies_to_ratio is None when there are no applies_to edges.
  908|      1|        let resp = make_full_response(Some("related".to_string()), Some(0.30), None, None);
  909|      1|        let json = serde_json::to_value(&resp).unwrap();
  910|      1|        assert!(
  911|      1|            json.get("applies_to_ratio").is_none(),
  912|      0|            "applies_to_ratio must be omitted when None"
  913|       |        );
  914|      1|    }
  915|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/history.rs:
    1|       |//! Handler for the `history` CLI subcommand.
    2|       |
    3|       |use crate::errors::AppError;
    4|       |use crate::i18n::errors_msg;
    5|       |use crate::output;
    6|       |use crate::paths::AppPaths;
    7|       |use crate::storage::connection::open_ro;
    8|       |use rusqlite::params;
    9|       |use rusqlite::OptionalExtension;
   10|       |use serde::Serialize;
   11|       |
   12|       |#[derive(clap::Args)]
   13|       |#[command(after_long_help = "EXAMPLES:\n  \
   14|       |    # List all versions of a memory (positional form)\n  \
   15|       |    sqlite-graphrag history onboarding\n\n  \
   16|       |    # List versions using the named flag form\n  \
   17|       |    sqlite-graphrag history --name onboarding\n\n  \
   18|       |    # Omit body content to reduce response size\n  \
   19|       |    sqlite-graphrag history onboarding --no-body\n\n  \
   20|       |    # Include character-level change summary between versions\n  \
   21|       |    sqlite-graphrag history onboarding --diff\n\n\
   22|       |DIFF OUTPUT:\n  \
   23|       |    When --diff is active, each version (except the first) includes a `changes`\n  \
   24|       |    object with `added_chars` and `removed_chars` — the character count difference\n  \
   25|       |    between that version and its predecessor.")]
   26|       |pub struct HistoryArgs {
   27|       |    /// Memory name as a positional argument. Alternative to `--name`.
   28|       |    #[arg(
   29|       |        value_name = "NAME",
   30|       |        conflicts_with = "name",
   31|       |        help = "Memory name whose version history to return; alternative to --name"
   32|       |    )]
   33|       |    pub name_positional: Option<String>,
   34|       |    /// Memory name whose version history will be returned. Includes soft-deleted memories
   35|       |    /// so that `restore --version <V>` workflow remains discoverable after `forget`.
   36|       |    #[arg(long)]
   37|       |    pub name: Option<String>,
   38|       |    #[arg(
   39|       |        long,
   40|       |        help = "Namespace (env: SQLITE_GRAPHRAG_NAMESPACE, default: global)"
   41|       |    )]
   42|       |    pub namespace: Option<String>,
   43|       |    /// Omit body content from each version to reduce response size.
   44|       |    #[arg(
   45|       |        long,
   46|       |        default_value_t = false,
   47|       |        help = "Omit body content from response"
   48|       |    )]
   49|       |    pub no_body: bool,
   50|       |    /// Include character-level change summary between consecutive versions.
   51|       |    #[arg(
   52|       |        long,
   53|       |        default_value_t = false,
   54|       |        help = "Include character-level change summary between consecutive versions"
   55|       |    )]
   56|       |    pub diff: bool,
   57|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   58|       |    pub json: bool,
   59|       |    /// Path to graphrag.sqlite (overrides SQLITE_GRAPHRAG_DB_PATH and default CWD).
   60|       |    #[arg(
   61|       |        long,
   62|       |        env = "SQLITE_GRAPHRAG_DB_PATH",
   63|       |        help = "Path to graphrag.sqlite"
   64|       |    )]
   65|       |    pub db: Option<String>,
   66|       |}
   67|       |
   68|       |/// Character-level change summary between two consecutive versions.
   69|       |#[derive(Serialize)]
   70|       |struct VersionChanges {
   71|       |    added_chars: usize,
   72|       |    removed_chars: usize,
   73|       |}
   74|       |
    75|       |#[derive(Serialize)]
    76|       |struct HistoryVersion {
    77|       |    version: i64,
    78|       |    name: String,
    79|       |    #[serde(rename = "type")]
    80|       |    memory_type: String,
    81|       |    description: String,
    82|       |    #[serde(skip_serializing_if = "Option::is_none")]
    83|       |    body: Option<String>,
    84|       |    metadata: serde_json::Value,
    85|       |    /// Past-tense action label derived from `change_reason`; always populated
    86|       |    /// so consumers do not see `null` for the documented `action` contract
    87|       |    /// (M-A6 fix in v1.0.40). Known mappings: `create→created`, `edit→edited`,
    88|       |    /// `update→updated`, `rename→renamed`, `restore→restored`, `merge→merged`,
    89|       |    /// `forget→forgotten`. Unknown verbs are passed through unchanged.
    90|       |    action: String,
    91|       |    change_reason: String,
    92|       |    changed_by: Option<String>,
    93|       |    created_at: i64,
    94|       |    created_at_iso: String,
    95|       |    #[serde(skip_serializing_if = "Option::is_none")]
    96|       |    pub changes: Option<VersionChanges>,
    97|       |}
   98|       |
   99|       |/// Maps the raw `change_reason` stored in `memory_versions` to the past-tense
  100|       |/// `action` exposed in the JSON contract. Centralized so future call sites
  101|       |/// (e.g. `read --include-history`) reuse the same mapping.
  102|      7|fn change_reason_to_action(reason: &str) -> String {
  103|      7|    match reason {
  104|      7|        "create" => "created",
                                  ^1
  105|      6|        "edit" => "edited",
                                ^1
  106|      5|        "update" => "updated",
                                  ^0
  107|      5|        "rename" => "renamed",
                                  ^1
  108|      4|        "restore" => "restored",
                                   ^1
  109|      3|        "merge" => "merged",
                                 ^1
  110|      2|        "forget" => "forgotten",
                                  ^1
  111|      1|        other => other,
  112|       |    }
  113|      7|    .to_string()
  114|      7|}
  115|       |
  116|       |#[derive(Serialize)]
  117|       |struct HistoryResponse {
  118|       |    name: String,
  119|       |    namespace: String,
  120|       |    /// True when the memory is currently soft-deleted (forgotten).
  121|       |    /// Allows the user to discover the version for `restore` even after `forget`.
  122|       |    deleted: bool,
  123|       |    versions: Vec<HistoryVersion>,
  124|       |    /// Total execution time in milliseconds from handler start to serialisation.
  125|       |    elapsed_ms: u64,
  126|       |}
  127|       |
   128|      0|pub fn run(args: HistoryArgs) -> Result<(), AppError> {
   129|      0|    let start = std::time::Instant::now();
   130|       |    // Resolve name from positional or --name flag; both are optional, at least one is required.
   131|      0|    let name = args.name_positional.or(args.name).ok_or_else(|| {
   132|      0|        AppError::Validation("name required: pass as positional argument or via --name".to_string())
   133|      0|    })?;
   134|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
   135|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
   136|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
   137|      0|    let conn = open_ro(&paths.db)?;
   138|       |
   139|       |    // v1.0.22 P0: direct query WITHOUT deleted_at filter — history MUST return versions
   140|       |    // of forgotten memories so the user can discover the version to use in `restore`.
   141|       |    // The old find_by_name filtered deleted_at IS NULL and was a dead-end in the forget+restore workflow.
   142|      0|    let row: Option<(i64, Option<i64>)> = conn
   143|      0|        .query_row(
   144|      0|            "SELECT id, deleted_at FROM memories WHERE namespace = ?1 AND name = ?2",
   145|      0|            params![namespace, name],
   146|      0|            |r| Ok((r.get(0)?, r.get(1)?)),
   147|       |        )
   148|      0|        .optional()?;
   149|      0|    let (memory_id, deleted_at) =
   150|      0|        row.ok_or_else(|| AppError::NotFound(errors_msg::memory_not_found(&name, &namespace)))?;
   151|      0|    let deleted = deleted_at.is_some();
   152|       |
   153|      0|    let mut stmt = conn.prepare_cached(
   154|      0|        "SELECT version, name, type, description, body, metadata,
   155|      0|                change_reason, changed_by, created_at
   156|      0|         FROM memory_versions
   157|      0|         WHERE memory_id = ?1
   158|      0|         ORDER BY version ASC",
   159|      0|    )?;
   160|       |
   161|      0|    let no_body = args.no_body;
   162|      0|    let want_diff = args.diff;
   163|      0|    let mut versions = stmt
   164|      0|        .query_map(params![memory_id], |r| {
   165|      0|            let created_at: i64 = r.get(8)?;
   166|      0|            let created_at_iso = crate::tz::epoch_to_iso(created_at);
   167|      0|            let body_str: String = r.get(4)?;
   168|      0|            let metadata_str: String = r.get(5)?;
   169|      0|            let metadata_value: serde_json::Value = serde_json::from_str(&metadata_str)
   170|      0|                .unwrap_or(serde_json::Value::Object(serde_json::Map::new()));
   171|      0|            let change_reason: String = r.get(6)?;
   172|      0|            let action = change_reason_to_action(&change_reason);
   173|       |            Ok(HistoryVersion {
   174|      0|                version: r.get(0)?,
   175|      0|                name: r.get(1)?,
   176|      0|                memory_type: r.get(2)?,
   177|      0|                description: r.get(3)?,
   178|      0|                body: if no_body { None } else { Some(body_str) },
   179|      0|                metadata: metadata_value,
   180|      0|                action,
   181|      0|                change_reason,
   182|      0|                changed_by: r.get(7)?,
   183|      0|                created_at,
   184|      0|                created_at_iso,
   185|      0|                changes: None,
   186|       |            })
   187|      0|        })?
   188|      0|        .collect::<Result<Vec<_>, _>>()?;
   189|       |    // NOTE(review): with --no-body every `body` is None, so all lengths below are 0 and --diff reports 0/0.
   190|      0|    if want_diff && !versions.is_empty() {
   191|      0|        let body_lens: Vec<usize> = versions
   192|      0|            .iter()
   193|      0|            .map(|v| v.body.as_deref().map_or(0, str::len))
   194|      0|            .collect();
   195|       |        // The first version is measured against an empty body (the --help text says it gets no `changes`).
   196|      0|        versions[0].changes = Some(VersionChanges {
   197|      0|            added_chars: body_lens[0],
   198|      0|            removed_chars: 0,
   199|      0|        });
   200|       |        // NOTE(review): lengths are `str::len` byte counts, and only the net length delta is reported.
   201|      0|        for i in 1..versions.len() {
   202|      0|            let old_len = body_lens[i - 1];
   203|      0|            let new_len = body_lens[i];
   204|      0|            versions[i].changes = Some(VersionChanges {
   205|      0|                added_chars: new_len.saturating_sub(old_len),
   206|      0|                removed_chars: old_len.saturating_sub(new_len),
   207|      0|            });
   208|      0|        }
   209|      0|    }
   210|       |
   211|      0|    output::emit_json(&HistoryResponse {
   212|      0|        name,
   213|      0|        namespace,
   214|      0|        deleted,
   215|      0|        versions,
   216|      0|        elapsed_ms: start.elapsed().as_millis() as u64,
   217|      0|    })?;
   218|       |
   219|      0|    Ok(())
   220|      0|}
  221|       |
  222|       |#[cfg(test)]
  223|       |mod tests {
  224|       |    use super::{change_reason_to_action, VersionChanges};
  225|       |
   226|       |    // VersionChanges/diff arithmetic tests come first; the Bug M-A6 tests (action is always populated and maps known reasons to past tense) follow below.
  227|       |    #[test]
  228|      1|    fn version_changes_serializes_correctly() {
  229|      1|        let changes = VersionChanges {
  230|      1|            added_chars: 10,
  231|      1|            removed_chars: 3,
  232|      1|        };
  233|      1|        let json = serde_json::to_value(&changes).expect("serialization failed");
  234|      1|        assert_eq!(json["added_chars"], 10u64);
  235|      1|        assert_eq!(json["removed_chars"], 3u64);
  236|      1|    }
  237|       |
  238|       |    #[test]
  239|      1|    fn added_chars_saturating_sub_no_underflow() {
  240|       |        // new body shorter than old — added_chars must be 0, not wrapping
  241|      1|        let old_len: usize = 100;
  242|      1|        let new_len: usize = 40;
  243|      1|        let added = new_len.saturating_sub(old_len);
  244|      1|        let removed = old_len.saturating_sub(new_len);
  245|      1|        assert_eq!(added, 0);
  246|      1|        assert_eq!(removed, 60);
  247|      1|    }
  248|       |
  249|       |    #[test]
  250|      1|    fn removed_chars_saturating_sub_no_underflow() {
  251|       |        // new body longer than old — removed_chars must be 0
  252|      1|        let old_len: usize = 20;
  253|      1|        let new_len: usize = 80;
  254|      1|        let added = new_len.saturating_sub(old_len);
  255|      1|        let removed = old_len.saturating_sub(new_len);
  256|      1|        assert_eq!(added, 60);
  257|      1|        assert_eq!(removed, 0);
  258|      1|    }
  259|       |
  260|       |    #[test]
  261|      1|    fn change_reason_create_maps_to_created() {
  262|      1|        assert_eq!(change_reason_to_action("create"), "created");
  263|      1|    }
  264|       |
  265|       |    #[test]
  266|      1|    fn change_reason_edit_maps_to_edited() {
  267|      1|        assert_eq!(change_reason_to_action("edit"), "edited");
  268|      1|    }
  269|       |
  270|       |    #[test]
  271|      1|    fn change_reason_rename_maps_to_renamed() {
  272|      1|        assert_eq!(change_reason_to_action("rename"), "renamed");
  273|      1|    }
  274|       |
  275|       |    #[test]
  276|      1|    fn change_reason_restore_maps_to_restored() {
  277|      1|        assert_eq!(change_reason_to_action("restore"), "restored");
  278|      1|    }
  279|       |
  280|       |    #[test]
  281|      1|    fn change_reason_merge_maps_to_merged() {
  282|      1|        assert_eq!(change_reason_to_action("merge"), "merged");
  283|      1|    }
  284|       |
  285|       |    #[test]
  286|      1|    fn change_reason_forget_maps_to_forgotten() {
  287|      1|        assert_eq!(change_reason_to_action("forget"), "forgotten");
  288|      1|    }
  289|       |
  290|       |    #[test]
  291|      1|    fn change_reason_unknown_passes_through() {
  292|      1|        assert_eq!(change_reason_to_action("custom-action"), "custom-action");
  293|      1|    }
  294|       |
  295|       |    #[test]
  296|      1|    fn epoch_zero_yields_valid_iso() {
  297|       |        // v1.0.68 (test fix): timezone-agnostic — parse the ISO and compare
  298|       |        // the instant with the Unix epoch.  The previous starts_with check
  299|       |        // leaked the SQLITE_GRAPHRAG_DISPLAY_TZ env var from sibling tests
  300|       |        // and failed on hosts whose default display timezone is not UTC.
  301|      1|        let iso = crate::tz::epoch_to_iso(0);
  302|      1|        let parsed = chrono::DateTime::parse_from_rfc3339(&iso)
  303|      1|            .unwrap_or_else(|e| panic!("expected RFC3339, got `{iso}`: {e}"));
                                                     ^0
  304|      1|        assert_eq!(
  305|      1|            parsed.timestamp(),
  306|      1|            chrono::DateTime::UNIX_EPOCH.timestamp(),
  307|      0|            "epoch 0 must map to the Unix epoch instant, got: {iso}"
  308|       |        );
  309|      1|    }
  310|       |
  311|       |    #[test]
  312|      1|    fn typical_epoch_yields_iso_rfc3339() {
  313|      1|        let iso = crate::tz::epoch_to_iso(1_745_000_000);
  314|      1|        assert!(!iso.is_empty(), "created_at_iso must not be empty");
                                               ^0
  315|      1|        assert!(iso.contains('T'), "created_at_iso must contain T separator");
                                                 ^0
   316|       |        // With UTC the offset is +00:00; tz-agnostic format check. NOTE(review): the date's own '-' already satisfies it — confirm intent.
  317|      1|        assert!(
  318|      1|            iso.contains('+') || iso.contains('-'),
  319|      0|            "must contain offset sign, got: {iso}"
  320|       |        );
  321|      1|    }
  322|       |
  323|       |    #[test]
  324|      1|    fn invalid_epoch_returns_fallback() {
  325|      1|        let iso = crate::tz::epoch_to_iso(i64::MIN);
  326|      1|        assert!(
  327|      1|            !iso.is_empty(),
  328|      0|            "invalid epoch must return non-empty fallback"
  329|       |        );
  330|      1|    }
  331|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/hybrid_search.rs:
    1|       |//! Handler for the `hybrid-search` CLI subcommand.
    2|       |
    3|       |use crate::cli::MemoryType;
    4|       |use crate::errors::AppError;
    5|       |use crate::graph::traverse_from_memories_with_hops;
    6|       |use crate::output::{self, JsonOutputFormat, RecallItem};
    7|       |use crate::paths::AppPaths;
    8|       |use crate::storage::connection::open_ro;
    9|       |use crate::storage::entities;
   10|       |use crate::storage::memories;
   11|       |
   12|       |use std::collections::HashMap;
   13|       |
   14|       |/// Arguments for the `hybrid-search` subcommand.
   15|       |///
   16|       |/// When `--namespace` is omitted the search runs against the `global` namespace,
   17|       |/// which is the default namespace used by `remember` when no `--namespace` flag
   18|       |/// is provided. Pass an explicit `--namespace` value to search a different
   19|       |/// isolated namespace.
   20|       |#[derive(clap::Args)]
   21|       |#[command(after_long_help = "EXAMPLES:\n  \
   22|       |    # Basic hybrid search combining FTS5 + vector via RRF\n  \
   23|       |    sqlite-graphrag hybrid-search \"postgres migration deadlock\" --k 10\n\n  \
   24|       |    # Tune RRF weights to favor keyword matches over semantic similarity\n  \
   25|       |    sqlite-graphrag hybrid-search \"jwt auth\" --weight-fts 1.5 --weight-vec 0.5 --k 5\n\n  \
   26|       |    # Add graph traversal matches (entities connected to top results)\n  \
   27|       |    sqlite-graphrag hybrid-search \"frontend architecture\" --with-graph --k 10\n\n  \
   28|       |    # Graph traversal with custom depth and minimum edge weight\n  \
   29|       |    sqlite-graphrag hybrid-search \"auth design\" --with-graph --max-hops 3 --min-weight 0.5 --k 10\n\n  \
   30|       |NOTES:\n  \
   31|       |    --with-graph enables entity graph traversal seeded by the top RRF results.\n  \
   32|       |    Graph matches appear in the `graph_matches` array (separate from `results`).\n  \
   33|       |    Without --with-graph, `graph_matches` is always empty.")]
   34|       |pub struct HybridSearchArgs {
   35|       |    #[arg(
   36|       |        allow_hyphen_values = true,
   37|       |        help = "Hybrid search query (vector KNN + FTS5 BM25 fused via RRF)"
   38|       |    )]
   39|       |    pub query: String,
   40|       |    /// Maximum number of fused results to return after RRF combines vector + FTS5 candidates.
   41|       |    ///
   42|       |    /// Validated to the inclusive range `1..=4096` (the upper bound matches `sqlite-vec`'s knn
   43|       |    /// limit). Each underlying search fetches `k * 2` candidates before fusion.
   44|       |    #[arg(short = 'k', long, aliases = ["limit", "top-k"], default_value = "10", value_parser = crate::parsers::parse_k_range)]
   45|       |    pub k: usize,
   46|       |    #[arg(long, default_value = "60")]
   47|       |    pub rrf_k: u32,
   48|       |    #[arg(long, default_value = "1.0")]
   49|       |    pub weight_vec: f32,
   50|       |    #[arg(long, default_value = "1.0")]
   51|       |    pub weight_fts: f32,
   52|       |    /// Filter by memory.type. Note: distinct from graph entity_type
   53|       |    /// (project/tool/person/file/concept/incident/decision/memory/dashboard/issue_tracker/organization/location/date)
   54|       |    /// used in --entities-file.
   55|       |    #[arg(long, value_enum)]
   56|       |    pub r#type: Option<MemoryType>,
   57|       |    #[arg(long)]
   58|       |    pub namespace: Option<String>,
   59|       |    #[arg(long)]
   60|       |    pub with_graph: bool,
   61|       |    #[arg(long, default_value = "2")]
   62|       |    pub max_hops: u32,
   63|       |    #[arg(long, default_value = "0.3")]
   64|       |    pub min_weight: f64,
   65|       |    #[arg(long, value_enum, default_value_t = JsonOutputFormat::Json)]
   66|       |    pub format: JsonOutputFormat,
   67|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   68|       |    pub db: Option<String>,
   69|       |    /// Accept `--json` as a no-op because output is already JSON by default.
   70|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   71|       |    pub json: bool,
   72|       |    #[command(flatten)]
   73|       |    pub daemon: crate::cli::DaemonOpts,
   74|       |}
   75|       |
   76|       |#[derive(serde::Serialize)]
   77|       |pub struct HybridSearchItem {
   78|       |    pub memory_id: i64,
   79|       |    pub name: String,
   80|       |    pub namespace: String,
   81|       |    #[serde(rename = "type")]
   82|       |    pub memory_type: String,
   83|       |    pub description: String,
   84|       |    pub body: String,
   85|       |    pub snippet: String,
   86|       |    pub combined_score: f64,
   87|       |    /// Alias of `combined_score` for the documented contract in SKILL.md.
   88|       |    pub score: f64,
   89|       |    /// Source of the match: always "hybrid" (RRF of vec + fts). Added in v2.0.1.
   90|       |    pub source: String,
   91|       |    #[serde(skip_serializing_if = "Option::is_none")]
   92|       |    pub vec_rank: Option<usize>,
   93|       |    #[serde(skip_serializing_if = "Option::is_none")]
   94|       |    pub fts_rank: Option<usize>,
   95|       |    /// Combined RRF score — explicit alias of `combined_score` for integration contracts.
   96|       |    #[serde(skip_serializing_if = "Option::is_none")]
   97|       |    pub rrf_score: Option<f64>,
   98|       |    /// RRF score normalized to [0.0, 1.0] for cross-method comparability.
   99|       |    pub normalized_score: f64,
  100|       |    /// Raw KNN distance from the vector index (lower = more similar).
  101|       |    ///
  102|       |    /// Present when the result came from the vector search path; `None` when the
  103|       |    /// result appeared only in the FTS5 results and was not ranked by the KNN index.
  104|       |    #[serde(skip_serializing_if = "Option::is_none")]
  105|       |    pub vec_distance: Option<f64>,
  106|       |    /// Raw BM25 score from the FTS5 index. Currently always `None`; reserved for
  107|       |    /// a future release when the FTS5 BM25 score is exposed by the storage layer.
  108|       |    #[serde(skip_serializing_if = "Option::is_none")]
  109|       |    pub fts_bm25: Option<f64>,
  110|       |}
  111|       |
  112|       |/// RRF weights used in hybrid search: vec (vector) and fts (text).
  113|       |#[derive(serde::Serialize)]
  114|       |pub struct Weights {
  115|       |    pub vec: f32,
  116|       |    pub fts: f32,
  117|       |}
  118|       |
  119|       |#[derive(serde::Serialize)]
  120|       |pub struct HybridSearchResponse {
  121|       |    pub query: String,
  122|       |    pub k: usize,
  123|       |    /// RRF k parameter used in the combined ranking.
  124|       |    pub rrf_k: u32,
  125|       |    /// Weights applied to vec and fts sources in the RRF fusion.
  126|       |    pub weights: Weights,
  127|       |    pub results: Vec<HybridSearchItem>,
  128|       |    pub graph_matches: Vec<RecallItem>,
  129|       |    /// True when FTS5 failed and the response is vec-only.
  130|       |    ///
  131|       |    /// Omitted from JSON when `false` to keep the happy-path envelope clean.
  132|       |    #[serde(skip_serializing_if = "std::ops::Not::not")]
  133|       |    pub fts_degraded: bool,
  134|       |    /// Human-readable description of the FTS5 failure when `fts_degraded` is true.
  135|       |    ///
  136|       |    /// Omitted from JSON when `None`.
  137|       |    #[serde(skip_serializing_if = "Option::is_none")]
  138|       |    pub fts_error: Option<String>,
  139|       |    /// True when the FTS5 index was corrupted and successfully auto-rebuilt during this request.
  140|       |    ///
  141|       |    /// Omitted from JSON when `false` to keep the happy-path envelope clean.
  142|       |    #[serde(skip_serializing_if = "std::ops::Not::not")]
  143|       |    pub fts_auto_rebuilt: bool,
  144|       |    /// Total execution time in milliseconds from handler start to serialisation.
  145|       |    pub elapsed_ms: u64,
  146|       |}
  147|       |
  148|       |#[tracing::instrument(skip_all, level = "debug", name = "hybrid_search")]
  149|      0|pub fn run(args: HybridSearchArgs) -> Result<(), AppError> {
  150|      0|    let start = std::time::Instant::now();
  151|      0|    let _ = args.format;
  152|      0|    tracing::debug!(target: "hybrid_search", query = %args.query, k = args.k, "fusing results");
  153|       |
  154|       |    // G20: reject graph-specific flags when --with-graph is not active
  155|      0|    if !args.with_graph {
  156|      0|        if args.max_hops != 2 {
  157|      0|            return Err(AppError::Validation(
  158|      0|                "--max-hops requires --with-graph to be active".to_string(),
  159|      0|            ));
  160|      0|        }
  161|      0|        if (args.min_weight - 0.3).abs() > f64::EPSILON {
  162|      0|            return Err(AppError::Validation(
  163|      0|                "--min-weight requires --with-graph to be active".to_string(),
  164|      0|            ));
  165|      0|        }
  166|      0|    }
  167|       |
  168|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
  169|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
  170|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
  171|       |
  172|      0|    output::emit_progress_i18n(
  173|      0|        "Computing query embedding...",
  174|      0|        "Calculando embedding da consulta...",
  175|       |    );
  176|      0|    let embedding = crate::daemon::embed_query_or_local(
  177|      0|        &paths.models,
  178|      0|        &args.query,
  179|      0|        args.daemon.autostart_daemon,
  180|      0|    )?;
  181|       |
  182|      0|    let conn = open_ro(&paths.db)?;
  183|       |
  184|      0|    let memory_type_str = args.r#type.map(|t| t.as_str());
  185|       |
  186|      0|    let vec_results = memories::knn_search(
  187|      0|        &conn,
  188|      0|        &embedding,
  189|      0|        &[namespace.clone()],
  190|      0|        memory_type_str,
  191|      0|        args.k * 2,
  192|      0|    )?;
  193|       |
  194|       |    // Map vector ranking position by memory_id (1-indexed per schema)
  195|      0|    let vec_rank_map: HashMap<i64, usize> = vec_results
  196|      0|        .iter()
  197|      0|        .enumerate()
  198|      0|        .map(|(pos, (id, _))| (*id, pos + 1))
  199|      0|        .collect();
  200|       |
  201|       |    // Map raw KNN distance by memory_id for GAP-30: vec_distance field.
  202|      0|    let vec_distance_map: HashMap<i64, f64> = vec_results
  203|      0|        .iter()
  204|      0|        .map(|(id, dist)| (*id, *dist as f64))
  205|      0|        .collect();
  206|       |
  207|      0|    let (fts_results, fts_degraded, fts_error, fts_auto_rebuilt) = if args.weight_fts == 0.0 {
  208|      0|        (vec![], false, None, false)
  209|       |    } else {
  210|      0|        match memories::fts_search(&conn, &args.query, &namespace, memory_type_str, args.k * 2) {
  211|      0|            Ok(r) => (r, false, None, false),
  212|      0|            Err(e) => {
  213|      0|                let err_msg = e.to_string();
  214|      0|                let is_malformed = err_msg.contains("malformed") || err_msg.contains("corrupt");
  215|      0|                if is_malformed {
  216|      0|                    tracing::warn!(target: "hybrid_search", "FTS5 index corrupted, attempting auto-rebuild");
  217|      0|                    if conn
  218|      0|                        .execute_batch("INSERT INTO fts_memories(fts_memories) VALUES('rebuild');")
  219|      0|                        .is_ok()
  220|       |                    {
  221|      0|                        match memories::fts_search(
  222|      0|                            &conn,
  223|      0|                            &args.query,
  224|      0|                            &namespace,
  225|      0|                            memory_type_str,
  226|      0|                            args.k * 2,
  227|      0|                        ) {
  228|      0|                            Ok(r) => (r, false, None, true),
  229|      0|                            Err(e2) => {
  230|      0|                                tracing::error!(target: "hybrid_search", error = %e2, "FTS5 auto-rebuild failed to recover");
  231|      0|                                (vec![], true, Some(e2.to_string()), true)
  232|       |                            }
  233|       |                        }
  234|       |                    } else {
  235|      0|                        (vec![], true, Some(err_msg), false)
  236|       |                    }
  237|       |                } else {
  238|      0|                    tracing::warn!(target: "hybrid_search", error = %e, "FTS5 query failed, falling back to vec-only");
  239|      0|                    (vec![], true, Some(err_msg), false)
  240|       |                }
  241|       |            }
  242|       |        }
  243|       |    };
  244|       |
  245|       |    // Map FTS ranking position by memory_id (1-indexed per schema)
  246|      0|    let fts_rank_map: HashMap<i64, usize> = fts_results
  247|      0|        .iter()
  248|      0|        .enumerate()
  249|      0|        .map(|(pos, row)| (row.id, pos + 1))
  250|      0|        .collect();
  251|       |
  252|      0|    let rrf_k = args.rrf_k as f64;
  253|       |
  254|       |    // Accumulate combined RRF scores
  255|      0|    let mut combined_scores: crate::hash::AHashMap<i64, f64> =
  256|      0|        crate::hash::AHashMap::with_capacity_and_hasher(
  257|      0|            vec_results.len() + fts_results.len(),
  258|      0|            Default::default(),
  259|       |        );
  260|       |
  261|      0|    for (rank, (memory_id, _)) in vec_results.iter().enumerate() {
  262|      0|        let score = args.weight_vec as f64 * (1.0 / (rrf_k + rank as f64 + 1.0));
  263|      0|        *combined_scores.entry(*memory_id).or_insert(0.0) += score;
  264|      0|    }
  265|       |
  266|      0|    for (rank, row) in fts_results.iter().enumerate() {
  267|      0|        let score = args.weight_fts as f64 * (1.0 / (rrf_k + rank as f64 + 1.0));
  268|      0|        *combined_scores.entry(row.id).or_insert(0.0) += score;
  269|      0|    }
  270|       |
  271|       |    // Sort by score descending and take the top-k
  272|      0|    let mut ranked: Vec<(i64, f64)> = combined_scores.into_iter().collect();
  273|      0|    ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
  274|      0|    ranked.truncate(args.k);
  275|       |
  276|       |    // Collect all IDs for batch fetch (avoiding N+1)
  277|      0|    let top_ids: Vec<i64> = ranked.iter().map(|(id, _)| *id).collect();
  278|       |
  279|       |    // Fetch full data for the top memories
  280|      0|    let mut memory_data: crate::hash::AHashMap<i64, memories::MemoryRow> =
  281|      0|        crate::hash::AHashMap::with_capacity_and_hasher(ranked.len(), Default::default());
  282|      0|    for id in &top_ids {
  283|      0|        if let Some(row) = memories::read_full(&conn, *id)? {
  284|      0|            memory_data.insert(*id, row);
  285|      0|        }
  286|       |    }
  287|       |
  288|      0|    let max_possible = args.weight_vec as f64 * (1.0 / (rrf_k + 1.0))
  289|      0|        + args.weight_fts as f64 * (1.0 / (rrf_k + 1.0));
  290|       |
  291|       |    // Build final results in ranking order
  292|      0|    let results: Vec<HybridSearchItem> = ranked
  293|      0|        .into_iter()
  294|      0|        .filter_map(|(memory_id, combined_score)| {
  295|      0|            let normalized_score = if max_possible > 0.0 {
  296|      0|                combined_score / max_possible
  297|       |            } else {
  298|      0|                0.0
  299|       |            };
  300|      0|            memory_data.remove(&memory_id).map(|row| {
  301|      0|                let snippet: String = row.body.chars().take(300).collect();
  302|      0|                HybridSearchItem {
  303|      0|                    memory_id: row.id,
  304|      0|                    name: row.name,
  305|      0|                    namespace: row.namespace,
  306|      0|                    memory_type: row.memory_type,
  307|      0|                    description: row.description,
  308|      0|                    body: row.body,
  309|      0|                    snippet,
  310|      0|                    combined_score,
  311|      0|                    score: combined_score,
  312|      0|                    source: "hybrid".to_string(),
  313|      0|                    vec_rank: vec_rank_map.get(&memory_id).copied(),
  314|      0|                    fts_rank: fts_rank_map.get(&memory_id).copied(),
  315|      0|                    rrf_score: Some(combined_score),
  316|      0|                    normalized_score,
  317|      0|                    vec_distance: vec_distance_map.get(&memory_id).copied(),
  318|      0|                    fts_bm25: None,
  319|      0|                }
  320|      0|            })
  321|      0|        })
  322|      0|        .collect();
  323|       |
  324|       |    // --- Graph traversal (activated by --with-graph) ---
  325|      0|    let mut graph_matches: Vec<RecallItem> = Vec::with_capacity(8);
  326|      0|    if args.with_graph && !results.is_empty() {
  327|      0|        let namespace_for_graph = namespace.clone();
  328|      0|        let memory_ids: Vec<i64> = results.iter().map(|r| r.memory_id).collect();
  329|       |
  330|      0|        let entity_knn = entities::knn_search(&conn, &embedding, &namespace_for_graph, 5)?;
  331|      0|        let entity_ids: Vec<i64> = entity_knn.iter().map(|(id, _)| *id).collect();
  332|       |
  333|      0|        let all_seed_ids: Vec<i64> = memory_ids
  334|      0|            .iter()
  335|      0|            .chain(entity_ids.iter())
  336|      0|            .copied()
  337|      0|            .collect();
  338|       |
  339|      0|        if !all_seed_ids.is_empty() {
  340|      0|            let graph_memory_ids = traverse_from_memories_with_hops(
  341|      0|                &conn,
  342|      0|                &all_seed_ids,
  343|      0|                &namespace_for_graph,
  344|      0|                args.min_weight,
  345|      0|                args.max_hops,
  346|      0|            )?;
  347|       |
  348|      0|            let already_in_results: std::collections::HashSet<i64> =
  349|      0|                results.iter().map(|r| r.memory_id).collect();
  350|       |
  351|      0|            for (graph_mem_id, hop) in graph_memory_ids {
  352|      0|                if already_in_results.contains(&graph_mem_id) {
  353|      0|                    continue;
  354|      0|                }
  355|      0|                if let Some(row) = memories::read_full(&conn, graph_mem_id)? {
  356|      0|                    let snippet: String = row.body.chars().take(300).collect();
  357|      0|                    let graph_distance = 1.0 - 1.0 / (hop as f32 + 1.0);
  358|      0|                    graph_matches.push(RecallItem {
  359|      0|                        memory_id: row.id,
  360|      0|                        name: row.name,
  361|      0|                        namespace: row.namespace,
  362|      0|                        memory_type: row.memory_type,
  363|      0|                        description: row.description,
  364|      0|                        snippet,
  365|      0|                        distance: graph_distance,
  366|      0|                        score: RecallItem::score_from_distance(graph_distance),
  367|      0|                        source: "graph".to_string(),
  368|      0|                        graph_depth: Some(hop),
  369|      0|                    });
  370|      0|                }
  371|       |            }
  372|      0|        }
  373|      0|    }
  374|       |
  375|      0|    output::emit_json(&HybridSearchResponse {
  376|      0|        query: args.query,
  377|      0|        k: args.k,
  378|      0|        rrf_k: args.rrf_k,
  379|      0|        weights: Weights {
  380|      0|            vec: args.weight_vec,
  381|      0|            fts: args.weight_fts,
  382|      0|        },
  383|      0|        results,
  384|      0|        graph_matches,
  385|      0|        fts_degraded,
  386|      0|        fts_error,
  387|      0|        fts_auto_rebuilt,
  388|      0|        elapsed_ms: start.elapsed().as_millis() as u64,
  389|      0|    })?;
  390|       |
  391|      0|    Ok(())
  392|      0|}
  393|       |
  394|       |#[cfg(test)]
  395|       |mod tests {
  396|       |    use super::*;
  397|       |
  398|      6|    fn empty_response(
  399|      6|        k: usize,
  400|      6|        rrf_k: u32,
  401|      6|        weight_vec: f32,
  402|      6|        weight_fts: f32,
  403|      6|    ) -> HybridSearchResponse {
  404|      6|        HybridSearchResponse {
  405|      6|            query: "test query".to_string(),
  406|      6|            k,
  407|      6|            rrf_k,
  408|      6|            weights: Weights {
  409|      6|                vec: weight_vec,
  410|      6|                fts: weight_fts,
  411|      6|            },
  412|      6|            results: vec![],
  413|      6|            graph_matches: vec![],
  414|      6|            fts_degraded: false,
  415|      6|            fts_error: None,
  416|      6|            fts_auto_rebuilt: false,
  417|      6|            elapsed_ms: 0,
  418|      6|        }
  419|      6|    }
  420|       |
  421|       |    #[test]
  422|      1|    fn hybrid_search_response_empty_serializes_correct_fields() {
  423|      1|        let resp = empty_response(10, 60, 1.0, 1.0);
  424|      1|        let json = serde_json::to_string(&resp).unwrap();
  425|      1|        assert!(json.contains("\"results\""), "must contain results field");
                                                            ^0
  426|      1|        assert!(json.contains("\"query\""), "must contain query field");
                                                          ^0
  427|      1|        assert!(json.contains("\"k\""), "must contain k field");
                                                      ^0
  428|      1|        assert!(
  429|      1|            json.contains("\"graph_matches\""),
  430|      0|            "must contain graph_matches field"
  431|       |        );
  432|      1|        assert!(
  433|      1|            !json.contains("\"combined_rank\""),
  434|      0|            "must not contain combined_rank"
  435|       |        );
  436|      1|        assert!(
  437|      1|            !json.contains("\"vec_rank_list\""),
  438|      0|            "must not contain vec_rank_list"
  439|       |        );
  440|      1|        assert!(
  441|      1|            !json.contains("\"fts_rank_list\""),
  442|      0|            "must not contain fts_rank_list"
  443|       |        );
  444|      1|    }
  445|       |
  446|       |    #[test]
  447|      1|    fn hybrid_search_response_serializes_rrf_k_and_weights() {
  448|      1|        let resp = empty_response(5, 60, 0.7, 0.3);
  449|      1|        let json = serde_json::to_string(&resp).unwrap();
  450|      1|        assert!(json.contains("\"rrf_k\""), "must contain rrf_k field");
                                                          ^0
  451|      1|        assert!(json.contains("\"weights\""), "must contain weights field");
                                                            ^0
  452|      1|        assert!(json.contains("\"vec\""), "must contain weights.vec field");
                                                        ^0
  453|      1|        assert!(json.contains("\"fts\""), "must contain weights.fts field");
                                                        ^0
  454|      1|    }
  455|       |
  456|       |    #[test]
  457|      1|    fn hybrid_search_response_serializes_elapsed_ms() {
  458|      1|        let mut resp = empty_response(5, 60, 1.0, 1.0);
  459|      1|        resp.elapsed_ms = 123;
  460|      1|        let json = serde_json::to_string(&resp).unwrap();
  461|      1|        assert!(
  462|      1|            json.contains("\"elapsed_ms\""),
  463|      0|            "must contain elapsed_ms field"
  464|       |        );
  465|      1|        assert!(json.contains("123"), "deve serializar valor de elapsed_ms");
                                                    ^0
  466|      1|    }
  467|       |
  468|       |    #[test]
  469|      1|    fn weights_struct_serializes_correctly() {
  470|      1|        let w = Weights { vec: 0.6, fts: 0.4 };
  471|      1|        let json = serde_json::to_string(&w).unwrap();
  472|      1|        assert!(json.contains("\"vec\""));
  473|      1|        assert!(json.contains("\"fts\""));
  474|      1|    }
  475|       |
  476|       |    #[test]
  477|      1|    fn hybrid_search_item_omits_fts_rank_when_none() {
  478|      1|        let item = HybridSearchItem {
  479|      1|            memory_id: 1,
  480|      1|            name: "mem".to_string(),
  481|      1|            namespace: "default".to_string(),
  482|      1|            memory_type: "user".to_string(),
  483|      1|            description: "desc".to_string(),
  484|      1|            body: "content".to_string(),
  485|      1|            snippet: "content".to_string(),
  486|      1|            combined_score: 0.0328,
  487|      1|            score: 0.0328,
  488|      1|            source: "hybrid".to_string(),
  489|      1|            vec_rank: Some(1),
  490|      1|            fts_rank: None,
  491|      1|            rrf_score: Some(0.0328),
  492|      1|            normalized_score: 1.0,
  493|      1|            vec_distance: Some(0.12),
  494|      1|            fts_bm25: None,
  495|      1|        };
  496|      1|        let json = serde_json::to_string(&item).unwrap();
  497|      1|        assert!(
  498|      1|            json.contains("\"vec_rank\""),
  499|      0|            "must contain vec_rank when Some"
  500|       |        );
  501|      1|        assert!(
  502|      1|            !json.contains("\"fts_rank\""),
  503|      0|            "must not contain fts_rank when None"
  504|       |        );
  505|      1|    }
  506|       |
  507|       |    #[test]
  508|      1|    fn hybrid_search_item_omits_vec_rank_when_none() {
  509|      1|        let item = HybridSearchItem {
  510|      1|            memory_id: 2,
  511|      1|            name: "mem2".to_string(),
  512|      1|            namespace: "default".to_string(),
  513|      1|            memory_type: "fact".to_string(),
  514|      1|            description: "desc2".to_string(),
  515|      1|            body: "corpo2".to_string(),
  516|      1|            snippet: "corpo2".to_string(),
  517|      1|            combined_score: 0.016,
  518|      1|            score: 0.016,
  519|      1|            source: "hybrid".to_string(),
  520|      1|            vec_rank: None,
  521|      1|            fts_rank: Some(2),
  522|      1|            rrf_score: Some(0.016),
  523|      1|            normalized_score: 0.5,
  524|      1|            vec_distance: None,
  525|      1|            fts_bm25: None,
  526|      1|        };
  527|      1|        let json = serde_json::to_string(&item).unwrap();
  528|      1|        assert!(
  529|      1|            !json.contains("\"vec_rank\""),
  530|      0|            "must not contain vec_rank when None"
  531|       |        );
  532|      1|        assert!(
  533|      1|            json.contains("\"fts_rank\""),
  534|      0|            "must contain fts_rank when Some"
  535|       |        );
  536|      1|    }
  537|       |
  538|       |    #[test]
  539|      1|    fn hybrid_search_item_serializes_both_ranks_when_some() {
  540|      1|        let item = HybridSearchItem {
  541|      1|            memory_id: 3,
  542|      1|            name: "mem3".to_string(),
  543|      1|            namespace: "ns".to_string(),
  544|      1|            memory_type: "entity".to_string(),
  545|      1|            description: "desc3".to_string(),
  546|      1|            body: "corpo3".to_string(),
  547|      1|            snippet: "corpo3".to_string(),
  548|      1|            combined_score: 0.05,
  549|      1|            score: 0.05,
  550|      1|            source: "hybrid".to_string(),
  551|      1|            vec_rank: Some(3),
  552|      1|            fts_rank: Some(1),
  553|      1|            rrf_score: Some(0.05),
  554|      1|            normalized_score: 0.8,
  555|      1|            vec_distance: Some(0.25),
  556|      1|            fts_bm25: None,
  557|      1|        };
  558|      1|        let json = serde_json::to_string(&item).unwrap();
  559|      1|        assert!(json.contains("\"vec_rank\""), "must contain vec_rank");
                                                             ^0
  560|      1|        assert!(json.contains("\"fts_rank\""), "must contain fts_rank");
                                                             ^0
  561|      1|        assert!(json.contains("\"type\""), "deve serializar type renomeado");
                                                         ^0
  562|      1|        assert!(!json.contains("memory_type"), "must not expose memory_type");
                                                             ^0
  563|      1|    }
  564|       |
  565|       |    #[test]
  566|      1|    fn hybrid_search_response_serializes_k_correctly() {
  567|      1|        let resp = empty_response(5, 60, 1.0, 1.0);
  568|      1|        let json = serde_json::to_string(&resp).unwrap();
  569|      1|        assert!(json.contains("\"k\":5"), "deve serializar k=5");
                                                        ^0
  570|      1|    }
  571|       |
  572|       |    #[test]
  573|      1|    fn hybrid_search_response_with_graph_matches() {
  574|       |        use crate::output::RecallItem;
  575|      1|        let resp = HybridSearchResponse {
  576|      1|            query: "test".to_string(),
  577|      1|            k: 5,
  578|      1|            rrf_k: 60,
  579|      1|            weights: Weights { vec: 1.0, fts: 1.0 },
  580|      1|            results: vec![],
  581|      1|            graph_matches: vec![RecallItem {
  582|      1|                memory_id: 1,
  583|      1|                name: "graph-hit".to_string(),
  584|      1|                namespace: "global".to_string(),
  585|      1|                memory_type: "document".to_string(),
  586|      1|                description: "found via graph".to_string(),
  587|      1|                snippet: "graph content".to_string(),
  588|      1|                distance: 0.1,
  589|      1|                score: 0.9,
  590|      1|                source: "graph".to_string(),
  591|      1|                graph_depth: Some(1),
  592|      1|            }],
  593|      1|            fts_degraded: false,
  594|      1|            fts_error: None,
  595|      1|            fts_auto_rebuilt: false,
  596|      1|            elapsed_ms: 42,
  597|      1|        };
  598|      1|        let json = serde_json::to_value(&resp).unwrap();
  599|      1|        assert_eq!(json["graph_matches"].as_array().unwrap().len(), 1);
  600|      1|        assert_eq!(json["graph_matches"][0]["source"], "graph");
  601|      1|        assert_eq!(json["graph_matches"][0]["graph_depth"], 1);
  602|      1|    }
  603|       |
  604|       |    #[test]
  605|      1|    fn fts_degraded_omitted_on_success_present_on_failure() {
  606|       |        // Happy path: fts_degraded=false must be absent from JSON (skip_serializing_if).
  607|      1|        let ok_resp = empty_response(5, 60, 1.0, 1.0);
  608|      1|        let ok_json = serde_json::to_string(&ok_resp).unwrap();
  609|      1|        assert!(
  610|      1|            !ok_json.contains("\"fts_degraded\""),
  611|      0|            "fts_degraded must be absent when false"
  612|       |        );
  613|      1|        assert!(
  614|      1|            !ok_json.contains("\"fts_error\""),
  615|      0|            "fts_error must be absent when None"
  616|       |        );
  617|       |
  618|       |        // Degraded path: fts_degraded=true and fts_error=Some must appear in JSON.
  619|      1|        let mut degraded_resp = empty_response(5, 60, 1.0, 1.0);
  620|      1|        degraded_resp.fts_degraded = true;
  621|      1|        degraded_resp.fts_error = Some("FTS5 table corrupted".to_string());
  622|      1|        let degraded_json = serde_json::to_string(&degraded_resp).unwrap();
  623|      1|        assert!(
  624|      1|            degraded_json.contains("\"fts_degraded\":true"),
  625|      0|            "fts_degraded must be present and true when degraded"
  626|       |        );
  627|      1|        assert!(
  628|      1|            degraded_json.contains("\"fts_error\""),
  629|      0|            "fts_error must be present when Some"
  630|       |        );
  631|      1|        assert!(
  632|      1|            degraded_json.contains("FTS5 table corrupted"),
  633|      0|            "fts_error must contain the error message"
  634|       |        );
  635|      1|    }
  636|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/ingest.rs:
    1|       |//! Handler for the `ingest` CLI subcommand.
    2|       |//!
    3|       |//! Bulk-ingests every file under a directory that matches a glob pattern.
    4|       |//! Each matched file is persisted as a separate memory using the same
    5|       |//! validation, chunking, embedding and persistence pipeline as `remember`,
    6|       |//! but executed in-process so the ONNX model is loaded only once per
    7|       |//! invocation. This is the v1.0.32 Onda 4B (finding A2) refactor that
    8|       |//! replaced a fork-spawn-per-file pipeline (every file paid the ~17s ONNX
    9|       |//! cold-start cost) with an in-process loop reusing the warm embedder
   10|       |//! (daemon when available, in-process `Embedder::new` otherwise).
   11|       |//!
   12|       |//! Memory names are derived from file basenames (kebab-case, lowercase,
   13|       |//! ASCII alphanumerics + hyphens). Output is line-delimited JSON: one
   14|       |//! object per processed file (success or error), followed by a final
   15|       |//! summary object. Designed for streaming consumption by agents.
   16|       |//!
   17|       |//! ## Incremental pipeline (v1.0.43)
   18|       |//!
   19|       |//! Phase A runs on a rayon thread pool (size = `--ingest-parallelism`):
   20|       |//! read + chunk + embed + NER per file. Results are sent immediately via a
   21|       |//! bounded `mpsc::sync_channel` to Phase B so persistence starts as soon
   22|       |//! as the first file completes — no waiting for all files to finish Phase A.
   23|       |//!
   24|       |//! Phase B runs on the main thread: receives staged files from the channel,
   25|       |//! writes to SQLite per-file (WAL absorbs individual commits), and emits
   26|       |//! NDJSON progress events to stderr as each file is persisted. `Connection`
   27|       |//! is not `Sync` so it never crosses thread boundaries.
   28|       |//!
   29|       |//! This fixes B1: with the old 2-phase design, a 50-file corpus with 27s/file
   30|       |//! NER would spend ~22min in Phase A alone, exceeding the user's 900s timeout
   31|       |//! before Phase B (and any DB writes) could begin. With this pipeline, the
   32|       |//! first file is committed within seconds of starting.
   33|       |
   34|       |use crate::chunking;
   35|       |use crate::cli::MemoryType;
   36|       |use crate::entity_type::EntityType;
   37|       |use crate::errors::AppError;
   38|       |use crate::i18n::errors_msg;
   39|       |use crate::output::{self, JsonOutputFormat};
   40|       |use crate::paths::AppPaths;
   41|       |use crate::storage::chunks as storage_chunks;
   42|       |use crate::storage::connection::{ensure_db_ready, open_rw};
   43|       |use crate::storage::entities::{NewEntity, NewRelationship};
   44|       |use crate::storage::memories::NewMemory;
   45|       |use crate::storage::{entities, memories, urls as storage_urls, versions};
   46|       |use rayon::prelude::*;
   47|       |use rusqlite::Connection;
   48|       |use serde::Serialize;
   49|       |use std::collections::BTreeSet;
   50|       |use std::path::{Path, PathBuf};
   51|       |use std::sync::mpsc;
   52|       |use unicode_normalization::UnicodeNormalization;
   53|       |
   54|       |use crate::constants::DERIVED_NAME_MAX_LEN;
   55|       |
   56|       |/// Hard cap on the numeric suffix appended for collision resolution. If 1000
   57|       |/// candidates collide we surface an error rather than loop forever.
   58|       |const MAX_NAME_COLLISION_SUFFIX: usize = 1000;
   59|       |
   60|       |#[derive(clap::Args)]
   61|       |#[command(after_long_help = "EXAMPLES:\n  \
   62|       |    # Ingest every Markdown file under ./docs as `document` memories\n  \
   63|       |    sqlite-graphrag ingest ./docs --type document\n\n  \
   64|       |    # Ingest .txt files recursively under ./notes\n  \
   65|       |    sqlite-graphrag ingest ./notes --type note --pattern '*.txt' --recursive\n\n  \
   66|       |    # Enable GLiNER NER extraction (disabled by default, slower)\n  \
   67|       |    sqlite-graphrag ingest ./big-corpus --type reference --enable-ner\n\n  \
   68|       |    # Preview file-to-name mapping without ingesting\n  \
   69|       |    sqlite-graphrag ingest ./docs --dry-run\n\n  \
   70|       |    # LLM-curated extraction via Claude Code CLI\n  \
   71|       |    sqlite-graphrag ingest ./docs --mode claude-code --recursive --json\n\n  \
   72|       |    # Resume interrupted claude-code ingest\n  \
   73|       |    sqlite-graphrag ingest ./docs --mode claude-code --resume --json\n\n  \
   74|       |    # Claude Code with budget cap and custom timeout\n  \
   75|       |    sqlite-graphrag ingest ./docs --mode claude-code --max-cost-usd 5.00 --claude-timeout 600 --json\n\n  \
   76|       |AUTHENTICATION:\n  \
   77|       |    --mode claude-code: Uses existing Claude Code authentication.\n  \
   78|       |      OAuth (Pro/Max/Team): works automatically from ~/.claude/.credentials.json\n  \
   79|       |      API key: set ANTHROPIC_API_KEY for faster startup (optional)\n\n  \
   80|       |    --mode codex: Uses existing Codex CLI authentication.\n  \
   81|       |      Device auth: run `codex auth login` first\n  \
   82|       |      API key: set OPENAI_API_KEY (optional)\n\n  \
   83|       |NOTES:\n  \
   84|       |    Each file becomes a separate memory. Names derive from file basenames\n  \
   85|       |    (kebab-case, lowercase, ASCII). Output is NDJSON: one JSON object per file,\n  \
   86|       |    followed by a final summary line with counts. Per-file errors are reported\n  \
   87|       |    inline and processing continues unless --fail-fast is set.")]
   88|       |pub struct IngestArgs {
   89|       |    /// Directory containing files to ingest.
   90|       |    #[arg(
   91|       |        value_name = "DIR",
   92|       |        help = "Directory to ingest recursively (each matching file becomes a memory)"
   93|       |    )]
   94|       |    pub dir: PathBuf,
   95|       |
   96|       |    /// Memory type stored in `memories.type` for every ingested file. Defaults to `document`.
   97|       |    #[arg(long, value_enum, default_value_t = MemoryType::Document)]
   98|       |    pub r#type: MemoryType,
   99|       |
  100|       |    /// Glob pattern matched against file basenames (default: `*.md`). Supports
  101|       |    /// `*.<ext>`, `<prefix>*`, and exact filename match.
  102|       |    #[arg(long, default_value = "*.md")]
  103|       |    pub pattern: String,
  104|       |
  105|       |    /// Recurse into subdirectories.
  106|       |    #[arg(long, default_value_t = false)]
  107|       |    pub recursive: bool,
  108|       |
  109|       |    #[arg(
  110|       |        long,
  111|       |        env = "SQLITE_GRAPHRAG_ENABLE_NER",
  112|       |        value_parser = crate::parsers::parse_bool_flexible,
  113|       |        action = clap::ArgAction::Set,
  114|       |        num_args = 0..=1,
  115|       |        default_missing_value = "true",
  116|       |        default_value = "false",
  117|       |        help = "Enable automatic GLiNER NER entity/relationship extraction (disabled by default)"
  118|       |    )]
  119|       |    pub enable_ner: bool,
  120|       |    #[arg(
  121|       |        long,
  122|       |        env = "SQLITE_GRAPHRAG_GLINER_VARIANT",
  123|       |        default_value = "fp32",
  124|       |        help = "GLiNER model variant: fp32 (1.1GB, best quality), fp16 (580MB), int8 (349MB, fastest but may miss entities on short texts), q4, q4f16"
  125|       |    )]
  126|       |    pub gliner_variant: String,
  127|       |
  128|       |    /// Deprecated: NER is now disabled by default. Kept for backwards compatibility.
  129|       |    #[arg(long, default_value_t = false, hide = true)]
  130|       |    pub skip_extraction: bool,
  131|       |
  132|       |    /// Stop on first per-file error instead of continuing with the next file.
  133|       |    #[arg(long, default_value_t = false)]
  134|       |    pub fail_fast: bool,
  135|       |
  136|       |    /// Preview file-to-name mapping without loading model or persisting.
  137|       |    #[arg(long, default_value_t = false)]
  138|       |    pub dry_run: bool,
  139|       |
  140|       |    /// Maximum number of files to ingest (safety cap to prevent runaway ingestion).
  141|       |    #[arg(long, default_value_t = 10_000)]
  142|       |    pub max_files: usize,
  143|       |
  144|       |    /// Namespace for the ingested memories.
  145|       |    #[arg(long)]
  146|       |    pub namespace: Option<String>,
  147|       |
  148|       |    /// Database path. Falls back to `SQLITE_GRAPHRAG_DB_PATH`, then `./graphrag.sqlite`.
  149|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
  150|       |    pub db: Option<String>,
  151|       |
  152|       |    #[arg(long, value_enum, default_value_t = JsonOutputFormat::Json)]
  153|       |    pub format: JsonOutputFormat,
  154|       |
  155|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
  156|       |    pub json: bool,
  157|       |
  158|       |    /// Number of files to extract+embed in parallel; default = max(1, cpus/2).min(4).
  159|       |    #[arg(
  160|       |        long,
  161|       |        help = "Number of files to extract+embed in parallel; default = max(1, cpus/2).min(4)"
  162|       |    )]
  163|       |    pub ingest_parallelism: Option<usize>,
  164|       |
  165|       |    /// Force single-threaded ingest to reduce RSS pressure.
  166|       |    ///
  167|       |    /// Equivalent to `--ingest-parallelism 1`, takes precedence over any
  168|       |    /// explicit value. Recommended for environments with <4 GB available
  169|       |    /// RAM or container/cgroup constraints. Trade-off: 3-4x longer wall
  170|       |    /// time. Also honored via `SQLITE_GRAPHRAG_LOW_MEMORY=1` env var
  171|       |    /// (CLI flag has higher precedence than the env var).
  172|       |    #[arg(
  173|       |        long,
  174|       |        default_value_t = false,
  175|       |        help = "Forces single-threaded ingest (--ingest-parallelism 1) to reduce RSS pressure. \
  176|       |                Recommended for environments with <4 GB available RAM or container/cgroup \
  177|       |                constraints. Trade-off: 3-4x longer wall time. Also honored via \
  178|       |                SQLITE_GRAPHRAG_LOW_MEMORY=1 env var."
  179|       |    )]
  180|       |    pub low_memory: bool,
  181|       |
  182|       |    /// Maximum process RSS in MiB; abort if exceeded during embedding.
  183|       |    #[arg(long, default_value_t = crate::constants::DEFAULT_MAX_RSS_MB,
  184|       |          help = "Maximum process RSS in MiB; abort if exceeded during embedding (default: 8192)")]
  185|       |    pub max_rss_mb: u64,
  186|       |
  187|       |    /// Maximum character length for derived memory names from file basenames.
  188|       |    ///
  189|       |    /// Overrides the compile-time `DERIVED_NAME_MAX_LEN` constant (default 60).
  190|       |    /// Shorter values leave more headroom for collision suffix resolution.
  191|       |    #[arg(long, default_value_t = crate::constants::DERIVED_NAME_MAX_LEN,
  192|       |          help = "Maximum length for derived memory names (default: 60)")]
  193|       |    pub max_name_length: usize,
  194|       |
  195|       |    /// Extraction mode: `none` (body-only, default), `gliner` (NER), or `claude-code` (LLM-curated via Claude Code CLI).
  196|       |    #[arg(long, value_enum, default_value_t = IngestMode::None)]
  197|       |    pub mode: IngestMode,
  198|       |
  199|       |    /// Explicit path to the Claude Code binary (only with --mode claude-code).
  200|       |    #[arg(long, env = "SQLITE_GRAPHRAG_CLAUDE_BINARY")]
  201|       |    pub claude_binary: Option<std::path::PathBuf>,
  202|       |
  203|       |    /// Model override for Claude Code extraction (e.g. claude-sonnet-4-6).
  204|       |    #[arg(long)]
  205|       |    pub claude_model: Option<String>,
  206|       |
  207|       |    /// Resume a previously interrupted claude-code ingest from the queue DB.
  208|       |    #[arg(long, default_value_t = false)]
  209|       |    pub resume: bool,
  210|       |
  211|       |    /// Retry only failed files from a previous claude-code ingest.
  212|       |    #[arg(long, default_value_t = false)]
  213|       |    pub retry_failed: bool,
  214|       |
  215|       |    /// Keep the queue DB (.ingest-queue.sqlite) after completion.
  216|       |    #[arg(long, default_value_t = false)]
  217|       |    pub keep_queue: bool,
  218|       |
  219|       |    /// Custom path for the claude-code ingest queue database.
  220|       |    #[arg(long, default_value = ".ingest-queue.sqlite")]
  221|       |    pub queue_db: String,
  222|       |
  223|       |    /// Initial wait time in seconds when rate-limited (only with --mode claude-code).
  224|       |    #[arg(long, default_value_t = 60)]
  225|       |    pub rate_limit_wait: u64,
  226|       |
  227|       |    /// Maximum cumulative cost in USD before aborting (only with --mode claude-code).
  228|       |    #[arg(long)]
  229|       |    pub max_cost_usd: Option<f64>,
  230|       |
  231|       |    /// Timeout in seconds for each claude -p invocation (only with --mode claude-code).
  232|       |    #[arg(
  233|       |        long,
  234|       |        default_value_t = 300,
  235|       |        help = "Timeout in seconds for each claude -p invocation (default: 300)"
  236|       |    )]
  237|       |    pub claude_timeout: u64,
  238|       |
  239|       |    /// Explicit path to the Codex CLI binary (only with --mode codex).
  240|       |    #[arg(
  241|       |        long,
  242|       |        env = "SQLITE_GRAPHRAG_CODEX_BINARY",
  243|       |        help = "Explicit path to the Codex CLI binary (only with --mode codex)"
  244|       |    )]
  245|       |    pub codex_binary: Option<PathBuf>,
  246|       |
  247|       |    /// Model override for Codex extraction (e.g. o4-mini, gpt-5.1-codex).
  248|       |    #[arg(
  249|       |        long,
  250|       |        help = "Model override for Codex extraction (e.g. o4-mini, gpt-5.1-codex)"
  251|       |    )]
  252|       |    pub codex_model: Option<String>,
  253|       |
  254|       |    /// Timeout in seconds for each codex exec invocation.
  255|       |    #[arg(
  256|       |        long,
  257|       |        default_value_t = 300,
  258|       |        help = "Timeout in seconds for each codex exec invocation (default: 300)"
  259|       |    )]
  260|       |    pub codex_timeout: u64,
  261|       |
  262|       |    /// G30: poll for the job singleton every second for up to N seconds
  263|       |    /// when another invocation holds the lock. Default: 0 (fail fast).
  264|       |    #[arg(long, value_name = "SECONDS")]
  265|       |    pub wait_job_singleton: Option<u64>,
  266|       |
  267|       |    /// G30: force acquisition of the singleton lock by removing a stale
  268|       |    /// lock file from a previously crashed invocation.
  269|       |    #[arg(long, default_value_t = false)]
  270|       |    pub force_job_singleton: bool,
  271|       |}
  272|       |
  273|       |/// Extraction mode for the ingest pipeline.
  274|       |#[derive(Clone, Debug, PartialEq, Eq, clap::ValueEnum)]
  275|       |pub enum IngestMode {
  276|       |    /// Body-only ingestion without entity/relationship extraction (default).
  277|       |    None,
  278|       |    /// GLiNER zero-shot NER extraction (requires --enable-ner).
  279|       |    Gliner,
  280|       |    /// LLM-curated extraction via locally installed Claude Code CLI.
  281|       |    ClaudeCode,
  282|       |    /// LLM-curated extraction via locally installed OpenAI Codex CLI.
  283|       |    Codex,
  284|       |}
  285|       |
  286|       |/// Returns true when the `SQLITE_GRAPHRAG_LOW_MEMORY` env var is set to a
  287|       |/// truthy value (`1`, `true`, `yes`, `on`, case-insensitive). Empty or unset
  288|       |/// values evaluate to false. Unrecognized non-empty values emit a
  289|       |/// `tracing::warn!` and evaluate to false.
  290|     24|fn env_low_memory_enabled() -> bool {
  291|     24|    match std::env::var("SQLITE_GRAPHRAG_LOW_MEMORY") {
  292|     17|        Ok(v) if v.is_empty() => false,
                         ^1               ^1   ^1
  293|     16|        Ok(v) => match v.to_lowercase().as_str() {
  294|     16|            "1" | "true" | "yes" | "on" => true,
                                ^13      ^11     ^9      ^9
  295|      7|            "0" | "false" | "no" | "off" => false,
                                ^5        ^3     ^2       ^6
  296|      1|            other => {
  297|      1|                tracing::warn!(
  298|       |                    target: "ingest",
  299|       |                    value = %other,
  300|      0|                    "SQLITE_GRAPHRAG_LOW_MEMORY value not recognized; treating as disabled"
  301|       |                );
  302|      1|                false
  303|       |            }
  304|       |        },
  305|      7|        Err(_) => false,
  306|       |    }
  307|     24|}
  308|       |
  309|       |/// Resolves the effective ingest parallelism honoring `--low-memory` and the
  310|       |/// `SQLITE_GRAPHRAG_LOW_MEMORY` env var.
  311|       |///
  312|       |/// Precedence:
  313|       |/// 1. `--low-memory` CLI flag forces parallelism = 1.
  314|       |/// 2. `SQLITE_GRAPHRAG_LOW_MEMORY=1` env var forces parallelism = 1.
  315|       |/// 3. Explicit `--ingest-parallelism N` (when low-memory is off).
  316|       |/// 4. Default heuristic `(cpus/2).clamp(1, 4)`.
  317|       |///
  318|       |/// When low-memory wins and the user also passed `--ingest-parallelism N>1`,
  319|       |/// emits a `tracing::warn!` advertising the override.
  320|      9|fn resolve_parallelism(low_memory_flag: bool, ingest_parallelism: Option<usize>) -> usize {
  321|      9|    let env_flag = env_low_memory_enabled();
  322|      9|    let low_memory = low_memory_flag || env_flag;
                                                      ^6
  323|       |
  324|      9|    if low_memory {
  325|      5|        if let Some(n) = ingest_parallelism {
                                  ^3
  326|      3|            if n > 1 {
  327|      3|                tracing::warn!(
  328|       |                    target: "ingest",
  329|       |                    requested = n,
  330|      0|                    "--ingest-parallelism overridden by --low-memory; using 1"
  331|       |                );
  332|      0|            }
  333|      2|        }
  334|      5|        if low_memory_flag {
  335|      3|            tracing::info!(
  336|       |                target: "ingest",
  337|       |                source = "flag",
  338|      0|                "low-memory mode enabled: forcing --ingest-parallelism 1"
  339|       |            );
  340|       |        } else {
  341|      2|            tracing::info!(
  342|       |                target: "ingest",
  343|       |                source = "env",
  344|      0|                "low-memory mode enabled via SQLITE_GRAPHRAG_LOW_MEMORY: forcing --ingest-parallelism 1"
  345|       |            );
  346|       |        }
  347|      5|        return 1;
  348|      4|    }
  349|       |
  350|      4|    ingest_parallelism
  351|      4|        .unwrap_or_else(|| {
                                         ^1
  352|      1|            std::thread::available_parallelism()
  353|      1|                .map(|v| v.get() / 2)
  354|      1|                .unwrap_or(1)
  355|      1|                .clamp(1, 4)
  356|      1|        })
  357|      4|        .max(1)
  358|      9|}
  359|       |
  360|       |#[derive(Serialize)]
  361|       |struct IngestFileEvent<'a> {
  362|       |    file: &'a str,
  363|       |    name: &'a str,
  364|       |    status: &'a str,
  365|       |    /// True when the derived name was truncated to fit `DERIVED_NAME_MAX_LEN`. False otherwise.
  366|       |    truncated: bool,
  367|       |    /// Original derived name before truncation; only present when `truncated=true`.
  368|       |    #[serde(skip_serializing_if = "Option::is_none")]
  369|       |    original_name: Option<String>,
  370|       |    /// Original file basename (without extension); only present when it differs from `name`.
  371|       |    #[serde(skip_serializing_if = "Option::is_none")]
  372|       |    original_filename: Option<&'a str>,
  373|       |    #[serde(skip_serializing_if = "Option::is_none")]
  374|       |    error: Option<String>,
  375|       |    #[serde(skip_serializing_if = "Option::is_none")]
  376|       |    memory_id: Option<i64>,
  377|       |    #[serde(skip_serializing_if = "Option::is_none")]
  378|       |    action: Option<String>,
  379|       |    /// Byte length of the body ingested; 0 when not yet read (e.g. skip or dry-run events).
  380|       |    body_length: usize,
  381|       |}
  382|       |
  383|       |#[derive(Serialize)]
  384|       |struct IngestSummary {
  385|       |    summary: bool,
  386|       |    dir: String,
  387|       |    pattern: String,
  388|       |    recursive: bool,
  389|       |    files_total: usize,
  390|       |    files_succeeded: usize,
  391|       |    files_failed: usize,
  392|       |    files_skipped: usize,
  393|       |    elapsed_ms: u64,
  394|       |}
  395|       |
  396|       |/// Outcome of a successful per-file ingest, used to build the NDJSON event.
  397|       |struct FileSuccess {
  398|       |    memory_id: i64,
  399|       |    action: String,
  400|       |    body_length: usize,
  401|       |}
  402|       |
  403|       |/// NDJSON progress event emitted to stderr after each file completes Phase A.
  404|       |/// Schema version 1; consumers should check `schema_version` before parsing.
  405|       |#[derive(Serialize)]
  406|       |struct StageProgressEvent<'a> {
  407|       |    schema_version: u8,
  408|       |    event: &'a str,
  409|       |    path: &'a str,
  410|       |    ms: u64,
  411|       |    entities: usize,
  412|       |    relationships: usize,
  413|       |}
  414|       |
  415|       |/// All artefacts pre-computed by Phase A (CPU-bound, runs on rayon thread pool).
  416|       |/// Phase B persists these to SQLite on the main thread in submission order.
  417|       |struct StagedFile {
  418|       |    body: String,
  419|       |    body_hash: String,
  420|       |    snippet: String,
  421|       |    name: String,
  422|       |    description: String,
  423|       |    embedding: Vec<f32>,
  424|       |    chunk_embeddings: Option<Vec<Vec<f32>>>,
  425|       |    chunks_info: Vec<crate::chunking::Chunk>,
  426|       |    entities: Vec<NewEntity>,
  427|       |    relationships: Vec<NewRelationship>,
  428|       |    entity_embeddings: Vec<Vec<f32>>,
  429|       |    urls: Vec<crate::extraction::ExtractedUrl>,
  430|       |}
  431|       |
  432|       |/// Phase A worker: reads, chunks, embeds and extracts NER for one file.
  433|       |/// Never touches the database — safe to run on any rayon thread.
  434|      0|fn stage_file(
  435|      0|    _idx: usize,
  436|      0|    path: &Path,
  437|      0|    name: &str,
  438|      0|    paths: &AppPaths,
  439|      0|    enable_ner: bool,
  440|      0|    gliner_variant: crate::extraction::GlinerVariant,
  441|      0|    max_rss_mb: u64,
  442|      0|) -> Result<StagedFile, AppError> {
  443|       |    use crate::constants::*;
  444|       |
  445|      0|    if name.len() > MAX_MEMORY_NAME_LEN {
  446|      0|        return Err(AppError::LimitExceeded(
  447|      0|            crate::i18n::validation::name_length(MAX_MEMORY_NAME_LEN),
  448|      0|        ));
  449|      0|    }
  450|      0|    if name.starts_with("__") {
  451|      0|        return Err(AppError::Validation(
  452|      0|            crate::i18n::validation::reserved_name(),
  453|      0|        ));
  454|      0|    }
  455|       |    {
  456|      0|        let slug_re = crate::constants::name_slug_regex();
  457|      0|        if !slug_re.is_match(name) {
  458|      0|            return Err(AppError::Validation(crate::i18n::validation::name_kebab(
  459|      0|                name,
  460|      0|            )));
  461|      0|        }
  462|       |    }
  463|       |
  464|      0|    let file_size = std::fs::metadata(path).map_err(AppError::Io)?.len();
  465|      0|    if file_size > MAX_MEMORY_BODY_LEN as u64 {
  466|      0|        return Err(AppError::LimitExceeded(
  467|      0|            crate::i18n::validation::body_exceeds(MAX_MEMORY_BODY_LEN),
  468|      0|        ));
  469|      0|    }
  470|      0|    let raw_body = std::fs::read_to_string(path).map_err(AppError::Io)?;
  471|      0|    if raw_body.len() > MAX_MEMORY_BODY_LEN {
  472|      0|        return Err(AppError::LimitExceeded(
  473|      0|            crate::i18n::validation::body_exceeds(MAX_MEMORY_BODY_LEN),
  474|      0|        ));
  475|      0|    }
  476|      0|    if raw_body.trim().is_empty() {
  477|      0|        return Err(AppError::Validation(crate::i18n::validation::empty_body()));
  478|      0|    }
  479|       |
  480|      0|    let description = format!("ingested from {}", path.display());
  481|      0|    if description.len() > MAX_MEMORY_DESCRIPTION_LEN {
  482|      0|        return Err(AppError::Validation(
  483|      0|            crate::i18n::validation::description_exceeds(MAX_MEMORY_DESCRIPTION_LEN),
  484|      0|        ));
  485|      0|    }
  486|       |
  487|      0|    let mut extracted_entities: Vec<NewEntity> = Vec::with_capacity(30);
  488|      0|    let mut extracted_relationships: Vec<NewRelationship> = Vec::with_capacity(50);
  489|      0|    let mut extracted_urls: Vec<crate::extraction::ExtractedUrl> = Vec::with_capacity(4);
  490|      0|    if enable_ner {
  491|      0|        match crate::extraction::extract_graph_auto(&raw_body, paths, gliner_variant) {
  492|      0|            Ok(extracted) => {
  493|      0|                extracted_urls = extracted.urls;
  494|      0|                extracted_entities = extracted.entities;
  495|      0|                extracted_relationships = extracted.relationships;
  496|       |
  497|      0|                if extracted_entities.len() > max_entities_per_memory() {
  498|      0|                    extracted_entities.truncate(max_entities_per_memory());
  499|      0|                }
  500|      0|                if extracted_relationships.len() > max_relationships_per_memory() {
  501|      0|                    extracted_relationships.truncate(max_relationships_per_memory());
  502|      0|                }
  503|       |            }
  504|      0|            Err(e) => {
  505|      0|                tracing::warn!(
  506|       |                    target: "ingest",
  507|      0|                    file = %path.display(),
  508|      0|                    "auto-extraction failed (graceful degradation): {e:#}"
  509|       |                );
  510|       |            }
  511|       |        }
  512|      0|    }
  513|       |
  514|      0|    for rel in &mut extracted_relationships {
  515|      0|        rel.relation = crate::parsers::normalize_relation(&rel.relation);
  516|      0|        if let Err(e) = crate::parsers::validate_relation_format(&rel.relation) {
  517|      0|            return Err(AppError::Validation(format!(
  518|      0|                "{e} for relationship '{}' -> '{}'",
  519|      0|                rel.source, rel.target
  520|      0|            )));
  521|      0|        }
  522|      0|        crate::parsers::warn_if_non_canonical(&rel.relation);
  523|      0|        if !(0.0..=1.0).contains(&rel.strength) {
  524|      0|            return Err(AppError::Validation(format!(
  525|      0|                "invalid strength {} for relationship '{}' -> '{}'; expected value in [0.0, 1.0]",
  526|      0|                rel.strength, rel.source, rel.target
  527|      0|            )));
  528|      0|        }
  529|       |    }
  530|       |
  531|      0|    let body_hash = blake3::hash(raw_body.as_bytes()).to_hex().to_string();
  532|      0|    let snippet: String = raw_body.chars().take(200).collect();
  533|       |
  534|      0|    let tokenizer = crate::tokenizer::get_tokenizer(&paths.models)?;
  535|      0|    let chunks_info = chunking::split_into_chunks_hierarchical(&raw_body, tokenizer);
  536|      0|    if chunks_info.len() > REMEMBER_MAX_SAFE_MULTI_CHUNKS {
  537|      0|        return Err(AppError::LimitExceeded(format!(
  538|      0|            "document produces {} chunks; current safe operational limit is {} chunks; split the document before using remember",
  539|      0|            chunks_info.len(),
  540|      0|            REMEMBER_MAX_SAFE_MULTI_CHUNKS
  541|      0|        )));
  542|      0|    }
  543|       |
  544|      0|    let mut chunk_embeddings_opt: Option<Vec<Vec<f32>>> = None;
  545|      0|    let embedding = if chunks_info.len() == 1 {
  546|      0|        crate::daemon::embed_passage_or_local(&paths.models, &raw_body)?
  547|       |    } else {
  548|      0|        let chunk_texts: Vec<&str> = chunks_info
  549|      0|            .iter()
  550|      0|            .map(|c| chunking::chunk_text(&raw_body, c))
  551|      0|            .collect();
  552|      0|        let embed_cap = chunk_texts.len();
  553|      0|        let mut chunk_embeddings = Vec::new();
  554|      0|        chunk_embeddings.try_reserve(embed_cap).map_err(|_| {
  555|      0|            AppError::LimitExceeded(format!(
  556|      0|                "allocation of {embed_cap} chunk embeddings would exceed available memory"
  557|      0|            ))
  558|      0|        })?;
  559|      0|        for chunk_text in &chunk_texts {
  560|      0|            if let Some(rss) = crate::memory_guard::current_process_memory_mb() {
  561|      0|                if rss > max_rss_mb {
  562|      0|                    tracing::error!(
  563|       |                        target: "ingest",
  564|       |                        rss_mb = rss,
  565|       |                        max_rss_mb = max_rss_mb,
  566|      0|                        file = %path.display(),
  567|      0|                        "RSS exceeded --max-rss-mb threshold; aborting to prevent system instability"
  568|       |                    );
  569|      0|                    return Err(AppError::LowMemory {
  570|      0|                        available_mb: crate::memory_guard::available_memory_mb(),
  571|      0|                        required_mb: max_rss_mb,
  572|      0|                    });
  573|      0|                }
  574|      0|            }
  575|      0|            chunk_embeddings.push(crate::daemon::embed_passage_or_local(
  576|      0|                &paths.models,
  577|      0|                chunk_text,
  578|      0|            )?);
  579|       |        }
  580|      0|        let aggregated = chunking::aggregate_embeddings(&chunk_embeddings);
  581|      0|        chunk_embeddings_opt = Some(chunk_embeddings);
  582|      0|        aggregated
  583|       |    };
  584|       |
  585|      0|    let entity_embeddings = extracted_entities
  586|      0|        .iter()
  587|      0|        .map(|entity| {
  588|      0|            let entity_text = match &entity.description {
  589|      0|                Some(desc) => format!("{} {}", entity.name, desc),
  590|      0|                None => entity.name.clone(),
  591|       |            };
  592|      0|            crate::daemon::embed_passage_or_local(&paths.models, &entity_text)
  593|      0|        })
  594|      0|        .collect::<Result<Vec<_>, _>>()?;
  595|       |
  596|      0|    Ok(StagedFile {
  597|      0|        body: raw_body,
  598|      0|        body_hash,
  599|      0|        snippet,
  600|      0|        name: name.to_string(),
  601|      0|        description,
  602|      0|        embedding,
  603|      0|        chunk_embeddings: chunk_embeddings_opt,
  604|      0|        chunks_info,
  605|      0|        entities: extracted_entities,
  606|      0|        relationships: extracted_relationships,
  607|      0|        entity_embeddings,
  608|      0|        urls: extracted_urls,
  609|      0|    })
  610|      0|}
  611|       |
  612|       |/// Phase B: persists one `StagedFile` to the database on the main thread.
  613|      0|fn persist_staged(
  614|      0|    conn: &mut Connection,
  615|      0|    namespace: &str,
  616|      0|    memory_type: &str,
  617|      0|    staged: StagedFile,
  618|      0|) -> Result<FileSuccess, AppError> {
  619|       |    {
  620|      0|        let active_count: u32 = conn.query_row(
  621|      0|            "SELECT COUNT(DISTINCT namespace) FROM memories WHERE deleted_at IS NULL",
  622|      0|            [],
  623|      0|            |r| r.get::<_, i64>(0).map(|v| v as u32),
  624|      0|        )?;
  625|      0|        let ns_exists: bool = conn.query_row(
  626|      0|            "SELECT EXISTS(SELECT 1 FROM memories WHERE namespace = ?1 AND deleted_at IS NULL)",
  627|      0|            rusqlite::params![namespace],
  628|      0|            |r| r.get::<_, i64>(0).map(|v| v > 0),
  629|      0|        )?;
  630|      0|        if !ns_exists && active_count >= crate::constants::MAX_NAMESPACES_ACTIVE {
  631|      0|            return Err(AppError::NamespaceError(format!(
  632|      0|                "active namespace limit of {} exceeded while creating '{namespace}'",
  633|      0|                crate::constants::MAX_NAMESPACES_ACTIVE
  634|      0|            )));
  635|      0|        }
  636|       |    }
  637|       |
  638|      0|    let existing_memory = memories::find_by_name(conn, namespace, &staged.name)?;
  639|      0|    if existing_memory.is_some() {
  640|      0|        return Err(AppError::Duplicate(errors_msg::duplicate_memory(
  641|      0|            &staged.name,
  642|      0|            namespace,
  643|      0|        )));
  644|      0|    }
  645|      0|    let duplicate_hash_id = memories::find_by_hash(conn, namespace, &staged.body_hash)?;
  646|       |
  647|      0|    let new_memory = NewMemory {
  648|      0|        namespace: namespace.to_string(),
  649|      0|        name: staged.name.clone(),
  650|      0|        memory_type: memory_type.to_string(),
  651|      0|        description: staged.description.clone(),
  652|      0|        body: staged.body,
  653|      0|        body_hash: staged.body_hash,
  654|      0|        session_id: None,
  655|      0|        source: "agent".to_string(),
  656|      0|        metadata: serde_json::json!({}),
  657|      0|    };
  658|       |
  659|      0|    if let Some(hash_id) = duplicate_hash_id {
  660|      0|        tracing::debug!(
  661|       |            target: "ingest",
  662|       |            duplicate_memory_id = hash_id,
  663|      0|            "identical body already exists; persisting a new memory anyway"
  664|       |        );
  665|      0|    }
  666|       |
  667|      0|    let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
  668|       |
  669|      0|    let memory_id = memories::insert(&tx, &new_memory)?;
  670|      0|    versions::insert_version(
  671|      0|        &tx,
  672|      0|        memory_id,
  673|       |        1,
  674|      0|        &staged.name,
  675|      0|        memory_type,
  676|      0|        &staged.description,
  677|      0|        &new_memory.body,
  678|      0|        &serde_json::to_string(&new_memory.metadata)?,
  679|      0|        None,
  680|      0|        "create",
  681|      0|    )?;
  682|      0|    memories::upsert_vec(
  683|      0|        &tx,
  684|      0|        memory_id,
  685|      0|        namespace,
  686|      0|        memory_type,
  687|      0|        &staged.embedding,
  688|      0|        &staged.name,
  689|      0|        &staged.snippet,
  690|      0|    )?;
  691|       |
  692|      0|    if staged.chunks_info.len() > 1 {
  693|      0|        storage_chunks::insert_chunk_slices(&tx, memory_id, &new_memory.body, &staged.chunks_info)?;
  694|      0|        let chunk_embeddings = staged.chunk_embeddings.ok_or_else(|| {
  695|      0|            AppError::Internal(anyhow::anyhow!(
  696|      0|                "missing chunk embeddings cache on multi-chunk ingest path"
  697|      0|            ))
  698|      0|        })?;
  699|      0|        for (i, emb) in chunk_embeddings.iter().enumerate() {
  700|      0|            storage_chunks::upsert_chunk_vec(&tx, i as i64, memory_id, i as i32, emb)?;
  701|       |        }
  702|      0|    }
  703|       |
  704|      0|    if !staged.entities.is_empty() || !staged.relationships.is_empty() {
  705|      0|        for (idx, entity) in staged.entities.iter().enumerate() {
  706|      0|            let entity_id = entities::upsert_entity(&tx, namespace, entity)?;
  707|      0|            let entity_embedding = &staged.entity_embeddings[idx];
  708|      0|            entities::upsert_entity_vec(
  709|      0|                &tx,
  710|      0|                entity_id,
  711|      0|                namespace,
  712|      0|                entity.entity_type,
  713|      0|                entity_embedding,
  714|      0|                &entity.name,
  715|      0|            )?;
  716|      0|            entities::link_memory_entity(&tx, memory_id, entity_id)?;
  717|      0|            entities::increment_degree(&tx, entity_id)?;
  718|       |        }
  719|      0|        let entity_types: std::collections::HashMap<&str, EntityType> = staged
  720|      0|            .entities
  721|      0|            .iter()
  722|      0|            .map(|entity| (entity.name.as_str(), entity.entity_type))
  723|      0|            .collect();
  724|      0|        for rel in &staged.relationships {
  725|      0|            let source_entity = NewEntity {
  726|      0|                name: rel.source.clone(),
  727|      0|                entity_type: entity_types
  728|      0|                    .get(rel.source.as_str())
  729|      0|                    .copied()
  730|      0|                    .unwrap_or(EntityType::Concept),
  731|      0|                description: None,
  732|      0|            };
  733|      0|            let target_entity = NewEntity {
  734|      0|                name: rel.target.clone(),
  735|      0|                entity_type: entity_types
  736|      0|                    .get(rel.target.as_str())
  737|      0|                    .copied()
  738|      0|                    .unwrap_or(EntityType::Concept),
  739|      0|                description: None,
  740|      0|            };
  741|      0|            let source_id = entities::upsert_entity(&tx, namespace, &source_entity)?;
  742|      0|            let target_id = entities::upsert_entity(&tx, namespace, &target_entity)?;
  743|      0|            let rel_id = entities::upsert_relationship(&tx, namespace, source_id, target_id, rel)?;
  744|      0|            entities::link_memory_relationship(&tx, memory_id, rel_id)?;
  745|       |        }
  746|      0|    }
  747|       |
  748|      0|    tx.commit()?;
  749|       |
  750|      0|    if !staged.urls.is_empty() {
  751|      0|        let url_entries: Vec<storage_urls::MemoryUrl> = staged
  752|      0|            .urls
  753|      0|            .into_iter()
  754|      0|            .map(|u| storage_urls::MemoryUrl {
  755|      0|                url: u.url,
  756|      0|                offset: Some(u.offset as i64),
  757|      0|            })
  758|      0|            .collect();
  759|      0|        let _ = storage_urls::insert_urls(conn, memory_id, &url_entries);
  760|      0|    }
  761|       |
  762|      0|    Ok(FileSuccess {
  763|      0|        memory_id,
  764|      0|        action: "created".to_string(),
  765|      0|        body_length: new_memory.body.len(),
  766|      0|    })
  767|      0|}
  768|       |
  769|       |#[tracing::instrument(skip_all, level = "debug", name = "ingest")]
  770|      0|pub fn run(args: IngestArgs) -> Result<(), AppError> {
  771|       |    // TODO(G20): add mode-conditional flag validation before DB access.
  772|       |    // Flags that are silently discarded when the wrong mode is active:
  773|       |    //   --mode none/gliner:   claude_binary, claude_model, claude_timeout,
  774|       |    //                         max_cost_usd, rate_limit_wait, resume,
  775|       |    //                         retry_failed, keep_queue, queue_db
  776|       |    //   --mode none/gliner:   codex_binary, codex_model, codex_timeout
  777|       |    //   --mode claude-code:   codex_binary, codex_model, codex_timeout
  778|       |    //   --mode codex:         claude_binary, claude_model, claude_timeout,
  779|       |    //                         max_cost_usd, rate_limit_wait
  780|       |    //   --mode none:          gliner_variant (only meaningful with --enable-ner
  781|       |    //                         or --mode gliner)
  782|       |    // Approach: after the mode dispatch block below, check each non-default
  783|       |    // flag value and return Err(AppError::Validation(...)) for mismatches.
  784|      0|    tracing::debug!(target: "ingest", dir = %args.dir.display(), mode = ?args.mode, "starting ingest");
  785|      0|    if args.mode == IngestMode::ClaudeCode {
  786|      0|        return super::ingest_claude::run_claude_ingest(&args);
  787|      0|    }
  788|      0|    if args.mode == IngestMode::Codex {
  789|      0|        return super::ingest_codex::run_codex_ingest(&args);
  790|      0|    }
  791|       |
  792|      0|    let started = std::time::Instant::now();
  793|       |
  794|      0|    if !args.dir.exists() {
  795|      0|        return Err(AppError::Validation(format!(
  796|      0|            "directory not found: {}",
  797|      0|            args.dir.display()
  798|      0|        )));
  799|      0|    }
  800|      0|    if !args.dir.is_dir() {
  801|      0|        return Err(AppError::Validation(format!(
  802|      0|            "path is not a directory: {}",
  803|      0|            args.dir.display()
  804|      0|        )));
  805|      0|    }
  806|       |
  807|      0|    let mut files: Vec<PathBuf> = Vec::with_capacity(128);
  808|      0|    collect_files(&args.dir, &args.pattern, args.recursive, &mut files)?;
  809|      0|    files.sort_unstable();
  810|       |
  811|      0|    if files.len() > args.max_files {
  812|      0|        return Err(AppError::Validation(format!(
  813|      0|            "found {} files matching pattern, exceeds --max-files cap of {} (raise the cap or narrow the pattern)",
  814|      0|            files.len(),
  815|      0|            args.max_files
  816|      0|        )));
  817|      0|    }
  818|       |
  819|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
  820|      0|    let memory_type_str = args.r#type.as_str().to_string();
  821|       |
  822|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
  823|      0|    let mut conn_or_err = match init_storage(&paths) {
  824|      0|        Ok(c) => Ok(c),
  825|      0|        Err(e) => Err(format!("{e}")),
  826|       |    };
  827|       |
  828|      0|    let mut succeeded: usize = 0;
  829|      0|    let mut failed: usize = 0;
  830|      0|    let mut skipped: usize = 0;
  831|      0|    let total = files.len();
  832|       |
  833|       |    // Pre-resolve all names before parallelisation so Phase A workers see a
  834|       |    // consistent, immutable name assignment (v1.0.31 A10 contract preserved).
  835|      0|    let mut taken_names: BTreeSet<String> = BTreeSet::new();
  836|       |
  837|       |    // SlotMeta: per-slot output metadata retained on the main thread for NDJSON.
  838|       |    // ProcessItem: the data moved into the producer thread for Phase A computation.
  839|       |    // We split these so `slots_meta` (non-Send BTreeSet-dependent) stays on main
  840|       |    // thread while `process_items` (Send: only PathBuf + String) crosses the thread
  841|       |    // boundary into the rayon producer.
  842|       |    enum SlotMeta {
  843|       |        Skip {
  844|       |            file_str: String,
  845|       |            derived_base: String,
  846|       |            name_truncated: bool,
  847|       |            original_name: Option<String>,
  848|       |            original_filename: Option<String>,
  849|       |            reason: String,
  850|       |        },
  851|       |        Process {
  852|       |            file_str: String,
  853|       |            derived_name: String,
  854|       |            name_truncated: bool,
  855|       |            original_name: Option<String>,
  856|       |            original_filename: Option<String>,
  857|       |        },
  858|       |    }
  859|       |
  860|       |    struct ProcessItem {
  861|       |        idx: usize,
  862|       |        path: PathBuf,
  863|       |        file_str: String,
  864|       |        derived_name: String,
  865|       |    }
  866|       |
  867|      0|    let files_cap = files.len();
  868|      0|    let mut slots_meta: Vec<SlotMeta> = Vec::new();
  869|      0|    slots_meta.try_reserve(files_cap).map_err(|_| {
  870|      0|        AppError::LimitExceeded(format!(
  871|      0|            "allocation of {files_cap} slot metadata entries would exceed available memory"
  872|      0|        ))
  873|      0|    })?;
  874|      0|    let mut process_items: Vec<ProcessItem> = Vec::new();
  875|      0|    process_items.try_reserve(files_cap).map_err(|_| {
  876|      0|        AppError::LimitExceeded(format!(
  877|      0|            "allocation of {files_cap} process items would exceed available memory"
  878|      0|        ))
  879|      0|    })?;
  880|      0|    let mut truncations: Vec<(String, String)> = Vec::new();
  881|      0|    truncations.try_reserve(files_cap).map_err(|_| {
  882|      0|        AppError::LimitExceeded(format!(
  883|      0|            "allocation of {files_cap} truncation entries would exceed available memory"
  884|      0|        ))
  885|      0|    })?;
  886|       |
  887|      0|    let max_name_length = args.max_name_length;
  888|      0|    for path in &files {
  889|      0|        let file_str = path.to_string_lossy().into_owned();
  890|      0|        let (derived_base, name_truncated, original_name) =
  891|      0|            derive_kebab_name(path, max_name_length);
  892|      0|        let original_basename = path.file_stem().and_then(|s| s.to_str()).unwrap_or("");
  893|       |
  894|      0|        if name_truncated {
  895|      0|            if let Some(ref orig) = original_name {
  896|      0|                truncations.push((orig.clone(), derived_base.clone()));
  897|      0|            }
  898|      0|        }
  899|       |
  900|      0|        if derived_base.is_empty() {
  901|       |            // original_filename: always include when it differs from the empty derived name
  902|      0|            let orig_filename = if !original_basename.is_empty() {
  903|      0|                Some(original_basename.to_string())
  904|       |            } else {
  905|      0|                None
  906|       |            };
  907|      0|            slots_meta.push(SlotMeta::Skip {
  908|      0|                file_str,
  909|      0|                derived_base: String::new(),
  910|      0|                name_truncated: false,
  911|      0|                original_name: None,
  912|      0|                original_filename: orig_filename,
  913|      0|                reason: "could not derive a non-empty kebab-case name from filename".to_string(),
  914|      0|            });
  915|      0|            continue;
  916|      0|        }
  917|       |
  918|      0|        match unique_name(&derived_base, &taken_names) {
  919|      0|            Ok(derived_name) => {
  920|      0|                taken_names.insert(derived_name.clone());
  921|      0|                let idx = slots_meta.len();
  922|       |                // original_filename: present only when the raw basename differs from the derived name
  923|      0|                let orig_filename = if original_basename != derived_name {
  924|      0|                    Some(original_basename.to_string())
  925|       |                } else {
  926|      0|                    None
  927|       |                };
  928|      0|                process_items.push(ProcessItem {
  929|      0|                    idx,
  930|      0|                    path: path.clone(),
  931|      0|                    file_str: file_str.clone(),
  932|      0|                    derived_name: derived_name.clone(),
  933|      0|                });
  934|      0|                slots_meta.push(SlotMeta::Process {
  935|      0|                    file_str,
  936|      0|                    derived_name,
  937|      0|                    name_truncated,
  938|      0|                    original_name,
  939|      0|                    original_filename: orig_filename,
  940|      0|                });
  941|       |            }
  942|      0|            Err(e) => {
  943|      0|                let orig_filename = if original_basename != derived_base {
  944|      0|                    Some(original_basename.to_string())
  945|       |                } else {
  946|      0|                    None
  947|       |                };
  948|      0|                slots_meta.push(SlotMeta::Skip {
  949|      0|                    file_str,
  950|      0|                    derived_base,
  951|      0|                    name_truncated,
  952|      0|                    original_name,
  953|      0|                    original_filename: orig_filename,
  954|      0|                    reason: e.to_string(),
  955|      0|                });
  956|       |            }
  957|       |        }
  958|       |    }
  959|       |
  960|      0|    if !truncations.is_empty() {
  961|      0|        tracing::info!(
  962|       |            target: "ingest",
  963|      0|            count = truncations.len(),
  964|       |            max_name_length = max_name_length,
  965|       |            max_len = DERIVED_NAME_MAX_LEN,
  966|      0|            "derived names truncated; pass -vv (debug) for per-file detail"
  967|       |        );
  968|      0|    }
  969|       |
  970|       |    // --dry-run: emit preview events and exit before loading ONNX or touching DB.
  971|      0|    if args.dry_run {
  972|      0|        for meta in &slots_meta {
  973|      0|            match meta {
  974|       |                SlotMeta::Skip {
  975|      0|                    file_str,
  976|      0|                    derived_base,
  977|      0|                    name_truncated,
  978|      0|                    original_name,
  979|      0|                    original_filename,
  980|      0|                    reason,
  981|       |                } => {
  982|      0|                    output::emit_json_compact(&IngestFileEvent {
  983|      0|                        file: file_str,
  984|      0|                        name: derived_base,
  985|      0|                        status: "skip",
  986|      0|                        truncated: *name_truncated,
  987|      0|                        original_name: original_name.clone(),
  988|      0|                        original_filename: original_filename.as_deref(),
  989|      0|                        error: Some(reason.clone()),
  990|      0|                        memory_id: None,
  991|      0|                        action: None,
  992|      0|                        body_length: 0,
  993|      0|                    })?;
  994|       |                }
  995|       |                SlotMeta::Process {
  996|      0|                    file_str,
  997|      0|                    derived_name,
  998|      0|                    name_truncated,
  999|      0|                    original_name,
 1000|      0|                    original_filename,
 1001|       |                } => {
 1002|      0|                    output::emit_json_compact(&IngestFileEvent {
 1003|      0|                        file: file_str,
 1004|      0|                        name: derived_name,
 1005|      0|                        status: "preview",
 1006|      0|                        truncated: *name_truncated,
 1007|      0|                        original_name: original_name.clone(),
 1008|      0|                        original_filename: original_filename.as_deref(),
 1009|      0|                        error: None,
 1010|      0|                        memory_id: None,
 1011|      0|                        action: None,
 1012|      0|                        body_length: 0,
 1013|      0|                    })?;
 1014|       |                }
 1015|       |            }
 1016|       |        }
 1017|      0|        output::emit_json_compact(&IngestSummary {
 1018|      0|            summary: true,
 1019|      0|            dir: args.dir.to_string_lossy().into_owned(),
 1020|      0|            pattern: args.pattern.clone(),
 1021|      0|            recursive: args.recursive,
 1022|      0|            files_total: total,
 1023|      0|            files_succeeded: 0,
 1024|      0|            files_failed: 0,
 1025|      0|            files_skipped: 0,
 1026|      0|            elapsed_ms: started.elapsed().as_millis() as u64,
 1027|      0|        })?;
 1028|      0|        return Ok(());
 1029|      0|    }
 1030|       |
 1031|       |    // Reject contradictory flag combination: explicit parallelism > 1 with --low-memory.
 1032|      0|    if args.low_memory {
 1033|      0|        if let Some(n) = args.ingest_parallelism {
 1034|      0|            if n > 1 {
 1035|      0|                return Err(AppError::Validation(
 1036|      0|                    "--ingest-parallelism N>1 conflicts with --low-memory; use one or the other"
 1037|      0|                        .to_string(),
 1038|      0|                ));
 1039|      0|            }
 1040|      0|        }
 1041|      0|    }
 1042|       |
 1043|       |    // Determine rayon thread pool size, honoring --low-memory and the
 1044|       |    // SQLITE_GRAPHRAG_LOW_MEMORY env var (both force parallelism = 1).
 1045|      0|    let parallelism = resolve_parallelism(args.low_memory, args.ingest_parallelism);
 1046|       |
 1047|      0|    let pool = rayon::ThreadPoolBuilder::new()
 1048|      0|        .num_threads(parallelism)
 1049|      0|        .build()
 1050|      0|        .map_err(|e| AppError::Internal(anyhow::anyhow!("rayon pool: {e}")))?;
 1051|       |
 1052|      0|    if args.enable_ner && args.skip_extraction {
 1053|      0|        return Err(AppError::Validation(
 1054|      0|            "--enable-ner and --skip-extraction are mutually exclusive; remove one".to_string(),
 1055|      0|        ));
 1056|      0|    }
 1057|      0|    if args.skip_extraction && !args.enable_ner {
 1058|      0|        return Err(AppError::Validation(
 1059|      0|            "--skip-extraction is deprecated since v1.0.45 and has no effect; remove this flag"
 1060|      0|                .to_string(),
 1061|      0|        ));
 1062|      0|    }
 1063|      0|    let enable_ner = args.enable_ner;
 1064|      0|    let max_rss_mb = args.max_rss_mb;
 1065|      0|    let gliner_variant: crate::extraction::GlinerVariant =
 1066|      0|        args.gliner_variant.parse().unwrap_or_else(|e| {
 1067|      0|            tracing::warn!(target: "ingest", error = %e, "invalid --gliner-variant, defaulting to fp32");
 1068|      0|            crate::extraction::GlinerVariant::Fp32
 1069|      0|        });
 1070|       |
 1071|      0|    let total_to_process = process_items.len();
 1072|      0|    tracing::info!(
 1073|       |        target: "ingest",
 1074|       |        phase = "pipeline_start",
 1075|       |        files = total_to_process,
 1076|       |        ingest_parallelism = parallelism,
 1077|      0|        "incremental pipeline starting: Phase A (rayon) → channel → Phase B (main thread)",
 1078|       |    );
 1079|       |
 1080|       |    // Bounded channel: producer never gets more than parallelism*2 items ahead of
 1081|       |    // the consumer, preventing memory blowup when Phase A is faster than Phase B.
 1082|       |    // Each message carries the slot index so Phase B can look up SlotMeta in order.
 1083|      0|    let channel_bound = (parallelism * 2).max(1);
 1084|      0|    let (tx, rx) = mpsc::sync_channel::<(usize, Result<StagedFile, AppError>)>(channel_bound);
 1085|       |
 1086|       |    // Phase A: launched in a dedicated OS thread so the main thread can consume
 1087|       |    // the channel concurrently. pool.install() blocks the calling thread until
 1088|       |    // all rayon workers finish — if called on the main thread it would
 1089|       |    // reintroduce the 2-phase blocking behaviour we are eliminating.
 1090|      0|    let paths_owned = paths.clone();
 1091|      0|    let producer_handle = std::thread::spawn(move || {
 1092|      0|        pool.install(|| {
 1093|      0|            process_items.into_par_iter().for_each(|item| {
 1094|      0|                if crate::shutdown_requested() {
 1095|      0|                    return;
 1096|      0|                }
 1097|      0|                let t0 = std::time::Instant::now();
 1098|      0|                let result = stage_file(
 1099|      0|                    item.idx,
 1100|      0|                    &item.path,
 1101|      0|                    &item.derived_name,
 1102|      0|                    &paths_owned,
 1103|      0|                    enable_ner,
 1104|      0|                    gliner_variant,
 1105|      0|                    max_rss_mb,
 1106|       |                );
 1107|      0|                let elapsed_ms = t0.elapsed().as_millis() as u64;
 1108|       |
 1109|       |                // Emit NDJSON progress event to stderr so the user sees work
 1110|       |                // happening during long NER runs (e.g. 50 files × 27s each).
 1111|      0|                let (n_entities, n_relationships) = match &result {
 1112|      0|                    Ok(sf) => (sf.entities.len(), sf.relationships.len()),
 1113|      0|                    Err(_) => (0, 0),
 1114|       |                };
 1115|      0|                let progress = StageProgressEvent {
 1116|      0|                    schema_version: 1,
 1117|      0|                    event: "file_extracted",
 1118|      0|                    path: &item.file_str,
 1119|      0|                    ms: elapsed_ms,
 1120|      0|                    entities: n_entities,
 1121|      0|                    relationships: n_relationships,
 1122|      0|                };
 1123|      0|                if let Ok(line) = serde_json::to_string(&progress) {
 1124|      0|                    tracing::info!(target: "ingest_progress", "{}", line);
 1125|      0|                }
 1126|       |
 1127|       |                // Blocking send applies backpressure: if Phase B is slower,
 1128|       |                // Phase A workers wait here instead of accumulating staged files
 1129|       |                // in memory. If the receiver is dropped (fail_fast abort), ignore.
 1130|      0|                let _ = tx.send((item.idx, result));
 1131|      0|            });
 1132|       |            // Explicit drop of tx signals Phase B (rx iteration) to stop.
 1133|      0|            drop(tx);
 1134|      0|        });
 1135|      0|    });
 1136|       |
 1137|       |    // Phase B: main thread persists files as results arrive from the channel.
 1138|       |    // Results arrive in completion order (par_iter is unordered). We persist
 1139|       |    // each file immediately on arrival — this is the key fix for B1: with the
 1140|       |    // old 2-phase design the first DB write happened only after ALL files had
 1141|       |    // finished Phase A. Now the first commit happens as soon as the first file
 1142|       |    // completes Phase A, regardless of how many files remain.
 1143|       |    //
 1144|       |    // NDJSON output order follows completion order (not file-system sort order).
 1145|       |    // Skip slots are emitted at the end, after all Process results are consumed.
 1146|       |    // This trade-off is intentional: deterministic NDJSON ordering is a lesser
 1147|       |    // requirement than ensuring data is persisted before the user's timeout fires.
 1148|      0|    let fail_fast = args.fail_fast;
 1149|       |
 1150|       |    // Emit pending Skip events first so agents see them early.
 1151|      0|    for meta in &slots_meta {
 1152|       |        if let SlotMeta::Skip {
 1153|      0|            file_str,
 1154|      0|            derived_base,
 1155|      0|            name_truncated,
 1156|      0|            original_name,
 1157|      0|            original_filename,
 1158|      0|            reason,
 1159|      0|        } = meta
 1160|       |        {
 1161|      0|            output::emit_json_compact(&IngestFileEvent {
 1162|      0|                file: file_str,
 1163|      0|                name: derived_base,
 1164|      0|                status: "skipped",
 1165|      0|                truncated: *name_truncated,
 1166|      0|                original_name: original_name.clone(),
 1167|      0|                original_filename: original_filename.as_deref(),
 1168|      0|                error: Some(reason.clone()),
 1169|      0|                memory_id: None,
 1170|      0|                action: None,
 1171|      0|                body_length: 0,
 1172|      0|            })?;
 1173|      0|            skipped += 1;
 1174|      0|        }
 1175|       |    }
 1176|       |
 1177|       |    // Build a quick index from slot index → SlotMeta reference for O(1) lookups
 1178|       |    // as channel messages arrive in completion order.
 1179|      0|    let meta_index: std::collections::HashMap<usize, &SlotMeta> = slots_meta
 1180|      0|        .iter()
 1181|      0|        .enumerate()
 1182|      0|        .filter(|(_, m)| matches!(m, SlotMeta::Process { .. }))
 1183|      0|        .collect();
 1184|       |
 1185|      0|    tracing::info!(
 1186|       |        target: "ingest",
 1187|       |        phase = "persist_start",
 1188|       |        files = total_to_process,
 1189|      0|        "phase B starting: persisting files incrementally as Phase A completes each one",
 1190|       |    );
 1191|       |
 1192|       |    // Drain channel and persist each file immediately — no accumulation into a
 1193|       |    // HashMap. The bounded channel ensures Phase A cannot run too far ahead of
 1194|       |    // Phase B without applying backpressure.
 1195|      0|    for (idx, stage_result) in rx {
 1196|      0|        if crate::shutdown_requested() {
 1197|      0|            tracing::info!(target: "ingest", "shutdown requested, stopping persistence loop");
 1198|      0|            break;
 1199|      0|        }
 1200|      0|        let meta = meta_index.get(&idx).ok_or_else(|| {
 1201|      0|            AppError::Internal(anyhow::anyhow!(
 1202|      0|                "channel idx {idx} has no corresponding Process slot"
 1203|      0|            ))
 1204|      0|        })?;
 1205|      0|        let (file_str, derived_name, name_truncated, original_name, original_filename) = match meta
 1206|       |        {
 1207|       |            SlotMeta::Process {
 1208|      0|                file_str,
 1209|      0|                derived_name,
 1210|      0|                name_truncated,
 1211|      0|                original_name,
 1212|      0|                original_filename,
 1213|      0|            } => (
 1214|      0|                file_str,
 1215|      0|                derived_name,
 1216|      0|                name_truncated,
 1217|      0|                original_name,
 1218|      0|                original_filename,
 1219|      0|            ),
 1220|      0|            SlotMeta::Skip { .. } => unreachable!("channel only carries Process results"),
 1221|       |        };
 1222|       |
 1223|       |        // If storage init failed, every file fails with the same error.
 1224|      0|        let conn = match conn_or_err.as_mut() {
 1225|      0|            Ok(c) => c,
 1226|      0|            Err(err_msg) => {
 1227|      0|                let err_clone = err_msg.clone();
 1228|      0|                output::emit_json_compact(&IngestFileEvent {
 1229|      0|                    file: file_str,
 1230|      0|                    name: derived_name,
 1231|      0|                    status: "failed",
 1232|      0|                    truncated: *name_truncated,
 1233|      0|                    original_name: original_name.clone(),
 1234|      0|                    original_filename: original_filename.as_deref(),
 1235|      0|                    error: Some(err_clone.clone()),
 1236|      0|                    memory_id: None,
 1237|      0|                    action: None,
 1238|      0|                    body_length: 0,
 1239|      0|                })?;
 1240|      0|                failed += 1;
 1241|      0|                if fail_fast {
 1242|      0|                    output::emit_json_compact(&IngestSummary {
 1243|      0|                        summary: true,
 1244|      0|                        dir: args.dir.display().to_string(),
 1245|      0|                        pattern: args.pattern.clone(),
 1246|      0|                        recursive: args.recursive,
 1247|      0|                        files_total: total,
 1248|      0|                        files_succeeded: succeeded,
 1249|      0|                        files_failed: failed,
 1250|      0|                        files_skipped: skipped,
 1251|      0|                        elapsed_ms: started.elapsed().as_millis() as u64,
 1252|      0|                    })?;
 1253|      0|                    return Err(AppError::Validation(format!(
 1254|      0|                        "ingest aborted on first failure: {err_clone}"
 1255|      0|                    )));
 1256|      0|                }
 1257|      0|                continue;
 1258|       |            }
 1259|       |        };
 1260|       |
 1261|      0|        let outcome =
 1262|      0|            stage_result.and_then(|sf| persist_staged(conn, &namespace, &memory_type_str, sf));
 1263|       |
 1264|      0|        match outcome {
 1265|       |            Ok(FileSuccess {
 1266|      0|                memory_id,
 1267|      0|                action,
 1268|      0|                body_length,
 1269|       |            }) => {
 1270|      0|                output::emit_json_compact(&IngestFileEvent {
 1271|      0|                    file: file_str,
 1272|      0|                    name: derived_name,
 1273|      0|                    status: "indexed",
 1274|      0|                    truncated: *name_truncated,
 1275|      0|                    original_name: original_name.clone(),
 1276|      0|                    original_filename: original_filename.as_deref(),
 1277|      0|                    error: None,
 1278|      0|                    memory_id: Some(memory_id),
 1279|      0|                    action: Some(action),
 1280|      0|                    body_length,
 1281|      0|                })?;
 1282|      0|                succeeded += 1;
 1283|       |            }
 1284|      0|            Err(ref e) if matches!(e, AppError::Duplicate(_)) => {
 1285|      0|                output::emit_json_compact(&IngestFileEvent {
 1286|      0|                    file: file_str,
 1287|      0|                    name: derived_name,
 1288|      0|                    status: "skipped",
 1289|      0|                    truncated: *name_truncated,
 1290|      0|                    original_name: original_name.clone(),
 1291|      0|                    original_filename: original_filename.as_deref(),
 1292|      0|                    error: Some(format!("{e}")),
 1293|      0|                    memory_id: None,
 1294|      0|                    action: Some("duplicate".to_string()),
 1295|      0|                    body_length: 0,
 1296|      0|                })?;
 1297|      0|                skipped += 1;
 1298|       |            }
 1299|      0|            Err(e) => {
 1300|      0|                let err_msg = format!("{e}");
 1301|      0|                output::emit_json_compact(&IngestFileEvent {
 1302|      0|                    file: file_str,
 1303|      0|                    name: derived_name,
 1304|      0|                    status: "failed",
 1305|      0|                    truncated: *name_truncated,
 1306|      0|                    original_name: original_name.clone(),
 1307|      0|                    original_filename: original_filename.as_deref(),
 1308|      0|                    error: Some(err_msg.clone()),
 1309|      0|                    memory_id: None,
 1310|      0|                    action: None,
 1311|      0|                    body_length: 0,
 1312|      0|                })?;
 1313|      0|                failed += 1;
 1314|      0|                if fail_fast {
 1315|      0|                    output::emit_json_compact(&IngestSummary {
 1316|      0|                        summary: true,
 1317|      0|                        dir: args.dir.display().to_string(),
 1318|      0|                        pattern: args.pattern.clone(),
 1319|      0|                        recursive: args.recursive,
 1320|      0|                        files_total: total,
 1321|      0|                        files_succeeded: succeeded,
 1322|      0|                        files_failed: failed,
 1323|      0|                        files_skipped: skipped,
 1324|      0|                        elapsed_ms: started.elapsed().as_millis() as u64,
 1325|      0|                    })?;
 1326|      0|                    return Err(AppError::Validation(format!(
 1327|      0|                        "ingest aborted on first failure: {err_msg}"
 1328|      0|                    )));
 1329|      0|                }
 1330|       |            }
 1331|       |        }
 1332|       |    }
 1333|       |
 1334|       |    // Wait for the producer thread to finish cleanly.
 1335|      0|    producer_handle
 1336|      0|        .join()
 1337|      0|        .map_err(|_| AppError::Internal(anyhow::anyhow!("ingest producer thread panicked")))?;
 1338|       |
 1339|      0|    if let Ok(ref conn) = conn_or_err {
 1340|      0|        if succeeded > 0 {
 1341|      0|            let _ = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);");
 1342|      0|        }
 1343|      0|    }
 1344|       |
 1345|      0|    output::emit_json_compact(&IngestSummary {
 1346|      0|        summary: true,
 1347|      0|        dir: args.dir.display().to_string(),
 1348|      0|        pattern: args.pattern.clone(),
 1349|      0|        recursive: args.recursive,
 1350|      0|        files_total: total,
 1351|      0|        files_succeeded: succeeded,
 1352|      0|        files_failed: failed,
 1353|      0|        files_skipped: skipped,
 1354|      0|        elapsed_ms: started.elapsed().as_millis() as u64,
 1355|      0|    })?;
 1356|       |
 1357|      0|    Ok(())
 1358|      0|}
 1359|       |
 1360|       |/// Auto-initialises the database (matches the contract of every other CRUD
 1361|       |/// handler) and returns a fresh read/write connection ready for the ingest
 1362|       |/// loop. Errors here are recoverable per-file: the caller surfaces them as
 1363|       |/// failure events so `--fail-fast` and the continue-on-error path keep
 1364|       |/// working when, for example, the user points `--db` at an unwritable path.
 1365|      0|fn init_storage(paths: &AppPaths) -> Result<Connection, AppError> {
 1366|      0|    ensure_db_ready(paths)?;
 1367|      0|    let conn = open_rw(&paths.db)?;
 1368|      0|    Ok(conn)
 1369|      0|}
 1370|       |
 1371|      4|pub(crate) fn collect_files(
 1372|      4|    dir: &Path,
 1373|      4|    pattern: &str,
 1374|      4|    recursive: bool,
 1375|      4|    out: &mut Vec<PathBuf>,
 1376|      4|) -> Result<(), AppError> {
 1377|      4|    let entries = std::fs::read_dir(dir).map_err(AppError::Io)?;
                                                                            ^0
 1378|     12|    for entry in entries {
                      ^8
 1379|      8|        let entry = entry.map_err(AppError::Io)?;
                                                             ^0
 1380|      8|        let path = entry.path();
 1381|      8|        let file_type = entry.file_type().map_err(AppError::Io)?;
                                                                             ^0
 1382|      8|        if file_type.is_file() {
 1383|      6|            let name = entry.file_name();
 1384|      6|            let name_str = name.to_string_lossy();
 1385|      6|            if matches_pattern(&name_str, pattern) {
 1386|      5|                out.push(path);
 1387|      5|            }
                          ^1
 1388|      2|        } else if file_type.is_dir() && recursive {
 1389|      1|            collect_files(&path, pattern, recursive, out)?;
                                                                       ^0
 1390|      1|        }
 1391|       |    }
 1392|      4|    Ok(())
 1393|      4|}
 1394|       |
 1395|     13|fn matches_pattern(name: &str, pattern: &str) -> bool {
 1396|     13|    if let Some(suffix) = pattern.strip_prefix('*') {
                              ^9
 1397|      9|        name.ends_with(suffix)
 1398|      4|    } else if let Some(prefix) = pattern.strip_suffix('*') {
                                     ^2
 1399|      2|        name.starts_with(prefix)
 1400|       |    } else {
 1401|      2|        name == pattern
 1402|       |    }
 1403|     13|}
 1404|       |
 1405|       |/// Returns `(final_name, truncated, original_name)`.
 1406|       |/// `truncated` is true when the derived name exceeded `max_len`.
 1407|       |/// `original_name` holds the pre-truncation name only when `truncated=true`.
 1408|       |///
 1409|       |/// Non-ASCII characters are first decomposed via NFD and then stripped of
 1410|       |/// combining marks so accented letters fold to their base ASCII letter
 1411|       |/// (e.g. `acai` from accented input, `naive` from diaeresis). Characters with no ASCII
 1412|       |/// fallback (emoji, CJK ideographs, symbols) are dropped silently. This
 1413|       |/// preserves meaningful word content rather than collapsing the basename
 1414|       |/// to a few stray ASCII letters as the previous filter did.
 1415|     11|pub(crate) fn derive_kebab_name(path: &Path, max_len: usize) -> (String, bool, Option<String>) {
 1416|     11|    let stem = path.file_stem().and_then(|s| s.to_str()).unwrap_or("");
 1417|     11|    let lowered: String = stem
 1418|     11|        .nfd()
 1419|    285|        .filter(|c| !unicode_normalization::char::is_combining_mark(*c))
                       ^11
 1420|    280|        .map(|c| {
                       ^11
 1421|    280|            if c == '_' || c.is_whitespace() {
                                         ^273^273
 1422|      7|                '-'
 1423|       |            } else {
 1424|    273|                c
 1425|       |            }
 1426|    280|        })
 1427|    280|        .map(|c| c.to_ascii_lowercase())
                       ^11
 1428|    280|        .filter(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || *c == '-')
                       ^11                                  ^17^17                 ^17
 1429|     11|        .collect();
 1430|     11|    let collapsed = collapse_dashes(&lowered);
 1431|     11|    let trimmed_raw = collapsed.trim_matches('-').to_string();
 1432|       |    // Prefix names that start with a digit to keep them valid kebab-case identifiers.
 1433|     11|    let trimmed = if trimmed_raw.starts_with(|c: char| c.is_ascii_digit()) {
                                                                     ^10^10
 1434|      0|        format!("doc-{trimmed_raw}")
 1435|       |    } else {
 1436|     11|        trimmed_raw
 1437|       |    };
 1438|     11|    if trimmed.len() > max_len {
 1439|      2|        let truncated = trimmed[..max_len].trim_matches('-').to_string();
 1440|      2|        tracing::debug!(
 1441|       |            target: "ingest",
 1442|       |            original = %trimmed,
 1443|       |            truncated_to = %truncated,
 1444|       |            max_len = max_len,
 1445|      0|            "derived memory name truncated to fit length cap; collisions will be resolved with numeric suffixes"
 1446|       |        );
 1447|      2|        (truncated, true, Some(trimmed))
 1448|       |    } else {
 1449|      9|        (trimmed, false, None)
 1450|       |    }
 1451|     11|}
 1452|       |
 1453|       |/// v1.0.31 A10: returns the first non-colliding kebab name by appending a
 1454|       |/// numeric suffix (`-1`, `-2`, …) when needed.
 1455|       |///
 1456|       |/// `taken` is the set of names already consumed in the current ingest run.
 1457|       |/// The caller is expected to insert the returned name into `taken` so the
 1458|       |/// next call observes the consumption. Cross-run collisions are intentionally
 1459|       |/// surfaced by the per-file persistence path as duplicates so re-ingestion
 1460|       |/// of identical corpora stays idempotent.
 1461|       |///
 1462|       |/// Returns `Err(AppError::Validation)` after `MAX_NAME_COLLISION_SUFFIX`
 1463|       |/// candidates collide, signalling a pathological corpus that should be
 1464|       |/// renamed manually.
 1465|      3|fn unique_name(base: &str, taken: &BTreeSet<String>) -> Result<String, AppError> {
 1466|      3|    if !taken.contains(base) {
 1467|      1|        return Ok(base.to_string());
 1468|      2|    }
 1469|  1.00k|    for suffix in 1..=MAX_NAME_COLLISION_SUFFIX {
                      ^1.00k
 1470|  1.00k|        let candidate = format!("{base}-{suffix}");
 1471|  1.00k|        if !taken.contains(&candidate) {
 1472|      1|            tracing::warn!(
 1473|       |                target: "ingest",
 1474|       |                base = %base,
 1475|       |                resolved = %candidate,
 1476|       |                suffix,
 1477|      0|                "memory name collision resolved with numeric suffix"
 1478|       |            );
 1479|      1|            return Ok(candidate);
 1480|  1.00k|        }
 1481|       |    }
 1482|      1|    Err(AppError::Validation(format!(
 1483|      1|        "too many name collisions for base '{base}' (>{MAX_NAME_COLLISION_SUFFIX}); rename source files to disambiguate"
 1484|      1|    )))
 1485|      3|}
 1486|       |
 1487|     11|fn collapse_dashes(s: &str) -> String {
 1488|     11|    let mut out = String::with_capacity(s.len());
 1489|     11|    let mut prev_dash = false;
 1490|    272|    for c in s.chars() {
                           ^11^11
 1491|    272|        if c == '-' {
 1492|      9|            if !prev_dash {
 1493|      6|                out.push('-');
 1494|      6|            }
                          ^3
 1495|      9|            prev_dash = true;
 1496|    263|        } else {
 1497|    263|            out.push(c);
 1498|    263|            prev_dash = false;
 1499|    263|        }
 1500|       |    }
 1501|     11|    out
 1502|     11|}
 1503|       |
 1504|       |#[cfg(test)]
 1505|       |mod tests {
 1506|       |    use super::*;
 1507|       |    use std::path::PathBuf;
 1508|       |
 1509|       |    #[test]
 1510|      1|    fn matches_pattern_suffix() {
 1511|      1|        assert!(matches_pattern("foo.md", "*.md"));
 1512|      1|        assert!(!matches_pattern("foo.txt", "*.md"));
 1513|      1|        assert!(matches_pattern("foo.md", "*"));
 1514|      1|    }
 1515|       |
 1516|       |    #[test]
 1517|      1|    fn matches_pattern_prefix() {
 1518|      1|        assert!(matches_pattern("README.md", "README*"));
 1519|      1|        assert!(!matches_pattern("CHANGELOG.md", "README*"));
 1520|      1|    }
 1521|       |
 1522|       |    #[test]
 1523|      1|    fn matches_pattern_exact() {
 1524|      1|        assert!(matches_pattern("README.md", "README.md"));
 1525|      1|        assert!(!matches_pattern("readme.md", "README.md"));
 1526|      1|    }
 1527|       |
 1528|       |    #[test]
 1529|      1|    fn derive_kebab_underscore_to_dash() {
 1530|      1|        let p = PathBuf::from("/tmp/claude_code_headless.md");
 1531|      1|        let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
 1532|      1|        assert_eq!(name, "claude-code-headless");
 1533|      1|        assert!(!truncated);
 1534|      1|        assert!(original.is_none());
 1535|      1|    }
 1536|       |
 1537|       |    #[test]
 1538|      1|    fn derive_kebab_uppercase_lowered() {
 1539|      1|        let p = PathBuf::from("/tmp/README.md");
 1540|      1|        let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
 1541|      1|        assert_eq!(name, "readme");
 1542|      1|        assert!(!truncated);
 1543|      1|        assert!(original.is_none());
 1544|      1|    }
 1545|       |
 1546|       |    #[test]
 1547|      1|    fn derive_kebab_strips_non_kebab_chars() {
 1548|      1|        let p = PathBuf::from("/tmp/some@weird#name!.md");
 1549|      1|        let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
 1550|      1|        assert_eq!(name, "someweirdname");
 1551|      1|        assert!(!truncated);
 1552|      1|        assert!(original.is_none());
 1553|      1|    }
 1554|       |
 1555|       |    // Bug M-A3: NFD-based unicode normalization preserves base letters of
 1556|       |    // accented characters instead of dropping them entirely.
 1557|       |    #[test]
 1558|      1|    fn derive_kebab_folds_accented_letters_to_ascii() {
 1559|      1|        let p = PathBuf::from("/tmp/açaí.md");
 1560|      1|        let (name, _, _) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
 1561|      1|        assert_eq!(name, "acai", "got '{name}'");
                                               ^0
 1562|      1|    }
 1563|       |
 1564|       |    #[test]
 1565|      1|    fn derive_kebab_handles_naive_with_diaeresis() {
 1566|      1|        let p = PathBuf::from("/tmp/naïve-test.md");
 1567|      1|        let (name, _, _) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
 1568|      1|        assert_eq!(name, "naive-test", "got '{name}'");
                                                     ^0
 1569|      1|    }
 1570|       |
 1571|       |    #[test]
 1572|      1|    fn derive_kebab_drops_emoji_keeps_word() {
 1573|      1|        let p = PathBuf::from("/tmp/🚀-rocket.md");
 1574|      1|        let (name, _, _) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
 1575|      1|        assert_eq!(name, "rocket", "got '{name}'");
                                                 ^0
 1576|      1|    }
 1577|       |
 1578|       |    #[test]
 1579|      1|    fn derive_kebab_mixed_unicode_emoji_keeps_letters() {
 1580|      1|        let p = PathBuf::from("/tmp/açaí🦜.md");
 1581|      1|        let (name, _, _) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
 1582|      1|        assert_eq!(name, "acai", "got '{name}'");
                                               ^0
 1583|      1|    }
 1584|       |
 1585|       |    #[test]
 1586|      1|    fn derive_kebab_pure_emoji_yields_empty() {
 1587|      1|        let p = PathBuf::from("/tmp/🦜🚀🌟.md");
 1588|      1|        let (name, _, _) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
 1589|      1|        assert!(name.is_empty(), "got '{name}'");
                                               ^0
 1590|      1|    }
 1591|       |
 1592|       |    #[test]
 1593|      1|    fn derive_kebab_collapses_consecutive_dashes() {
 1594|      1|        let p = PathBuf::from("/tmp/a__b___c.md");
 1595|      1|        let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
 1596|      1|        assert_eq!(name, "a-b-c");
 1597|      1|        assert!(!truncated);
 1598|      1|        assert!(original.is_none());
 1599|      1|    }
 1600|       |
 1601|       |    #[test]
 1602|      1|    fn derive_kebab_truncates_to_60_chars() {
 1603|      1|        let p = PathBuf::from(format!("/tmp/{}.md", "a".repeat(80)));
 1604|      1|        let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
 1605|      1|        assert!(name.len() <= 60, "got len {}", name.len());
                                                ^0            ^0   ^0
 1606|      1|        assert!(truncated);
 1607|      1|        assert!(original.is_some());
 1608|      1|        assert!(original.unwrap().len() > 60);
 1609|      1|    }
 1610|       |
 1611|       |    #[test]
 1612|      1|    fn collect_files_finds_md_files() {
 1613|      1|        let tmp = tempfile::tempdir().expect("tempdir");
 1614|      1|        std::fs::write(tmp.path().join("a.md"), "x").unwrap();
 1615|      1|        std::fs::write(tmp.path().join("b.md"), "y").unwrap();
 1616|      1|        std::fs::write(tmp.path().join("c.txt"), "z").unwrap();
 1617|      1|        let mut out = Vec::new();
 1618|      1|        collect_files(tmp.path(), "*.md", false, &mut out).expect("collect");
 1619|      1|        assert_eq!(out.len(), 2, "should find 2 .md files, got {out:?}");
                                               ^0
 1620|      1|    }
 1621|       |
 1622|       |    #[test]
 1623|      1|    fn collect_files_recursive_descends_subdirs() {
 1624|      1|        let tmp = tempfile::tempdir().expect("tempdir");
 1625|      1|        let sub = tmp.path().join("sub");
 1626|      1|        std::fs::create_dir(&sub).unwrap();
 1627|      1|        std::fs::write(tmp.path().join("a.md"), "x").unwrap();
 1628|      1|        std::fs::write(sub.join("b.md"), "y").unwrap();
 1629|      1|        let mut out = Vec::new();
 1630|      1|        collect_files(tmp.path(), "*.md", true, &mut out).expect("collect");
 1631|      1|        assert_eq!(out.len(), 2);
 1632|      1|    }
 1633|       |
 1634|       |    #[test]
 1635|      1|    fn collect_files_non_recursive_skips_subdirs() {
 1636|      1|        let tmp = tempfile::tempdir().expect("tempdir");
 1637|      1|        let sub = tmp.path().join("sub");
 1638|      1|        std::fs::create_dir(&sub).unwrap();
 1639|      1|        std::fs::write(tmp.path().join("a.md"), "x").unwrap();
 1640|      1|        std::fs::write(sub.join("b.md"), "y").unwrap();
 1641|      1|        let mut out = Vec::new();
 1642|      1|        collect_files(tmp.path(), "*.md", false, &mut out).expect("collect");
 1643|      1|        assert_eq!(out.len(), 1);
 1644|      1|    }
 1645|       |
 1646|       |    // ── v1.0.31 A10: name truncation warns and collisions are auto-resolved ──
 1647|       |
 1648|       |    #[test]
 1649|      1|    fn derive_kebab_long_basename_truncated_within_cap() {
 1650|      1|        let p = PathBuf::from(format!("/tmp/{}.md", "a".repeat(120)));
 1651|      1|        let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
 1652|      1|        assert!(
 1653|      1|            name.len() <= DERIVED_NAME_MAX_LEN,
 1654|      0|            "truncated name must respect cap; got {} chars",
 1655|      0|            name.len()
 1656|       |        );
 1657|      1|        assert!(!name.is_empty());
 1658|      1|        assert!(truncated);
 1659|      1|        assert!(original.is_some());
 1660|      1|    }
 1661|       |
 1662|       |    #[test]
 1663|      1|    fn unique_name_returns_base_when_free() {
 1664|      1|        let taken: BTreeSet<String> = BTreeSet::new();
 1665|      1|        let resolved = unique_name("note", &taken).expect("must resolve");
 1666|      1|        assert_eq!(resolved, "note");
 1667|      1|    }
 1668|       |
 1669|       |    #[test]
 1670|      1|    fn unique_name_appends_first_free_suffix_on_collision() {
 1671|      1|        let mut taken: BTreeSet<String> = BTreeSet::new();
 1672|      1|        taken.insert("note".to_string());
 1673|      1|        taken.insert("note-1".to_string());
 1674|      1|        let resolved = unique_name("note", &taken).expect("must resolve");
 1675|      1|        assert_eq!(resolved, "note-2");
 1676|      1|    }
 1677|       |
 1678|       |    #[test]
 1679|      1|    fn unique_name_errors_after_collision_cap() {
 1680|      1|        let mut taken: BTreeSet<String> = BTreeSet::new();
 1681|      1|        taken.insert("note".to_string());
 1682|  1.00k|        for i in 1..=MAX_NAME_COLLISION_SUFFIX {
                          ^1.00k
 1683|  1.00k|            taken.insert(format!("note-{i}"));
 1684|  1.00k|        }
 1685|      1|        let err = unique_name("note", &taken).expect_err("must surface error");
 1686|      1|        assert!(matches!(err, AppError::Validation(_)));
                              ^0
 1687|      1|    }
 1688|       |
 1689|       |    // ── v1.0.32 Onda 4B: in-process pipeline validation ──
 1690|       |
 1691|       |    #[test]
 1692|      1|    fn validate_relation_format_accepts_valid_relations() {
 1693|       |        use crate::parsers::{is_canonical_relation, validate_relation_format};
 1694|      1|        assert!(validate_relation_format("applies_to").is_ok());
 1695|      1|        assert!(validate_relation_format("depends_on").is_ok());
 1696|      1|        assert!(validate_relation_format("implements").is_ok());
 1697|      1|        assert!(validate_relation_format("").is_err());
 1698|      1|        assert!(is_canonical_relation("applies_to"));
 1699|      1|        assert!(!is_canonical_relation("implements"));
 1700|      1|    }
 1701|       |
 1702|       |    // ── v1.0.40 H-A1: --low-memory flag and SQLITE_GRAPHRAG_LOW_MEMORY env var ──
 1703|       |
 1704|       |    use serial_test::serial;
 1705|       |
 1706|       |    /// Helper: scrubs the env var around a closure to keep tests deterministic.
 1707|     20|    fn with_env_var<F: FnOnce()>(value: Option<&str>, f: F) {
 1708|     20|        let key = "SQLITE_GRAPHRAG_LOW_MEMORY";
 1709|     20|        let prev = std::env::var(key).ok();
 1710|     20|        match value {
 1711|     16|            Some(v) => std::env::set_var(key, v),
 1712|      4|            None => std::env::remove_var(key),
 1713|       |        }
 1714|     20|        f();
 1715|     20|        match prev {
 1716|      0|            Some(p) => std::env::set_var(key, p),
 1717|     20|            None => std::env::remove_var(key),
 1718|       |        }
 1719|     20|    }
 1720|       |
 1721|       |    #[test]
 1722|       |    #[serial]
 1723|      1|    fn env_low_memory_enabled_unset_returns_false() {
 1724|      1|        with_env_var(None, || assert!(!env_low_memory_enabled()));
 1725|       |    }
 1726|       |
 1727|       |    #[test]
 1728|       |    #[serial]
 1729|      1|    fn env_low_memory_enabled_empty_returns_false() {
 1730|      1|        with_env_var(Some(""), || assert!(!env_low_memory_enabled()));
 1731|       |    }
 1732|       |
 1733|       |    #[test]
 1734|       |    #[serial]
 1735|      1|    fn env_low_memory_enabled_truthy_values_return_true() {
 1736|      7|        for v in ["1", "true", "TRUE", "yes", "YES", "on", "On"] {
                                     ^1      ^1      ^1     ^1     ^1    ^1
 1737|      7|            with_env_var(Some(v), || {
 1738|      7|                assert!(env_low_memory_enabled(), "value {v:?} should be truthy")
                                                                ^0
 1739|      7|            });
 1740|       |        }
 1741|       |    }
 1742|       |
 1743|       |    #[test]
 1744|       |    #[serial]
 1745|      1|    fn env_low_memory_enabled_falsy_values_return_false() {
 1746|      5|        for v in ["0", "false", "FALSE", "no", "off"] {
                                     ^1       ^1       ^1    ^1
 1747|      5|            with_env_var(Some(v), || {
 1748|      5|                assert!(!env_low_memory_enabled(), "value {v:?} should be falsy")
                                                                 ^0
 1749|      5|            });
 1750|       |        }
 1751|       |    }
 1752|       |
 1753|       |    #[test]
 1754|       |    #[serial]
 1755|      1|    fn env_low_memory_enabled_unrecognized_value_returns_false() {
 1756|      1|        with_env_var(Some("maybe"), || assert!(!env_low_memory_enabled()));
 1757|       |    }
 1758|       |
 1759|       |    #[test]
 1760|       |    #[serial]
 1761|      1|    fn resolve_parallelism_flag_forces_one_overriding_explicit_value() {
 1762|      1|        with_env_var(None, || {
 1763|      1|            assert_eq!(resolve_parallelism(true, Some(4)), 1);
 1764|      1|            assert_eq!(resolve_parallelism(true, Some(8)), 1);
 1765|      1|            assert_eq!(resolve_parallelism(true, None), 1);
 1766|      1|        });
 1767|       |    }
 1768|       |
 1769|       |    #[test]
 1770|       |    #[serial]
 1771|      1|    fn resolve_parallelism_env_forces_one_when_flag_off() {
 1772|      1|        with_env_var(Some("1"), || {
 1773|      1|            assert_eq!(resolve_parallelism(false, Some(4)), 1);
 1774|      1|            assert_eq!(resolve_parallelism(false, None), 1);
 1775|      1|        });
 1776|       |    }
 1777|       |
 1778|       |    #[test]
 1779|       |    #[serial]
 1780|      1|    fn resolve_parallelism_falsy_env_does_not_override() {
 1781|      1|        with_env_var(Some("0"), || {
 1782|      1|            assert_eq!(resolve_parallelism(false, Some(4)), 4);
 1783|      1|        });
 1784|       |    }
 1785|       |
 1786|       |    #[test]
 1787|       |    #[serial]
 1788|      1|    fn resolve_parallelism_explicit_value_when_low_memory_off() {
 1789|      1|        with_env_var(None, || {
 1790|      1|            assert_eq!(resolve_parallelism(false, Some(3)), 3);
 1791|      1|            assert_eq!(resolve_parallelism(false, Some(1)), 1);
 1792|      1|        });
 1793|       |    }
 1794|       |
 1795|       |    #[test]
 1796|       |    #[serial]
 1797|      1|    fn resolve_parallelism_default_when_unset() {
 1798|      1|        with_env_var(None, || {
 1799|      1|            let p = resolve_parallelism(false, None);
 1800|      1|            assert!((1..=4).contains(&p), "default must be in [1, 4]; got {p}");
                                                        ^0
 1801|      1|        });
 1802|       |    }
 1803|       |
 1804|       |    #[test]
 1805|      1|    fn ingest_args_parses_low_memory_flag_via_clap() {
 1806|       |        use clap::Parser;
 1807|       |        // Parse a synthetic Cli that contains the `ingest` subcommand. We rely
 1808|       |        // on the public `Cli` definition so the flag is wired end-to-end.
 1809|      1|        let cli = crate::cli::Cli::try_parse_from([
 1810|      1|            "sqlite-graphrag",
 1811|      1|            "ingest",
 1812|      1|            "/tmp/dummy",
 1813|      1|            "--type",
 1814|      1|            "document",
 1815|      1|            "--low-memory",
 1816|      1|        ])
 1817|      1|        .expect("parse must succeed");
 1818|      1|        match cli.command {
 1819|      1|            crate::cli::Commands::Ingest(args) => {
 1820|      1|                assert!(args.low_memory, "--low-memory must set field to true");
                                                       ^0
 1821|       |            }
 1822|      0|            _ => panic!("expected Ingest subcommand"),
 1823|       |        }
 1824|      1|    }
 1825|       |
 1826|       |    #[test]
 1827|      1|    fn ingest_args_low_memory_defaults_false() {
 1828|       |        use clap::Parser;
 1829|      1|        let cli = crate::cli::Cli::try_parse_from([
 1830|      1|            "sqlite-graphrag",
 1831|      1|            "ingest",
 1832|      1|            "/tmp/dummy",
 1833|      1|            "--type",
 1834|      1|            "document",
 1835|      1|        ])
 1836|      1|        .expect("parse must succeed");
 1837|      1|        match cli.command {
 1838|      1|            crate::cli::Commands::Ingest(args) => {
 1839|      1|                assert!(!args.low_memory, "default must be false");
                                                        ^0
 1840|       |            }
 1841|      0|            _ => panic!("expected Ingest subcommand"),
 1842|       |        }
 1843|      1|    }
 1844|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/ingest_claude.rs:
    1|       |//! Handler for `ingest --mode claude-code`.
    2|       |//!
    3|       |//! Orchestrates the locally installed Claude Code CLI binary (`claude -p`)
    4|       |//! to extract domain-specific entities and relationships from each file,
    5|       |//! then persists them via the same pipeline as `remember --graph-stdin`.
    6|       |//!
    7|       |//! Architecture: P1 One-Shot per file — each file spawns a separate
    8|       |//! `claude -p` process with `--json-schema` for guaranteed structured output.
    9|       |//! A SQLite queue DB tracks progress for resume/retry support.
   10|       |// Workload: Subprocess I/O-bound (claude -p headless with network wait)
   11|       |
   12|       |use crate::commands::ingest::IngestArgs;
   13|       |use crate::entity_type::EntityType;
   14|       |use crate::errors::AppError;
   15|       |use crate::paths::AppPaths;
   16|       |use crate::storage::connection::{ensure_db_ready, open_rw};
   17|       |use crate::storage::entities::{self, NewEntity, NewRelationship};
   18|       |use crate::storage::memories::{self, NewMemory};
   19|       |
   20|       |use rusqlite::Connection;
   21|       |use serde::{Deserialize, Serialize};
   22|       |use std::io::Write;
   23|       |use std::path::{Path, PathBuf};
   24|       |use std::process::{Command, Stdio};
   25|       |use std::time::Instant;
   26|       |
   27|       |const MIN_CLAUDE_VERSION: &str = "2.1.0";
   28|       |
   29|       |const EXTRACTION_SCHEMA: &str = r#"{
   30|       |  "type": "object",
   31|       |  "properties": {
   32|       |    "name": { "type": "string" },
   33|       |    "description": { "type": "string" },
   34|       |    "entities": {
   35|       |      "type": "array",
   36|       |      "items": {
   37|       |        "type": "object",
   38|       |        "properties": {
   39|       |          "name": { "type": "string" },
   40|       |          "entity_type": {
   41|       |            "type": "string",
   42|       |            "enum": ["project","tool","person","file","concept","incident","decision","organization","location","date"]
   43|       |          }
   44|       |        },
   45|       |        "required": ["name", "entity_type"],
   46|       |        "additionalProperties": false
   47|       |      }
   48|       |    },
   49|       |    "relationships": {
   50|       |      "type": "array",
   51|       |      "items": {
   52|       |        "type": "object",
   53|       |        "properties": {
   54|       |          "source": { "type": "string" },
   55|       |          "target": { "type": "string" },
   56|       |          "relation": {
   57|       |            "type": "string",
   58|       |            "enum": ["applies-to","uses","depends-on","causes","fixes","contradicts","supports","follows","related","replaces","tracked-in"]
   59|       |          },
   60|       |          "strength": { "type": "number", "minimum": 0, "maximum": 1 }
   61|       |        },
   62|       |        "required": ["source","target","relation","strength"],
   63|       |        "additionalProperties": false
   64|       |      }
   65|       |    }
   66|       |  },
   67|       |  "required": ["name","description","entities","relationships"],
   68|       |  "additionalProperties": false
   69|       |}"#;
   70|       |
   71|       |const EXTRACTION_PROMPT: &str = "You are a knowledge graph entity extractor. Given a document, extract:\n\
   72|       |1. A short kebab-case name (max 60 chars) capturing the document's main topic\n\
   73|       |2. A one-sentence description (10-20 words) summarizing the key insight\n\
   74|       |3. Domain-specific entities (concepts, tools, people, decisions, projects, files)\n\
   75|       |4. Typed relationships between entities with strength scores\n\n\
   76|       |Rules:\n\
   77|       |- Entity names: lowercase kebab-case, 2+ chars, domain-specific only\n\
   78|       |- NEVER extract generic terms, stop words, numbers, UUIDs, or single characters\n\
   79|       |- Relationship types MUST be one of: applies-to, uses, depends-on, causes, fixes, contradicts, supports, follows, related, replaces, tracked-in\n\
   80|       |- NEVER use 'mentions' as relationship type\n\
   81|       |- Strength: 0.9 for hard dependencies, 0.7 for design relationships, 0.5 for contextual links, 0.3 for weak references\n\
   82|       |- Prefer fewer high-quality entities over many low-quality ones\n\
   83|       |- Description must answer: What is this about and WHY does it matter?";
   84|       |
   85|       |#[derive(Debug, Deserialize)]
   86|       |struct ClaudeOutputElement {
   87|       |    r#type: Option<String>,
   88|       |    subtype: Option<String>,
   89|       |    #[serde(default)]
   90|       |    is_error: bool,
   91|       |    structured_output: Option<ExtractionResult>,
   92|       |    result: Option<String>,
   93|       |    total_cost_usd: Option<f64>,
   94|       |    error: Option<String>,
   95|       |    terminal_reason: Option<String>,
   96|       |    #[serde(rename = "apiKeySource")]
   97|       |    api_key_source: Option<String>,
   98|       |}
   99|       |
  100|       |#[derive(Debug, Clone, Deserialize, Serialize)]
  101|       |pub struct ExtractionResult {
  102|       |    pub name: String,
  103|       |    pub description: String,
  104|       |    pub entities: Vec<ExtractedEntity>,
  105|       |    pub relationships: Vec<ExtractedRelationship>,
  106|       |}
  107|       |
  108|       |#[derive(Debug, Clone, Deserialize, Serialize)]
  109|       |pub struct ExtractedEntity {
  110|       |    pub name: String,
  111|       |    pub entity_type: String,
  112|       |}
  113|       |
  114|       |#[derive(Debug, Clone, Deserialize, Serialize)]
  115|       |pub struct ExtractedRelationship {
  116|       |    pub source: String,
  117|       |    pub target: String,
  118|       |    pub relation: String,
  119|       |    pub strength: f64,
  120|       |}
  121|       |
  122|       |#[derive(Debug, Serialize)]
  123|       |struct PhaseEvent<'a> {
  124|       |    phase: &'a str,
  125|       |    #[serde(skip_serializing_if = "Option::is_none")]
  126|       |    claude_path: Option<&'a str>,
  127|       |    #[serde(skip_serializing_if = "Option::is_none")]
  128|       |    version: Option<&'a str>,
  129|       |    #[serde(skip_serializing_if = "Option::is_none")]
  130|       |    dir: Option<&'a str>,
  131|       |    #[serde(skip_serializing_if = "Option::is_none")]
  132|       |    files_total: Option<usize>,
  133|       |    #[serde(skip_serializing_if = "Option::is_none")]
  134|       |    files_new: Option<usize>,
  135|       |    #[serde(skip_serializing_if = "Option::is_none")]
  136|       |    files_existing: Option<usize>,
  137|       |}
  138|       |
  139|       |#[derive(Debug, Serialize)]
  140|       |struct FileEvent<'a> {
  141|       |    file: &'a str,
  142|       |    name: &'a str,
  143|       |    status: &'a str,
  144|       |    #[serde(skip_serializing_if = "Option::is_none")]
  145|       |    memory_id: Option<i64>,
  146|       |    #[serde(skip_serializing_if = "Option::is_none")]
  147|       |    entities: Option<usize>,
  148|       |    #[serde(skip_serializing_if = "Option::is_none")]
  149|       |    rels: Option<usize>,
  150|       |    #[serde(skip_serializing_if = "Option::is_none")]
  151|       |    cost_usd: Option<f64>,
  152|       |    #[serde(skip_serializing_if = "Option::is_none")]
  153|       |    elapsed_ms: Option<u64>,
  154|       |    #[serde(skip_serializing_if = "Option::is_none")]
  155|       |    error: Option<&'a str>,
  156|       |    index: usize,
  157|       |    total: usize,
  158|       |}
  159|       |
  160|       |#[derive(Debug, Serialize)]
  161|       |struct Summary {
  162|       |    summary: bool,
  163|       |    files_total: usize,
  164|       |    completed: usize,
  165|       |    failed: usize,
  166|       |    skipped: usize,
  167|       |    entities_total: usize,
  168|       |    rels_total: usize,
  169|       |    cost_usd: f64,
  170|       |    elapsed_ms: u64,
  171|       |}
  172|       |
  173|       |/// Locates the Claude Code binary on the system.
  174|      1|pub fn find_claude_binary(explicit: Option<&Path>) -> Result<PathBuf, AppError> {
  175|      1|    if let Some(p) = explicit {
                              ^0
  176|      0|        if p.exists() {
  177|      0|            return Ok(p.to_path_buf());
  178|      0|        }
  179|      0|        return Err(AppError::Validation(format!(
  180|      0|            "Claude Code binary not found at explicit path: {}",
  181|      0|            p.display()
  182|      0|        )));
  183|      1|    }
  184|       |
  185|      1|    if let Ok(env_path) = std::env::var("SQLITE_GRAPHRAG_CLAUDE_BINARY") {
                            ^0
  186|      0|        let p = PathBuf::from(&env_path);
  187|      0|        if p.exists() {
  188|      0|            return Ok(p);
  189|      0|        }
  190|      1|    }
  191|       |
  192|      1|    let name = if cfg!(windows) {
  193|      0|        "claude.exe"
  194|       |    } else {
  195|      1|        "claude"
  196|       |    };
  197|      1|    if let Some(path_var) = std::env::var_os("PATH") {
  198|      1|        for dir in std::env::split_paths(&path_var) {
  199|      1|            let candidate = dir.join(name);
  200|      1|            if candidate.exists() {
  201|      0|                return Ok(candidate);
  202|      1|            }
  203|       |        }
  204|      0|    }
  205|       |
  206|      1|    Err(AppError::Validation(
  207|      1|        "Claude Code binary not found in PATH. Install it from https://docs.anthropic.com/claude-code or specify --claude-binary".to_string(),
  208|      1|    ))
  209|      1|}
  210|       |
  211|       |/// Validates that the Claude Code binary meets the minimum version.
  212|      0|fn validate_claude_version(binary: &Path) -> Result<String, AppError> {
  213|      0|    let output = Command::new(binary)
  214|      0|        .arg("--version")
  215|      0|        .stdin(Stdio::null())
  216|      0|        .stdout(Stdio::piped())
  217|      0|        .stderr(Stdio::piped())
  218|      0|        .output()
  219|      0|        .map_err(AppError::Io)?;
  220|       |
  221|      0|    if !output.status.success() {
  222|      0|        return Err(AppError::Validation(
  223|      0|            "failed to run 'claude --version'".to_string(),
  224|      0|        ));
  225|      0|    }
  226|       |
  227|      0|    let version_str = String::from_utf8(output.stdout)
  228|      0|        .map_err(|_| AppError::Validation("claude --version output is not UTF-8".to_string()))?;
  229|      0|    let version = version_str.trim().to_string();
  230|       |
  231|       |    // Extract the numeric version part before first space or paren, e.g. "2.1.149 (Claude Code)" -> "2.1.149"
  232|      0|    let numeric = version.split([' ', '(']).next().unwrap_or("").trim();
  233|       |
  234|      0|    fn parse_semver(s: &str) -> Option<(u64, u64, u64)> {
  235|      0|        let parts: Vec<&str> = s.splitn(3, '.').collect();
  236|      0|        if parts.len() < 2 {
  237|      0|            return None;
  238|      0|        }
  239|      0|        let major = parts[0].parse::<u64>().ok()?;
  240|      0|        let minor = parts[1].parse::<u64>().ok()?;
  241|      0|        let patch = parts
  242|      0|            .get(2)
  243|      0|            .and_then(|p| p.parse::<u64>().ok())
  244|      0|            .unwrap_or(0);
  245|      0|        Some((major, minor, patch))
  246|      0|    }
  247|       |
  248|      0|    if let (Some(actual), Some(min)) = (parse_semver(numeric), parse_semver(MIN_CLAUDE_VERSION)) {
  249|      0|        if actual < min {
  250|      0|            return Err(AppError::Validation(format!(
  251|      0|                "Claude Code version {numeric} is below minimum required {MIN_CLAUDE_VERSION}"
  252|      0|            )));
  253|      0|        }
  254|      0|    }
  255|       |
  256|      0|    Ok(version)
  257|      0|}
  258|       |
  259|       |/// Invokes `claude -p` for a single file and returns the extraction result.
  260|       |///
  261|       |/// OAuth-only enforcement (gaps.md:41-49, v1.0.69 mandate):
  262|       |///
  263|       |/// - `wait-timeout` for cross-platform subprocess timeout.
  264|       |/// - `env_clear()` for least-privilege environment.
  265|       |/// - OAuth-only flow: NO `--bare` (PROHIBITED, gaps.md:49), no API-key path.
  266|       |/// - Mandatory hardening: `--strict-mcp-config --mcp-config '{}'` to zero
  267|       |///   MCP servers, and `--settings '{"hooks":{}}'` to disable hooks.
  268|       |/// - If `ANTHROPIC_API_KEY` is set in the environment we ABORT the spawn
  269|       |///   (return a `false` command with a violation marker) — API-key path is
  270|       |///   PROHIBITED in this project.
  271|      0|fn extract_with_claude(
  272|      0|    binary: &Path,
  273|      0|    file_content: &[u8],
  274|      0|    model: Option<&str>,
  275|      0|    timeout_secs: u64,
  276|      0|) -> Result<(ExtractionResult, f64, bool), AppError> {
  277|       |    use wait_timeout::ChildExt;
  278|       |
  279|       |    // OAuth-only guard (gaps.md:47). If `ANTHROPIC_API_KEY` is set in the
  280|       |    // environment we MUST abort — that is the API-key path which is
  281|       |    // explicitly PROHIBITED. Use the OAuth flow exclusively.
  282|      0|    if let Ok(_key) = std::env::var("ANTHROPIC_API_KEY") {
  283|      0|        let mut cmd = Command::new("false");
  284|      0|        cmd.env_clear();
  285|      0|        cmd.env("PATH", "/nonexistent");
  286|      0|        cmd.arg("--oauth-only-violation-anthropic-api-key-set");
  287|      0|        return Err(AppError::Validation(
  288|      0|            "ANTHROPIC_API_KEY is set in the environment; \
  289|      0|             sqlite-graphrag operates exclusively with OAuth (Pro/Max) and \
  290|      0|             the API-key path is PROHIBITED (gaps.md:47). Unset the variable \
  291|      0|             and re-run with `claude login` already completed in this session."
  292|      0|                .to_string(),
  293|      0|        ));
  294|      0|    }
  295|       |
  296|      0|    let mut cmd = Command::new(binary);
  297|       |
  298|      0|    cmd.env_clear();
  299|      0|    for var in &[
  300|      0|        "PATH",
  301|      0|        "HOME",
  302|      0|        "USER",
  303|      0|        "SHELL",
  304|      0|        "TERM",
  305|      0|        "LANG",
  306|      0|        "XDG_CONFIG_HOME",
  307|      0|        "XDG_DATA_HOME",
  308|      0|        "XDG_RUNTIME_DIR",
  309|      0|        // NOTE: `ANTHROPIC_API_KEY` is INTENTIONALLY ABSENT (gaps.md:47).
  310|      0|        "CLAUDE_CONFIG_DIR",
  311|      0|        "TMPDIR",
  312|      0|        "TMP",
  313|      0|        "TEMP",
  314|      0|        "DYLD_FALLBACK_LIBRARY_PATH",
  315|      0|    ] {
  316|      0|        if let Ok(val) = std::env::var(var) {
  317|      0|            cmd.env(var, val);
  318|      0|        }
  319|       |    }
  320|       |
  321|       |    #[cfg(windows)]
  322|       |    for var in &[
  323|       |        "LOCALAPPDATA",
  324|       |        "APPDATA",
  325|       |        "USERPROFILE",
  326|       |        "SystemRoot",
  327|       |        "COMSPEC",
  328|       |        "PATHEXT",
  329|       |        "HOMEPATH",
  330|       |        "HOMEDRIVE",
  331|       |    ] {
  332|       |        if let Ok(val) = std::env::var(var) {
  333|       |            cmd.env(var, val);
  334|       |        }
  335|       |    }
  336|       |
  337|       |    // Canonical OAuth-only command line (gaps.md:201-208 + 211-213).
  338|       |    // `--bare` is PROHIBITED (gaps.md:49) — never emitted.
  339|      0|    cmd.arg("-p")
  340|      0|        .arg(EXTRACTION_PROMPT)
  341|      0|        .arg("--strict-mcp-config")
  342|      0|        .arg("--mcp-config")
  343|      0|        .arg("{}")
  344|      0|        .arg("--dangerously-skip-permissions")
  345|      0|        .arg("--settings")
  346|      0|        .arg(r#"{"hooks":{}}"#)
  347|      0|        .arg("--output-format")
  348|      0|        .arg("json")
  349|      0|        .arg("--json-schema")
  350|      0|        .arg(EXTRACTION_SCHEMA)
  351|      0|        .arg("--max-turns")
  352|      0|        .arg("7")
  353|      0|        .arg("--no-session-persistence");
  354|       |
  355|      0|    if let Some(m) = model {
  356|      0|        cmd.arg("--model").arg(m);
  357|      0|    }
  358|       |
  359|      0|    cmd.stdin(Stdio::piped())
  360|      0|        .stdout(Stdio::piped())
  361|      0|        .stderr(Stdio::piped());
  362|       |
  363|      0|    let mut child = super::claude_runner::spawn_with_memory_limit(&mut cmd).map_err(|e| {
  364|      0|        AppError::Io(std::io::Error::new(
  365|      0|            e.kind(),
  366|      0|            format!("failed to spawn claude: {e}"),
  367|      0|        ))
  368|      0|    })?;
  369|       |
  370|      0|    let stdin_data = file_content.to_vec();
  371|      0|    let mut child_stdin = child
  372|      0|        .stdin
  373|      0|        .take()
  374|      0|        .ok_or_else(|| AppError::Validation("failed to open claude stdin".into()))?;
  375|      0|    let stdin_thread = std::thread::spawn(move || -> Result<(), std::io::Error> {
  376|      0|        child_stdin.write_all(&stdin_data)?;
  377|      0|        drop(child_stdin);
  378|      0|        Ok(())
  379|      0|    });
  380|       |
  381|      0|    let start = std::time::Instant::now();
  382|      0|    let timeout = std::time::Duration::from_secs(timeout_secs);
  383|      0|    let status = child.wait_timeout(timeout).map_err(AppError::Io)?;
  384|       |
  385|      0|    match status {
  386|      0|        Some(exit_status) => {
  387|      0|            stdin_thread
  388|      0|                .join()
  389|      0|                .map_err(|_| AppError::Validation("stdin thread panicked".into()))?
  390|      0|                .map_err(AppError::Io)?;
  391|       |
  392|      0|            tracing::debug!(
  393|       |                target: "process",
  394|      0|                exit_code = ?exit_status.code(),
  395|      0|                elapsed_ms = start.elapsed().as_millis() as u64,
  396|      0|                "external process completed"
  397|       |            );
  398|       |
  399|      0|            let mut stdout_buf = Vec::new();
  400|      0|            let mut stderr_buf = Vec::new();
  401|      0|            if let Some(mut out) = child.stdout.take() {
  402|      0|                std::io::Read::read_to_end(&mut out, &mut stdout_buf).map_err(AppError::Io)?;
  403|      0|            }
  404|      0|            if let Some(mut err) = child.stderr.take() {
  405|      0|                std::io::Read::read_to_end(&mut err, &mut stderr_buf).map_err(AppError::Io)?;
  406|      0|            }
  407|       |
  408|      0|            if !exit_status.success() {
  409|      0|                let stdout_str = String::from_utf8_lossy(&stdout_buf);
  410|      0|                if let Ok(elements) = serde_json::from_str::<Vec<ClaudeOutputElement>>(&stdout_str)
  411|       |                {
  412|      0|                    if let Some(re) = elements
  413|      0|                        .iter()
  414|      0|                        .find(|e| e.r#type.as_deref() == Some("result"))
  415|       |                    {
  416|      0|                        if re.terminal_reason.as_deref() == Some("max_turns") {
  417|      0|                            tracing::warn!(
  418|       |                                target: "ingest",
  419|      0|                                "extraction hit max_turns limit — hooks may have consumed turns"
  420|       |                            );
  421|      0|                            return Err(AppError::Validation(
  422|      0|                                "claude -p hit max_turns: hooks may be consuming turns".into(),
  423|      0|                            ));
  424|      0|                        }
  425|      0|                        if re.is_error {
  426|      0|                            let err_msg = re
  427|      0|                                .error
  428|      0|                                .as_deref()
  429|      0|                                .or(re.result.as_deref())
  430|      0|                                .unwrap_or("unknown error");
  431|      0|                            if err_msg.contains("rate_limit") || err_msg.contains("overloaded") {
  432|      0|                                return Err(AppError::RateLimited {
  433|      0|                                    detail: err_msg.to_string(),
  434|      0|                                });
  435|      0|                            }
  436|      0|                            if err_msg.contains("Not logged in")
  437|      0|                                || err_msg.contains("authentication")
  438|       |                            {
  439|      0|                                tracing::warn!(
  440|       |                                    target: "ingest",
  441|      0|                                    "Claude Code authentication failed. Re-authenticate interactively with: claude"
  442|       |                                );
  443|      0|                            }
  444|      0|                            return Err(AppError::Validation(format!(
  445|      0|                                "claude -p failed: {err_msg}"
  446|      0|                            )));
  447|      0|                        }
  448|      0|                    }
  449|      0|                }
  450|      0|                let stderr_str = String::from_utf8_lossy(&stderr_buf);
  451|      0|                if stderr_str.contains("auth") || stderr_str.contains("login") {
  452|      0|                    tracing::warn!(
  453|       |                        target: "ingest",
  454|      0|                        "Claude Code authentication may have failed. Re-authenticate with: claude"
  455|       |                    );
  456|      0|                }
  457|      0|                return Err(AppError::Validation(format!(
  458|      0|                    "claude -p exited with code {:?}: {}",
  459|      0|                    exit_status.code(),
  460|      0|                    stderr_str.trim()
  461|      0|                )));
  462|      0|            }
  463|       |
  464|      0|            let stdout = String::from_utf8(stdout_buf)
  465|      0|                .map_err(|_| AppError::Validation("claude -p stdout is not valid UTF-8".into()))?;
  466|      0|            parse_claude_output(&stdout)
  467|       |        }
  468|       |        None => {
  469|      0|            tracing::warn!(target: "ingest", timeout_secs, "claude -p timed out, killing process");
  470|      0|            let _ = child.kill();
  471|      0|            let _ = child.wait();
  472|      0|            let _ = stdin_thread.join();
  473|      0|            Err(AppError::Validation(format!(
  474|      0|                "claude -p timed out after {timeout_secs} seconds"
  475|      0|            )))
  476|       |        }
  477|       |    }
  478|      0|}
  479|       |
  480|       |/// Parses the JSON array output from `claude -p --output-format json`.
  481|       |///
  482|       |/// Returns `(extraction, cost_usd, is_oauth)` where `is_oauth` is true when
  483|       |/// the init element reports `apiKeySource: "none"` (OAuth subscription).
  484|     10|fn parse_claude_output(stdout: &str) -> Result<(ExtractionResult, f64, bool), AppError> {
  485|     10|    let elements: Vec<ClaudeOutputElement> = serde_json::from_str(stdout).map_err(|e| {
                      ^9        ^9                                                                  ^1
  486|      1|        AppError::Validation(format!("failed to parse claude output as JSON array: {e}"))
  487|      1|    })?;
  488|       |
  489|      9|    let is_oauth = elements
  490|      9|        .iter()
  491|      9|        .find(|e| e.r#type.as_deref() == Some("system") && e.subtype.as_deref() == Some("init"))
  492|      9|        .and_then(|e| e.api_key_source.as_deref())
  493|      9|        .map(|s| s == "none")
                               ^2   ^2
  494|      9|        .unwrap_or(false);
  495|       |
  496|      9|    let result_elem = elements
  497|      9|        .iter()
  498|     19|        .find(|e| e.r#type.as_deref() == Some("result"))
                       ^9
  499|      9|        .ok_or_else(|| {
                                     ^0
  500|      0|            AppError::Validation("claude output missing 'result' element".to_string())
  501|      0|        })?;
  502|       |
  503|      9|    if result_elem.is_error {
  504|      3|        let err_msg = result_elem
  505|      3|            .error
  506|      3|            .as_deref()
  507|      3|            .or(result_elem.result.as_deref())
  508|      3|            .unwrap_or("unknown error");
  509|      3|        if err_msg.contains("rate_limit") || err_msg.contains("overloaded") {
                                                           ^2      ^2
  510|      1|            return Err(AppError::RateLimited {
  511|      1|                detail: err_msg.to_string(),
  512|      1|            });
  513|      2|        }
  514|      2|        return Err(AppError::Validation(format!(
  515|      2|            "claude extraction failed: {err_msg}"
  516|      2|        )));
  517|      6|    }
  518|       |
  519|      6|    let extraction = result_elem
  520|      6|        .structured_output
  521|      6|        .clone()
  522|      6|        .or_else(|| {
                                  ^1
  523|      1|            result_elem
  524|      1|                .result
  525|      1|                .as_ref()
  526|      1|                .and_then(|text| serde_json::from_str::<ExtractionResult>(text).ok())
  527|      1|        })
  528|      6|        .ok_or_else(|| {
                                     ^0
  529|      0|            AppError::Validation("claude result missing structured_output and result field".into())
  530|      0|        })?;
  531|       |
  532|      6|    let cost = result_elem.total_cost_usd.unwrap_or(0.0);
  533|       |
  534|      6|    Ok((extraction, cost, is_oauth))
  535|     10|}
  536|       |
  537|       |use crate::output::emit_json_line as emit_json;
  538|       |
  539|       |/// Collects files matching the pattern (reuses ingest logic).
  540|      0|fn collect_matching_files(
  541|      0|    dir: &Path,
  542|      0|    pattern: &str,
  543|      0|    recursive: bool,
  544|      0|    max_files: usize,
  545|      0|) -> Result<Vec<PathBuf>, AppError> {
  546|      0|    let mut files = Vec::new();
  547|      0|    super::ingest::collect_files(dir, pattern, recursive, &mut files)?;
  548|      0|    files.sort_unstable();
  549|       |
  550|      0|    if files.len() > max_files {
  551|      0|        return Err(AppError::Validation(format!(
  552|      0|            "found {} files, exceeds --max-files cap of {}",
  553|      0|            files.len(),
  554|      0|            max_files
  555|      0|        )));
  556|      0|    }
  557|       |
  558|      0|    Ok(files)
  559|      0|}
  560|       |
  561|       |/// Opens or creates the queue database for tracking ingest progress.
  562|      0|fn open_queue_db(path: &str) -> Result<Connection, AppError> {
  563|      0|    let conn = Connection::open(path)?;
  564|       |
  565|      0|    conn.pragma_update(None, "journal_mode", "wal")?;
  566|       |
  567|      0|    conn.execute_batch(
  568|      0|        "CREATE TABLE IF NOT EXISTS queue (
  569|      0|            id          INTEGER PRIMARY KEY AUTOINCREMENT,
  570|      0|            file_path   TEXT NOT NULL UNIQUE,
  571|      0|            name        TEXT,
  572|      0|            status      TEXT NOT NULL DEFAULT 'pending',
  573|      0|            memory_id   INTEGER,
  574|      0|            entities    INTEGER DEFAULT 0,
  575|      0|            rels        INTEGER DEFAULT 0,
  576|      0|            error       TEXT,
  577|      0|            cost_usd    REAL DEFAULT 0.0,
  578|      0|            attempt     INTEGER DEFAULT 0,
  579|      0|            elapsed_ms  INTEGER,
  580|      0|            created_at  TEXT DEFAULT (datetime('now')),
  581|      0|            done_at     TEXT
  582|      0|        );
  583|      0|        CREATE INDEX IF NOT EXISTS idx_queue_status ON queue(status);",
  584|      0|    )?;
  585|       |
  586|      0|    Ok(conn)
  587|      0|}
  588|       |
  589|       |/// Main entry point for `ingest --mode claude-code`.
  590|      0|pub fn run_claude_ingest(args: &IngestArgs) -> Result<(), AppError> {
  591|      0|    let started = Instant::now();
  592|       |
  593|      0|    if !args.dir.exists() {
  594|      0|        return Err(AppError::Validation(format!(
  595|      0|            "directory not found: {}",
  596|      0|            args.dir.display()
  597|      0|        )));
  598|      0|    }
  599|       |
  600|       |    // G28-B (v1.0.68) + G30 (v1.0.69): acquire singleton before doing real
  601|       |    // work so two parallel `ingest --mode claude-code` invocations cannot
  602|       |    // co-exist on the same database. Scope includes the database hash so
  603|       |    // concurrent ingest against different databases is allowed.
  604|      0|    let early_ns = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
  605|      0|    let early_paths = AppPaths::resolve(args.db.as_deref())?;
  606|      0|    let _singleton = crate::lock::acquire_job_singleton(
  607|      0|        crate::lock::JobType::IngestClaudeCode,
  608|      0|        &early_ns,
  609|      0|        &early_paths.db,
  610|      0|        args.wait_job_singleton,
  611|      0|        args.force_job_singleton,
  612|      0|    )?;
  613|       |
  614|       |    // Stage 1: Validate
  615|      0|    let claude_binary = find_claude_binary(args.claude_binary.as_deref())?;
  616|      0|    let version = validate_claude_version(&claude_binary)?;
  617|      0|    tracing::info!(
  618|       |        target: "ingest",
  619|      0|        binary = %claude_binary.display(),
  620|       |        version = %version,
  621|      0|        "Claude Code binary validated"
  622|       |    );
  623|       |
  624|      0|    emit_json(&PhaseEvent {
  625|      0|        phase: "validate",
  626|      0|        claude_path: claude_binary.to_str(),
  627|      0|        version: Some(&version),
  628|      0|        dir: None,
  629|      0|        files_total: None,
  630|      0|        files_new: None,
  631|      0|        files_existing: None,
  632|      0|    });
  633|       |
  634|       |    // Stage 2: Scan
  635|      0|    let files = collect_matching_files(&args.dir, &args.pattern, args.recursive, args.max_files)?;
  636|       |
  637|      0|    let queue_conn = open_queue_db(&args.queue_db)?;
  638|       |
  639|      0|    if args.resume {
  640|      0|        let reset = queue_conn
  641|      0|            .execute(
  642|      0|                "UPDATE queue SET status='pending' WHERE status='processing'",
  643|      0|                [],
  644|       |            )
  645|      0|            .map_err(|e| AppError::Validation(format!("queue resume failed: {e}")))?;
  646|      0|        if reset > 0 {
  647|      0|            tracing::info!(target: "ingest", count = reset, "reset stuck processing files to pending");
  648|      0|        }
  649|      0|    }
  650|       |
  651|      0|    if args.retry_failed {
  652|      0|        let count = queue_conn
  653|      0|            .execute(
  654|      0|                "UPDATE queue SET status='pending', attempt=0 WHERE status='failed'",
  655|      0|                [],
  656|       |            )
  657|      0|            .map_err(|e| AppError::Validation(format!("queue retry-failed reset failed: {e}")))?;
  658|      0|        tracing::info!(target: "ingest", count, "retrying failed files");
  659|      0|    }
  660|       |
  661|      0|    if !args.resume && !args.retry_failed {
  662|      0|        queue_conn
  663|      0|            .execute("DELETE FROM queue", [])
  664|      0|            .map_err(|e| AppError::Validation(format!("queue clear failed: {e}")))?;
  665|      0|    }
  666|       |
  667|      0|    let mut new_count = 0usize;
  668|      0|    let mut existing_count = 0usize;
  669|       |
  670|      0|    if !args.retry_failed {
  671|      0|        for file in &files {
  672|      0|            let file_str = file.to_string_lossy().into_owned();
  673|      0|            let inserted = queue_conn
  674|      0|                .execute(
  675|      0|                    "INSERT OR IGNORE INTO queue (file_path, status) VALUES (?1, 'pending')",
  676|      0|                    rusqlite::params![file_str],
  677|       |                )
  678|      0|                .map_err(|e| AppError::Validation(format!("queue insert failed: {e}")))?;
  679|      0|            if inserted > 0 {
  680|      0|                new_count += 1;
  681|      0|            } else {
  682|      0|                existing_count += 1;
  683|      0|            }
  684|       |        }
  685|      0|    }
  686|       |
  687|      0|    emit_json(&PhaseEvent {
  688|      0|        phase: "scan",
  689|      0|        claude_path: None,
  690|      0|        version: None,
  691|      0|        dir: args.dir.to_str(),
  692|      0|        files_total: Some(files.len()),
  693|      0|        files_new: Some(new_count),
  694|      0|        files_existing: Some(existing_count),
  695|      0|    });
  696|       |
  697|      0|    if args.dry_run {
  698|      0|        for (idx, file) in files.iter().enumerate() {
  699|      0|            let (name, _truncated, _orig) =
  700|      0|                super::ingest::derive_kebab_name(file, args.max_name_length);
  701|      0|            emit_json(&FileEvent {
  702|      0|                file: &file.to_string_lossy(),
  703|      0|                name: &name,
  704|      0|                status: "preview",
  705|      0|                memory_id: None,
  706|      0|                entities: None,
  707|      0|                rels: None,
  708|      0|                cost_usd: None,
  709|      0|                elapsed_ms: None,
  710|      0|                error: None,
  711|      0|                index: idx,
  712|      0|                total: files.len(),
  713|      0|            });
  714|      0|        }
  715|      0|        emit_json(&Summary {
  716|      0|            summary: true,
  717|      0|            files_total: files.len(),
  718|      0|            completed: 0,
  719|      0|            failed: 0,
  720|      0|            skipped: 0,
  721|      0|            entities_total: 0,
  722|      0|            rels_total: 0,
  723|      0|            cost_usd: 0.0,
  724|      0|            elapsed_ms: started.elapsed().as_millis() as u64,
  725|      0|        });
  726|      0|        if !args.keep_queue {
  727|      0|            let _ = std::fs::remove_file(&args.queue_db);
  728|      0|        }
  729|      0|        return Ok(());
  730|      0|    }
  731|       |
  732|       |    // Stage 3: Process
  733|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
  734|      0|    ensure_db_ready(&paths)?;
  735|      0|    let conn = open_rw(&paths.db)?;
  736|      0|    let tokenizer = crate::tokenizer::get_tokenizer(&paths.models)?;
  737|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
  738|      0|    let memory_type_str = args.r#type.as_str().to_string();
  739|       |
  740|      0|    let mut completed = 0usize;
  741|      0|    let mut failed = 0usize;
  742|      0|    let skipped_initial: usize = queue_conn
  743|      0|        .query_row("SELECT COUNT(*) FROM queue WHERE status='done'", [], |r| {
  744|      0|            r.get::<_, usize>(0)
  745|      0|        })
  746|      0|        .unwrap_or(0);
  747|      0|    let mut skipped = skipped_initial;
  748|      0|    let mut entities_total = 0usize;
  749|      0|    let mut rels_total = 0usize;
  750|      0|    let mut cost_total = 0.0f64;
  751|      0|    let mut oauth_detected = false;
  752|      0|    let total = files.len();
  753|       |
  754|      0|    let mut backoff_secs = args.rate_limit_wait;
  755|      0|    let rate_limit_deadline = std::time::Instant::now() + std::time::Duration::from_secs(3600);
  756|       |
  757|       |    loop {
  758|      0|        if crate::shutdown_requested() {
  759|      0|            tracing::info!(target: "ingest", "shutdown requested, stopping before next file");
  760|      0|            break;
  761|      0|        }
  762|       |
  763|      0|        let pending: Option<(i64, String)> = queue_conn
  764|      0|            .query_row(
  765|      0|                "UPDATE queue SET status='processing', attempt=attempt+1 \
  766|      0|                 WHERE id = (SELECT id FROM queue WHERE status='pending' ORDER BY id LIMIT 1) \
  767|      0|                 RETURNING id, file_path",
  768|      0|                [],
  769|      0|                |row| Ok((row.get(0)?, row.get(1)?)),
  770|       |            )
  771|      0|            .ok();
  772|       |
  773|      0|        let (queue_id, file_path) = match pending {
  774|      0|            Some(p) => p,
  775|      0|            None => break,
  776|       |        };
  777|       |
  778|      0|        let file_started = Instant::now();
  779|       |
  780|       |        // G05: reject files that exceed the 10 MB stdin limit
  781|       |        const MAX_FILE_SIZE: u64 = 10 * 1024 * 1024;
  782|      0|        if let Ok(meta) = std::fs::metadata(&file_path) {
  783|      0|            if meta.len() > MAX_FILE_SIZE {
  784|      0|                let err_msg = format!("file exceeds 10MB stdin limit ({} bytes)", meta.len());
  785|      0|                let _ = queue_conn.execute(
  786|      0|                    "UPDATE queue SET status='failed', error=?1, done_at=datetime('now') WHERE id=?2",
  787|      0|                    rusqlite::params![err_msg, queue_id],
  788|      0|                );
  789|      0|                let current_index = completed + failed + skipped;
  790|      0|                failed += 1;
  791|      0|                emit_json(&FileEvent {
  792|      0|                    file: &file_path,
  793|      0|                    name: "",
  794|      0|                    status: "failed",
  795|      0|                    memory_id: None,
  796|      0|                    entities: None,
  797|      0|                    rels: None,
  798|      0|                    cost_usd: None,
  799|      0|                    elapsed_ms: Some(file_started.elapsed().as_millis() as u64),
  800|      0|                    error: Some(&err_msg),
  801|      0|                    index: current_index,
  802|      0|                    total,
  803|      0|                });
  804|      0|                if args.fail_fast {
  805|      0|                    break;
  806|      0|                }
  807|      0|                continue;
  808|      0|            }
  809|      0|        }
  810|       |
  811|      0|        let file_content = match std::fs::read(&file_path) {
  812|      0|            Ok(c) => c,
  813|      0|            Err(e) => {
  814|      0|                let err_msg = format!("IO error: {e}");
  815|      0|                let _ = queue_conn.execute(
  816|      0|                    "UPDATE queue SET status='failed', error=?1, done_at=datetime('now') WHERE id=?2",
  817|      0|                    rusqlite::params![err_msg, queue_id],
  818|      0|                );
  819|      0|                let current_index = completed + failed + skipped;
  820|      0|                failed += 1;
  821|      0|                emit_json(&FileEvent {
  822|      0|                    file: &file_path,
  823|      0|                    name: "",
  824|      0|                    status: "failed",
  825|      0|                    memory_id: None,
  826|      0|                    entities: None,
  827|      0|                    rels: None,
  828|      0|                    cost_usd: None,
  829|      0|                    elapsed_ms: Some(file_started.elapsed().as_millis() as u64),
  830|      0|                    error: Some(&err_msg),
  831|      0|                    index: current_index,
  832|      0|                    total,
  833|      0|                });
  834|      0|                if args.fail_fast {
  835|      0|                    break;
  836|      0|                }
  837|      0|                continue;
  838|       |            }
  839|       |        };
  840|       |
  841|       |        // B08: skip files exceeding body cap BEFORE sending to LLM to avoid wasting tokens
  842|      0|        if file_content.len() > crate::constants::MAX_MEMORY_BODY_LEN {
  843|      0|            let err_msg = format!(
  844|      0|                "file body exceeds {} byte limit ({} bytes) — skipping to avoid wasting LLM tokens",
  845|       |                crate::constants::MAX_MEMORY_BODY_LEN,
  846|      0|                file_content.len()
  847|       |            );
  848|      0|            tracing::warn!(target: "ingest", file = %file_path, size = file_content.len(), "body exceeds limit, skipping LLM extraction");
  849|      0|            let _ = queue_conn.execute(
  850|      0|                "UPDATE queue SET status='skipped', error=?1, done_at=datetime('now') WHERE id=?2",
  851|      0|                rusqlite::params![err_msg, queue_id],
  852|      0|            );
  853|      0|            let current_index = completed + failed + skipped;
  854|      0|            skipped += 1;
  855|      0|            emit_json(&FileEvent {
  856|      0|                file: &file_path,
  857|      0|                name: "",
  858|      0|                status: "skipped",
  859|      0|                memory_id: None,
  860|      0|                entities: None,
  861|      0|                rels: None,
  862|      0|                cost_usd: None,
  863|      0|                elapsed_ms: Some(file_started.elapsed().as_millis() as u64),
  864|      0|                error: Some(&err_msg),
  865|      0|                index: current_index,
  866|      0|                total,
  867|      0|            });
  868|      0|            continue;
  869|      0|        }
  870|       |
  871|       |        // B07: retry once on cold-start failure (Claude Code Issue #23265)
  872|      0|        let max_extract_attempts: u32 = 2;
  873|      0|        let mut extraction_result: Option<(ExtractionResult, f64, bool)> = None;
  874|      0|        let mut last_extract_err: Option<String> = None;
  875|      0|        let mut last_was_rate_limited = false;
  876|       |
  877|      0|        for attempt in 1..=max_extract_attempts {
  878|      0|            match extract_with_claude(
  879|      0|                &claude_binary,
  880|      0|                &file_content,
  881|      0|                args.claude_model.as_deref(),
  882|      0|                args.claude_timeout,
  883|       |            ) {
  884|      0|                Ok(result) => {
  885|      0|                    extraction_result = Some(result);
  886|      0|                    break;
  887|       |                }
  888|      0|                Err(ref e) if matches!(e, AppError::RateLimited { .. }) => {
  889|      0|                    last_extract_err = Some(format!("{e}"));
  890|      0|                    last_was_rate_limited = true;
  891|      0|                    break;
  892|       |                }
  893|      0|                Err(e) => {
  894|      0|                    let msg = format!("{e}");
  895|      0|                    if attempt < max_extract_attempts {
  896|      0|                        let cold_start_delay = 2 * attempt as u64;
  897|      0|                        tracing::warn!(target: "ingest", attempt, delay_secs = cold_start_delay, error = %msg, "extraction failed, retrying (cold-start workaround)");
  898|      0|                        std::thread::sleep(std::time::Duration::from_secs(cold_start_delay));
  899|      0|                    }
  900|      0|                    last_extract_err = Some(msg);
  901|       |                }
  902|       |            }
  903|       |        }
  904|       |
  905|      0|        if let Some((extraction, cost, is_oauth)) = extraction_result {
  906|      0|            if is_oauth && !oauth_detected {
  907|      0|                oauth_detected = true;
  908|      0|                tracing::info!(target: "ingest", "OAuth subscription detected — cost_usd omitted from output");
  909|      0|            }
  910|      0|            backoff_secs = args.rate_limit_wait;
  911|       |
  912|      0|            let (normalized_name, _truncated, _orig) = crate::commands::ingest::derive_kebab_name(
  913|      0|                std::path::Path::new(&extraction.name),
  914|      0|                args.max_name_length,
  915|      0|            );
  916|      0|            let name = &normalized_name;
  917|      0|            let ent_count = extraction.entities.len();
  918|      0|            let rel_count = extraction.relationships.len();
  919|       |
  920|      0|            let new_entities: Vec<NewEntity> = extraction
  921|      0|                .entities
  922|      0|                .iter()
  923|      0|                .filter_map(|e| match e.entity_type.parse::<EntityType>() {
  924|      0|                    Ok(et) => Some(NewEntity {
  925|      0|                        name: e.name.clone(),
  926|      0|                        entity_type: et,
  927|      0|                        description: None,
  928|      0|                    }),
  929|       |                    Err(_) => {
  930|      0|                        tracing::warn!(
  931|       |                            target: "ingest",
  932|       |                            entity = %e.name,
  933|       |                            entity_type = %e.entity_type,
  934|      0|                            "entity type not recognized, skipping"
  935|       |                        );
  936|      0|                        None
  937|       |                    }
  938|      0|                })
  939|      0|                .collect();
  940|       |
  941|      0|            let new_relationships: Vec<NewRelationship> = extraction
  942|      0|                .relationships
  943|      0|                .iter()
  944|      0|                .map(|r| NewRelationship {
  945|      0|                    source: r.source.clone(),
  946|      0|                    target: r.target.clone(),
  947|      0|                    relation: crate::parsers::normalize_relation(&r.relation),
  948|      0|                    strength: r.strength,
  949|      0|                    description: None,
  950|      0|                })
  951|      0|                .collect();
  952|       |
  953|      0|            let body_str = String::from_utf8_lossy(&file_content);
  954|      0|            let body_hash = blake3::hash(body_str.as_bytes()).to_hex().to_string();
  955|      0|            let new_memory = NewMemory {
  956|      0|                name: name.clone(),
  957|      0|                namespace: namespace.clone(),
  958|      0|                memory_type: memory_type_str.clone(),
  959|      0|                description: extraction.description.clone(),
  960|      0|                body: body_str.to_string(),
  961|      0|                body_hash,
  962|      0|                session_id: None,
  963|      0|                source: "agent".to_string(),
  964|      0|                metadata: serde_json::Value::Object(serde_json::Map::new()),
  965|      0|            };
  966|       |
  967|       |            // B06: deduplication — update existing memory instead of failing on UNIQUE
  968|      0|            let memory_id = match memories::find_by_name_any_state(&conn, &namespace, name)? {
  969|      0|                Some((existing_id, is_deleted)) => {
  970|      0|                    if is_deleted {
  971|      0|                        memories::clear_deleted_at(&conn, existing_id)?;
  972|      0|                    }
  973|      0|                    let (old_name, old_desc, old_body): (String, String, String) = conn.query_row(
  974|      0|                        "SELECT name, COALESCE(description,''), COALESCE(body,'') FROM memories WHERE id=?1",
  975|      0|                        rusqlite::params![existing_id],
  976|      0|                        |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?)),
  977|      0|                    )?;
  978|      0|                    memories::update(&conn, existing_id, &new_memory, None)?;
  979|      0|                    memories::sync_fts_after_update(
  980|      0|                        &conn,
  981|      0|                        existing_id,
  982|      0|                        &old_name,
  983|      0|                        &old_desc,
  984|      0|                        &old_body,
  985|      0|                        &new_memory.name,
  986|      0|                        &new_memory.description,
  987|      0|                        &new_memory.body,
  988|      0|                    )?;
  989|      0|                    tracing::info!(target: "ingest", name, memory_id = existing_id, "updated existing memory (force-merge)");
  990|      0|                    existing_id
  991|       |                }
  992|      0|                None => match memories::insert(&conn, &new_memory) {
  993|      0|                    Ok(id) => id,
  994|      0|                    Err(e) => {
  995|      0|                        let err_msg = format!("{e}");
  996|      0|                        let _ = queue_conn.execute(
  997|      0|                                "UPDATE queue SET status='failed', error=?1, done_at=datetime('now') WHERE id=?2",
  998|      0|                                rusqlite::params![err_msg, queue_id],
  999|      0|                            );
 1000|      0|                        let current_index = completed + failed + skipped;
 1001|      0|                        failed += 1;
 1002|      0|                        emit_json(&FileEvent {
 1003|      0|                            file: &file_path,
 1004|      0|                            name,
 1005|      0|                            status: "failed",
 1006|      0|                            memory_id: None,
 1007|      0|                            entities: None,
 1008|      0|                            rels: None,
 1009|      0|                            cost_usd: if is_oauth { None } else { Some(cost) },
 1010|      0|                            elapsed_ms: Some(file_started.elapsed().as_millis() as u64),
 1011|      0|                            error: Some(&err_msg),
 1012|      0|                            index: current_index,
 1013|      0|                            total,
 1014|       |                        });
 1015|      0|                        if !is_oauth {
 1016|      0|                            cost_total += cost;
 1017|      0|                        }
 1018|      0|                        if args.fail_fast {
 1019|      0|                            break;
 1020|      0|                        }
 1021|      0|                        continue;
 1022|       |                    }
 1023|       |                },
 1024|       |            };
 1025|       |
 1026|      0|            for ent in &new_entities {
 1027|      0|                match entities::upsert_entity(&conn, &namespace, ent) {
 1028|      0|                    Ok(eid) => {
 1029|      0|                        let _ = entities::link_memory_entity(&conn, memory_id, eid);
 1030|      0|                    }
 1031|      0|                    Err(e) => {
 1032|      0|                        tracing::warn!(
 1033|       |                            target: "ingest",
 1034|       |                            entity = %ent.name,
 1035|       |                            error = %e,
 1036|      0|                            "entity skipped due to validation error"
 1037|       |                        );
 1038|       |                    }
 1039|       |                }
 1040|       |            }
 1041|      0|            for rel in &new_relationships {
 1042|      0|                crate::parsers::warn_if_non_canonical(&rel.relation);
 1043|      0|                let src_id = entities::find_entity_id(&conn, &namespace, &rel.source);
 1044|      0|                let tgt_id = entities::find_entity_id(&conn, &namespace, &rel.target);
 1045|      0|                if let (Ok(Some(sid)), Ok(Some(tid))) = (src_id, tgt_id) {
 1046|      0|                    let _ = conn.execute(
 1047|      0|                        "INSERT OR IGNORE INTO relationships (namespace, source_id, target_id, relation, weight) VALUES (?1, ?2, ?3, ?4, ?5)",
 1048|      0|                        rusqlite::params![namespace, sid, tid, rel.relation, rel.strength],
 1049|      0|                    );
 1050|      0|                }
 1051|       |            }
 1052|       |
 1053|       |            // G01: embedding pipeline — enables recall to find memories created via --mode claude-code
 1054|      0|            let body_text = String::from_utf8_lossy(&file_content).into_owned();
 1055|      0|            let snippet: String = body_text.chars().take(200).collect();
 1056|      0|            let chunks_info =
 1057|      0|                crate::chunking::split_into_chunks_hierarchical(&body_text, tokenizer);
 1058|       |
 1059|      0|            let embedding_result = if chunks_info.len() <= 1 {
 1060|      0|                crate::daemon::embed_passage_or_local(&paths.models, &body_text)
 1061|       |            } else {
 1062|      0|                let mut chunk_embeddings: Vec<Vec<f32>> = Vec::with_capacity(chunks_info.len());
 1063|      0|                let mut multi_ok = true;
 1064|      0|                for chunk in &chunks_info {
 1065|      0|                    let chunk_text = crate::chunking::chunk_text(&body_text, chunk);
 1066|      0|                    match crate::daemon::embed_passage_or_local(&paths.models, chunk_text) {
 1067|      0|                        Ok(emb) => chunk_embeddings.push(emb),
 1068|      0|                        Err(e) => {
 1069|      0|                            tracing::warn!(
 1070|       |                                target: "ingest",
 1071|       |                                file = %file_path,
 1072|       |                                error = %e,
 1073|      0|                                "chunk embedding failed, skipping vector index for this file"
 1074|       |                            );
 1075|      0|                            multi_ok = false;
 1076|      0|                            break;
 1077|       |                        }
 1078|       |                    }
 1079|       |                }
 1080|      0|                if multi_ok {
 1081|      0|                    let aggregated = crate::chunking::aggregate_embeddings(&chunk_embeddings);
 1082|       |                    // persist per-chunk vectors
 1083|      0|                    if let Err(e) = crate::storage::chunks::insert_chunk_slices(
 1084|      0|                        &conn,
 1085|      0|                        memory_id,
 1086|      0|                        &body_text,
 1087|      0|                        &chunks_info,
 1088|      0|                    ) {
 1089|      0|                        tracing::warn!(
 1090|       |                            target: "ingest",
 1091|       |                            file = %file_path,
 1092|       |                            error = %e,
 1093|      0|                            "chunk slice insert failed"
 1094|       |                        );
 1095|       |                    } else {
 1096|      0|                        for (i, emb) in chunk_embeddings.iter().enumerate() {
 1097|      0|                            if let Err(e) = crate::storage::chunks::upsert_chunk_vec(
 1098|      0|                                &conn, i as i64, memory_id, i as i32, emb,
 1099|      0|                            ) {
 1100|      0|                                tracing::warn!(
 1101|       |                                    target: "ingest",
 1102|       |                                    file = %file_path,
 1103|       |                                    chunk = i,
 1104|       |                                    error = %e,
 1105|      0|                                    "chunk vec upsert failed"
 1106|       |                                );
 1107|      0|                            }
 1108|       |                        }
 1109|       |                    }
 1110|      0|                    Ok(aggregated)
 1111|       |                } else {
 1112|       |                    // fallback: embed whole body for the memory-level vector
 1113|      0|                    crate::daemon::embed_passage_or_local(&paths.models, &body_text)
 1114|       |                }
 1115|       |            };
 1116|       |
 1117|      0|            match embedding_result {
 1118|      0|                Ok(embedding) => {
 1119|      0|                    if let Err(e) = memories::upsert_vec(
 1120|      0|                        &conn,
 1121|      0|                        memory_id,
 1122|      0|                        &namespace,
 1123|      0|                        &memory_type_str,
 1124|      0|                        &embedding,
 1125|      0|                        name,
 1126|      0|                        &snippet,
 1127|      0|                    ) {
 1128|      0|                        tracing::warn!(
 1129|       |                            target: "ingest",
 1130|       |                            file = %file_path,
 1131|       |                            error = %e,
 1132|      0|                            "memory vec upsert failed; recall may not find this memory"
 1133|       |                        );
 1134|      0|                    }
 1135|       |                    // embed each entity that was successfully upserted
 1136|      0|                    for ent in &new_entities {
 1137|      0|                        if let Ok(Some(eid)) =
 1138|      0|                            entities::find_entity_id(&conn, &namespace, &ent.name)
 1139|       |                        {
 1140|      0|                            let entity_text = ent.name.clone();
 1141|      0|                            match crate::daemon::embed_passage_or_local(&paths.models, &entity_text)
 1142|       |                            {
 1143|      0|                                Ok(emb) => {
 1144|      0|                                    if let Err(e) = entities::upsert_entity_vec(
 1145|      0|                                        &conn,
 1146|      0|                                        eid,
 1147|      0|                                        &namespace,
 1148|      0|                                        ent.entity_type,
 1149|      0|                                        &emb,
 1150|      0|                                        &ent.name,
 1151|      0|                                    ) {
 1152|      0|                                        tracing::warn!(
 1153|       |                                            target: "ingest",
 1154|       |                                            entity = %ent.name,
 1155|       |                                            error = %e,
 1156|      0|                                            "entity vec upsert failed"
 1157|       |                                        );
 1158|      0|                                    }
 1159|       |                                }
 1160|      0|                                Err(e) => {
 1161|      0|                                    tracing::warn!(
 1162|       |                                        target: "ingest",
 1163|       |                                        entity = %ent.name,
 1164|       |                                        error = %e,
 1165|      0|                                        "entity embedding failed"
 1166|       |                                    );
 1167|       |                                }
 1168|       |                            }
 1169|      0|                        }
 1170|       |                    }
 1171|       |                }
 1172|      0|                Err(e) => {
 1173|      0|                    tracing::warn!(
 1174|       |                        target: "ingest",
 1175|       |                        file = %file_path,
 1176|       |                        error = %e,
 1177|      0|                        "memory embedding failed; recall will not find this memory"
 1178|       |                    );
 1179|       |                }
 1180|       |            }
 1181|       |
 1182|      0|            let _ = queue_conn.execute(
 1183|      0|                "UPDATE queue SET status='done', name=?1, memory_id=?2, entities=?3, rels=?4, cost_usd=?5, elapsed_ms=?6, done_at=datetime('now') WHERE id=?7",
 1184|      0|                rusqlite::params![
 1185|      0|                    name,
 1186|      0|                    memory_id,
 1187|      0|                    ent_count,
 1188|      0|                    rel_count,
 1189|      0|                    cost,
 1190|      0|                    file_started.elapsed().as_millis() as i64,
 1191|      0|                    queue_id
 1192|      0|                ],
 1193|      0|            );
 1194|       |
 1195|      0|            let current_index = completed + failed + skipped;
 1196|      0|            completed += 1;
 1197|      0|            entities_total += ent_count;
 1198|      0|            rels_total += rel_count;
 1199|      0|            if !is_oauth {
 1200|      0|                cost_total += cost;
 1201|      0|            }
 1202|       |
 1203|      0|            emit_json(&FileEvent {
 1204|      0|                file: &file_path,
 1205|      0|                name,
 1206|      0|                status: "done",
 1207|      0|                memory_id: Some(memory_id),
 1208|      0|                entities: Some(ent_count),
 1209|      0|                rels: Some(rel_count),
 1210|      0|                cost_usd: if is_oauth { None } else { Some(cost) },
 1211|      0|                elapsed_ms: Some(file_started.elapsed().as_millis() as u64),
 1212|      0|                error: None,
 1213|      0|                index: current_index,
 1214|      0|                total,
 1215|       |            });
 1216|      0|        } else if let Some(ref err_str) = last_extract_err {
 1217|      0|            if last_was_rate_limited {
 1218|      0|                if crate::retry::is_kill_switch_active() {
 1219|      0|                    tracing::warn!(target: "ingest", "SQLITE_GRAPHRAG_DISABLE_RETRY=1, skipping rate-limit retry");
 1220|      0|                } else if std::time::Instant::now() >= rate_limit_deadline {
 1221|      0|                    tracing::error!(target: "ingest", "rate-limit retry deadline (1h) exhausted");
 1222|       |                } else {
 1223|      0|                    let half = backoff_secs / 2;
 1224|      0|                    let jitter = if half == 0 { 0 } else { fastrand::u64(0..half) };
 1225|      0|                    let actual_wait = half + jitter;
 1226|      0|                    tracing::warn!(target: "ingest", delay_secs = actual_wait, error_kind = "rate_limited", "rate limited, backing off");
 1227|      0|                    let _ = queue_conn.execute(
 1228|      0|                        "UPDATE queue SET status='pending' WHERE id=?1",
 1229|      0|                        rusqlite::params![queue_id],
 1230|      0|                    );
 1231|      0|                    std::thread::sleep(std::time::Duration::from_secs(actual_wait));
 1232|      0|                    backoff_secs = (backoff_secs * 2).min(900);
 1233|      0|                    continue;
 1234|       |                }
 1235|       |            } else {
 1236|      0|                let _ = queue_conn.execute(
 1237|      0|                    "UPDATE queue SET status='failed', error=?1, done_at=datetime('now') WHERE id=?2",
 1238|      0|                    rusqlite::params![err_str, queue_id],
 1239|      0|                );
 1240|      0|                let current_index = completed + failed + skipped;
 1241|      0|                failed += 1;
 1242|      0|                emit_json(&FileEvent {
 1243|      0|                    file: &file_path,
 1244|      0|                    name: "",
 1245|      0|                    status: "failed",
 1246|      0|                    memory_id: None,
 1247|      0|                    entities: None,
 1248|      0|                    rels: None,
 1249|      0|                    cost_usd: None,
 1250|      0|                    elapsed_ms: Some(file_started.elapsed().as_millis() as u64),
 1251|      0|                    error: Some(err_str),
 1252|      0|                    index: current_index,
 1253|      0|                    total,
 1254|      0|                });
 1255|      0|                if args.fail_fast {
 1256|      0|                    break;
 1257|      0|                }
 1258|       |            }
 1259|      0|        }
 1260|       |
 1261|      0|        if let Some(budget) = args.max_cost_usd {
 1262|      0|            if oauth_detected {
 1263|      0|                tracing::debug!(target: "ingest", "--max-cost-usd ignored: OAuth subscription detected");
 1264|      0|            } else if cost_total >= budget {
 1265|      0|                tracing::warn!(
 1266|       |                    target: "ingest",
 1267|       |                    spent = cost_total,
 1268|       |                    budget = budget,
 1269|      0|                    "budget exceeded, stopping"
 1270|       |                );
 1271|      0|                break;
 1272|      0|            }
 1273|      0|        }
 1274|       |    }
 1275|       |
 1276|       |    // Stage 4: Summary
 1277|      0|    let _ = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);");
 1278|      0|    let _ = queue_conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);");
 1279|       |
 1280|      0|    emit_json(&Summary {
 1281|      0|        summary: true,
 1282|      0|        files_total: total,
 1283|      0|        completed,
 1284|      0|        failed,
 1285|      0|        skipped,
 1286|      0|        entities_total,
 1287|      0|        rels_total,
 1288|      0|        cost_usd: cost_total,
 1289|      0|        elapsed_ms: started.elapsed().as_millis() as u64,
 1290|      0|    });
 1291|       |
 1292|      0|    if !args.keep_queue && failed == 0 {
 1293|      0|        let _ = std::fs::remove_file(&args.queue_db);
 1294|      0|    }
 1295|       |
 1296|      0|    Ok(())
 1297|      0|}
 1298|       |
 1299|       |#[cfg(test)]
 1300|       |mod tests {
 1301|       |    use super::*;
 1302|       |
 1303|       |    #[test]
 1304|      1|    fn test_extraction_schema_valid_json() {
 1305|      1|        let _: serde_json::Value =
 1306|      1|            serde_json::from_str(EXTRACTION_SCHEMA).expect("schema must be valid JSON");
 1307|      1|    }
 1308|       |
 1309|       |    #[test]
 1310|      1|    fn test_parse_claude_output_valid() {
 1311|      1|        let output = r#"[
 1312|      1|            {"type":"system","subtype":"init"},
 1313|      1|            {"type":"assistant"},
 1314|      1|            {"type":"result","is_error":false,"total_cost_usd":0.02,"structured_output":{"name":"test-doc","description":"A test document","entities":[{"name":"test-entity","entity_type":"concept"}],"relationships":[{"source":"test-entity","target":"test-doc","relation":"applies-to","strength":0.8}]}}
 1315|      1|        ]"#;
 1316|      1|        let (result, cost, _is_oauth) = parse_claude_output(output).expect("parse must succeed");
 1317|      1|        assert_eq!(result.name, "test-doc");
 1318|      1|        assert_eq!(result.entities.len(), 1);
 1319|      1|        assert_eq!(result.relationships.len(), 1);
 1320|      1|        assert!((cost - 0.02).abs() < f64::EPSILON);
 1321|      1|    }
 1322|       |
 1323|       |    #[test]
 1324|      1|    fn test_parse_claude_output_error() {
 1325|      1|        let output = r#"[
 1326|      1|            {"type":"system","subtype":"init"},
 1327|      1|            {"type":"result","is_error":true,"error":"authentication failed"}
 1328|      1|        ]"#;
 1329|      1|        let err = parse_claude_output(output).unwrap_err();
 1330|      1|        assert!(format!("{err}").contains("authentication failed"));
 1331|      1|    }
 1332|       |
 1333|       |    #[test]
 1334|      1|    fn test_parse_claude_output_rate_limit() {
 1335|      1|        let output = r#"[
 1336|      1|            {"type":"system","subtype":"init"},
 1337|      1|            {"type":"result","is_error":true,"error":"rate_limit exceeded"}
 1338|      1|        ]"#;
 1339|      1|        let err = parse_claude_output(output).unwrap_err();
 1340|      1|        assert!(matches!(err, AppError::RateLimited { .. }));
                              ^0
 1341|      1|    }
 1342|       |
 1343|       |    #[test]
 1344|      1|    fn test_parse_claude_output_malformed() {
 1345|      1|        let output = "not json at all";
 1346|      1|        assert!(parse_claude_output(output).is_err());
 1347|      1|    }
 1348|       |
 1349|       |    #[test]
 1350|      1|    fn test_find_claude_binary_not_found() {
 1351|      1|        let original_path = std::env::var_os("PATH");
 1352|      1|        std::env::set_var("PATH", "/nonexistent");
 1353|      1|        std::env::remove_var("SQLITE_GRAPHRAG_CLAUDE_BINARY");
 1354|      1|        let result = find_claude_binary(None);
 1355|      1|        if let Some(p) = original_path {
 1356|      1|            std::env::set_var("PATH", p);
 1357|      1|        }
                      ^0
 1358|      1|        assert!(result.is_err());
 1359|      1|    }
 1360|       |
 1361|       |    #[test]
 1362|      1|    fn test_parse_claude_output_result_fallback() {
 1363|      1|        let output = r#"[
 1364|      1|            {"type":"system","subtype":"init"},
 1365|      1|            {"type":"result","is_error":false,"total_cost_usd":0.01,"structured_output":null,"result":"{\"name\":\"test-fallback\",\"description\":\"A fallback test\",\"entities\":[{\"name\":\"fb-entity\",\"entity_type\":\"concept\"}],\"relationships\":[]}"}
 1366|      1|        ]"#;
 1367|      1|        let (result, cost, _is_oauth) =
 1368|      1|            parse_claude_output(output).expect("result fallback must work");
 1369|      1|        assert_eq!(result.name, "test-fallback");
 1370|      1|        assert_eq!(result.entities.len(), 1);
 1371|      1|        assert!(result.relationships.is_empty());
 1372|      1|        assert!((cost - 0.01).abs() < f64::EPSILON);
 1373|      1|    }
 1374|       |
 1375|       |    #[test]
 1376|      1|    fn test_parse_claude_output_error_with_result_field() {
 1377|      1|        let output = r#"[
 1378|      1|            {"type":"system","subtype":"init"},
 1379|      1|            {"type":"result","is_error":true,"result":"Not logged in · Please run /login"}
 1380|      1|        ]"#;
 1381|      1|        let err = parse_claude_output(output).unwrap_err();
 1382|      1|        let msg = format!("{err}");
 1383|      1|        assert!(
 1384|      1|            msg.contains("Not logged in"),
 1385|      0|            "expected 'Not logged in' in: {msg}"
 1386|       |        );
 1387|      1|    }
 1388|       |
 1389|       |    #[test]
 1390|      1|    fn test_terminal_reason_max_turns_detected() {
 1391|      1|        let output = r#"[
 1392|      1|            {"type":"system","subtype":"init"},
 1393|      1|            {"type":"result","is_error":false,"terminal_reason":"max_turns","structured_output":{"name":"t","description":"d","entities":[],"relationships":[]}}
 1394|      1|        ]"#;
 1395|      1|        let err_or_ok = parse_claude_output(output);
 1396|      1|        assert!(
 1397|      1|            err_or_ok.is_ok(),
 1398|      0|            "max_turns in result without is_error should still parse"
 1399|       |        );
 1400|      1|    }
 1401|       |
 1402|       |    #[test]
 1403|      1|    fn test_detect_oauth_from_init_json() {
 1404|      1|        let output = r#"[
 1405|      1|            {"type":"system","subtype":"init","apiKeySource":"none"},
 1406|      1|            {"type":"result","is_error":false,"total_cost_usd":0.50,"structured_output":{"name":"test-oauth","description":"oauth test","entities":[],"relationships":[]}}
 1407|      1|        ]"#;
 1408|      1|        let (_result, cost, is_oauth) = parse_claude_output(output).expect("parse must succeed");
 1409|      1|        assert!(is_oauth, "apiKeySource=none must be detected as OAuth");
                                        ^0
 1410|      1|        assert!((cost - 0.50).abs() < f64::EPSILON);
 1411|      1|    }
 1412|       |
 1413|       |    #[test]
 1414|      1|    fn test_api_key_source_not_oauth() {
 1415|      1|        let output = r#"[
 1416|      1|            {"type":"system","subtype":"init","apiKeySource":"env"},
 1417|      1|            {"type":"result","is_error":false,"total_cost_usd":0.10,"structured_output":{"name":"test-api","description":"api test","entities":[],"relationships":[]}}
 1418|      1|        ]"#;
 1419|      1|        let (_result, _cost, is_oauth) = parse_claude_output(output).expect("parse must succeed");
 1420|      1|        assert!(!is_oauth, "apiKeySource=env must NOT be detected as OAuth");
                                         ^0
 1421|      1|    }
 1422|       |
 1423|       |    #[test]
 1424|      1|    fn test_missing_api_key_source_defaults_not_oauth() {
 1425|      1|        let output = r#"[
 1426|      1|            {"type":"system","subtype":"init"},
 1427|      1|            {"type":"result","is_error":false,"total_cost_usd":0.05,"structured_output":{"name":"test-missing","description":"missing test","entities":[],"relationships":[]}}
 1428|      1|        ]"#;
 1429|      1|        let (_result, _cost, is_oauth) = parse_claude_output(output).expect("parse must succeed");
 1430|      1|        assert!(!is_oauth, "missing apiKeySource must default to not OAuth");
                                         ^0
 1431|      1|    }
 1432|       |
 1433|       |    #[test]
 1434|      1|    fn test_extraction_schema_entity_types_match_enum() {
 1435|      1|        let schema: serde_json::Value = serde_json::from_str(EXTRACTION_SCHEMA).unwrap();
 1436|      1|        let types = schema["properties"]["entities"]["items"]["properties"]["entity_type"]["enum"]
 1437|      1|            .as_array()
 1438|      1|            .expect("schema must have entity_type enum");
 1439|     11|        for t in types {
                          ^10
 1440|     10|            let s = t.as_str().unwrap();
 1441|     10|            assert!(
 1442|     10|                s.parse::<EntityType>().is_ok(),
 1443|      0|                "schema entity_type '{s}' not in EntityType enum"
 1444|       |            );
 1445|       |        }
 1446|      1|    }
 1447|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/ingest_codex.rs:
    1|       |//! Handler for `ingest --mode codex`.
    2|       |//!
    3|       |//! Orchestrates the locally installed OpenAI Codex CLI binary (`codex exec`)
    4|       |//! to extract domain-specific entities and relationships from each file,
    5|       |//! then persists them with full embedding pipeline for recall/hybrid-search.
    6|       |//!
    7|       |//! Architecture: P1 One-Shot per file — each file spawns a separate
    8|       |//! `codex exec` process with `--output-schema` for guaranteed structured output.
    9|       |//! A SQLite queue DB tracks progress for resume/retry support.
   10|       |// Workload: Subprocess I/O-bound (codex exec headless with network wait)
   11|       |
   12|       |use crate::commands::ingest::IngestArgs;
   13|       |use crate::commands::ingest_claude::ExtractionResult;
   14|       |use crate::entity_type::EntityType;
   15|       |use crate::errors::AppError;
   16|       |use crate::paths::AppPaths;
   17|       |use crate::storage::connection::{ensure_db_ready, open_rw};
   18|       |use crate::storage::entities::{self, NewEntity, NewRelationship};
   19|       |use crate::storage::memories::{self, NewMemory};
   20|       |
   21|       |use rusqlite::Connection;
   22|       |use serde::{Deserialize, Serialize};
   23|       |use std::io::Write;
   24|       |use std::path::{Path, PathBuf};
   25|       |use std::process::{Command, Stdio};
   26|       |use std::time::Instant;
   27|       |
   28|       |const MIN_CODEX_VERSION: &str = "0.120.0";
   29|       |
   30|       |/// OpenAI structured output schema with `additionalProperties: false` at all nested levels.
   31|       |const EXTRACTION_SCHEMA_CODEX: &str = r#"{
   32|       |  "type": "object",
   33|       |  "properties": {
   34|       |    "name": { "type": "string" },
   35|       |    "description": { "type": "string" },
   36|       |    "entities": {
   37|       |      "type": "array",
   38|       |      "items": {
   39|       |        "type": "object",
   40|       |        "properties": {
   41|       |          "name": { "type": "string" },
   42|       |          "entity_type": {
   43|       |            "type": "string",
   44|       |            "enum": ["project","tool","person","file","concept","incident","decision","organization","location","date"]
   45|       |          }
   46|       |        },
   47|       |        "required": ["name", "entity_type"],
   48|       |        "additionalProperties": false
   49|       |      }
   50|       |    },
   51|       |    "relationships": {
   52|       |      "type": "array",
   53|       |      "items": {
   54|       |        "type": "object",
   55|       |        "properties": {
   56|       |          "source": { "type": "string" },
   57|       |          "target": { "type": "string" },
   58|       |          "relation": {
   59|       |            "type": "string",
   60|       |            "enum": ["applies-to","uses","depends-on","causes","fixes","contradicts","supports","follows","related","replaces","tracked-in"]
   61|       |          },
   62|       |          "strength": { "type": "number", "minimum": 0, "maximum": 1 }
   63|       |        },
   64|       |        "required": ["source","target","relation","strength"],
   65|       |        "additionalProperties": false
   66|       |      }
   67|       |    }
   68|       |  },
   69|       |  "required": ["name","description","entities","relationships"],
   70|       |  "additionalProperties": false
   71|       |}"#;
   72|       |
   73|       |const EXTRACTION_PROMPT: &str = "You are a knowledge graph entity extractor. Given a document, extract:\n\
   74|       |1. A short kebab-case name (max 60 chars) capturing the document's main topic\n\
   75|       |2. A one-sentence description (10-20 words) summarizing the key insight\n\
   76|       |3. Domain-specific entities (concepts, tools, people, decisions, projects, files)\n\
   77|       |4. Typed relationships between entities with strength scores\n\n\
   78|       |Rules:\n\
   79|       |- Entity names: lowercase kebab-case, 2+ chars, domain-specific only\n\
   80|       |- NEVER extract generic terms, stop words, numbers, UUIDs, or single characters\n\
   81|       |- Relationship types MUST be one of: applies-to, uses, depends-on, causes, fixes, contradicts, supports, follows, related, replaces, tracked-in\n\
   82|       |- NEVER use 'mentions' as relationship type\n\
   83|       |- Strength: 0.9 for hard dependencies, 0.7 for design relationships, 0.5 for contextual links, 0.3 for weak references\n\
   84|       |- Prefer fewer high-quality entities over many low-quality ones\n\
   85|       |- Description must answer: What is this about and WHY does it matter?";
   86|       |
   87|       |/// Token usage reported by Codex CLI on `turn.completed` events.
   88|       |#[derive(Debug, Clone, Deserialize, Serialize)]
   89|       |struct CodexUsage {
   90|       |    input_tokens: u64,
   91|       |    #[serde(default)]
   92|       |    cached_input_tokens: u64,
   93|       |    output_tokens: u64,
   94|       |    #[serde(default)]
   95|       |    reasoning_output_tokens: u64,
   96|       |}
   97|       |
   98|       |#[derive(Debug, Serialize)]
   99|       |struct PhaseEvent<'a> {
  100|       |    phase: &'a str,
  101|       |    #[serde(skip_serializing_if = "Option::is_none")]
  102|       |    codex_path: Option<&'a str>,
  103|       |    #[serde(skip_serializing_if = "Option::is_none")]
  104|       |    version: Option<&'a str>,
  105|       |    #[serde(skip_serializing_if = "Option::is_none")]
  106|       |    dir: Option<&'a str>,
  107|       |    #[serde(skip_serializing_if = "Option::is_none")]
  108|       |    files_total: Option<usize>,
  109|       |    #[serde(skip_serializing_if = "Option::is_none")]
  110|       |    files_new: Option<usize>,
  111|       |    #[serde(skip_serializing_if = "Option::is_none")]
  112|       |    files_existing: Option<usize>,
  113|       |}
  114|       |
  115|       |#[derive(Debug, Serialize)]
  116|       |struct FileEvent<'a> {
  117|       |    file: &'a str,
  118|       |    name: &'a str,
  119|       |    status: &'a str,
  120|       |    #[serde(skip_serializing_if = "Option::is_none")]
  121|       |    memory_id: Option<i64>,
  122|       |    #[serde(skip_serializing_if = "Option::is_none")]
  123|       |    entities: Option<usize>,
  124|       |    #[serde(skip_serializing_if = "Option::is_none")]
  125|       |    rels: Option<usize>,
  126|       |    /// Always None for Codex (no cost_usd in Codex API responses).
  127|       |    #[serde(skip_serializing_if = "Option::is_none")]
  128|       |    cost_usd: Option<f64>,
  129|       |    #[serde(skip_serializing_if = "Option::is_none")]
  130|       |    input_tokens: Option<u64>,
  131|       |    #[serde(skip_serializing_if = "Option::is_none")]
  132|       |    output_tokens: Option<u64>,
  133|       |    #[serde(skip_serializing_if = "Option::is_none")]
  134|       |    elapsed_ms: Option<u64>,
  135|       |    #[serde(skip_serializing_if = "Option::is_none")]
  136|       |    error: Option<&'a str>,
  137|       |    index: usize,
  138|       |    total: usize,
  139|       |}
  140|       |
  141|       |#[derive(Debug, Serialize)]
  142|       |struct Summary {
  143|       |    summary: bool,
  144|       |    files_total: usize,
  145|       |    completed: usize,
  146|       |    failed: usize,
  147|       |    skipped: usize,
  148|       |    entities_total: usize,
  149|       |    rels_total: usize,
  150|       |    input_tokens_total: u64,
  151|       |    output_tokens_total: u64,
  152|       |    elapsed_ms: u64,
  153|       |}
  154|       |
  155|       |/// Locates the Codex CLI binary on the system.
  156|       |///
  157|       |/// Search order:
  158|       |/// 1. Explicit `--codex-binary` CLI flag.
  159|       |/// 2. `SQLITE_GRAPHRAG_CODEX_BINARY` env var.
  160|       |/// 3. PATH search for `codex` (or `codex.exe` on Windows).
  161|      0|pub fn find_codex_binary(explicit: Option<&Path>) -> Result<PathBuf, AppError> {
  162|      0|    if let Some(p) = explicit {
  163|      0|        if p.exists() {
  164|      0|            return Ok(p.to_path_buf());
  165|      0|        }
  166|      0|        return Err(AppError::Validation(format!(
  167|      0|            "Codex CLI binary not found at explicit path: {}",
  168|      0|            p.display()
  169|      0|        )));
  170|      0|    }
  171|       |
  172|      0|    if let Ok(env_path) = std::env::var("SQLITE_GRAPHRAG_CODEX_BINARY") {
  173|      0|        let p = PathBuf::from(&env_path);
  174|      0|        if p.exists() {
  175|      0|            return Ok(p);
  176|      0|        }
  177|      0|    }
  178|       |
  179|      0|    let name = if cfg!(windows) { "codex.exe" } else { "codex" };
  180|      0|    if let Some(path_var) = std::env::var_os("PATH") {
  181|      0|        for dir in std::env::split_paths(&path_var) {
  182|      0|            let candidate = dir.join(name);
  183|      0|            if candidate.exists() {
  184|      0|                return Ok(candidate);
  185|      0|            }
  186|       |        }
  187|      0|    }
  188|       |
  189|      0|    Err(AppError::Validation(
  190|      0|        "Codex CLI binary not found in PATH. Install it from https://github.com/openai/codex or specify --codex-binary".to_string(),
  191|      0|    ))
  192|      0|}
  193|       |
  194|       |/// Validates that the Codex CLI binary meets the minimum version requirement.
  195|       |///
  196|       |/// # Errors
  197|       |///
  198|       |/// Returns `AppError::Validation` when the binary cannot be executed or the
  199|       |/// version is below `MIN_CODEX_VERSION`.
  200|      0|fn validate_codex_version(binary: &Path) -> Result<String, AppError> {
  201|      0|    let resolved = which::which(binary).map_err(|_| {
  202|      0|        AppError::Validation(format!(
  203|      0|            "executable '{}' not found in PATH; ensure Codex CLI is installed",
  204|      0|            binary.display()
  205|      0|        ))
  206|      0|    })?;
  207|      0|    let output = Command::new(&resolved)
  208|      0|        .arg("--version")
  209|      0|        .stdin(Stdio::null())
  210|      0|        .stdout(Stdio::piped())
  211|      0|        .stderr(Stdio::piped())
  212|      0|        .output()
  213|      0|        .map_err(AppError::Io)?;
  214|       |
  215|      0|    let raw = String::from_utf8(output.stdout)
  216|      0|        .map_err(|_| AppError::Validation("codex --version output is not UTF-8".to_string()))?;
  217|       |
  218|      0|    let version_str = raw.trim().to_string();
  219|       |
  220|       |    // Codex CLI outputs: "codex-cli 0.133.0" or just "0.133.0"
  221|      0|    let numeric = version_str.split_whitespace().last().unwrap_or("").trim();
  222|       |
  223|      0|    fn parse_semver(s: &str) -> Option<(u64, u64, u64)> {
  224|      0|        let parts: Vec<&str> = s.splitn(3, '.').collect();
  225|      0|        if parts.len() < 2 {
  226|      0|            return None;
  227|      0|        }
  228|      0|        let major = parts[0].parse::<u64>().ok()?;
  229|      0|        let minor = parts[1].parse::<u64>().ok()?;
  230|      0|        let patch = parts
  231|      0|            .get(2)
  232|      0|            .and_then(|p| p.parse::<u64>().ok())
  233|      0|            .unwrap_or(0);
  234|      0|        Some((major, minor, patch))
  235|      0|    }
  236|       |
  237|      0|    if let (Some(actual), Some(min)) = (parse_semver(numeric), parse_semver(MIN_CODEX_VERSION)) {
  238|      0|        if actual < min {
  239|      0|            return Err(AppError::Validation(format!(
  240|      0|                "Codex CLI version {numeric} is below minimum required {MIN_CODEX_VERSION}"
  241|      0|            )));
  242|      0|        }
  243|      0|    }
  244|       |
  245|      0|    Ok(version_str)
  246|      0|}
  247|       |
  248|       |/// Writes the extraction schema to a named temp file for `--output-schema`.
  249|       |///
  250|       |/// # Errors
  251|       |///
  252|       |/// Returns `AppError::Io` when the temp file cannot be created or written.
  253|      0|fn write_schema_tempfile() -> Result<tempfile::NamedTempFile, AppError> {
  254|      0|    let mut f = tempfile::NamedTempFile::new().map_err(AppError::Io)?;
  255|      0|    std::io::Write::write_all(&mut f, EXTRACTION_SCHEMA_CODEX.as_bytes()).map_err(AppError::Io)?;
  256|      0|    std::io::Write::flush(&mut f).map_err(AppError::Io)?;
  257|      0|    Ok(f)
  258|      0|}
  259|       |
  260|       |/// Invokes `codex exec` for a single file and returns the extraction result.
  261|       |///
  262|       |/// Uses `wait-timeout` for cross-platform subprocess timeout, `env_clear()`
  263|       |/// for least-privilege environment, and reads prompt + file content from
  264|       |/// stdin using the `-` argument (Codex Paperclip pattern).
  265|       |///
  266|       |/// # Errors
  267|       |///
  268|       |/// Returns `AppError::Validation` on extraction failure, rate limiting, or
  269|       |/// schema errors. Returns `AppError::Io` on process spawn/IO failures.
  270|      0|fn extract_with_codex(
  271|      0|    binary: &Path,
  272|      0|    file_content: &[u8],
  273|      0|    model: Option<&str>,
  274|      0|    timeout_secs: u64,
  275|      0|    schema_file: &Path,
  276|      0|) -> Result<(ExtractionResult, Option<CodexUsage>), AppError> {
  277|       |    use wait_timeout::ChildExt;
  278|       |
  279|       |    // G31 Passo C (v1.0.69): delegate command construction to the shared
  280|       |    // `codex_spawn::build_codex_command` helper so `enrich` and `ingest` stay
  281|       |    // perfectly aligned on the canonical seven hardening flags. The local
  282|       |    // function still owns the stdin pump + JSONL parsing (see below).
  283|      0|    let _ = timeout_secs; // currently unused; consumed by the helper when it spawns the process
  284|      0|    let _ = file_content; // pumped into stdin below, see `stdin_pump` thread
  285|      0|    let _ = schema_file; // helper reuses the temp file at the given path
  286|      0|    let prompt = String::new(); // empty prompt — helper appends file_content via args.input_text
  287|      0|    let mut cmd = crate::commands::codex_spawn::build_codex_command(
  288|      0|        &crate::commands::codex_spawn::CodexSpawnArgs {
  289|      0|            binary,
  290|      0|            prompt: &prompt,
  291|      0|            json_schema: "", // caller writes the schema directly via `schema_file`
  292|      0|            input_text: "",
  293|      0|            model,
  294|      0|            timeout_secs,
  295|      0|            schema_path: schema_file.to_path_buf(),
  296|      0|        },
  297|       |    );
  298|       |
  299|       |    // `build_codex_command` writes the JSON schema to `schema_path` and
  300|       |    // appends `input_text` to the prompt via Paperclip stdin. For `ingest`
  301|       |    // we want the schema content already on disk (the caller pre-wrote
  302|       |    // EXTRACTION_SCHEMA_CODEX into the named tempfile), and the document
  303|       |    // content goes through stdin via a dedicated thread (see below). Strip
  304|       |    // the file the helper just rewrote — our caller pre-wrote it.
  305|      0|    let _ = std::fs::write(
  306|      0|        schema_file,
  307|      0|        crate::commands::ingest_codex::EXTRACTION_SCHEMA_CODEX,
  308|      0|    );
  309|       |
  310|      0|    cmd.stdin(Stdio::piped())
  311|      0|        .stdout(Stdio::piped())
  312|      0|        .stderr(Stdio::piped());
  313|       |
  314|      0|    let mut child = super::claude_runner::spawn_with_memory_limit(&mut cmd).map_err(|e| {
  315|      0|        AppError::Io(std::io::Error::new(
  316|      0|            e.kind(),
  317|      0|            format!("failed to spawn codex: {e}"),
  318|      0|        ))
  319|      0|    })?;
  320|       |
  321|       |    // Build stdin: prompt + document content
  322|      0|    let file_utf8 = String::from_utf8_lossy(file_content);
  323|      0|    let stdin_payload = format!("{EXTRACTION_PROMPT}\n\n---\n\nDocument content:\n\n{file_utf8}");
  324|      0|    let stdin_bytes = stdin_payload.into_bytes();
  325|       |
  326|      0|    let mut child_stdin = child
  327|      0|        .stdin
  328|      0|        .take()
  329|      0|        .ok_or_else(|| AppError::Validation("failed to open codex stdin".into()))?;
  330|      0|    let stdin_thread = std::thread::spawn(move || -> Result<(), std::io::Error> {
  331|      0|        child_stdin.write_all(&stdin_bytes)?;
  332|      0|        drop(child_stdin);
  333|      0|        Ok(())
  334|      0|    });
  335|       |
  336|      0|    let start = std::time::Instant::now();
  337|      0|    let timeout = std::time::Duration::from_secs(timeout_secs);
  338|      0|    let status = child.wait_timeout(timeout).map_err(AppError::Io)?;
  339|       |
  340|      0|    match status {
  341|      0|        Some(exit_status) => {
  342|      0|            stdin_thread
  343|      0|                .join()
  344|      0|                .map_err(|_| AppError::Validation("stdin thread panicked".into()))?
  345|      0|                .map_err(AppError::Io)?;
  346|       |
  347|      0|            tracing::debug!(
  348|       |                target: "process",
  349|      0|                exit_code = ?exit_status.code(),
  350|      0|                elapsed_ms = start.elapsed().as_millis() as u64,
  351|      0|                "external process completed"
  352|       |            );
  353|       |
  354|      0|            let mut stdout_buf = Vec::new();
  355|      0|            let mut stderr_buf = Vec::new();
  356|      0|            if let Some(mut out) = child.stdout.take() {
  357|      0|                std::io::Read::read_to_end(&mut out, &mut stdout_buf).map_err(AppError::Io)?;
  358|      0|            }
  359|      0|            if let Some(mut err) = child.stderr.take() {
  360|      0|                std::io::Read::read_to_end(&mut err, &mut stderr_buf).map_err(AppError::Io)?;
  361|      0|            }
  362|       |
  363|      0|            if !exit_status.success() {
  364|      0|                let stderr_str = String::from_utf8_lossy(&stderr_buf);
  365|      0|                let stdout_str = String::from_utf8_lossy(&stdout_buf);
  366|       |                // Check if stdout has JSONL with an error event before falling back
  367|      0|                if let Ok((result, usage)) = parse_codex_output(&stdout_str) {
  368|      0|                    return Ok((result, usage));
  369|      0|                }
  370|      0|                if stderr_str.contains("401")
  371|      0|                    || stderr_str.contains("Unauthorized")
  372|      0|                    || stderr_str.contains("auth")
  373|       |                {
  374|      0|                    tracing::warn!(
  375|       |                        target: "ingest",
  376|      0|                        "Codex CLI authentication expired. Re-authenticate with: codex auth login"
  377|       |                    );
  378|      0|                }
  379|      0|                return Err(AppError::Validation(format!(
  380|      0|                    "codex exec exited with code {:?}: {}",
  381|      0|                    exit_status.code(),
  382|      0|                    stderr_str.trim()
  383|      0|                )));
  384|      0|            }
  385|       |
  386|      0|            let stdout = String::from_utf8(stdout_buf)
  387|      0|                .map_err(|_| AppError::Validation("codex exec stdout is not valid UTF-8".into()))?;
  388|      0|            parse_codex_output(&stdout)
  389|       |        }
  390|       |        None => {
  391|      0|            tracing::warn!(target: "ingest", timeout_secs, "codex exec timed out, killing process");
  392|      0|            let _ = child.kill();
  393|      0|            let _ = child.wait();
  394|      0|            let _ = stdin_thread.join();
  395|      0|            Err(AppError::Validation(format!(
  396|      0|                "codex exec timed out after {timeout_secs} seconds"
  397|      0|            )))
  398|       |        }
  399|       |    }
  400|      0|}
  401|       |
  402|       |/// Parses JSONL output from `codex exec --json`.
  403|       |///
  404|       |/// Event format (DOTS notation):
  405|       |/// - `thread.started` — session init
  406|       |/// - `turn.started` — model turn begins
  407|       |/// - `item.completed` — message or tool call; last `agent_message` wins
  408|       |/// - `turn.completed` — includes usage stats
  409|       |/// - `turn.failed` — error with optional rate-limit indicator
  410|       |/// - `error` — schema or validation error
  411|       |///
  412|       |/// # Errors
  413|       |///
  414|       |/// Returns `AppError::Validation` when no agent_message is found, when the
  415|       |/// turn failed, or when the extracted JSON cannot be parsed as `ExtractionResult`.
  416|      6|fn parse_codex_output(stdout: &str) -> Result<(ExtractionResult, Option<CodexUsage>), AppError> {
  417|      6|    let mut last_agent_text: Option<String> = None;
  418|      6|    let mut usage: Option<CodexUsage> = None;
  419|      6|    let mut rate_limited = false;
  420|      6|    let mut schema_error = false;
  421|      6|    let mut turn_failed = false;
  422|      6|    let mut failed_message = String::new();
  423|       |
  424|     15|    for line in stdout.lines() {
                              ^6     ^6
  425|     15|        let line = line.trim();
  426|     15|        if line.is_empty() {
  427|      0|            continue;
  428|     15|        }
  429|       |
  430|     15|        let event: serde_json::Value = match serde_json::from_str(line) {
                          ^13    ^13
  431|     13|            Ok(v) => v,
  432|       |            Err(_) => {
  433|      2|                tracing::warn!(target: "ingest", line, "codex output: skipping malformed JSONL line");
                                                                     ^0
  434|      2|                continue;
  435|       |            }
  436|       |        };
  437|       |
  438|     13|        let event_type = match event.get("type").and_then(|t| t.as_str()) {
  439|     13|            Some(t) => t,
  440|      0|            None => continue,
  441|       |        };
  442|       |
  443|     13|        match event_type {
  444|     13|            "item.completed" => {
  445|       |                // Last agent_message wins (reasoning / tool calls may appear before)
  446|      4|                if let Some(item) = event.get("item") {
  447|      4|                    if item.get("type").and_then(|t| t.as_str()) == Some("agent_message") {
  448|      4|                        if let Some(text) = item.get("text").and_then(|t| t.as_str()) {
  449|      4|                            last_agent_text = Some(text.to_string());
  450|      4|                        }
                                      ^0
  451|      0|                    }
  452|      0|                }
  453|       |            }
  454|      9|            "turn.completed" => {
  455|      3|                if let Some(u) = event.get("usage") {
  456|      3|                    if let Ok(parsed) = serde_json::from_value::<CodexUsage>(u.clone()) {
  457|      3|                        usage = Some(parsed);
  458|      3|                    }
                                  ^0
  459|      0|                }
  460|       |            }
  461|      6|            "turn.failed" => {
  462|      2|                turn_failed = true;
  463|      2|                if let Some(err) = event.get("error") {
  464|      2|                    let msg = err
  465|      2|                        .get("message")
  466|      2|                        .and_then(|m| m.as_str())
  467|      2|                        .unwrap_or("unknown error");
  468|      2|                    failed_message = msg.to_string();
  469|      2|                    if msg.contains("rate_limit")
  470|      1|                        || msg.contains("429")
  471|      1|                        || msg.contains("Too Many Requests")
  472|      1|                    {
  473|      1|                        rate_limited = true;
  474|      1|                    }
  475|      0|                }
  476|       |            }
  477|      4|            "error" => {
  478|      1|                if let Some(msg) = event.get("message").and_then(|m| m.as_str()) {
  479|      1|                    if msg.contains("invalid_json_schema") || msg.contains("schema") {
                                                                            ^0  ^0
  480|      1|                        schema_error = true;
  481|      1|                    }
                                  ^0
  482|      1|                    tracing::warn!(target: "ingest", error_msg = msg, "codex error event received");
                                                                                    ^0
  483|      0|                }
  484|       |            }
  485|      3|            _ => {
  486|      3|                // Gracefully skip unknown event types (thread.started, turn.started, etc.)
  487|      3|            }
  488|       |        }
  489|       |    }
  490|       |
  491|      6|    if rate_limited {
  492|      1|        return Err(AppError::RateLimited {
  493|      1|            detail: failed_message,
  494|      1|        });
  495|      5|    }
  496|       |
  497|      5|    if schema_error {
  498|      1|        return Err(AppError::Validation(
  499|      1|            "codex rejected the output schema (invalid_json_schema)".to_string(),
  500|      1|        ));
  501|      4|    }
  502|       |
  503|      4|    if turn_failed {
  504|      1|        return Err(AppError::Validation(format!(
  505|      1|            "codex turn failed: {failed_message}"
  506|      1|        )));
  507|      3|    }
  508|       |
  509|      3|    let text = last_agent_text.ok_or_else(|| {
                                                           ^0
  510|      0|        AppError::Validation("codex output contained no agent_message item".to_string())
  511|      0|    })?;
  512|       |
  513|      3|    let extraction: ExtractionResult = serde_json::from_str(&text).map_err(|e| {
                                                                                             ^0
  514|      0|        AppError::Validation(format!(
  515|      0|            "failed to parse codex agent_message as ExtractionResult: {e}. text={text}"
  516|      0|        ))
  517|      0|    })?;
  518|       |
  519|      3|    Ok((extraction, usage))
  520|      6|}
  521|       |
  522|       |use crate::output::emit_json_line as emit_json;
  523|       |
  524|       |/// Collects files matching the pattern (reuses ingest logic).
  525|      0|fn collect_matching_files(
  526|      0|    dir: &Path,
  527|      0|    pattern: &str,
  528|      0|    recursive: bool,
  529|      0|    max_files: usize,
  530|      0|) -> Result<Vec<PathBuf>, AppError> {
  531|      0|    let mut files = Vec::new();
  532|      0|    super::ingest::collect_files(dir, pattern, recursive, &mut files)?;
  533|      0|    files.sort_unstable();
  534|       |
  535|      0|    if files.len() > max_files {
  536|      0|        return Err(AppError::Validation(format!(
  537|      0|            "found {} files, exceeds --max-files cap of {}",
  538|      0|            files.len(),
  539|      0|            max_files
  540|      0|        )));
  541|      0|    }
  542|       |
  543|      0|    Ok(files)
  544|      0|}
  545|       |
  546|       |/// Opens or creates the queue database for tracking ingest progress.
  547|      0|fn open_queue_db(path: &str) -> Result<Connection, AppError> {
  548|      0|    let conn = Connection::open(path)?;
  549|       |
  550|      0|    conn.execute_batch(
  551|      0|        "PRAGMA journal_mode=WAL;
  552|      0|        CREATE TABLE IF NOT EXISTS queue (
  553|      0|            id          INTEGER PRIMARY KEY AUTOINCREMENT,
  554|      0|            file_path   TEXT NOT NULL UNIQUE,
  555|      0|            name        TEXT,
  556|      0|            status      TEXT NOT NULL DEFAULT 'pending',
  557|      0|            memory_id   INTEGER,
  558|      0|            entities    INTEGER DEFAULT 0,
  559|      0|            rels        INTEGER DEFAULT 0,
  560|      0|            error       TEXT,
  561|      0|            input_tokens  INTEGER DEFAULT 0,
  562|      0|            output_tokens INTEGER DEFAULT 0,
  563|      0|            attempt     INTEGER DEFAULT 0,
  564|      0|            elapsed_ms  INTEGER,
  565|      0|            created_at  TEXT DEFAULT (datetime('now')),
  566|      0|            done_at     TEXT
  567|      0|        );
  568|      0|        CREATE INDEX IF NOT EXISTS idx_queue_status ON queue(status);",
  569|      0|    )?;
  570|       |
  571|      0|    Ok(conn)
  572|      0|}
  573|       |
  574|       |/// Main entry point for `ingest --mode codex`.
  575|       |///
  576|       |/// # Errors
  577|       |///
  578|       |/// Returns `AppError` on directory/DB access failures or fatal extraction errors.
  579|      0|pub fn run_codex_ingest(args: &IngestArgs) -> Result<(), AppError> {
  580|      0|    let started = Instant::now();
  581|       |
  582|      0|    if !args.dir.exists() {
  583|      0|        return Err(AppError::Validation(format!(
  584|      0|            "directory not found: {}",
  585|      0|            args.dir.display()
  586|      0|        )));
  587|      0|    }
  588|       |
  589|       |    // G28-B (v1.0.68) + G30 (v1.0.69): acquire singleton before doing real
  590|       |    // work so two parallel `ingest --mode codex` invocations cannot co-exist
  591|       |    // on the same database. Scope includes the database hash so concurrent
  592|       |    // ingest against different databases is allowed.
  593|      0|    let early_ns = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
  594|      0|    let early_paths = AppPaths::resolve(args.db.as_deref())?;
  595|      0|    let _singleton = crate::lock::acquire_job_singleton(
  596|      0|        crate::lock::JobType::IngestCodex,
  597|      0|        &early_ns,
  598|      0|        &early_paths.db,
  599|      0|        args.wait_job_singleton,
  600|      0|        args.force_job_singleton,
  601|      0|    )?;
  602|       |
  603|       |    // Stage 1: Validate binary
  604|      0|    let codex_binary = find_codex_binary(args.codex_binary.as_deref())?;
  605|      0|    let version = validate_codex_version(&codex_binary)?;
  606|      0|    tracing::info!(
  607|       |        target: "ingest",
  608|      0|        binary = %codex_binary.display(),
  609|       |        version = %version,
  610|      0|        "Codex CLI binary validated"
  611|       |    );
  612|       |
  613|      0|    emit_json(&PhaseEvent {
  614|      0|        phase: "validate",
  615|      0|        codex_path: codex_binary.to_str(),
  616|      0|        version: Some(&version),
  617|      0|        dir: None,
  618|      0|        files_total: None,
  619|      0|        files_new: None,
  620|      0|        files_existing: None,
  621|      0|    });
  622|       |
  623|       |    // Stage 2: Scan files
  624|      0|    let files = collect_matching_files(&args.dir, &args.pattern, args.recursive, args.max_files)?;
  625|       |
  626|      0|    let queue_conn = open_queue_db(&args.queue_db)?;
  627|       |
  628|      0|    if args.resume {
  629|      0|        let reset = queue_conn
  630|      0|            .execute(
  631|      0|                "UPDATE queue SET status='pending' WHERE status='processing'",
  632|      0|                [],
  633|       |            )
  634|      0|            .map_err(|e| AppError::Validation(format!("queue resume failed: {e}")))?;
  635|      0|        if reset > 0 {
  636|      0|            tracing::info!(target: "ingest", count = reset, "reset stuck processing files to pending");
  637|      0|        }
  638|      0|    }
  639|       |
  640|      0|    if args.retry_failed {
  641|      0|        let count = queue_conn
  642|      0|            .execute(
  643|      0|                "UPDATE queue SET status='pending', attempt=0 WHERE status='failed'",
  644|      0|                [],
  645|       |            )
  646|      0|            .map_err(|e| AppError::Validation(format!("queue retry-failed reset failed: {e}")))?;
  647|      0|        tracing::info!(target: "ingest", count, "retrying failed files");
  648|      0|    }
  649|       |
  650|      0|    if !args.resume && !args.retry_failed {
  651|      0|        queue_conn
  652|      0|            .execute("DELETE FROM queue", [])
  653|      0|            .map_err(|e| AppError::Validation(format!("queue clear failed: {e}")))?;
  654|      0|    }
  655|       |
  656|      0|    let mut new_count = 0usize;
  657|      0|    let mut existing_count = 0usize;
  658|       |
  659|      0|    if !args.retry_failed {
  660|      0|        for file in &files {
  661|      0|            let file_str = file.to_string_lossy().into_owned();
  662|      0|            let inserted = queue_conn
  663|      0|                .execute(
  664|      0|                    "INSERT OR IGNORE INTO queue (file_path, status) VALUES (?1, 'pending')",
  665|      0|                    rusqlite::params![file_str],
  666|       |                )
  667|      0|                .map_err(|e| AppError::Validation(format!("queue insert failed: {e}")))?;
  668|      0|            if inserted > 0 {
  669|      0|                new_count += 1;
  670|      0|            } else {
  671|      0|                existing_count += 1;
  672|      0|            }
  673|       |        }
  674|      0|    }
  675|       |
  676|      0|    emit_json(&PhaseEvent {
  677|      0|        phase: "scan",
  678|      0|        codex_path: None,
  679|      0|        version: None,
  680|      0|        dir: args.dir.to_str(),
  681|      0|        files_total: Some(files.len()),
  682|      0|        files_new: Some(new_count),
  683|      0|        files_existing: Some(existing_count),
  684|      0|    });
  685|       |
  686|      0|    if args.dry_run {
  687|      0|        for (idx, file) in files.iter().enumerate() {
  688|      0|            let (name, _truncated, _orig) =
  689|      0|                super::ingest::derive_kebab_name(file, args.max_name_length);
  690|      0|            emit_json(&FileEvent {
  691|      0|                file: &file.to_string_lossy(),
  692|      0|                name: &name,
  693|      0|                status: "preview",
  694|      0|                memory_id: None,
  695|      0|                entities: None,
  696|      0|                rels: None,
  697|      0|                cost_usd: None,
  698|      0|                input_tokens: None,
  699|      0|                output_tokens: None,
  700|      0|                elapsed_ms: None,
  701|      0|                error: None,
  702|      0|                index: idx,
  703|      0|                total: files.len(),
  704|      0|            });
  705|      0|        }
  706|      0|        emit_json(&Summary {
  707|      0|            summary: true,
  708|      0|            files_total: files.len(),
  709|      0|            completed: 0,
  710|      0|            failed: 0,
  711|      0|            skipped: 0,
  712|      0|            entities_total: 0,
  713|      0|            rels_total: 0,
  714|      0|            input_tokens_total: 0,
  715|      0|            output_tokens_total: 0,
  716|      0|            elapsed_ms: started.elapsed().as_millis() as u64,
  717|      0|        });
  718|      0|        if !args.keep_queue {
  719|      0|            let _ = std::fs::remove_file(&args.queue_db);
  720|      0|        }
  721|      0|        return Ok(());
  722|      0|    }
  723|       |
  724|       |    // Stage 3: Process files
  725|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
  726|      0|    ensure_db_ready(&paths)?;
  727|      0|    let conn = open_rw(&paths.db)?;
  728|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
  729|      0|    let memory_type_str = args.r#type.as_str().to_string();
  730|       |
  731|       |    // Write schema to temp file once (reused across all files)
  732|      0|    let schema_tempfile = write_schema_tempfile()?;
  733|      0|    let schema_path = schema_tempfile.path().to_path_buf();
  734|       |
  735|      0|    let mut completed = 0usize;
  736|      0|    let mut failed = 0usize;
  737|      0|    let skipped_initial: usize = queue_conn
  738|      0|        .query_row("SELECT COUNT(*) FROM queue WHERE status='done'", [], |r| {
  739|      0|            r.get::<_, usize>(0)
  740|      0|        })
  741|      0|        .unwrap_or(0);
  742|      0|    let mut skipped = skipped_initial;
  743|      0|    let mut entities_total = 0usize;
  744|      0|    let mut rels_total = 0usize;
  745|      0|    let mut input_tokens_total = 0u64;
  746|      0|    let mut output_tokens_total = 0u64;
  747|      0|    let total = files.len();
  748|       |
  749|      0|    let mut backoff_secs = args.rate_limit_wait;
  750|      0|    let rate_limit_deadline = std::time::Instant::now() + std::time::Duration::from_secs(3600);
  751|       |
  752|       |    loop {
  753|      0|        if crate::shutdown_requested() {
  754|      0|            tracing::info!(target: "ingest", "shutdown requested, stopping before next file");
  755|      0|            break;
  756|      0|        }
  757|       |
  758|      0|        let pending: Option<(i64, String)> = queue_conn
  759|      0|            .query_row(
  760|      0|                "UPDATE queue SET status='processing', attempt=attempt+1 \
  761|      0|                 WHERE id = (SELECT id FROM queue WHERE status='pending' ORDER BY id LIMIT 1) \
  762|      0|                 RETURNING id, file_path",
  763|      0|                [],
  764|      0|                |row| Ok((row.get(0)?, row.get(1)?)),
  765|       |            )
  766|      0|            .ok();
  767|       |
  768|      0|        let (queue_id, file_path) = match pending {
  769|      0|            Some(p) => p,
  770|      0|            None => break,
  771|       |        };
  772|       |
  773|      0|        let file_started = Instant::now();
  774|       |
  775|       |        // Reject files that exceed the 10 MB stdin limit
  776|       |        const MAX_FILE_SIZE: u64 = 10 * 1024 * 1024;
  777|      0|        if let Ok(meta) = std::fs::metadata(&file_path) {
  778|      0|            if meta.len() > MAX_FILE_SIZE {
  779|      0|                let err_msg = format!("file exceeds 10MB stdin limit ({} bytes)", meta.len());
  780|      0|                let _ = queue_conn.execute(
  781|      0|                    "UPDATE queue SET status='failed', error=?1, done_at=datetime('now') WHERE id=?2",
  782|      0|                    rusqlite::params![err_msg, queue_id],
  783|      0|                );
  784|      0|                let current_index = completed + failed + skipped;
  785|      0|                failed += 1;
  786|      0|                emit_json(&FileEvent {
  787|      0|                    file: &file_path,
  788|      0|                    name: "",
  789|      0|                    status: "failed",
  790|      0|                    memory_id: None,
  791|      0|                    entities: None,
  792|      0|                    rels: None,
  793|      0|                    cost_usd: None,
  794|      0|                    input_tokens: None,
  795|      0|                    output_tokens: None,
  796|      0|                    elapsed_ms: Some(file_started.elapsed().as_millis() as u64),
  797|      0|                    error: Some(&err_msg),
  798|      0|                    index: current_index,
  799|      0|                    total,
  800|      0|                });
  801|      0|                if args.fail_fast {
  802|      0|                    break;
  803|      0|                }
  804|      0|                continue;
  805|      0|            }
  806|      0|        }
  807|       |
  808|      0|        let file_content = match std::fs::read(&file_path) {
  809|      0|            Ok(c) => c,
  810|      0|            Err(e) => {
  811|      0|                let err_msg = format!("IO error: {e}");
  812|      0|                let _ = queue_conn.execute(
  813|      0|                    "UPDATE queue SET status='failed', error=?1, done_at=datetime('now') WHERE id=?2",
  814|      0|                    rusqlite::params![err_msg, queue_id],
  815|      0|                );
  816|      0|                let current_index = completed + failed + skipped;
  817|      0|                failed += 1;
  818|      0|                emit_json(&FileEvent {
  819|      0|                    file: &file_path,
  820|      0|                    name: "",
  821|      0|                    status: "failed",
  822|      0|                    memory_id: None,
  823|      0|                    entities: None,
  824|      0|                    rels: None,
  825|      0|                    cost_usd: None,
  826|      0|                    input_tokens: None,
  827|      0|                    output_tokens: None,
  828|      0|                    elapsed_ms: Some(file_started.elapsed().as_millis() as u64),
  829|      0|                    error: Some(&err_msg),
  830|      0|                    index: current_index,
  831|      0|                    total,
  832|      0|                });
  833|      0|                if args.fail_fast {
  834|      0|                    break;
  835|      0|                }
  836|      0|                continue;
  837|       |            }
  838|       |        };
  839|       |
  840|       |        // Skip files exceeding body cap BEFORE sending to LLM to avoid wasting tokens
  841|      0|        if file_content.len() > crate::constants::MAX_MEMORY_BODY_LEN {
  842|      0|            let err_msg = format!(
  843|      0|                "file body exceeds {} byte limit ({} bytes) — skipping to avoid wasting LLM tokens",
  844|       |                crate::constants::MAX_MEMORY_BODY_LEN,
  845|      0|                file_content.len()
  846|       |            );
  847|      0|            tracing::warn!(target: "ingest", file = %file_path, size = file_content.len(), "body exceeds limit, skipping LLM extraction");
  848|      0|            let _ = queue_conn.execute(
  849|      0|                "UPDATE queue SET status='skipped', error=?1, done_at=datetime('now') WHERE id=?2",
  850|      0|                rusqlite::params![err_msg, queue_id],
  851|      0|            );
  852|      0|            let current_index = completed + failed + skipped;
  853|      0|            skipped += 1;
  854|      0|            emit_json(&FileEvent {
  855|      0|                file: &file_path,
  856|      0|                name: "",
  857|      0|                status: "skipped",
  858|      0|                memory_id: None,
  859|      0|                entities: None,
  860|      0|                rels: None,
  861|      0|                cost_usd: None,
  862|      0|                input_tokens: None,
  863|      0|                output_tokens: None,
  864|      0|                elapsed_ms: Some(file_started.elapsed().as_millis() as u64),
  865|      0|                error: Some(&err_msg),
  866|      0|                index: current_index,
  867|      0|                total,
  868|      0|            });
  869|      0|            continue;
  870|      0|        }
  871|       |
  872|       |        // Retry once on cold-start failure
  873|      0|        let max_extract_attempts: u32 = 2;
  874|      0|        let mut extraction_result: Option<(ExtractionResult, Option<CodexUsage>)> = None;
  875|      0|        let mut last_extract_err: Option<String> = None;
  876|      0|        let mut last_was_rate_limited = false;
  877|       |
  878|      0|        for attempt in 1..=max_extract_attempts {
  879|      0|            match extract_with_codex(
  880|      0|                &codex_binary,
  881|      0|                &file_content,
  882|      0|                args.codex_model.as_deref(),
  883|      0|                args.codex_timeout,
  884|      0|                &schema_path,
  885|       |            ) {
  886|      0|                Ok(result) => {
  887|      0|                    extraction_result = Some(result);
  888|      0|                    break;
  889|       |                }
  890|      0|                Err(ref e) if matches!(e, AppError::RateLimited { .. }) => {
  891|      0|                    last_extract_err = Some(format!("{e}"));
  892|      0|                    last_was_rate_limited = true;
  893|      0|                    break;
  894|       |                }
  895|      0|                Err(e) => {
  896|      0|                    let msg = format!("{e}");
  897|      0|                    if attempt < max_extract_attempts {
  898|      0|                        let cold_start_delay = 2 * attempt as u64;
  899|      0|                        tracing::warn!(
  900|       |                            target: "ingest",
  901|       |                            attempt,
  902|       |                            delay_secs = cold_start_delay,
  903|       |                            error = %msg,
  904|      0|                            "codex extraction failed, retrying"
  905|       |                        );
  906|      0|                        std::thread::sleep(std::time::Duration::from_secs(cold_start_delay));
  907|      0|                    }
  908|      0|                    last_extract_err = Some(msg);
  909|       |                }
  910|       |            }
  911|       |        }
  912|       |
  913|      0|        if let Some((extraction, usage)) = extraction_result {
  914|      0|            backoff_secs = args.rate_limit_wait;
  915|       |
  916|      0|            let in_tok = usage.as_ref().map(|u| u.input_tokens).unwrap_or(0);
  917|      0|            let out_tok = usage.as_ref().map(|u| u.output_tokens).unwrap_or(0);
  918|       |
  919|      0|            let name = &extraction.name;
  920|      0|            let ent_count = extraction.entities.len();
  921|      0|            let rel_count = extraction.relationships.len();
  922|       |
  923|      0|            let new_entities: Vec<NewEntity> = extraction
  924|      0|                .entities
  925|      0|                .iter()
  926|      0|                .filter_map(|e| match e.entity_type.parse::<EntityType>() {
  927|      0|                    Ok(et) => Some(NewEntity {
  928|      0|                        name: e.name.clone(),
  929|      0|                        entity_type: et,
  930|      0|                        description: None,
  931|      0|                    }),
  932|       |                    Err(_) => {
  933|      0|                        tracing::warn!(
  934|       |                            target: "ingest",
  935|       |                            entity = %e.name,
  936|       |                            entity_type = %e.entity_type,
  937|      0|                            "entity type not recognized, skipping"
  938|       |                        );
  939|      0|                        None
  940|       |                    }
  941|      0|                })
  942|      0|                .collect();
  943|       |
  944|      0|            let new_relationships: Vec<NewRelationship> = extraction
  945|      0|                .relationships
  946|      0|                .iter()
  947|      0|                .map(|r| NewRelationship {
  948|      0|                    source: r.source.clone(),
  949|      0|                    target: r.target.clone(),
  950|      0|                    relation: crate::parsers::normalize_relation(&r.relation),
  951|      0|                    strength: r.strength,
  952|      0|                    description: None,
  953|      0|                })
  954|      0|                .collect();
  955|       |
  956|      0|            let body_str = String::from_utf8_lossy(&file_content);
  957|      0|            let body_hash = blake3::hash(body_str.as_bytes()).to_hex().to_string();
  958|      0|            let new_memory = NewMemory {
  959|      0|                name: name.clone(),
  960|      0|                namespace: namespace.clone(),
  961|      0|                memory_type: memory_type_str.clone(),
  962|      0|                description: extraction.description.clone(),
  963|      0|                body: body_str.to_string(),
  964|      0|                body_hash,
  965|      0|                session_id: None,
  966|      0|                source: "agent".to_string(),
  967|      0|                metadata: serde_json::Value::Object(serde_json::Map::new()),
  968|      0|            };
  969|       |
  970|       |            // Deduplication: update existing memory instead of failing on UNIQUE
  971|      0|            let memory_id = match memories::find_by_name_any_state(&conn, &namespace, name)? {
  972|      0|                Some((existing_id, is_deleted)) => {
  973|      0|                    if is_deleted {
  974|      0|                        memories::clear_deleted_at(&conn, existing_id)?;
  975|      0|                    }
  976|      0|                    let (old_name, old_desc, old_body): (String, String, String) = conn.query_row(
  977|      0|                        "SELECT name, COALESCE(description,''), COALESCE(body,'') FROM memories WHERE id=?1",
  978|      0|                        rusqlite::params![existing_id],
  979|      0|                        |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?)),
  980|      0|                    )?;
  981|      0|                    memories::update(&conn, existing_id, &new_memory, None)?;
  982|      0|                    memories::sync_fts_after_update(
  983|      0|                        &conn,
  984|      0|                        existing_id,
  985|      0|                        &old_name,
  986|      0|                        &old_desc,
  987|      0|                        &old_body,
  988|      0|                        &new_memory.name,
  989|      0|                        &new_memory.description,
  990|      0|                        &new_memory.body,
  991|      0|                    )?;
  992|      0|                    tracing::info!(target: "ingest", name, memory_id = existing_id, "updated existing memory (force-merge)");
  993|      0|                    existing_id
  994|       |                }
  995|      0|                None => match memories::insert(&conn, &new_memory) {
  996|      0|                    Ok(id) => id,
  997|      0|                    Err(e) => {
  998|      0|                        let err_msg = format!("{e}");
  999|      0|                        let _ = queue_conn.execute(
 1000|      0|                            "UPDATE queue SET status='failed', error=?1, done_at=datetime('now') WHERE id=?2",
 1001|      0|                            rusqlite::params![err_msg, queue_id],
 1002|      0|                        );
 1003|      0|                        let current_index = completed + failed + skipped;
 1004|      0|                        failed += 1;
 1005|      0|                        emit_json(&FileEvent {
 1006|      0|                            file: &file_path,
 1007|      0|                            name,
 1008|      0|                            status: "failed",
 1009|      0|                            memory_id: None,
 1010|      0|                            entities: None,
 1011|      0|                            rels: None,
 1012|      0|                            cost_usd: None,
 1013|      0|                            input_tokens: Some(in_tok),
 1014|      0|                            output_tokens: Some(out_tok),
 1015|      0|                            elapsed_ms: Some(file_started.elapsed().as_millis() as u64),
 1016|      0|                            error: Some(&err_msg),
 1017|      0|                            index: current_index,
 1018|      0|                            total,
 1019|      0|                        });
 1020|      0|                        input_tokens_total += in_tok;
 1021|      0|                        output_tokens_total += out_tok;
 1022|      0|                        if args.fail_fast {
 1023|      0|                            break;
 1024|      0|                        }
 1025|      0|                        continue;
 1026|       |                    }
 1027|       |                },
 1028|       |            };
 1029|       |
 1030|      0|            for ent in &new_entities {
 1031|      0|                if let Ok(eid) = entities::upsert_entity(&conn, &namespace, ent) {
 1032|      0|                    let _ = entities::link_memory_entity(&conn, memory_id, eid);
 1033|      0|                }
 1034|       |            }
 1035|      0|            for rel in &new_relationships {
 1036|      0|                crate::parsers::warn_if_non_canonical(&rel.relation);
 1037|      0|                let src_id = entities::find_entity_id(&conn, &namespace, &rel.source);
 1038|      0|                let tgt_id = entities::find_entity_id(&conn, &namespace, &rel.target);
 1039|      0|                if let (Ok(Some(sid)), Ok(Some(tid))) = (src_id, tgt_id) {
 1040|      0|                    let _ = conn.execute(
 1041|      0|                        "INSERT OR IGNORE INTO relationships (namespace, source_id, target_id, relation, weight) VALUES (?1, ?2, ?3, ?4, ?5)",
 1042|      0|                        rusqlite::params![namespace, sid, tid, rel.relation, rel.strength],
 1043|      0|                    );
 1044|      0|                }
 1045|       |            }
 1046|       |
 1047|      0|            let _ = queue_conn.execute(
 1048|      0|                "UPDATE queue SET status='done', name=?1, memory_id=?2, entities=?3, rels=?4, \
 1049|      0|                 input_tokens=?5, output_tokens=?6, elapsed_ms=?7, done_at=datetime('now') WHERE id=?8",
 1050|      0|                rusqlite::params![
 1051|      0|                    name,
 1052|      0|                    memory_id,
 1053|      0|                    ent_count,
 1054|      0|                    rel_count,
 1055|      0|                    in_tok,
 1056|      0|                    out_tok,
 1057|      0|                    file_started.elapsed().as_millis() as i64,
 1058|      0|                    queue_id
 1059|      0|                ],
 1060|      0|            );
 1061|       |
 1062|      0|            let current_index = completed + failed + skipped;
 1063|      0|            completed += 1;
 1064|      0|            entities_total += ent_count;
 1065|      0|            rels_total += rel_count;
 1066|      0|            input_tokens_total += in_tok;
 1067|      0|            output_tokens_total += out_tok;
 1068|       |
 1069|      0|            emit_json(&FileEvent {
 1070|      0|                file: &file_path,
 1071|      0|                name,
 1072|      0|                status: "done",
 1073|      0|                memory_id: Some(memory_id),
 1074|      0|                entities: Some(ent_count),
 1075|      0|                rels: Some(rel_count),
 1076|      0|                cost_usd: None,
 1077|      0|                input_tokens: Some(in_tok),
 1078|      0|                output_tokens: Some(out_tok),
 1079|      0|                elapsed_ms: Some(file_started.elapsed().as_millis() as u64),
 1080|      0|                error: None,
 1081|      0|                index: current_index,
 1082|      0|                total,
 1083|      0|            });
 1084|      0|        } else if let Some(ref err_str) = last_extract_err {
 1085|      0|            if last_was_rate_limited {
 1086|      0|                if crate::retry::is_kill_switch_active() {
 1087|      0|                    tracing::warn!(target: "ingest", "SQLITE_GRAPHRAG_DISABLE_RETRY=1, skipping rate-limit retry");
 1088|      0|                } else if std::time::Instant::now() >= rate_limit_deadline {
 1089|      0|                    tracing::error!(target: "ingest", "rate-limit retry deadline (1h) exhausted");
 1090|       |                } else {
 1091|      0|                    let half = backoff_secs / 2;
 1092|      0|                    let jitter = if half == 0 { 0 } else { fastrand::u64(0..half) };
 1093|      0|                    let actual_wait = half + jitter;
 1094|      0|                    tracing::warn!(target: "ingest", delay_secs = actual_wait, error_kind = "rate_limited", "Codex rate limited, backing off");
 1095|      0|                    let _ = queue_conn.execute(
 1096|      0|                        "UPDATE queue SET status='pending' WHERE id=?1",
 1097|      0|                        rusqlite::params![queue_id],
 1098|      0|                    );
 1099|      0|                    std::thread::sleep(std::time::Duration::from_secs(actual_wait));
 1100|      0|                    backoff_secs = (backoff_secs * 2).min(900);
 1101|      0|                    continue;
 1102|       |                }
 1103|       |            } else {
 1104|      0|                let _ = queue_conn.execute(
 1105|      0|                    "UPDATE queue SET status='failed', error=?1, done_at=datetime('now') WHERE id=?2",
 1106|      0|                    rusqlite::params![err_str, queue_id],
 1107|      0|                );
 1108|      0|                let current_index = completed + failed + skipped;
 1109|      0|                failed += 1;
 1110|      0|                emit_json(&FileEvent {
 1111|      0|                    file: &file_path,
 1112|      0|                    name: "",
 1113|      0|                    status: "failed",
 1114|      0|                    memory_id: None,
 1115|      0|                    entities: None,
 1116|      0|                    rels: None,
 1117|      0|                    cost_usd: None,
 1118|      0|                    input_tokens: None,
 1119|      0|                    output_tokens: None,
 1120|      0|                    elapsed_ms: Some(file_started.elapsed().as_millis() as u64),
 1121|      0|                    error: Some(err_str),
 1122|      0|                    index: current_index,
 1123|      0|                    total,
 1124|      0|                });
 1125|      0|                if args.fail_fast {
 1126|      0|                    break;
 1127|      0|                }
 1128|       |            }
 1129|      0|        }
 1130|       |    }
 1131|       |
 1132|       |    // WAL checkpoint before summary
 1133|      0|    let _ = conn.execute_batch("PRAGMA wal_checkpoint(PASSIVE);");
 1134|       |
 1135|       |    // Stage 4: Summary
 1136|      0|    emit_json(&Summary {
 1137|      0|        summary: true,
 1138|      0|        files_total: total,
 1139|      0|        completed,
 1140|      0|        failed,
 1141|      0|        skipped,
 1142|      0|        entities_total,
 1143|      0|        rels_total,
 1144|      0|        input_tokens_total,
 1145|      0|        output_tokens_total,
 1146|      0|        elapsed_ms: started.elapsed().as_millis() as u64,
 1147|      0|    });
 1148|       |
 1149|      0|    if !args.keep_queue && failed == 0 {
 1150|      0|        let _ = std::fs::remove_file(&args.queue_db);
 1151|      0|    }
 1152|       |
 1153|      0|    Ok(())
 1154|      0|}
 1155|       |
              // Unit tests for `parse_codex_output` and the `EXTRACTION_SCHEMA_CODEX` constant.
              // Inputs are in-memory JSONL strings assembled from the helper functions below.
 1156|       |#[cfg(test)]
 1157|       |mod tests {
 1158|       |    use super::*;
 1159|       |
                  /// Builds one JSONL line of type `item.completed` whose item is an `agent_message`.
                  /// `text` is embedded as a JSON string, escaped by `serde_json::to_string`.
 1160|      4|    fn make_agent_message_event(text: &str) -> String {
 1161|      4|        format!(
 1162|      4|            r#"{{"type":"item.completed","item":{{"id":"item_0","type":"agent_message","text":{}}}}}"#,
 1163|      4|            serde_json::to_string(text).unwrap()
 1164|       |        )
 1165|      4|    }
 1166|       |
                  /// Builds one JSONL line of type `turn.completed` carrying the given token usage.
 1167|      3|    fn make_usage_event(input: u64, output: u64) -> String {
 1168|      3|        format!(
 1169|      3|            r#"{{"type":"turn.completed","usage":{{"input_tokens":{input},"output_tokens":{output}}}}}"#
 1170|       |        )
 1171|      3|    }
 1172|       |
                  /// Fixture: an extraction payload named `test-module` with exactly one entity and
                  /// one relationship.
 1173|      2|    fn valid_extraction_json() -> String {
 1174|      2|        r#"{"name":"test-module","description":"A test module for unit testing purposes","entities":[{"name":"test-entity","entity_type":"concept"}],"relationships":[{"source":"test-entity","target":"test-module","relation":"applies-to","strength":0.8}]}"#.to_string()
 1175|      2|    }
 1176|       |
                  /// A stream of `thread.started`, one agent message and one usage event must parse
                  /// into the fixture's name, entity/relationship counts and the 100/50 token usage.
 1177|       |    #[test]
 1178|      1|    fn test_parse_codex_output_valid() {
 1179|      1|        let jsonl = format!(
 1180|      1|            "{}\n{}\n{}",
 1181|       |            r#"{"type":"thread.started","thread_id":"t1"}"#,
 1182|      1|            make_agent_message_event(&valid_extraction_json()),
 1183|      1|            make_usage_event(100, 50),
 1184|       |        );
 1185|       |
 1186|      1|        let (result, usage) = parse_codex_output(&jsonl).expect("parse must succeed");
 1187|      1|        assert_eq!(result.name, "test-module");
 1188|      1|        assert_eq!(result.entities.len(), 1);
 1189|      1|        assert_eq!(result.relationships.len(), 1);
 1190|      1|        let u = usage.expect("usage must be present");
 1191|      1|        assert_eq!(u.input_tokens, 100);
 1192|      1|        assert_eq!(u.output_tokens, 50);
 1193|      1|    }
 1194|       |
                  /// A `turn.failed` event must yield an error whose display text contains both
                  /// "turn failed" and the message carried by the event.
 1195|       |    #[test]
 1196|      1|    fn test_parse_codex_output_turn_failed() {
 1197|      1|        let jsonl = format!(
 1198|      1|            "{}\n{}",
 1199|       |            r#"{"type":"thread.started","thread_id":"t1"}"#,
 1200|       |            r#"{"type":"turn.failed","error":{"message":"model error occurred"}}"#,
 1201|       |        );
 1202|       |
 1203|      1|        let err = parse_codex_output(&jsonl).unwrap_err();
 1204|      1|        let msg = format!("{err}");
 1205|      1|        assert!(
 1206|      1|            msg.contains("turn failed"),
 1207|      0|            "expected 'turn failed' in: {msg}"
 1208|       |        );
 1209|      1|        assert!(msg.contains("model error occurred"));
 1210|      1|    }
 1211|       |
                  /// A `turn.failed` event whose message mentions a rate limit / HTTP 429 must map to
                  /// the `AppError::RateLimited` variant rather than a generic failure.
 1212|       |    #[test]
 1213|      1|    fn test_parse_codex_output_rate_limit() {
 1214|      1|        let jsonl = r#"{"type":"turn.failed","error":{"message":"rate_limit exceeded, 429 Too Many Requests"}}"#;
 1215|       |
 1216|      1|        let err = parse_codex_output(jsonl).unwrap_err();
 1217|      1|        assert!(
 1218|      1|            matches!(err, AppError::RateLimited { .. }),
                          ^0
 1219|      0|            "expected AppError::RateLimited, got: {err}"
 1220|       |        );
 1221|      1|    }
 1222|       |
                  /// A top-level `error` event reporting `invalid_json_schema` must surface as an
                  /// error whose display text mentions the schema problem.
 1223|       |    #[test]
 1224|      1|    fn test_parse_codex_output_schema_error() {
 1225|      1|        let jsonl = r#"{"type":"error","message":"invalid_json_schema: additional properties not allowed"}"#;
 1226|       |
 1227|      1|        let err = parse_codex_output(jsonl).unwrap_err();
 1228|      1|        let msg = format!("{err}");
 1229|      1|        assert!(
 1230|      1|            msg.contains("invalid_json_schema") || msg.contains("schema"),
                                                                 ^0
 1231|      0|            "expected schema error in: {msg}"
 1232|       |        );
 1233|      1|    }
 1234|       |
                  /// The embedded schema constant must itself be well-formed JSON.
 1235|       |    #[test]
 1236|      1|    fn test_extraction_schema_codex_valid_json() {
 1237|      1|        let _: serde_json::Value =
 1238|      1|            serde_json::from_str(EXTRACTION_SCHEMA_CODEX).expect("schema must be valid JSON");
 1239|      1|    }
 1240|       |
                  /// The schema must set `additionalProperties: false` at the root, on entity items
                  /// and on relationship items.
 1241|       |    #[test]
 1242|      1|    fn test_extraction_schema_codex_has_additional_properties_false() {
 1243|      1|        let schema: serde_json::Value =
 1244|      1|            serde_json::from_str(EXTRACTION_SCHEMA_CODEX).expect("schema must be valid JSON");
 1245|       |
 1246|       |        // Root level
 1247|      1|        assert_eq!(
 1248|      1|            schema["additionalProperties"].as_bool(),
 1249|       |            Some(false),
 1250|      0|            "root must have additionalProperties: false"
 1251|       |        );
 1252|       |
 1253|       |        // Entity items level
 1254|      1|        assert_eq!(
 1255|      1|            schema["properties"]["entities"]["items"]["additionalProperties"].as_bool(),
 1256|       |            Some(false),
 1257|      0|            "entity items must have additionalProperties: false"
 1258|       |        );
 1259|       |
 1260|       |        // Relationship items level
 1261|      1|        assert_eq!(
 1262|      1|            schema["properties"]["relationships"]["items"]["additionalProperties"].as_bool(),
 1263|       |            Some(false),
 1264|      0|            "relationship items must have additionalProperties: false"
 1265|       |        );
 1266|      1|    }
 1267|       |
                  /// When the stream holds two agent messages, the parsed result must come from the
                  /// later one.
 1268|       |    #[test]
 1269|      1|    fn test_parse_codex_output_last_agent_message_wins() {
 1270|       |        // Multiple agent_message items — last one should win
 1271|      1|        let first_text = r#"{"name":"first-result","description":"First result should be ignored","entities":[],"relationships":[]}"#;
 1272|      1|        let second_text = r#"{"name":"final-result","description":"Final result wins over earlier ones","entities":[{"name":"final-entity","entity_type":"concept"}],"relationships":[]}"#;
 1273|       |
 1274|      1|        let jsonl = format!(
 1275|      1|            "{}\n{}\n{}\n{}",
 1276|       |            r#"{"type":"thread.started","thread_id":"t1"}"#,
 1277|      1|            make_agent_message_event(first_text),
 1278|      1|            make_agent_message_event(second_text),
 1279|      1|            make_usage_event(200, 80),
 1280|       |        );
 1281|       |
 1282|      1|        let (result, _) = parse_codex_output(&jsonl).expect("parse must succeed");
 1283|      1|        assert_eq!(result.name, "final-result", "last agent_message should win");
                                                              ^0
 1284|      1|        assert_eq!(result.entities.len(), 1);
 1285|      1|    }
 1286|       |
                  /// Lines that are not valid JSON (plain text, a truncated object) must not prevent
                  /// the valid events around them from being parsed.
 1287|       |    #[test]
 1288|      1|    fn test_parse_codex_output_skips_malformed_lines() {
 1289|      1|        let jsonl = format!(
 1290|      1|            "not json at all\n{}\n{{broken\n{}",
 1291|      1|            make_agent_message_event(&valid_extraction_json()),
 1292|      1|            make_usage_event(10, 5),
 1293|       |        );
 1294|       |
 1295|       |        // Should succeed despite malformed lines
 1296|      1|        let (result, _) = parse_codex_output(&jsonl).expect("malformed lines must be skipped");
 1297|      1|        assert_eq!(result.name, "test-module");
 1298|      1|    }
 1299|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/init.rs:
    1|       |//! Handler for the `init` CLI subcommand.
    2|       |
    3|       |use crate::errors::AppError;
    4|       |use crate::output;
    5|       |use crate::paths::AppPaths;
    6|       |use crate::pragmas::{apply_init_pragmas, ensure_wal_mode};
    7|       |use crate::storage::connection::open_rw;
    8|       |use serde::Serialize;
    9|       |
   10|       |/// Embedding model choices exposed through `--model`.
   11|       |///
   12|       |/// Currently only `multilingual-e5-small` is supported. Additional variants
   13|       |/// will be added here as new models are integrated; the `value_enum` derive
   14|       |/// ensures the CLI rejects unknown strings at parse time rather than at runtime.
   15|       |#[derive(Copy, Clone, Debug, PartialEq, Eq, clap::ValueEnum)]
   16|       |pub enum EmbeddingModelChoice {
                  // The accepted CLI spelling is pinned explicitly by `#[value(name = ...)]` below,
                  // so it does not depend on how the variant identifier is written.
   17|       |    #[value(name = "multilingual-e5-small")]
   18|       |    MultilingualE5Small,
   19|       |}
   20|       |
              // Command-line arguments of the `init` subcommand, parsed by clap's derive API.
              // Plain `//` comments are used for the notes added here so that the generated
              // `--help` text (which clap builds from `///` doc comments) is left untouched.
              //
              // NOTE(review): the `run` handler in this file reads only `db` and `namespace`;
              // `model`, `force` and `json` are accepted by the parser but never consulted there.
   21|       |#[derive(clap::Args)]
   22|       |#[command(after_long_help = "EXAMPLES:\n  \
   23|       |    # Initialize a new database in the current directory\n  \
   24|       |    sqlite-graphrag init\n\n  \
   25|       |    # Initialize with a specific namespace\n  \
   26|       |    sqlite-graphrag init --namespace my-project\n\n  \
   27|       |    # Initialize at a custom database path\n  \
   28|       |    sqlite-graphrag init --db /path/to/graphrag.sqlite")]
   29|       |pub struct InitArgs {
   30|       |    /// Path to graphrag.sqlite. Defaults to `./graphrag.sqlite` in the current directory.
   31|       |    /// Resolution precedence (highest to lowest): `--db` flag > `SQLITE_GRAPHRAG_DB_PATH` env >
   32|       |    /// `SQLITE_GRAPHRAG_HOME` env (used as base directory) > cwd.
   33|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   34|       |    pub db: Option<String>,
   35|       |    /// Embedding model identifier. Currently only `multilingual-e5-small` is supported.
   36|       |    /// Reserved for future multi-model support; safe to omit.
   37|       |    #[arg(long, value_enum)]
   38|       |    pub model: Option<EmbeddingModelChoice>,
   39|       |    /// Force re-initialization, overwriting any existing schema metadata.
   40|       |    /// Use only when the schema is corrupted; loses configuration but preserves data.
   41|       |    #[arg(long)]
   42|       |    pub force: bool,
   43|       |    /// Initial namespace to resolve. Aligned with bilingual docs that mention `init --namespace`.
   44|       |    /// When provided, overrides `SQLITE_GRAPHRAG_NAMESPACE`; otherwise resolves via env or fallback `global`.
   45|       |    #[arg(long)]
   46|       |    pub namespace: Option<String>,
                  // Hidden from `--help`; per its own help string this flag is a no-op because JSON
                  // is always written to stdout.
   47|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   48|       |    pub json: bool,
   49|       |}
   50|       |
              /// JSON payload that `run` passes to `output::emit_json` after a successful `init`.
   51|       |#[derive(Serialize)]
   52|       |struct InitResponse {
                  /// Resolved database path, rendered with `display()`.
   53|       |    db_path: String,
   54|       |    /// Latest applied migration number from `refinery_schema_history`.
   55|       |    /// Emitted as a JSON number for cross-command consistency with `health` and `stats` (since v1.0.35).
   56|       |    schema_version: u32,
                  /// Embedding model name; `run` always fills in `multilingual-e5-small`.
   57|       |    model: String,
                  /// Length of the vector returned by the smoke-test embedding performed in `run`.
   58|       |    dim: usize,
   59|       |    /// Active namespace resolved during initialisation, aligned with the bilingual docs.
   60|       |    namespace: String,
                  /// Outcome marker; `run` sets it to `"ok"`.
   61|       |    status: String,
   62|       |    /// Total execution time in milliseconds from handler start to serialisation.
   63|       |    elapsed_ms: u64,
   64|       |}
   65|       |
              /// Handles the `init` subcommand: opens the database, runs migrations, records
              /// schema metadata, performs a smoke-test embedding and emits an `InitResponse` as JSON.
              ///
              /// # Errors
              /// Failures from path and namespace resolution, opening the connection, pragmas,
              /// SQL statements, the embedding call and JSON emission are propagated with `?`.
              /// A migration failure is wrapped in `AppError::Internal` with a "migration failed" prefix.
              //
              // NOTE(review): `args.model`, `args.force` and `args.json` are never read in this body.
   66|      0|pub fn run(args: InitArgs) -> Result<(), AppError> {
                  // Started first so `elapsed_ms` covers the whole handler.
   67|      0|    let start = std::time::Instant::now();
   68|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
   69|      0|    paths.ensure_dirs()?;
   70|       |
   71|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
   72|       |
   73|      0|    let mut conn = open_rw(&paths.db)?;
   74|       |
   75|      0|    apply_init_pragmas(&conn)?;
   76|       |
                  // Apply the crate's migrations on this connection; the underlying error is folded
                  // into the message text of an `AppError::Internal`.
   77|      0|    crate::migrations::runner()
   78|      0|        .run(&mut conn)
   79|      0|        .map_err(|e| AppError::Internal(anyhow::anyhow!("migration failed: {e}")))?;
   80|       |
                  // Stamp SQLite's `user_version` pragma with the crate constant SCHEMA_USER_VERSION.
   81|      0|    conn.execute_batch(&format!(
   82|      0|        "PRAGMA user_version = {};",
   83|      0|        crate::constants::SCHEMA_USER_VERSION
   84|      0|    ))?;
   85|       |
   86|       |    // Defensive re-assertion: refinery may revert journal_mode during migrations.
   87|      0|    ensure_wal_mode(&conn)?;
   88|       |
   89|      0|    let schema_version = latest_schema_version(&conn)?;
   90|       |
                  // Record key/value metadata in `schema_meta`. INSERT OR REPLACE replaces a row that
                  // conflicts on a uniqueness constraint (presumably on `key` — confirm in the schema),
                  // so running `init` again refreshes these values.
   91|      0|    conn.execute(
   92|      0|        "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('schema_version', ?1)",
   93|      0|        rusqlite::params![schema_version],
   94|      0|    )?;
                  // NOTE(review): the model name and the dimension ('384') are written as literals,
                  // while the JSON response below reports the measured `test_emb.len()`.
   95|      0|    conn.execute(
   96|      0|        "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('model', 'multilingual-e5-small')",
   97|      0|        [],
   98|      0|    )?;
   99|      0|    conn.execute(
  100|      0|        "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('dim', '384')",
  101|      0|        [],
  102|      0|    )?;
  103|      0|    conn.execute(
  104|      0|        "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('created_at', CAST(unixepoch() AS TEXT))",
  105|      0|        [],
  106|      0|    )?;
  107|      0|    conn.execute(
  108|      0|        "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('sqlite-graphrag_version', ?1)",
  109|      0|        rusqlite::params![crate::constants::SQLITE_GRAPHRAG_VERSION],
  110|      0|    )?;
  111|       |    // Persist the resolved namespace so downstream tools can inspect it without re-resolving.
  112|      0|    conn.execute(
  113|      0|        "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('namespace_initial', ?1)",
  114|      0|        rusqlite::params![namespace],
  115|      0|    )?;
  116|       |
  117|      0|    output::emit_progress_i18n(
  118|      0|        "Initializing embedding model (may download on first run)...",
  119|      0|        crate::i18n::validation::runtime_pt::initializing_embedding_model(),
  120|       |    );
  121|       |
                  // Smoke test: embed a fixed string. An error here aborts `init` after the metadata
                  // above has already been written; on success the vector length becomes `dim`.
  122|      0|    let test_emb = crate::daemon::embed_passage_or_local(&paths.models, "smoke test")?;
  123|       |
  124|      0|    output::emit_json(&InitResponse {
  125|      0|        db_path: paths.db.display().to_string(),
  126|      0|        schema_version,
  127|      0|        model: "multilingual-e5-small".to_string(),
  128|      0|        dim: test_emb.len(),
  129|      0|        namespace,
  130|      0|        status: "ok".to_string(),
  131|      0|        elapsed_ms: start.elapsed().as_millis() as u64,
  132|      0|    })?;
  133|       |
  134|      0|    Ok(())
  135|      0|}
  136|       |
              /// Returns the highest `version` stored in `refinery_schema_history`.
              ///
              /// Yields `0` when the table has no rows. A negative stored value is clamped to `0`
              /// before the `as u32` cast; that cast would truncate a value above `u32::MAX`.
              ///
              /// # Errors
              /// Any rusqlite error other than `QueryReturnedNoRows` is returned as
              /// `AppError::Database`.
  137|      2|fn latest_schema_version(conn: &rusqlite::Connection) -> Result<u32, AppError> {
  138|      2|    match conn.query_row(
  139|      2|        "SELECT version FROM refinery_schema_history ORDER BY version DESC LIMIT 1",
  140|      2|        [],
  141|      1|        |row| row.get::<_, i64>(0),
  142|       |    ) {
  143|      1|        Ok(version) => Ok(version.max(0) as u32),
                      // `query_row` reports an empty result set as an error; treat it as "no migrations".
  144|      1|        Err(rusqlite::Error::QueryReturnedNoRows) => Ok(0),
  145|      0|        Err(err) => Err(AppError::Database(err)),
  146|       |    }
  147|      2|}
  148|       |
              // Unit tests for `InitResponse` serialization and `latest_schema_version`.
              // Database tests use an in-memory SQLite connection with a hand-made history table.
  149|       |#[cfg(test)]
  150|       |mod tests {
  151|       |    use super::*;
  152|       |
                  /// Every `InitResponse` field must appear in the serialized JSON with its value
                  /// (`elapsed_ms` is only checked for being a number).
  153|       |    #[test]
  154|      1|    fn init_response_serializes_all_fields() {
  155|      1|        let resp = InitResponse {
  156|      1|            db_path: "/tmp/test.sqlite".to_string(),
  157|      1|            schema_version: 6,
  158|      1|            model: "multilingual-e5-small".to_string(),
  159|      1|            dim: 384,
  160|      1|            namespace: "global".to_string(),
  161|      1|            status: "ok".to_string(),
  162|      1|            elapsed_ms: 100,
  163|      1|        };
  164|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  165|      1|        assert_eq!(json["db_path"], "/tmp/test.sqlite");
  166|      1|        assert_eq!(json["schema_version"], 6);
  167|      1|        assert_eq!(json["model"], "multilingual-e5-small");
  168|      1|        assert_eq!(json["dim"], 384usize);
  169|      1|        assert_eq!(json["namespace"], "global");
  170|      1|        assert_eq!(json["status"], "ok");
  171|      1|        assert!(json["elapsed_ms"].is_number());
  172|      1|    }
  173|       |
                  /// With an existing but empty history table the function must return 0.
  174|       |    #[test]
  175|      1|    fn latest_schema_version_returns_zero_for_empty_db() {
  176|      1|        let conn = rusqlite::Connection::open_in_memory().expect("failed to open in-memory db");
  177|      1|        conn.execute_batch("CREATE TABLE refinery_schema_history (version INTEGER NOT NULL);")
  178|      1|            .expect("failed to create table");
  179|       |
  180|      1|        let version = latest_schema_version(&conn).expect("latest_schema_version failed");
  181|      1|        assert_eq!(version, 0u32, "empty db must return schema_version 0");
                                                ^0
  182|      1|    }
  183|       |
                  /// Rows are inserted out of order (1, 3, 2); the function must still return 3.
  184|       |    #[test]
  185|      1|    fn latest_schema_version_returns_max_version() {
  186|      1|        let conn = rusqlite::Connection::open_in_memory().expect("failed to open in-memory db");
  187|      1|        conn.execute_batch(
  188|      1|            "CREATE TABLE refinery_schema_history (version INTEGER NOT NULL);
  189|      1|             INSERT INTO refinery_schema_history VALUES (1);
  190|      1|             INSERT INTO refinery_schema_history VALUES (3);
  191|      1|             INSERT INTO refinery_schema_history VALUES (2);",
  192|       |        )
  193|      1|        .expect("failed to populate table");
  194|       |
  195|      1|        let version = latest_schema_version(&conn).expect("latest_schema_version failed");
  196|      1|        assert_eq!(version, 3u32, "must return the highest version present");
                                                ^0
  197|      1|    }
  198|       |
                  /// Pins `crate::constants::EMBEDDING_DIM` to 384. Despite its name, this test does
                  /// not construct an `InitResponse`; it only checks the constant.
  199|       |    #[test]
  200|      1|    fn init_response_dim_aligned_with_constant() {
  201|      1|        assert_eq!(
  202|       |            crate::constants::EMBEDDING_DIM,
  203|       |            384,
  204|      0|            "dim must be aligned with EMBEDDING_DIM=384"
  205|       |        );
  206|      1|    }
  207|       |
                  /// A non-default namespace must be serialized unchanged.
  208|       |    #[test]
  209|      1|    fn init_response_namespace_aligned_with_schema() {
  210|       |        // Verify the namespace field is serialized with the value it was given
                      // (serialize-only check; nothing is deserialized back).
  211|      1|        let resp = InitResponse {
  212|      1|            db_path: "/tmp/x.sqlite".to_string(),
  213|      1|            schema_version: 6,
  214|      1|            model: "multilingual-e5-small".to_string(),
  215|      1|            dim: 384,
  216|      1|            namespace: "my-project".to_string(),
  217|      1|            status: "ok".to_string(),
  218|      1|            elapsed_ms: 0,
  219|      1|        };
  220|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  221|      1|        assert_eq!(json["namespace"], "my-project");
  222|      1|    }
  223|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/link.rs:
    1|       |//! Handler for the `link` CLI subcommand.
    2|       |
    3|       |use crate::constants::DEFAULT_RELATION_WEIGHT;
    4|       |use crate::entity_type::EntityType;
    5|       |use crate::errors::AppError;
    6|       |use crate::i18n::{errors_msg, validation};
    7|       |use crate::output::{self, OutputFormat};
    8|       |use crate::paths::AppPaths;
    9|       |use crate::storage::connection::open_rw;
   10|       |use crate::storage::entities;
   11|       |use crate::storage::entities::NewEntity;
   12|       |use rusqlite::params;
   13|       |use serde::Serialize;
   14|       |
              // Command-line arguments of the `link` subcommand, parsed by clap's derive API.
              // Notes added here use plain `//` comments so the generated `--help` text (which
              // clap builds from `///` doc comments) stays exactly as it was.
   15|       |#[derive(clap::Args)]
   16|       |#[command(after_long_help = "EXAMPLES:\n  \
   17|       |    # Link two existing graph entities (extracted by GLiNER NER during `remember`)\n  \
   18|       |    sqlite-graphrag link --from oauth-flow --to refresh-tokens --relation related\n\n  \
   19|       |    # Auto-create entities that don't exist yet\n  \
   20|       |    sqlite-graphrag link --from concept-a --to concept-b --relation depends-on --create-missing\n\n  \
   21|       |    # Specify entity type for auto-created entities\n  \
   22|       |    sqlite-graphrag link --from alice --to acme-corp --relation related --create-missing --entity-type person\n\n  \
   23|       |    # Use a custom (non-canonical) relation type\n  \
   24|       |    sqlite-graphrag link --from module-a --to module-b --relation implements --create-missing\n\n  \
   25|       |    # If the entity does not exist and --create-missing is not set, the command fails with exit 4.\n  \
   26|       |    # To list current entity names:\n  \
   27|       |    sqlite-graphrag graph entities | jaq '.entities[].name'\n\n  \
   28|       |NOTE:\n  \
   29|       |    --from and --to expect ENTITY names (graph nodes), not memory names.\n  \
   30|       |    Memory names are managed via remember/read/edit/forget; entities are auto-extracted\n  \
   31|       |    by GLiNER NER from memory bodies or auto-created via --create-missing.")]
   32|       |pub struct LinkArgs {
   33|       |    /// Source ENTITY name (graph node, not memory). Entities are extracted by GLiNER NER during
   34|       |    /// `remember` or auto-created via `--create-missing`. Use `graph entities` to list
   35|       |    /// available entity names. Also accepts the alias `--name`.
   36|       |    #[arg(long, alias = "name")]
   37|       |    pub from: String,
   38|       |    /// Target ENTITY name (graph node, not memory). See `--from` for sourcing entity names.
   39|       |    #[arg(long)]
   40|       |    pub to: String,
   41|       |    /// Relation type between entities. Canonical values: applies-to, uses,
   42|       |    /// depends-on, causes, fixes, contradicts, supports, follows, related,
   43|       |    /// mentions, replaces, tracked-in. Any kebab-case or snake_case string
   44|       |    /// is also accepted as a custom relation.
   45|       |    #[arg(long, value_parser = crate::parsers::parse_relation, value_name = "RELATION")]
   46|       |    pub relation: String,
                  // Edge weight. The `run` handler falls back to DEFAULT_RELATION_WEIGHT when this is
                  // omitted and rejects values outside the inclusive range 0.0..=1.0.
   47|       |    #[arg(long)]
   48|       |    pub weight: Option<f64>,
                  // Optional namespace; `run` hands it to `crate::namespace::resolve_namespace`.
   49|       |    #[arg(long)]
   50|       |    pub namespace: Option<String>,
                  // Output format selector; defaults to `json`.
   51|       |    #[arg(long, value_enum, default_value = "json")]
   52|       |    pub format: OutputFormat,
                  // Hidden from `--help`; per its own help string this flag is a no-op.
   53|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   54|       |    pub json: bool,
                  // Database path override; also read from the SQLITE_GRAPHRAG_DB_PATH environment
                  // variable. `run` passes it to `AppPaths::resolve`.
   55|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   56|       |    pub db: Option<String>,
   57|       |    /// Auto-create entities when they do not exist. Created entities default to
   58|       |    /// type `concept` unless `--entity-type` specifies a different type.
   59|       |    #[arg(long, default_value_t = false)]
   60|       |    pub create_missing: bool,
   61|       |    /// Entity type assigned to auto-created entities (only effective with `--create-missing`).
   62|       |    #[arg(long, value_enum, default_value = "concept")]
   63|       |    pub entity_type: EntityType,
   64|       |    /// Reject non-canonical relation types with exit 1.
   65|       |    ///
   66|       |    /// When set, any relation not in the canonical list causes an immediate error.
   67|       |    /// Canonical values: applies-to, uses, depends-on, causes, fixes, contradicts,
   68|       |    /// supports, follows, related, mentions, replaces, tracked-in.
   69|       |    #[arg(
   70|       |        long,
   71|       |        default_value_t = false,
   72|       |        help = "Reject non-canonical relation types with exit 1"
   73|       |    )]
   74|       |    pub strict_relations: bool,
   75|       |    /// Emit a warning (but do not reject) when creating an edge would push either endpoint
   76|       |    /// entity above this degree. Default 50. Set 0 to disable the check.
   77|       |    #[arg(long, default_value_t = 50, value_name = "N")]
   78|       |    pub max_entity_degree: u32,
   79|       |}
   80|       |
              /// Serializable result of the `link` subcommand.
              ///
              /// The two `Vec` fields are left out of the serialized output when they are empty.
   81|       |#[derive(Serialize)]
   82|       |struct LinkResponse {
                  /// What the command did. NOTE(review): the set of possible values is decided where
                  /// this struct is built, which is not visible here — confirm against `run`.
   83|       |    action: String,
                  /// Source entity name.
   84|       |    from: String,
                  /// Target entity name.
   85|       |    to: String,
                  /// Relation type of the edge.
   86|       |    relation: String,
                  /// Weight of the edge.
   87|       |    weight: f64,
                  /// Namespace the link belongs to.
   88|       |    namespace: String,
   89|       |    /// Total execution time in milliseconds from handler start to serialisation.
   90|       |    elapsed_ms: u64,
   91|       |    /// Entity names that were auto-created by `--create-missing`.
   92|       |    #[serde(skip_serializing_if = "Vec::is_empty")]
   93|       |    created_entities: Vec<String>,
   94|       |    /// Non-fatal warnings (e.g. non-canonical relation type).
   95|       |    #[serde(skip_serializing_if = "Vec::is_empty")]
   96|       |    warnings: Vec<String>,
   97|       |}
   98|       |
   99|      0|pub fn run(args: LinkArgs) -> Result<(), AppError> {
  100|      0|    let inicio = std::time::Instant::now();
  101|      0|    tracing::debug!(target: "link", from = %args.from, to = %args.to, relation = %args.relation, "creating relationship");
  102|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
  103|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
  104|       |
  105|      0|    let norm_from = crate::parsers::normalize_entity_name(&args.from);
  106|      0|    let norm_to = crate::parsers::normalize_entity_name(&args.to);
  107|       |
  108|      0|    if norm_from == norm_to {
  109|      0|        return Err(AppError::Validation(validation::self_referential_link()));
  110|      0|    }
  111|       |
  112|      0|    let weight = args.weight.unwrap_or(DEFAULT_RELATION_WEIGHT);
  113|      0|    if !(0.0..=1.0).contains(&weight) {
  114|      0|        return Err(AppError::Validation(validation::invalid_link_weight(
  115|      0|            weight,
  116|      0|        )));
  117|      0|    }
  118|      0|    if weight >= 0.95 {
  119|      0|        tracing::warn!(target: "link",
  120|       |            weight = weight,
  121|      0|            "weight >= 0.95 compresses the scoring range; consider using a value below 0.95"
  122|       |        );
  123|      0|    }
  124|      0|    if weight <= 0.05 {
  125|      0|        tracing::warn!(target: "link",
  126|       |            weight = weight,
  127|      0|            "weight <= 0.05 may be too weak to influence traversal; consider using a value above 0.05"
  128|       |        );
  129|      0|    }
  130|       |
  131|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
  132|       |
  133|      0|    let mut warnings: Vec<String> = Vec::with_capacity(2);
  134|      0|    let is_canonical = crate::parsers::is_canonical_relation(&args.relation);
  135|      0|    if !is_canonical {
  136|      0|        if args.strict_relations {
  137|      0|            return Err(AppError::Validation(format!(
  138|      0|                "non-canonical relation '{}': use --strict-relations=false or choose from: {}",
  139|      0|                args.relation,
  140|      0|                crate::parsers::CANONICAL_RELATIONS.join(", ")
  141|      0|            )));
  142|      0|        }
  143|      0|        warnings.push(format!("non-canonical relation '{}'", args.relation));
  144|      0|        tracing::warn!(target: "link",
  145|       |            relation = %args.relation,
  146|      0|            "non-canonical relation accepted; consider using a well-known value"
  147|       |        );
  148|      0|    }
  149|      0|    let relation_str = &args.relation;
  150|       |
  151|      0|    let mut conn = open_rw(&paths.db)?;
  152|      0|    let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
  153|       |
  154|      0|    let mut created_entities: Vec<String> = Vec::with_capacity(2);
  155|       |
  156|      0|    if args.entity_type.as_str() == "memory" {
  157|      0|        tracing::warn!(target: "link",
  158|       |            entity_type = "memory",
  159|      0|            "entity_type 'memory' may conflict with memory table semantics; consider using 'concept' or another type"
  160|       |        );
  161|      0|    }
  162|       |
  163|      0|    let source_id = match entities::find_entity_id(&tx, &namespace, &norm_from)? {
  164|      0|        Some(id) => id,
  165|      0|        None if args.create_missing => {
  166|      0|            let new_entity = NewEntity {
  167|      0|                name: norm_from.clone(),
  168|      0|                entity_type: args.entity_type,
  169|      0|                description: None,
  170|      0|            };
  171|      0|            created_entities.push(norm_from.clone());
  172|      0|            entities::upsert_entity(&tx, &namespace, &new_entity)?
  173|       |        }
  174|       |        None => {
  175|      0|            return Err(AppError::NotFound(errors_msg::entity_not_found(
  176|      0|                &norm_from, &namespace,
  177|      0|            )));
  178|       |        }
  179|       |    };
  180|       |
  181|      0|    let target_id = match entities::find_entity_id(&tx, &namespace, &norm_to)? {
  182|      0|        Some(id) => id,
  183|      0|        None if args.create_missing => {
  184|      0|            let new_entity = NewEntity {
  185|      0|                name: norm_to.clone(),
  186|      0|                entity_type: args.entity_type,
  187|      0|                description: None,
  188|      0|            };
  189|      0|            created_entities.push(norm_to.clone());
  190|      0|            entities::upsert_entity(&tx, &namespace, &new_entity)?
  191|       |        }
  192|       |        None => {
  193|      0|            return Err(AppError::NotFound(errors_msg::entity_not_found(
  194|      0|                &norm_to, &namespace,
  195|      0|            )));
  196|       |        }
  197|       |    };
  198|       |
  199|      0|    let (rel_id, was_created) = entities::create_or_fetch_relationship(
  200|      0|        &tx,
  201|      0|        &namespace,
  202|      0|        source_id,
  203|      0|        target_id,
  204|      0|        relation_str,
  205|      0|        weight,
  206|      0|        None,
  207|      0|    )?;
  208|       |
  209|      0|    let actual_weight: f64 = tx.query_row(
  210|      0|        "SELECT weight FROM relationships WHERE id = ?1",
  211|      0|        params![rel_id],
  212|      0|        |r| r.get(0),
  213|      0|    )?;
  214|       |
  215|      0|    if was_created {
  216|      0|        entities::recalculate_degree(&tx, source_id)?;
  217|      0|        entities::recalculate_degree(&tx, target_id)?;
  218|       |
  219|      0|        if args.max_entity_degree > 0 {
  220|      0|            let cap = args.max_entity_degree as i64;
  221|      0|            for (entity_id, entity_name) in [(source_id, &norm_from), (target_id, &norm_to)] {
  222|      0|                let degree: i64 = tx.query_row(
  223|      0|                    "SELECT degree FROM entities WHERE id = ?1",
  224|      0|                    params![entity_id],
  225|      0|                    |r| r.get(0),
  226|      0|                )?;
  227|      0|                if degree > cap {
  228|      0|                    output::emit_progress(&format!(
  229|      0|                        "WARNING: entity '{entity_name}' degree {degree} exceeds cap {cap}"
  230|      0|                    ));
  231|      0|                }
  232|       |            }
  233|      0|        }
  234|      0|    }
  235|      0|    tx.commit()?;
  236|       |
  237|      0|    conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
  238|       |
  239|      0|    let action = if was_created {
  240|      0|        "created".to_string()
  241|       |    } else {
  242|      0|        "already_exists".to_string()
  243|       |    };
  244|       |
  245|      0|    let response = LinkResponse {
  246|      0|        action: action.clone(),
  247|      0|        from: norm_from.clone(),
  248|      0|        to: norm_to.clone(),
  249|      0|        relation: relation_str.to_string(),
  250|      0|        weight: actual_weight,
  251|      0|        namespace: namespace.clone(),
  252|      0|        elapsed_ms: inicio.elapsed().as_millis() as u64,
  253|      0|        created_entities,
  254|      0|        warnings,
  255|      0|    };
  256|       |
  257|      0|    match args.format {
  258|      0|        OutputFormat::Json => output::emit_json(&response)?,
  259|      0|        OutputFormat::Text | OutputFormat::Markdown => {
  260|      0|            output::emit_text(&format!(
  261|      0|                "{}: {} --[{}]--> {} [{}]",
  262|      0|                action, response.from, response.relation, response.to, response.namespace
  263|      0|            ));
  264|      0|        }
  265|       |    }
  266|       |
  267|      0|    Ok(())
  268|      0|}
  269|       |
  270|       |#[cfg(test)]
  271|       |mod tests {
  272|       |    use super::*;
  273|       |
  274|       |    #[test]
  275|      1|    fn link_response_without_redundant_aliases() {
  276|       |        // P1-O: source/target fields were removed from the JSON response.
  277|      1|        let resp = LinkResponse {
  278|      1|            action: "created".to_string(),
  279|      1|            from: "entity-a".to_string(),
  280|      1|            to: "entity-b".to_string(),
  281|      1|            relation: "uses".to_string(),
  282|      1|            weight: 1.0,
  283|      1|            namespace: "default".to_string(),
  284|      1|            elapsed_ms: 0,
  285|      1|            created_entities: vec![],
  286|      1|            warnings: vec![],
  287|      1|        };
  288|      1|        let json = serde_json::to_value(&resp).expect("serialization must work");
  289|      1|        assert_eq!(json["from"], "entity-a");
  290|      1|        assert_eq!(json["to"], "entity-b");
  291|      1|        assert!(
  292|      1|            json.get("source").is_none(),
  293|      0|            "field 'source' was removed in P1-O"
  294|       |        );
  295|      1|        assert!(
  296|      1|            json.get("target").is_none(),
  297|      0|            "field 'target' was removed in P1-O"
  298|       |        );
  299|      1|    }
  300|       |
  301|       |    #[test]
  302|      1|    fn link_response_serializes_all_fields() {
  303|      1|        let resp = LinkResponse {
  304|      1|            action: "already_exists".to_string(),
  305|      1|            from: "origin".to_string(),
  306|      1|            to: "destination".to_string(),
  307|      1|            relation: "mentions".to_string(),
  308|      1|            weight: 0.8,
  309|      1|            namespace: "test".to_string(),
  310|      1|            elapsed_ms: 5,
  311|      1|            created_entities: vec![],
  312|      1|            warnings: vec![],
  313|      1|        };
  314|      1|        let json = serde_json::to_value(&resp).expect("serialization must work");
  315|      1|        assert!(json.get("action").is_some());
  316|      1|        assert!(json.get("from").is_some());
  317|      1|        assert!(json.get("to").is_some());
  318|      1|        assert!(json.get("relation").is_some());
  319|      1|        assert!(json.get("weight").is_some());
  320|      1|        assert!(json.get("namespace").is_some());
  321|      1|        assert!(json.get("elapsed_ms").is_some());
  322|      1|    }
  323|       |
  324|       |    #[test]
  325|      1|    fn link_response_omits_created_entities_when_empty() {
  326|      1|        let resp = LinkResponse {
  327|      1|            action: "created".to_string(),
  328|      1|            from: "a".to_string(),
  329|      1|            to: "b".to_string(),
  330|      1|            relation: "uses".to_string(),
  331|      1|            weight: 1.0,
  332|      1|            namespace: "global".to_string(),
  333|      1|            elapsed_ms: 0,
  334|      1|            created_entities: vec![],
  335|      1|            warnings: vec![],
  336|      1|        };
  337|      1|        let json = serde_json::to_value(&resp).expect("serialization");
  338|      1|        assert!(
  339|      1|            json.get("created_entities").is_none(),
  340|      0|            "empty vec must be omitted"
  341|       |        );
  342|      1|    }
  343|       |
  344|       |    #[test]
  345|      1|    fn link_response_includes_created_entities_when_present() {
  346|      1|        let resp = LinkResponse {
  347|      1|            action: "created".to_string(),
  348|      1|            from: "new-a".to_string(),
  349|      1|            to: "new-b".to_string(),
  350|      1|            relation: "depends-on".to_string(),
  351|      1|            weight: 0.5,
  352|      1|            namespace: "test".to_string(),
  353|      1|            elapsed_ms: 1,
  354|      1|            created_entities: vec!["new-a".to_string(), "new-b".to_string()],
  355|      1|            warnings: vec![],
  356|      1|        };
  357|      1|        let json = serde_json::to_value(&resp).expect("serialization");
  358|      1|        let created = json["created_entities"].as_array().expect("must be array");
  359|      1|        assert_eq!(created.len(), 2);
  360|      1|        assert_eq!(created[0], "new-a");
  361|      1|        assert_eq!(created[1], "new-b");
  362|      1|    }
  363|       |
  364|       |    #[test]
  365|      1|    fn link_response_includes_warnings_when_non_canonical() {
  366|      1|        let resp = LinkResponse {
  367|      1|            action: "created".to_string(),
  368|      1|            from: "a".to_string(),
  369|      1|            to: "b".to_string(),
  370|      1|            relation: "implements".to_string(),
  371|      1|            weight: 0.5,
  372|      1|            namespace: "global".to_string(),
  373|      1|            elapsed_ms: 0,
  374|      1|            created_entities: vec![],
  375|      1|            warnings: vec!["non-canonical relation 'implements'".to_string()],
  376|      1|        };
  377|      1|        let json = serde_json::to_value(&resp).expect("serialization");
  378|      1|        let w = json["warnings"]
  379|      1|            .as_array()
  380|      1|            .expect("warnings must be present");
  381|      1|        assert_eq!(w.len(), 1);
  382|      1|        assert!(w[0].as_str().unwrap().contains("implements"));
  383|      1|    }
  384|       |
  385|       |    #[test]
  386|      1|    fn link_response_omits_warnings_when_empty() {
  387|      1|        let resp = LinkResponse {
  388|      1|            action: "created".to_string(),
  389|      1|            from: "a".to_string(),
  390|      1|            to: "b".to_string(),
  391|      1|            relation: "uses".to_string(),
  392|      1|            weight: 0.5,
  393|      1|            namespace: "global".to_string(),
  394|      1|            elapsed_ms: 0,
  395|      1|            created_entities: vec![],
  396|      1|            warnings: vec![],
  397|      1|        };
  398|      1|        let json = serde_json::to_value(&resp).expect("serialization");
  399|      1|        assert!(
  400|      1|            json.get("warnings").is_none(),
  401|      0|            "empty warnings must be omitted"
  402|       |        );
  403|      1|    }
  404|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/list.rs:
    1|       |//! Handler for the `list` CLI subcommand.
    2|       |
    3|       |use crate::cli::MemoryType;
    4|       |use crate::errors::AppError;
    5|       |use crate::output::{self, OutputFormat};
    6|       |use crate::paths::AppPaths;
    7|       |use crate::storage::connection::open_ro;
    8|       |use crate::storage::memories;
    9|       |use serde::Serialize;
   10|       |
   11|       |#[derive(clap::Args)]
   12|       |#[command(after_long_help = "EXAMPLES:\n  \
   13|       |    # List up to 50 memories from the global namespace (default)\n  \
   14|       |    sqlite-graphrag list\n\n  \
   15|       |    # Filter by memory type and namespace\n  \
   16|       |    sqlite-graphrag list --type project --namespace my-project\n\n  \
   17|       |    # Paginate with limit and offset\n  \
   18|       |    sqlite-graphrag list --limit 20 --offset 40\n\n  \
   19|       |    # Include soft-deleted memories\n  \
   20|       |    sqlite-graphrag list --include-deleted")]
   21|       |pub struct ListArgs {
   22|       |    #[arg(
   23|       |        long,
   24|       |        help = "Namespace (env: SQLITE_GRAPHRAG_NAMESPACE, default: global)"
   25|       |    )]
   26|       |    pub namespace: Option<String>,
   27|       |    /// Filter by memory.type. Note: distinct from graph entity_type
   28|       |    /// (project/tool/person/file/concept/incident/decision/memory/dashboard/issue_tracker/organization/location/date)
   29|       |    /// used in --entities-file.
   30|       |    #[arg(long, value_enum)]
   31|       |    pub r#type: Option<MemoryType>,
   32|       |    #[arg(
   33|       |        long,
   34|       |        help = "Maximum number of memories to return (default: 50 for text, all for JSON)"
   35|       |    )]
   36|       |    pub limit: Option<usize>,
   37|       |    /// Number of memories to skip before returning results.
   38|       |    #[arg(long, default_value = "0", help = "Number of memories to skip")]
   39|       |    pub offset: usize,
   40|       |    /// Output format: json (default), text, or markdown.
   41|       |    #[arg(long, value_enum, default_value = "json", help = "Output format")]
   42|       |    pub format: OutputFormat,
   43|       |    /// Include soft-deleted memories in the listing (deleted_at IS NOT NULL).
   44|       |    #[arg(long, default_value_t = false, help = "Include soft-deleted memories")]
   45|       |    pub include_deleted: bool,
   46|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   47|       |    pub json: bool,
   48|       |    /// Path to graphrag.sqlite (overrides SQLITE_GRAPHRAG_DB_PATH and default CWD).
   49|       |    #[arg(
   50|       |        long,
   51|       |        env = "SQLITE_GRAPHRAG_DB_PATH",
   52|       |        help = "Path to graphrag.sqlite"
   53|       |    )]
   54|       |    pub db: Option<String>,
   55|       |}
   56|       |
   57|       |#[derive(Serialize, Clone)]
   58|       |struct ListItem {
   59|       |    id: i64,
   60|       |    /// Semantic alias of `id` for the contract documented in SKILL.md.
   61|       |    memory_id: i64,
   62|       |    name: String,
   63|       |    namespace: String,
   64|       |    /// Semantic alias for agents that parse `.type` in the JSON output.
   65|       |    #[serde(rename = "type")]
   66|       |    type_field: String,
   67|       |    /// Semantic alias for agents that parse `.memory_type` in the JSON output.
   68|       |    memory_type: String,
   69|       |    description: String,
   70|       |    snippet: String,
   71|       |    updated_at: i64,
   72|       |    /// RFC 3339 UTC timestamp parallel to `updated_at`.
   73|       |    updated_at_iso: String,
   74|       |    /// Unix epoch when the memory was soft-deleted, or omitted for active memories.
   75|       |    /// Surfaced only in `list --include-deleted --json` so LLM consumers can
   76|       |    /// distinguish active rows from soft-deleted ones in a single query (v1.0.37 H7+M9).
   77|       |    #[serde(skip_serializing_if = "Option::is_none")]
   78|       |    deleted_at: Option<i64>,
   79|       |    /// RFC 3339 UTC mirror of `deleted_at`, omitted when `deleted_at` is None.
   80|       |    #[serde(skip_serializing_if = "Option::is_none")]
   81|       |    deleted_at_iso: Option<String>,
   82|       |    /// Byte length of the full memory body.
   83|       |    body_length: usize,
   84|       |}
   85|       |
   86|       |#[derive(Serialize)]
   87|       |struct ListResponse {
   88|       |    items: Vec<ListItem>,
   89|       |    memories: Vec<ListItem>,
   90|       |    /// Total number of matching memories in the namespace (ignoring limit/offset).
   91|       |    total_count: usize,
   92|       |    /// True when the returned item count is less than `total_count`, indicating
   93|       |    /// that more results exist beyond the applied limit.
   94|       |    truncated: bool,
   95|       |    /// Total execution time in milliseconds from handler start to serialisation.
   96|       |    elapsed_ms: u64,
   97|       |}
   98|       |
   99|      0|pub fn run(args: ListArgs) -> Result<(), AppError> {
  100|      0|    if args.limit == Some(0) {
  101|      0|        return Err(AppError::Validation(
  102|      0|            "--limit must be greater than zero".to_string(),
  103|      0|        ));
  104|      0|    }
  105|      0|    let inicio = std::time::Instant::now();
  106|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
  107|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
  108|       |    // v1.0.22 P1: standardizes exit code 4 with a friendly message when the DB does not exist.
  109|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
  110|      0|    let conn = open_ro(&paths.db)?;
  111|       |
  112|      0|    let effective_limit = args.limit.unwrap_or(match args.format {
  113|      0|        OutputFormat::Json => usize::MAX,
  114|      0|        _ => 50,
  115|       |    });
  116|       |
  117|      0|    let memory_type_str = args.r#type.map(|t| t.as_str());
  118|      0|    let rows = memories::list(
  119|      0|        &conn,
  120|      0|        &namespace,
  121|      0|        memory_type_str,
  122|      0|        effective_limit,
  123|      0|        args.offset,
  124|      0|        args.include_deleted,
  125|      0|    )?;
  126|       |
  127|      0|    let items: Vec<ListItem> = rows
  128|      0|        .into_iter()
  129|      0|        .map(|r| {
  130|      0|            let body_length = r.body.len();
  131|      0|            let snippet: String = r.body.chars().take(200).collect();
  132|      0|            let updated_at_iso = crate::tz::epoch_to_iso(r.updated_at);
  133|      0|            let deleted_at_iso = r.deleted_at.map(crate::tz::epoch_to_iso);
  134|      0|            ListItem {
  135|      0|                id: r.id,
  136|      0|                memory_id: r.id,
  137|      0|                name: r.name,
  138|      0|                namespace: r.namespace,
  139|      0|                type_field: r.memory_type.clone(),
  140|      0|                memory_type: r.memory_type,
  141|      0|                description: r.description,
  142|      0|                snippet,
  143|      0|                updated_at: r.updated_at,
  144|      0|                updated_at_iso,
  145|      0|                deleted_at: r.deleted_at,
  146|      0|                deleted_at_iso,
  147|      0|                body_length,
  148|      0|            }
  149|      0|        })
  150|      0|        .collect();
  151|       |
  152|      0|    let total_count = items.len();
  153|      0|    let truncated = args.limit.is_some_and(|lim| items.len() >= lim);
  154|       |
  155|      0|    match args.format {
  156|       |        OutputFormat::Json => {
  157|      0|            let memories = items.clone();
  158|      0|            output::emit_json(&ListResponse {
  159|      0|                total_count,
  160|      0|                truncated,
  161|      0|                memories,
  162|      0|                items,
  163|      0|                elapsed_ms: inicio.elapsed().as_millis() as u64,
  164|      0|            })?;
  165|       |        }
  166|       |        OutputFormat::Text | OutputFormat::Markdown => {
  167|      0|            for item in &items {
  168|      0|                output::emit_text(&format!("{}: {}", item.name, item.snippet));
  169|      0|            }
  170|       |        }
  171|       |    }
  172|      0|    Ok(())
  173|      0|}
  174|       |
  175|       |#[cfg(test)]
  176|       |mod tests {
  177|       |    use super::*;
  178|       |
  179|      2|    fn make_item(name: &str) -> ListItem {
  180|      2|        ListItem {
  181|      2|            id: 1,
  182|      2|            memory_id: 1,
  183|      2|            name: name.to_string(),
  184|      2|            namespace: "global".to_string(),
  185|      2|            type_field: "note".to_string(),
  186|      2|            memory_type: "note".to_string(),
  187|      2|            description: "desc".to_string(),
  188|      2|            snippet: "snip".to_string(),
  189|      2|            updated_at: 1_745_000_000,
  190|      2|            updated_at_iso: "2025-04-19T00:00:00Z".to_string(),
  191|      2|            deleted_at: None,
  192|      2|            deleted_at_iso: None,
  193|      2|            body_length: 4,
  194|      2|        }
  195|      2|    }
  196|       |
  197|       |    #[test]
  198|      1|    fn list_response_serializes_items_and_elapsed_ms() {
  199|      1|        let resp = ListResponse {
  200|      1|            items: vec![make_item("test-memory")],
  201|      1|            memories: vec![make_item("test-memory")],
  202|      1|            total_count: 1,
  203|      1|            truncated: false,
  204|      1|            elapsed_ms: 7,
  205|      1|        };
  206|      1|        let json = serde_json::to_value(&resp).unwrap();
  207|      1|        assert!(json["items"].is_array());
  208|      1|        assert_eq!(json["items"].as_array().unwrap().len(), 1);
  209|      1|        assert_eq!(json["items"][0]["name"], "test-memory");
  210|      1|        assert_eq!(json["items"][0]["memory_id"], 1);
  211|      1|        assert_eq!(json["elapsed_ms"], 7);
  212|       |        // deleted_at/deleted_at_iso must be omitted when None (skip_serializing_if)
  213|      1|        assert!(json["items"][0].get("deleted_at").is_none());
  214|      1|        assert!(json["items"][0].get("deleted_at_iso").is_none());
  215|      1|    }
  216|       |
  217|       |    #[test]
  218|      1|    fn list_item_with_deleted_at_serializes_both_fields() {
  219|      1|        let item = ListItem {
  220|      1|            id: 99,
  221|      1|            memory_id: 99,
  222|      1|            name: "soft-deleted-memory".to_string(),
  223|      1|            namespace: "global".to_string(),
  224|      1|            type_field: "note".to_string(),
  225|      1|            memory_type: "note".to_string(),
  226|      1|            description: "deleted".to_string(),
  227|      1|            snippet: "snip".to_string(),
  228|      1|            updated_at: 1_745_000_000,
  229|      1|            updated_at_iso: "2025-04-19T00:00:00Z".to_string(),
  230|      1|            deleted_at: Some(1_745_100_000),
  231|      1|            deleted_at_iso: Some("2025-04-20T03:46:40Z".to_string()),
  232|      1|            body_length: 4,
  233|      1|        };
  234|      1|        let json = serde_json::to_value(&item).unwrap();
  235|      1|        assert_eq!(json["deleted_at"], 1_745_100_000_i64);
  236|      1|        assert_eq!(json["deleted_at_iso"], "2025-04-20T03:46:40Z");
  237|      1|    }
  238|       |
  239|       |    #[test]
  240|      1|    fn list_response_items_empty_serializes_empty_array() {
  241|      1|        let resp = ListResponse {
  242|      1|            items: vec![],
  243|      1|            memories: vec![],
  244|      1|            total_count: 0,
  245|      1|            truncated: false,
  246|      1|            elapsed_ms: 0,
  247|      1|        };
  248|      1|        let json = serde_json::to_value(&resp).unwrap();
  249|      1|        assert!(json["items"].is_array());
  250|      1|        assert_eq!(json["items"].as_array().unwrap().len(), 0);
  251|      1|        assert_eq!(json["elapsed_ms"], 0);
  252|      1|    }
  253|       |
  254|       |    #[test]
  255|      1|    fn list_item_memory_id_equals_id() {
  256|      1|        let item = ListItem {
  257|      1|            id: 42,
  258|      1|            memory_id: 42,
  259|      1|            name: "memory-alias".to_string(),
  260|      1|            namespace: "projeto".to_string(),
  261|      1|            type_field: "fact".to_string(),
  262|      1|            memory_type: "fact".to_string(),
  263|      1|            description: "desc".to_string(),
  264|      1|            snippet: "snip".to_string(),
  265|      1|            updated_at: 0,
  266|      1|            updated_at_iso: "1970-01-01T00:00:00Z".to_string(),
  267|      1|            deleted_at: None,
  268|      1|            deleted_at_iso: None,
  269|      1|            body_length: 0,
  270|      1|        };
  271|      1|        let json = serde_json::to_value(&item).unwrap();
  272|      1|        assert_eq!(
  273|      1|            json["id"], json["memory_id"],
  274|      0|            "id e memory_id devem ser iguais"
  275|       |        );
  276|      1|    }
  277|       |
  278|       |    #[test]
  279|      1|    fn snippet_truncated_to_200_chars() {
  280|      1|        let body_longo: String = "a".repeat(300);
  281|      1|        let snippet: String = body_longo.chars().take(200).collect();
  282|      1|        assert_eq!(snippet.len(), 200, "snippet deve ter exatamente 200 chars");
                                                     ^0
  283|      1|    }
  284|       |
  285|       |    #[test]
  286|      1|    fn list_item_emits_both_type_and_memory_type() {
  287|      1|        let item = ListItem {
  288|      1|            id: 1,
  289|      1|            memory_id: 1,
  290|      1|            name: "test".to_string(),
  291|      1|            namespace: "global".to_string(),
  292|      1|            type_field: "note".to_string(),
  293|      1|            memory_type: "note".to_string(),
  294|      1|            description: "desc".to_string(),
  295|      1|            snippet: "snip".to_string(),
  296|      1|            updated_at: 0,
  297|      1|            updated_at_iso: "1970-01-01T00:00:00Z".to_string(),
  298|      1|            deleted_at: None,
  299|      1|            deleted_at_iso: None,
  300|      1|            body_length: 0,
  301|      1|        };
  302|      1|        let json = serde_json::to_value(&item).unwrap();
  303|      1|        assert_eq!(json["type"], "note", "serde rename must produce 'type'");
                                                       ^0
  304|      1|        assert_eq!(
  305|      1|            json["memory_type"], "note",
  306|      0|            "memory_type must also be present"
  307|       |        );
  308|      1|    }
  309|       |
  310|       |    #[test]
  311|      1|    fn updated_at_iso_epoch_zero_yields_valid_utc() {
  312|       |        // v1.0.68 (test fix): timezone-agnostic — parse the ISO and compare
  313|       |        // the instant with the Unix epoch.
  314|      1|        let iso = crate::tz::epoch_to_iso(0);
  315|      1|        let parsed = chrono::DateTime::parse_from_rfc3339(&iso)
  316|      1|            .unwrap_or_else(|e| panic!("expected RFC3339, got `{iso}`: {e}"));
                                                     ^0
  317|      1|        assert_eq!(
  318|      1|            parsed.timestamp(),
  319|      1|            chrono::DateTime::UNIX_EPOCH.timestamp(),
  320|      0|            "epoch 0 deve mapear para o instante Unix epoch, obtido: {iso}"
  321|       |        );
  322|      1|        assert!(
  323|      1|            iso.contains('+') || iso.contains('-'),
  324|      0|            "must contain offset sign, got: {iso}"
  325|       |        );
  326|      1|    }
  327|       |
  328|       |    #[test]
  329|      1|    fn body_length_reflects_byte_count() {
  330|      1|        let body = "hello world";
  331|      1|        let item = ListItem {
  332|      1|            id: 1,
  333|      1|            memory_id: 1,
  334|      1|            name: "test".to_string(),
  335|      1|            namespace: "global".to_string(),
  336|      1|            type_field: "note".to_string(),
  337|      1|            memory_type: "note".to_string(),
  338|      1|            description: "desc".to_string(),
  339|      1|            snippet: body.chars().take(200).collect(),
  340|      1|            updated_at: 0,
  341|      1|            updated_at_iso: "1970-01-01T00:00:00Z".to_string(),
  342|      1|            deleted_at: None,
  343|      1|            deleted_at_iso: None,
  344|      1|            body_length: body.len(),
  345|      1|        };
  346|      1|        let json = serde_json::to_value(&item).unwrap();
  347|      1|        assert_eq!(json["body_length"], body.len());
  348|      1|    }
  349|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/memory_entities.rs:
    1|       |//! Handler for the `memory-entities` CLI subcommand.
    2|       |
    3|       |use crate::errors::AppError;
    4|       |use crate::output;
    5|       |use crate::paths::AppPaths;
    6|       |use crate::storage::connection::open_ro;
    7|       |use rusqlite::params;
    8|       |use serde::Serialize;
    9|       |
   10|       |#[derive(clap::Args)]
   11|       |#[command(
   12|       |    about = "List entities linked to a memory, or memories linked to an entity",
   13|       |    after_long_help = "EXAMPLES:\n  \
   14|       |    # List entities connected to a memory\n  \
   15|       |    sqlite-graphrag memory-entities --name my-memory\n\n  \
   16|       |    # Reverse: list memories bound to an entity\n  \
   17|       |    sqlite-graphrag memory-entities --entity rust-lang\n\n  \
   18|       |    # With namespace\n  \
   19|       |    sqlite-graphrag memory-entities --name my-memory --namespace project"
   20|       |)]
   21|       |pub struct MemoryEntitiesArgs {
   22|       |    #[arg(value_name = "NAME", conflicts_with = "name", help = "Memory name")]
   23|       |    pub name_positional: Option<String>,
   24|       |    #[arg(long, conflicts_with_all = ["entity"])]
   25|       |    pub name: Option<String>,
   26|       |    /// Entity name — list memories bound to this entity (reverse lookup).
   27|       |    #[arg(long, conflicts_with_all = ["name", "name_positional"])]
   28|       |    pub entity: Option<String>,
   29|       |    #[arg(
   30|       |        long,
   31|       |        help = "Namespace (env: SQLITE_GRAPHRAG_NAMESPACE, default: global)"
   32|       |    )]
   33|       |    pub namespace: Option<String>,
   34|       |    #[arg(long, hide = true)]
   35|       |    pub json: bool,
   36|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   37|       |    pub db: Option<String>,
   38|       |}
   39|       |
   40|       |#[derive(Serialize)]
   41|       |struct EntityBinding {
   42|       |    entity_id: i64,
   43|       |    name: String,
   44|       |    entity_type: String,
   45|       |}
   46|       |
   47|       |#[derive(Serialize)]
   48|       |struct MemoryEntitiesResponse {
   49|       |    memory_name: String,
   50|       |    entities: Vec<EntityBinding>,
   51|       |    count: usize,
   52|       |    elapsed_ms: u64,
   53|       |}
   54|       |
   55|       |#[derive(Serialize)]
   56|       |struct MemoryBinding {
   57|       |    memory_id: i64,
   58|       |    name: String,
   59|       |    description: String,
   60|       |    memory_type: String,
   61|       |}
   62|       |
   63|       |#[derive(Serialize)]
   64|       |struct EntityMemoriesResponse {
   65|       |    entity_name: String,
   66|       |    memories: Vec<MemoryBinding>,
   67|       |    count: usize,
   68|       |    elapsed_ms: u64,
   69|       |}
   70|       |
   71|      0|pub fn run(args: MemoryEntitiesArgs) -> Result<(), AppError> {
   72|      0|    let start = std::time::Instant::now();
   73|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
   74|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
   75|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
   76|      0|    let conn = open_ro(&paths.db)?;
   77|       |
   78|      0|    if let Some(entity_name) = args.entity {
   79|      0|        let entity_id = crate::storage::entities::find_entity_id(&conn, &namespace, &entity_name)?
   80|      0|            .ok_or_else(|| {
   81|      0|                AppError::NotFound(crate::i18n::errors_msg::entity_not_found(
   82|      0|                    &entity_name,
   83|      0|                    &namespace,
   84|      0|                ))
   85|      0|            })?;
   86|       |
   87|      0|        let mut stmt = conn.prepare_cached(
   88|      0|            "SELECT m.id, m.name, m.description, m.type
   89|      0|             FROM memory_entities me
   90|      0|             JOIN memories m ON m.id = me.memory_id
   91|      0|             WHERE me.entity_id = ?1 AND m.deleted_at IS NULL
   92|      0|             ORDER BY m.name",
   93|      0|        )?;
   94|       |
   95|      0|        let memories: Vec<MemoryBinding> = stmt
   96|      0|            .query_map(params![entity_id], |r| {
   97|       |                Ok(MemoryBinding {
   98|      0|                    memory_id: r.get(0)?,
   99|      0|                    name: r.get(1)?,
  100|      0|                    description: r.get(2)?,
  101|      0|                    memory_type: r.get(3)?,
  102|       |                })
  103|      0|            })?
  104|      0|            .collect::<Result<Vec<_>, _>>()?;
  105|       |
  106|      0|        let count = memories.len();
  107|      0|        output::emit_json(&EntityMemoriesResponse {
  108|      0|            entity_name,
  109|      0|            memories,
  110|      0|            count,
  111|      0|            elapsed_ms: start.elapsed().as_millis() as u64,
  112|      0|        })?;
  113|      0|        return Ok(());
  114|      0|    }
  115|       |
  116|      0|    let name = args.name_positional.or(args.name).ok_or_else(|| {
  117|      0|        AppError::Validation(
  118|      0|            "name required: pass as positional argument, via --name, or use --entity for reverse lookup".to_string(),
  119|      0|        )
  120|      0|    })?;
  121|       |
  122|      0|    let memory_id: i64 = conn
  123|      0|        .query_row(
  124|      0|            "SELECT id FROM memories WHERE namespace = ?1 AND name = ?2 AND deleted_at IS NULL",
  125|      0|            params![namespace, name],
  126|      0|            |r| r.get(0),
  127|       |        )
  128|      0|        .map_err(|_| {
  129|      0|            AppError::NotFound(crate::i18n::errors_msg::memory_not_found(&name, &namespace))
  130|      0|        })?;
  131|       |
  132|      0|    let mut stmt = conn.prepare_cached(
  133|      0|        "SELECT e.id, e.name, e.type AS entity_type
  134|      0|         FROM memory_entities me
  135|      0|         JOIN entities e ON e.id = me.entity_id
  136|      0|         WHERE me.memory_id = ?1
  137|      0|         ORDER BY e.name",
  138|      0|    )?;
  139|       |
  140|      0|    let entities: Vec<EntityBinding> = stmt
  141|      0|        .query_map(params![memory_id], |r| {
  142|       |            Ok(EntityBinding {
  143|      0|                entity_id: r.get(0)?,
  144|      0|                name: r.get(1)?,
  145|      0|                entity_type: r.get(2)?,
  146|       |            })
  147|      0|        })?
  148|      0|        .collect::<Result<Vec<_>, _>>()?;
  149|       |
  150|      0|    let count = entities.len();
  151|       |
  152|      0|    output::emit_json(&MemoryEntitiesResponse {
  153|      0|        memory_name: name,
  154|      0|        entities,
  155|      0|        count,
  156|      0|        elapsed_ms: start.elapsed().as_millis() as u64,
  157|      0|    })?;
  158|       |
  159|      0|    Ok(())
  160|      0|}
  161|       |
  162|       |#[cfg(test)]
  163|       |mod tests {
  164|       |    use super::*;
  165|       |
  166|       |    #[test]
  167|      1|    fn response_serializes_correctly() {
  168|      1|        let resp = MemoryEntitiesResponse {
  169|      1|            memory_name: "test-mem".to_string(),
  170|      1|            entities: vec![EntityBinding {
  171|      1|                entity_id: 1,
  172|      1|                name: "rust".to_string(),
  173|      1|                entity_type: "concept".to_string(),
  174|      1|            }],
  175|      1|            count: 1,
  176|      1|            elapsed_ms: 5,
  177|      1|        };
  178|      1|        let json = serde_json::to_value(&resp).unwrap();
  179|      1|        assert_eq!(json["memory_name"], "test-mem");
  180|      1|        assert_eq!(json["count"], 1);
  181|      1|        assert_eq!(json["entities"][0]["name"], "rust");
  182|      1|    }
  183|       |
  184|       |    #[test]
  185|      1|    fn entity_memories_response_serializes_correctly() {
  186|      1|        let resp = EntityMemoriesResponse {
  187|      1|            entity_name: "rust-lang".to_string(),
  188|      1|            memories: vec![MemoryBinding {
  189|      1|                memory_id: 42,
  190|      1|                name: "design-auth".to_string(),
  191|      1|                description: "JWT auth design".to_string(),
  192|      1|                memory_type: "decision".to_string(),
  193|      1|            }],
  194|      1|            count: 1,
  195|      1|            elapsed_ms: 3,
  196|      1|        };
  197|      1|        let json = serde_json::to_value(&resp).unwrap();
  198|      1|        assert_eq!(json["entity_name"], "rust-lang");
  199|      1|        assert_eq!(json["count"], 1);
  200|      1|        assert_eq!(json["memories"][0]["name"], "design-auth");
  201|      1|        assert_eq!(json["memories"][0]["memory_type"], "decision");
  202|      1|        assert_eq!(json["memories"][0]["memory_id"], 42);
  203|      1|    }
  204|       |
  205|       |    #[test]
  206|      1|    fn entity_memories_response_empty_list() {
  207|      1|        let resp = EntityMemoriesResponse {
  208|      1|            entity_name: "orphan-entity".to_string(),
  209|      1|            memories: vec![],
  210|      1|            count: 0,
  211|      1|            elapsed_ms: 1,
  212|      1|        };
  213|      1|        let json = serde_json::to_value(&resp).unwrap();
  214|      1|        assert_eq!(json["entity_name"], "orphan-entity");
  215|      1|        assert_eq!(json["count"], 0);
  216|      1|        assert!(json["memories"].as_array().unwrap().is_empty());
  217|      1|    }
  218|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/merge_entities.rs:
    1|       |//! Handler for the `merge-entities` CLI subcommand (GAP-19).
    2|       |//!
    3|       |//! Merges two or more source entities into a single target entity by:
    4|       |//!   1. Retargeting all relationships pointing at any source to the target.
    5|       |//!   2. Deduplicating relationships that become identical after the merge
    6|       |//!      (same source_id + target_id + relation).
    7|       |//!   3. Retargeting memory_entities bindings.
    8|       |//!   4. Deleting the now-empty source entity rows.
    9|       |
   10|       |use crate::errors::AppError;
   11|       |use crate::i18n::errors_msg;
   12|       |use crate::output::{self, OutputFormat};
   13|       |use crate::paths::AppPaths;
   14|       |use crate::storage::connection::open_rw;
   15|       |use crate::storage::entities;
   16|       |use rusqlite::params;
   17|       |use serde::Serialize;
   18|       |
   19|       |#[derive(clap::Args)]
   20|       |#[command(after_long_help = "EXAMPLES:\n  \
   21|       |    # Merge two source entities into a target\n  \
   22|       |    sqlite-graphrag merge-entities --names auth,authentication --into auth-service\n\n  \
   23|       |    # Merge three sources into one target across a namespace\n  \
   24|       |    sqlite-graphrag merge-entities --names svc-a,svc-b,old-svc --into canonical-service --namespace my-project\n\n\
   25|       |NOTE:\n  \
   26|       |    --names is a comma-separated list of source entity names.\n  \
   27|       |    --into is the target entity name and must already exist.\n  \
   28|       |    Source entities are deleted after the merge; the target is preserved.\n  \
   29|       |    Duplicate relationships (same endpoints + relation) are removed automatically.\n  \
   30|       |    Run `sqlite-graphrag cleanup-orphans` afterwards if sources had no other links.")]
   31|       |pub struct MergeEntitiesArgs {
   32|       |    /// Comma-separated list of source entity names to merge into the target.
   33|       |    #[arg(long, value_delimiter = ',', value_name = "NAMES")]
   34|       |    pub names: Vec<String>,
   35|       |    /// Target entity name. Must already exist. All source relationships are redirected here.
   36|       |    #[arg(long, value_name = "TARGET")]
   37|       |    pub into: String,
   38|       |    #[arg(long)]
   39|       |    pub namespace: Option<String>,
   40|       |    #[arg(long, value_enum, default_value = "json")]
   41|       |    pub format: OutputFormat,
   42|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   43|       |    pub json: bool,
   44|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   45|       |    pub db: Option<String>,
   46|       |}
   47|       |
   48|       |#[derive(Serialize)]
   49|       |struct MergeEntitiesResponse {
   50|       |    action: String,
   51|       |    sources: Vec<String>,
   52|       |    target: String,
   53|       |    namespace: String,
   54|       |    relationships_moved: usize,
   55|       |    entities_removed: usize,
   56|       |    /// Total execution time in milliseconds from handler start to serialisation.
   57|       |    elapsed_ms: u64,
   58|       |}
   59|       |
   60|      0|pub fn run(args: MergeEntitiesArgs) -> Result<(), AppError> {
   61|      0|    let inicio = std::time::Instant::now();
   62|       |
   63|      0|    if args.names.is_empty() {
   64|      0|        return Err(AppError::Validation(
   65|      0|            "--names must contain at least one source entity name".to_string(),
   66|      0|        ));
   67|      0|    }
   68|       |
   69|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
   70|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
   71|       |
   72|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
   73|       |
   74|      0|    let mut conn = open_rw(&paths.db)?;
   75|       |
   76|       |    // Resolve target entity ID.
   77|      0|    let target_id = entities::find_entity_id(&conn, &namespace, &args.into)?
   78|      0|        .ok_or_else(|| AppError::NotFound(errors_msg::entity_not_found(&args.into, &namespace)))?;
   79|       |
   80|       |    // Resolve source entity IDs — reject self-referential merge (G21).
   81|      0|    let mut source_ids: Vec<i64> = Vec::with_capacity(args.names.len());
   82|      0|    for name in &args.names {
   83|      0|        if name == &args.into {
   84|      0|            return Err(AppError::Validation(format!(
   85|      0|                "source entity '{}' equals target '{}' — self-referential merge is not allowed",
   86|      0|                name, args.into
   87|      0|            )));
   88|      0|        }
   89|      0|        let id = entities::find_entity_id(&conn, &namespace, name)?
   90|      0|            .ok_or_else(|| AppError::NotFound(errors_msg::entity_not_found(name, &namespace)))?;
   91|      0|        if !source_ids.contains(&id) {
   92|      0|            source_ids.push(id);
   93|      0|        }
   94|       |    }
   95|       |
   96|      0|    if source_ids.is_empty() {
   97|      0|        return Err(AppError::Validation(
   98|      0|            "no valid source entities to merge (all names equal the target or were duplicates)"
   99|      0|                .to_string(),
  100|      0|        ));
  101|      0|    }
  102|       |
  103|      0|    let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
  104|       |
  105|      0|    let mut relationships_moved: usize = 0;
  106|       |
  107|      0|    for &src_id in &source_ids {
  108|       |        // Step 1a: redirect source_id, ignoring UNIQUE conflicts.
  109|      0|        let moved_src = tx.execute(
  110|      0|            "UPDATE OR IGNORE relationships SET source_id = ?1 WHERE source_id = ?2",
  111|      0|            params![target_id, src_id],
  112|      0|        )?;
  113|      0|        tx.execute(
  114|      0|            "DELETE FROM relationships WHERE source_id = ?1",
  115|      0|            params![src_id],
  116|      0|        )?;
  117|       |        // Step 1b: redirect target_id, ignoring UNIQUE conflicts.
  118|      0|        let moved_tgt = tx.execute(
  119|      0|            "UPDATE OR IGNORE relationships SET target_id = ?1 WHERE target_id = ?2",
  120|      0|            params![target_id, src_id],
  121|      0|        )?;
  122|      0|        tx.execute(
  123|      0|            "DELETE FROM relationships WHERE target_id = ?1",
  124|      0|            params![src_id],
  125|      0|        )?;
  126|      0|        relationships_moved += moved_src + moved_tgt;
  127|       |    }
  128|       |
  129|       |    // Step 2: remove self-loops introduced by the redirect (target → target).
  130|      0|    tx.execute("DELETE FROM relationships WHERE source_id = target_id", [])?;
  131|       |
  132|       |    // Step 3: deduplicate relationships that now share (source, target, relation).
  133|       |    // Safety net — UPDATE OR IGNORE should have handled most duplicates above.
  134|      0|    tx.execute(
  135|      0|        "DELETE FROM relationships
  136|      0|         WHERE id NOT IN (
  137|      0|             SELECT MIN(id)
  138|      0|             FROM relationships
  139|      0|             GROUP BY source_id, target_id, relation
  140|      0|         )",
  141|      0|        [],
  142|      0|    )?;
  143|       |
  144|       |    // Step 4: retarget memory_entities bindings.
  145|       |    // Use UPDATE OR IGNORE to skip conflicts when memory is already bound to
  146|       |    // target entity. Then DELETE remaining source rows (the conflicting ones
  147|       |    // that UPDATE OR IGNORE skipped). Same pattern as relationships (Step 1).
  148|      0|    for &src_id in &source_ids {
  149|      0|        tx.execute(
  150|      0|            "UPDATE OR IGNORE memory_entities SET entity_id = ?1 WHERE entity_id = ?2",
  151|      0|            params![target_id, src_id],
  152|      0|        )?;
  153|      0|        tx.execute(
  154|      0|            "DELETE FROM memory_entities WHERE entity_id = ?1",
  155|      0|            params![src_id],
  156|      0|        )?;
  157|       |    }
  158|       |
  159|       |    // Step 5: deduplicate memory_entities bindings (same memory + entity).
  160|      0|    tx.execute(
  161|      0|        "DELETE FROM memory_entities
  162|      0|         WHERE rowid NOT IN (
  163|      0|             SELECT MIN(rowid)
  164|      0|             FROM memory_entities
  165|      0|             GROUP BY memory_id, entity_id
  166|      0|         )",
  167|      0|        [],
  168|      0|    )?;
  169|       |
  170|       |    // Step 6: delete source entities (vec_entities first — no FK CASCADE on vec0).
  171|      0|    let mut entities_removed: usize = 0;
  172|      0|    for &src_id in &source_ids {
  173|      0|        let _ = tx.execute(
  174|      0|            "DELETE FROM vec_entities WHERE entity_id = ?1",
  175|      0|            params![src_id],
  176|      0|        );
  177|      0|        let removed = tx.execute("DELETE FROM entities WHERE id = ?1", params![src_id])?;
  178|      0|        entities_removed += removed;
  179|       |    }
  180|       |
  181|       |    // Step 7: recalculate degree for target and all adjacent entities.
  182|      0|    let adjacent_ids: Vec<i64> = {
  183|      0|        let mut stmt = tx.prepare(
  184|      0|            "SELECT DISTINCT CASE WHEN source_id = ?1 THEN target_id ELSE source_id END
  185|      0|             FROM relationships WHERE source_id = ?1 OR target_id = ?1",
  186|      0|        )?;
  187|      0|        let ids: Vec<i64> = stmt
  188|      0|            .query_map(params![target_id], |r| r.get(0))?
  189|      0|            .collect::<Result<Vec<_>, _>>()?;
  190|      0|        ids
  191|       |    };
  192|      0|    entities::recalculate_degree(&tx, target_id)?;
  193|      0|    for &adj_id in &adjacent_ids {
  194|      0|        entities::recalculate_degree(&tx, adj_id)?;
  195|       |    }
  196|       |
  197|      0|    tx.commit()?;
  198|       |
  199|      0|    conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
  200|       |
  201|       |    // Build the list of sources that were actually processed (excluding target duplicates).
  202|      0|    let processed_sources: Vec<String> = args
  203|      0|        .names
  204|      0|        .iter()
  205|      0|        .filter(|n| n.as_str() != args.into.as_str())
  206|      0|        .cloned()
  207|      0|        .collect();
  208|       |
  209|      0|    let response = MergeEntitiesResponse {
  210|      0|        action: "merged".to_string(),
  211|      0|        sources: processed_sources,
  212|      0|        target: args.into.clone(),
  213|      0|        namespace: namespace.clone(),
  214|      0|        relationships_moved,
  215|      0|        entities_removed,
  216|      0|        elapsed_ms: inicio.elapsed().as_millis() as u64,
  217|      0|    };
  218|       |
  219|      0|    match args.format {
  220|      0|        OutputFormat::Json => output::emit_json(&response)?,
  221|      0|        OutputFormat::Text | OutputFormat::Markdown => {
  222|      0|            output::emit_text(&format!(
  223|      0|                "merged: {} sources into '{}' (relationships_moved={}, entities_removed={}) [{}]",
  224|      0|                response.sources.len(),
  225|      0|                response.target,
  226|      0|                response.relationships_moved,
  227|      0|                response.entities_removed,
  228|      0|                response.namespace
  229|      0|            ));
  230|      0|        }
  231|       |    }
  232|       |
  233|      0|    Ok(())
  234|      0|}
  235|       |
  236|       |#[cfg(test)]
  237|       |mod tests {
  238|       |    use super::*;
  239|       |
  240|       |    #[test]
  241|      1|    fn merge_entities_response_serializes_all_fields() {
  242|      1|        let resp = MergeEntitiesResponse {
  243|      1|            action: "merged".to_string(),
  244|      1|            sources: vec!["auth".to_string(), "authentication".to_string()],
  245|      1|            target: "auth-service".to_string(),
  246|      1|            namespace: "global".to_string(),
  247|      1|            relationships_moved: 7,
  248|      1|            entities_removed: 2,
  249|      1|            elapsed_ms: 15,
  250|      1|        };
  251|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  252|      1|        assert_eq!(json["action"], "merged");
  253|      1|        assert_eq!(json["target"], "auth-service");
  254|      1|        assert_eq!(json["namespace"], "global");
  255|      1|        assert_eq!(json["relationships_moved"], 7);
  256|      1|        assert_eq!(json["entities_removed"], 2);
  257|      1|        let sources = json["sources"].as_array().expect("must be array");
  258|      1|        assert_eq!(sources.len(), 2);
  259|      1|        assert!(json["elapsed_ms"].is_number());
  260|      1|    }
  261|       |
  262|       |    #[test]
  263|      1|    fn merge_entities_response_action_is_merged() {
  264|      1|        let resp = MergeEntitiesResponse {
  265|      1|            action: "merged".to_string(),
  266|      1|            sources: vec!["src".to_string()],
  267|      1|            target: "tgt".to_string(),
  268|      1|            namespace: "ns".to_string(),
  269|      1|            relationships_moved: 0,
  270|      1|            entities_removed: 1,
  271|      1|            elapsed_ms: 0,
  272|      1|        };
  273|      1|        assert_eq!(resp.action, "merged");
  274|      1|    }
  275|       |
  276|       |    #[test]
  277|      1|    fn merge_entities_response_empty_sources_serializes() {
  278|      1|        let resp = MergeEntitiesResponse {
  279|      1|            action: "merged".to_string(),
  280|      1|            sources: vec![],
  281|      1|            target: "target".to_string(),
  282|      1|            namespace: "global".to_string(),
  283|      1|            relationships_moved: 0,
  284|      1|            entities_removed: 0,
  285|      1|            elapsed_ms: 1,
  286|      1|        };
  287|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  288|      1|        let sources = json["sources"].as_array().expect("must be array");
  289|      1|        assert_eq!(sources.len(), 0);
  290|      1|    }
  291|       |
  292|       |    #[test]
  293|      1|    fn merge_entities_response_with_zero_relationships_moved() {
  294|      1|        let resp = MergeEntitiesResponse {
  295|      1|            action: "merged".to_string(),
  296|      1|            sources: vec!["src-a".to_string()],
  297|      1|            target: "tgt".to_string(),
  298|      1|            namespace: "global".to_string(),
  299|      1|            relationships_moved: 0,
  300|      1|            entities_removed: 1,
  301|      1|            elapsed_ms: 5,
  302|      1|        };
  303|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  304|      1|        assert_eq!(json["relationships_moved"], 0);
  305|      1|        assert_eq!(json["entities_removed"], 1);
  306|      1|    }
  307|       |
  308|       |    #[test]
  309|      1|    fn merge_entities_response_multiple_sources() {
  310|      1|        let resp = MergeEntitiesResponse {
  311|      1|            action: "merged".to_string(),
  312|      1|            sources: vec!["a".into(), "b".into(), "c".into()],
  313|      1|            target: "canonical".to_string(),
  314|      1|            namespace: "proj".to_string(),
  315|      1|            relationships_moved: 12,
  316|      1|            entities_removed: 3,
  317|      1|            elapsed_ms: 42,
  318|      1|        };
  319|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  320|      1|        assert_eq!(json["entities_removed"], 3);
  321|      1|        let sources = json["sources"].as_array().unwrap();
  322|      1|        assert_eq!(sources.len(), 3);
  323|      1|    }
  324|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/migrate.rs:
    1|       |//! Handler for the `migrate` CLI subcommand.
    2|       |
    3|       |use crate::errors::AppError;
    4|       |use crate::output;
    5|       |use crate::paths::AppPaths;
    6|       |use crate::storage::connection::open_rw;
    7|       |use rusqlite::OptionalExtension;
    8|       |use serde::Serialize;
    9|       |
   10|       |#[derive(clap::Args)]
   11|       |#[command(after_long_help = "EXAMPLES:\n  \
   12|       |    # Apply pending schema migrations\n  \
   13|       |    sqlite-graphrag migrate\n\n  \
   14|       |    # Show already-applied migrations without applying new ones\n  \
   15|       |    sqlite-graphrag migrate --status\n\n  \
   16|       |    # Migrate a database at a custom path\n  \
   17|       |    sqlite-graphrag migrate --db /path/to/graphrag.sqlite")]
   18|       |pub struct MigrateArgs {
   19|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   20|       |    pub db: Option<String>,
   21|       |    /// Explicit JSON flag. Accepted as a no-op because output is already JSON by default.
   22|       |    #[arg(long, default_value_t = false)]
   23|       |    pub json: bool,
   24|       |    /// Show already applied migrations without applying new ones.
   25|       |    #[arg(long, default_value_t = false)]
   26|       |    pub status: bool,
   27|       |}
   28|       |
   29|       |#[derive(Serialize)]
   30|       |struct MigrateResponse {
   31|       |    db_path: String,
   32|       |    /// Latest applied migration number from `refinery_schema_history`.
   33|       |    /// Emitted as JSON number for cross-command consistency with `health`/`stats`/`init` (since v1.0.35).
   34|       |    schema_version: u32,
   35|       |    status: String,
   36|       |    /// Total execution time in milliseconds from handler start to serialisation.
   37|       |    elapsed_ms: u64,
   38|       |}
   39|       |
   40|       |#[derive(Serialize)]
   41|       |struct MigrateStatusResponse {
   42|       |    db_path: String,
   43|       |    applied_migrations: Vec<MigrationEntry>,
   44|       |    /// Latest applied migration number. JSON number since v1.0.35.
   45|       |    schema_version: u32,
   46|       |    elapsed_ms: u64,
   47|       |}
   48|       |
   49|       |#[derive(Serialize)]
   50|       |struct MigrationEntry {
   51|       |    version: i64,
   52|       |    name: String,
   53|       |    applied_on: Option<String>,
   54|       |}
   55|       |
   56|      0|pub fn run(args: MigrateArgs) -> Result<(), AppError> {
   57|      0|    let start = std::time::Instant::now();
   58|      0|    let _ = args.json; // --json is a no-op because output is already JSON by default
   59|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
   60|      0|    paths.ensure_dirs()?;
   61|       |
   62|      0|    let mut conn = open_rw(&paths.db)?;
   63|       |
   64|      0|    if args.status {
   65|      0|        let schema_version = latest_schema_version(&conn).unwrap_or(0);
   66|      0|        let applied = list_applied_migrations(&conn)?;
   67|      0|        output::emit_json(&MigrateStatusResponse {
   68|      0|            db_path: paths.db.display().to_string(),
   69|      0|            applied_migrations: applied,
   70|      0|            schema_version,
   71|      0|            elapsed_ms: start.elapsed().as_millis() as u64,
   72|      0|        })?;
   73|      0|        return Ok(());
   74|      0|    }
   75|       |
   76|      0|    crate::migrations::runner()
   77|      0|        .run(&mut conn)
   78|      0|        .map_err(|e| AppError::Internal(anyhow::anyhow!("migration failed: {e}")))?;
   79|       |
   80|      0|    conn.execute_batch(&format!(
   81|      0|        "PRAGMA user_version = {};",
   82|      0|        crate::constants::SCHEMA_USER_VERSION
   83|      0|    ))?;
   84|       |
   85|      0|    let schema_version = latest_schema_version(&conn)?;
   86|      0|    conn.execute(
   87|      0|        "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('schema_version', ?1)",
   88|      0|        rusqlite::params![schema_version],
   89|      0|    )?;
   90|       |
   91|      0|    output::emit_json(&MigrateResponse {
   92|      0|        db_path: paths.db.display().to_string(),
   93|      0|        schema_version,
   94|      0|        status: "ok".to_string(),
   95|      0|        elapsed_ms: start.elapsed().as_millis() as u64,
   96|      0|    })?;
   97|       |
   98|      0|    Ok(())
   99|      0|}
  100|       |
  101|      0|fn list_applied_migrations(conn: &rusqlite::Connection) -> Result<Vec<MigrationEntry>, AppError> {
  102|      0|    let table_exists: Option<String> = conn
  103|      0|        .query_row(
  104|      0|            "SELECT name FROM sqlite_master WHERE type='table' AND name='refinery_schema_history'",
  105|      0|            [],
  106|      0|            |r| r.get(0),
  107|       |        )
  108|      0|        .optional()?;
  109|      0|    if table_exists.is_none() {
  110|      0|        return Ok(vec![]);
  111|      0|    }
  112|      0|    let mut stmt = conn.prepare_cached(
  113|      0|        "SELECT version, name, applied_on FROM refinery_schema_history ORDER BY version ASC",
  114|      0|    )?;
  115|      0|    let entries = stmt
  116|      0|        .query_map([], |r| {
  117|       |            Ok(MigrationEntry {
  118|      0|                version: r.get(0)?,
  119|      0|                name: r.get(1)?,
  120|      0|                applied_on: r.get(2)?,
  121|       |            })
  122|      0|        })?
  123|      0|        .collect::<Result<Vec<_>, _>>()?;
  124|      0|    Ok(entries)
  125|      0|}
  126|       |
  127|      3|fn latest_schema_version(conn: &rusqlite::Connection) -> Result<u32, AppError> {
  128|      3|    match conn.query_row(
  129|      3|        "SELECT version FROM refinery_schema_history ORDER BY version DESC LIMIT 1",
  130|      3|        [],
  131|      1|        |row| row.get::<_, i64>(0),
  132|       |    ) {
  133|      1|        Ok(version) => Ok(version.max(0) as u32),
  134|      1|        Err(rusqlite::Error::QueryReturnedNoRows) => Ok(0),
  135|      1|        Err(err) => Err(AppError::Database(err)),
  136|       |    }
  137|      3|}
  138|       |
  139|       |#[cfg(test)]
  140|       |mod tests {
  141|       |    use super::*;
  142|       |    use rusqlite::Connection;
  143|       |
  144|      1|    fn create_db_without_history() -> Connection {
  145|      1|        Connection::open_in_memory().expect("failed to open in-memory db")
  146|      1|    }
  147|       |
  148|      1|    fn create_db_with_history(version: i64) -> Connection {
  149|      1|        let conn = Connection::open_in_memory().expect("failed to open in-memory db");
  150|      1|        conn.execute_batch(
  151|      1|            "CREATE TABLE refinery_schema_history (
  152|      1|                version INTEGER NOT NULL,
  153|      1|                name TEXT,
  154|      1|                applied_on TEXT,
  155|      1|                checksum TEXT
  156|      1|            );",
  157|       |        )
  158|      1|        .expect("failed to create history table");
  159|      1|        conn.execute(
  160|      1|            "INSERT INTO refinery_schema_history (version, name) VALUES (?1, 'V001__init')",
  161|      1|            rusqlite::params![version],
  162|       |        )
  163|      1|        .expect("failed to insert version");
  164|      1|        conn
  165|      1|    }
  166|       |
  167|       |    #[test]
  168|      1|    fn latest_schema_version_returns_error_without_table() {
  169|      1|        let conn = create_db_without_history();
  170|       |        // Without refinery_schema_history table, SQLite returns Unknown (code 1) -> AppError::Database
  171|      1|        let result = latest_schema_version(&conn);
  172|      1|        assert!(result.is_err(), "must return Err when table does not exist");
                                               ^0
  173|      1|    }
  174|       |
  175|       |    #[test]
  176|      1|    fn latest_schema_version_returns_max_version() {
  177|      1|        let conn = create_db_with_history(6);
  178|      1|        let version = latest_schema_version(&conn).unwrap();
  179|      1|        assert_eq!(version, 6u32);
  180|      1|    }
  181|       |
  182|       |    #[test]
  183|      1|    fn migrate_response_serializes_required_fields() {
  184|      1|        let resp = MigrateResponse {
  185|      1|            db_path: "/tmp/test.sqlite".to_string(),
  186|      1|            schema_version: 6,
  187|      1|            status: "ok".to_string(),
  188|      1|            elapsed_ms: 12,
  189|      1|        };
  190|      1|        let json = serde_json::to_value(&resp).unwrap();
  191|      1|        assert_eq!(json["status"], "ok");
  192|      1|        assert_eq!(json["schema_version"], 6);
  193|      1|        assert_eq!(json["db_path"], "/tmp/test.sqlite");
  194|      1|        assert_eq!(json["elapsed_ms"], 12);
  195|      1|    }
  196|       |
  197|       |    #[test]
  198|      1|    fn latest_schema_version_returns_zero_when_table_empty() {
  199|      1|        let conn = Connection::open_in_memory().expect("in-memory db");
  200|      1|        conn.execute_batch(
  201|      1|            "CREATE TABLE refinery_schema_history (
  202|      1|                version INTEGER NOT NULL,
  203|      1|                name TEXT
  204|      1|            );",
  205|       |        )
  206|      1|        .expect("table creation");
  207|       |        // Table exists but is empty -> QueryReturnedNoRows -> 0
  208|      1|        let version = latest_schema_version(&conn).unwrap();
  209|      1|        assert_eq!(version, 0u32);
  210|      1|    }
  211|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/namespace_detect.rs:
    1|       |//! Handler for the `namespace-detect` CLI subcommand.
    2|       |
    3|       |use crate::errors::AppError;
    4|       |use crate::namespace;
    5|       |use crate::output;
    6|       |use serde::Serialize;
    7|       |
    8|       |#[derive(clap::Args)]
    9|       |#[command(after_long_help = "EXAMPLES:\n  \
   10|       |    # Resolve namespace using current environment and cwd\n  \
   11|       |    sqlite-graphrag namespace-detect\n\n  \
   12|       |    # Override with an explicit namespace flag\n  \
   13|       |    sqlite-graphrag namespace-detect --namespace my-project\n\n  \
   14|       |    # Resolve via SQLITE_GRAPHRAG_NAMESPACE env var\n  \
   15|       |    SQLITE_GRAPHRAG_NAMESPACE=ci-runner sqlite-graphrag namespace-detect")]
   16|       |pub struct NamespaceDetectArgs {
   17|       |    #[arg(long)]
   18|       |    pub namespace: Option<String>,
   19|       |    /// Explicit database path. Accepted as a no-op to preserve the global contract.
   20|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   21|       |    pub db: Option<String>,
   22|       |    /// Explicit JSON flag. Accepted as a no-op because output is already JSON by default.
   23|       |    #[arg(long, default_value_t = false)]
   24|       |    pub json: bool,
   25|       |}
   26|       |
   27|       |#[derive(Serialize)]
   28|       |struct NamespaceDetectResponse {
   29|       |    namespace: String,
   30|       |    source: namespace::NamespaceSource,
   31|       |    cwd: String,
   32|       |    /// Total execution time in milliseconds from handler start to serialisation.
   33|       |    elapsed_ms: u64,
   34|       |}
   35|       |
   36|      0|pub fn run(args: NamespaceDetectArgs) -> Result<(), AppError> {
   37|      0|    let inicio = std::time::Instant::now();
   38|      0|    let _ = args.db;
   39|      0|    let _ = args.json; // --json is a no-op because output is already JSON by default
   40|      0|    let resolution = namespace::detect_namespace(args.namespace.as_deref())?;
   41|      0|    output::emit_json(&NamespaceDetectResponse {
   42|      0|        namespace: resolution.namespace,
   43|      0|        source: resolution.source,
   44|      0|        cwd: resolution.cwd,
   45|      0|        elapsed_ms: inicio.elapsed().as_millis() as u64,
   46|      0|    })?;
   47|      0|    Ok(())
   48|      0|}
   49|       |
   50|       |#[cfg(test)]
   51|       |mod tests {
   52|       |    use super::*;
   53|       |    use crate::namespace::NamespaceSource;
   54|       |    use clap::Parser;
   55|       |    use serial_test::serial;
   56|       |
   57|       |    #[test]
   58|       |    #[serial]
   59|      1|    fn namespace_detect_default_returns_global_via_detect() {
   60|       |        // Garante que sem flag e sem env, detect_namespace retorna "global"
   61|      1|        std::env::remove_var("SQLITE_GRAPHRAG_NAMESPACE");
   62|      1|        let resolution = namespace::detect_namespace(None).unwrap();
   63|      1|        assert_eq!(resolution.namespace, "global");
   64|      1|        assert_eq!(resolution.source, NamespaceSource::Default);
   65|       |    }
   66|       |
   67|       |    #[test]
   68|       |    #[serial]
   69|      1|    fn namespace_detect_explicit_flag_overrides_env() {
   70|      1|        std::env::set_var("SQLITE_GRAPHRAG_NAMESPACE", "env-namespace");
   71|      1|        let resolution = namespace::detect_namespace(Some("flag-namespace")).unwrap();
   72|      1|        assert_eq!(resolution.namespace, "flag-namespace");
   73|      1|        assert_eq!(resolution.source, NamespaceSource::ExplicitFlag);
   74|      1|        std::env::remove_var("SQLITE_GRAPHRAG_NAMESPACE");
   75|       |    }
   76|       |
   77|       |    #[test]
   78|       |    #[serial]
   79|      1|    fn namespace_detect_env_var_used_when_no_flag() {
   80|      1|        std::env::remove_var("SQLITE_GRAPHRAG_NAMESPACE");
   81|      1|        std::env::set_var("SQLITE_GRAPHRAG_NAMESPACE", "namespace-de-env");
   82|      1|        let resolution = namespace::detect_namespace(None).unwrap();
   83|      1|        assert_eq!(resolution.namespace, "namespace-de-env");
   84|      1|        assert_eq!(resolution.source, NamespaceSource::Environment);
   85|      1|        std::env::remove_var("SQLITE_GRAPHRAG_NAMESPACE");
   86|       |    }
   87|       |
   88|       |    #[test]
   89|      1|    fn namespace_detect_response_serializes_all_fields() {
   90|      1|        let resp = NamespaceDetectResponse {
   91|      1|            namespace: "meu-projeto".to_string(),
   92|      1|            source: NamespaceSource::ExplicitFlag,
   93|      1|            cwd: "/home/usuario/projeto".to_string(),
   94|      1|            elapsed_ms: 3,
   95|      1|        };
   96|      1|        let json = serde_json::to_value(&resp).unwrap();
   97|      1|        assert_eq!(json["namespace"], "meu-projeto");
   98|      1|        assert_eq!(json["source"], "explicit_flag");
   99|      1|        assert!(json["cwd"].is_string());
  100|      1|        assert_eq!(json["elapsed_ms"], 3);
  101|      1|    }
  102|       |
  103|       |    #[test]
  104|      1|    fn namespace_source_serializes_in_snake_case() {
  105|      1|        let cases = vec![
  106|      1|            (NamespaceSource::ExplicitFlag, "explicit_flag"),
  107|      1|            (NamespaceSource::Environment, "environment"),
  108|      1|            (NamespaceSource::Default, "default"),
  109|       |        ];
  110|      4|        for (source, expected) in cases {
                           ^3      ^3
  111|      3|            let json = serde_json::to_value(source).unwrap();
  112|      3|            assert_eq!(
  113|       |                json, expected,
  114|      0|                "NamespaceSource::{source:?} must serialize as \"{expected}\""
  115|       |            );
  116|       |        }
  117|      1|    }
  118|       |
  119|       |    #[test]
  120|      1|    fn namespace_detect_accepts_db_as_noop() {
  121|      1|        let cli = crate::cli::Cli::try_parse_from([
  122|      1|            "sqlite-graphrag",
  123|      1|            "namespace-detect",
  124|      1|            "--db",
  125|      1|            "/tmp/graphrag.sqlite",
  126|      1|        ])
  127|      1|        .expect("namespace-detect must accept --db as a no-op");
  128|       |
  129|      1|        match cli.command {
  130|      1|            crate::cli::Commands::NamespaceDetect(args) => {
  131|      1|                assert_eq!(args.db.as_deref(), Some("/tmp/graphrag.sqlite"));
  132|       |            }
  133|      0|            _ => unreachable!("unexpected command parsed"),
  134|       |        }
  135|      1|    }
  136|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/normalize_entities.rs:
    1|       |//! Handler for the `normalize-entities` CLI subcommand (GAP-15).
    2|       |//!
    3|       |//! Scans all existing entity names in the namespace and normalizes them to
    4|       |//! kebab-case ASCII using [`crate::parsers::normalize_entity_name`].
    5|       |//! When a normalized name already exists (collision), the source entity is
    6|       |//! merged into the target using the same logic as `merge-entities`:
    7|       |//! relationships are retargeted via `UPDATE OR IGNORE` + `DELETE`, then
    8|       |//! the source row is removed. Otherwise the entity name is updated in place.
    9|       |
   10|       |use crate::errors::AppError;
   11|       |use crate::output::{self, OutputFormat};
   12|       |use crate::parsers::normalize_entity_name;
   13|       |use crate::paths::AppPaths;
   14|       |use crate::storage::connection::open_rw;
   15|       |use rusqlite::params;
   16|       |use serde::Serialize;
   17|       |
   18|       |#[derive(clap::Args)]
   19|       |#[command(after_long_help = "EXAMPLES:\n  \
   20|       |    # Preview which entities would be renamed or merged\n  \
   21|       |    sqlite-graphrag normalize-entities --dry-run\n\n  \
   22|       |    # Apply normalization to all entity names\n  \
   23|       |    sqlite-graphrag normalize-entities --yes\n\n  \
   24|       |    # Scope to a specific namespace\n  \
   25|       |    sqlite-graphrag normalize-entities --yes --namespace my-project\n\n\
   26|       |NOTE:\n  \
   27|       |    When a normalized name already exists, the source entity is merged into\n  \
   28|       |    the existing target via relationship retargeting (UPDATE OR IGNORE + DELETE).\n  \
   29|       |    Run `cleanup-orphans` afterwards to remove any newly orphaned entities.")]
   30|       |pub struct NormalizeEntitiesArgs {
   31|       |    /// Preview changes without persisting them.
   32|       |    #[arg(long, conflicts_with = "yes")]
   33|       |    pub dry_run: bool,
   34|       |    /// Apply normalization without interactive confirmation.
   35|       |    #[arg(long, conflicts_with = "dry_run")]
   36|       |    pub yes: bool,
   37|       |    #[arg(long)]
   38|       |    pub namespace: Option<String>,
   39|       |    #[arg(long, value_enum, default_value = "json")]
   40|       |    pub format: OutputFormat,
   41|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   42|       |    pub json: bool,
   43|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   44|       |    pub db: Option<String>,
   45|       |}
   46|       |
   47|       |#[derive(Serialize)]
   48|       |struct NormalizeEntitiesResponse {
   49|       |    /// "normalized" when changes were applied, "dry_run" when only previewed.
   50|       |    action: String,
   51|       |    /// Number of entities whose names were updated in place.
   52|       |    normalized_count: usize,
   53|       |    /// Number of entities that collided with an existing normalized name and
   54|       |    /// were merged into the target.
   55|       |    merged_count: usize,
   56|       |    namespace: String,
   57|       |    /// Total execution time in milliseconds from handler start to serialisation.
   58|       |    elapsed_ms: u64,
   59|       |}
   60|       |
   61|      0|pub fn run(args: NormalizeEntitiesArgs) -> Result<(), AppError> {
   62|      0|    let inicio = std::time::Instant::now();
   63|       |
   64|      0|    if !args.dry_run && !args.yes {
   65|      0|        return Err(AppError::Validation(
   66|      0|            "pass --dry-run to preview or --yes to apply changes".to_string(),
   67|      0|        ));
   68|      0|    }
   69|       |
   70|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
   71|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
   72|       |
   73|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
   74|       |
   75|      0|    let mut conn = open_rw(&paths.db)?;
   76|       |
   77|       |    // Collect all entity (id, name) pairs for the namespace.
   78|      0|    let entities: Vec<(i64, String)> = {
   79|      0|        let mut stmt =
   80|      0|            conn.prepare_cached("SELECT id, name FROM entities WHERE namespace = ?1 ORDER BY id")?;
   81|      0|        let rows = stmt.query_map(params![namespace], |r| {
   82|      0|            Ok((r.get::<_, i64>(0)?, r.get::<_, String>(1)?))
   83|      0|        })?;
   84|      0|        rows.collect::<Result<Vec<_>, _>>()?
   85|       |    };
   86|       |
   87|       |    // Compute which names need changing.
   88|      0|    let to_change: Vec<(i64, String, String)> = entities
   89|      0|        .iter()
   90|      0|        .filter_map(|(id, name)| {
   91|      0|            let normalized = normalize_entity_name(name);
   92|      0|            if normalized != *name {
   93|      0|                Some((*id, name.clone(), normalized))
   94|       |            } else {
   95|      0|                None
   96|       |            }
   97|      0|        })
   98|      0|        .collect();
   99|       |
  100|       |    // G10: classify changes into renames (no collision) and merges (collision).
  101|       |    // A collision occurs when two distinct names normalize to the same target,
  102|       |    // or when the normalized target already exists in the DB as an already-normalized entity.
  103|      0|    let already_normalized: std::collections::HashSet<String> = entities
  104|      0|        .iter()
  105|      0|        .filter(|(_, name)| normalize_entity_name(name) == *name)
  106|      0|        .map(|(_, name)| name.clone())
  107|      0|        .collect();
  108|       |
  109|      0|    let mut target_groups: std::collections::HashMap<String, usize> =
  110|      0|        std::collections::HashMap::with_capacity(to_change.len());
  111|      0|    for (_, _, normalized) in &to_change {
  112|      0|        *target_groups.entry(normalized.clone()).or_insert(0) += 1;
  113|      0|    }
  114|       |
  115|      0|    let mut merge_count_preview: usize = 0;
  116|      0|    let mut rename_count_preview: usize = 0;
  117|      0|    for (target, count) in &target_groups {
  118|      0|        if *count > 1 || already_normalized.contains(target) {
  119|       |            // All sources in this group will merge into the existing or first entity
  120|      0|            let extra = if already_normalized.contains(target) {
  121|      0|                *count // all merge into existing
  122|       |            } else {
  123|      0|                count - 1 // first one renames, rest merge
  124|       |            };
  125|      0|            merge_count_preview += extra;
  126|      0|            rename_count_preview += count - extra;
  127|      0|        } else {
  128|      0|            rename_count_preview += 1;
  129|      0|        }
  130|       |    }
  131|       |
  132|      0|    if args.dry_run {
  133|      0|        let response = NormalizeEntitiesResponse {
  134|      0|            action: "dry_run".to_string(),
  135|      0|            normalized_count: rename_count_preview,
  136|      0|            merged_count: merge_count_preview,
  137|      0|            namespace,
  138|      0|            elapsed_ms: inicio.elapsed().as_millis() as u64,
  139|      0|        };
  140|      0|        match args.format {
  141|      0|            OutputFormat::Json => output::emit_json(&response)?,
  142|      0|            OutputFormat::Text | OutputFormat::Markdown => {
  143|      0|                output::emit_text(&format!(
  144|      0|                    "dry_run: {} entity names would be normalized",
  145|      0|                    response.normalized_count
  146|      0|                ));
  147|      0|            }
  148|       |        }
  149|      0|        return Ok(());
  150|      0|    }
  151|       |
  152|       |    // Apply changes inside a transaction.
  153|      0|    let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
  154|       |
  155|      0|    let mut normalized_count: usize = 0;
  156|      0|    let mut merged_count: usize = 0;
  157|       |
  158|      0|    for (src_id, _original_name, normalized) in &to_change {
  159|       |        // Check whether a row with the normalized name already exists.
  160|      0|        let existing_id: Option<i64> = {
  161|      0|            let mut stmt =
  162|      0|                tx.prepare_cached("SELECT id FROM entities WHERE namespace = ?1 AND name = ?2")?;
  163|      0|            match stmt.query_row(params![namespace, normalized], |r| r.get::<_, i64>(0)) {
  164|      0|                Ok(id) => Some(id),
  165|      0|                Err(rusqlite::Error::QueryReturnedNoRows) => None,
  166|      0|                Err(e) => return Err(AppError::Database(e)),
  167|       |            }
  168|       |        };
  169|       |
  170|      0|        match existing_id {
  171|      0|            Some(target_id) if target_id != *src_id => {
  172|       |                // Collision: merge source into target using UPDATE OR IGNORE + DELETE.
  173|       |                // Step 1a: redirect source_id.
  174|      0|                tx.execute(
  175|      0|                    "UPDATE OR IGNORE relationships SET source_id = ?1 WHERE source_id = ?2",
  176|      0|                    params![target_id, src_id],
  177|      0|                )?;
  178|      0|                tx.execute(
  179|      0|                    "DELETE FROM relationships WHERE source_id = ?1",
  180|      0|                    params![src_id],
  181|      0|                )?;
  182|       |                // Step 1b: redirect target_id.
  183|      0|                tx.execute(
  184|      0|                    "UPDATE OR IGNORE relationships SET target_id = ?1 WHERE target_id = ?2",
  185|      0|                    params![target_id, src_id],
  186|      0|                )?;
  187|      0|                tx.execute(
  188|      0|                    "DELETE FROM relationships WHERE target_id = ?1",
  189|      0|                    params![src_id],
  190|      0|                )?;
  191|       |                // Remove self-loops.
  192|      0|                tx.execute("DELETE FROM relationships WHERE source_id = target_id", [])?;
  193|       |                // Retarget memory_entities bindings.
  194|      0|                tx.execute(
  195|      0|                    "UPDATE OR IGNORE memory_entities SET entity_id = ?1 WHERE entity_id = ?2",
  196|      0|                    params![target_id, src_id],
  197|      0|                )?;
  198|      0|                tx.execute(
  199|      0|                    "DELETE FROM memory_entities WHERE entity_id = ?1",
  200|      0|                    params![src_id],
  201|      0|                )?;
  202|       |                // Remove the source entity row.
  203|      0|                tx.execute("DELETE FROM entities WHERE id = ?1", params![src_id])?;
  204|       |                // Recalculate degree for the surviving target.
  205|      0|                tx.execute(
  206|      0|                    "UPDATE entities
  207|      0|                     SET degree = (SELECT COUNT(*) FROM relationships
  208|      0|                                   WHERE source_id = entities.id OR target_id = entities.id)
  209|      0|                     WHERE id = ?1",
  210|      0|                    params![target_id],
  211|      0|                )?;
  212|      0|                tracing::info!(target: "normalize_entities",
  213|       |                    src_id = src_id,
  214|       |                    target_id = target_id,
  215|       |                    normalized = normalized,
  216|      0|                    "entity merged into existing normalized target"
  217|       |                );
  218|      0|                merged_count += 1;
  219|       |            }
  220|       |            _ => {
  221|       |                // No collision: simple rename.
  222|      0|                tx.execute(
  223|      0|                    "UPDATE entities SET name = ?1, updated_at = unixepoch() WHERE id = ?2",
  224|      0|                    params![normalized, src_id],
  225|      0|                )?;
  226|      0|                tracing::info!(target: "normalize_entities",
  227|       |                    entity_id = src_id,
  228|       |                    normalized = normalized,
  229|      0|                    "entity name normalized"
  230|       |                );
  231|      0|                normalized_count += 1;
  232|       |            }
  233|       |        }
  234|       |    }
  235|       |
  236|      0|    tx.commit()?;
  237|      0|    conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
  238|       |
  239|      0|    let response = NormalizeEntitiesResponse {
  240|      0|        action: "normalized".to_string(),
  241|      0|        normalized_count,
  242|      0|        merged_count,
  243|      0|        namespace,
  244|      0|        elapsed_ms: inicio.elapsed().as_millis() as u64,
  245|      0|    };
  246|       |
  247|      0|    match args.format {
  248|      0|        OutputFormat::Json => output::emit_json(&response)?,
  249|      0|        OutputFormat::Text | OutputFormat::Markdown => {
  250|      0|            output::emit_text(&format!(
  251|      0|                "normalized: {} renamed, {} merged",
  252|      0|                response.normalized_count, response.merged_count
  253|      0|            ));
  254|      0|        }
  255|       |    }
  256|       |
  257|      0|    Ok(())
  258|      0|}
  259|       |
  260|       |#[cfg(test)]
  261|       |mod tests {
  262|       |    use super::*;
  263|       |    use crate::storage::connection::register_vec_extension;
  264|       |    use rusqlite::Connection;
  265|       |    use tempfile::TempDir;
  266|       |
  267|       |    type TestResult = Result<(), Box<dyn std::error::Error>>;
  268|       |
  269|       |    /// Opens a temp DB with the full schema applied via migrations.
  270|      3|    fn setup_db() -> Result<(TempDir, Connection), Box<dyn std::error::Error>> {
  271|      3|        register_vec_extension();
  272|      3|        let tmp = TempDir::new()?;
                                              ^0
  273|      3|        let db_path = tmp.path().join("test.db");
  274|      3|        let mut conn = Connection::open(&db_path)?;
                                                               ^0
  275|      3|        crate::migrations::runner().run(&mut conn)?;
                                                                ^0
  276|      3|        Ok((tmp, conn))
  277|      3|    }
  278|       |
  279|       |    /// Inserts an entity bypassing `upsert_entity` normalization, so tests can
  280|       |    /// seed deliberately un-normalized names.
  281|      5|    fn insert_entity(conn: &Connection, name: &str) -> Result<i64, Box<dyn std::error::Error>> {
  282|       |        // Bypass upsert_entity normalization to seed raw (un-normalized) names.
  283|      5|        conn.execute(
  284|      5|            "INSERT INTO entities (namespace, name, type, description) VALUES ('global', ?1, 'concept', NULL)",
  285|      5|            params![name],
  286|      0|        )?;
  287|      5|        let id: i64 = conn.query_row(
  288|      5|            "SELECT id FROM entities WHERE namespace = 'global' AND name = ?1",
  289|      5|            params![name],
  290|      5|            |r| r.get(0),
  291|      0|        )?;
  292|      5|        Ok(id)
  293|      5|    }
  294|       |
  295|       |    #[test]
  296|      1|    fn dry_run_returns_count_without_changes() -> TestResult {
  297|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
  298|      1|        insert_entity(&conn, "Hello World")?;
                                                         ^0
  299|      1|        insert_entity(&conn, "already-normalized")?;
                                                                ^0
  300|       |
  301|       |        // Verify "Hello World" exists.
  302|      1|        let count: i64 = conn.query_row(
  303|      1|            "SELECT COUNT(*) FROM entities WHERE name = 'Hello World' AND namespace = 'global'",
  304|      1|            [],
  305|      1|            |r| r.get(0),
  306|      0|        )?;
  307|      1|        assert_eq!(count, 1, "entity must exist before dry run");
                                           ^0
  308|       |
  309|       |        // dry_run must not modify anything.
  310|      1|        let count_after: i64 = conn.query_row(
  311|      1|            "SELECT COUNT(*) FROM entities WHERE name = 'Hello World' AND namespace = 'global'",
  312|      1|            [],
  313|      1|            |r| r.get(0),
  314|      0|        )?;
  315|      1|        assert_eq!(count_after, 1, "dry run must not rename entities");
                                                 ^0
  316|      1|        Ok(())
  317|      1|    }
  318|       |
  319|       |    #[test]
  320|      1|    fn renames_unnormalized_entity_in_place() -> TestResult {
  321|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
  322|      1|        let src_id = insert_entity(&conn, "Hello World")?;
                                                                      ^0
  323|       |
  324|       |        // Apply normalization directly via the internal logic.
  325|       |        {
  326|      1|            let normalized = normalize_entity_name("Hello World");
  327|      1|            let existing: Option<i64> = {
  328|      1|                match conn.query_row(
  329|      1|                    "SELECT id FROM entities WHERE namespace = 'global' AND name = ?1",
  330|      1|                    params![normalized],
  331|      0|                    |r| r.get::<_, i64>(0),
  332|       |                ) {
  333|      0|                    Ok(id) => Some(id),
  334|      1|                    Err(rusqlite::Error::QueryReturnedNoRows) => None,
  335|      0|                    Err(e) => return Err(e.into()),
  336|       |                }
  337|       |            };
  338|      1|            assert!(existing.is_none(), "no collision expected");
                                                      ^0
  339|      1|            conn.execute(
  340|      1|                "UPDATE entities SET name = ?1 WHERE id = ?2",
  341|      1|                params![normalized, src_id],
  342|      0|            )?;
  343|       |        }
  344|       |
  345|      1|        let name: String = conn.query_row(
  346|      1|            "SELECT name FROM entities WHERE id = ?1",
  347|      1|            params![src_id],
  348|      1|            |r| r.get(0),
  349|      0|        )?;
  350|      1|        assert_eq!(name, "hello-world");
  351|      1|        Ok(())
  352|      1|    }
  353|       |
  354|       |    #[test]
  355|      1|    fn merges_into_existing_on_collision() -> TestResult {
  356|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
  357|       |        // Target already exists with the normalized name.
  358|      1|        let target_id = insert_entity(&conn, "hello-world")?;
                                                                         ^0
  359|       |        // Source has the un-normalized form that normalizes to the same value.
  360|      1|        let src_id = insert_entity(&conn, "Hello World")?;
                                                                      ^0
  361|       |
  362|       |        // Insert a relationship attached to src_id.
  363|      1|        conn.execute(
  364|      1|            "INSERT INTO relationships (namespace, source_id, target_id, relation, weight)
  365|      1|             VALUES ('global', ?1, ?1, 'related', 0.5)",
  366|      1|            params![src_id],
  367|      0|        )?;
  368|       |
  369|       |        // Merge: retarget relationships from src → target.
  370|      1|        conn.execute(
  371|      1|            "UPDATE OR IGNORE relationships SET source_id = ?1 WHERE source_id = ?2",
  372|      1|            params![target_id, src_id],
  373|      0|        )?;
  374|      1|        conn.execute(
  375|      1|            "DELETE FROM relationships WHERE source_id = ?1",
  376|      1|            params![src_id],
  377|      0|        )?;
  378|      1|        conn.execute("DELETE FROM entities WHERE id = ?1", params![src_id])?;
                                                                                         ^0
  379|       |
  380|       |        // Source must be gone.
  381|      1|        let src_exists: i64 = conn.query_row(
  382|      1|            "SELECT COUNT(*) FROM entities WHERE id = ?1",
  383|      1|            params![src_id],
  384|      1|            |r| r.get(0),
  385|      0|        )?;
  386|      1|        assert_eq!(src_exists, 0, "source entity must be deleted after merge");
                                                ^0
  387|       |
  388|       |        // Target must still exist.
  389|      1|        let target_name: String = conn.query_row(
  390|      1|            "SELECT name FROM entities WHERE id = ?1",
  391|      1|            params![target_id],
  392|      1|            |r| r.get(0),
  393|      0|        )?;
  394|      1|        assert_eq!(target_name, "hello-world");
  395|      1|        Ok(())
  396|      1|    }
  397|       |
  398|       |    #[test]
  399|      1|    fn normalize_entities_response_serializes_correctly() {
  400|      1|        let resp = NormalizeEntitiesResponse {
  401|      1|            action: "normalized".to_string(),
  402|      1|            normalized_count: 3,
  403|      1|            merged_count: 1,
  404|      1|            namespace: "global".to_string(),
  405|      1|            elapsed_ms: 42,
  406|      1|        };
  407|      1|        let json = serde_json::to_value(&resp).expect("serialization");
  408|      1|        assert_eq!(json["action"], "normalized");
  409|      1|        assert_eq!(json["normalized_count"], 3);
  410|      1|        assert_eq!(json["merged_count"], 1);
  411|      1|        assert_eq!(json["namespace"], "global");
  412|      1|        assert!(json["elapsed_ms"].as_u64().is_some());
  413|      1|    }
  414|       |
  415|       |    #[test]
  416|      1|    fn dry_run_response_has_correct_action() {
  417|      1|        let resp = NormalizeEntitiesResponse {
  418|      1|            action: "dry_run".to_string(),
  419|      1|            normalized_count: 5,
  420|      1|            merged_count: 0,
  421|      1|            namespace: "test".to_string(),
  422|      1|            elapsed_ms: 1,
  423|      1|        };
  424|      1|        let json = serde_json::to_value(&resp).expect("serialization");
  425|      1|        assert_eq!(json["action"], "dry_run");
  426|      1|    }
  427|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/optimize.rs:
    1|       |//! Handler for the `optimize` CLI subcommand.
    2|       |
    3|       |use crate::commands::fts::check_fts_functional;
    4|       |use crate::errors::AppError;
    5|       |use crate::output;
    6|       |use crate::paths::AppPaths;
    7|       |use crate::storage::connection::open_rw;
    8|       |use serde::Serialize;
    9|       |
   10|       |#[derive(clap::Args)]
   11|       |#[command(after_long_help = "EXAMPLES:\n  \
   12|       |    # Run PRAGMA optimize on the default database\n  \
   13|       |    sqlite-graphrag optimize\n\n  \
   14|       |    # Optimize a database at a custom path\n  \
   15|       |    sqlite-graphrag optimize --db /path/to/graphrag.sqlite\n\n  \
   16|       |    # Skip the FTS5 rebuild even if the index looks unhealthy\n  \
   17|       |    sqlite-graphrag optimize --skip-fts\n\n  \
   18|       |    # Dry-run: only report FTS5 health status, do not rebuild\n  \
   19|       |    sqlite-graphrag optimize --fts-dry-run\n\n  \
   20|       |    # Run optimize non-interactively (skip confirmation prompts)\n  \
   21|       |    sqlite-graphrag optimize --yes\n\n  \
   22|       |    # Force a full FTS5 rebuild even if the index already passes integrity-check\n  \
   23|       |    sqlite-graphrag optimize --no-fts-skip-when-functional\n\n  \
   24|       |    # Optimize via SQLITE_GRAPHRAG_DB_PATH env var\n  \
   25|       |    SQLITE_GRAPHRAG_DB_PATH=/data/graphrag.sqlite sqlite-graphrag optimize")]
   26|       |pub struct OptimizeArgs {
   27|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   28|       |    pub json: bool,
   29|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   30|       |    pub db: Option<String>,
   31|       |    #[arg(long, default_value_t = false, help = "Skip FTS5 index rebuild")]
   32|       |    pub skip_fts: bool,
   33|       |    /// When true (default), the FTS5 rebuild step is skipped when
   34|       |    /// `fts check` reports the index is already functional. Saves 5-15
   35|       |    /// minutes on large databases. Set to false to always rebuild.
   36|       |    #[arg(
   37|       |        long,
   38|       |        default_value_t = true,
   39|       |        help = "Skip FTS5 rebuild when index is already functional (saves minutes on big DBs)"
   40|       |    )]
   41|       |    pub fts_skip_when_functional: bool,
   42|       |    /// G36 Passo 2 (v1.0.69): run `fts check` + `fts stats` only, do not
   43|       |    /// trigger any rebuild. Exit code is 0 when the index is healthy, 1
   44|       |    /// when a rebuild would be recommended.
   45|       |    #[arg(
   46|       |        long,
   47|       |        default_value_t = false,
   48|       |        help = "G36: only run fts check + fts stats, do not rebuild (exit 1 if rebuild recommended)"
   49|       |    )]
   50|       |    pub fts_dry_run: bool,
   51|       |    /// G36 Passo 3 (v1.0.69): emit a tracing::info! progress line every
   52|       |    /// N seconds during the FTS5 rebuild. The FTS5 `rebuild` command is
   53|       |    /// synchronous and does not call the SQLite progress handler, so the
   54|       |    /// progress is sampled at the configured interval. Use 0 to disable.
   55|       |    #[arg(
   56|       |        long,
   57|       |        default_value_t = 30,
   58|       |        help = "G36: emit progress line every N seconds during FTS5 rebuild (0 to disable)"
   59|       |    )]
   60|       |    pub fts_progress: u64,
   61|       |    /// G36 Passo 4 (v1.0.69): skip all confirmation prompts. Required
   62|       |    /// for non-interactive CI/CD pipelines that cannot answer `y/N`.
   63|       |    #[arg(
   64|       |        long,
   65|       |        default_value_t = false,
   66|       |        help = "G36: skip confirmation prompts (required for non-interactive CI)"
   67|       |    )]
   68|       |    pub yes: bool,
   69|       |}
   70|       |
   71|       |#[derive(Serialize)]
   72|       |struct OptimizeResponse {
   73|       |    db_path: String,
   74|       |    status: String,
   75|       |    /// True when the FTS5 index was rebuilt during this optimize run.
   76|       |    fts_rebuilt: bool,
   77|       |    /// True when the FTS5 rebuild was skipped because the index was already healthy.
   78|       |    fts_skipped_functional: bool,
   79|       |    /// True when FTS5 was detected as unhealthy AND the rebuild was attempted.
   80|       |    fts_unhealthy: bool,
   81|       |    /// Number of FTS5 rows indexed during the rebuild (G36 progress observability).
   82|       |    fts_rows_indexed: Option<i64>,
   83|       |    /// Total execution time in milliseconds from handler start to serialisation.
   84|       |    elapsed_ms: u64,
   85|       |}
   86|       |
   87|      1|pub fn run(args: OptimizeArgs) -> Result<(), AppError> {
   88|      1|    let inicio = std::time::Instant::now();
   89|      1|    let paths = AppPaths::resolve(args.db.as_deref())?;
                                                                   ^0
   90|       |
   91|      1|    crate::storage::connection::ensure_db_ready(&paths)?;
                                                                     ^0
   92|       |
   93|      1|    let conn = open_rw(&paths.db)?;
                                               ^0
   94|      1|    conn.execute_batch("PRAGMA optimize;")?;
                                                        ^0
   95|       |
   96|       |    // G36: pre-check FTS5 health before triggering a multi-minute rebuild.
   97|      1|    let fts_functional = if !args.skip_fts {
   98|      1|        check_fts_functional(&conn).unwrap_or(false)
   99|       |    } else {
  100|      0|        false
  101|       |    };
  102|       |
  103|       |    // G36 Passo 2 (v1.0.69): dry-run path. Run fts check + fts stats, emit
  104|       |    // JSON envelope, and return exit 1 when a rebuild would be recommended.
  105|      1|    if args.fts_dry_run {
  106|      0|        let recommend_rebuild = !fts_functional;
  107|      0|        output::emit_json(&OptimizeResponse {
  108|      0|            db_path: paths.db.display().to_string(),
  109|      0|            status: if recommend_rebuild {
  110|      0|                "rebuild_recommended".to_string()
  111|       |            } else {
  112|      0|                "ok".to_string()
  113|       |            },
  114|       |            fts_rebuilt: false,
  115|       |            fts_skipped_functional: false,
  116|      0|            fts_unhealthy: !fts_functional,
  117|      0|            fts_rows_indexed: None,
  118|      0|            elapsed_ms: inicio.elapsed().as_millis() as u64,
  119|      0|        })?;
  120|      0|        if recommend_rebuild {
  121|      0|            std::process::exit(1);
  122|      0|        }
  123|      0|        return Ok(());
  124|      1|    }
  125|       |
  126|      1|    let (fts_rebuilt, fts_skipped_functional, fts_unhealthy, fts_rows_indexed) = if args.skip_fts {
  127|      0|        (false, false, false, None)
  128|      1|    } else if args.fts_skip_when_functional && fts_functional {
  129|      1|        tracing::info!(target: "optimize",
  130|      0|            "FTS5 index already functional; skipping rebuild (use --no-fts-skip-when-functional to override)"
  131|       |        );
  132|      1|        (false, true, false, None)
  133|       |    } else {
  134|      0|        if !fts_functional {
  135|      0|            tracing::warn!(target: "optimize",
  136|      0|                "FTS5 index reported unhealthy; running full rebuild"
  137|       |            );
  138|      0|        }
  139|       |        // Capture row count BEFORE rebuild so we can report progress.
  140|       |        // (FTS5 rebuild is synchronous; a true callback would require
  141|       |        // `sqlite3_progress_handler` which the FTS5 'rebuild' command
  142|       |        // does not respect. We sample the row count after.)
  143|      0|        let before: i64 = conn
  144|      0|            .query_row("SELECT COUNT(*) FROM fts_memories", [], |r| r.get(0))
  145|      0|            .unwrap_or(0);
  146|       |        // G36 Passo 3 (v1.0.69): spawn a lightweight background thread that
  147|       |        // emits a tracing::info! progress line every `args.fts_progress`
  148|       |        // seconds while the rebuild is in flight. The FTS5 rebuild command
  149|       |        // is synchronous and does not call the SQLite progress handler, so
  150|       |        // the only observability we can add is a row-count poll from a
  151|       |        // background thread. We open a SEPARATE read-only connection
  152|       |        // because `rusqlite::Connection` is not `Sync` and the rebuild
  153|       |        // holds the main connection exclusively. Default 30s; 0 disables.
  154|      0|        let progress_thread = if args.fts_progress > 0 {
  155|      0|            let interval = std::time::Duration::from_secs(args.fts_progress);
  156|      0|            let db_path = paths.db.clone();
  157|      0|            let child = std::thread::spawn(move || loop {
  158|      0|                std::thread::sleep(interval);
  159|      0|                let count: i64 = match crate::storage::connection::open_ro(&db_path) {
  160|      0|                    Ok(c) => c
  161|      0|                        .query_row("SELECT COUNT(*) FROM fts_memories", [], |r| r.get(0))
  162|      0|                        .unwrap_or(-1),
  163|      0|                    Err(_) => -1,
  164|       |                };
  165|      0|                tracing::info!(target: "optimize", fts_rows = count, "FTS5 rebuild progress sample");
  166|       |            });
  167|      0|            Some(child)
  168|       |        } else {
  169|      0|            None
  170|       |        };
  171|      0|        let rebuilt_ok = conn
  172|      0|            .execute_batch("INSERT INTO fts_memories(fts_memories) VALUES('rebuild');")
  173|      0|            .is_ok();
  174|      0|        if let Some(handle) = progress_thread {
  175|      0|            // The thread runs forever in a sleep loop; we leak it on
  176|      0|            // purpose because (a) it terminates when the process exits
  177|      0|            // and (b) we cannot safely join without a stop signal channel
  178|      0|            // which would add complexity not warranted for a 30s sampler.
  179|      0|            std::mem::forget(handle);
  180|      0|        }
  181|      0|        let after: i64 = if rebuilt_ok {
  182|      0|            conn.query_row("SELECT COUNT(*) FROM fts_memories", [], |r| r.get(0))
  183|      0|                .unwrap_or(0)
  184|       |        } else {
  185|      0|            0
  186|       |        };
  187|       |        // G36 progress: rows_indexed == after - before.  Emitted as a
  188|       |        // tracing::info! line so operators following logs see the
  189|       |        // rebuild magnitude without needing NDJSON streaming.
  190|      0|        tracing::info!(target: "optimize", before, after, "FTS5 rebuild complete");
  191|      0|        (rebuilt_ok, false, !fts_functional, Some(after - before))
  192|       |    };
  193|       |
  194|       |    // G36 Passo 4 (v1.0.69): --yes flag is currently honored for forward
  195|       |    // compatibility — every interactive prompt path in optimize must
  196|       |    // check this flag and skip the prompt when set. As of v1.0.69 there
  197|       |    // are no interactive prompts in optimize (the user is told up front
  198|       |    // via the after_long_help), but the flag is reserved so future
  199|       |    // confirmations can be added without breaking the CLI contract.
  200|      1|    let _ = args.yes;
  201|       |
  202|      1|    output::emit_json(&OptimizeResponse {
  203|      1|        db_path: paths.db.display().to_string(),
  204|      1|        status: "ok".to_string(),
  205|      1|        fts_rebuilt,
  206|      1|        fts_skipped_functional,
  207|      1|        fts_unhealthy,
  208|      1|        fts_rows_indexed,
  209|      1|        elapsed_ms: inicio.elapsed().as_millis() as u64,
  210|      1|    })?;
                    ^0
  211|       |
  212|      1|    Ok(())
  213|      1|}
  214|       |
  215|       |#[cfg(test)]
  216|       |mod tests {
  217|       |    use super::*;
  218|       |    use serial_test::serial;
  219|       |    use tempfile::TempDir;
  220|       |
  221|       |    #[test]
  222|      1|    fn optimize_response_serializes_required_fields() {
  223|      1|        let resp = OptimizeResponse {
  224|      1|            db_path: "/tmp/graphrag.sqlite".to_string(),
  225|      1|            status: "ok".to_string(),
  226|      1|            fts_rebuilt: false,
  227|      1|            fts_rows_indexed: None,
  228|      1|            fts_skipped_functional: false,
  229|      1|            fts_unhealthy: false,
  230|      1|            elapsed_ms: 5,
  231|      1|        };
  232|      1|        let json = serde_json::to_value(&resp).unwrap();
  233|      1|        assert_eq!(json["status"], "ok");
  234|      1|        assert_eq!(json["db_path"], "/tmp/graphrag.sqlite");
  235|      1|        assert_eq!(json["elapsed_ms"], 5);
  236|      1|    }
  237|       |
  238|       |    #[test]
  239|       |    #[serial]
  240|      1|    fn optimize_auto_inits_when_db_missing() {
  241|      1|        let dir = TempDir::new().unwrap();
  242|      1|        let db_path = dir.path().join("missing.sqlite");
  243|       |        // SAFETY: `#[serial]` guarantees single-threaded execution.
  244|      1|        unsafe {
  245|      1|            std::env::set_var("SQLITE_GRAPHRAG_DB_PATH", db_path.to_str().unwrap());
  246|      1|            std::env::set_var("LOG_LEVEL", "error");
  247|      1|        }
  248|       |
  249|      1|        let args = OptimizeArgs {
  250|      1|            json: false,
  251|      1|            db: Some(db_path.to_string_lossy().into_owned()),
  252|      1|            skip_fts: false,
  253|      1|            fts_skip_when_functional: true,
  254|      1|            fts_dry_run: false,
  255|      1|            fts_progress: 30,
  256|      1|            yes: true,
  257|      1|        };
  258|      1|        let result = run(args);
  259|      1|        assert!(
  260|      1|            result.is_ok(),
  261|      0|            "auto-init must succeed and PRAGMA optimize must run on the fresh database, got {result:?}"
  262|       |        );
  263|      1|        assert!(
  264|      1|            db_path.exists(),
  265|      0|            "auto-init must create the database file at {}",
  266|      0|            db_path.display()
  267|       |        );
  268|       |        // SAFETY: `#[serial]` guarantees single-threaded execution.
  269|      1|        unsafe {
  270|      1|            std::env::remove_var("SQLITE_GRAPHRAG_DB_PATH");
  271|      1|            std::env::remove_var("LOG_LEVEL");
  272|      1|        }
  273|       |    }
  274|       |
  275|       |    #[test]
  276|      1|    fn optimize_response_status_ok_fixo() {
  277|      1|        let resp = OptimizeResponse {
  278|      1|            db_path: "/qualquer/caminho".to_string(),
  279|      1|            status: "ok".to_string(),
  280|      1|            fts_rebuilt: false,
  281|      1|            fts_rows_indexed: None,
  282|      1|            fts_skipped_functional: false,
  283|      1|            fts_unhealthy: false,
  284|      1|            elapsed_ms: 0,
  285|      1|        };
  286|      1|        let json = serde_json::to_value(&resp).unwrap();
  287|      1|        assert_eq!(json["status"], "ok", "status deve ser sempre 'ok'");
                                                       ^0
  288|      1|    }
  289|       |
  290|       |    #[test]
  291|      1|    fn optimize_response_serializes_all_fields() {
  292|      1|        let resp = OptimizeResponse {
  293|      1|            db_path: "/data/x.sqlite".into(),
  294|      1|            status: "ok".into(),
  295|      1|            fts_rebuilt: true,
  296|      1|            fts_rows_indexed: Some(0),
  297|      1|            fts_skipped_functional: false,
  298|      1|            fts_unhealthy: true,
  299|      1|            elapsed_ms: 120,
  300|      1|        };
  301|      1|        let v = serde_json::to_value(&resp).unwrap();
  302|      1|        assert_eq!(v["db_path"], "/data/x.sqlite");
  303|      1|        assert_eq!(v["status"], "ok");
  304|      1|        assert_eq!(v["fts_rebuilt"], true);
  305|      1|        assert_eq!(v["fts_skipped_functional"], false);
  306|      1|        assert_eq!(v["fts_unhealthy"], true);
  307|      1|        assert_eq!(v["elapsed_ms"], 120u64);
  308|      1|    }
  309|       |
  310|       |    #[test]
  311|      1|    fn optimize_response_includes_fts_flags() {
  312|       |        // G36: operator must be able to distinguish (a) rebuilt, (b) skipped-healthy,
  313|       |        // (c) skipped-by-flag from (d) attempted-but-failed. The response
  314|       |        // exposes fts_rebuilt, fts_skipped_functional, fts_unhealthy booleans.
  315|      1|        let resp = OptimizeResponse {
  316|      1|            db_path: "/x".into(),
  317|      1|            status: "ok".into(),
  318|      1|            fts_rebuilt: true,
  319|      1|            fts_rows_indexed: Some(0),
  320|      1|            fts_skipped_functional: false,
  321|      1|            fts_unhealthy: true,
  322|      1|            elapsed_ms: 1,
  323|      1|        };
  324|      1|        let v = serde_json::to_value(&resp).unwrap();
  325|      1|        assert_eq!(v["fts_rebuilt"], true);
  326|      1|        assert_eq!(v["fts_skipped_functional"], false);
  327|      1|        assert_eq!(v["fts_unhealthy"], true);
  328|      1|    }
  329|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/prune_ner.rs:
    1|       |//! Handler for the `prune-ner` CLI subcommand.
    2|       |//!
    3|       |//! Removes NER bindings (rows in `memory_entities`) for a single entity or for
    4|       |//! all entities in the namespace. Useful for cleaning up low-quality automatic
    5|       |//! extractions without touching the entities or memories themselves.
    6|       |
    7|       |use crate::errors::AppError;
    8|       |use crate::output::{self, OutputFormat};
    9|       |use crate::paths::AppPaths;
   10|       |use crate::storage::connection::open_rw;
   11|       |use serde::Serialize;
   12|       |
   13|       |#[derive(clap::Args)]
   14|       |#[command(after_long_help = "EXAMPLES:\n  \
   15|       |    # Preview bindings that would be removed for a single entity\n  \
   16|       |    sqlite-graphrag prune-ner --entity jwt-token --dry-run\n\n  \
   17|       |    # Remove all NER bindings for a single entity\n  \
   18|       |    sqlite-graphrag prune-ner --entity jwt-token --yes\n\n  \
   19|       |    # Remove ALL NER bindings in the current namespace\n  \
   20|       |    sqlite-graphrag prune-ner --all --yes\n\n  \
   21|       |NOTE:\n  \
   22|       |    This command deletes rows from memory_entities (the link table between\n  \
   23|       |    memories and extracted entities). The entities and memories themselves\n  \
   24|       |    are not deleted. Use cleanup-orphans afterwards to remove entity nodes\n  \
   25|       |    that have no remaining links.")]
   26|       |pub struct PruneNerArgs {
   27|       |    /// Entity name whose bindings should be removed.
   28|       |    /// Mutually exclusive with --all.
   29|       |    #[arg(long, conflicts_with = "all", value_name = "NAME")]
   30|       |    pub entity: Option<String>,
   31|       |
   32|       |    /// Remove all NER bindings in the namespace. Mutually exclusive with --entity.
   33|       |    #[arg(long, conflicts_with = "entity", default_value_t = false)]
   34|       |    pub all: bool,
   35|       |
   36|       |    #[arg(long)]
   37|       |    pub namespace: Option<String>,
   38|       |
   39|       |    /// Preview count without deleting.
   40|       |    #[arg(long, default_value_t = false)]
   41|       |    pub dry_run: bool,
   42|       |
   43|       |    /// Skip confirmation for destructive operation.
   44|       |    #[arg(long, default_value_t = false)]
   45|       |    pub yes: bool,
   46|       |
   47|       |    #[arg(long, value_enum, default_value = "json")]
   48|       |    pub format: OutputFormat,
   49|       |
   50|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   51|       |    pub json: bool,
   52|       |
   53|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   54|       |    pub db: Option<String>,
   55|       |}
   56|       |
   57|       |#[derive(Serialize)]
   58|       |struct PruneNerResponse {
   59|       |    action: String,
   60|       |    bindings_removed: usize,
   61|       |    namespace: String,
   62|       |    /// Entity name targeted, when `--entity` was used.
   63|       |    #[serde(skip_serializing_if = "Option::is_none")]
   64|       |    entity: Option<String>,
   65|       |    /// Total execution time in milliseconds from handler start to serialisation.
   66|       |    elapsed_ms: u64,
   67|       |}
   68|       |
   69|      0|pub fn run(args: PruneNerArgs) -> Result<(), AppError> {
   70|      0|    let inicio = std::time::Instant::now();
   71|       |
   72|      0|    if args.entity.is_none() && !args.all {
   73|      0|        return Err(AppError::Validation(
   74|      0|            "either --entity <NAME> or --all must be specified".to_string(),
   75|      0|        ));
   76|      0|    }
   77|       |
   78|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
   79|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
   80|       |
   81|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
   82|       |
   83|      0|    let mut conn = open_rw(&paths.db)?;
   84|       |
   85|       |    // Count how many rows would be affected.
   86|      0|    let count: usize = if let Some(ref entity_name) = args.entity {
   87|      0|        conn.query_row(
   88|      0|            "SELECT COUNT(*) FROM memory_entities me
   89|      0|             JOIN entities e ON e.id = me.entity_id
   90|      0|             WHERE e.name = ?1 AND e.namespace = ?2",
   91|      0|            rusqlite::params![entity_name, namespace],
   92|      0|            |r| r.get::<_, i64>(0).map(|v| v as usize),
   93|      0|        )?
   94|       |    } else {
   95|      0|        conn.query_row(
   96|      0|            "SELECT COUNT(*) FROM memory_entities me
   97|      0|             JOIN entities e ON e.id = me.entity_id
   98|      0|             WHERE e.namespace = ?1",
   99|      0|            rusqlite::params![namespace],
  100|      0|            |r| r.get::<_, i64>(0).map(|v| v as usize),
  101|      0|        )?
  102|       |    };
  103|       |
  104|      0|    if args.dry_run {
  105|      0|        let response = PruneNerResponse {
  106|      0|            action: "dry_run".to_string(),
  107|      0|            bindings_removed: count,
  108|      0|            namespace: namespace.clone(),
  109|      0|            entity: args.entity.clone(),
  110|      0|            elapsed_ms: inicio.elapsed().as_millis() as u64,
  111|      0|        };
  112|       |
  113|      0|        match args.format {
  114|      0|            OutputFormat::Json => output::emit_json(&response)?,
  115|      0|            OutputFormat::Text | OutputFormat::Markdown => {
  116|      0|                output::emit_text(&format!(
  117|      0|                    "dry_run: {count} NER bindings would be removed [{namespace}]"
  118|      0|                ));
  119|      0|            }
  120|       |        }
  121|       |
  122|      0|        return Ok(());
  123|      0|    }
  124|       |
  125|      0|    if !args.yes {
  126|      0|        let response = PruneNerResponse {
  127|      0|            action: "aborted".to_string(),
  128|      0|            bindings_removed: count,
  129|      0|            namespace: namespace.clone(),
  130|      0|            entity: args.entity.clone(),
  131|      0|            elapsed_ms: inicio.elapsed().as_millis() as u64,
  132|      0|        };
  133|       |
  134|      0|        match args.format {
  135|      0|            OutputFormat::Json => output::emit_json(&response)?,
  136|      0|            OutputFormat::Text | OutputFormat::Markdown => {
  137|      0|                output::emit_text(&format!(
  138|      0|                    "aborted: {count} NER bindings would be removed; pass --yes to confirm [{namespace}]"
  139|      0|                ));
  140|      0|            }
  141|       |        }
  142|       |
  143|      0|        return Ok(());
  144|      0|    }
  145|       |
  146|       |    // Destructive path: COUNT + DELETE in same transaction for consistency.
  147|      0|    let removed: usize = if let Some(ref entity_name) = args.entity {
  148|       |        // Normalize to match the normalized stored entity names.
  149|      0|        let entity_name = crate::parsers::normalize_entity_name(entity_name);
  150|      0|        let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
  151|      0|        let n = tx.execute(
  152|      0|            "DELETE FROM memory_entities WHERE entity_id IN (
  153|      0|                 SELECT id FROM entities WHERE name = ?1 AND namespace = ?2
  154|      0|             )",
  155|      0|            rusqlite::params![entity_name, namespace],
  156|      0|        )?;
  157|      0|        tx.commit()?;
  158|      0|        n
  159|       |    } else {
  160|      0|        let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
  161|      0|        let n = tx.execute(
  162|      0|            "DELETE FROM memory_entities WHERE entity_id IN (
  163|      0|                 SELECT id FROM entities WHERE namespace = ?1
  164|      0|             )",
  165|      0|            rusqlite::params![namespace],
  166|      0|        )?;
  167|      0|        tx.commit()?;
  168|      0|        n
  169|       |    };
  170|       |
  171|      0|    conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
  172|       |
  173|      0|    tracing::info!(target: "prune_ner",
  174|       |        removed = removed,
  175|       |        namespace = %namespace,
  176|       |        entity = ?args.entity,
  177|      0|        "NER bindings pruned"
  178|       |    );
  179|       |
  180|      0|    let response = PruneNerResponse {
  181|      0|        action: "pruned".to_string(),
  182|      0|        bindings_removed: removed,
  183|      0|        namespace: namespace.clone(),
  184|      0|        entity: args.entity.clone(),
  185|      0|        elapsed_ms: inicio.elapsed().as_millis() as u64,
  186|      0|    };
  187|       |
  188|      0|    match args.format {
  189|      0|        OutputFormat::Json => output::emit_json(&response)?,
  190|      0|        OutputFormat::Text | OutputFormat::Markdown => {
  191|      0|            output::emit_text(&format!(
  192|      0|                "pruned: {removed} NER bindings removed [{namespace}]"
  193|      0|            ));
  194|      0|        }
  195|       |    }
  196|       |
  197|      0|    Ok(())
  198|      0|}
  199|       |
  200|       |#[cfg(test)]
  201|       |mod tests {
  202|       |    use super::*;
  203|       |
  204|       |    #[test]
  205|      1|    fn prune_ner_response_dry_run_serializes_correctly() {
  206|      1|        let resp = PruneNerResponse {
  207|      1|            action: "dry_run".to_string(),
  208|      1|            bindings_removed: 42,
  209|      1|            namespace: "global".to_string(),
  210|      1|            entity: Some("jwt-token".to_string()),
  211|      1|            elapsed_ms: 5,
  212|      1|        };
  213|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  214|      1|        assert_eq!(json["action"], "dry_run");
  215|      1|        assert_eq!(json["bindings_removed"], 42);
  216|      1|        assert_eq!(json["entity"], "jwt-token");
  217|      1|        assert_eq!(json["namespace"], "global");
  218|      1|    }
  219|       |
  220|       |    #[test]
  221|      1|    fn prune_ner_response_pruned_all_omits_entity() {
  222|      1|        let resp = PruneNerResponse {
  223|      1|            action: "pruned".to_string(),
  224|      1|            bindings_removed: 200,
  225|      1|            namespace: "project-x".to_string(),
  226|      1|            entity: None,
  227|      1|            elapsed_ms: 15,
  228|      1|        };
  229|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  230|      1|        assert_eq!(json["action"], "pruned");
  231|      1|        assert_eq!(json["bindings_removed"], 200);
  232|      1|        assert!(
  233|      1|            json.get("entity").is_none(),
  234|      0|            "entity must be omitted when None"
  235|       |        );
  236|      1|    }
  237|       |
  238|       |    #[test]
  239|      1|    fn prune_ner_response_aborted_includes_count() {
  240|      1|        let resp = PruneNerResponse {
  241|      1|            action: "aborted".to_string(),
  242|      1|            bindings_removed: 10,
  243|      1|            namespace: "global".to_string(),
  244|      1|            entity: None,
  245|      1|            elapsed_ms: 1,
  246|      1|        };
  247|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  248|      1|        assert_eq!(json["action"], "aborted");
  249|      1|        assert_eq!(json["bindings_removed"], 10);
  250|      1|        assert!(json["elapsed_ms"].is_number());
  251|      1|    }
  252|       |
  253|       |    #[test]
  254|      1|    fn prune_ner_response_zero_bindings() {
  255|      1|        let resp = PruneNerResponse {
  256|      1|            action: "pruned".to_string(),
  257|      1|            bindings_removed: 0,
  258|      1|            namespace: "global".to_string(),
  259|      1|            entity: Some("nonexistent".to_string()),
  260|      1|            elapsed_ms: 2,
  261|      1|        };
  262|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  263|      1|        assert_eq!(json["bindings_removed"], 0);
  264|      1|    }
  265|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/prune_relations.rs:
    1|       |//! Handler for the `prune-relations` CLI subcommand.
    2|       |
    3|       |use crate::errors::AppError;
    4|       |use crate::i18n;
    5|       |use crate::output::{self, OutputFormat};
    6|       |use crate::paths::AppPaths;
    7|       |use crate::storage::connection::open_rw;
    8|       |use crate::storage::entities;
    9|       |use serde::Serialize;
   10|       |
   11|       |#[derive(clap::Args)]
   12|       |#[command(after_long_help = "EXAMPLES:\n  \
   13|       |    # Preview how many 'mentions' relations would be removed\n  \
   14|       |    sqlite-graphrag prune-relations --relation mentions --dry-run\n\n  \
   15|       |    # Remove all 'mentions' relations without confirmation prompt\n  \
   16|       |    sqlite-graphrag prune-relations --relation mentions --yes\n\n\
   17|       |NOTE:\n  \
   18|       |    This command permanently deletes relationships. Use --dry-run first.\n  \
   19|       |    Entity degree counts are automatically recalculated after pruning.")]
   20|       |pub struct PruneRelationsArgs {
   21|       |    /// Relation type to delete (e.g. mentions, related, uses).
   22|       |    /// Accepts canonical and custom kebab-case/snake_case values.
   23|       |    #[arg(long, value_parser = crate::parsers::parse_relation, value_name = "RELATION")]
   24|       |    pub relation: String,
   25|       |    #[arg(long)]
   26|       |    pub namespace: Option<String>,
   27|       |    /// Preview count without deleting.
   28|       |    #[arg(long)]
   29|       |    pub dry_run: bool,
   30|       |    /// Skip confirmation for destructive operation.
   31|       |    #[arg(long)]
   32|       |    pub yes: bool,
   33|       |    /// Show affected entity names during --dry-run preview.
   34|       |    #[arg(long, default_value_t = false)]
   35|       |    pub show_entities: bool,
   36|       |    #[arg(long, value_enum, default_value = "json")]
   37|       |    pub format: OutputFormat,
   38|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   39|       |    pub json: bool,
   40|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   41|       |    pub db: Option<String>,
   42|       |}
   43|       |
   44|       |#[derive(Serialize)]
   45|       |struct PruneRelationsResponse {
   46|       |    action: String,
   47|       |    relation: String,
   48|       |    count: usize,
   49|       |    entities_affected: usize,
   50|       |    namespace: String,
   51|       |    /// Total execution time in milliseconds from handler start to serialisation.
   52|       |    elapsed_ms: u64,
   53|       |    #[serde(skip_serializing_if = "Option::is_none")]
   54|       |    affected_entity_names: Option<Vec<String>>,
   55|       |}
   56|       |
   57|      0|pub fn run(args: PruneRelationsArgs) -> Result<(), AppError> {
   58|      0|    let inicio = std::time::Instant::now();
   59|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
   60|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
   61|       |
   62|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
   63|       |
   64|      0|    crate::parsers::warn_if_non_canonical(&args.relation);
   65|       |
   66|      0|    let mut conn = open_rw(&paths.db)?;
   67|       |
   68|      0|    if args.dry_run {
   69|      0|        let count = entities::count_relationships_by_relation(&conn, &namespace, &args.relation)?;
   70|       |
   71|      0|        let affected_names = if args.show_entities {
   72|      0|            Some(entities::list_entity_names_by_relation(
   73|      0|                &conn,
   74|      0|                &namespace,
   75|      0|                &args.relation,
   76|      0|            )?)
   77|       |        } else {
   78|      0|            None
   79|       |        };
   80|       |
   81|      0|        let entities_affected_count = affected_names.as_ref().map_or(0, |v| v.len());
   82|       |
   83|      0|        output::emit_progress(&i18n::prune_dry_run(count, &args.relation));
   84|       |
   85|      0|        let response = PruneRelationsResponse {
   86|      0|            action: "dry_run".to_string(),
   87|      0|            relation: args.relation.clone(),
   88|      0|            count,
   89|      0|            entities_affected: entities_affected_count,
   90|      0|            namespace: namespace.clone(),
   91|      0|            elapsed_ms: inicio.elapsed().as_millis() as u64,
   92|      0|            affected_entity_names: affected_names,
   93|      0|        };
   94|       |
   95|      0|        match args.format {
   96|      0|            OutputFormat::Json => output::emit_json(&response)?,
   97|      0|            OutputFormat::Text | OutputFormat::Markdown => {
   98|      0|                output::emit_text(&format!(
   99|      0|                    "dry_run: {} '{}' relations would be removed [{}]",
  100|      0|                    response.count, response.relation, response.namespace
  101|      0|                ));
  102|      0|            }
  103|       |        }
  104|       |
  105|      0|        return Ok(());
  106|      0|    }
  107|       |
  108|      0|    if !args.yes {
  109|      0|        output::emit_progress(&i18n::prune_requires_yes());
  110|       |
  111|      0|        let count = entities::count_relationships_by_relation(&conn, &namespace, &args.relation)?;
  112|       |
  113|      0|        let response = PruneRelationsResponse {
  114|      0|            action: "aborted".to_string(),
  115|      0|            relation: args.relation.clone(),
  116|      0|            count,
  117|      0|            entities_affected: 0,
  118|      0|            namespace: namespace.clone(),
  119|      0|            elapsed_ms: inicio.elapsed().as_millis() as u64,
  120|      0|            affected_entity_names: None,
  121|      0|        };
  122|       |
  123|      0|        match args.format {
  124|      0|            OutputFormat::Json => output::emit_json(&response)?,
  125|      0|            OutputFormat::Text | OutputFormat::Markdown => {
  126|      0|                output::emit_text(&format!(
  127|      0|                    "aborted: {} '{}' relations would be removed; pass --yes to confirm [{}]",
  128|      0|                    response.count, response.relation, response.namespace
  129|      0|                ));
  130|      0|            }
  131|       |        }
  132|       |
  133|      0|        return Ok(());
  134|      0|    }
  135|       |
  136|       |    // Destructive path: delete relationships.
  137|      0|    let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
  138|      0|    let (count, entity_ids) =
  139|      0|        entities::delete_relationships_by_relation(&tx, &namespace, &args.relation)?;
  140|      0|    tx.commit()?;
  141|       |
  142|       |    // Run ANALYZE to refresh query planner statistics after bulk deletion.
  143|      0|    conn.execute_batch("ANALYZE relationships; ANALYZE memory_relationships;")?;
  144|      0|    conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
  145|       |
  146|      0|    output::emit_progress(&i18n::relations_pruned(count, &args.relation, &namespace));
  147|       |
  148|      0|    let response = PruneRelationsResponse {
  149|      0|        action: "pruned".to_string(),
  150|      0|        relation: args.relation.clone(),
  151|      0|        count,
  152|      0|        entities_affected: entity_ids.len(),
  153|      0|        namespace: namespace.clone(),
  154|      0|        elapsed_ms: inicio.elapsed().as_millis() as u64,
  155|      0|        affected_entity_names: None,
  156|      0|    };
  157|       |
  158|      0|    match args.format {
  159|      0|        OutputFormat::Json => output::emit_json(&response)?,
  160|      0|        OutputFormat::Text | OutputFormat::Markdown => {
  161|      0|            output::emit_text(&format!(
  162|      0|                "pruned: {} '{}' relations removed, {} entities affected [{}]",
  163|      0|                response.count, response.relation, response.entities_affected, response.namespace
  164|      0|            ));
  165|      0|        }
  166|       |    }
  167|       |
  168|      0|    Ok(())
  169|      0|}
  170|       |
  171|       |#[cfg(test)]
  172|       |mod tests {
  173|       |    use super::*;
  174|       |
  175|       |    #[test]
  176|      1|    fn prune_response_serializes_all_fields() {
  177|      1|        let resp = PruneRelationsResponse {
  178|      1|            action: "pruned".to_string(),
  179|      1|            relation: "mentions".to_string(),
  180|      1|            count: 3451,
  181|      1|            entities_affected: 200,
  182|      1|            namespace: "global".to_string(),
  183|      1|            elapsed_ms: 42,
  184|      1|            affected_entity_names: None,
  185|      1|        };
  186|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  187|      1|        assert_eq!(json["action"], "pruned");
  188|      1|        assert_eq!(json["relation"], "mentions");
  189|      1|        assert_eq!(json["count"], 3451);
  190|      1|        assert_eq!(json["entities_affected"], 200);
  191|      1|        assert_eq!(json["namespace"], "global");
  192|      1|        assert!(json["elapsed_ms"].is_number());
  193|      1|    }
  194|       |
  195|       |    #[test]
  196|      1|    fn prune_response_action_dry_run() {
  197|      1|        let resp = PruneRelationsResponse {
  198|      1|            action: "dry_run".to_string(),
  199|      1|            relation: "mentions".to_string(),
  200|      1|            count: 100,
  201|      1|            entities_affected: 0,
  202|      1|            namespace: "test".to_string(),
  203|      1|            elapsed_ms: 5,
  204|      1|            affected_entity_names: None,
  205|      1|        };
  206|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  207|      1|        assert_eq!(json["action"], "dry_run");
  208|      1|        assert_eq!(
  209|      1|            json["entities_affected"], 0,
  210|      0|            "dry_run must report zero entities_affected"
  211|       |        );
  212|      1|    }
  213|       |
  214|       |    #[test]
  215|      1|    fn prune_response_action_pruned() {
  216|      1|        let resp = PruneRelationsResponse {
  217|      1|            action: "pruned".to_string(),
  218|      1|            relation: "uses".to_string(),
  219|      1|            count: 50,
  220|      1|            entities_affected: 10,
  221|      1|            namespace: "my-project".to_string(),
  222|      1|            elapsed_ms: 120,
  223|      1|            affected_entity_names: None,
  224|      1|        };
  225|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  226|      1|        assert_eq!(json["action"], "pruned");
  227|      1|        assert!(json["count"].as_u64().unwrap() > 0);
  228|      1|        assert!(json["entities_affected"].as_u64().unwrap() > 0);
  229|      1|    }
  230|       |
  231|       |    #[test]
  232|      1|    fn prune_response_zero_count_when_nothing_to_prune() {
  233|      1|        let resp = PruneRelationsResponse {
  234|      1|            action: "pruned".to_string(),
  235|      1|            relation: "nonexistent".to_string(),
  236|      1|            count: 0,
  237|      1|            entities_affected: 0,
  238|      1|            namespace: "global".to_string(),
  239|      1|            elapsed_ms: 1,
  240|      1|            affected_entity_names: None,
  241|      1|        };
  242|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  243|      1|        assert_eq!(json["count"], 0);
  244|      1|        assert_eq!(json["entities_affected"], 0);
  245|      1|    }
  246|       |
  247|       |    #[test]
  248|      1|    fn prune_response_verbose_includes_entity_names() {
  249|      1|        let resp = PruneRelationsResponse {
  250|      1|            action: "dry_run".to_string(),
  251|      1|            relation: "mentions".to_string(),
  252|      1|            count: 10,
  253|      1|            entities_affected: 3,
  254|      1|            namespace: "global".to_string(),
  255|      1|            elapsed_ms: 5,
  256|      1|            affected_entity_names: Some(vec!["alpha".into(), "beta".into(), "gamma".into()]),
  257|      1|        };
  258|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  259|      1|        let names = json["affected_entity_names"]
  260|      1|            .as_array()
  261|      1|            .expect("must be array");
  262|      1|        assert_eq!(names.len(), 3);
  263|      1|    }
  264|       |
  265|       |    #[test]
  266|      1|    fn prune_response_no_verbose_omits_entity_names() {
  267|      1|        let resp = PruneRelationsResponse {
  268|      1|            action: "dry_run".to_string(),
  269|      1|            relation: "mentions".to_string(),
  270|      1|            count: 10,
  271|      1|            entities_affected: 0,
  272|      1|            namespace: "global".to_string(),
  273|      1|            elapsed_ms: 5,
  274|      1|            affected_entity_names: None,
  275|      1|        };
  276|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  277|      1|        assert!(
  278|      1|            json.get("affected_entity_names").is_none(),
  279|      0|            "must be omitted when None"
  280|       |        );
  281|      1|    }
  282|       |
  283|       |    #[test]
  284|      1|    fn prune_response_action_values_are_exhaustive() {
  285|      4|        for action in &["pruned", "dry_run", "aborted"] {
                          ^3
  286|      3|            let resp = PruneRelationsResponse {
  287|      3|                action: action.to_string(),
  288|      3|                relation: "mentions".to_string(),
  289|      3|                count: 0,
  290|      3|                entities_affected: 0,
  291|      3|                namespace: "global".to_string(),
  292|      3|                elapsed_ms: 0,
  293|      3|                affected_entity_names: None,
  294|      3|            };
  295|      3|            let json = serde_json::to_value(&resp).expect("serialization");
  296|      3|            assert_eq!(json["action"], *action);
  297|       |        }
  298|      1|    }
  299|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/purge.rs:
    1|       |//! Handler for the `purge` CLI subcommand.
    2|       |
    3|       |use crate::errors::AppError;
    4|       |use crate::i18n::errors_msg;
    5|       |use crate::output;
    6|       |use crate::paths::AppPaths;
    7|       |use crate::storage::connection::open_rw;
    8|       |use serde::Serialize;
    9|       |
   10|       |#[derive(clap::Args)]
   11|       |#[command(after_long_help = "EXAMPLES:\n  \
   12|       |    # Permanently delete soft-deleted memories older than 90 days (default retention)\n  \
   13|       |    sqlite-graphrag purge\n\n  \
   14|       |    # Custom retention window in days\n  \
   15|       |    sqlite-graphrag purge --retention-days 30\n\n  \
   16|       |    # Purge ALL soft-deleted memories regardless of age\n  \
   17|       |    sqlite-graphrag purge --retention-days 0\n\n  \
   18|       |    # Preview what would be purged without deleting\n  \
   19|       |    sqlite-graphrag purge --dry-run\n\n  \
   20|       |    # Purge a specific memory by name\n  \
   21|       |    sqlite-graphrag purge --name old-memory --namespace my-project\n\n\
   22|       |NOTES:\n  \
   23|       |    `--yes` only confirms intent and does NOT override `--retention-days`.\n  \
   24|       |    To wipe every soft-deleted memory immediately, pair `--yes` with `--retention-days 0`.")]
   25|       |pub struct PurgeArgs {
   26|       |    #[arg(long)]
   27|       |    pub name: Option<String>,
   28|       |    /// Namespace to purge. Defaults to the contextual namespace (SQLITE_GRAPHRAG_NAMESPACE env var or "global").
   29|       |    #[arg(long)]
   30|       |    pub namespace: Option<String>,
   31|       |    /// Retention days: memories with deleted_at older than (now - retention_days*86400) will be
   32|       |    /// permanently removed. Default: PURGE_RETENTION_DAYS_DEFAULT (90). Use 0 to purge all
   33|       |    /// soft-deleted memories regardless of age. Alias: `--max-age-days`.
   34|       |    #[arg(
   35|       |        long,
   36|       |        alias = "days",
   37|       |        alias = "max-age-days",
   38|       |        value_name = "DAYS",
   39|       |        default_value_t = crate::constants::PURGE_RETENTION_DAYS_DEFAULT
   40|       |    )]
   41|       |    pub retention_days: u32,
   42|       |    /// [DEPRECATED in v2.0.0] Legacy alias — use --retention-days instead.
   43|       |    #[arg(long, hide = true)]
   44|       |    pub older_than_seconds: Option<u64>,
   45|       |    /// Does not execute DELETE: computes and reports what WOULD be purged.
   46|       |    #[arg(long, default_value_t = false)]
   47|       |    pub dry_run: bool,
   48|       |    /// Confirms destructive intent for tools that require explicit acknowledgement.
   49|       |    /// Does NOT override `--retention-days`: combine with `--retention-days 0` to wipe
   50|       |    /// every soft-deleted memory regardless of age.
   51|       |    #[arg(long, default_value_t = false)]
   52|       |    pub yes: bool,
   53|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   54|       |    pub json: bool,
   55|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   56|       |    pub db: Option<String>,
   57|       |}
   58|       |
   59|       |#[derive(Serialize)]
   60|       |pub struct PurgeResponse {
   61|       |    pub action: String,
   62|       |    pub purged_count: usize,
   63|       |    pub bytes_freed: i64,
   64|       |    pub oldest_deleted_at: Option<i64>,
   65|       |    pub retention_days_used: u32,
   66|       |    pub dry_run: bool,
   67|       |    pub namespace: Option<String>,
   68|       |    pub cutoff_epoch: i64,
   69|       |    pub warnings: Vec<String>,
   70|       |    /// Total execution time in milliseconds from handler start to serialisation.
   71|       |    pub elapsed_ms: u64,
   72|       |    /// Human-readable explanation surfaced when nothing was purged so callers
   73|       |    /// understand the retention semantics. Present only when
   74|       |    /// `purged_count == 0` (M2 in v1.0.32) — kept absent otherwise to preserve
   75|       |    /// the existing JSON contract.
   76|       |    #[serde(skip_serializing_if = "Option::is_none")]
   77|       |    pub message: Option<String>,
   78|       |}
   79|       |
   80|       |/// Permanently delete soft-deleted memories that have exceeded the retention window.
   81|       |///
   82|       |/// Only memories with `deleted_at IS NOT NULL AND deleted_at <= cutoff_epoch` are affected.
   83|       |/// When `--dry-run` is set the DELETE is skipped and the response reflects candidates only.
   84|      0|pub fn run(args: PurgeArgs) -> Result<(), AppError> {
   85|      0|    let inicio = std::time::Instant::now();
   86|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
   87|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
   88|       |
   89|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
   90|       |
   91|      0|    let mut warnings: Vec<String> = Vec::with_capacity(1);
   92|      0|    let now = current_epoch()?;
   93|       |
   94|      0|    let cutoff_epoch = if let Some(secs) = args.older_than_seconds {
   95|      0|        warnings.push(
   96|      0|            "--older-than-seconds is deprecated; use --retention-days in v2.0.0+".to_string(),
   97|       |        );
   98|      0|        now - secs as i64
   99|       |    } else {
  100|      0|        now - (args.retention_days as i64) * 86_400
  101|       |    };
  102|       |
  103|      0|    let namespace_opt: Option<&str> = Some(namespace.as_str());
  104|       |
  105|      0|    let mut conn = open_rw(&paths.db)?;
  106|       |
  107|      0|    let (bytes_freed, oldest_deleted_at, candidates_count) =
  108|      0|        compute_metrics(&conn, cutoff_epoch, namespace_opt, args.name.as_deref())?;
  109|       |
  110|      0|    if candidates_count == 0 && args.name.is_some() {
  111|      0|        return Err(AppError::NotFound(
  112|      0|            errors_msg::soft_deleted_memory_not_found(
  113|      0|                args.name.as_deref().unwrap_or_default(),
  114|      0|                &namespace,
  115|      0|            ),
  116|      0|        ));
  117|      0|    }
  118|       |
  119|      0|    if !args.dry_run {
  120|      0|        let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
  121|      0|        execute_purge(
  122|      0|            &tx,
  123|      0|            &namespace,
  124|      0|            args.name.as_deref(),
  125|      0|            cutoff_epoch,
  126|      0|            &mut warnings,
  127|      0|        )?;
  128|      0|        tx.commit()?;
  129|      0|        conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
  130|      0|    }
  131|       |
  132|      0|    let message = if candidates_count == 0 {
  133|      0|        Some(format!(
  134|      0|            "no soft-deleted memories older than {retention_days} day(s); use --retention-days 0 to purge all soft-deleted memories regardless of age",
  135|      0|            retention_days = args.retention_days
  136|      0|        ))
  137|       |    } else {
  138|      0|        None
  139|       |    };
  140|       |
  141|      0|    output::emit_json(&PurgeResponse {
  142|      0|        action: if args.dry_run {
  143|      0|            "dry_run".to_string()
  144|       |        } else {
  145|      0|            "purged".to_string()
  146|       |        },
  147|      0|        purged_count: candidates_count,
  148|      0|        bytes_freed,
  149|      0|        oldest_deleted_at,
  150|      0|        retention_days_used: args.retention_days,
  151|      0|        dry_run: args.dry_run,
  152|      0|        namespace: Some(namespace),
  153|      0|        cutoff_epoch,
  154|      0|        warnings,
  155|      0|        elapsed_ms: inicio.elapsed().as_millis() as u64,
  156|      0|        message,
  157|      0|    })?;
  158|       |
  159|      0|    Ok(())
  160|      0|}
  161|       |
  162|      4|fn current_epoch() -> Result<i64, AppError> {
  163|      4|    let now = std::time::SystemTime::now()
  164|      4|        .duration_since(std::time::UNIX_EPOCH)
  165|      4|        .map_err(|err| AppError::Internal(anyhow::anyhow!("system clock error: {err}")))?;
                                                        ^0              ^0                            ^0
  166|      4|    Ok(now.as_secs() as i64)
  167|      4|}
  168|       |
  169|      5|fn compute_metrics(
  170|      5|    conn: &rusqlite::Connection,
  171|      5|    cutoff_epoch: i64,
  172|      5|    namespace_opt: Option<&str>,
  173|      5|    name: Option<&str>,
  174|      5|) -> Result<(i64, Option<i64>, usize), AppError> {
  175|      5|    let (bytes_freed, oldest_deleted_at): (i64, Option<i64>) = if let Some(name) = name {
                                                                                         ^0
  176|      0|        conn.query_row(
  177|      0|            "SELECT COALESCE(SUM(LENGTH(COALESCE(body,'')) + LENGTH(COALESCE(description,'')) + LENGTH(name)), 0),
  178|      0|                    MIN(deleted_at)
  179|      0|             FROM memories
  180|      0|             WHERE deleted_at IS NOT NULL AND deleted_at <= ?1
  181|      0|                   AND (?2 IS NULL OR namespace = ?2)
  182|      0|                   AND name = ?3",
  183|      0|            rusqlite::params![cutoff_epoch, namespace_opt, name],
  184|      0|            |r| Ok((r.get::<_, i64>(0)?, r.get::<_, Option<i64>>(1)?)),
  185|      0|        )?
  186|       |    } else {
  187|      5|        conn.query_row(
  188|      5|            "SELECT COALESCE(SUM(LENGTH(COALESCE(body,'')) + LENGTH(COALESCE(description,'')) + LENGTH(name)), 0),
  189|      5|                    MIN(deleted_at)
  190|      5|             FROM memories
  191|      5|             WHERE deleted_at IS NOT NULL AND deleted_at <= ?1
  192|      5|                   AND (?2 IS NULL OR namespace = ?2)",
  193|      5|            rusqlite::params![cutoff_epoch, namespace_opt],
  194|      5|            |r| Ok((r.get::<_, i64>(0)?, r.get::<_, Option<i64>>(1)?)),
                                                    ^0                           ^0
  195|      0|        )?
  196|       |    };
  197|       |
  198|      5|    let count: usize = if let Some(name) = name {
                                                 ^0
  199|      0|        conn.query_row(
  200|      0|            "SELECT COUNT(*) FROM memories
  201|      0|             WHERE deleted_at IS NOT NULL AND deleted_at <= ?1
  202|      0|                   AND (?2 IS NULL OR namespace = ?2)
  203|      0|                   AND name = ?3",
  204|      0|            rusqlite::params![cutoff_epoch, namespace_opt, name],
  205|      0|            |r| r.get::<_, usize>(0),
  206|      0|        )?
  207|       |    } else {
  208|      5|        conn.query_row(
  209|      5|            "SELECT COUNT(*) FROM memories
  210|      5|             WHERE deleted_at IS NOT NULL AND deleted_at <= ?1
  211|      5|                   AND (?2 IS NULL OR namespace = ?2)",
  212|      5|            rusqlite::params![cutoff_epoch, namespace_opt],
  213|      5|            |r| r.get::<_, usize>(0),
  214|      0|        )?
  215|       |    };
  216|       |
  217|      5|    Ok((bytes_freed, oldest_deleted_at, count))
  218|      5|}
  219|       |
  220|      0|fn execute_purge(
  221|      0|    tx: &rusqlite::Transaction,
  222|      0|    namespace: &str,
  223|      0|    name: Option<&str>,
  224|      0|    cutoff_epoch: i64,
  225|      0|    warnings: &mut Vec<String>,
  226|      0|) -> Result<(), AppError> {
  227|      0|    let candidates = select_candidates(tx, namespace, name, cutoff_epoch)?;
  228|       |
  229|      0|    for (memory_id, _name) in &candidates {
  230|      0|        if let Err(err) = tx.execute(
  231|      0|            "DELETE FROM vec_chunks WHERE memory_id = ?1",
  232|      0|            rusqlite::params![memory_id],
  233|      0|        ) {
  234|      0|            warnings.push(format!(
  235|      0|                "failed to clean vec_chunks for memory_id {memory_id}: {err}"
  236|      0|            ));
  237|      0|        }
  238|      0|        if let Err(err) = tx.execute(
  239|      0|            "DELETE FROM vec_memories WHERE memory_id = ?1",
  240|      0|            rusqlite::params![memory_id],
  241|      0|        ) {
  242|      0|            warnings.push(format!(
  243|      0|                "failed to clean vec_memories for memory_id {memory_id}: {err}"
  244|      0|            ));
  245|      0|        }
  246|      0|        tx.execute(
  247|      0|            "DELETE FROM memories WHERE id = ?1 AND namespace = ?2 AND deleted_at IS NOT NULL",
  248|      0|            rusqlite::params![memory_id, namespace],
  249|      0|        )?;
  250|       |    }
  251|       |
  252|      0|    Ok(())
  253|      0|}
  254|       |
  255|      0|fn select_candidates(
  256|      0|    conn: &rusqlite::Connection,
  257|      0|    namespace: &str,
  258|      0|    name: Option<&str>,
  259|      0|    cutoff_epoch: i64,
  260|      0|) -> Result<Vec<(i64, String)>, AppError> {
  261|      0|    let query = if name.is_some() {
  262|      0|        "SELECT id, name FROM memories
  263|      0|         WHERE namespace = ?1 AND name = ?2 AND deleted_at IS NOT NULL AND deleted_at <= ?3
  264|      0|         ORDER BY deleted_at ASC"
  265|       |    } else {
  266|      0|        "SELECT id, name FROM memories
  267|      0|         WHERE namespace = ?1 AND deleted_at IS NOT NULL AND deleted_at <= ?2
  268|      0|         ORDER BY deleted_at ASC"
  269|       |    };
  270|       |
  271|      0|    let mut stmt = conn.prepare_cached(query)?;
  272|      0|    let rows = if let Some(name) = name {
  273|      0|        stmt.query_map(rusqlite::params![namespace, name, cutoff_epoch], |row| {
  274|      0|            Ok((row.get::<_, i64>(0)?, row.get::<_, String>(1)?))
  275|      0|        })?
  276|      0|        .collect::<Result<Vec<_>, _>>()?
  277|       |    } else {
  278|      0|        stmt.query_map(rusqlite::params![namespace, cutoff_epoch], |row| {
  279|      0|            Ok((row.get::<_, i64>(0)?, row.get::<_, String>(1)?))
  280|      0|        })?
  281|      0|        .collect::<Result<Vec<_>, _>>()?
  282|       |    };
  283|      0|    Ok(rows)
  284|      0|}
  285|       |
  286|       |#[cfg(test)]
  287|       |mod tests {
  288|       |    use super::*;
  289|       |    use rusqlite::Connection;
  290|       |
  291|      4|    fn setup_test_db() -> Connection {
  292|      4|        let conn = Connection::open_in_memory().expect("failed to open in-memory db");
  293|      4|        conn.execute_batch(
  294|      4|            "CREATE TABLE memories (
  295|      4|                id INTEGER PRIMARY KEY AUTOINCREMENT,
  296|      4|                name TEXT NOT NULL,
  297|      4|                namespace TEXT NOT NULL DEFAULT 'global',
  298|      4|                description TEXT,
  299|      4|                body TEXT,
  300|      4|                deleted_at INTEGER
  301|      4|            );
  302|      4|            CREATE TABLE IF NOT EXISTS vec_chunks (memory_id INTEGER);
  303|      4|            CREATE TABLE IF NOT EXISTS vec_memories (memory_id INTEGER);",
  304|       |        )
  305|      4|        .expect("failed to create test tables");
  306|      4|        conn
  307|      4|    }
  308|       |
  309|      4|    fn insert_deleted_memory(
  310|      4|        conn: &Connection,
  311|      4|        name: &str,
  312|      4|        namespace: &str,
  313|      4|        body: &str,
  314|      4|        deleted_at: i64,
  315|      4|    ) -> i64 {
  316|      4|        conn.execute(
  317|      4|            "INSERT INTO memories (name, namespace, body, deleted_at) VALUES (?1, ?2, ?3, ?4)",
  318|      4|            rusqlite::params![name, namespace, body, deleted_at],
  319|       |        )
  320|      4|        .expect("failed to insert test memory");
  321|      4|        conn.last_insert_rowid()
  322|      4|    }
  323|       |
  324|       |    #[test]
  325|      1|    fn retention_days_used_default_is_90() {
  326|      1|        assert_eq!(crate::constants::PURGE_RETENTION_DAYS_DEFAULT, 90u32);
  327|      1|    }
  328|       |
  329|       |    #[test]
  330|      1|    fn compute_metrics_bytes_freed_positive_for_populated_body() {
  331|      1|        let conn = setup_test_db();
  332|      1|        let now = current_epoch().expect("epoch failed");
  333|      1|        let old_epoch = now - 100 * 86_400;
  334|      1|        insert_deleted_memory(&conn, "mem-test", "global", "memory body", old_epoch);
  335|       |
  336|      1|        let cutoff = now - 30 * 86_400;
  337|      1|        let (bytes, oldest, count) =
  338|      1|            compute_metrics(&conn, cutoff, Some("global"), None).expect("compute_metrics failed");
  339|       |
  340|      1|        assert!(bytes > 0, "bytes_freed must be > 0 for populated body");
                                         ^0
  341|      1|        assert!(oldest.is_some(), "oldest_deleted_at must be Some");
                                                ^0
  342|      1|        assert_eq!(count, 1);
  343|      1|    }
  344|       |
  345|       |    #[test]
  346|      1|    fn compute_metrics_returns_zero_without_candidates() {
  347|      1|        let conn = setup_test_db();
  348|      1|        let now = current_epoch().expect("epoch failed");
  349|      1|        let cutoff = now - 90 * 86_400;
  350|       |
  351|      1|        let (bytes, oldest, count) =
  352|      1|            compute_metrics(&conn, cutoff, Some("global"), None).expect("compute_metrics failed");
  353|       |
  354|      1|        assert_eq!(bytes, 0);
  355|      1|        assert!(oldest.is_none());
  356|      1|        assert_eq!(count, 0);
  357|      1|    }
  358|       |
  359|       |    #[test]
  360|      1|    fn dry_run_does_not_delete_records() {
  361|      1|        let conn = setup_test_db();
  362|      1|        let now = current_epoch().expect("epoch failed");
  363|      1|        let old_epoch = now - 200 * 86_400;
  364|      1|        insert_deleted_memory(&conn, "mem-dry", "global", "dry run content", old_epoch);
  365|       |
  366|      1|        let cutoff = now - 30 * 86_400;
  367|      1|        let (_, _, count_before) =
  368|      1|            compute_metrics(&conn, cutoff, Some("global"), None).expect("compute_metrics failed");
  369|      1|        assert_eq!(count_before, 1, "must have 1 candidate before dry run");
                                                  ^0
  370|       |
  371|      1|        let (_, _, count_after) =
  372|      1|            compute_metrics(&conn, cutoff, Some("global"), None).expect("compute_metrics failed");
  373|      1|        assert_eq!(
  374|       |            count_after, 1,
  375|      0|            "dry_run must not remove records: count must remain 1"
  376|       |        );
  377|      1|    }
  378|       |
  379|       |    #[test]
  380|      1|    fn oldest_deleted_at_returns_smallest_epoch() {
  381|      1|        let conn = setup_test_db();
  382|      1|        let now = current_epoch().expect("epoch failed");
  383|      1|        let epoch_old = now - 300 * 86_400;
  384|      1|        let epoch_recent = now - 200 * 86_400;
  385|       |
  386|      1|        insert_deleted_memory(&conn, "mem-a", "global", "body-a", epoch_old);
  387|      1|        insert_deleted_memory(&conn, "mem-b", "global", "body-b", epoch_recent);
  388|       |
  389|      1|        let cutoff = now - 30 * 86_400;
  390|      1|        let (_, oldest, count) =
  391|      1|            compute_metrics(&conn, cutoff, Some("global"), None).expect("compute_metrics failed");
  392|       |
  393|      1|        assert_eq!(count, 2);
  394|      1|        assert_eq!(
  395|       |            oldest,
  396|      1|            Some(epoch_old),
  397|      0|            "oldest_deleted_at must be the oldest epoch"
  398|       |        );
  399|      1|    }
  400|       |
  401|       |    #[test]
  402|      1|    fn purge_args_namespace_accepts_none_without_default() {
  403|       |        // P1-C: namespace must be None when not provided, allowing resolve_namespace
  404|       |        // to consult SQLITE_GRAPHRAG_NAMESPACE before falling back to "global".
  405|       |        // The field was `default_value = "global"` before P1-C; with that removed,
  406|       |        // resolve_namespace(None) consults the env var correctly.
  407|      1|        let resolved = crate::namespace::resolve_namespace(None)
  408|      1|            .expect("resolve_namespace(None) must return Ok");
  409|      1|        assert_eq!(
  410|       |            resolved, "global",
  411|      0|            "without env var, resolve_namespace(None) must fall back to 'global'"
  412|       |        );
  413|      1|    }
  414|       |
  415|       |    #[test]
  416|      1|    fn purge_response_serializes_all_new_fields() {
  417|      1|        let resp = PurgeResponse {
  418|      1|            action: "purged".to_string(),
  419|      1|            purged_count: 3,
  420|      1|            bytes_freed: 1024,
  421|      1|            oldest_deleted_at: Some(1_700_000_000),
  422|      1|            retention_days_used: 90,
  423|      1|            dry_run: false,
  424|      1|            namespace: Some("global".to_string()),
  425|      1|            cutoff_epoch: 1_710_000_000,
  426|      1|            warnings: vec![],
  427|      1|            elapsed_ms: 42,
  428|      1|            message: None,
  429|      1|        };
  430|      1|        let json = serde_json::to_string(&resp).expect("serialization failed");
  431|      1|        assert!(json.contains("bytes_freed"));
  432|      1|        assert!(json.contains("oldest_deleted_at"));
  433|      1|        assert!(json.contains("retention_days_used"));
  434|      1|        assert!(json.contains("dry_run"));
  435|      1|        assert!(json.contains("elapsed_ms"));
  436|       |        // M2: when no purge happened, `message` is omitted to keep payloads stable.
  437|      1|        assert!(!json.contains("\"message\""));
  438|      1|    }
  439|       |
  440|       |    #[test]
  441|      1|    fn purge_response_serializes_message_when_present() {
  442|       |        // M2 (v1.0.32): zero purges include a human-readable hint message.
  443|      1|        let resp = PurgeResponse {
  444|      1|            action: "purged".to_string(),
  445|      1|            purged_count: 0,
  446|      1|            bytes_freed: 0,
  447|      1|            oldest_deleted_at: None,
  448|      1|            retention_days_used: 90,
  449|      1|            dry_run: false,
  450|      1|            namespace: Some("global".to_string()),
  451|      1|            cutoff_epoch: 1_710_000_000,
  452|      1|            warnings: vec![],
  453|      1|            elapsed_ms: 5,
  454|      1|            message: Some(
  455|      1|                "no soft-deleted memories older than 90 day(s); use --retention-days 0 to purge all soft-deleted memories regardless of age"
  456|      1|                    .to_string(),
  457|      1|            ),
  458|      1|        };
  459|      1|        let json = serde_json::to_string(&resp).expect("serialization failed");
  460|      1|        assert!(json.contains("\"message\""));
  461|      1|        assert!(json.contains("--retention-days 0"));
  462|      1|    }
  463|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/read.rs:
    1|       |//! Handler for the `read` CLI subcommand.
    2|       |
    3|       |use crate::errors::AppError;
    4|       |use crate::output;
    5|       |use crate::paths::AppPaths;
    6|       |use crate::storage::connection::open_ro;
    7|       |use crate::storage::memories;
    8|       |use serde::Serialize;
    9|       |
   10|       |#[derive(clap::Args)]
   11|       |#[command(after_long_help = "EXAMPLES:\n  \
   12|       |    # Read a memory by name (positional)\n  \
   13|       |    sqlite-graphrag read onboarding\n\n  \
   14|       |    # Read using the named flag form\n  \
   15|       |    sqlite-graphrag read --name onboarding\n\n  \
   16|       |    # Read by memory ID (integer emitted in JSON output of most commands)\n  \
   17|       |    sqlite-graphrag read --id 42 --json\n\n  \
   18|       |    # Read from a specific namespace\n  \
   19|       |    sqlite-graphrag read onboarding --namespace my-project")]
   20|       |pub struct ReadArgs {
   21|       |    /// Memory name as a positional argument. Alternative to `--name`.
   22|       |    #[arg(
   23|       |        value_name = "NAME",
   24|       |        conflicts_with = "name",
   25|       |        help = "Memory name (kebab-case slug); alternative to --name"
   26|       |    )]
   27|       |    pub name_positional: Option<String>,
   28|       |    /// Memory name to read. Returns NotFound (exit 4) if missing or soft-deleted.
   29|       |    #[arg(long)]
   30|       |    pub name: Option<String>,
   31|       |    /// Memory ID (integer) for direct lookup. Conflicts with --name and positional NAME.
   32|       |    #[arg(
   33|       |        long,
   34|       |        conflicts_with_all = ["name", "name_positional"],
   35|       |        help = "Memory ID (integer) for direct lookup"
   36|       |    )]
   37|       |    pub id: Option<i64>,
   38|       |    #[arg(
   39|       |        long,
   40|       |        help = "Namespace (env: SQLITE_GRAPHRAG_NAMESPACE, default: global)"
   41|       |    )]
   42|       |    pub namespace: Option<String>,
   43|       |    /// Include linked entities and relationships in the response.
   44|       |    #[arg(
   45|       |        long,
   46|       |        help = "Include graph context (entities + relationships) in response"
   47|       |    )]
   48|       |    pub with_graph: bool,
   49|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   50|       |    pub json: bool,
   51|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   52|       |    pub db: Option<String>,
   53|       |}
   54|       |
   55|       |#[derive(Serialize)]
   56|       |struct ReadResponse {
   57|       |    /// Canonical storage field. Preserved for compatibility with v2.0.0 clients.
   58|       |    id: i64,
   59|       |    /// Semantic alias of `id` for the contract documented in SKILL.md.
   60|       |    memory_id: i64,
   61|       |    namespace: String,
   62|       |    name: String,
   63|       |    /// Semantic alias of `memory_type` for the documented contract.
   64|       |    #[serde(rename = "type")]
   65|       |    type_alias: String,
   66|       |    memory_type: String,
   67|       |    description: String,
   68|       |    body: String,
   69|       |    body_hash: String,
   70|       |    session_id: Option<String>,
   71|       |    source: String,
   72|       |    metadata: serde_json::Value,
   73|       |    /// Most recent memory version, useful for optimistic control via `--expected-updated-at`.
   74|       |    version: i64,
   75|       |    created_at: i64,
   76|       |    /// RFC 3339 UTC timestamp parallel to `created_at` for ISO 8601 parsers.
   77|       |    created_at_iso: String,
   78|       |    updated_at: i64,
   79|       |    /// RFC 3339 UTC timestamp parallel to `updated_at` for ISO 8601 parsers.
   80|       |    updated_at_iso: String,
   81|       |    /// Linked entities (opt-in via --with-graph).
   82|       |    #[serde(skip_serializing_if = "Option::is_none")]
   83|       |    entities: Option<Vec<ReadEntityBinding>>,
   84|       |    /// Relationships from linked entities (opt-in via --with-graph).
   85|       |    #[serde(skip_serializing_if = "Option::is_none")]
   86|       |    relationships: Option<Vec<ReadRelationshipBinding>>,
   87|       |    /// Total execution time in milliseconds from handler start to serialisation.
   88|       |    elapsed_ms: u64,
   89|       |}
   90|       |
   91|       |#[derive(Serialize)]
   92|       |struct ReadEntityBinding {
   93|       |    entity_id: i64,
   94|       |    name: String,
   95|       |    entity_type: String,
   96|       |}
   97|       |
   98|       |#[derive(Serialize)]
   99|       |struct ReadRelationshipBinding {
  100|       |    from: String,
  101|       |    to: String,
  102|       |    relation: String,
  103|       |    weight: f64,
  104|       |}
  105|       |
  106|      3|fn epoch_to_iso(epoch: i64) -> String {
  107|      3|    crate::tz::epoch_to_iso(epoch)
  108|      3|}
  109|       |
  110|      0|pub fn run(args: ReadArgs) -> Result<(), AppError> {
  111|      0|    let start = std::time::Instant::now();
  112|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
  113|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
  114|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
  115|      0|    let conn = open_ro(&paths.db)?;
  116|       |
  117|      0|    let row_opt = if let Some(id) = args.id {
  118|      0|        let r = memories::read_full(&conn, id)?;
  119|      0|        if let Some(ref row) = r {
  120|      0|            if row.namespace != namespace {
  121|      0|                return Err(AppError::NotFound(format!(
  122|      0|                    "memory id {id} exists but belongs to namespace '{}', not '{namespace}'",
  123|      0|                    row.namespace
  124|      0|                )));
  125|      0|            }
  126|      0|        }
  127|      0|        r
  128|       |    } else {
  129|      0|        let name = args.name_positional.or(args.name).ok_or_else(|| {
  130|      0|            AppError::Validation(
  131|      0|                "name or --id required: pass name as positional argument, via --name, or use --id"
  132|      0|                    .to_string(),
  133|      0|            )
  134|      0|        })?;
  135|      0|        memories::read_by_name(&conn, &namespace, &name)?
  136|       |    };
  137|       |
  138|      0|    match row_opt {
  139|      0|        Some(row) => {
  140|       |            // Resolve current version via memory_versions table (highest version for this memory_id).
  141|      0|            let version: i64 = conn
  142|      0|                .query_row(
  143|      0|                    "SELECT COALESCE(MAX(version), 1) FROM memory_versions WHERE memory_id=?1",
  144|      0|                    rusqlite::params![row.id],
  145|      0|                    |r| r.get(0),
  146|       |                )
  147|      0|                .unwrap_or(1);
  148|       |
  149|       |            // G22: optional graph context
  150|      0|            let (entities, relationships) = if args.with_graph {
  151|      0|                let mut ent_stmt = conn.prepare_cached(
  152|      0|                    "SELECT e.id, e.name, e.type FROM memory_entities me \
  153|      0|                     JOIN entities e ON e.id = me.entity_id \
  154|      0|                     WHERE me.memory_id = ?1",
  155|      0|                )?;
  156|      0|                let ents: Vec<ReadEntityBinding> = ent_stmt
  157|      0|                    .query_map(rusqlite::params![row.id], |r| {
  158|       |                        Ok(ReadEntityBinding {
  159|      0|                            entity_id: r.get(0)?,
  160|      0|                            name: r.get(1)?,
  161|      0|                            entity_type: r.get(2)?,
  162|       |                        })
  163|      0|                    })?
  164|      0|                    .filter_map(|r| r.ok())
  165|      0|                    .collect();
  166|      0|                drop(ent_stmt);
  167|       |
  168|      0|                let entity_ids: Vec<i64> = ents.iter().map(|e| e.entity_id).collect();
  169|      0|                let rels: Vec<ReadRelationshipBinding> = if !entity_ids.is_empty() {
  170|      0|                    let placeholders: String = entity_ids
  171|      0|                        .iter()
  172|      0|                        .map(|id| id.to_string())
  173|      0|                        .collect::<Vec<_>>()
  174|      0|                        .join(",");
  175|      0|                    let sql = format!(
  176|      0|                        "SELECT e1.name, e2.name, r.relation, r.weight \
  177|      0|                         FROM relationships r \
  178|      0|                         JOIN entities e1 ON e1.id = r.source_id \
  179|      0|                         JOIN entities e2 ON e2.id = r.target_id \
  180|      0|                         WHERE r.source_id IN ({placeholders}) OR r.target_id IN ({placeholders})"
  181|       |                    );
  182|      0|                    let mut rel_stmt = conn.prepare(&sql)?;
  183|      0|                    let result: Vec<ReadRelationshipBinding> = rel_stmt
  184|      0|                        .query_map([], |r| {
  185|       |                            Ok(ReadRelationshipBinding {
  186|      0|                                from: r.get(0)?,
  187|      0|                                to: r.get(1)?,
  188|      0|                                relation: r.get(2)?,
  189|      0|                                weight: r.get(3)?,
  190|       |                            })
  191|      0|                        })?
  192|      0|                        .filter_map(|r| r.ok())
  193|      0|                        .collect();
  194|      0|                    drop(rel_stmt);
  195|      0|                    result
  196|       |                } else {
  197|      0|                    vec![]
  198|       |                };
  199|      0|                (Some(ents), Some(rels))
  200|       |            } else {
  201|      0|                (None, None)
  202|       |            };
  203|       |
  204|      0|            let response = ReadResponse {
  205|      0|                id: row.id,
  206|      0|                memory_id: row.id,
  207|      0|                namespace: row.namespace,
  208|      0|                name: row.name,
  209|      0|                type_alias: row.memory_type.clone(),
  210|      0|                memory_type: row.memory_type,
  211|      0|                description: row.description,
  212|      0|                body: row.body,
  213|      0|                body_hash: row.body_hash,
  214|      0|                session_id: row.session_id,
  215|      0|                source: row.source,
  216|      0|                metadata: serde_json::from_str::<serde_json::Value>(&row.metadata)
  217|      0|                    .unwrap_or(serde_json::Value::Null),
  218|      0|                version,
  219|      0|                created_at: row.created_at,
  220|      0|                created_at_iso: epoch_to_iso(row.created_at),
  221|      0|                updated_at: row.updated_at,
  222|      0|                updated_at_iso: epoch_to_iso(row.updated_at),
  223|      0|                entities,
  224|      0|                relationships,
  225|      0|                elapsed_ms: start.elapsed().as_millis() as u64,
  226|      0|            };
  227|      0|            output::emit_json(&response)?;
  228|       |        }
  229|       |        None => {
  230|      0|            let label = if let Some(id) = args.id {
  231|      0|                format!("id={id}")
  232|       |            } else {
  233|      0|                "unknown".to_string()
  234|       |            };
  235|      0|            return Err(AppError::NotFound(format!(
  236|      0|                "memory not found: {label} in namespace '{namespace}'"
  237|      0|            )));
  238|       |        }
  239|       |    }
  240|       |
  241|      0|    Ok(())
  242|      0|}
  243|       |
  244|       |#[cfg(test)]
  245|       |mod tests {
  246|       |    use super::*;
  247|       |
  248|       |    #[test]
  249|      1|    fn epoch_to_iso_converts_zero_to_unix_epoch() {
  250|       |        // v1.0.68 (test fix): parse the ISO back into a DateTime<FixedOffset>
  251|       |        // and compare with chrono::DateTime::UNIX_EPOCH so the assertion is
  252|       |        // timezone-agnostic.  The previous `starts_with("1970-01-01T00:00:00")`
  253|       |        // assertion leaked the global SQLITE_GRAPHRAG_DISPLAY_TZ from sibling
  254|       |        // tests in the same process and failed on hosts where the default
  255|       |        // timezone is non-UTC.
  256|      1|        let result = epoch_to_iso(0);
  257|      1|        let parsed = chrono::DateTime::parse_from_rfc3339(&result)
  258|      1|            .unwrap_or_else(|e| panic!("epoch_to_iso(0) returned non-RFC3339 `{result}`: {e}"));
                                                     ^0
  259|      1|        assert_eq!(
  260|      1|            parsed.timestamp(),
  261|      1|            chrono::DateTime::UNIX_EPOCH.timestamp(),
  262|      0|            "epoch 0 must map to the Unix epoch instant, got: {result}"
  263|       |        );
  264|      1|    }
  265|       |
  266|       |    #[test]
  267|      1|    fn epoch_to_iso_converts_known_timestamp() {
  268|       |        // v1.0.68 (test fix): 1_705_320_000 = 2024-01-15T12:00:00Z, not
  269|       |        // 2024-01-15T00:00:00Z (the previous test asserted the wrong instant).
  270|       |        // The fix uses parse + timestamp compare to be timezone-agnostic and
  271|       |        // to catch wrong-epoch regressions regardless of host TZ.
  272|      1|        let result = epoch_to_iso(1_705_320_000);
  273|      1|        let parsed = chrono::DateTime::parse_from_rfc3339(&result).unwrap_or_else(|e| {
                                                                                                    ^0
  274|      0|            panic!("epoch_to_iso(1705320000) returned non-RFC3339 `{result}`: {e}")
  275|       |        });
  276|      1|        let expected = chrono::DateTime::parse_from_rfc3339("2024-01-15T12:00:00+00:00")
  277|      1|            .expect("static RFC3339 is valid");
  278|      1|        assert_eq!(
  279|      1|            parsed.timestamp(),
  280|      1|            expected.timestamp(),
  281|      0|            "timestamp 1705320000 must map to 2024-01-15T12:00:00Z, got: {result}"
  282|       |        );
  283|      1|    }
  284|       |
  285|       |    #[test]
  286|      1|    fn epoch_to_iso_returns_fallback_for_invalid_negative_epoch() {
  287|      1|        let result = epoch_to_iso(i64::MIN);
  288|      1|        assert!(
  289|      1|            !result.is_empty(),
  290|      0|            "must return a non-empty string even for invalid epoch"
  291|       |        );
  292|      1|    }
  293|       |
  294|       |    #[test]
  295|      1|    fn read_response_serializes_id_and_memory_id_aliases() {
  296|      1|        let resp = ReadResponse {
  297|      1|            id: 42,
  298|      1|            memory_id: 42,
  299|      1|            namespace: "global".to_string(),
  300|      1|            name: "my-mem".to_string(),
  301|      1|            type_alias: "fact".to_string(),
  302|      1|            memory_type: "fact".to_string(),
  303|      1|            description: "desc".to_string(),
  304|      1|            body: "body".to_string(),
  305|      1|            body_hash: "abc123".to_string(),
  306|      1|            session_id: None,
  307|      1|            source: "agent".to_string(),
  308|      1|            metadata: serde_json::json!({}),
  309|      1|            version: 1,
  310|      1|            created_at: 1_705_320_000,
  311|      1|            created_at_iso: "2024-01-15T12:00:00Z".to_string(),
  312|      1|            updated_at: 1_705_320_000,
  313|      1|            updated_at_iso: "2024-01-15T12:00:00Z".to_string(),
  314|      1|            entities: None,
  315|      1|            relationships: None,
  316|      1|            elapsed_ms: 5,
  317|      1|        };
  318|       |
  319|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  320|      1|        assert_eq!(json["id"], 42);
  321|      1|        assert_eq!(json["memory_id"], 42);
  322|      1|        assert_eq!(json["type"], "fact");
  323|      1|        assert_eq!(json["memory_type"], "fact");
  324|      1|        assert_eq!(json["elapsed_ms"], 5u64);
  325|      1|        assert!(
  326|      1|            json["session_id"].is_null(),
  327|      0|            "session_id None must serialize as null"
  328|       |        );
  329|       |        // metadata must serialize as a JSON object, not as an escaped string
  330|      1|        assert!(
  331|      1|            json["metadata"].is_object(),
  332|      0|            "metadata must be a JSON object"
  333|       |        );
  334|      1|    }
  335|       |
  336|       |    #[test]
  337|      1|    fn read_response_session_id_some_serializes_string() {
  338|      1|        let resp = ReadResponse {
  339|      1|            id: 1,
  340|      1|            memory_id: 1,
  341|      1|            namespace: "global".to_string(),
  342|      1|            name: "mem".to_string(),
  343|      1|            type_alias: "skill".to_string(),
  344|      1|            memory_type: "skill".to_string(),
  345|      1|            description: "d".to_string(),
  346|      1|            body: "b".to_string(),
  347|      1|            body_hash: "h".to_string(),
  348|      1|            session_id: Some("sess-123".to_string()),
  349|      1|            source: "agent".to_string(),
  350|      1|            metadata: serde_json::json!({}),
  351|      1|            version: 2,
  352|      1|            created_at: 0,
  353|      1|            created_at_iso: "1970-01-01T00:00:00Z".to_string(),
  354|      1|            updated_at: 0,
  355|      1|            updated_at_iso: "1970-01-01T00:00:00Z".to_string(),
  356|      1|            entities: None,
  357|      1|            relationships: None,
  358|      1|            elapsed_ms: 0,
  359|      1|        };
  360|       |
  361|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  362|      1|        assert_eq!(json["session_id"], "sess-123");
  363|      1|    }
  364|       |
  365|       |    #[test]
  366|      1|    fn read_response_elapsed_ms_is_present() {
  367|      1|        let resp = ReadResponse {
  368|      1|            id: 7,
  369|      1|            memory_id: 7,
  370|      1|            namespace: "ns".to_string(),
  371|      1|            name: "n".to_string(),
  372|      1|            type_alias: "procedure".to_string(),
  373|      1|            memory_type: "procedure".to_string(),
  374|      1|            description: "d".to_string(),
  375|      1|            body: "b".to_string(),
  376|      1|            body_hash: "h".to_string(),
  377|      1|            session_id: None,
  378|      1|            source: "agent".to_string(),
  379|      1|            metadata: serde_json::json!({}),
  380|      1|            version: 3,
  381|      1|            created_at: 1000,
  382|      1|            created_at_iso: "1970-01-01T00:16:40Z".to_string(),
  383|      1|            updated_at: 2000,
  384|      1|            updated_at_iso: "1970-01-01T00:33:20Z".to_string(),
  385|      1|            entities: None,
  386|      1|            relationships: None,
  387|      1|            elapsed_ms: 123,
  388|      1|        };
  389|       |
  390|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  391|      1|        assert_eq!(json["elapsed_ms"], 123u64);
  392|      1|        assert!(json["created_at_iso"].is_string());
  393|      1|        assert!(json["updated_at_iso"].is_string());
  394|      1|    }
  395|       |
  396|       |    #[test]
  397|      1|    fn read_response_metadata_object_not_escaped_string() {
  398|       |        // P2-A: metadata must serialize as a JSON object, not as an escaped string.
  399|      1|        let resp = ReadResponse {
  400|      1|            id: 3,
  401|      1|            memory_id: 3,
  402|      1|            namespace: "ns".to_string(),
  403|      1|            name: "meta-test".to_string(),
  404|      1|            type_alias: "fact".to_string(),
  405|      1|            memory_type: "fact".to_string(),
  406|      1|            description: "d".to_string(),
  407|      1|            body: "b".to_string(),
  408|      1|            body_hash: "h".to_string(),
  409|      1|            session_id: None,
  410|      1|            source: "agent".to_string(),
  411|      1|            metadata: serde_json::json!({"key": "value", "number": 42}),
  412|      1|            version: 1,
  413|      1|            created_at: 0,
  414|      1|            created_at_iso: "1970-01-01T00:00:00Z".to_string(),
  415|      1|            updated_at: 0,
  416|      1|            updated_at_iso: "1970-01-01T00:00:00Z".to_string(),
  417|      1|            entities: None,
  418|      1|            relationships: None,
  419|      1|            elapsed_ms: 1,
  420|      1|        };
  421|       |
  422|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  423|       |        // Must be object, not a JSON string containing escaped JSON.
  424|      1|        assert!(json["metadata"].is_object());
  425|      1|        assert_eq!(json["metadata"]["key"], "value");
  426|      1|        assert_eq!(json["metadata"]["number"], 42);
  427|      1|    }
  428|       |
  429|       |    #[test]
  430|      1|    fn read_response_metadata_fallback_to_null_for_invalid_json() {
  431|       |        // P2-A: fallback when metadata is an invalid string.
  432|      1|        let raw = "invalid-json{{{";
  433|      1|        let parsed =
  434|      1|            serde_json::from_str::<serde_json::Value>(raw).unwrap_or(serde_json::Value::Null);
  435|      1|        assert!(parsed.is_null());
  436|      1|    }
  437|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/recall.rs:
    1|       |//! Handler for the `recall` CLI subcommand.
    2|       |
    3|       |use crate::cli::MemoryType;
    4|       |use crate::errors::AppError;
    5|       |use crate::graph::traverse_from_memories_with_hops;
    6|       |use crate::i18n::errors_msg;
    7|       |use crate::output::{self, JsonOutputFormat, RecallItem, RecallResponse};
    8|       |use crate::paths::AppPaths;
    9|       |use crate::storage::connection::open_ro;
   10|       |use crate::storage::entities;
   11|       |use crate::storage::memories;
   12|       |
   13|       |/// Arguments for the `recall` subcommand.
   14|       |///
   15|       |/// When `--namespace` is omitted the query runs against the `global` namespace,
   16|       |/// which is the default namespace used by `remember` when no `--namespace` flag
   17|       |/// is provided. Pass an explicit `--namespace` value to search a different
   18|       |/// isolated namespace.
   19|       |#[derive(clap::Args)]
   20|       |#[command(after_long_help = "EXAMPLES:\n  \
   21|       |    # Semantic search for top 5 matches\n  \
   22|       |    sqlite-graphrag recall \"authentication design\" --k 5\n\n  \
   23|       |    # Disable automatic graph expansion\n  \
   24|       |    sqlite-graphrag recall \"JWT tokens\" --k 3 --no-graph\n\n  \
   25|       |    # Limit graph traversal depth and minimum edge weight\n  \
   26|       |    sqlite-graphrag recall \"auth\" --k 5 --max-hops 2 --min-weight 0.3\n\n  \
   27|       |    # Filter by memory type\n  \
   28|       |    sqlite-graphrag recall \"deployment\" --type decision --k 10\n\n  \
   29|       |    # Cap results by distance threshold\n  \
   30|       |    sqlite-graphrag recall \"API design\" --k 5 --max-distance 0.8\n\n  \
   31|       |NOTES:\n  \
   32|       |    When --no-graph is active, graph traversal is skipped and every result has\n  \
   33|       |    source=\"direct\". The source field is therefore redundant with --no-graph and\n  \
   34|       |    may be ignored by callers in that mode.")]
   35|       |pub struct RecallArgs {
   36|       |    #[arg(
   37|       |        allow_hyphen_values = true,
   38|       |        help = "Search query string (semantic vector search via sqlite-vec)"
   39|       |    )]
   40|       |    pub query: String,
   41|       |    /// Maximum number of direct vector matches to return.
   42|       |    ///
   43|       |    /// Note: this flag controls only `direct_matches`. Graph traversal results
   44|       |    /// (`graph_matches`) are unbounded by default; use `--max-graph-results` to
   45|       |    /// cap them independently. The `results` field aggregates both lists.
   46|       |    /// Validated to the inclusive range `1..=4096` (the upper bound matches
   47|       |    /// `sqlite-vec`'s knn limit; out-of-range values are rejected at parse time).
   48|       |    #[arg(short = 'k', long, aliases = ["limit", "top-k"], default_value = "10", value_parser = crate::parsers::parse_k_range)]
   49|       |    pub k: usize,
   50|       |    /// Filter by memory.type. Note: distinct from graph entity_type
   51|       |    /// (project/tool/person/file/concept/incident/decision/memory/dashboard/issue_tracker/organization/location/date)
   52|       |    /// used in --entities-file.
   53|       |    #[arg(long, value_enum)]
   54|       |    pub r#type: Option<MemoryType>,
   55|       |    #[arg(long)]
   56|       |    pub namespace: Option<String>,
   57|       |    #[arg(long)]
   58|       |    pub no_graph: bool,
   59|       |    /// Disable -k cap and return all direct matches without truncation.
   60|       |    ///
   61|       |    /// When set, the `-k`/`--k` flag is ignored for `direct_matches` and the
   62|       |    /// response includes every match above the distance threshold. Useful when
   63|       |    /// callers need the complete set rather than a top-N preview.
   64|       |    #[arg(long)]
   65|       |    pub precise: bool,
   66|       |    #[arg(long, default_value = "2")]
   67|       |    pub max_hops: u32,
   68|       |    #[arg(long, default_value = "0.3")]
   69|       |    pub min_weight: f64,
   70|       |    /// Cap the size of `graph_matches` to at most N entries.
   71|       |    ///
   72|       |    /// Defaults to unbounded (`None`) so existing pipelines see the same shape
   73|       |    /// as in v1.0.22 and earlier. Set this when a query touches a dense graph
   74|       |    /// neighbourhood and the caller only needs a top-N preview. Added in v1.0.23.
   75|       |    #[arg(long, value_name = "N")]
   76|       |    pub max_graph_results: Option<usize>,
   77|       |    /// Filter results by maximum distance. Results with distance greater than this value
   78|       |    /// are excluded. If all matches exceed this threshold, the command exits with code 4
   79|       |    /// (`not found`) per the documented public contract.
   80|       |    /// Default `1.0` disables the filter and preserves the top-k behavior.
   81|       |    #[arg(long, alias = "min-distance", default_value = "1.0")]
   82|       |    pub max_distance: f32,
   83|       |    #[arg(long, value_enum, default_value_t = JsonOutputFormat::Json)]
   84|       |    pub format: JsonOutputFormat,
   85|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   86|       |    pub db: Option<String>,
   87|       |    /// Accept `--json` as a no-op because output is already JSON by default.
   88|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   89|       |    pub json: bool,
   90|       |    /// Search across all namespaces instead of a single namespace.
   91|       |    ///
   92|       |    /// Cannot be combined with `--namespace`. When set, the query runs against
   93|       |    /// every namespace and results include a `namespace` field to identify origin.
   94|       |    #[arg(long, conflicts_with = "namespace")]
   95|       |    pub all_namespaces: bool,
   96|       |    #[command(flatten)]
   97|       |    pub daemon: crate::cli::DaemonOpts,
   98|       |}
   99|       |
  100|       |#[tracing::instrument(skip_all, level = "debug", name = "recall")]
  101|      0|pub fn run(args: RecallArgs) -> Result<(), AppError> {
  102|      0|    let start = std::time::Instant::now();
  103|      0|    let _ = args.format;
  104|      0|    tracing::debug!(target: "recall", query = %args.query, k = args.k, "searching");
  105|       |
  106|       |    // G20: reject graph-specific flags when --no-graph is active
  107|      0|    if args.no_graph {
  108|      0|        if args.max_hops != 2 {
  109|      0|            return Err(AppError::Validation(
  110|      0|                "--max-hops has no effect with --no-graph; remove one".to_string(),
  111|      0|            ));
  112|      0|        }
  113|      0|        if (args.min_weight - 0.3).abs() > f64::EPSILON {
  114|      0|            return Err(AppError::Validation(
  115|      0|                "--min-weight has no effect with --no-graph; remove one".to_string(),
  116|      0|            ));
  117|      0|        }
  118|      0|    }
  119|       |
  120|      0|    if args.query.trim().is_empty() {
  121|      0|        return Err(AppError::Validation(crate::i18n::validation::empty_query()));
  122|      0|    }
  123|       |    // Resolve the list of namespaces to search:
  124|       |    // - empty vec  => all namespaces (sentinel used by knn_search)
  125|       |    // - single vec => one namespace (default or --namespace value)
  126|      0|    let namespaces: Vec<String> = if args.all_namespaces {
  127|      0|        Vec::new()
  128|       |    } else {
  129|      0|        vec![crate::namespace::resolve_namespace(
  130|      0|            args.namespace.as_deref(),
  131|      0|        )?]
  132|       |    };
  133|       |    // Single namespace string used for graph traversal and error messages.
  134|      0|    let namespace_for_graph = namespaces
  135|      0|        .first()
  136|      0|        .cloned()
  137|      0|        .unwrap_or_else(|| "global".to_string());
  138|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
  139|       |
  140|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
  141|       |
  142|      0|    output::emit_progress_i18n(
  143|      0|        "Computing query embedding...",
  144|      0|        "Calculando embedding da consulta...",
  145|       |    );
  146|      0|    let embedding = crate::daemon::embed_query_or_local(
  147|      0|        &paths.models,
  148|      0|        &args.query,
  149|      0|        args.daemon.autostart_daemon,
  150|      0|    )?;
  151|       |
  152|      0|    let conn = open_ro(&paths.db)?;
  153|       |
  154|      0|    let memory_type_str = args.r#type.map(|t| t.as_str());
  155|       |    // When --precise is set, lift the -k cap so every match is returned; the
  156|       |    // max_distance filter below will trim irrelevant results instead.
  157|      0|    let effective_k = if args.precise { 100_000 } else { args.k };
  158|      0|    let knn_results =
  159|      0|        memories::knn_search(&conn, &embedding, &namespaces, memory_type_str, effective_k)?;
  160|       |
  161|      0|    let mut direct_matches = Vec::with_capacity(effective_k);
  162|      0|    let mut memory_ids: Vec<i64> = Vec::with_capacity(effective_k);
  163|      0|    for (memory_id, distance) in knn_results {
  164|      0|        let row = {
  165|      0|            let mut stmt = conn.prepare_cached(
  166|      0|                "SELECT id, namespace, name, type, description, body, body_hash,
  167|      0|                        session_id, source, metadata, created_at, updated_at
  168|      0|                 FROM memories WHERE id=?1 AND deleted_at IS NULL",
  169|      0|            )?;
  170|      0|            stmt.query_row(rusqlite::params![memory_id], |r| {
  171|       |                Ok(memories::MemoryRow {
  172|      0|                    id: r.get(0)?,
  173|      0|                    namespace: r.get(1)?,
  174|      0|                    name: r.get(2)?,
  175|      0|                    memory_type: r.get(3)?,
  176|      0|                    description: r.get(4)?,
  177|      0|                    body: r.get(5)?,
  178|      0|                    body_hash: r.get(6)?,
  179|      0|                    session_id: r.get(7)?,
  180|      0|                    source: r.get(8)?,
  181|      0|                    metadata: r.get(9)?,
  182|      0|                    created_at: r.get(10)?,
  183|      0|                    updated_at: r.get(11)?,
  184|      0|                    deleted_at: None,
  185|       |                })
  186|      0|            })
  187|      0|            .ok()
  188|       |        };
  189|      0|        if let Some(row) = row {
  190|      0|            let snippet: String = row.body.chars().take(300).collect();
  191|      0|            direct_matches.push(RecallItem {
  192|      0|                memory_id: row.id,
  193|      0|                name: row.name,
  194|      0|                namespace: row.namespace,
  195|      0|                memory_type: row.memory_type,
  196|      0|                description: row.description,
  197|      0|                snippet,
  198|      0|                distance,
  199|      0|                score: RecallItem::score_from_distance(distance),
  200|      0|                source: "direct".to_string(),
  201|      0|                // Direct vector matches do not have a graph depth; rely on `distance`.
  202|      0|                graph_depth: None,
  203|      0|            });
  204|      0|            memory_ids.push(memory_id);
  205|      0|        }
  206|       |    }
  207|       |
  208|      0|    let mut graph_matches = Vec::with_capacity(8);
  209|      0|    if !args.no_graph {
  210|      0|        let entity_knn = entities::knn_search(&conn, &embedding, &namespace_for_graph, 5)?;
  211|      0|        let entity_ids: Vec<i64> = entity_knn.iter().map(|(id, _)| *id).collect();
  212|       |
  213|      0|        let all_seed_ids: Vec<i64> = memory_ids
  214|      0|            .iter()
  215|      0|            .chain(entity_ids.iter())
  216|      0|            .copied()
  217|      0|            .collect();
  218|       |
  219|      0|        if !all_seed_ids.is_empty() {
  220|      0|            let graph_memory_ids = traverse_from_memories_with_hops(
  221|      0|                &conn,
  222|      0|                &all_seed_ids,
  223|      0|                &namespace_for_graph,
  224|      0|                args.min_weight,
  225|      0|                args.max_hops,
  226|      0|            )?;
  227|       |
  228|      0|            for (graph_mem_id, hop) in graph_memory_ids {
  229|       |                // v1.0.23: respect the optional cap on graph results so dense
  230|       |                // neighbourhoods do not flood the response unintentionally.
  231|      0|                if let Some(cap) = args.max_graph_results {
  232|      0|                    if graph_matches.len() >= cap {
  233|      0|                        break;
  234|      0|                    }
  235|      0|                }
  236|      0|                let row = {
  237|      0|                    let mut stmt = conn.prepare_cached(
  238|      0|                        "SELECT id, namespace, name, type, description, body, body_hash,
  239|      0|                                session_id, source, metadata, created_at, updated_at
  240|      0|                         FROM memories WHERE id=?1 AND deleted_at IS NULL",
  241|      0|                    )?;
  242|      0|                    stmt.query_row(rusqlite::params![graph_mem_id], |r| {
  243|       |                        Ok(memories::MemoryRow {
  244|      0|                            id: r.get(0)?,
  245|      0|                            namespace: r.get(1)?,
  246|      0|                            name: r.get(2)?,
  247|      0|                            memory_type: r.get(3)?,
  248|      0|                            description: r.get(4)?,
  249|      0|                            body: r.get(5)?,
  250|      0|                            body_hash: r.get(6)?,
  251|      0|                            session_id: r.get(7)?,
  252|      0|                            source: r.get(8)?,
  253|      0|                            metadata: r.get(9)?,
  254|      0|                            created_at: r.get(10)?,
  255|      0|                            updated_at: r.get(11)?,
  256|      0|                            deleted_at: None,
  257|       |                        })
  258|      0|                    })
  259|      0|                    .ok()
  260|       |                };
  261|      0|                if let Some(row) = row {
  262|      0|                    let snippet: String = row.body.chars().take(300).collect();
  263|      0|                    // Compute approximate distance from graph hop count.
  264|      0|                    // WARNING: graph_distance is a hop-count proxy, NOT real cosine distance.
  265|      0|                    // For confident ranking, prefer the `graph_depth` field (set to Some(hop)
  266|      0|                    // below). Real cosine distance for graph matches would require
  267|      0|                    // re-embedding (200-500ms latency) and is reserved for v1.0.28.
  268|      0|                    let graph_distance = 1.0 - 1.0 / (hop as f32 + 1.0);
  269|      0|                    graph_matches.push(RecallItem {
  270|      0|                        memory_id: row.id,
  271|      0|                        name: row.name,
  272|      0|                        namespace: row.namespace,
  273|      0|                        memory_type: row.memory_type,
  274|      0|                        description: row.description,
  275|      0|                        snippet,
  276|      0|                        distance: graph_distance,
  277|      0|                        score: RecallItem::score_from_distance(graph_distance),
  278|      0|                        source: "graph".to_string(),
  279|      0|                        graph_depth: Some(hop),
  280|      0|                    });
  281|      0|                }
  282|       |            }
  283|      0|        }
  284|      0|    }
  285|       |
  286|       |    // Filtrar por max_distance se < 1.0 (ativado). Se nenhum hit dentro do threshold, exit 4.
  287|      0|    if args.max_distance < 1.0 {
  288|      0|        let has_relevant = direct_matches
  289|      0|            .iter()
  290|      0|            .any(|item| item.distance <= args.max_distance);
  291|      0|        if !has_relevant {
  292|      0|            return Err(AppError::NotFound(errors_msg::no_recall_results(
  293|      0|                args.max_distance,
  294|      0|                &args.query,
  295|      0|                &namespace_for_graph,
  296|      0|            )));
  297|      0|        }
  298|      0|    }
  299|       |
  300|      0|    let results: Vec<RecallItem> = direct_matches
  301|      0|        .iter()
  302|      0|        .cloned()
  303|      0|        .chain(graph_matches.iter().cloned())
  304|      0|        .collect();
  305|       |
  306|      0|    output::emit_json(&RecallResponse {
  307|      0|        query: args.query,
  308|      0|        k: args.k,
  309|      0|        direct_matches,
  310|      0|        graph_matches,
  311|      0|        results,
  312|      0|        elapsed_ms: start.elapsed().as_millis() as u64,
  313|      0|    })?;
  314|       |
  315|      0|    Ok(())
  316|      0|}
  317|       |
  318|       |#[cfg(test)]
  319|       |mod tests {
  320|       |    use crate::output::{RecallItem, RecallResponse};
  321|       |
  322|      6|    fn make_item(name: &str, distance: f32, source: &str) -> RecallItem {
  323|       |        RecallItem {
  324|       |            memory_id: 1,
  325|      6|            name: name.to_string(),
  326|      6|            namespace: "global".to_string(),
  327|      6|            memory_type: "fact".to_string(),
  328|      6|            description: "desc".to_string(),
  329|      6|            snippet: "snippet".to_string(),
  330|      6|            distance,
  331|      6|            score: RecallItem::score_from_distance(distance),
  332|      6|            source: source.to_string(),
  333|      6|            graph_depth: if source == "graph" { Some(0) } else { None },
                                                              ^1               ^5
  334|       |        }
  335|      6|    }
  336|       |
  337|       |    // Bug M-A5: every RecallItem carries a non-null cosine similarity score.
  338|       |    #[test]
  339|      1|    fn recall_item_score_is_present_and_finite_for_direct_match() {
  340|      1|        let item = make_item("mem", 0.25, "direct");
  341|      1|        let json = serde_json::to_value(&item).expect("serialization failed");
  342|      1|        let score = json["score"].as_f64().expect("score must be a number");
  343|      1|        assert!(
  344|      1|            (0.0..=1.0).contains(&score),
  345|      0|            "score must be in [0, 1], got {score}"
  346|       |        );
  347|      1|        assert!(
  348|      1|            (score - 0.75).abs() < 1e-6,
  349|      0|            "score must equal 1 - distance for canonical case"
  350|       |        );
  351|      1|    }
  352|       |
  353|       |    #[test]
  354|      1|    fn recall_item_score_clamps_distance_outside_unit_range() {
  355|       |        // Pathological distances must not yield score outside [0, 1] or NaN.
  356|      1|        assert_eq!(RecallItem::score_from_distance(2.0), 0.0);
  357|      1|        assert_eq!(RecallItem::score_from_distance(-0.5), 1.0);
  358|      1|        assert_eq!(RecallItem::score_from_distance(f32::NAN), 0.0);
  359|      1|    }
  360|       |
  361|       |    #[test]
  362|      1|    fn recall_response_serializes_required_fields() {
  363|      1|        let resp = RecallResponse {
  364|      1|            query: "rust memory".to_string(),
  365|      1|            k: 5,
  366|      1|            direct_matches: vec![make_item("mem-a", 0.12, "direct")],
  367|      1|            graph_matches: vec![],
  368|      1|            results: vec![make_item("mem-a", 0.12, "direct")],
  369|      1|            elapsed_ms: 42,
  370|      1|        };
  371|       |
  372|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  373|      1|        assert_eq!(json["query"], "rust memory");
  374|      1|        assert_eq!(json["k"], 5);
  375|      1|        assert_eq!(json["elapsed_ms"], 42u64);
  376|      1|        assert!(json["direct_matches"].is_array());
  377|      1|        assert!(json["graph_matches"].is_array());
  378|      1|        assert!(json["results"].is_array());
  379|      1|    }
  380|       |
  381|       |    #[test]
  382|      1|    fn recall_item_serializes_renamed_type() {
  383|      1|        let item = make_item("mem-test", 0.25, "direct");
  384|      1|        let json = serde_json::to_value(&item).expect("serialization failed");
  385|       |
  386|       |        // The memory_type field is renamed to "type" in JSON
  387|      1|        assert_eq!(json["type"], "fact");
  388|      1|        assert_eq!(json["distance"], 0.25f32);
  389|      1|        assert_eq!(json["source"], "direct");
  390|      1|    }
  391|       |
  392|       |    #[test]
  393|      1|    fn recall_response_results_contains_direct_and_graph() {
  394|      1|        let direct = make_item("d-mem", 0.10, "direct");
  395|      1|        let graph = make_item("g-mem", 0.0, "graph");
  396|       |
  397|      1|        let resp = RecallResponse {
  398|      1|            query: "query".to_string(),
  399|      1|            k: 10,
  400|      1|            direct_matches: vec![direct.clone()],
  401|      1|            graph_matches: vec![graph.clone()],
  402|      1|            results: vec![direct, graph],
  403|      1|            elapsed_ms: 10,
  404|      1|        };
  405|       |
  406|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  407|      1|        assert_eq!(json["direct_matches"].as_array().unwrap().len(), 1);
  408|      1|        assert_eq!(json["graph_matches"].as_array().unwrap().len(), 1);
  409|      1|        assert_eq!(json["results"].as_array().unwrap().len(), 2);
  410|      1|        assert_eq!(json["results"][0]["source"], "direct");
  411|      1|        assert_eq!(json["results"][1]["source"], "graph");
  412|      1|    }
  413|       |
  414|       |    #[test]
  415|      1|    fn recall_response_empty_serializes_empty_arrays() {
  416|      1|        let resp = RecallResponse {
  417|      1|            query: "nothing".to_string(),
  418|      1|            k: 3,
  419|      1|            direct_matches: vec![],
  420|      1|            graph_matches: vec![],
  421|      1|            results: vec![],
  422|      1|            elapsed_ms: 1,
  423|      1|        };
  424|       |
  425|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  426|      1|        assert_eq!(json["direct_matches"].as_array().unwrap().len(), 0);
  427|      1|        assert_eq!(json["results"].as_array().unwrap().len(), 0);
  428|      1|    }
  429|       |
  430|       |    #[test]
  431|      1|    fn graph_matches_distance_uses_hop_count_proxy() {
  432|       |        // Verify the hop-count proxy formula: 1.0 - 1.0 / (hop + 1.0)
  433|       |        // hop=0 → 0.0 (seed-level entity, identity distance)
  434|       |        // hop=1 → 0.5
  435|       |        // hop=2 → ≈ 0.667
  436|       |        // hop=3 → 0.75
  437|      1|        let cases: &[(u32, f32)] = &[(0, 0.0), (1, 0.5), (2, 0.6667), (3, 0.75)];
  438|      5|        for &(hop, expected) in cases {
                            ^4   ^4
  439|      4|            let d = 1.0_f32 - 1.0 / (hop as f32 + 1.0);
  440|      4|            assert!(
  441|      4|                (d - expected).abs() < 0.001,
  442|      0|                "hop={hop} expected={expected} got={d}"
  443|       |            );
  444|       |        }
  445|      1|    }
  446|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/reclassify.rs:
    1|       |//! Handler for the `reclassify` CLI subcommand (GAP-18).
    2|       |//!
    3|       |//! Reclassifies one entity (single mode) or a whole group of entities (batch
    4|       |//! mode) by updating the `type` column in the `entities` table.
    5|       |//!
    6|       |//! Single mode: `--name <entity>` changes the type of one entity.
    7|       |//! Batch mode: `--from-type <old> --to-type <new> --batch` changes every
    8|       |//! entity in the namespace that currently has `<old>` as its type.
    9|       |
   10|       |use crate::entity_type::EntityType;
   11|       |use crate::errors::AppError;
   12|       |use crate::i18n::errors_msg;
   13|       |use crate::output::{self, OutputFormat};
   14|       |use crate::paths::AppPaths;
   15|       |use crate::storage::connection::open_rw;
   16|       |use crate::storage::entities;
   17|       |use rusqlite::params;
   18|       |use serde::Serialize;
   19|       |
   20|       |#[derive(clap::Args)]
   21|       |#[command(after_long_help = "EXAMPLES:\n  \
   22|       |    # Reclassify a single entity from its current type to 'tool'\n  \
   23|       |    sqlite-graphrag reclassify --name tokio-runtime --new-type tool\n\n  \
   24|       |    # Reclassify all 'concept' entities to 'tool' in one shot (batch)\n  \
   25|       |    sqlite-graphrag reclassify --from-type concept --to-type tool --batch\n\n  \
   26|       |    # Reclassify in a specific namespace\n  \
   27|       |    sqlite-graphrag reclassify --name alice --new-type person --namespace my-project\n\n\
   28|       |NOTE:\n  \
   29|       |    Single mode requires --name and at least one of --new-type or --description.\n  \
   30|       |    Batch mode requires --from-type, --to-type and --batch.\n  \
   31|       |    Providing --name together with --batch is an error.\n\n\
   32|       |VALID ENTITY TYPES:\n  \
   33|       |    project, tool, person, file, concept, incident, decision,\n  \
   34|       |    memory, dashboard, issue_tracker, organization, location, date")]
   35|       |pub struct ReclassifyArgs {
   36|       |    /// Entity name to reclassify (single mode). Mutually exclusive with --from-type + --batch.
   37|       |    #[arg(long, conflicts_with_all = ["from_type", "batch"])]
   38|       |    pub name: Option<String>,
   39|       |    /// New entity type for single mode.
   40|       |    #[arg(long, value_enum, value_name = "TYPE")]
   41|       |    pub new_type: Option<EntityType>,
   42|       |    /// New description for the entity (single mode only). Ignored in batch mode.
   43|       |    #[arg(long, value_name = "TEXT")]
   44|       |    pub description: Option<String>,
   45|       |    /// Current entity type to match in batch mode. Requires --to-type and --batch.
   46|       |    #[arg(
   47|       |        long,
   48|       |        value_enum,
   49|       |        value_name = "TYPE",
   50|       |        requires = "to_type",
   51|       |        requires = "batch"
   52|       |    )]
   53|       |    pub from_type: Option<EntityType>,
   54|       |    /// New entity type to assign in batch mode. Requires --from-type and --batch.
   55|       |    #[arg(long, value_enum, value_name = "TYPE", requires = "from_type")]
   56|       |    pub to_type: Option<EntityType>,
   57|       |    /// Enable batch reclassification (--from-type to --to-type). Requires --from-type and --to-type.
   58|       |    #[arg(long, default_value_t = false, requires = "from_type")]
   59|       |    pub batch: bool,
   60|       |    #[arg(long)]
   61|       |    pub namespace: Option<String>,
   62|       |    #[arg(long, value_enum, default_value = "json")]
   63|       |    pub format: OutputFormat,
   64|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   65|       |    pub json: bool,
   66|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   67|       |    pub db: Option<String>,
   68|       |}
   69|       |
   70|       |#[derive(Serialize)]
   71|       |struct ReclassifyResponse {
   72|       |    action: String,
   73|       |    count: usize,
   74|       |    #[serde(skip_serializing_if = "Option::is_none")]
   75|       |    description_updated: Option<bool>,
   76|       |    namespace: String,
   77|       |    /// Total execution time in milliseconds from handler start to serialisation.
   78|       |    elapsed_ms: u64,
   79|       |}
   80|       |
   81|      0|pub fn run(args: ReclassifyArgs) -> Result<(), AppError> {
   82|      0|    let inicio = std::time::Instant::now();
   83|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
   84|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
   85|       |
   86|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
   87|       |
   88|      0|    let mut conn = open_rw(&paths.db)?;
   89|       |
   90|      0|    let count = if args.batch {
   91|       |        // Batch mode: --from-type + --to-type + --batch
   92|      0|        let from_type = args.from_type.ok_or_else(|| {
   93|      0|            AppError::Validation("--from-type is required in batch mode".to_string())
   94|      0|        })?;
   95|      0|        let to_type = args.to_type.ok_or_else(|| {
   96|      0|            AppError::Validation("--to-type is required in batch mode".to_string())
   97|      0|        })?;
   98|       |
   99|      0|        let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
  100|      0|        let affected = tx.execute(
  101|      0|            "UPDATE entities SET type = ?1, updated_at = unixepoch()
  102|      0|             WHERE type = ?2 AND namespace = ?3",
  103|      0|            params![to_type.as_str(), from_type.as_str(), namespace],
  104|      0|        )?;
  105|      0|        tx.commit()?;
  106|      0|        if affected == 0 {
  107|      0|            tracing::warn!(target: "reclassify",
  108|      0|                from_type = from_type.as_str(),
  109|       |                namespace = %namespace,
  110|      0|                "reclassify batch matched zero entities — verify --from-type value exists"
  111|       |            );
  112|      0|        }
  113|      0|        affected
  114|       |    } else {
  115|       |        // Single mode: --name + --new-type
  116|      0|        let entity_name = args
  117|      0|            .name
  118|      0|            .as_deref()
  119|      0|            .ok_or_else(|| AppError::Validation("--name is required in single mode".to_string()))?;
  120|      0|        if args.new_type.is_none() && args.description.is_none() {
  121|      0|            return Err(AppError::Validation(
  122|      0|                "at least one of --new-type or --description is required in single mode"
  123|      0|                    .to_string(),
  124|      0|            ));
  125|      0|        }
  126|       |
  127|       |        // Verify entity exists.
  128|      0|        entities::find_entity_id(&conn, &namespace, entity_name)?.ok_or_else(|| {
  129|      0|            AppError::NotFound(errors_msg::entity_not_found(entity_name, &namespace))
  130|      0|        })?;
  131|       |
  132|      0|        let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
  133|      0|        let mut affected = 0;
  134|      0|        if let Some(new_type) = args.new_type {
  135|      0|            affected = tx.execute(
  136|      0|                "UPDATE entities SET type = ?1, updated_at = unixepoch()
  137|      0|                 WHERE name = ?2 AND namespace = ?3",
  138|      0|                params![new_type.as_str(), entity_name, namespace],
  139|      0|            )?;
  140|      0|        }
  141|      0|        if let Some(ref desc) = args.description {
  142|      0|            let rows = tx.execute(
  143|      0|                "UPDATE entities SET description = ?1, updated_at = unixepoch()
  144|      0|                 WHERE name = ?2 AND namespace = ?3",
  145|      0|                params![desc, entity_name, namespace],
  146|      0|            )?;
  147|      0|            if affected == 0 {
  148|      0|                affected = rows;
  149|      0|            }
  150|      0|        }
  151|      0|        tx.commit()?;
  152|      0|        affected
  153|       |    };
  154|       |
  155|      0|    conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
  156|       |
  157|      0|    let response = ReclassifyResponse {
  158|      0|        action: "reclassified".to_string(),
  159|      0|        count,
  160|      0|        description_updated: if args.description.is_some() {
  161|      0|            Some(true)
  162|       |        } else {
  163|      0|            None
  164|       |        },
  165|      0|        namespace: namespace.clone(),
  166|      0|        elapsed_ms: inicio.elapsed().as_millis() as u64,
  167|       |    };
  168|       |
  169|      0|    match args.format {
  170|      0|        OutputFormat::Json => output::emit_json(&response)?,
  171|      0|        OutputFormat::Text | OutputFormat::Markdown => {
  172|      0|            output::emit_text(&format!(
  173|      0|                "reclassified: {} entities [{}]",
  174|      0|                response.count, response.namespace
  175|      0|            ));
  176|      0|        }
  177|       |    }
  178|       |
  179|      0|    Ok(())
  180|      0|}
  181|       |
  182|       |#[cfg(test)]
  183|       |mod tests { // Serialization-shape checks for ReclassifyResponse; no database is opened here.
  184|       |    use super::*;
  185|       |
  186|       |    #[test]
  187|      1|    fn reclassify_response_serializes_all_fields() {
  188|      1|        let resp = ReclassifyResponse {
  189|      1|            action: "reclassified".to_string(),
  190|      1|            count: 5,
  191|      1|            description_updated: None,
  192|      1|            namespace: "global".to_string(),
  193|      1|            elapsed_ms: 12,
  194|      1|        };
  195|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  196|      1|        assert_eq!(json["action"], "reclassified");
  197|      1|        assert_eq!(json["count"], 5);
  198|      1|        assert_eq!(json["namespace"], "global");
  199|      1|        assert!(json["elapsed_ms"].is_number());
  200|      1|        assert!(json.get("description_updated").is_none()); // the key must be absent (not null) when the field is None
  201|      1|    }
  202|       |
  203|       |    #[test]
  204|      1|    fn reclassify_response_count_zero_is_valid() {
  205|      1|        let resp = ReclassifyResponse {
  206|      1|            action: "reclassified".to_string(),
  207|      1|            count: 0, // a zero count must still serialize as a number
  208|      1|            description_updated: None,
  209|      1|            namespace: "my-project".to_string(),
  210|      1|            elapsed_ms: 3,
  211|      1|        };
  212|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  213|      1|        assert_eq!(json["count"], 0);
  214|      1|        assert_eq!(json["action"], "reclassified");
  215|      1|    }
  216|       |
  217|       |    #[test]
  218|      1|    fn reclassify_response_action_is_reclassified() {
  219|      1|        let resp = ReclassifyResponse {
  220|      1|            action: "reclassified".to_string(),
  221|      1|            count: 1,
  222|      1|            description_updated: None,
  223|      1|            namespace: "ns".to_string(),
  224|      1|            elapsed_ms: 1,
  225|      1|        };
  226|      1|        assert_eq!(resp.action, "reclassified"); // reads the field directly; no serialization involved
  227|      1|    }
  228|       |
  229|       |    #[test]
  230|      1|    fn reclassify_response_description_updated_present_when_set() {
  231|      1|        let resp = ReclassifyResponse {
  232|      1|            action: "reclassified".to_string(),
  233|      1|            count: 1,
  234|      1|            description_updated: Some(true),
  235|      1|            namespace: "global".to_string(),
  236|      1|            elapsed_ms: 2,
  237|      1|        };
  238|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  239|      1|        assert_eq!(json["description_updated"], true); // counterpart of the absent-key check in the first test
  240|      1|    }
  241|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/reclassify_relation.rs:
    1|       |//! Handler for the `reclassify-relation` CLI subcommand (GAP-13).
    2|       |//!
    3|       |//! Renames a relation type in the `relationships` table — either a single
    4|       |//! directed edge (`--source`, `--target`, `--from-relation`) or every edge of
    5|       |//! a given type in the namespace (`--batch`).
    6|       |//!
    7|       |//! When the rename would produce a duplicate `(source_id, target_id, relation)`
    8|       |//! triple, `UPDATE OR IGNORE` skips the conflicting row and the subsequent
    9|       |//! `DELETE` removes it; the count of such skipped rows is reported as
   10|       |//! `merged_duplicates`.
   11|       |
   12|       |use crate::entity_type::EntityType;
   13|       |use crate::errors::AppError;
   14|       |use crate::output::{self, OutputFormat};
   15|       |use crate::paths::AppPaths;
   16|       |use crate::storage::connection::open_rw;
   17|       |use rusqlite::params;
   18|       |use serde::Serialize;
   19|       |
   20|       |#[derive(clap::Args)]
   21|       |#[command(after_long_help = "EXAMPLES:\n  \
   22|       |    # Rename a single edge from 'mentions' to 'related'\n  \
   23|       |    sqlite-graphrag reclassify-relation --source tokio --target axum \\\n  \
   24|       |        --from-relation mentions --to-relation related\n\n  \
   25|       |    # Rename every 'mentions' edge in the namespace to 'related'\n  \
   26|       |    sqlite-graphrag reclassify-relation \\\n  \
   27|       |        --from-relation mentions --to-relation related --batch\n\n  \
   28|       |    # Dry-run to preview what would change\n  \
   29|       |    sqlite-graphrag reclassify-relation \\\n  \
   30|       |        --from-relation mentions --to-relation related --batch --dry-run\n\n  \
   31|       |    # Batch rename only edges whose source is a 'tool' entity\n  \
   32|       |    sqlite-graphrag reclassify-relation \\\n  \
   33|       |        --from-relation uses --to-relation depends_on --batch \\\n  \
   34|       |        --filter-source-type tool\n\n\
   35|       |NOTE:\n  \
   36|       |    Single mode requires --source, --target, --from-relation and --to-relation.\n  \
   37|       |    Batch mode requires --from-relation, --to-relation and --batch.\n  \
   38|       |    --filter-source-type and --filter-target-type are only effective in batch mode.")]
   39|       |pub struct ReclassifyRelationArgs {
   40|       |    /// Source entity name (single mode). Mutually exclusive with --batch.
   41|       |    #[arg(long, conflicts_with = "batch", value_name = "ENTITY")]
   42|       |    pub source: Option<String>, // presence in single mode is checked in run_single, not by clap
   43|       |    /// Target entity name (single mode). Mutually exclusive with --batch.
   44|       |    #[arg(long, conflicts_with = "batch", value_name = "ENTITY")]
   45|       |    pub target: Option<String>, // presence in single mode is checked in run_single, not by clap
   46|       |    /// Current relation type to rename. Required in both single and batch modes.
   47|       |    #[arg(long, value_parser = crate::parsers::parse_relation, value_name = "RELATION")]
   48|       |    pub from_relation: String, // plain String (not Option), so the flag is mandatory in every mode
   49|       |    /// New relation type to assign. Required in both single and batch modes.
   50|       |    #[arg(long, value_parser = crate::parsers::parse_relation, value_name = "RELATION")]
   51|       |    pub to_relation: String, // plain String (not Option), so the flag is mandatory in every mode
   52|       |    /// Enable batch reclassification of all edges with --from-relation. Requires --from-relation and --to-relation.
   53|       |    #[arg(long, default_value_t = false)]
   54|       |    pub batch: bool, // run() dispatches to run_batch when set, run_single otherwise
   55|       |    /// Filter batch: only rename edges whose source entity has this type.
   56|       |    #[arg(long, value_enum, value_name = "TYPE", requires = "batch")]
   57|       |    pub filter_source_type: Option<EntityType>,
   58|       |    /// Filter batch: only rename edges whose target entity has this type.
   59|       |    #[arg(long, value_enum, value_name = "TYPE", requires = "batch")]
   60|       |    pub filter_target_type: Option<EntityType>,
   61|       |    /// Preview count without committing changes.
   62|       |    #[arg(long, default_value_t = false)]
   63|       |    pub dry_run: bool, // both modes return before opening a transaction when set
   64|       |    #[arg(long)]
   65|       |    pub namespace: Option<String>, // resolved through crate::namespace::resolve_namespace in run()
   66|       |    #[arg(long, value_enum, default_value = "json")]
   67|       |    pub format: OutputFormat, // Text and Markdown share one rendering in emit_response
   68|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   69|       |    pub json: bool, // hidden flag; never read by the handlers in this file
   70|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   71|       |    pub db: Option<String>, // passed to AppPaths::resolve in run()
   72|       |}
   73|       |
   74|       |#[derive(Serialize)]
   75|       |struct ReclassifyRelationResponse {
   76|       |    action: String, // the callers in this file pass "reclassified" or "dry_run"
   77|       |    from_relation: String, // echo of --from-relation
   78|       |    to_relation: String, // echo of --to-relation
   79|       |    /// Number of edges successfully renamed.
   80|       |    count: usize, // in dry-run output this carries the number of matching edges instead
   81|       |    /// Edges that collided with an existing (source, target, to_relation) triple
   82|       |    /// and were removed rather than renamed (UPDATE OR IGNORE + DELETE pattern).
   83|       |    merged_duplicates: usize, // always passed as 0 on the dry-run paths
   84|       |    namespace: String,
   85|       |    elapsed_ms: u64, // milliseconds since the Instant captured at the top of run()
   86|       |}
   87|       |
   88|      0|pub fn run(args: ReclassifyRelationArgs) -> Result<(), AppError> { // entry point: validate, open the DB read-write, dispatch on --batch
   89|      0|    let inicio = std::time::Instant::now(); // start of the elapsed_ms measurement reported by emit_response
   90|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
   91|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
   92|       |
   93|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
   94|       |
   95|       |    // Emit warnings for non-canonical relation values (this runs before the same-value check below).
   96|      0|    crate::parsers::warn_if_non_canonical(&args.from_relation);
   97|      0|    crate::parsers::warn_if_non_canonical(&args.to_relation);
   98|       |
   99|       |    // Reject same-value renames: the UPDATE would change nothing and the follow-up DELETE (relation = from) would then drop those edges.
  100|      0|    if args.from_relation == args.to_relation {
  101|      0|        return Err(AppError::Validation(
  102|      0|            "--from-relation and --to-relation must be different".to_string(),
  103|      0|        ));
  104|      0|    }
  105|       |
  106|      0|    let mut conn = open_rw(&paths.db)?; // mutable because both modes open a transaction on it
  107|       |
  108|      0|    if args.batch {
  109|      0|        run_batch(args, inicio, namespace, &mut conn)
  110|       |    } else {
  111|      0|        run_single(args, inicio, namespace, &mut conn)
  112|       |    }
  113|      0|}
  114|       |
  115|       |// ---------------------------------------------------------------------------
  116|       |// Single mode: rename one (source, target, from_relation) edge inside an IMMEDIATE transaction.
  117|       |// ---------------------------------------------------------------------------
  118|       |
  119|      0|fn run_single(
  120|      0|    args: ReclassifyRelationArgs,
  121|      0|    inicio: std::time::Instant,
  122|      0|    namespace: String,
  123|      0|    conn: &mut rusqlite::Connection,
  124|      0|) -> Result<(), AppError> {
  125|      0|    let source_name = args.source.as_deref().ok_or_else(|| {
  126|      0|        AppError::Validation(
  127|      0|            "--source is required in single mode (omit --batch for single-edge rename)".to_string(),
  128|      0|        )
  129|      0|    })?;
  130|      0|    let target_name = args
  131|      0|        .target
  132|      0|        .as_deref()
  133|      0|        .ok_or_else(|| AppError::Validation("--target is required in single mode".to_string()))?;
  134|       |
  135|       |    // Resolve entity IDs — fail fast if either side does not exist.
  136|       |    // Normalize names to match the normalized stored entity names.
  137|      0|    let source_name_norm = crate::parsers::normalize_entity_name(source_name);
  138|      0|    let target_name_norm = crate::parsers::normalize_entity_name(target_name);
  139|      0|    let source_id: i64 = conn
  140|      0|        .query_row(
  141|      0|            "SELECT id FROM entities WHERE name = ?1 AND namespace = ?2",
  142|      0|            params![source_name_norm, namespace],
  143|      0|            |r| r.get(0),
  144|       |        )
  145|      0|        .map_err(|_| { // NOTE(review): every rusqlite error, not only "no rows", is reported as NotFound
  146|      0|            AppError::NotFound(format!(
  147|      0|                "source entity '{source_name}' not found in namespace '{namespace}'"
  148|      0|            ))
  149|      0|        })?;
  150|       |
  151|      0|    let target_id: i64 = conn
  152|      0|        .query_row(
  153|      0|            "SELECT id FROM entities WHERE name = ?1 AND namespace = ?2",
  154|      0|            params![target_name_norm, namespace],
  155|      0|            |r| r.get(0),
  156|       |        )
  157|      0|        .map_err(|_| { // NOTE(review): same blanket mapping to NotFound as for the source lookup
  158|      0|            AppError::NotFound(format!(
  159|      0|                "target entity '{target_name}' not found in namespace '{namespace}'"
  160|      0|            ))
  161|      0|        })?;
  162|       |
  163|       |    // Verify the edge to rename exists.
  164|      0|    let original_count: i64 = conn.query_row(
  165|      0|        "SELECT COUNT(*) FROM relationships
  166|      0|         WHERE source_id = ?1 AND target_id = ?2 AND relation = ?3 AND namespace = ?4",
  167|      0|        params![source_id, target_id, args.from_relation, namespace],
  168|      0|        |r| r.get(0),
  169|      0|    )?;
  170|       |
  171|      0|    if original_count == 0 {
  172|      0|        return Err(AppError::NotFound(format!(
  173|      0|            "edge '{source_name}' --[{}]--> '{target_name}' not found in namespace '{namespace}'",
  174|      0|            args.from_relation
  175|      0|        )));
  176|      0|    }
  177|       |
  178|      0|    if args.dry_run { // report the match count and return before any write
  179|      0|        emit_response(
  180|      0|            &args,
  181|      0|            "dry_run",
  182|      0|            original_count as usize,
  183|       |            0,
  184|      0|            namespace,
  185|      0|            inicio,
  186|      0|        )?;
  187|      0|        return Ok(());
  188|      0|    }
  189|       |
  190|      0|    let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
  191|       |
  192|      0|    let updated = tx.execute( // rows actually renamed; a row that would violate UNIQUE is left untouched
  193|      0|        "UPDATE OR IGNORE relationships
  194|      0|         SET relation = ?1
  195|      0|         WHERE source_id = ?2 AND target_id = ?3 AND relation = ?4 AND namespace = ?5",
  196|      0|        params![
  197|      0|            args.to_relation,
  198|      0|            source_id,
  199|      0|            target_id,
  200|      0|            args.from_relation,
  201|      0|            namespace
  202|      0|        ],
  203|      0|    )?;
  204|       |
  205|       |    // Remove rows that UPDATE OR IGNORE silently skipped due to UNIQUE collision; each deleted row is one merged duplicate.
  206|      0|    let deleted = tx.execute(
  207|      0|        "DELETE FROM relationships
  208|      0|         WHERE source_id = ?1 AND target_id = ?2 AND relation = ?3 AND namespace = ?4",
  209|      0|        params![source_id, target_id, args.from_relation, namespace],
  210|      0|    )?;
  211|       |
  212|      0|    tx.commit()?;
  213|       |
  214|      0|    conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?; // runs after commit: an error here is returned although the rename is already durable
  215|       |
  216|      0|    let merged = deleted; // fix: original_count - (updated + deleted) cancels to 0, since every counted row is either updated or deleted
  217|      0|    emit_response(&args, "reclassified", updated, merged, namespace, inicio)
  218|      0|}
  219|       |
  220|       |// ---------------------------------------------------------------------------
  221|       |// Batch mode: rename every from_relation edge in the namespace, optionally filtered by entity type.
  222|       |// ---------------------------------------------------------------------------
  223|       |
  224|      0|fn run_batch(
  225|      0|    args: ReclassifyRelationArgs,
  226|      0|    inicio: std::time::Instant,
  227|      0|    namespace: String,
  228|      0|    conn: &mut rusqlite::Connection,
  229|      0|) -> Result<(), AppError> {
  230|       |    // Build WHERE clause extensions for optional entity-type filters.
  231|       |    // The base query joins relationships with source/target entities.
  232|      0|    let source_filter = args
  233|      0|        .filter_source_type
  234|      0|        .map(|t| format!(" AND src.type = '{}'", t.as_str())) // interpolated, not bound; presumably as_str() yields a fixed variant name — confirm
  235|      0|        .unwrap_or_default();
  236|      0|    let target_filter = args
  237|      0|        .filter_target_type
  238|      0|        .map(|t| format!(" AND tgt.type = '{}'", t.as_str())) // same interpolation caveat as the source filter
  239|      0|        .unwrap_or_default();
  240|      0|    let has_filters = !source_filter.is_empty() || !target_filter.is_empty();
  241|       |
  242|       |    // Count edges that would be affected (used for the zero-match warning and the dry-run report).
  243|      0|    let original_count: i64 = if has_filters {
  244|      0|        conn.query_row(
  245|      0|            &format!(
  246|      0|                "SELECT COUNT(*) FROM relationships r
  247|      0|                 JOIN entities src ON src.id = r.source_id
  248|      0|                 JOIN entities tgt ON tgt.id = r.target_id
  249|      0|                 WHERE r.relation = ?1 AND r.namespace = ?2{source_filter}{target_filter}"
  250|      0|            ),
  251|      0|            params![args.from_relation, namespace],
  252|      0|            |r| r.get(0),
  253|      0|        )?
  254|       |    } else {
  255|      0|        conn.query_row(
  256|      0|            "SELECT COUNT(*) FROM relationships
  257|      0|             WHERE relation = ?1 AND namespace = ?2",
  258|      0|            params![args.from_relation, namespace],
  259|      0|            |r| r.get(0),
  260|      0|        )?
  261|       |    };
  262|       |
  263|      0|    if original_count == 0 { // only warns: execution continues and the response reports count 0
  264|      0|        tracing::warn!(target: "reclassify_relation",
  265|       |            from_relation = %args.from_relation,
  266|       |            namespace = %namespace,
  267|      0|            "reclassify-relation batch matched zero edges — verify --from-relation value"
  268|       |        );
  269|      0|    }
  270|       |
  271|      0|    if args.dry_run { // report the match count and return before any write
  272|      0|        emit_response(
  273|      0|            &args,
  274|      0|            "dry_run",
  275|      0|            original_count as usize,
  276|       |            0,
  277|      0|            namespace,
  278|      0|            inicio,
  279|      0|        )?;
  280|      0|        return Ok(());
  281|      0|    }
  282|       |
  283|      0|    let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
  284|       |
  285|      0|    let updated = if has_filters {
  286|       |        // For filtered batch we need to collect IDs first, then update.
  287|      0|        let ids: Vec<i64> = {
  288|      0|            let mut stmt = tx.prepare(&format!(
  289|      0|                "SELECT r.id FROM relationships r
  290|      0|                 JOIN entities src ON src.id = r.source_id
  291|      0|                 JOIN entities tgt ON tgt.id = r.target_id
  292|      0|                 WHERE r.relation = ?1 AND r.namespace = ?2{source_filter}{target_filter}"
  293|      0|            ))?;
  294|      0|            let collected: Vec<i64> = stmt
  295|      0|                .query_map(params![args.from_relation, namespace], |r| r.get(0))?
  296|      0|                .collect::<Result<Vec<_>, _>>()?;
  297|      0|            collected
  298|       |        };
  299|       |
  300|      0|        let mut moved: usize = 0;
  301|      0|        for id in &ids {
  302|      0|            let n = tx.execute( // 1 when the row was renamed, 0 when OR IGNORE skipped it
  303|      0|                "UPDATE OR IGNORE relationships
  304|      0|                 SET relation = ?1
  305|      0|                 WHERE id = ?2",
  306|      0|                params![args.to_relation, id],
  307|      0|            )?;
  308|      0|            moved += n;
  309|       |        }
  310|      0|        moved
  311|       |    } else {
  312|      0|        tx.execute(
  313|      0|            "UPDATE OR IGNORE relationships
  314|      0|             SET relation = ?1
  315|      0|             WHERE relation = ?2 AND namespace = ?3",
  316|      0|            params![args.to_relation, args.from_relation, namespace],
  317|      0|        )?
  318|       |    };
  319|       |
  320|       |    // Remove rows the UPDATE OR IGNORE left behind (UNIQUE collision survivors); their number is the merged-duplicate count.
  321|      0|    let deleted = if has_filters {
  322|      0|        tx.execute(
  323|      0|            &format!(
  324|      0|                "DELETE FROM relationships WHERE id IN (
  325|      0|                     SELECT r.id FROM relationships r
  326|      0|                     JOIN entities src ON src.id = r.source_id
  327|      0|                     JOIN entities tgt ON tgt.id = r.target_id
  328|      0|                     WHERE r.relation = ?1 AND r.namespace = ?2{source_filter}{target_filter}
  329|      0|                 )"
  330|      0|            ),
  331|      0|            params![args.from_relation, namespace],
  332|      0|        )?
  333|       |    } else {
  334|      0|        tx.execute(
  335|      0|            "DELETE FROM relationships WHERE relation = ?1 AND namespace = ?2",
  336|      0|            params![args.from_relation, namespace],
  337|      0|        )?
  338|       |    };
  339|       |
  340|      0|    tx.commit()?;
  341|       |
  342|      0|    conn.execute_batch("ANALYZE relationships;")?; // runs after commit, like the checkpoint below: an error here is returned although the rename is durable
  343|      0|    conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
  344|       |
  345|      0|    let merged = deleted; // fix: original_count - (updated + deleted) cancels to 0, since every counted row is either updated or deleted
  346|      0|    emit_response(&args, "reclassified", updated, merged, namespace, inicio)
  347|      0|}
  348|       |
  349|       |// ---------------------------------------------------------------------------
  350|       |// Shared response emitter: builds the response struct and prints it in the requested format.
  351|       |// ---------------------------------------------------------------------------
  352|       |
  353|      0|fn emit_response(
  354|      0|    args: &ReclassifyRelationArgs,
  355|      0|    action: &str,
  356|      0|    count: usize,
  357|      0|    merged_duplicates: usize,
  358|      0|    namespace: String,
  359|      0|    inicio: std::time::Instant,
  360|      0|) -> Result<(), AppError> {
  361|      0|    let response = ReclassifyRelationResponse {
  362|      0|        action: action.to_string(),
  363|      0|        from_relation: args.from_relation.clone(),
  364|      0|        to_relation: args.to_relation.clone(),
  365|      0|        count,
  366|      0|        merged_duplicates,
  367|      0|        namespace: namespace.clone(), // cloned because the text branch below still formats `namespace`
  368|      0|        elapsed_ms: inicio.elapsed().as_millis() as u64, // u128 -> u64 narrowing cast
  369|      0|    };
  370|       |
  371|      0|    match args.format {
  372|      0|        OutputFormat::Json => output::emit_json(&response)?,
  373|      0|        OutputFormat::Text | OutputFormat::Markdown => { // Markdown has no dedicated rendering; it shares the plain-text line
  374|      0|            output::emit_text(&format!(
  375|      0|                "{action}: {count} edges '{}' → '{}' [{namespace}] (duplicates merged: {merged_duplicates})",
  376|      0|                args.from_relation, args.to_relation
  377|      0|            ));
  378|      0|        }
  379|       |    }
  380|      0|    Ok(())
  381|      0|}
  382|       |
  383|       |#[cfg(test)]
  384|       |mod tests { // Serialization-shape checks for ReclassifyRelationResponse; run(), run_single and run_batch are not called.
  385|       |    use super::*;
  386|       |
  387|      6|    fn make_response(action: &str, count: usize, merged: usize) -> ReclassifyRelationResponse { // fixture with fixed relation names and namespace
  388|      6|        ReclassifyRelationResponse {
  389|      6|            action: action.to_string(),
  390|      6|            from_relation: "mentions".to_string(),
  391|      6|            to_relation: "related".to_string(),
  392|      6|            count,
  393|      6|            merged_duplicates: merged,
  394|      6|            namespace: "global".to_string(),
  395|      6|            elapsed_ms: 1,
  396|      6|        }
  397|      6|    }
  398|       |
  399|       |    #[test]
  400|      1|    fn response_serializes_all_fields() {
  401|      1|        let resp = make_response("reclassified", 5, 0);
  402|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  403|      1|        assert_eq!(json["action"], "reclassified");
  404|      1|        assert_eq!(json["from_relation"], "mentions");
  405|      1|        assert_eq!(json["to_relation"], "related");
  406|      1|        assert_eq!(json["count"], 5);
  407|      1|        assert_eq!(json["merged_duplicates"], 0);
  408|      1|        assert_eq!(json["namespace"], "global");
  409|      1|        assert!(json["elapsed_ms"].is_number());
  410|      1|    }
  411|       |
  412|       |    #[test]
  413|      1|    fn response_action_dry_run() {
  414|      1|        let resp = make_response("dry_run", 10, 0);
  415|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  416|      1|        assert_eq!(json["action"], "dry_run");
  417|      1|        assert_eq!(json["count"], 10);
  418|      1|        assert_eq!(json["merged_duplicates"], 0);
  419|      1|    }
  420|       |
  421|       |    #[test]
  422|      1|    fn response_merged_duplicates_nonzero() {
  423|       |        // Simulates a case where 3 out of 10 edges collided with existing rows (values are hand-built, not computed by the handlers).
  424|      1|        let resp = make_response("reclassified", 7, 3);
  425|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  426|      1|        assert_eq!(json["count"], 7);
  427|      1|        assert_eq!(json["merged_duplicates"], 3);
  428|      1|    }
  429|       |
  430|       |    #[test]
  431|      1|    fn response_count_zero_when_nothing_matched() {
  432|      1|        let resp = make_response("reclassified", 0, 0);
  433|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  434|      1|        assert_eq!(json["count"], 0);
  435|      1|        assert_eq!(json["merged_duplicates"], 0);
  436|      1|    }
  437|       |
  438|       |    #[test]
  439|      1|    fn response_action_values_exhaustive() {
  440|      3|        for action in &["reclassified", "dry_run"] {
                          ^2
  441|      2|            let resp = make_response(action, 1, 0);
  442|      2|            let json = serde_json::to_value(&resp).expect("serialization");
  443|      2|            assert_eq!(json["action"], *action);
  444|       |        }
  445|      1|    }
  446|       |
  447|       |    #[test]
  448|      1|    fn response_from_and_to_relation_present() {
  449|      1|        let resp = ReclassifyRelationResponse {
  450|      1|            action: "reclassified".to_string(),
  451|      1|            from_relation: "uses".to_string(),
  452|      1|            to_relation: "depends_on".to_string(),
  453|      1|            count: 3,
  454|      1|            merged_duplicates: 1,
  455|      1|            namespace: "my-project".to_string(),
  456|      1|            elapsed_ms: 5,
  457|      1|        };
  458|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  459|      1|        assert_eq!(json["from_relation"], "uses");
  460|      1|        assert_eq!(json["to_relation"], "depends_on");
  461|      1|    }
  462|       |
  463|       |    #[test]
  464|      1|    fn same_relation_value_rejected_at_logic_level() {
  465|       |        // NOTE(review): this only asserts that two equal string literals compare equal.
  466|       |        // It never calls run(), so the from == to guard itself is not exercised here.
  467|      1|        let from = "mentions".to_string();
  468|      1|        let to = "mentions".to_string();
  469|      1|        assert!(
  470|      1|            from == to,
  471|      0|            "same-value rename must be caught before DB access"
  472|       |        );
  473|      1|    }
  474|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/related.rs:
    1|       |//! Handler for the `related` CLI subcommand.
    2|       |
    3|       |use crate::constants::{
    4|       |    DEFAULT_K_RECALL, DEFAULT_MAX_HOPS, DEFAULT_MIN_WEIGHT, TEXT_DESCRIPTION_PREVIEW_LEN,
    5|       |};
    6|       |use crate::errors::AppError;
    7|       |use crate::i18n::errors_msg;
    8|       |use crate::output::{self, OutputFormat};
    9|       |use crate::paths::AppPaths;
   10|       |use crate::storage::connection::open_ro;
   11|       |use rusqlite::{params, Connection};
   12|       |use serde::Serialize;
   13|       |use std::collections::{HashSet, VecDeque};
   14|       |
   15|       |/// Identifies whether the seed resolved to a memory or a bare entity.
   16|       |enum SeedKind {
   17|       |    Memory(i64), // id of the matching, non-deleted row in `memories`
   18|       |    Entity(i64), // id returned by storage::entities::find_entity_id when no memory matched
   19|       |}
   20|       |
   21|       |/// Tuple returned by the adjacency fetch: (neighbour_entity_id, source_name,
   22|       |/// target_name, relation, weight).
   23|       |type Neighbour = (i64, String, String, String, f64); // purely positional; the three String slots are distinguished only by order
   24|       |
   25|       |#[derive(clap::Args)]
   26|       |#[command(after_long_help = "EXAMPLES:\n  \
   27|       |    # List memories connected to a memory via the entity graph (default 2 hops)\n  \
   28|       |    sqlite-graphrag related onboarding\n\n  \
   29|       |    # Increase hop distance and filter by relation type\n  \
   30|       |    sqlite-graphrag related onboarding --max-hops 3 --relation related\n\n  \
   31|       |    # Cap result count and require minimum edge weight\n  \
   32|       |    sqlite-graphrag related onboarding --limit 5 --min-weight 0.5")]
   33|       |pub struct RelatedArgs {
   34|       |    /// Memory name as a positional argument. Alternative to `--name`.
   35|       |    #[arg(
   36|       |        value_name = "NAME",
   37|       |        conflicts_with = "name",
   38|       |        help = "Memory name whose neighbours to traverse; alternative to --name"
   39|       |    )]
   40|       |    pub name_positional: Option<String>, // run() prefers this over `name` when present
   41|       |    /// Memory name as a flag. Required when the positional form is absent. Also accepts the alias `--from`.
   42|       |    #[arg(long, alias = "from")]
   43|       |    pub name: Option<String>, // run() returns a Validation error when both forms are missing
   44|       |    /// Maximum graph hop count. Also accepts the alias `--hops`.
   45|       |    #[arg(long, alias = "hops", default_value_t = DEFAULT_MAX_HOPS)]
   46|       |    pub max_hops: u32,
   47|       |    /// Filter results to a specific relation type. Canonical values:
   48|       |    /// applies-to, uses, depends-on, causes, fixes, contradicts, supports,
   49|       |    /// follows, related, mentions, replaces, tracked-in.
   50|       |    /// Any kebab-case or snake_case string is also accepted as a custom relation.
   51|       |    #[arg(long, value_parser = crate::parsers::parse_relation)]
   52|       |    pub relation: Option<String>, // None means no relation filter
   53|       |    #[arg(long, default_value_t = DEFAULT_MIN_WEIGHT)]
   54|       |    pub min_weight: f64, // presumably the minimum edge weight kept during traversal — confirm in run()
   55|       |    #[arg(long, default_value_t = DEFAULT_K_RECALL)]
   56|       |    pub limit: usize, // presumably caps the number of results — confirm in run()
   57|       |    #[arg(long)]
   58|       |    pub namespace: Option<String>, // resolved through crate::namespace::resolve_namespace in run()
   59|       |    #[arg(long, value_enum, default_value = "json")]
   60|       |    pub format: OutputFormat,
   61|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   62|       |    pub json: bool, // hidden flag; its help text declares it a no-op
   63|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   64|       |    pub db: Option<String>, // passed to AppPaths::resolve in run()
   65|       |}
   66|       |
   67|       |#[derive(Serialize)]
   68|       |struct RelatedResponse {
   69|       |    /// Echo of the seed memory name resolved from `--name` or the positional argument.
   70|       |    /// Added in v1.0.35 for input transparency in JSON output.
   71|       |    name: String,
   72|       |    /// Echo of the resolved `--max-hops` value (default 2). Added in v1.0.35.
   73|       |    max_hops: u32,
   74|       |    results: Vec<RelatedMemory>, // primary result list
   75|       |    /// Semantic alias of `results` following the v1.0.66 alias pattern (list has items/memories).
   76|       |    related_memories: Vec<RelatedMemory>, // an owned second Vec, so the payload is serialized under both keys
   77|       |    elapsed_ms: u64, // presumably milliseconds since run() started — confirm where the struct is built
   78|       |}
   79|       |
   80|       |#[derive(Serialize, Clone)]
   81|       |struct RelatedMemory {
   82|       |    memory_id: i64,
   83|       |    name: String,
   84|       |    namespace: String,
   85|       |    #[serde(rename = "type")]
   86|       |    memory_type: String, // emitted under the JSON key "type" (a reserved word in Rust, hence the rename)
   87|       |    description: String,
   88|       |    hop_distance: u32,
   89|       |    source_entity: Option<String>, // no skip attribute: serialized as null when None
   90|       |    target_entity: Option<String>, // no skip attribute: serialized as null when None
   91|       |    /// Alias of `source_entity` for cross-command consistency (graph, link, deep-research use from/to).
   92|       |    #[serde(skip_serializing_if = "Option::is_none")]
   93|       |    from: Option<String>, // unlike source_entity, the key is omitted entirely when None
   94|       |    /// Alias of `target_entity` for cross-command consistency.
   95|       |    #[serde(skip_serializing_if = "Option::is_none")]
   96|       |    to: Option<String>, // unlike target_entity, the key is omitted entirely when None
   97|       |    relation: Option<String>,
   98|       |    weight: Option<f64>,
   99|       |}
  100|       |
  101|      0|pub fn run(args: RelatedArgs) -> Result<(), AppError> {
  102|      0|    let inicio = std::time::Instant::now();
  103|      0|    let name = args
  104|      0|        .name_positional
  105|      0|        .as_deref()
  106|      0|        .or(args.name.as_deref())
  107|      0|        .ok_or_else(|| {
  108|      0|            AppError::Validation(
  109|      0|                "name required: pass as positional argument or via --name".to_string(),
  110|      0|            )
  111|      0|        })?
  112|      0|        .to_string();
  113|       |
  114|      0|    if name.trim().is_empty() {
  115|      0|        return Err(AppError::Validation("name must not be empty".to_string()));
  116|      0|    }
  117|       |
  118|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
  119|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
  120|       |
  121|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
  122|       |
  123|      0|    let conn = open_ro(&paths.db)?;
  124|       |
  125|       |    // Locate the seed: try memory first, fall back to bare entity.
  126|      0|    let seed = match conn.query_row(
  127|      0|        "SELECT id FROM memories WHERE namespace = ?1 AND name = ?2 AND deleted_at IS NULL",
  128|      0|        params![namespace, name],
  129|      0|        |r| r.get::<_, i64>(0),
  130|       |    ) {
  131|      0|        Ok(id) => SeedKind::Memory(id),
  132|       |        Err(rusqlite::Error::QueryReturnedNoRows) => {
  133|      0|            match crate::storage::entities::find_entity_id(&conn, &namespace, &name)? {
  134|      0|                Some(id) => SeedKind::Entity(id),
  135|       |                None => {
  136|      0|                    return Err(AppError::NotFound(errors_msg::memory_or_entity_not_found(
  137|      0|                        &name, &namespace,
  138|      0|                    )))
  139|       |                }
  140|       |            }
  141|       |        }
  142|      0|        Err(e) => return Err(AppError::Database(e)),
  143|       |    };
  144|       |
  145|       |    // Collect seed entity IDs depending on seed kind.
  146|      0|    let (seed_memory_id, seed_entity_ids): (i64, Vec<i64>) = match &seed {
  147|      0|        SeedKind::Memory(id) => {
  148|      0|            let mem_id = *id;
  149|      0|            let mut stmt =
  150|      0|                conn.prepare_cached("SELECT entity_id FROM memory_entities WHERE memory_id = ?1")?;
  151|      0|            let rows: Vec<i64> = stmt
  152|      0|                .query_map(params![mem_id], |r| r.get(0))?
  153|      0|                .collect::<Result<Vec<i64>, _>>()?;
  154|      0|            (mem_id, rows)
  155|       |        }
  156|      0|        SeedKind::Entity(entity_id) => {
  157|       |            // For a bare entity seed there is no corresponding memory to skip.
  158|       |            // Use a sentinel -1 so dedup never matches a real memory_id.
  159|      0|            (-1, vec![*entity_id])
  160|       |        }
  161|       |    };
  162|       |
  163|      0|    let relation_filter = args.relation;
  164|      0|    if let Some(ref r) = relation_filter {
  165|      0|        crate::parsers::warn_if_non_canonical(r);
  166|      0|    }
  167|      0|    let results = traverse_related(
  168|      0|        &conn,
  169|      0|        seed_memory_id,
  170|      0|        &seed_entity_ids,
  171|      0|        &namespace,
  172|      0|        args.max_hops,
  173|      0|        args.min_weight,
  174|      0|        relation_filter.as_deref(),
  175|      0|        args.limit,
  176|      0|    )?;
  177|       |
  178|      0|    match args.format {
  179|       |        OutputFormat::Json => {
  180|      0|            let related_memories = results.clone();
  181|      0|            output::emit_json(&RelatedResponse {
  182|      0|                name: name.clone(),
  183|      0|                max_hops: args.max_hops,
  184|      0|                results,
  185|      0|                related_memories,
  186|      0|                elapsed_ms: inicio.elapsed().as_millis() as u64,
  187|      0|            })?;
  188|       |        }
  189|       |        OutputFormat::Text => {
  190|      0|            for item in &results {
  191|      0|                if item.description.is_empty() {
  192|      0|                    output::emit_text(&format!(
  193|      0|                        "{}. {} ({})",
  194|      0|                        item.hop_distance, item.name, item.namespace
  195|      0|                    ));
  196|      0|                } else {
  197|      0|                    let preview: String = item
  198|      0|                        .description
  199|      0|                        .chars()
  200|      0|                        .take(TEXT_DESCRIPTION_PREVIEW_LEN)
  201|      0|                        .collect();
  202|      0|                    output::emit_text(&format!(
  203|      0|                        "{}. {} ({}): {}",
  204|      0|                        item.hop_distance, item.name, item.namespace, preview
  205|      0|                    ));
  206|      0|                }
  207|       |            }
  208|       |        }
  209|       |        OutputFormat::Markdown => {
  210|      0|            for item in &results {
  211|      0|                if item.description.is_empty() {
  212|      0|                    output::emit_text(&format!(
  213|      0|                        "- **{}** ({}) — hop {}",
  214|      0|                        item.name, item.namespace, item.hop_distance
  215|      0|                    ));
  216|      0|                } else {
  217|      0|                    let preview: String = item
  218|      0|                        .description
  219|      0|                        .chars()
  220|      0|                        .take(TEXT_DESCRIPTION_PREVIEW_LEN)
  221|      0|                        .collect();
  222|      0|                    output::emit_text(&format!(
  223|      0|                        "- **{}** ({}) — hop {}: {}",
  224|      0|                        item.name, item.namespace, item.hop_distance, preview
  225|      0|                    ));
  226|      0|                }
  227|       |            }
  228|       |        }
  229|       |    }
  230|       |
  231|      0|    Ok(())
  232|      0|}
  233|       |
  234|       |#[allow(clippy::too_many_arguments)]
  235|      4|fn traverse_related(
  236|      4|    conn: &Connection,
  237|      4|    seed_memory_id: i64,
  238|      4|    seed_entity_ids: &[i64],
  239|      4|    namespace: &str,
  240|      4|    max_hops: u32,
  241|      4|    min_weight: f64,
  242|      4|    relation_filter: Option<&str>,
  243|      4|    limit: usize,
  244|      4|) -> Result<Vec<RelatedMemory>, AppError> {
  245|      4|    if seed_entity_ids.is_empty() || max_hops == 0 {
                                                   ^3
  246|      2|        return Ok(Vec::new());
  247|      2|    }
  248|       |
  249|       |    // BFS over entities keeping track of hop distance and the (source, target, relation, weight)
  250|       |    // of the edge that first reached each entity.
  251|      2|    let mut visited: HashSet<i64> = seed_entity_ids.iter().copied().collect();
  252|      2|    let mut entity_hop: crate::hash::AHashMap<i64, u32> =
  253|      2|        crate::hash::AHashMap::with_capacity_and_hasher(max_hops as usize * 10, Default::default());
  254|      4|    for &e in seed_entity_ids {
                       ^2
  255|      2|        entity_hop.insert(e, 0);
  256|      2|    }
  257|       |    // Per-entity edge info: source_name, target_name, relation, weight (captures the FIRST edge
  258|       |    // that reached this entity — equivalent to BFS shortest path recall edge).
  259|      2|    let mut entity_edge: crate::hash::AHashMap<i64, (String, String, String, f64)> =
  260|      2|        crate::hash::AHashMap::with_capacity_and_hasher(max_hops as usize * 10, Default::default());
  261|       |
  262|      2|    let mut queue: VecDeque<i64> = seed_entity_ids.iter().copied().collect();
  263|       |
  264|     10|    while let Some(current_entity) = queue.pop_front() {
                                 ^8
  265|      8|        let current_hop = *entity_hop.get(&current_entity).unwrap_or(&0);
  266|      8|        if current_hop >= max_hops {
  267|      5|            continue;
  268|      3|        }
  269|       |
  270|      3|        let neighbours =
  271|      3|            fetch_neighbours(conn, current_entity, namespace, min_weight, relation_filter)?;
                                                                                                        ^0
  272|       |
  273|     10|        for (neighbour_id, source_name, target_name, relation, weight) in neighbours {
                           ^7            ^7           ^7           ^7        ^7
  274|      7|            if visited.insert(neighbour_id) {
  275|      6|                entity_hop.insert(neighbour_id, current_hop + 1);
  276|      6|                entity_edge.insert(neighbour_id, (source_name, target_name, relation, weight));
  277|      6|                queue.push_back(neighbour_id);
  278|      6|            }
                          ^1
  279|       |        }
  280|       |    }
  281|       |
  282|       |    // For each discovered entity (hop >= 1) find its memories, skipping the seed memory.
  283|      2|    let mut out: Vec<RelatedMemory> = Vec::with_capacity(limit);
  284|      2|    let mut dedup_ids: crate::hash::AHashSet<i64> =
  285|      2|        crate::hash::AHashSet::with_capacity_and_hasher(limit, Default::default());
  286|      2|    dedup_ids.insert(seed_memory_id);
  287|       |
  288|       |    // Sort entities by hop ASC, weight DESC so we emit closer entities first.
  289|      2|    let mut ordered_entities: Vec<(i64, u32)> = entity_hop
  290|      2|        .iter()
  291|      8|        .filter(|(id, _)| !seed_entity_ids.contains(id))
                       ^2
  292|      6|        .map(|(id, hop)| (*id, *hop))
                       ^2
  293|      2|        .collect();
  294|      4|    ordered_entities.sort_by(|a, b| {
                  ^2               ^2
  295|      4|        let weight_a = entity_edge.get(&a.0).map(|e| e.3).unwrap_or(0.0);
  296|      4|        let weight_b = entity_edge.get(&b.0).map(|e| e.3).unwrap_or(0.0);
  297|      4|        a.1.cmp(&b.1).then_with(|| {
  298|      4|            weight_b
  299|      4|                .partial_cmp(&weight_a)
  300|      4|                .unwrap_or(std::cmp::Ordering::Equal)
  301|      4|        })
  302|      4|    });
  303|       |
  304|      5|    for (entity_id, hop) in ordered_entities {
                       ^4         ^4
  305|      4|        let mut stmt = conn.prepare_cached(
  306|      4|            "SELECT m.id, m.name, m.namespace, m.type, m.description
  307|      4|             FROM memory_entities me
  308|      4|             JOIN memories m ON m.id = me.memory_id
  309|      4|             WHERE me.entity_id = ?1 AND m.deleted_at IS NULL",
  310|      0|        )?;
  311|      4|        let rows = stmt
  312|      4|            .query_map(params![entity_id], |r| {
  313|       |                Ok((
  314|      4|                    r.get::<_, i64>(0)?,
                                                    ^0
  315|      4|                    r.get::<_, String>(1)?,
                                                       ^0
  316|      4|                    r.get::<_, String>(2)?,
                                                       ^0
  317|      4|                    r.get::<_, String>(3)?,
                                                       ^0
  318|      4|                    r.get::<_, String>(4)?,
                                                       ^0
  319|       |                ))
  320|      4|            })?
                            ^0
  321|      4|            .collect::<Result<Vec<_>, _>>()?;
                                                         ^0
  322|       |
  323|      7|        for (mid, name, ns, mtype, desc) in rows {
                           ^4   ^4    ^4  ^4     ^4
  324|      4|            if !dedup_ids.insert(mid) {
  325|      0|                continue;
  326|      4|            }
  327|      4|            let edge = entity_edge.get(&entity_id);
  328|      4|            let src = edge.map(|e| e.0.clone());
  329|      4|            let tgt = edge.map(|e| e.1.clone());
  330|      4|            out.push(RelatedMemory {
  331|      4|                memory_id: mid,
  332|      4|                name,
  333|      4|                namespace: ns,
  334|      4|                memory_type: mtype,
  335|      4|                description: desc,
  336|      4|                hop_distance: hop,
  337|      4|                source_entity: src.clone(),
  338|      4|                target_entity: tgt.clone(),
  339|      4|                from: src,
  340|      4|                to: tgt,
  341|      4|                relation: edge.map(|e| e.2.clone()),
  342|      4|                weight: edge.map(|e| e.3),
  343|       |            });
  344|      4|            if out.len() >= limit {
  345|      1|                return Ok(out);
  346|      3|            }
  347|       |        }
  348|       |    }
  349|       |
  350|      1|    Ok(out)
  351|      4|}
  352|       |
  353|      3|fn fetch_neighbours(
  354|      3|    conn: &Connection,
  355|      3|    entity_id: i64,
  356|      3|    namespace: &str,
  357|      3|    min_weight: f64,
  358|      3|    relation_filter: Option<&str>,
  359|      3|) -> Result<Vec<Neighbour>, AppError> {
  360|       |    // Follow edges in both directions: source -> target and target -> source so traversal is
  361|       |    // undirected, which is how users typically reason about "related" memories.
  362|      3|    let base_sql = "\
  363|      3|        SELECT r.target_id, se.name, te.name, r.relation, r.weight
  364|      3|        FROM relationships r
  365|      3|        JOIN entities se ON se.id = r.source_id
  366|      3|        JOIN entities te ON te.id = r.target_id
  367|      3|        WHERE r.source_id = ?1 AND r.weight >= ?2 AND r.namespace = ?3";
  368|       |
  369|      3|    let reverse_sql = "\
  370|      3|        SELECT r.source_id, se.name, te.name, r.relation, r.weight
  371|      3|        FROM relationships r
  372|      3|        JOIN entities se ON se.id = r.source_id
  373|      3|        JOIN entities te ON te.id = r.target_id
  374|      3|        WHERE r.target_id = ?1 AND r.weight >= ?2 AND r.namespace = ?3";
  375|       |
  376|      3|    let mut results: Vec<Neighbour> = Vec::with_capacity(16);
  377|       |
  378|      3|    let forward_sql = match relation_filter {
  379|      0|        Some(_) => format!("{base_sql} AND r.relation = ?4"),
  380|      3|        None => base_sql.to_string(),
  381|       |    };
  382|      3|    let rev_sql = match relation_filter {
  383|      0|        Some(_) => format!("{reverse_sql} AND r.relation = ?4"),
  384|      3|        None => reverse_sql.to_string(),
  385|       |    };
  386|       |
  387|      3|    let mut stmt = conn.prepare_cached(&forward_sql)?;
                                                                  ^0
  388|      3|    let rows: Vec<_> = if let Some(rel) = relation_filter {
                                                 ^0
  389|      0|        stmt.query_map(params![entity_id, min_weight, namespace, rel], |r| {
  390|       |            Ok((
  391|      0|                r.get::<_, i64>(0)?,
  392|      0|                r.get::<_, String>(1)?,
  393|      0|                r.get::<_, String>(2)?,
  394|      0|                r.get::<_, String>(3)?,
  395|      0|                r.get::<_, f64>(4)?,
  396|       |            ))
  397|      0|        })?
  398|      0|        .collect::<Result<Vec<_>, _>>()?
  399|       |    } else {
  400|      6|        stmt.query_map(params![entity_id, min_weight, namespace], |r| {
                      ^3   ^3        ^3
  401|       |            Ok((
  402|      6|                r.get::<_, i64>(0)?,
                                                ^0
  403|      6|                r.get::<_, String>(1)?,
                                                   ^0
  404|      6|                r.get::<_, String>(2)?,
                                                   ^0
  405|      6|                r.get::<_, String>(3)?,
                                                   ^0
  406|      6|                r.get::<_, f64>(4)?,
                                                ^0
  407|       |            ))
  408|      6|        })?
                        ^0
  409|      3|        .collect::<Result<Vec<_>, _>>()?
                                                     ^0
  410|       |    };
  411|      3|    results.extend(rows);
  412|       |
  413|      3|    let mut stmt = conn.prepare_cached(&rev_sql)?;
                                                              ^0
  414|      3|    let rows: Vec<_> = if let Some(rel) = relation_filter {
                                                 ^0
  415|      0|        stmt.query_map(params![entity_id, min_weight, namespace, rel], |r| {
  416|       |            Ok((
  417|      0|                r.get::<_, i64>(0)?,
  418|      0|                r.get::<_, String>(1)?,
  419|      0|                r.get::<_, String>(2)?,
  420|      0|                r.get::<_, String>(3)?,
  421|      0|                r.get::<_, f64>(4)?,
  422|       |            ))
  423|      0|        })?
  424|      0|        .collect::<Result<Vec<_>, _>>()?
  425|       |    } else {
  426|      3|        stmt.query_map(params![entity_id, min_weight, namespace], |r| {
                                                                                    ^1
  427|       |            Ok((
  428|      1|                r.get::<_, i64>(0)?,
                                                ^0
  429|      1|                r.get::<_, String>(1)?,
                                                   ^0
  430|      1|                r.get::<_, String>(2)?,
                                                   ^0
  431|      1|                r.get::<_, String>(3)?,
                                                   ^0
  432|      1|                r.get::<_, f64>(4)?,
                                                ^0
  433|       |            ))
  434|      1|        })?
                        ^0
  435|      3|        .collect::<Result<Vec<_>, _>>()?
                                                     ^0
  436|       |    };
  437|      3|    results.extend(rows);
  438|       |
  439|      3|    Ok(results)
  440|      3|}
  441|       |
  442|       |#[cfg(test)]
  443|       |mod tests {
  444|       |    use super::*;
  445|       |
  446|      4|    fn setup_related_db() -> rusqlite::Connection {
  447|      4|        let conn = rusqlite::Connection::open_in_memory().expect("failed to open in-memory db");
  448|      4|        conn.execute_batch(
  449|      4|            "CREATE TABLE memories (
  450|      4|                id INTEGER PRIMARY KEY AUTOINCREMENT,
  451|      4|                name TEXT NOT NULL,
  452|      4|                namespace TEXT NOT NULL DEFAULT 'global',
  453|      4|                type TEXT NOT NULL DEFAULT 'fact',
  454|      4|                description TEXT NOT NULL DEFAULT '',
  455|      4|                deleted_at INTEGER
  456|      4|            );
  457|      4|            CREATE TABLE entities (
  458|      4|                id INTEGER PRIMARY KEY AUTOINCREMENT,
  459|      4|                namespace TEXT NOT NULL,
  460|      4|                name TEXT NOT NULL
  461|      4|            );
  462|      4|            CREATE TABLE relationships (
  463|      4|                id INTEGER PRIMARY KEY AUTOINCREMENT,
  464|      4|                namespace TEXT NOT NULL,
  465|      4|                source_id INTEGER NOT NULL,
  466|      4|                target_id INTEGER NOT NULL,
  467|      4|                relation TEXT NOT NULL DEFAULT 'related_to',
  468|      4|                weight REAL NOT NULL DEFAULT 1.0
  469|      4|            );
  470|      4|            CREATE TABLE memory_entities (
  471|      4|                memory_id INTEGER NOT NULL,
  472|      4|                entity_id INTEGER NOT NULL
  473|      4|            );",
  474|       |        )
  475|      4|        .expect("failed to create test tables");
  476|      4|        conn
  477|      4|    }
  478|       |
  479|      9|    fn insert_memory(conn: &rusqlite::Connection, name: &str, namespace: &str) -> i64 {
  480|      9|        conn.execute(
  481|      9|            "INSERT INTO memories (name, namespace) VALUES (?1, ?2)",
  482|      9|            rusqlite::params![name, namespace],
  483|       |        )
  484|      9|        .expect("failed to insert memory");
  485|      9|        conn.last_insert_rowid()
  486|      9|    }
  487|       |
  488|      9|    fn insert_entity(conn: &rusqlite::Connection, name: &str, namespace: &str) -> i64 {
  489|      9|        conn.execute(
  490|      9|            "INSERT INTO entities (name, namespace) VALUES (?1, ?2)",
  491|      9|            rusqlite::params![name, namespace],
  492|       |        )
  493|      9|        .expect("failed to insert entity");
  494|      9|        conn.last_insert_rowid()
  495|      9|    }
  496|       |
  497|      9|    fn link_memory_entity(conn: &rusqlite::Connection, memory_id: i64, entity_id: i64) {
  498|      9|        conn.execute(
  499|      9|            "INSERT INTO memory_entities (memory_id, entity_id) VALUES (?1, ?2)",
  500|      9|            rusqlite::params![memory_id, entity_id],
  501|       |        )
  502|      9|        .expect("failed to link memory-entity");
  503|      9|    }
  504|       |
  505|      6|    fn insert_relationship(
  506|      6|        conn: &rusqlite::Connection,
  507|      6|        namespace: &str,
  508|      6|        source_id: i64,
  509|      6|        target_id: i64,
  510|      6|        relation: &str,
  511|      6|        weight: f64,
  512|      6|    ) {
  513|      6|        conn.execute(
  514|      6|            "INSERT INTO relationships (namespace, source_id, target_id, relation, weight)
  515|      6|             VALUES (?1, ?2, ?3, ?4, ?5)",
  516|      6|            rusqlite::params![namespace, source_id, target_id, relation, weight],
  517|       |        )
  518|      6|        .expect("failed to insert relationship");
  519|      6|    }
  520|       |
  521|       |    #[test]
  522|      1|    fn related_response_serializes_results_and_elapsed_ms() {
  523|      1|        let mem = RelatedMemory {
  524|      1|            memory_id: 1,
  525|      1|            name: "neighbor-mem".to_string(),
  526|      1|            namespace: "global".to_string(),
  527|      1|            memory_type: "document".to_string(),
  528|      1|            description: "desc".to_string(),
  529|      1|            hop_distance: 1,
  530|      1|            source_entity: Some("entity-a".to_string()),
  531|      1|            target_entity: Some("entity-b".to_string()),
  532|      1|            from: Some("entity-a".to_string()),
  533|      1|            to: Some("entity-b".to_string()),
  534|      1|            relation: Some("related_to".to_string()),
  535|      1|            weight: Some(0.9),
  536|      1|        };
  537|      1|        let resp = RelatedResponse {
  538|      1|            name: "seed-mem".to_string(),
  539|      1|            max_hops: 2,
  540|      1|            related_memories: vec![mem.clone()],
  541|      1|            results: vec![mem],
  542|      1|            elapsed_ms: 7,
  543|      1|        };
  544|       |
  545|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  546|      1|        assert!(json["results"].is_array());
  547|      1|        assert_eq!(json["results"].as_array().unwrap().len(), 1);
  548|      1|        assert_eq!(json["elapsed_ms"], 7u64);
  549|      1|        assert_eq!(json["results"][0]["type"], "document");
  550|      1|        assert_eq!(json["results"][0]["hop_distance"], 1);
  551|      1|    }
  552|       |
  553|       |    #[test]
  554|      1|    fn traverse_related_returns_empty_without_seed_entities() {
  555|      1|        let conn = setup_related_db();
  556|      1|        let result = traverse_related(&conn, 1, &[], "global", 2, 0.0, None, 10)
  557|      1|            .expect("traverse_related failed");
  558|      1|        assert!(
  559|      1|            result.is_empty(),
  560|      0|            "no seed entities must yield empty results"
  561|       |        );
  562|      1|    }
  563|       |
  564|       |    #[test]
  565|      1|    fn traverse_related_returns_empty_with_max_hops_zero() {
  566|      1|        let conn = setup_related_db();
  567|      1|        let mem_id = insert_memory(&conn, "seed-mem", "global");
  568|      1|        let ent_id = insert_entity(&conn, "ent-a", "global");
  569|      1|        link_memory_entity(&conn, mem_id, ent_id);
  570|       |
  571|      1|        let result = traverse_related(&conn, mem_id, &[ent_id], "global", 0, 0.0, None, 10)
  572|      1|            .expect("traverse_related failed");
  573|      1|        assert!(result.is_empty(), "max_hops=0 must return empty");
                                                 ^0
  574|      1|    }
  575|       |
  576|       |    #[test]
  577|      1|    fn traverse_related_discovers_neighbor_memory_via_graph() {
  578|      1|        let conn = setup_related_db();
  579|       |
  580|      1|        let seed_id = insert_memory(&conn, "seed-mem", "global");
  581|      1|        let neighbor_id = insert_memory(&conn, "neighbor-mem", "global");
  582|      1|        let ent_a = insert_entity(&conn, "ent-a", "global");
  583|      1|        let ent_b = insert_entity(&conn, "ent-b", "global");
  584|       |
  585|      1|        link_memory_entity(&conn, seed_id, ent_a);
  586|      1|        link_memory_entity(&conn, neighbor_id, ent_b);
  587|      1|        insert_relationship(&conn, "global", ent_a, ent_b, "related_to", 1.0);
  588|       |
  589|      1|        let result = traverse_related(&conn, seed_id, &[ent_a], "global", 2, 0.0, None, 10)
  590|      1|            .expect("traverse_related failed");
  591|       |
  592|      1|        assert_eq!(result.len(), 1, "must find 1 neighboring memory");
                                                  ^0
  593|      1|        assert_eq!(result[0].name, "neighbor-mem");
  594|      1|        assert_eq!(result[0].hop_distance, 1);
  595|      1|    }
  596|       |
  597|       |    #[test]
  598|      1|    fn traverse_related_respects_limit() {
  599|      1|        let conn = setup_related_db();
  600|       |
  601|      1|        let seed_id = insert_memory(&conn, "seed", "global");
  602|      1|        let ent_seed = insert_entity(&conn, "ent-seed", "global");
  603|      1|        link_memory_entity(&conn, seed_id, ent_seed);
  604|       |
  605|      6|        for i in 0..5 {
                          ^5
  606|      5|            let mem_id = insert_memory(&conn, &format!("neighbor-{i}"), "global");
  607|      5|            let ent_id = insert_entity(&conn, &format!("ent-{i}"), "global");
  608|      5|            link_memory_entity(&conn, mem_id, ent_id);
  609|      5|            insert_relationship(&conn, "global", ent_seed, ent_id, "related_to", 1.0);
  610|      5|        }
  611|       |
  612|      1|        let result = traverse_related(&conn, seed_id, &[ent_seed], "global", 1, 0.0, None, 3)
  613|      1|            .expect("traverse_related failed");
  614|       |
  615|      1|        assert!(
  616|      1|            result.len() <= 3,
  617|      0|            "limit=3 must constrain to at most 3 results"
  618|       |        );
  619|      1|    }
  620|       |
  621|       |    #[test]
  622|      1|    fn related_memory_optional_null_fields_serialized() {
  623|      1|        let mem = RelatedMemory {
  624|      1|            memory_id: 99,
  625|      1|            name: "no-relation".to_string(),
  626|      1|            namespace: "ns".to_string(),
  627|      1|            memory_type: "concept".to_string(),
  628|      1|            description: "".to_string(),
  629|      1|            hop_distance: 2,
  630|      1|            source_entity: None,
  631|      1|            target_entity: None,
  632|      1|            from: None,
  633|      1|            to: None,
  634|      1|            relation: None,
  635|      1|            weight: None,
  636|      1|        };
  637|       |
  638|      1|        let json = serde_json::to_value(&mem).expect("serialization failed");
  639|      1|        assert!(json["source_entity"].is_null());
  640|      1|        assert!(json["target_entity"].is_null());
  641|      1|        assert!(json["relation"].is_null());
  642|      1|        assert!(json["weight"].is_null());
  643|      1|        assert_eq!(json["hop_distance"], 2);
  644|      1|    }
  645|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/remember.rs:
    1|       |//! Handler for the `remember` CLI subcommand.
    2|       |
    3|       |use crate::chunking;
    4|       |use crate::cli::MemoryType;
    5|       |use crate::entity_type::EntityType;
    6|       |use crate::errors::AppError;
    7|       |use crate::i18n::errors_msg;
    8|       |use crate::output::{self, JsonOutputFormat, RememberResponse};
    9|       |use crate::paths::AppPaths;
   10|       |use crate::storage::chunks as storage_chunks;
   11|       |use crate::storage::connection::{ensure_schema, open_rw};
   12|       |use crate::storage::entities::{NewEntity, NewRelationship};
   13|       |use crate::storage::memories::NewMemory;
   14|       |use crate::storage::{entities, memories, urls as storage_urls, versions};
   15|       |use serde::Deserialize;
   16|       |
   17|       |/// Returns the number of rows that will be written to `memory_chunks` for the
   18|       |/// given chunk count. Single-chunk bodies are stored directly in the
   19|       |/// `memories` row, so no chunk row is appended (returns `0`). Multi-chunk
   20|       |/// bodies persist every chunk and the count equals `chunks_created`.
   21|       |///
   22|       |/// Centralized as a function so the H-M8 invariant is unit-testable without
   23|       |/// running the full handler. The schema for `chunks_persisted` documents this
   24|       |/// contract explicitly (see `docs/schemas/remember.schema.json`).
   25|      5|fn compute_chunks_persisted(chunks_created: usize) -> usize {
   26|      5|    if chunks_created > 1 {
   27|      3|        chunks_created
   28|       |    } else {
   29|      2|        0
   30|       |    }
   31|      5|}
   32|       |
   33|       |#[derive(clap::Args)]
   34|       |#[command(after_long_help = "EXAMPLES:\n  \
   35|       |    # Create a memory with inline body\n  \
   36|       |    sqlite-graphrag remember --name design-auth --type decision \\\n    \
   37|       |    --description \"auth design\" --body \"JWT for stateless auth\"\n\n  \
   38|       |    # Create with curated graph via --graph-stdin\n  \
   39|       |    echo '{\"body\":\"...\",\"entities\":[],\"relationships\":[]}' | \\\n    \
   40|       |    sqlite-graphrag remember --name my-mem --type note --description \"desc\" --graph-stdin\n\n  \
   41|       |    # Enable GLiNER NER extraction with --graph-stdin\n  \
   42|       |    echo '{\"body\":\"Alice from Microsoft...\",\"entities\":[],\"relationships\":[]}' | \\\n    \
   43|       |    sqlite-graphrag remember --name ner-test --type note --description \"test\" \\\n    \
   44|       |    --graph-stdin --enable-ner --gliner-variant int8\n\n  \
   45|       |    # Idempotent upsert with --force-merge\n  \
   46|       |    sqlite-graphrag remember --name my-mem --type note --description \"updated\" \\\n    \
   47|       |    --body \"new content\" --force-merge\n\n\
   48|       |NOTE:\n  \
   49|       |    remember does NOT accept positional arguments.\n  \
   50|       |    Use --body \"text\" for inline content\n  \
   51|       |    Use --body-file path for file content\n  \
   52|       |    Use --body-stdin for piped content\n  \
   53|       |    Use --graph-stdin for JSON with entities and relationships\n\n\
   54|       |ENTITY TYPES (for --graph-stdin entities, NOT memory --type):\n  \
   55|       |    concept, tool, person, file, project, decision, incident,\n  \
   56|       |    organization, location, date, dashboard, issue_tracker, memory\n  \
   57|       |    WARNING: reference, skill, document, note, user, feedback are\n  \
   58|       |    MEMORY types only — NOT valid for entities.\n  \
   59|       |    Mapping: reference→concept, document→file, user→person")]
   60|       |pub struct RememberArgs {
   61|       |    /// Memory name in kebab-case (lowercase letters, digits, hyphens).
   62|       |    /// Acts as unique key within the namespace; collisions trigger merge or rejection.
   63|       |    #[arg(long)]
   64|       |    pub name: String,
   65|       |    #[arg(
   66|       |        long,
   67|       |        value_enum,
   68|       |        long_help = "Memory kind stored in `memories.type`. Required when creating a new memory. Optional with --force-merge: if omitted the existing memory type is inherited. This is NOT the graph `entity_type` used in `--entities-file`. Valid values: user, feedback, project, reference, decision, incident, skill, document, note."
   69|       |    )]
   70|       |    pub r#type: Option<MemoryType>,
   71|       |    /// Short description (≤500 chars) summarizing the memory for use in `list` and `recall` snippets.
   72|       |    /// Required when creating a new memory. Optional with --force-merge: if omitted the existing description is inherited.
   73|       |    #[arg(long)]
   74|       |    pub description: Option<String>,
   75|       |    /// Inline body content. Mutually exclusive with --body-file, --body-stdin, --graph-stdin.
   76|       |    /// Maximum 512000 bytes; rejected if empty without an external graph.
   77|       |    #[arg(
   78|       |        long,
   79|       |        help = "Inline body content (max 500 KB / 512000 bytes; for larger inputs split into multiple memories or use --body-file)",
   80|       |        conflicts_with_all = ["body_file", "body_stdin", "graph_stdin"]
   81|       |    )]
   82|       |    pub body: Option<String>,
   83|       |    #[arg(
   84|       |        long,
   85|       |        help = "Read body from a file instead of --body",
   86|       |        conflicts_with_all = ["body", "body_stdin", "graph_stdin"]
   87|       |    )]
   88|       |    pub body_file: Option<std::path::PathBuf>,
   89|       |    /// Read body from stdin until EOF. Useful in pipes (echo "..." | sqlite-graphrag remember ...).
   90|       |    /// Mutually exclusive with --body, --body-file, --graph-stdin.
   91|       |    #[arg(
   92|       |        long,
   93|       |        conflicts_with_all = ["body", "body_file", "graph_stdin"]
   94|       |    )]
   95|       |    pub body_stdin: bool,
   96|       |    #[arg(
   97|       |        long,
   98|       |        help = "JSON file containing entities to associate with this memory"
   99|       |    )]
  100|       |    pub entities_file: Option<std::path::PathBuf>,
  101|       |    #[arg(
  102|       |        long,
  103|       |        help = "JSON file containing relationships to associate with this memory"
  104|       |    )]
  105|       |    pub relationships_file: Option<std::path::PathBuf>,
  106|       |    #[arg(
  107|       |        long,
  108|       |        help = "Read graph JSON (body + entities + relationships) from stdin",
  109|       |        conflicts_with_all = [
  110|       |            "body",
  111|       |            "body_file",
  112|       |            "body_stdin",
  113|       |            "entities_file",
  114|       |            "relationships_file"
  115|       |        ]
  116|       |    )]
  117|       |    pub graph_stdin: bool,
  118|       |    #[arg(
  119|       |        long,
  120|       |        help = "Namespace (env: SQLITE_GRAPHRAG_NAMESPACE, default: global)"
  121|       |    )]
  122|       |    pub namespace: Option<String>,
  123|       |    /// Inline JSON object with arbitrary metadata key-value pairs. Mutually exclusive with --metadata-file.
  124|       |    #[arg(long)]
  125|       |    pub metadata: Option<String>,
  126|       |    #[arg(long, help = "JSON file containing metadata key-value pairs")]
  127|       |    pub metadata_file: Option<std::path::PathBuf>,
  128|       |    #[arg(long)]
  129|       |    pub force_merge: bool,
  130|       |    #[arg(
  131|       |        long,
  132|       |        value_name = "EPOCH_OR_RFC3339",
  133|       |        value_parser = crate::parsers::parse_expected_updated_at,
  134|       |        long_help = "Optimistic lock: reject if updated_at does not match. \
  135|       |Accepts Unix epoch (e.g. 1700000000) or RFC 3339 (e.g. 2026-04-19T12:00:00Z)."
  136|       |    )]
  137|       |    pub expected_updated_at: Option<i64>,
  138|       |    #[arg(
  139|       |        long,
  140|       |        env = "SQLITE_GRAPHRAG_ENABLE_NER",
  141|       |        value_parser = crate::parsers::parse_bool_flexible,
  142|       |        action = clap::ArgAction::Set,
  143|       |        num_args = 0..=1,
  144|       |        default_missing_value = "true",
  145|       |        default_value = "false",
  146|       |        help = "Enable automatic GLiNER NER entity/relationship extraction from body"
  147|       |    )]
  148|       |    pub enable_ner: bool,
  149|       |    #[arg(
  150|       |        long,
  151|       |        env = "SQLITE_GRAPHRAG_GLINER_VARIANT",
  152|       |        default_value = "fp32",
  153|       |        help = "GLiNER model variant: fp32 (1.1GB, best quality), fp16 (580MB), int8 (349MB, fastest but may miss entities on short texts), q4, q4f16"
  154|       |    )]
  155|       |    pub gliner_variant: String,
  156|       |    #[arg(long, hide = true)]
  157|       |    pub skip_extraction: bool,
  158|       |    /// Explicitly clear the body content (set to empty string). Required to distinguish
  159|       |    /// intentional body clearing from accidental omission during --force-merge.
  160|       |    /// Without this flag, an empty body passed to --force-merge preserves the existing body.
  161|       |    #[arg(
  162|       |        long,
  163|       |        default_value_t = false,
  164|       |        help = "Explicitly clear body content during --force-merge (without this flag, an empty body is ignored and the existing body is kept)"
  165|       |    )]
  166|       |    pub clear_body: bool,
  167|       |    /// Validate input and report planned actions without persisting.
  168|       |    #[arg(
  169|       |        long,
  170|       |        default_value_t = false,
  171|       |        help = "Validate input and report planned actions without persisting"
  172|       |    )]
  173|       |    pub dry_run: bool,
  174|       |    /// Optional opaque session identifier for tracing memory provenance across multi-agent runs.
  175|       |    #[arg(long)]
  176|       |    pub session_id: Option<String>,
  177|       |    #[arg(long, value_enum, default_value_t = JsonOutputFormat::Json)]
  178|       |    pub format: JsonOutputFormat,
  179|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
  180|       |    pub json: bool,
  181|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
  182|       |    pub db: Option<String>,
  183|       |    /// Maximum process RSS in MiB; abort if exceeded during embedding.
  184|       |    #[arg(long, default_value_t = crate::constants::DEFAULT_MAX_RSS_MB,
  185|       |          help = "Maximum process RSS in MiB; abort if exceeded during embedding (default: 8192)")]
  186|       |    pub max_rss_mb: u64,
  187|       |    /// Emit a warning (but do not reject) when persisting an entity whose degree would
  188|       |    /// exceed this value after the upsert. Default 50. Set 0 to disable the check.
  189|       |    #[arg(long, default_value_t = 50, value_name = "N")]
  190|       |    pub max_entity_degree: u32,
  191|       |}
  192|       |
  193|       |#[derive(Deserialize, Default)]
  194|       |#[serde(deny_unknown_fields)]
  195|       |struct GraphInput {
  196|       |    #[serde(default)]
  197|       |    body: Option<String>,
  198|       |    #[serde(default)]
  199|       |    entities: Vec<NewEntity>,
  200|       |    #[serde(default)]
  201|       |    relationships: Vec<NewRelationship>,
  202|       |}
  203|       |
  204|      0|fn normalize_and_validate_graph_input(graph: &mut GraphInput) -> Result<(), AppError> {
  205|      0|    for rel in &mut graph.relationships {
  206|      0|        rel.relation = crate::parsers::normalize_relation(&rel.relation);
  207|      0|        if let Err(e) = crate::parsers::validate_relation_format(&rel.relation) {
  208|      0|            return Err(AppError::Validation(format!(
  209|      0|                "{e} for relationship '{}' -> '{}'",
  210|      0|                rel.source, rel.target
  211|      0|            )));
  212|      0|        }
  213|      0|        crate::parsers::warn_if_non_canonical(&rel.relation);
  214|      0|        if !(0.0..=1.0).contains(&rel.strength) {
  215|      0|            return Err(AppError::Validation(format!(
  216|      0|                "invalid strength {} for relationship '{}' -> '{}'; expected value in [0.0, 1.0]",
  217|      0|                rel.strength, rel.source, rel.target
  218|      0|            )));
  219|      0|        }
  220|       |    }
  221|       |
  222|      0|    Ok(())
  223|      0|}
  224|       |
  225|       |#[tracing::instrument(skip_all, level = "debug", name = "remember")]
  226|      0|pub fn run(args: RememberArgs) -> Result<(), AppError> {
  227|       |    use crate::constants::*;
  228|       |
  229|      0|    let inicio = std::time::Instant::now();
  230|      0|    let _ = args.format;
  231|      0|    tracing::debug!(target: "remember", name = %args.name, "persisting memory");
  232|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
  233|       |
  234|       |    // Capture the original `--name` before normalization so the JSON response can
  235|       |    // surface `name_was_normalized` + `original_name` (B_4 in v1.0.32). Stored as
  236|       |    // an owned String because `args.name` is moved into the response below.
  237|      0|    let original_name = args.name.clone();
  238|       |
  239|       |    // Auto-normalize to kebab-case before validation (P2-H).
  240|       |    // v1.0.20: also trims hyphens at the boundary (including trailing) to avoid rejection
  241|       |    // after truncation by a long filename ending in a hyphen.
  242|      0|    let normalized_name = {
  243|      0|        let lower = args.name.to_lowercase().replace(['_', ' '], "-");
  244|      0|        let trimmed = lower.trim_matches('-').to_string();
  245|      0|        if trimmed != args.name {
  246|      0|            tracing::warn!(target: "remember",
  247|       |                original = %args.name,
  248|       |                normalized = %trimmed,
  249|      0|                "name auto-normalized to kebab-case"
  250|       |            );
  251|      0|        }
  252|      0|        trimmed
  253|       |    };
  254|      0|    let name_was_normalized = normalized_name != original_name;
  255|       |
  256|      0|    if normalized_name.is_empty() {
  257|      0|        return Err(AppError::Validation(
  258|      0|            "name cannot be empty after normalization (input was blank or contained only hyphens/underscores/spaces)".to_string(),
  259|      0|        ));
  260|      0|    }
  261|      0|    if normalized_name.len() > MAX_MEMORY_NAME_LEN {
  262|      0|        return Err(AppError::LimitExceeded(
  263|      0|            crate::i18n::validation::name_length(MAX_MEMORY_NAME_LEN),
  264|      0|        ));
  265|      0|    }
  266|       |
  267|      0|    if normalized_name.starts_with("__") {
  268|      0|        return Err(AppError::Validation(
  269|      0|            crate::i18n::validation::reserved_name(),
  270|      0|        ));
  271|      0|    }
  272|       |
  273|       |    {
  274|      0|        let slug_re = crate::constants::name_slug_regex();
  275|      0|        if !slug_re.is_match(&normalized_name) {
  276|      0|            return Err(AppError::Validation(crate::i18n::validation::name_kebab(
  277|      0|                &normalized_name,
  278|      0|            )));
  279|      0|        }
  280|       |    }
  281|       |
  282|      0|    if let Some(ref desc) = args.description {
  283|      0|        if desc.len() > MAX_MEMORY_DESCRIPTION_LEN {
  284|      0|            return Err(AppError::Validation(
  285|      0|                crate::i18n::validation::description_exceeds(MAX_MEMORY_DESCRIPTION_LEN),
  286|      0|            ));
  287|      0|        }
  288|      0|    }
  289|       |
  290|      0|    let mut raw_body = if let Some(b) = args.body {
  291|      0|        b
  292|      0|    } else if let Some(ref path) = args.body_file {
  293|      0|        let file_size = std::fs::metadata(path).map_err(AppError::Io)?.len();
  294|      0|        if file_size > MAX_MEMORY_BODY_LEN as u64 {
  295|      0|            return Err(AppError::LimitExceeded(
  296|      0|                crate::i18n::validation::body_exceeds(MAX_MEMORY_BODY_LEN),
  297|      0|            ));
  298|      0|        }
  299|      0|        match std::fs::read_to_string(path) {
  300|      0|            Ok(s) => s,
  301|      0|            Err(e) if e.kind() == std::io::ErrorKind::InvalidData => {
  302|      0|                let bytes = std::fs::read(path).map_err(AppError::Io)?;
  303|      0|                tracing::warn!(target: "remember", "body file contains invalid UTF-8; replacing invalid sequences");
  304|      0|                String::from_utf8_lossy(&bytes).into_owned()
  305|       |            }
  306|      0|            Err(e) => return Err(AppError::Io(e)),
  307|       |        }
  308|      0|    } else if args.body_stdin || args.graph_stdin {
  309|      0|        crate::stdin_helper::read_stdin_with_timeout(60)?
  310|       |    } else {
  311|      0|        String::new()
  312|       |    };
  313|       |
  314|      0|    let mut entities_provided_externally =
  315|      0|        args.entities_file.is_some() || args.relationships_file.is_some();
  316|       |
  317|      0|    let mut graph = GraphInput::default();
  318|      0|    if let Some(path) = args.entities_file {
  319|      0|        let file_size = std::fs::metadata(&path).map_err(AppError::Io)?.len();
  320|      0|        if file_size > MAX_MEMORY_BODY_LEN as u64 {
  321|      0|            return Err(AppError::LimitExceeded(
  322|      0|                crate::i18n::validation::body_exceeds(MAX_MEMORY_BODY_LEN),
  323|      0|            ));
  324|      0|        }
  325|      0|        let content = std::fs::read_to_string(&path).map_err(AppError::Io)?;
  326|      0|        graph.entities = serde_json::from_str(&content)?;
  327|      0|    }
  328|      0|    if let Some(path) = args.relationships_file {
  329|      0|        let file_size = std::fs::metadata(&path).map_err(AppError::Io)?.len();
  330|      0|        if file_size > MAX_MEMORY_BODY_LEN as u64 {
  331|      0|            return Err(AppError::LimitExceeded(
  332|      0|                crate::i18n::validation::body_exceeds(MAX_MEMORY_BODY_LEN),
  333|      0|            ));
  334|      0|        }
  335|      0|        let content = std::fs::read_to_string(&path).map_err(AppError::Io)?;
  336|      0|        graph.relationships = serde_json::from_str(&content)?;
  337|      0|    }
  338|      0|    if args.graph_stdin {
  339|      0|        graph = serde_json::from_str::<GraphInput>(&raw_body).map_err(|e| {
  340|      0|            AppError::Validation(format!("invalid JSON payload on --graph-stdin: {e}"))
  341|      0|        })?;
  342|      0|        raw_body = graph.body.take().unwrap_or_default();
  343|      0|    }
  344|      0|    if args.graph_stdin && !graph.entities.is_empty() {
  345|      0|        entities_provided_externally = true;
  346|      0|    }
  347|       |
  348|      0|    if graph.entities.len() > max_entities_per_memory() {
  349|      0|        return Err(AppError::LimitExceeded(errors_msg::entity_limit_exceeded(
  350|      0|            max_entities_per_memory(),
  351|      0|        )));
  352|      0|    }
  353|      0|    let mut relationships_truncated = false;
  354|      0|    let rel_cap = max_relationships_per_memory();
  355|      0|    if graph.relationships.len() > rel_cap {
  356|      0|        tracing::warn!(target: "remember",
  357|      0|            count = graph.relationships.len(),
  358|       |            cap = rel_cap,
  359|      0|            "truncating relationships to cap"
  360|       |        );
  361|      0|        graph.relationships.truncate(rel_cap);
  362|      0|        relationships_truncated = true;
  363|      0|    }
  364|      0|    normalize_and_validate_graph_input(&mut graph)?;
  365|       |
  366|      0|    if raw_body.len() > MAX_MEMORY_BODY_LEN {
  367|      0|        return Err(AppError::LimitExceeded(
  368|      0|            crate::i18n::validation::body_exceeds(MAX_MEMORY_BODY_LEN),
  369|      0|        ));
  370|      0|    }
  371|       |
  372|       |    // v1.0.22 P1: reject empty or whitespace-only body when no external graph is provided.
  373|       |    // Without this check, empty embeddings would be persisted, breaking recall semantics.
  374|       |    // GAP-08: skip this guard when --force-merge without --clear-body; the existing body
  375|       |    // will be preserved from the database, so the effective body will not be empty.
  376|      0|    let body_will_be_preserved = args.force_merge && raw_body.trim().is_empty() && !args.clear_body;
  377|      0|    if !entities_provided_externally
  378|      0|        && graph.entities.is_empty()
  379|      0|        && raw_body.trim().is_empty()
  380|      0|        && !body_will_be_preserved
  381|      0|        && !args.clear_body
  382|       |    {
  383|      0|        return Err(AppError::Validation(crate::i18n::validation::empty_body()));
  384|      0|    }
  385|       |
  386|      0|    let metadata: serde_json::Value = if let Some(m) = args.metadata {
  387|      0|        serde_json::from_str(&m)?
  388|      0|    } else if let Some(path) = args.metadata_file {
  389|      0|        let file_size = std::fs::metadata(&path).map_err(AppError::Io)?.len();
  390|      0|        if file_size > MAX_MEMORY_BODY_LEN as u64 {
  391|      0|            return Err(AppError::LimitExceeded(
  392|      0|                crate::i18n::validation::body_exceeds(MAX_MEMORY_BODY_LEN),
  393|      0|            ));
  394|      0|        }
  395|      0|        let content = std::fs::read_to_string(&path).map_err(AppError::Io)?;
  396|      0|        serde_json::from_str(&content)?
  397|       |    } else {
  398|      0|        serde_json::json!({})
  399|       |    };
  400|       |
  401|      0|    let mut body_hash = blake3::hash(raw_body.as_bytes()).to_hex().to_string();
  402|      0|    let mut snippet: String = raw_body.chars().take(200).collect();
  403|       |
  404|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
  405|      0|    paths.ensure_dirs()?;
  406|       |
  407|       |    // v1.0.20: use .trim().is_empty() to reject bodies that are only whitespace.
  408|      0|    let mut extraction_method: Option<String> = None;
  409|      0|    let mut extracted_urls: Vec<crate::extraction::ExtractedUrl> = Vec::with_capacity(4);
  410|      0|    if args.enable_ner && args.skip_extraction {
  411|      0|        return Err(AppError::Validation(
  412|      0|            "--enable-ner and --skip-extraction are mutually exclusive; remove one".to_string(),
  413|      0|        ));
  414|      0|    }
  415|      0|    if args.skip_extraction && !args.enable_ner {
  416|      0|        return Err(AppError::Validation(
  417|      0|            "--skip-extraction is deprecated since v1.0.45 and has no effect; remove this flag"
  418|      0|                .to_string(),
  419|      0|        ));
  420|      0|    }
  421|      0|    let gliner_variant: crate::extraction::GlinerVariant =
  422|      0|        args.gliner_variant.parse().unwrap_or_else(|e| {
  423|      0|            tracing::warn!(target: "remember", error = %e, "invalid --gliner-variant, defaulting to fp32");
  424|      0|            crate::extraction::GlinerVariant::Fp32
  425|      0|        });
  426|      0|    if args.enable_ner && graph.entities.is_empty() && !raw_body.trim().is_empty() {
  427|      0|        match crate::extraction::extract_graph_auto(&raw_body, &paths, gliner_variant) {
  428|      0|            Ok(extracted) => {
  429|      0|                extraction_method = Some(extracted.extraction_method.clone());
  430|      0|                extracted_urls = extracted.urls;
  431|      0|                graph.entities = extracted.entities;
  432|      0|                graph.relationships = extracted.relationships;
  433|      0|                relationships_truncated = extracted.relationships_truncated;
  434|       |
  435|      0|                if graph.entities.len() > max_entities_per_memory() {
  436|      0|                    graph.entities.truncate(max_entities_per_memory());
  437|      0|                }
  438|      0|                if graph.relationships.len() > max_relationships_per_memory() {
  439|      0|                    relationships_truncated = true;
  440|      0|                    graph.relationships.truncate(max_relationships_per_memory());
  441|      0|                }
  442|      0|                normalize_and_validate_graph_input(&mut graph)?;
  443|       |            }
  444|      0|            Err(e) => {
  445|      0|                tracing::warn!(target: "remember", error = %e, "auto-extraction failed, graceful degradation");
  446|      0|                extraction_method = Some("none:extraction-failed".to_string());
  447|       |            }
  448|       |        }
  449|      0|    }
  450|       |
  451|      0|    let mut conn = open_rw(&paths.db)?;
  452|      0|    ensure_schema(&mut conn)?;
  453|       |
  454|       |    // --dry-run: emit planned action without any DB writes and return.
  455|      0|    if args.dry_run {
  456|      0|        let existing = memories::find_by_name(&conn, &namespace, &normalized_name)?;
  457|      0|        let planned_action = if existing.is_some() && args.force_merge {
  458|      0|            "would_update"
  459|       |        } else {
  460|      0|            "would_create"
  461|       |        };
  462|      0|        output::emit_json(&serde_json::json!({
  463|      0|            "dry_run": true,
  464|      0|            "name": normalized_name,
  465|      0|            "namespace": namespace,
  466|      0|            "planned_action": planned_action,
  467|      0|        }))?;
  468|      0|        return Ok(());
  469|      0|    }
  470|       |
  471|       |    {
  472|       |        use crate::constants::MAX_NAMESPACES_ACTIVE;
  473|      0|        let active_count: u32 = conn.query_row(
  474|      0|            "SELECT COUNT(DISTINCT namespace) FROM memories WHERE deleted_at IS NULL",
  475|      0|            [],
  476|      0|            |r| r.get::<_, i64>(0).map(|v| v as u32),
  477|      0|        )?;
  478|      0|        let ns_exists: bool = conn.query_row(
  479|      0|            "SELECT EXISTS(SELECT 1 FROM memories WHERE namespace = ?1 AND deleted_at IS NULL)",
  480|      0|            rusqlite::params![namespace],
  481|      0|            |r| r.get::<_, i64>(0).map(|v| v > 0),
  482|      0|        )?;
  483|      0|        if !ns_exists && active_count >= MAX_NAMESPACES_ACTIVE {
  484|      0|            return Err(AppError::NamespaceError(format!(
  485|      0|                "active namespace limit of {MAX_NAMESPACES_ACTIVE} reached while trying to create '{namespace}'"
  486|      0|            )));
  487|      0|        }
  488|       |    }
  489|       |
  490|       |    // M7: detect soft-deleted memory before the standard duplicate check.
  491|      0|    if let Some((sd_id, true)) =
  492|      0|        memories::find_by_name_any_state(&conn, &namespace, &normalized_name)?
  493|       |    {
  494|      0|        if args.force_merge {
  495|      0|            memories::clear_deleted_at(&conn, sd_id)?;
  496|       |        } else {
  497|      0|            return Err(AppError::Duplicate(
  498|      0|                errors_msg::duplicate_memory_soft_deleted(&normalized_name, &namespace),
  499|      0|            ));
  500|       |        }
  501|      0|    }
  502|       |
  503|      0|    let existing_memory = memories::find_by_name(&conn, &namespace, &normalized_name)?;
  504|      0|    if existing_memory.is_some() && !args.force_merge {
  505|      0|        return Err(AppError::Duplicate(errors_msg::duplicate_memory(
  506|      0|            &normalized_name,
  507|      0|            &namespace,
  508|      0|        )));
  509|      0|    }
  510|       |
  511|       |    // GAP-10: resolve type and description.
  512|       |    // For CREATE path (new memory): both are required.
  513|       |    // For UPDATE path (--force-merge on existing memory): inherit from existing row when omitted.
  514|      0|    let (resolved_type, resolved_description) = if existing_memory.is_none() {
  515|       |        // CREATE path — both fields are mandatory.
  516|      0|        let t = args.r#type.ok_or_else(|| {
  517|      0|            AppError::Validation(
  518|      0|                "--type and --description are required when creating a new memory".to_string(),
  519|      0|            )
  520|      0|        })?;
  521|      0|        let d = args.description.clone().ok_or_else(|| {
  522|      0|            AppError::Validation(
  523|      0|                "--type and --description are required when creating a new memory".to_string(),
  524|      0|            )
  525|      0|        })?;
  526|      0|        (t.as_str().to_string(), d)
  527|       |    } else {
  528|       |        // UPDATE path (--force-merge) — inherit missing fields from stored row.
  529|      0|        let existing_row = memories::read_by_name(&conn, &namespace, &normalized_name)?
  530|      0|            .ok_or_else(|| {
  531|      0|                AppError::NotFound(format!(
  532|      0|                    "memory '{normalized_name}' not found in namespace '{namespace}'"
  533|      0|                ))
  534|      0|            })?;
  535|      0|        let t = args
  536|      0|            .r#type
  537|      0|            .map(|v| v.as_str().to_string())
  538|      0|            .unwrap_or_else(|| existing_row.memory_type.clone());
  539|      0|        let d = args
  540|      0|            .description
  541|      0|            .clone()
  542|      0|            .unwrap_or_else(|| existing_row.description.clone());
  543|      0|        (t, d)
  544|       |    };
  545|       |
  546|       |    // GAP-08/GAP-09: protect existing body from accidental destruction during --force-merge.
  547|       |    // When the caller omits a body (or passes an empty one) without --clear-body, silently
  548|       |    // preserve the existing body from the database.  This prevents a common scripting mistake
  549|       |    // where a cron job updates metadata fields and inadvertently wipes the stored content.
  550|      0|    if body_will_be_preserved {
  551|      0|        if let Some(existing_row) = memories::read_by_name(&conn, &namespace, &normalized_name)? {
  552|      0|            if !existing_row.body.is_empty() {
  553|      0|                tracing::debug!(target: "remember",
  554|       |                    name = %normalized_name,
  555|      0|                    "GAP-08: empty body with --force-merge and no --clear-body; preserving existing body"
  556|       |                );
  557|      0|                raw_body = existing_row.body;
  558|      0|                body_hash = blake3::hash(raw_body.as_bytes()).to_hex().to_string();
  559|      0|                snippet = raw_body.chars().take(200).collect();
  560|      0|            }
  561|      0|        }
  562|      0|    }
  563|       |
  564|      0|    let duplicate_hash_id = memories::find_by_hash(&conn, &namespace, &body_hash)?;
  565|       |
  566|      0|    output::emit_progress_i18n(
  567|      0|        &format!(
  568|      0|            "Remember stage: validated input; available memory {} MB",
  569|      0|            crate::memory_guard::available_memory_mb()
  570|      0|        ),
  571|      0|        &format!(
  572|      0|            "Stage remember: input validated; available memory {} MB",
  573|      0|            crate::memory_guard::available_memory_mb()
  574|      0|        ),
  575|       |    );
  576|       |
  577|      0|    let tokenizer = crate::tokenizer::get_tokenizer(&paths.models)?;
  578|      0|    let model_max_length = crate::tokenizer::get_model_max_length(&paths.models)?;
  579|      0|    let total_passage_tokens = crate::tokenizer::count_passage_tokens(tokenizer, &raw_body)?;
  580|      0|    let chunks_info = chunking::split_into_chunks_hierarchical(&raw_body, tokenizer);
  581|      0|    let chunks_created = chunks_info.len();
  582|       |    // For single-chunk bodies the memory row itself stores the content and no
  583|       |    // entry is appended to `memory_chunks` (see line ~545). For multi-chunk
  584|       |    // bodies every chunk is persisted via `insert_chunk_slices`.
  585|      0|    let chunks_persisted = compute_chunks_persisted(chunks_info.len());
  586|       |
  587|      0|    output::emit_progress_i18n(
  588|      0|        &format!(
  589|      0|            "Remember stage: tokenizer counted {total_passage_tokens} passage tokens (model max {model_max_length}); chunking produced {} chunks; process RSS {} MB",
  590|      0|            chunks_created,
  591|      0|            crate::memory_guard::current_process_memory_mb().unwrap_or(0)
  592|      0|        ),
  593|      0|        &format!(
  594|      0|            "Stage remember: tokenizer counted {total_passage_tokens} passage tokens (model max {model_max_length}); chunking produced {} chunks; process RSS {} MB",
  595|      0|            chunks_created,
  596|      0|            crate::memory_guard::current_process_memory_mb().unwrap_or(0)
  597|      0|        ),
  598|       |    );
  599|       |
  600|      0|    if chunks_created > crate::constants::REMEMBER_MAX_SAFE_MULTI_CHUNKS {
  601|      0|        return Err(AppError::LimitExceeded(format!(
  602|      0|            "document produces {chunks_created} chunks; current safe operational limit is {} chunks; split the document before using remember",
  603|      0|            crate::constants::REMEMBER_MAX_SAFE_MULTI_CHUNKS
  604|      0|        )));
  605|      0|    }
  606|       |
  607|      0|    output::emit_progress_i18n("Computing embedding...", "Calculando embedding...");
  608|      0|    let mut chunk_embeddings_cache: Option<Vec<Vec<f32>>> = None;
  609|       |
  610|      0|    let embedding = if chunks_info.len() == 1 {
  611|      0|        crate::daemon::embed_passage_or_local(&paths.models, &raw_body)?
  612|       |    } else {
  613|      0|        let chunk_texts: Vec<&str> = chunks_info
  614|      0|            .iter()
  615|      0|            .map(|c| chunking::chunk_text(&raw_body, c))
  616|      0|            .collect();
  617|      0|        output::emit_progress_i18n(
  618|      0|            &format!(
  619|      0|                "Embedding {} chunks serially to keep memory bounded...",
  620|      0|                chunks_info.len()
  621|      0|            ),
  622|      0|            &format!(
  623|      0|                "Embedding {} chunks serially to keep memory bounded...",
  624|      0|                chunks_info.len()
  625|      0|            ),
  626|       |        );
  627|      0|        let embed_cap = chunk_texts.len();
  628|      0|        let mut chunk_embeddings = Vec::new();
  629|      0|        chunk_embeddings.try_reserve(embed_cap).map_err(|_| {
  630|      0|            AppError::LimitExceeded(format!(
  631|      0|                "allocation of {embed_cap} chunk embeddings would exceed available memory"
  632|      0|            ))
  633|      0|        })?;
  634|      0|        for chunk_text in &chunk_texts {
  635|      0|            if let Some(rss) = crate::memory_guard::current_process_memory_mb() {
  636|      0|                if rss > args.max_rss_mb {
  637|      0|                    tracing::error!(target: "remember",
  638|       |                        rss_mb = rss,
  639|       |                        max_rss_mb = args.max_rss_mb,
  640|      0|                        "RSS exceeded --max-rss-mb threshold; aborting to prevent system instability"
  641|       |                    );
  642|      0|                    return Err(AppError::LowMemory {
  643|      0|                        available_mb: crate::memory_guard::available_memory_mb(),
  644|      0|                        required_mb: args.max_rss_mb,
  645|      0|                    });
  646|      0|                }
  647|      0|            }
  648|      0|            chunk_embeddings.push(crate::daemon::embed_passage_or_local(
  649|      0|                &paths.models,
  650|      0|                chunk_text,
  651|      0|            )?);
  652|       |        }
  653|      0|        output::emit_progress_i18n(
  654|      0|            &format!(
  655|      0|                "Remember stage: chunk embeddings complete; process RSS {} MB",
  656|      0|                crate::memory_guard::current_process_memory_mb().unwrap_or(0)
  657|      0|            ),
  658|      0|            &format!(
  659|      0|                "Stage remember: chunk embeddings completed; process RSS {} MB",
  660|      0|                crate::memory_guard::current_process_memory_mb().unwrap_or(0)
  661|      0|            ),
  662|       |        );
  663|      0|        let aggregated = chunking::aggregate_embeddings(&chunk_embeddings);
  664|      0|        chunk_embeddings_cache = Some(chunk_embeddings);
  665|      0|        aggregated
  666|       |    };
  667|      0|    let body_for_storage = raw_body;
  668|       |
  669|      0|    let memory_type = resolved_type.as_str();
  670|      0|    let new_memory = NewMemory {
  671|      0|        namespace: namespace.clone(),
  672|      0|        name: normalized_name.clone(),
  673|      0|        memory_type: memory_type.to_string(),
  674|      0|        description: resolved_description.clone(),
  675|      0|        body: body_for_storage,
  676|      0|        body_hash: body_hash.clone(),
  677|      0|        session_id: args.session_id.clone(),
  678|      0|        source: "agent".to_string(),
  679|      0|        metadata,
  680|      0|    };
  681|       |
  682|      0|    let mut warnings = Vec::with_capacity(4);
  683|      0|    let mut entities_persisted = 0usize;
  684|      0|    let mut relationships_persisted = 0usize;
  685|       |
  686|      0|    let graph_entity_embeddings = graph
  687|      0|        .entities
  688|      0|        .iter()
  689|      0|        .map(|entity| {
  690|      0|            let entity_text = match &entity.description {
  691|      0|                Some(desc) => format!("{} {}", entity.name, desc),
  692|      0|                None => entity.name.clone(),
  693|       |            };
  694|      0|            crate::daemon::embed_passage_or_local(&paths.models, &entity_text)
  695|      0|        })
  696|      0|        .collect::<Result<Vec<_>, _>>()?;
  697|       |
  698|      0|    let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
  699|       |
  700|      0|    let mut skip_reindex = false;
  701|      0|    let (memory_id, action, version) = match existing_memory {
  702|      0|        Some((existing_id, _updated_at, _current_version)) => {
  703|      0|            if let Some(hash_id) = duplicate_hash_id {
  704|      0|                if hash_id != existing_id {
  705|      0|                    warnings.push(format!(
  706|      0|                        "identical body already exists as memory id {hash_id}"
  707|      0|                    ));
  708|      0|                }
  709|      0|            }
  710|       |
  711|       |            // C1 fix: capture old values for FTS5 sync before update
  712|      0|            let (old_fts_name, old_fts_desc, old_fts_body): (String, String, String) = tx
  713|      0|                .query_row(
  714|      0|                    "SELECT name, description, body FROM memories WHERE id = ?1",
  715|      0|                    rusqlite::params![existing_id],
  716|      0|                    |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?)),
  717|      0|                )?;
  718|       |
  719|       |            // G15: skip re-indexing when body hash matches (common in --force-merge loops)
  720|      0|            let existing_body_hash: Option<String> = tx
  721|      0|                .query_row(
  722|      0|                    "SELECT body_hash FROM memories WHERE id = ?1",
  723|      0|                    rusqlite::params![existing_id],
  724|      0|                    |r| r.get(0),
  725|       |                )
  726|      0|                .ok();
  727|      0|            let body_unchanged = existing_body_hash.as_deref() == Some(&body_hash);
  728|      0|            skip_reindex = body_unchanged;
  729|      0|            if !body_unchanged {
  730|      0|                storage_chunks::delete_chunks(&tx, existing_id)?;
  731|      0|            }
  732|       |
  733|      0|            let next_v = versions::next_version(&tx, existing_id)?;
  734|      0|            memories::update(&tx, existing_id, &new_memory, args.expected_updated_at)?;
  735|       |
  736|       |            // C1 fix: sync FTS5 external-content index after update
  737|       |            // (trg_fts_au trigger is absent by design due to sqlite-vec conflict)
  738|      0|            memories::sync_fts_after_update(
  739|      0|                &tx,
  740|      0|                existing_id,
  741|      0|                &old_fts_name,
  742|      0|                &old_fts_desc,
  743|      0|                &old_fts_body,
  744|      0|                &normalized_name,
  745|      0|                &resolved_description,
  746|      0|                &new_memory.body,
  747|      0|            )?;
  748|       |
  749|      0|            versions::insert_version(
  750|      0|                &tx,
  751|      0|                existing_id,
  752|      0|                next_v,
  753|      0|                &normalized_name,
  754|      0|                memory_type,
  755|      0|                &resolved_description,
  756|      0|                &new_memory.body,
  757|      0|                &serde_json::to_string(&new_memory.metadata)?,
  758|      0|                None,
  759|      0|                "edit",
  760|      0|            )?;
  761|      0|            if !body_unchanged {
  762|      0|                memories::upsert_vec(
  763|      0|                    &tx,
  764|      0|                    existing_id,
  765|      0|                    &namespace,
  766|      0|                    memory_type,
  767|      0|                    &embedding,
  768|      0|                    &normalized_name,
  769|      0|                    &snippet,
  770|      0|                )?;
  771|      0|            }
  772|      0|            (existing_id, "updated".to_string(), next_v)
  773|       |        }
  774|       |        None => {
  775|      0|            if let Some(hash_id) = duplicate_hash_id {
  776|      0|                warnings.push(format!(
  777|      0|                    "identical body already exists as memory id {hash_id}"
  778|      0|                ));
  779|      0|            }
  780|      0|            let id = memories::insert(&tx, &new_memory)?;
  781|      0|            versions::insert_version(
  782|      0|                &tx,
  783|      0|                id,
  784|       |                1,
  785|      0|                &normalized_name,
  786|      0|                memory_type,
  787|      0|                &resolved_description,
  788|      0|                &new_memory.body,
  789|      0|                &serde_json::to_string(&new_memory.metadata)?,
  790|      0|                None,
  791|      0|                "create",
  792|      0|            )?;
  793|      0|            memories::upsert_vec(
  794|      0|                &tx,
  795|      0|                id,
  796|      0|                &namespace,
  797|      0|                memory_type,
  798|      0|                &embedding,
  799|      0|                &normalized_name,
  800|      0|                &snippet,
  801|      0|            )?;
  802|      0|            (id, "created".to_string(), 1)
  803|       |        }
  804|       |    };
  805|       |
  806|      0|    if chunks_info.len() > 1 && !skip_reindex {
  807|      0|        storage_chunks::insert_chunk_slices(&tx, memory_id, &new_memory.body, &chunks_info)?;
  808|       |
  809|      0|        let chunk_embeddings = chunk_embeddings_cache.take().ok_or_else(|| {
  810|      0|            AppError::Internal(anyhow::anyhow!(
  811|      0|                "chunk embeddings cache missing in multi-chunk remember path"
  812|      0|            ))
  813|      0|        })?;
  814|       |
  815|      0|        for (i, emb) in chunk_embeddings.iter().enumerate() {
  816|      0|            storage_chunks::upsert_chunk_vec(&tx, i as i64, memory_id, i as i32, emb)?;
  817|       |        }
  818|      0|        output::emit_progress_i18n(
  819|      0|            &format!(
  820|      0|                "Remember stage: persisted chunk vectors; process RSS {} MB",
  821|      0|                crate::memory_guard::current_process_memory_mb().unwrap_or(0)
  822|      0|            ),
  823|      0|            &format!(
  824|      0|                "Etapa remember: vetores de chunks persistidos; RSS do processo {} MB",
  825|      0|                crate::memory_guard::current_process_memory_mb().unwrap_or(0)
  826|      0|            ),
  827|       |        );
  828|      0|    }
  829|       |
  830|      0|    if !graph.entities.is_empty() || !graph.relationships.is_empty() {
  831|      0|        for entity in &graph.entities {
  832|      0|            let entity_id = entities::upsert_entity(&tx, &namespace, entity)?;
  833|      0|            let entity_embedding = &graph_entity_embeddings[entities_persisted];
  834|      0|            entities::upsert_entity_vec(
  835|      0|                &tx,
  836|      0|                entity_id,
  837|      0|                &namespace,
  838|      0|                entity.entity_type,
  839|      0|                entity_embedding,
  840|      0|                &entity.name,
  841|      0|            )?;
  842|      0|            entities::link_memory_entity(&tx, memory_id, entity_id)?;
  843|      0|            entities::increment_degree(&tx, entity_id)?;
  844|       |            // GAP-17: warn when entity degree exceeds the configured cap.
  845|      0|            if args.max_entity_degree > 0 {
  846|      0|                let cap = args.max_entity_degree as i64;
  847|      0|                let degree: i64 = tx.query_row(
  848|      0|                    "SELECT degree FROM entities WHERE id = ?1",
  849|      0|                    rusqlite::params![entity_id],
  850|      0|                    |r| r.get(0),
  851|      0|                )?;
  852|      0|                if degree > cap {
  853|      0|                    tracing::warn!(target: "remember",
  854|       |                        entity = %entity.name,
  855|       |                        degree = degree,
  856|       |                        cap = cap,
  857|      0|                        "entity degree cap exceeded"
  858|       |                    );
  859|      0|                }
  860|      0|            }
  861|      0|            entities_persisted += 1;
  862|       |        }
  863|      0|        let entity_types: std::collections::HashMap<&str, EntityType> = graph
  864|      0|            .entities
  865|      0|            .iter()
  866|      0|            .map(|entity| (entity.name.as_str(), entity.entity_type))
  867|      0|            .collect();
  868|       |
  869|      0|        for rel in &graph.relationships {
  870|      0|            let source_entity = NewEntity {
  871|      0|                name: rel.source.clone(),
  872|      0|                entity_type: entity_types
  873|      0|                    .get(rel.source.as_str())
  874|      0|                    .copied()
  875|      0|                    .unwrap_or(EntityType::Concept),
  876|      0|                description: None,
  877|      0|            };
  878|      0|            let target_entity = NewEntity {
  879|      0|                name: rel.target.clone(),
  880|      0|                entity_type: entity_types
  881|      0|                    .get(rel.target.as_str())
  882|      0|                    .copied()
  883|      0|                    .unwrap_or(EntityType::Concept),
  884|      0|                description: None,
  885|      0|            };
  886|      0|            let source_id = entities::upsert_entity(&tx, &namespace, &source_entity)?;
  887|      0|            let target_id = entities::upsert_entity(&tx, &namespace, &target_entity)?;
  888|      0|            let rel_id = entities::upsert_relationship(&tx, &namespace, source_id, target_id, rel)?;
  889|      0|            entities::link_memory_relationship(&tx, memory_id, rel_id)?;
  890|      0|            relationships_persisted += 1;
  891|       |        }
  892|      0|    }
  893|      0|    tx.commit()?;
  894|       |
  895|       |    // v1.0.24 P0-2: persist URLs in a dedicated table, outside the main transaction.
  896|       |    // Failures do not propagate — non-critical path with graceful degradation.
  897|      0|    let urls_persisted = if !extracted_urls.is_empty() {
  898|      0|        let url_entries: Vec<storage_urls::MemoryUrl> = extracted_urls
  899|      0|            .into_iter()
  900|      0|            .map(|u| storage_urls::MemoryUrl {
  901|      0|                url: u.url,
  902|      0|                offset: Some(u.offset as i64),
  903|      0|            })
  904|      0|            .collect();
  905|      0|        storage_urls::insert_urls(&conn, memory_id, &url_entries)
  906|       |    } else {
  907|      0|        0
  908|       |    };
  909|       |
  910|      0|    conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
  911|       |
  912|      0|    let created_at_epoch = chrono::Utc::now().timestamp();
  913|      0|    let created_at_iso = crate::tz::format_iso(chrono::Utc::now());
  914|       |
  915|      0|    output::emit_json(&RememberResponse {
  916|      0|        memory_id,
  917|      0|        // Persist the normalized (kebab-case) slug as `name` since that is the
  918|      0|        // storage key. The original input is exposed via `original_name` only
  919|      0|        // when normalization actually changed something (B_4 in v1.0.32).
  920|      0|        name: normalized_name.clone(),
  921|      0|        namespace,
  922|      0|        action: action.clone(),
  923|      0|        operation: action,
  924|      0|        version,
  925|      0|        entities_persisted,
  926|      0|        relationships_persisted,
  927|      0|        relationships_truncated,
  928|      0|        chunks_created,
  929|      0|        chunks_persisted,
  930|      0|        urls_persisted,
  931|      0|        extraction_method,
  932|      0|        merged_into_memory_id: None,
  933|      0|        warnings,
  934|      0|        created_at: created_at_epoch,
  935|      0|        created_at_iso,
  936|      0|        elapsed_ms: inicio.elapsed().as_millis() as u64,
  937|      0|        name_was_normalized,
  938|      0|        original_name: name_was_normalized.then_some(original_name),
  939|      0|    })?;
  940|       |
  941|      0|    Ok(())
  942|      0|}
  943|       |
  944|       |#[cfg(test)]
  945|       |mod tests {
  946|       |    use super::compute_chunks_persisted;
  947|       |    use crate::output::RememberResponse;
  948|       |
  949|       |    // Bug H-M8: chunks_persisted contract is unit-testable and matches schema.
  950|       |    #[test]
  951|      1|    fn chunks_persisted_zero_for_zero_chunks() {
  952|      1|        assert_eq!(compute_chunks_persisted(0), 0);
  953|      1|    }
  954|       |
  955|       |    #[test]
  956|      1|    fn chunks_persisted_zero_for_single_chunk_body() {
  957|       |        // Single-chunk bodies live in the memories row itself; no row is
  958|       |        // appended to memory_chunks. This is the documented contract.
  959|      1|        assert_eq!(compute_chunks_persisted(1), 0);
  960|      1|    }
  961|       |
  962|       |    #[test]
  963|      1|    fn chunks_persisted_equals_count_for_multi_chunk_body() {
  964|       |        // Every chunk above the first triggers a row in memory_chunks.
  965|      1|        assert_eq!(compute_chunks_persisted(2), 2);
  966|      1|        assert_eq!(compute_chunks_persisted(7), 7);
  967|      1|        assert_eq!(compute_chunks_persisted(64), 64);
  968|      1|    }
  969|       |
  970|       |    #[test]
  971|      1|    fn remember_response_serializes_required_fields() {
  972|      1|        let resp = RememberResponse {
  973|      1|            memory_id: 42,
  974|      1|            name: "minha-mem".to_string(),
  975|      1|            namespace: "global".to_string(),
  976|      1|            action: "created".to_string(),
  977|      1|            operation: "created".to_string(),
  978|      1|            version: 1,
  979|      1|            entities_persisted: 0,
  980|      1|            relationships_persisted: 0,
  981|      1|            relationships_truncated: false,
  982|      1|            chunks_created: 1,
  983|      1|            chunks_persisted: 0,
  984|      1|            urls_persisted: 0,
  985|      1|            extraction_method: None,
  986|      1|            merged_into_memory_id: None,
  987|      1|            warnings: vec![],
  988|      1|            created_at: 1_705_320_000,
  989|      1|            created_at_iso: "2024-01-15T12:00:00Z".to_string(),
  990|      1|            elapsed_ms: 55,
  991|      1|            name_was_normalized: false,
  992|      1|            original_name: None,
  993|      1|        };
  994|       |
  995|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  996|      1|        assert_eq!(json["memory_id"], 42);
  997|      1|        assert_eq!(json["action"], "created");
  998|      1|        assert_eq!(json["operation"], "created");
  999|      1|        assert_eq!(json["version"], 1);
 1000|      1|        assert_eq!(json["elapsed_ms"], 55u64);
 1001|      1|        assert!(json["warnings"].is_array());
 1002|      1|        assert!(json["merged_into_memory_id"].is_null());
 1003|      1|    }
 1004|       |
 1005|       |    #[test]
 1006|      1|    fn remember_response_action_e_operation_sao_aliases() {
 1007|      1|        let resp = RememberResponse {
 1008|      1|            memory_id: 1,
 1009|      1|            name: "mem".to_string(),
 1010|      1|            namespace: "global".to_string(),
 1011|      1|            action: "updated".to_string(),
 1012|      1|            operation: "updated".to_string(),
 1013|      1|            version: 2,
 1014|      1|            entities_persisted: 3,
 1015|      1|            relationships_persisted: 1,
 1016|      1|            relationships_truncated: false,
 1017|      1|            extraction_method: None,
 1018|      1|            chunks_created: 2,
 1019|      1|            chunks_persisted: 2,
 1020|      1|            urls_persisted: 0,
 1021|      1|            merged_into_memory_id: None,
 1022|      1|            warnings: vec![],
 1023|      1|            created_at: 0,
 1024|      1|            created_at_iso: "1970-01-01T00:00:00Z".to_string(),
 1025|      1|            elapsed_ms: 0,
 1026|      1|            name_was_normalized: false,
 1027|      1|            original_name: None,
 1028|      1|        };
 1029|       |
 1030|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
 1031|      1|        assert_eq!(
 1032|      1|            json["action"], json["operation"],
 1033|      0|            "action e operation devem ser iguais"
 1034|       |        );
 1035|      1|        assert_eq!(json["entities_persisted"], 3);
 1036|      1|        assert_eq!(json["relationships_persisted"], 1);
 1037|      1|        assert_eq!(json["chunks_created"], 2);
 1038|      1|    }
 1039|       |
 1040|       |    #[test]
 1041|      1|    fn remember_response_warnings_lista_mensagens() {
 1042|      1|        let resp = RememberResponse {
 1043|      1|            memory_id: 5,
 1044|      1|            name: "dup-mem".to_string(),
 1045|      1|            namespace: "global".to_string(),
 1046|      1|            action: "created".to_string(),
 1047|      1|            operation: "created".to_string(),
 1048|      1|            version: 1,
 1049|      1|            entities_persisted: 0,
 1050|      1|            extraction_method: None,
 1051|      1|            relationships_persisted: 0,
 1052|      1|            relationships_truncated: false,
 1053|      1|            chunks_created: 1,
 1054|      1|            chunks_persisted: 0,
 1055|      1|            urls_persisted: 0,
 1056|      1|            merged_into_memory_id: None,
 1057|      1|            warnings: vec!["identical body already exists as memory id 3".to_string()],
 1058|      1|            created_at: 0,
 1059|      1|            created_at_iso: "1970-01-01T00:00:00Z".to_string(),
 1060|      1|            elapsed_ms: 10,
 1061|      1|            name_was_normalized: false,
 1062|      1|            original_name: None,
 1063|      1|        };
 1064|       |
 1065|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
 1066|      1|        let warnings = json["warnings"]
 1067|      1|            .as_array()
 1068|      1|            .expect("warnings deve ser array");
 1069|      1|        assert_eq!(warnings.len(), 1);
 1070|      1|        assert!(warnings[0].as_str().unwrap().contains("identical body"));
 1071|      1|    }
 1072|       |
 1073|       |    #[test]
 1074|      1|    fn invalid_name_reserved_prefix_returns_validation_error() {
 1075|       |        use crate::errors::AppError;
 1076|       |        // Validates the rejection logic for names with the "__" prefix directly
 1077|      1|        let nome = "__reservado";
 1078|      1|        let resultado: Result<(), AppError> = if nome.starts_with("__") {
 1079|      1|            Err(AppError::Validation(
 1080|      1|                crate::i18n::validation::reserved_name(),
 1081|      1|            ))
 1082|       |        } else {
 1083|      0|            Ok(())
 1084|       |        };
 1085|      1|        assert!(resultado.is_err());
 1086|      1|        if let Err(AppError::Validation(msg)) = resultado {
 1087|      1|            assert!(!msg.is_empty());
 1088|      0|        }
 1089|      1|    }
 1090|       |
 1091|       |    #[test]
 1092|      1|    fn name_too_long_returns_validation_error() {
 1093|       |        use crate::errors::AppError;
 1094|      1|        let nome_longo = "a".repeat(crate::constants::MAX_MEMORY_NAME_LEN + 1);
 1095|      1|        let resultado: Result<(), AppError> =
 1096|      1|            if nome_longo.is_empty() || nome_longo.len() > crate::constants::MAX_MEMORY_NAME_LEN {
 1097|      1|                Err(AppError::Validation(crate::i18n::validation::name_length(
 1098|      1|                    crate::constants::MAX_MEMORY_NAME_LEN,
 1099|      1|                )))
 1100|       |            } else {
 1101|      0|                Ok(())
 1102|       |            };
 1103|      1|        assert!(resultado.is_err());
 1104|      1|    }
 1105|       |
 1106|       |    #[test]
 1107|      1|    fn remember_response_merged_into_memory_id_some_serializes_integer() {
 1108|      1|        let resp = RememberResponse {
 1109|      1|            memory_id: 10,
 1110|      1|            name: "mem-mergeada".to_string(),
 1111|      1|            namespace: "global".to_string(),
 1112|      1|            action: "updated".to_string(),
 1113|      1|            operation: "updated".to_string(),
 1114|      1|            version: 3,
 1115|      1|            extraction_method: None,
 1116|      1|            entities_persisted: 0,
 1117|      1|            relationships_persisted: 0,
 1118|      1|            relationships_truncated: false,
 1119|      1|            chunks_created: 1,
 1120|      1|            chunks_persisted: 0,
 1121|      1|            urls_persisted: 0,
 1122|      1|            merged_into_memory_id: Some(7),
 1123|      1|            warnings: vec![],
 1124|      1|            created_at: 0,
 1125|      1|            created_at_iso: "1970-01-01T00:00:00Z".to_string(),
 1126|      1|            elapsed_ms: 0,
 1127|      1|            name_was_normalized: false,
 1128|      1|            original_name: None,
 1129|      1|        };
 1130|       |
 1131|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
 1132|      1|        assert_eq!(json["merged_into_memory_id"], 7);
 1133|      1|    }
 1134|       |
 1135|       |    #[test]
 1136|      1|    fn remember_response_urls_persisted_serializes_field() {
 1137|       |        // v1.0.24 P0-2: garante que urls_persisted aparece no JSON e aceita valor > 0.
 1138|      1|        let resp = RememberResponse {
 1139|      1|            memory_id: 3,
 1140|      1|            name: "mem-com-urls".to_string(),
 1141|      1|            namespace: "global".to_string(),
 1142|      1|            action: "created".to_string(),
 1143|      1|            operation: "created".to_string(),
 1144|      1|            version: 1,
 1145|      1|            entities_persisted: 0,
 1146|      1|            relationships_persisted: 0,
 1147|      1|            relationships_truncated: false,
 1148|      1|            chunks_created: 1,
 1149|      1|            chunks_persisted: 0,
 1150|      1|            urls_persisted: 3,
 1151|      1|            extraction_method: Some("regex-only".to_string()),
 1152|      1|            merged_into_memory_id: None,
 1153|      1|            warnings: vec![],
 1154|      1|            created_at: 0,
 1155|      1|            created_at_iso: "1970-01-01T00:00:00Z".to_string(),
 1156|      1|            elapsed_ms: 0,
 1157|      1|            name_was_normalized: false,
 1158|      1|            original_name: None,
 1159|      1|        };
 1160|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
 1161|      1|        assert_eq!(json["urls_persisted"], 3);
 1162|      1|    }
 1163|       |
 1164|       |    #[test]
 1165|      1|    fn empty_name_after_normalization_returns_specific_message() {
 1166|       |        // P0-4 regression: name consisting only of hyphens normalizes to empty string;
 1167|       |        // must produce a distinct error message, not the "too long" message.
 1168|       |        use crate::errors::AppError;
 1169|      1|        let normalized = "---".to_lowercase().replace(['_', ' '], "-");
 1170|      1|        let normalized = normalized.trim_matches('-').to_string();
 1171|      1|        let resultado: Result<(), AppError> = if normalized.is_empty() {
 1172|      1|            Err(AppError::Validation(
 1173|      1|                "name cannot be empty after normalization (input was blank or contained only hyphens/underscores/spaces)".to_string(),
 1174|      1|            ))
 1175|       |        } else {
 1176|      0|            Ok(())
 1177|       |        };
 1178|      1|        assert!(resultado.is_err());
 1179|      1|        if let Err(AppError::Validation(msg)) = resultado {
 1180|      1|            assert!(
 1181|      1|                msg.contains("empty after normalization"),
 1182|      0|                "mensagem deve mencionar 'empty after normalization', obteve: {msg}"
 1183|       |            );
 1184|      0|        }
 1185|      1|    }
 1186|       |
 1187|       |    #[test]
 1188|      1|    fn name_only_underscores_after_normalization_returns_specific_message() {
 1189|       |        // P0-4 regression: name consisting only of underscores normalizes to empty string.
 1190|       |        use crate::errors::AppError;
 1191|      1|        let normalized = "___".to_lowercase().replace(['_', ' '], "-");
 1192|      1|        let normalized = normalized.trim_matches('-').to_string();
 1193|      1|        assert!(
 1194|      1|            normalized.is_empty(),
 1195|      0|            "underscores devem normalizar para string vazia"
 1196|       |        );
 1197|      1|        let resultado: Result<(), AppError> = if normalized.is_empty() {
 1198|      1|            Err(AppError::Validation(
 1199|      1|                "name cannot be empty after normalization (input was blank or contained only hyphens/underscores/spaces)".to_string(),
 1200|      1|            ))
 1201|       |        } else {
 1202|      0|            Ok(())
 1203|       |        };
 1204|      1|        assert!(resultado.is_err());
 1205|      1|        if let Err(AppError::Validation(msg)) = resultado {
 1206|      1|            assert!(
 1207|      1|                msg.contains("empty after normalization"),
 1208|      0|                "mensagem deve mencionar 'empty after normalization', obteve: {msg}"
 1209|       |            );
 1210|      0|        }
 1211|      1|    }
 1212|       |
 1213|       |    #[test]
 1214|      1|    fn remember_response_relationships_truncated_serializes_field() {
 1215|       |        // P1-D: garante que relationships_truncated aparece no JSON como bool.
 1216|      1|        let resp_false = RememberResponse {
 1217|      1|            memory_id: 1,
 1218|      1|            name: "test".to_string(),
 1219|      1|            namespace: "global".to_string(),
 1220|      1|            action: "created".to_string(),
 1221|      1|            operation: "created".to_string(),
 1222|      1|            version: 1,
 1223|      1|            entities_persisted: 2,
 1224|      1|            relationships_persisted: 1,
 1225|      1|            relationships_truncated: false,
 1226|      1|            chunks_created: 1,
 1227|      1|            chunks_persisted: 0,
 1228|      1|            urls_persisted: 0,
 1229|      1|            extraction_method: None,
 1230|      1|            merged_into_memory_id: None,
 1231|      1|            warnings: vec![],
 1232|      1|            created_at: 0,
 1233|      1|            created_at_iso: "1970-01-01T00:00:00Z".to_string(),
 1234|      1|            elapsed_ms: 0,
 1235|      1|            name_was_normalized: false,
 1236|      1|            original_name: None,
 1237|      1|        };
 1238|      1|        let json_false = serde_json::to_value(&resp_false).expect("serialization failed");
 1239|      1|        assert_eq!(json_false["relationships_truncated"], false);
 1240|       |
 1241|      1|        let resp_true = RememberResponse {
 1242|      1|            relationships_truncated: true,
 1243|      1|            ..resp_false
 1244|      1|        };
 1245|      1|        let json_true = serde_json::to_value(&resp_true).expect("serialization failed");
 1246|      1|        assert_eq!(json_true["relationships_truncated"], true);
 1247|      1|    }
 1248|       |
 1249|       |    // GAP-08: body-preservation predicate tests.
 1250|       |    // Verifies the decision logic that determines whether an existing body should
 1251|       |    // be kept instead of overwritten with an empty incoming body during --force-merge.
 1252|       |
 1253|       |    /// Returns `true` when the existing body should be preserved.
 1254|       |    ///
 1255|       |    /// Mirrors the `body_will_be_preserved` expression in `run()` so the logic
 1256|       |    /// is testable without a real database connection.
 1257|      4|    fn should_preserve_body(force_merge: bool, raw_body_is_empty: bool, clear_body: bool) -> bool {
 1258|      4|        force_merge && raw_body_is_empty && !clear_body
                                     ^3                   ^2
 1259|      4|    }
 1260|       |
 1261|       |    #[test]
 1262|      1|    fn gap08_empty_body_force_merge_no_clear_body_preserves() {
 1263|       |        // Caller passes no body with --force-merge but without --clear-body.
 1264|       |        // The existing body in the DB must be kept.
 1265|      1|        assert!(
 1266|      1|            should_preserve_body(true, true, false),
 1267|      0|            "empty body + force-merge + no clear-body should trigger preservation"
 1268|       |        );
 1269|      1|    }
 1270|       |
 1271|       |    #[test]
 1272|      1|    fn gap08_empty_body_force_merge_with_clear_body_does_not_preserve() {
 1273|       |        // Caller explicitly passes --clear-body; intentional wipe is honoured.
 1274|      1|        assert!(
 1275|      1|            !should_preserve_body(true, true, true),
 1276|      0|            "--clear-body must bypass preservation"
 1277|       |        );
 1278|      1|    }
 1279|       |
 1280|       |    #[test]
 1281|      1|    fn gap08_non_empty_body_force_merge_does_not_preserve() {
 1282|       |        // Caller provides a real body; it must overwrite the existing one.
 1283|      1|        assert!(
 1284|      1|            !should_preserve_body(true, false, false),
 1285|      0|            "non-empty body must overwrite, not preserve"
 1286|       |        );
 1287|      1|    }
 1288|       |
 1289|       |    #[test]
 1290|      1|    fn gap08_empty_body_no_force_merge_does_not_preserve() {
 1291|       |        // Without --force-merge the path is a fresh create; no preservation needed.
 1292|      1|        assert!(
 1293|      1|            !should_preserve_body(false, true, false),
 1294|      0|            "no --force-merge means no preservation logic applies"
 1295|       |        );
 1296|      1|    }
 1297|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/remember_batch.rs:
    1|       |//! Handler for the `remember-batch` CLI subcommand (G08).
    2|       |//!
    3|       |//! Accepts NDJSON via stdin where each line is a memory to persist.
    4|       |//! One CLI invocation, one slot, one DB connection — eliminates N-process
    5|       |//! contention from parallel `remember` calls.
    6|       |
    7|       |use crate::errors::AppError;
    8|       |use crate::output;
    9|       |use crate::paths::AppPaths;
   10|       |use crate::storage::connection::open_rw;
   11|       |use crate::storage::{entities, memories, versions};
   12|       |use serde::{Deserialize, Serialize};
   13|       |use std::io::BufRead;
   14|       |
   15|       |#[derive(clap::Args)]
   16|       |#[command(after_long_help = "EXAMPLES:\n  \
   17|       |    # Pipe NDJSON memories from stdin\n  \
   18|       |    echo '{\"name\":\"mem-a\",\"type\":\"note\",\"description\":\"a\",\"body\":\"content\"}' | \
   19|       |    sqlite-graphrag remember-batch --json\n\n  \
   20|       |    # Atomic batch with --transaction\n  \
   21|       |    cat memories.ndjson | sqlite-graphrag remember-batch --transaction --json")]
   22|       |pub struct RememberBatchArgs {
   23|       |    /// Apply all memories in a single transaction (all-or-nothing).
   24|       |    #[arg(long)]
   25|       |    pub transaction: bool,
   26|       |    /// Stop processing on the first failure.
   27|       |    #[arg(long)]
   28|       |    pub fail_fast: bool,
   29|       |    /// Apply force-merge to all memories (update existing by name).
   30|       |    #[arg(long)]
   31|       |    pub force_merge: bool,
   32|       |    /// Namespace override for all memories.
   33|       |    #[arg(long, env = "SQLITE_GRAPHRAG_NAMESPACE")]
   34|       |    pub namespace: Option<String>,
   35|       |    /// Emit NDJSON output.
   36|       |    #[arg(long)]
   37|       |    pub json: bool,
   38|       |    /// Database path override.
   39|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   40|       |    pub db: Option<String>,
   41|       |    #[command(flatten)]
   42|       |    pub daemon: crate::cli::DaemonOpts,
   43|       |}
   44|       |
   45|       |#[derive(Deserialize)]
   46|       |struct BatchInputLine {
   47|       |    name: String,
   48|       |    #[serde(default = "default_type")]
   49|       |    r#type: String,
   50|       |    #[serde(default)]
   51|       |    description: String,
   52|       |    #[serde(default)]
   53|       |    body: String,
   54|       |    #[serde(default)]
   55|       |    entities: Vec<crate::storage::entities::NewEntity>,
   56|       |    #[serde(default)]
   57|       |    relationships: Vec<crate::storage::entities::NewRelationship>,
   58|       |}
   59|       |
   60|      0|fn default_type() -> String {
   61|      0|    "note".to_string()
   62|      0|}
   63|       |
   64|       |#[derive(Serialize)]
   65|       |struct BatchItemEvent {
   66|       |    name: String,
   67|       |    status: String,
   68|       |    #[serde(skip_serializing_if = "Option::is_none")]
   69|       |    memory_id: Option<i64>,
   70|       |    #[serde(skip_serializing_if = "Option::is_none")]
   71|       |    error: Option<String>,
   72|       |    index: usize,
   73|       |}
   74|       |
   75|       |#[derive(Serialize)]
   76|       |struct BatchSummary {
   77|       |    summary: bool,
   78|       |    total: usize,
   79|       |    succeeded: usize,
   80|       |    failed: usize,
   81|       |    elapsed_ms: u64,
   82|       |}
   83|       |
   84|      0|pub fn run(args: RememberBatchArgs) -> Result<(), AppError> {
   85|      0|    let start = std::time::Instant::now();
   86|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
   87|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
   88|      0|    paths.ensure_dirs()?;
   89|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
   90|      0|    let mut conn = open_rw(&paths.db)?;
   91|       |
   92|      0|    let stdin = std::io::stdin();
   93|      0|    let lines: Vec<String> = stdin
   94|      0|        .lock()
   95|      0|        .lines()
   96|      0|        .map_while(Result::ok)
   97|      0|        .filter(|l| !l.trim().is_empty())
   98|      0|        .collect();
   99|       |
  100|      0|    let total = lines.len();
  101|      0|    let mut succeeded = 0usize;
  102|      0|    let mut failed = 0usize;
  103|       |
  104|      0|    if args.transaction {
  105|      0|        let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
  106|      0|        for (idx, line) in lines.iter().enumerate() {
  107|      0|            match process_line(&tx, &namespace, line, idx, args.force_merge, &paths) {
  108|      0|                Ok(event) => {
  109|      0|                    output::emit_json(&event)?;
  110|      0|                    succeeded += 1;
  111|       |                }
  112|      0|                Err(e) => {
  113|      0|                    failed += 1;
  114|      0|                    output::emit_json(&BatchItemEvent {
  115|      0|                        name: String::new(),
  116|      0|                        status: "failed".to_string(),
  117|      0|                        memory_id: None,
  118|      0|                        error: Some(format!("{e}")),
  119|      0|                        index: idx,
  120|      0|                    })?;
  121|      0|                    if args.fail_fast {
  122|      0|                        break;
  123|      0|                    }
  124|       |                }
  125|       |            }
  126|       |        }
  127|      0|        if failed == 0 || !args.fail_fast {
  128|      0|            tx.commit()?;
  129|      0|        }
  130|       |    } else {
  131|      0|        for (idx, line) in lines.iter().enumerate() {
  132|      0|            let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
  133|      0|            match process_line(&tx, &namespace, line, idx, args.force_merge, &paths) {
  134|      0|                Ok(event) => {
  135|      0|                    tx.commit()?;
  136|      0|                    output::emit_json(&event)?;
  137|      0|                    succeeded += 1;
  138|       |                }
  139|      0|                Err(e) => {
  140|      0|                    drop(tx);
  141|      0|                    failed += 1;
  142|      0|                    output::emit_json(&BatchItemEvent {
  143|      0|                        name: String::new(),
  144|      0|                        status: "failed".to_string(),
  145|      0|                        memory_id: None,
  146|      0|                        error: Some(format!("{e}")),
  147|      0|                        index: idx,
  148|      0|                    })?;
  149|      0|                    if args.fail_fast {
  150|      0|                        break;
  151|      0|                    }
  152|       |                }
  153|       |            }
  154|       |        }
  155|       |    }
  156|       |
  157|      0|    output::emit_json(&BatchSummary {
  158|      0|        summary: true,
  159|      0|        total,
  160|      0|        succeeded,
  161|      0|        failed,
  162|      0|        elapsed_ms: start.elapsed().as_millis() as u64,
  163|      0|    })?;
  164|       |
  165|      0|    Ok(())
  166|      0|}
  167|       |
  168|      0|fn process_line(
  169|      0|    tx: &rusqlite::Transaction<'_>,
  170|      0|    namespace: &str,
  171|      0|    line: &str,
  172|      0|    index: usize,
  173|      0|    force_merge: bool,
  174|      0|    paths: &AppPaths,
  175|      0|) -> Result<BatchItemEvent, AppError> {
  176|      0|    let input: BatchInputLine = serde_json::from_str(line)
  177|      0|        .map_err(|e| AppError::Validation(format!("line {index}: invalid JSON: {e}")))?;
  178|       |
  179|      0|    let normalized_name = crate::parsers::normalize_entity_name(&input.name);
  180|      0|    if normalized_name.is_empty() {
  181|      0|        return Err(AppError::Validation(format!(
  182|      0|            "line {index}: name normalizes to empty string"
  183|      0|        )));
  184|      0|    }
  185|       |
  186|      0|    let body_hash = blake3::hash(input.body.as_bytes()).to_hex().to_string();
  187|       |
  188|      0|    let existing = memories::find_by_name(tx, namespace, &normalized_name)?;
  189|       |
  190|      0|    let memory_id = if let Some((existing_id, _updated_at, _version)) = existing {
  191|      0|        if !force_merge {
  192|      0|            return Err(AppError::Duplicate(format!(
  193|      0|                "memory '{normalized_name}' already exists; use --force-merge to update"
  194|      0|            )));
  195|      0|        }
  196|      0|        let snippet: String = input.body.chars().take(200).collect();
  197|      0|        memories::update(
  198|      0|            tx,
  199|      0|            existing_id,
  200|      0|            &memories::NewMemory {
  201|      0|                namespace: namespace.to_string(),
  202|      0|                name: normalized_name.clone(),
  203|      0|                memory_type: input.r#type.clone(),
  204|      0|                description: input.description.clone(),
  205|      0|                body: input.body.clone(),
  206|      0|                body_hash,
  207|      0|                session_id: None,
  208|      0|                source: "agent".to_string(),
  209|      0|                metadata: serde_json::json!({}),
  210|      0|            },
  211|      0|            None,
  212|      0|        )?;
  213|      0|        let next_v = versions::next_version(tx, existing_id)?;
  214|      0|        versions::insert_version(
  215|      0|            tx,
  216|      0|            existing_id,
  217|      0|            next_v,
  218|      0|            &normalized_name,
  219|      0|            &input.r#type,
  220|      0|            &input.description,
  221|      0|            &input.body,
  222|      0|            "{}",
  223|      0|            None,
  224|      0|            "edit",
  225|      0|        )?;
  226|       |
  227|      0|        let embedding = crate::daemon::embed_passage_or_local(&paths.models, &input.body)?;
  228|      0|        memories::upsert_vec(
  229|      0|            tx,
  230|      0|            existing_id,
  231|      0|            namespace,
  232|      0|            &input.r#type,
  233|      0|            &embedding,
  234|      0|            &normalized_name,
  235|      0|            &snippet,
  236|      0|        )?;
  237|      0|        existing_id
  238|       |    } else {
  239|      0|        let new_mem = memories::NewMemory {
  240|      0|            namespace: namespace.to_string(),
  241|      0|            name: normalized_name.clone(),
  242|      0|            memory_type: input.r#type.clone(),
  243|      0|            description: input.description.clone(),
  244|      0|            body: input.body.clone(),
  245|      0|            body_hash,
  246|      0|            session_id: None,
  247|      0|            source: "agent".to_string(),
  248|      0|            metadata: serde_json::json!({}),
  249|      0|        };
  250|      0|        let id = memories::insert(tx, &new_mem)?;
  251|      0|        versions::insert_version(
  252|      0|            tx,
  253|      0|            id,
  254|       |            1,
  255|      0|            &normalized_name,
  256|      0|            &input.r#type,
  257|      0|            &input.description,
  258|      0|            &input.body,
  259|      0|            "{}",
  260|      0|            None,
  261|      0|            "create",
  262|      0|        )?;
  263|       |
  264|      0|        let snippet: String = input.body.chars().take(200).collect();
  265|      0|        let embedding = crate::daemon::embed_passage_or_local(&paths.models, &input.body)?;
  266|      0|        memories::upsert_vec(
  267|      0|            tx,
  268|      0|            id,
  269|      0|            namespace,
  270|      0|            &input.r#type,
  271|      0|            &embedding,
  272|      0|            &normalized_name,
  273|      0|            &snippet,
  274|      0|        )?;
  275|      0|        id
  276|       |    };
  277|       |
  278|       |    // Persist graph entities and relationships if provided
  279|      0|    for entity in &input.entities {
  280|      0|        let entity_id = entities::upsert_entity(tx, namespace, entity)?;
  281|      0|        let entity_text = match &entity.description {
  282|      0|            Some(desc) => format!("{} {}", entity.name, desc),
  283|      0|            None => entity.name.clone(),
  284|       |        };
  285|      0|        let entity_embedding = crate::daemon::embed_passage_or_local(&paths.models, &entity_text)?;
  286|      0|        entities::upsert_entity_vec(
  287|      0|            tx,
  288|      0|            entity_id,
  289|      0|            namespace,
  290|      0|            entity.entity_type,
  291|      0|            &entity_embedding,
  292|      0|            &entity.name,
  293|      0|        )?;
  294|      0|        entities::link_memory_entity(tx, memory_id, entity_id)?;
  295|       |    }
  296|       |
  297|      0|    for rel in &input.relationships {
  298|      0|        let src_name = crate::parsers::normalize_entity_name(&rel.source);
  299|      0|        let tgt_name = crate::parsers::normalize_entity_name(&rel.target);
  300|      0|        if let (Some(src_id), Some(tgt_id)) = (
  301|      0|            entities::find_entity_id(tx, namespace, &src_name)?,
  302|      0|            entities::find_entity_id(tx, namespace, &tgt_name)?,
  303|       |        ) {
  304|      0|            entities::create_or_fetch_relationship(
  305|      0|                tx,
  306|      0|                namespace,
  307|      0|                src_id,
  308|      0|                tgt_id,
  309|      0|                &rel.relation,
  310|      0|                rel.strength,
  311|      0|                rel.description.as_deref(),
  312|      0|            )?;
  313|      0|        }
  314|       |    }
  315|       |
  316|      0|    Ok(BatchItemEvent {
  317|      0|        name: normalized_name,
  318|      0|        status: "indexed".to_string(),
  319|      0|        memory_id: Some(memory_id),
  320|      0|        error: None,
  321|      0|        index,
  322|      0|    })
  323|      0|}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/rename.rs:
    1|       |//! Handler for the `rename` CLI subcommand.
    2|       |
    3|       |use crate::errors::AppError;
    4|       |use crate::i18n::errors_msg;
    5|       |use crate::output;
    6|       |use crate::output::JsonOutputFormat;
    7|       |use crate::paths::AppPaths;
    8|       |use crate::storage::connection::open_rw;
    9|       |use crate::storage::{memories, versions};
   10|       |use serde::Serialize;
   11|       |
   12|       |#[derive(clap::Args)]
   13|       |#[command(after_long_help = "EXAMPLES:\n  \
   14|       |    # Rename using two positional arguments (NAME NEW)\n  \
   15|       |    sqlite-graphrag rename onboarding welcome-guide\n\n  \
   16|       |    # Rename using the positional NAME + --new-name flag\n  \
   17|       |    sqlite-graphrag rename onboarding --new-name welcome-guide\n\n  \
   18|       |    # Rename using the named flag form\n  \
   19|       |    sqlite-graphrag rename --name onboarding --new-name welcome-guide\n\n  \
   20|       |    # Rename within a specific namespace\n  \
   21|       |    sqlite-graphrag rename onboarding welcome-guide --namespace my-project")]
   22|       |pub struct RenameArgs {
   23|       |    /// Current memory name as a positional argument. Alternative to `--name` / `--old`.
   24|       |    #[arg(
   25|       |        value_name = "NAME",
   26|       |        conflicts_with = "name",
   27|       |        help = "Current memory name to rename; alternative to --name/--old"
   28|       |    )]
   29|       |    pub name_positional: Option<String>,
   30|       |    /// Current memory name. Also accepts the aliases `--old` and `--from` (since v1.0.35).
   31|       |    #[arg(long, alias = "old", alias = "from")]
   32|       |    pub name: Option<String>,
   33|       |    /// New memory name as a positional argument. Alternative to `--new-name`.
   34|       |    #[arg(
   35|       |        value_name = "NEW",
   36|       |        conflicts_with = "new_name",
   37|       |        help = "New memory name; alternative to --new-name/--new/--to"
   38|       |    )]
   39|       |    pub new_name_positional: Option<String>,
   40|       |    /// New memory name. Also accepts the aliases `--new` and `--to` (since v1.0.35).
   41|       |    #[arg(long, alias = "new", alias = "to")]
   42|       |    pub new_name: Option<String>,
   43|       |    #[arg(
   44|       |        long,
   45|       |        help = "Namespace (env: SQLITE_GRAPHRAG_NAMESPACE, default: global)"
   46|       |    )]
   47|       |    pub namespace: Option<String>,
   48|       |    /// Optimistic locking: reject if the current updated_at does not match (exit 3).
   49|       |    #[arg(
   50|       |        long,
   51|       |        value_name = "EPOCH_OR_RFC3339",
   52|       |        value_parser = crate::parsers::parse_expected_updated_at,
   53|       |        long_help = "Optimistic lock: reject if updated_at does not match. \
   54|       |Accepts Unix epoch (e.g. 1700000000) or RFC 3339 (e.g. 2026-04-19T12:00:00Z)."
   55|       |    )]
   56|       |    pub expected_updated_at: Option<i64>,
   57|       |    /// Optional session ID used to trace the origin of the change.
   58|       |    #[arg(long, value_name = "UUID")]
   59|       |    pub session_id: Option<String>,
   60|       |    /// Output format.
   61|       |    #[arg(long, value_enum, default_value_t = JsonOutputFormat::Json)]
   62|       |    pub format: JsonOutputFormat,
   63|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   64|       |    pub json: bool,
   65|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   66|       |    pub db: Option<String>,
   67|       |}
   68|       |
   69|       |#[derive(Serialize)]
   70|       |struct RenameResponse {
   71|       |    memory_id: i64,
   72|       |    name: String,
   73|       |    action: &'static str,
   74|       |    version: i64,
   75|       |    /// Set to `true` when a soft-deleted ghost occupying the target name was purged.
   76|       |    #[serde(skip_serializing_if = "Option::is_none")]
   77|       |    ghost_purged: Option<bool>,
   78|       |    /// Total execution time in milliseconds from handler start to serialisation.
   79|       |    elapsed_ms: u64,
   80|       |}
   81|       |
   82|      0|pub fn run(args: RenameArgs) -> Result<(), AppError> {
   83|      0|    let inicio = std::time::Instant::now();
   84|      0|    let _ = args.format;
   85|      0|    tracing::debug!(target: "rename", old = ?args.name, new = ?args.new_name, "renaming memory");
   86|       |    use crate::constants::*;
   87|       |
   88|       |    // Resolve current name from positional or --name/--old flag.
   89|      0|    let name = args.name_positional.or(args.name).ok_or_else(|| {
   90|      0|        AppError::Validation("name required: pass as positional argument or via --name".to_string())
   91|      0|    })?;
   92|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
   93|       |
   94|      0|    let raw_new_name = args.new_name.or(args.new_name_positional).ok_or_else(|| {
   95|      0|        AppError::Validation(
   96|      0|            "new name required: pass as positional <NEW> or via --new-name/--new/--to".to_string(),
   97|      0|        )
   98|      0|    })?;
   99|       |
  100|       |    // v1.0.20: trim_matches('-') also removes trailing/leading hyphens.
  101|      0|    let normalized_new_name = {
  102|      0|        let lower = raw_new_name.to_lowercase().replace(['_', ' '], "-");
  103|      0|        let trimmed = lower.trim_matches('-').to_string();
  104|      0|        if trimmed != raw_new_name {
  105|      0|            tracing::warn!(target: "rename",
  106|       |                original = %raw_new_name,
  107|       |                normalized = %trimmed,
  108|      0|                "new_name auto-normalized to kebab-case"
  109|       |            );
  110|      0|        }
  111|      0|        trimmed
  112|       |    };
  113|       |
  114|      0|    if normalized_new_name == name {
  115|      0|        return Err(AppError::Validation(
  116|      0|            "source and target names are identical".to_string(),
  117|      0|        ));
  118|      0|    }
  119|       |
  120|      0|    if normalized_new_name.starts_with("__") {
  121|      0|        return Err(AppError::Validation(
  122|      0|            crate::i18n::validation::reserved_name(),
  123|      0|        ));
  124|      0|    }
  125|       |
  126|      0|    if normalized_new_name.is_empty() || normalized_new_name.len() > MAX_MEMORY_NAME_LEN {
  127|      0|        return Err(AppError::Validation(
  128|      0|            crate::i18n::validation::new_name_length(MAX_MEMORY_NAME_LEN),
  129|      0|        ));
  130|      0|    }
  131|       |
  132|       |    {
  133|      0|        let slug_re = crate::constants::name_slug_regex();
  134|      0|        if !slug_re.is_match(&normalized_new_name) {
  135|      0|            return Err(AppError::Validation(
  136|      0|                crate::i18n::validation::new_name_kebab(&normalized_new_name),
  137|      0|            ));
  138|      0|        }
  139|       |    }
  140|       |
  141|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
  142|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
  143|      0|    let mut conn = open_rw(&paths.db)?;
  144|       |
  145|      0|    let (memory_id, current_updated_at, _) = memories::find_by_name(&conn, &namespace, &name)?
  146|      0|        .ok_or_else(|| AppError::NotFound(errors_msg::memory_not_found(&name, &namespace)))?;
  147|       |
  148|      0|    if let Some(expected) = args.expected_updated_at {
  149|      0|        if expected != current_updated_at {
  150|      0|            return Err(AppError::Conflict(errors_msg::optimistic_lock_conflict(
  151|      0|                expected,
  152|      0|                current_updated_at,
  153|      0|            )));
  154|      0|        }
  155|      0|    }
  156|       |
  157|      0|    let row = memories::read_by_name(&conn, &namespace, &name)?
  158|      0|        .ok_or_else(|| AppError::Internal(anyhow::anyhow!("memory not found before rename")))?;
  159|       |
  160|      0|    let memory_type = row.memory_type.clone();
  161|      0|    let description = row.description.clone();
  162|      0|    let body = row.body.clone();
  163|      0|    let metadata = row.metadata.clone();
  164|       |
  165|      0|    let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
  166|       |
  167|       |    // G16: auto-purge soft-deleted ghost occupying the target name
  168|      0|    let mut ghost_purged: Option<bool> = None;
  169|      0|    if let Some((ghost_id, is_deleted)) =
  170|      0|        memories::find_by_name_any_state(&tx, &namespace, &normalized_new_name)?
  171|       |    {
  172|      0|        if is_deleted {
  173|      0|            tracing::info!(target: "rename",
  174|       |                ghost_id,
  175|       |                name = %normalized_new_name,
  176|      0|                "auto-purging soft-deleted ghost to free target name for rename"
  177|       |            );
  178|      0|            tx.execute(
  179|      0|                "DELETE FROM memory_versions WHERE memory_id = ?1",
  180|      0|                rusqlite::params![ghost_id],
  181|      0|            )?;
  182|      0|            tx.execute(
  183|      0|                "DELETE FROM memory_chunks WHERE memory_id = ?1",
  184|      0|                rusqlite::params![ghost_id],
  185|      0|            )?;
  186|      0|            tx.execute(
  187|      0|                "DELETE FROM memory_entities WHERE memory_id = ?1",
  188|      0|                rusqlite::params![ghost_id],
  189|      0|            )?;
  190|      0|            tx.execute(
  191|      0|                "DELETE FROM vec_memories WHERE memory_id = ?1",
  192|      0|                rusqlite::params![ghost_id],
  193|      0|            )?;
  194|      0|            tx.execute(
  195|      0|                "DELETE FROM memories WHERE id = ?1",
  196|      0|                rusqlite::params![ghost_id],
  197|      0|            )?;
  198|      0|            ghost_purged = Some(true);
  199|      0|        } else if ghost_id != memory_id {
  200|      0|            return Err(AppError::Duplicate(format!(
  201|      0|                "target name '{normalized_new_name}' is already occupied by active memory id {ghost_id}"
  202|      0|            )));
  203|      0|        }
  204|      0|    }
  205|       |
  206|      0|    let affected = if let Some(ts) = args.expected_updated_at {
  207|      0|        tx.execute(
  208|      0|            "UPDATE memories SET name=?2 WHERE id=?1 AND updated_at=?3 AND deleted_at IS NULL",
  209|      0|            rusqlite::params![memory_id, normalized_new_name, ts],
  210|      0|        )?
  211|       |    } else {
  212|      0|        tx.execute(
  213|      0|            "UPDATE memories SET name=?2 WHERE id=?1 AND deleted_at IS NULL",
  214|      0|            rusqlite::params![memory_id, normalized_new_name],
  215|      0|        )?
  216|       |    };
  217|       |
  218|      0|    if affected == 0 {
  219|      0|        return Err(AppError::Conflict(
  220|      0|            "optimistic lock conflict: memory was modified by another process".to_string(),
  221|      0|        ));
  222|      0|    }
  223|       |
  224|      0|    let next_v = versions::next_version(&tx, memory_id)?;
  225|       |
  226|      0|    versions::insert_version(
  227|      0|        &tx,
  228|      0|        memory_id,
  229|      0|        next_v,
  230|      0|        &normalized_new_name,
  231|      0|        &memory_type,
  232|      0|        &description,
  233|      0|        &body,
  234|      0|        &metadata,
  235|      0|        None,
  236|      0|        "rename",
  237|      0|    )?;
  238|       |
  239|      0|    memories::sync_fts_after_update(
  240|      0|        &tx,
  241|      0|        memory_id,
  242|      0|        &name,
  243|      0|        &description,
  244|      0|        &body,
  245|      0|        &normalized_new_name,
  246|      0|        &description,
  247|      0|        &body,
  248|      0|    )?;
  249|       |
  250|      0|    tx.commit()?;
  251|       |
  252|      0|    conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
  253|       |
  254|      0|    output::emit_json(&RenameResponse {
  255|      0|        memory_id,
  256|      0|        name: normalized_new_name,
  257|      0|        action: "renamed",
  258|      0|        version: next_v,
  259|      0|        ghost_purged,
  260|      0|        elapsed_ms: inicio.elapsed().as_millis() as u64,
  261|      0|    })?;
  262|       |
  263|      0|    Ok(())
  264|      0|}
  265|       |
  266|       |#[cfg(test)]
  267|       |mod tests {
  268|       |    use crate::storage::memories::{insert, NewMemory};
  269|       |    use tempfile::TempDir;
  270|       |
  271|      1|    fn setup_db() -> (TempDir, rusqlite::Connection) {
  272|      1|        crate::storage::connection::register_vec_extension();
  273|      1|        let dir = TempDir::new().unwrap();
  274|      1|        let db_path = dir.path().join("test.db");
  275|      1|        let mut conn = rusqlite::Connection::open(&db_path).unwrap();
  276|      1|        crate::migrations::runner().run(&mut conn).unwrap();
  277|      1|        (dir, conn)
  278|      1|    }
  279|       |
  280|      1|    fn new_memory(name: &str) -> NewMemory {
  281|      1|        NewMemory {
  282|      1|            namespace: "global".to_string(),
  283|      1|            name: name.to_string(),
  284|      1|            memory_type: "user".to_string(),
  285|      1|            description: "desc".to_string(),
  286|      1|            body: "corpo".to_string(),
  287|      1|            body_hash: format!("hash-{name}"),
  288|      1|            session_id: None,
  289|      1|            source: "agent".to_string(),
  290|      1|            metadata: serde_json::json!({}),
  291|      1|        }
  292|      1|    }
  293|       |
  294|       |    #[test]
  295|      1|    fn rejects_new_name_with_double_underscore_prefix() {
  296|       |        use crate::errors::AppError;
  297|      1|        let (_dir, conn) = setup_db();
  298|      1|        insert(&conn, &new_memory("mem-teste")).unwrap();
  299|      1|        drop(conn);
  300|       |
  301|      1|        let err = AppError::Validation(
  302|      1|            "names and namespaces starting with __ are reserved for internal use".to_string(),
  303|      1|        );
  304|      1|        assert!(err.to_string().contains("__"));
  305|      1|        assert_eq!(err.exit_code(), 1);
  306|      1|    }
  307|       |
  308|       |    #[test]
  309|      1|    fn rejects_rename_to_same_name() {
  310|       |        use crate::errors::AppError;
  311|      1|        let err = AppError::Validation("source and target names are identical".to_string());
  312|      1|        assert_eq!(err.exit_code(), 1);
  313|      1|        assert!(err.to_string().contains("identical"));
  314|      1|    }
  315|       |
  316|       |    #[test]
  317|      1|    fn optimistic_lock_conflict_returns_exit_3() {
  318|       |        use crate::errors::AppError;
  319|      1|        let err = AppError::Conflict(
  320|      1|            "optimistic lock conflict: expected updated_at=100, but current is 200".to_string(),
  321|      1|        );
  322|      1|        assert_eq!(err.exit_code(), 3);
  323|      1|    }
  324|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/rename_entity.rs:
    1|       |//! Handler for the `rename-entity` CLI subcommand.
    2|       |//!
    3|       |//! Renames an entity preserving all relationships and memory bindings.
    4|       |//! Only the `name` column in `entities` and the corresponding `vec_entities`
    5|       |//! row need updating because relationships use integer FK `entity_id`.
    6|       |
    7|       |use crate::entity_type::EntityType;
    8|       |use crate::errors::AppError;
    9|       |use crate::i18n::errors_msg;
   10|       |use crate::output::{self, OutputFormat};
   11|       |use crate::paths::AppPaths;
   12|       |use crate::storage::connection::open_rw;
   13|       |use crate::storage::entities;
   14|       |use rusqlite::params;
   15|       |use serde::Serialize;
   16|       |
   17|       |#[derive(clap::Args)]
   18|       |#[command(after_long_help = "EXAMPLES:\n  \
   19|       |    # Rename an entity\n  \
   20|       |    sqlite-graphrag rename-entity --name old-name --new-name new-name\n\n  \
   21|       |    # Rename with namespace\n  \
   22|       |    sqlite-graphrag rename-entity --name auth --new-name authentication --namespace my-project")]
   23|       |pub struct RenameEntityArgs {
   24|       |    /// Current entity name to rename.
   25|       |    #[arg(long, value_name = "NAME")]
   26|       |    pub name: String,
   27|       |    /// New name for the entity.
   28|       |    #[arg(long, value_name = "NEW_NAME")]
   29|       |    pub new_name: String,
   30|       |    #[arg(long)]
   31|       |    pub namespace: Option<String>,
   32|       |    #[arg(long, value_enum, default_value = "json")]
   33|       |    pub format: OutputFormat,
   34|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   35|       |    pub json: bool,
   36|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   37|       |    pub db: Option<String>,
   38|       |}
   39|       |
   40|       |#[derive(Serialize)]
   41|       |struct RenameEntityResponse {
   42|       |    action: String,
   43|       |    old_name: String,
   44|       |    new_name: String,
   45|       |    entity_id: i64,
   46|       |    namespace: String,
   47|       |    elapsed_ms: u64,
   48|       |}
   49|       |
   50|      0|pub fn run(args: RenameEntityArgs) -> Result<(), AppError> {
   51|      0|    let start = std::time::Instant::now();
   52|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
   53|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
   54|       |
   55|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
   56|       |
   57|      0|    let mut conn = open_rw(&paths.db)?;
   58|       |
   59|       |    // Verify source entity exists and fetch its id and type.
   60|       |    // Normalize the lookup name to match the normalized stored names.
   61|      0|    let lookup_name = crate::parsers::normalize_entity_name(&args.name);
   62|      0|    let row: Option<(i64, EntityType)> = {
   63|      0|        let mut stmt = conn
   64|      0|            .prepare_cached("SELECT id, type FROM entities WHERE namespace = ?1 AND name = ?2")?;
   65|      0|        match stmt.query_row(params![namespace, lookup_name], |r| {
   66|      0|            Ok((r.get::<_, i64>(0)?, r.get::<_, EntityType>(1)?))
   67|      0|        }) {
   68|      0|            Ok(row) => Some(row),
   69|      0|            Err(rusqlite::Error::QueryReturnedNoRows) => None,
   70|      0|            Err(e) => return Err(AppError::Database(e)),
   71|       |        }
   72|       |    };
   73|      0|    let (entity_id, entity_type) = row
   74|      0|        .ok_or_else(|| AppError::NotFound(errors_msg::entity_not_found(&args.name, &namespace)))?;
   75|       |
   76|       |    // Validate the raw new name first (catches short ALL_CAPS NER noise),
   77|       |    // then normalize it for storage to preserve the normalized-name invariant.
   78|      0|    entities::validate_entity_name(&args.new_name)?;
   79|      0|    let new_name = crate::parsers::normalize_entity_name(&args.new_name);
   80|       |
   81|      0|    if lookup_name == new_name {
   82|      0|        return Err(AppError::Validation(
   83|      0|            "source and target entity names are identical".to_string(),
   84|      0|        ));
   85|      0|    }
   86|       |
   87|       |    // Ensure new name is not already taken in this namespace.
   88|      0|    if entities::find_entity_id(&conn, &namespace, &new_name)?.is_some() {
   89|      0|        return Err(AppError::Validation(format!(
   90|      0|            "entity with name '{new_name}' already exists in namespace '{namespace}'"
   91|      0|        )));
   92|      0|    }
   93|       |
   94|       |    // Embed the normalized new name for vec_entities replacement.
   95|      0|    let embedding = crate::daemon::embed_passage_or_local(&paths.models, &new_name)?;
   96|       |
   97|      0|    let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
   98|      0|    tx.execute(
   99|      0|        "UPDATE entities SET name = ?1, updated_at = unixepoch() WHERE id = ?2",
  100|      0|        params![new_name, entity_id],
  101|      0|    )?;
  102|       |    // vec0 does not support UPDATE — delete then insert.
  103|      0|    tx.execute(
  104|      0|        "DELETE FROM vec_entities WHERE entity_id = ?1",
  105|      0|        params![entity_id],
  106|      0|    )?;
  107|      0|    let embedding_bytes = crate::embedder::f32_to_bytes(&embedding);
  108|      0|    tx.execute(
  109|      0|        "INSERT INTO vec_entities(entity_id, namespace, type, embedding, name)
  110|      0|         VALUES (?1, ?2, ?3, ?4, ?5)",
  111|      0|        params![
  112|      0|            entity_id,
  113|      0|            namespace,
  114|      0|            entity_type,
  115|      0|            &embedding_bytes,
  116|      0|            new_name
  117|      0|        ],
  118|      0|    )?;
  119|      0|    tx.commit()?;
  120|       |
  121|      0|    conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
  122|       |
  123|      0|    let response = RenameEntityResponse {
  124|      0|        action: "renamed".to_string(),
  125|      0|        old_name: args.name,
  126|      0|        new_name,
  127|      0|        entity_id,
  128|      0|        namespace: namespace.clone(),
  129|      0|        elapsed_ms: start.elapsed().as_millis() as u64,
  130|      0|    };
  131|       |
  132|      0|    match args.format {
  133|      0|        OutputFormat::Json => output::emit_json(&response)?,
  134|      0|        OutputFormat::Text | OutputFormat::Markdown => {
  135|      0|            output::emit_text(&format!(
  136|      0|                "renamed entity: '{}' → '{}' [{}]",
  137|      0|                response.old_name, response.new_name, response.namespace
  138|      0|            ));
  139|      0|        }
  140|       |    }
  141|       |
  142|      0|    Ok(())
  143|      0|}
  144|       |
  145|       |#[cfg(test)]
  146|       |mod tests {
  147|       |    use super::*;
  148|       |
  149|       |    #[test]
  150|      1|    fn rename_entity_response_serializes_all_fields() {
  151|      1|        let resp = RenameEntityResponse {
  152|      1|            action: "renamed".to_string(),
  153|      1|            old_name: "auth".to_string(),
  154|      1|            new_name: "authentication".to_string(),
  155|      1|            entity_id: 42,
  156|      1|            namespace: "global".to_string(),
  157|      1|            elapsed_ms: 7,
  158|      1|        };
  159|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  160|      1|        assert_eq!(json["action"], "renamed");
  161|      1|        assert_eq!(json["old_name"], "auth");
  162|      1|        assert_eq!(json["new_name"], "authentication");
  163|      1|        assert_eq!(json["entity_id"], 42);
  164|      1|        assert_eq!(json["namespace"], "global");
  165|      1|        assert!(json["elapsed_ms"].is_number());
  166|      1|    }
  167|       |
  168|       |    #[test]
  169|      1|    fn rename_entity_response_action_is_renamed() {
  170|      1|        let resp = RenameEntityResponse {
  171|      1|            action: "renamed".to_string(),
  172|      1|            old_name: "x".to_string(),
  173|      1|            new_name: "y".to_string(),
  174|      1|            entity_id: 1,
  175|      1|            namespace: "ns".to_string(),
  176|      1|            elapsed_ms: 1,
  177|      1|        };
  178|      1|        assert_eq!(resp.action, "renamed");
  179|      1|    }
  180|       |
  181|       |    #[test]
  182|      1|    fn rename_entity_response_entity_id_preserved() {
  183|      1|        let resp = RenameEntityResponse {
  184|      1|            action: "renamed".to_string(),
  185|      1|            old_name: "old".to_string(),
  186|      1|            new_name: "new".to_string(),
  187|      1|            entity_id: 999,
  188|      1|            namespace: "test-ns".to_string(),
  189|      1|            elapsed_ms: 5,
  190|      1|        };
  191|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  192|      1|        assert_eq!(json["entity_id"], 999);
  193|      1|    }
  194|       |
  195|       |    #[test]
  196|      1|    fn rejects_rename_entity_to_same_name() {
  197|       |        use crate::errors::AppError;
  198|      1|        let err = AppError::Validation("source and target entity names are identical".to_string());
  199|      1|        assert_eq!(err.exit_code(), 1);
  200|      1|        assert!(err.to_string().contains("identical"));
  201|      1|    }
  202|       |
  203|       |    #[test]
  204|      1|    fn rename_entity_response_namespace_reflected() {
  205|      1|        let resp = RenameEntityResponse {
  206|      1|            action: "renamed".to_string(),
  207|      1|            old_name: "a".to_string(),
  208|      1|            new_name: "b".to_string(),
  209|      1|            entity_id: 10,
  210|      1|            namespace: "my-project".to_string(),
  211|      1|            elapsed_ms: 2,
  212|      1|        };
  213|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  214|      1|        assert_eq!(json["namespace"], "my-project");
  215|      1|    }
  216|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/restore.rs:
    1|       |//! Handler for the `restore` CLI subcommand.
    2|       |
    3|       |use crate::errors::AppError;
    4|       |use crate::i18n::errors_msg;
    5|       |use crate::output;
    6|       |use crate::output::JsonOutputFormat;
    7|       |use crate::paths::AppPaths;
    8|       |use crate::storage::connection::open_rw;
    9|       |use crate::storage::memories;
   10|       |use crate::storage::versions;
   11|       |use rusqlite::params;
   12|       |use rusqlite::OptionalExtension;
   13|       |use serde::Serialize;
   14|       |
   15|       |#[derive(clap::Args)]
   16|       |#[command(after_long_help = "EXAMPLES:\n  \
   17|       |    # Restore the latest non-`restore` version of a memory\n  \
   18|       |    sqlite-graphrag restore --name onboarding\n\n  \
   19|       |    # Restore a specific version\n  \
   20|       |    sqlite-graphrag restore --name onboarding --version 3\n\n  \
   21|       |    # Restore within a specific namespace\n  \
   22|       |    sqlite-graphrag restore --name onboarding --namespace my-project")]
   23|       |pub struct RestoreArgs {
   24|       |    /// Memory name as a positional argument. Alternative to `--name`.
   25|       |    #[arg(
   26|       |        value_name = "NAME",
   27|       |        conflicts_with = "name",
   28|       |        help = "Memory name to restore; alternative to --name"
   29|       |    )]
   30|       |    pub name_positional: Option<String>,
   31|       |    /// Memory name to restore (must exist, including soft-deleted/forgotten).
   32|       |    #[arg(long)]
   33|       |    pub name: Option<String>,
   34|       |    /// Version to restore. When omitted, defaults to the latest non-`restore` version
   35|       |    /// from `memory_versions`. This makes the forget+restore workflow work without
   36|       |    /// requiring the user to discover the version first.
   37|       |    #[arg(long)]
   38|       |    pub version: Option<i64>,
   39|       |    #[arg(
   40|       |        long,
   41|       |        help = "Namespace (env: SQLITE_GRAPHRAG_NAMESPACE, default: global)"
   42|       |    )]
   43|       |    pub namespace: Option<String>,
   44|       |    /// Optimistic locking: reject if the current updated_at does not match (exit 3).
   45|       |    #[arg(
   46|       |        long,
   47|       |        value_name = "EPOCH_OR_RFC3339",
   48|       |        value_parser = crate::parsers::parse_expected_updated_at,
   49|       |        long_help = "Optimistic lock: reject if updated_at does not match. \
   50|       |Accepts Unix epoch (e.g. 1700000000) or RFC 3339 (e.g. 2026-04-19T12:00:00Z)."
   51|       |    )]
   52|       |    pub expected_updated_at: Option<i64>,
   53|       |    /// Output format.
   54|       |    #[arg(long, value_enum, default_value_t = JsonOutputFormat::Json)]
   55|       |    pub format: JsonOutputFormat,
   56|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   57|       |    pub json: bool,
   58|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   59|       |    pub db: Option<String>,
   60|       |}
   61|       |
   62|       |#[derive(Serialize)]
   63|       |struct RestoreResponse {
   64|       |    /// Always `"restored"` — signals the completed action to shell callers and LLM agents.
   65|       |    action: String,
   66|       |    memory_id: i64,
   67|       |    name: String,
   68|       |    version: i64,
   69|       |    restored_from: i64,
   70|       |    /// Total execution time in milliseconds from handler start to serialisation.
   71|       |    elapsed_ms: u64,
   72|       |}
   73|       |
   74|      0|pub fn run(args: RestoreArgs) -> Result<(), AppError> {
   75|      0|    let start = std::time::Instant::now();
   76|      0|    let _ = args.format;
   77|      0|    tracing::debug!(target: "restore", name = ?args.name_positional.as_deref().or(args.name.as_deref()), version = ?args.version, "restoring version");
   78|      0|    let name = args
   79|      0|        .name_positional
   80|      0|        .as_deref()
   81|      0|        .or(args.name.as_deref())
   82|      0|        .ok_or_else(|| {
   83|      0|            AppError::Validation(
   84|      0|                "name required: pass as positional argument or via --name".to_string(),
   85|      0|            )
   86|      0|        })?
   87|      0|        .to_string();
   88|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
   89|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
   90|      0|    let mut conn = open_rw(&paths.db)?;
   91|       |
   92|       |    // PRD line 1118: query WITHOUT a deleted_at filter — restore must work on soft-deleted memories
   93|      0|    let result: Option<(i64, i64)> = conn
   94|      0|        .query_row(
   95|      0|            "SELECT id, updated_at FROM memories WHERE namespace = ?1 AND name = ?2",
   96|      0|            params![namespace, name],
   97|      0|            |r| Ok((r.get(0)?, r.get(1)?)),
   98|       |        )
   99|      0|        .optional()?;
  100|      0|    let (memory_id, current_updated_at) = result
  101|      0|        .ok_or_else(|| AppError::NotFound(errors_msg::memory_not_found(&name, &namespace)))?;
  102|       |
  103|      0|    if let Some(expected) = args.expected_updated_at {
  104|      0|        if expected != current_updated_at {
  105|      0|            return Err(AppError::Conflict(errors_msg::optimistic_lock_conflict(
  106|      0|                expected,
  107|      0|                current_updated_at,
  108|      0|            )));
  109|      0|        }
  110|      0|    }
  111|       |
  112|       |    // v1.0.22 P0: resolve optional `--version`. When absent, uses the highest version
  113|       |    // whose `change_reason` is not 'restore' (recovers the real state, not meta-restore).
  114|       |    // Lets the forget+restore workflow function without manually reading memory_versions.
  115|      0|    let target_version: i64 = match args.version {
  116|      0|        Some(v) => v,
  117|       |        None => {
  118|      0|            let last: Option<i64> = conn
  119|      0|                .query_row(
  120|      0|                    "SELECT MAX(version) FROM memory_versions
  121|      0|                     WHERE memory_id = ?1 AND change_reason != 'restore'",
  122|      0|                    params![memory_id],
  123|      0|                    |r| r.get(0),
  124|       |                )
  125|      0|                .optional()?
  126|      0|                .flatten();
  127|      0|            let v = last.ok_or_else(|| {
  128|      0|                AppError::NotFound(errors_msg::memory_not_found(&name, &namespace))
  129|      0|            })?;
  130|      0|            tracing::info!(target: "restore",
  131|      0|                "restore --version omitted; using latest non-restore version: {}",
  132|       |                v
  133|       |            );
  134|      0|            v
  135|       |        }
  136|       |    };
  137|       |
  138|      0|    let version_row: (String, String, String, String, String) = {
  139|      0|        let mut stmt = conn.prepare_cached(
  140|      0|            "SELECT name, type, description, body, metadata
  141|      0|             FROM memory_versions
  142|      0|             WHERE memory_id = ?1 AND version = ?2",
  143|      0|        )?;
  144|       |
  145|      0|        stmt.query_row(params![memory_id, target_version], |r| {
  146|      0|            Ok((r.get(0)?, r.get(1)?, r.get(2)?, r.get(3)?, r.get(4)?))
  147|      0|        })
  148|      0|        .map_err(|_| AppError::NotFound(errors_msg::version_not_found(target_version, &name)))?
  149|       |    };
  150|       |
  151|      0|    let (_old_name, old_type, old_description, old_body, old_metadata) = version_row;
  152|       |
  153|       |    // Read current FTS-indexed values before the UPDATE so sync_fts_after_update
  154|       |    // can issue the correct DELETE command for the external-content FTS5 table.
  155|      0|    let (cur_name, cur_desc, cur_body): (String, String, String) = conn.query_row(
  156|      0|        "SELECT name, description, body FROM memories WHERE id = ?1",
  157|      0|        params![memory_id],
  158|      0|        |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?)),
  159|      0|    )?;
  160|       |
  161|       |    // v1.0.21 P1-D: re-embed restored body to keep `vec_memories` synchronized
  162|       |    // with `memories`. Without this, semantic queries used the post-forget version
  163|       |    // vector, causing inconsistent recall (vec_memories=2 vs memories=3 after forget+restore).
  164|      0|    output::emit_progress_i18n(
  165|      0|        "Re-computing embedding for restored memory...",
  166|      0|        crate::i18n::validation::runtime_pt::restore_recomputing_embedding(),
  167|       |    );
  168|      0|    let embedding = crate::daemon::embed_passage_or_local(&paths.models, &old_body)?;
  169|      0|    let snippet: String = old_body.chars().take(300).collect();
  170|       |
  171|      0|    let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
  172|       |
  173|       |    // deleted_at = NULL reactivates soft-deleted memories; no deleted_at filter in the WHERE
  174|      0|    let affected = if let Some(ts) = args.expected_updated_at {
  175|      0|        tx.execute(
  176|      0|            "UPDATE memories SET type=?2, description=?3, body=?4, body_hash=?5, deleted_at=NULL
  177|      0|             WHERE id=?1 AND updated_at=?6",
  178|      0|            rusqlite::params![
  179|      0|                memory_id,
  180|      0|                old_type,
  181|      0|                old_description,
  182|      0|                old_body,
  183|      0|                blake3::hash(old_body.as_bytes()).to_hex().to_string(),
  184|      0|                ts
  185|      0|            ],
  186|      0|        )?
  187|       |    } else {
  188|      0|        tx.execute(
  189|      0|            "UPDATE memories SET type=?2, description=?3, body=?4, body_hash=?5, deleted_at=NULL
  190|      0|             WHERE id=?1",
  191|      0|            rusqlite::params![
  192|      0|                memory_id,
  193|      0|                old_type,
  194|      0|                old_description,
  195|      0|                old_body,
  196|      0|                blake3::hash(old_body.as_bytes()).to_hex().to_string()
  197|      0|            ],
  198|      0|        )?
  199|       |    };
  200|       |
  201|      0|    if affected == 0 {
  202|      0|        return Err(AppError::Conflict(errors_msg::concurrent_process_conflict()));
  203|      0|    }
  204|       |
  205|      0|    let next_v = versions::next_version(&tx, memory_id)?;
  206|       |
  207|      0|    versions::insert_version(
  208|      0|        &tx,
  209|      0|        memory_id,
  210|      0|        next_v,
  211|      0|        &cur_name,
  212|      0|        &old_type,
  213|      0|        &old_description,
  214|      0|        &old_body,
  215|      0|        &old_metadata,
  216|      0|        None,
  217|      0|        "restore",
  218|      0|    )?;
  219|       |
  220|      0|    memories::upsert_vec(
  221|      0|        &tx, memory_id, &namespace, &old_type, &embedding, &cur_name, &snippet,
  222|      0|    )?;
  223|       |
  224|      0|    memories::sync_fts_after_update(
  225|      0|        &tx,
  226|      0|        memory_id,
  227|      0|        &cur_name,
  228|      0|        &cur_desc,
  229|      0|        &cur_body,
  230|      0|        &cur_name,
  231|      0|        &old_description,
  232|      0|        &old_body,
  233|      0|    )?;
  234|       |
  235|      0|    tx.commit()?;
  236|       |
  237|      0|    conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
  238|       |
  239|      0|    output::emit_json(&RestoreResponse {
  240|      0|        action: "restored".to_string(),
  241|      0|        memory_id,
  242|      0|        name: cur_name.clone(),
  243|      0|        version: next_v,
  244|      0|        restored_from: target_version,
  245|      0|        elapsed_ms: start.elapsed().as_millis() as u64,
  246|      0|    })?;
  247|       |
  248|      0|    Ok(())
  249|      0|}
  250|       |
  251|       |#[cfg(test)]
  252|       |mod tests {
  253|       |    use crate::errors::AppError;
  254|       |
  255|       |    #[test]
  256|      1|    fn optimistic_lock_conflict_returns_exit_3() {
  257|      1|        let err = AppError::Conflict(
  258|      1|            "optimistic lock conflict: expected updated_at=50, but current is 99".to_string(),
  259|      1|        );
  260|      1|        assert_eq!(err.exit_code(), 3);
  261|      1|        assert!(err.to_string().contains("conflict"));
  262|      1|    }
  263|       |
  264|       |    #[test]
  265|      1|    fn restore_response_includes_action_field() {
  266|      1|        let resp = super::RestoreResponse {
  267|      1|            action: "restored".to_string(),
  268|      1|            memory_id: 1,
  269|      1|            name: "test-mem".to_string(),
  270|      1|            version: 3,
  271|      1|            restored_from: 2,
  272|      1|            elapsed_ms: 42,
  273|      1|        };
  274|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  275|      1|        assert_eq!(json["action"], "restored");
  276|      1|        assert_eq!(json["memory_id"], 1);
  277|      1|        assert_eq!(json["version"], 3);
  278|      1|        assert_eq!(json["restored_from"], 2);
  279|      1|    }
  280|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/stats.rs:
    1|       |//! Handler for the `stats` CLI subcommand.
    2|       |
    3|       |use crate::errors::AppError;
    4|       |use crate::output;
    5|       |use crate::paths::AppPaths;
    6|       |use crate::storage::connection::open_ro;
    7|       |use serde::Serialize;
    8|       |
    9|       |#[derive(clap::Args)]
   10|       |#[command(after_long_help = "EXAMPLES:\n  \
   11|       |    # Show database statistics (memory counts, sizes, namespace breakdown)\n  \
   12|       |    sqlite-graphrag stats\n\n  \
   13|       |    # Stats for a database at a custom path\n  \
   14|       |    sqlite-graphrag stats --db /path/to/graphrag.sqlite\n\n  \
   15|       |    # Use SQLITE_GRAPHRAG_DB_PATH env var\n  \
   16|       |    SQLITE_GRAPHRAG_DB_PATH=/data/graphrag.sqlite sqlite-graphrag stats")]
   17|       |pub struct StatsArgs {
   18|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   19|       |    pub db: Option<String>,
   20|       |    /// Explicit JSON flag. Accepted as a no-op because output is already JSON by default.
   21|       |    #[arg(long, default_value_t = false)]
   22|       |    pub json: bool,
   23|       |    /// Output format: `json` or `text`. JSON is always emitted on stdout regardless of the value.
   24|       |    #[arg(long, value_parser = ["json", "text"], hide = true)]
   25|       |    pub format: Option<String>,
   26|       |}
   27|       |
   28|       |#[derive(Serialize)]
   29|       |struct StatsResponse {
   30|       |    memories: i64,
   31|       |    /// Alias of `memories` for the documented contract in SKILL.md.
   32|       |    memories_total: i64,
   33|       |    entities: i64,
   34|       |    /// Alias of `entities` for the documented contract.
   35|       |    entities_total: i64,
   36|       |    relationships: i64,
   37|       |    /// Alias of `relationships` for the documented contract.
   38|       |    relationships_total: i64,
   39|       |    /// Semantic alias of `relationships` per the contract in SKILL.md.
   40|       |    edges: i64,
   41|       |    /// Total indexed chunks (one row per chunk in `memory_chunks`).
   42|       |    chunks_total: i64,
   43|       |    /// Average length of the body field in active (non-deleted) memories.
   44|       |    avg_body_len: f64,
   45|       |    namespaces: Vec<String>,
   46|       |    db_size_bytes: u64,
   47|       |    /// Semantic alias of `db_size_bytes` for the documented contract.
   48|       |    db_bytes: u64,
   49|       |    /// Latest applied migration number from `refinery_schema_history`.
   50|       |    /// Emitted as a JSON number for cross-command consistency with `health` (since v1.0.35).
   51|       |    /// Returns `0` when the database has no recorded migrations yet.
   52|       |    schema_version: u32,
   53|       |    /// Total execution time in milliseconds from handler start to serialisation.
   54|       |    elapsed_ms: u64,
   55|       |}
   56|       |
   57|      0|pub fn run(args: StatsArgs) -> Result<(), AppError> {
   58|      0|    let start = std::time::Instant::now();
   59|      0|    let _ = args.json; // --json is a no-op because output is already JSON by default
   60|      0|    let _ = args.format; // --format is a no-op; JSON is always emitted on stdout
   61|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
   62|       |
   63|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
   64|       |
   65|      0|    let conn = open_ro(&paths.db)?;
   66|       |
   67|      0|    let memories: i64 = conn.query_row(
   68|      0|        "SELECT COUNT(*) FROM memories WHERE deleted_at IS NULL",
   69|      0|        [],
   70|      0|        |r| r.get(0),
   71|      0|    )?;
   72|      0|    let entities: i64 = conn.query_row("SELECT COUNT(*) FROM entities", [], |r| r.get(0))?;
   73|      0|    let relationships: i64 =
   74|      0|        conn.query_row("SELECT COUNT(*) FROM relationships", [], |r| r.get(0))?;
   75|       |
   76|      0|    let mut stmt = conn.prepare_cached(
   77|      0|        "SELECT DISTINCT namespace FROM memories WHERE deleted_at IS NULL ORDER BY namespace",
   78|      0|    )?;
   79|      0|    let namespaces: Vec<String> = stmt
   80|      0|        .query_map([], |r| r.get(0))?
   81|      0|        .collect::<Result<Vec<_>, _>>()?;
   82|       |
   83|      0|    let schema_version: u32 = conn
   84|      0|        .query_row(
   85|      0|            "SELECT MAX(version) FROM refinery_schema_history",
   86|      0|            [],
   87|      0|            |row| row.get::<_, Option<i64>>(0),
   88|       |        )
   89|      0|        .ok()
   90|      0|        .flatten()
   91|      0|        .map(|v| v.max(0) as u32)
   92|      0|        .unwrap_or(0);
   93|       |
   94|      0|    let db_size_bytes = std::fs::metadata(&paths.db).map(|m| m.len()).unwrap_or(0);
   95|       |
   96|       |    // v1.0.21 P1-C: query uses the (correct) `memory_chunks` table.
   97|       |    // If the table does not exist (legacy pre-chunking DB), the error is "no such table"
   98|       |    // and the fallback returns 0. Other errors are logged via tracing for audit.
   99|      0|    let chunks_total: i64 = match conn.query_row("SELECT COUNT(*) FROM memory_chunks", [], |r| {
  100|      0|        r.get::<_, i64>(0)
  101|      0|    }) {
  102|      0|        Ok(n) => n,
  103|      0|        Err(rusqlite::Error::SqliteFailure(_, Some(msg))) if msg.contains("no such table") => 0,
  104|      0|        Err(e) => {
  105|      0|            tracing::warn!(target: "stats", error = %e, "memory_chunks count failed");
  106|      0|            0
  107|       |        }
  108|       |    };
  109|       |
  110|      0|    let avg_body_len: f64 = conn
  111|      0|        .query_row(
  112|      0|            "SELECT COALESCE(AVG(LENGTH(body)), 0.0) FROM memories WHERE deleted_at IS NULL",
  113|      0|            [],
  114|      0|            |r| r.get(0),
  115|       |        )
  116|      0|        .unwrap_or(0.0);
  117|       |
  118|      0|    output::emit_json(&StatsResponse {
  119|      0|        memories,
  120|      0|        memories_total: memories,
  121|      0|        entities,
  122|      0|        entities_total: entities,
  123|      0|        relationships,
  124|      0|        relationships_total: relationships,
  125|      0|        edges: relationships,
  126|      0|        chunks_total,
  127|      0|        avg_body_len,
  128|      0|        namespaces,
  129|      0|        db_size_bytes,
  130|      0|        db_bytes: db_size_bytes,
  131|      0|        schema_version,
  132|      0|        elapsed_ms: start.elapsed().as_millis() as u64,
  133|      0|    })?;
  134|       |
  135|      0|    Ok(())
  136|      0|}
  137|       |
  138|       |#[cfg(test)]
  139|       |mod tests {
  140|       |    use super::*;
  141|       |
  142|       |    #[test]
  143|      1|    fn stats_response_serializes_all_fields() {
  144|      1|        let resp = StatsResponse {
  145|      1|            memories: 10,
  146|      1|            memories_total: 10,
  147|      1|            entities: 5,
  148|      1|            entities_total: 5,
  149|      1|            relationships: 3,
  150|      1|            relationships_total: 3,
  151|      1|            edges: 3,
  152|      1|            chunks_total: 20,
  153|      1|            avg_body_len: 42.5,
  154|      1|            namespaces: vec!["global".to_string(), "project".to_string()],
  155|      1|            db_size_bytes: 8192,
  156|      1|            db_bytes: 8192,
  157|      1|            schema_version: 6,
  158|      1|            elapsed_ms: 7,
  159|      1|        };
  160|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  161|      1|        assert_eq!(json["memories"], 10);
  162|      1|        assert_eq!(json["memories_total"], 10);
  163|      1|        assert_eq!(json["entities"], 5);
  164|      1|        assert_eq!(json["entities_total"], 5);
  165|      1|        assert_eq!(json["relationships"], 3);
  166|      1|        assert_eq!(json["relationships_total"], 3);
  167|      1|        assert_eq!(json["edges"], 3);
  168|      1|        assert_eq!(json["chunks_total"], 20);
  169|      1|        assert_eq!(json["db_size_bytes"], 8192u64);
  170|      1|        assert_eq!(json["db_bytes"], 8192u64);
  171|      1|        assert_eq!(json["schema_version"], 6);
  172|      1|        assert_eq!(json["elapsed_ms"], 7u64);
  173|      1|    }
  174|       |
  175|       |    #[test]
  176|      1|    fn stats_response_namespaces_is_string_array() {
  177|      1|        let resp = StatsResponse {
  178|      1|            memories: 0,
  179|      1|            memories_total: 0,
  180|      1|            entities: 0,
  181|      1|            entities_total: 0,
  182|      1|            relationships: 0,
  183|      1|            relationships_total: 0,
  184|      1|            edges: 0,
  185|      1|            chunks_total: 0,
  186|      1|            avg_body_len: 0.0,
  187|      1|            namespaces: vec!["ns1".to_string(), "ns2".to_string(), "ns3".to_string()],
  188|      1|            db_size_bytes: 0,
  189|      1|            db_bytes: 0,
  190|      1|            schema_version: 0,
  191|      1|            elapsed_ms: 0,
  192|      1|        };
  193|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  194|      1|        let arr = json["namespaces"]
  195|      1|            .as_array()
  196|      1|            .expect("namespaces must be array");
  197|      1|        assert_eq!(arr.len(), 3);
  198|      1|        assert_eq!(arr[0], "ns1");
  199|      1|        assert_eq!(arr[1], "ns2");
  200|      1|        assert_eq!(arr[2], "ns3");
  201|      1|    }
  202|       |
  203|       |    #[test]
  204|      1|    fn stats_response_namespaces_empty_serializes_empty_array() {
  205|      1|        let resp = StatsResponse {
  206|      1|            memories: 0,
  207|      1|            memories_total: 0,
  208|      1|            entities: 0,
  209|      1|            entities_total: 0,
  210|      1|            relationships: 0,
  211|      1|            relationships_total: 0,
  212|      1|            edges: 0,
  213|      1|            chunks_total: 0,
  214|      1|            avg_body_len: 0.0,
  215|      1|            namespaces: vec![],
  216|      1|            db_size_bytes: 0,
  217|      1|            db_bytes: 0,
  218|      1|            schema_version: 0,
  219|      1|            elapsed_ms: 0,
  220|      1|        };
  221|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  222|      1|        let arr = json["namespaces"]
  223|      1|            .as_array()
  224|      1|            .expect("namespaces must be array");
  225|      1|        assert!(arr.is_empty(), "empty namespaces must serialize as []");
                                              ^0
  226|      1|    }
  227|       |
  228|       |    #[test]
  229|      1|    fn stats_response_aliases_memories_total_and_memories_equal() {
  230|      1|        let resp = StatsResponse {
  231|      1|            memories: 42,
  232|      1|            memories_total: 42,
  233|      1|            entities: 7,
  234|      1|            entities_total: 7,
  235|      1|            relationships: 2,
  236|      1|            relationships_total: 2,
  237|      1|            edges: 2,
  238|      1|            chunks_total: 0,
  239|      1|            avg_body_len: 0.0,
  240|      1|            namespaces: vec![],
  241|      1|            db_size_bytes: 0,
  242|      1|            db_bytes: 0,
  243|      1|            schema_version: 6,
  244|      1|            elapsed_ms: 0,
  245|      1|        };
  246|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  247|      1|        assert_eq!(json["memories"], json["memories_total"]);
  248|      1|        assert_eq!(json["entities"], json["entities_total"]);
  249|      1|        assert_eq!(json["relationships"], json["relationships_total"]);
  250|      1|        assert_eq!(json["relationships"], json["edges"]);
  251|      1|        assert_eq!(json["db_size_bytes"], json["db_bytes"]);
  252|      1|    }
  253|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/sync_safe_copy.rs:
    1|       |//! Handler for the `sync-safe-copy` CLI subcommand.
    2|       |
    3|       |use crate::errors::AppError;
    4|       |use crate::i18n::validation;
    5|       |use crate::output;
    6|       |use crate::paths::AppPaths;
    7|       |use crate::storage::connection::open_rw;
    8|       |use serde::Serialize;
    9|       |
   10|       |#[derive(clap::Args)]
   11|       |#[command(after_long_help = "EXAMPLES:\n  \
   12|       |    # Create a checkpointed snapshot safe for cloud sync\n  \
   13|       |    sqlite-graphrag sync-safe-copy --dest /backup/graphrag-snapshot.sqlite\n\n  \
   14|       |    # Use the --to alias\n  \
   15|       |    sqlite-graphrag sync-safe-copy --to /backup/graphrag-snapshot.sqlite\n\n  \
   16|       |    # Snapshot a custom source database\n  \
   17|       |    sqlite-graphrag sync-safe-copy --db /data/graphrag.sqlite --dest /backup/snapshot.sqlite")]
              // Command-line arguments of `sync-safe-copy`.
              // The destination can be supplied either positionally or through `--dest`
              // (aliases `--to`, `--output`); `conflicts_with` rejects supplying both, and
              // `run` reports a validation error when neither is present.
              // NOTE: plain `//` comments are used on purpose here, because `///` comments
              // on a clap derive struct become part of the generated help text.
   18|       |pub struct SyncSafeCopyArgs {
   19|       |    /// Snapshot destination path as a positional argument. Alternative to `--dest`.
   20|       |    #[arg(
   21|       |        value_name = "DEST",
   22|       |        conflicts_with = "dest",
   23|       |        help = "Snapshot destination path; alternative to --dest"
   24|       |    )]
   25|       |    pub dest_positional: Option<std::path::PathBuf>,
   26|       |    /// Snapshot destination path. Also accepts the aliases `--to` and `--output`.
   27|       |    #[arg(long, alias = "to", alias = "output")]
   28|       |    pub dest: Option<std::path::PathBuf>,
                  // Hidden flag kept so `--json` is accepted; `run` never reads its value.
   29|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   30|       |    pub json: bool,
   31|       |    /// Output format: `json` or `text`. JSON is always emitted on stdout regardless of the value.
   32|       |    #[arg(long, value_parser = ["json", "text"], hide = true)]
   33|       |    pub format: Option<String>,
                  // Source database path; clap falls back to the SQLITE_GRAPHRAG_DB_PATH
                  // environment variable, and `run` hands the value to `AppPaths::resolve`.
   34|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   35|       |    pub db: Option<String>,
   36|       |}
   37|       |
   38|       |#[derive(Serialize)]
              /// JSON payload emitted on stdout after a successful `sync-safe-copy`.
   39|       |struct SyncSafeCopyResponse {
                  /// Display form of the resolved source database path.
   40|       |    source_db_path: String,
                  /// Display form of the snapshot destination path.
   41|       |    dest_path: String,
                  /// Number of bytes reported by `std::fs::copy` for the snapshot.
   42|       |    bytes_copied: u64,
                  /// Outcome marker; the handler always sets it to `"ok"`.
   43|       |    status: String,
   44|       |    /// Total execution time in milliseconds from handler start to serialisation.
   45|       |    elapsed_ms: u64,
   46|       |}
   47|       |
   48|      0|pub fn run(args: SyncSafeCopyArgs) -> Result<(), AppError> {
   49|      0|    let start = std::time::Instant::now();
   50|      0|    let _ = args.format; // --format is a no-op; JSON is always emitted on stdout
   51|      0|    let dest = args
   52|      0|        .dest_positional
   53|      0|        .clone()
   54|      0|        .or_else(|| args.dest.clone())
   55|      0|        .ok_or_else(|| {
   56|      0|            AppError::Validation(
   57|      0|                "destination required: pass as positional argument or via --dest/--to/--output"
   58|      0|                    .to_string(),
   59|      0|            )
   60|      0|        })?;
   61|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
   62|       |
   63|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
   64|       |
   65|      0|    if dest == paths.db {
   66|      0|        return Err(AppError::Validation(
   67|      0|            validation::sync_destination_equals_source(),
   68|      0|        ));
   69|      0|    }
   70|       |
   71|      0|    if let Some(parent) = dest.parent() {
   72|      0|        std::fs::create_dir_all(parent)?;
   73|      0|    }
   74|       |
   75|      0|    let conn = open_rw(&paths.db)?;
   76|      0|    conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
   77|      0|    drop(conn);
   78|       |
   79|      0|    let bytes_copied = std::fs::copy(&paths.db, &dest)?;
   80|       |
   81|       |    // Applies 0600 permissions on the snapshot on Unix to avoid leakage on Dropbox/shared NFS.
   82|       |    // On Windows, NTFS DACL default is private-to-user; no explicit permission setter required.
   83|       |    #[cfg(unix)]
   84|       |    {
   85|       |        use std::os::unix::fs::PermissionsExt;
   86|      0|        let mut perms = std::fs::metadata(&dest)?.permissions();
   87|      0|        perms.set_mode(0o600);
   88|      0|        std::fs::set_permissions(&dest, perms)?;
   89|       |    }
   90|       |    #[cfg(windows)]
   91|       |    {
   92|       |        tracing::debug!(target: "sync_safe_copy",
   93|       |            path = %dest.display(),
   94|       |            "skipping Unix mode 0o600 on Windows; NTFS DACL default is private-to-user"
   95|       |        );
   96|       |    }
   97|       |
   98|      0|    output::emit_json(&SyncSafeCopyResponse {
   99|      0|        source_db_path: paths.db.display().to_string(),
  100|      0|        dest_path: dest.display().to_string(),
  101|      0|        bytes_copied,
  102|      0|        status: "ok".to_string(),
  103|      0|        elapsed_ms: start.elapsed().as_millis() as u64,
  104|      0|    })?;
  105|       |
  106|      0|    Ok(())
  107|      0|}
  108|       |
  109|       |#[cfg(test)]
  110|       |mod tests {
  111|       |    use super::*;
  112|       |
  113|       |    #[test]
  114|      1|    fn sync_safe_copy_response_serializes_all_fields() {
  115|      1|        let resp = SyncSafeCopyResponse {
  116|      1|            source_db_path: "/home/user/.local/share/sqlite-graphrag/db.sqlite".to_string(),
  117|      1|            dest_path: "/tmp/backup.sqlite".to_string(),
  118|      1|            bytes_copied: 16384,
  119|      1|            status: "ok".to_string(),
  120|      1|            elapsed_ms: 12,
  121|      1|        };
  122|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  123|      1|        assert_eq!(
  124|      1|            json["source_db_path"],
  125|       |            "/home/user/.local/share/sqlite-graphrag/db.sqlite"
  126|       |        );
  127|      1|        assert_eq!(json["dest_path"], "/tmp/backup.sqlite");
  128|      1|        assert_eq!(json["bytes_copied"], 16384u64);
  129|      1|        assert_eq!(json["status"], "ok");
  130|      1|        assert_eq!(json["elapsed_ms"], 12u64);
  131|      1|    }
  132|       |
  133|       |    #[test]
  134|      1|    fn sync_safe_copy_rejects_dest_equal_to_source() {
  135|      1|        let db_path = std::path::PathBuf::from("/tmp/same.sqlite");
  136|      1|        let args = SyncSafeCopyArgs {
  137|      1|            dest_positional: None,
  138|      1|            dest: Some(db_path.clone()),
  139|      1|            json: false,
  140|      1|            format: None,
  141|      1|            db: Some("/tmp/same.sqlite".to_string()),
  142|      1|        };
  143|       |        // Simulates manual path resolution — validates rejection logic
  144|      1|        let resolved_dest = args
  145|      1|            .dest_positional
  146|      1|            .clone()
  147|      1|            .or_else(|| args.dest.clone())
  148|      1|            .expect("test must pass dest");
  149|      1|        let result = if resolved_dest == std::path::PathBuf::from(args.db.as_deref().unwrap_or(""))
  150|       |        {
  151|      1|            Err(AppError::Validation(
  152|      1|                "destination path must differ from the source database path".to_string(),
  153|      1|            ))
  154|       |        } else {
  155|      0|            Ok(())
  156|       |        };
  157|      1|        assert!(result.is_err(), "must reject dest equal to source");
                                               ^0
  158|      1|        if let Err(AppError::Validation(msg)) = result {
  159|      1|            assert!(msg.contains("destination path must differ"));
  160|      0|        }
  161|      1|    }
  162|       |
  163|       |    #[test]
  164|      1|    fn sync_safe_copy_response_status_ok() {
  165|      1|        let resp = SyncSafeCopyResponse {
  166|      1|            source_db_path: "/data/db.sqlite".to_string(),
  167|      1|            dest_path: "/backup/db.sqlite".to_string(),
  168|      1|            bytes_copied: 0,
  169|      1|            status: "ok".to_string(),
  170|      1|            elapsed_ms: 0,
  171|      1|        };
  172|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  173|      1|        assert_eq!(json["status"], "ok");
  174|      1|    }
  175|       |
  176|       |    #[test]
  177|      1|    fn sync_safe_copy_response_bytes_copied_zero_valid() {
  178|      1|        let resp = SyncSafeCopyResponse {
  179|      1|            source_db_path: "/data/db.sqlite".to_string(),
  180|      1|            dest_path: "/backup/db.sqlite".to_string(),
  181|      1|            bytes_copied: 0,
  182|      1|            status: "ok".to_string(),
  183|      1|            elapsed_ms: 1,
  184|      1|        };
  185|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  186|      1|        assert_eq!(json["bytes_copied"], 0u64);
  187|      1|        assert_eq!(json["elapsed_ms"], 1u64);
  188|      1|    }
  189|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/unlink.rs:
    1|       |//! Handler for the `unlink` CLI subcommand.
    2|       |
    3|       |use crate::errors::AppError;
    4|       |use crate::i18n::errors_msg;
    5|       |use crate::output::{self, OutputFormat};
    6|       |use crate::paths::AppPaths;
    7|       |use crate::storage::connection::open_rw;
    8|       |use crate::storage::entities;
    9|       |use serde::Serialize;
   10|       |
   11|       |#[derive(clap::Args)]
   12|       |#[command(after_long_help = "EXAMPLES:\n  \
   13|       |    # Remove a specific relationship between two entities\n  \
   14|       |    sqlite-graphrag unlink --from oauth-flow --to refresh-tokens --relation related\n\n  \
   15|       |    # Remove ALL relationships between two entities (any relation type)\n  \
   16|       |    sqlite-graphrag unlink --from oauth-flow --to refresh-tokens\n\n  \
   17|       |    # Remove ALL relationships where an entity is source or target\n  \
   18|       |    sqlite-graphrag unlink --entity oauth-flow --all\n\n  \
   19|       |NOTE:\n  \
   20|       |    --from and --to expect ENTITY names (graph nodes), not memory names.\n  \
   21|       |    To inspect current entities and relationships, run: sqlite-graphrag graph --format json")]
              // Command-line arguments of `unlink`. Two mutually exclusive modes exist:
              //   * `--from`/`--to` (optionally narrowed by `--relation`), and
              //   * `--entity` together with `--all`.
              // The `conflicts_with`/`requires` attributes below enforce that split at parse time.
              // NOTE: plain `//` comments are used on purpose here, because `///` comments
              // on a clap derive struct become part of the generated help text.
   22|       |pub struct UnlinkArgs {
   23|       |    /// Source ENTITY name (graph node, not memory). Also accepts the aliases `--source` and `--name`.
   24|       |    /// To list current entities run `graph --format json | jaq '.nodes[].name'`.
   25|       |    #[arg(long, alias = "source", alias = "name", conflicts_with = "entity")]
   26|       |    pub from: Option<String>,
   27|       |    /// Target ENTITY name (graph node, not memory). Also accepts the alias `--target`.
   28|       |    #[arg(long, alias = "target", conflicts_with = "entity")]
   29|       |    pub to: Option<String>,
   30|       |    /// Relation type to remove. When omitted with --from/--to, ALL relationships between
   31|       |    /// those two entities are deleted. Accepts canonical values (e.g. uses, depends-on)
   32|       |    /// or any custom snake_case/kebab-case string.
   33|       |    #[arg(long, value_parser = crate::parsers::parse_relation, value_name = "RELATION")]
   34|       |    pub relation: Option<String>,
   35|       |    /// Entity name for bulk removal. Must be combined with --all.
   36|       |    #[arg(long, requires = "all", conflicts_with_all = ["from", "to"])]
   37|       |    pub entity: Option<String>,
   38|       |    /// When combined with --entity, removes ALL relationships where that entity is source or target.
   39|       |    #[arg(long, requires = "entity")]
   40|       |    pub all: bool,
                  // Optional namespace override; `run` passes it to `resolve_namespace`.
   41|       |    #[arg(long)]
   42|       |    pub namespace: Option<String>,
                  // Output format, `json` by default; `run` renders Text and Markdown as the
                  // same single summary line.
   43|       |    #[arg(long, value_enum, default_value = "json")]
   44|       |    pub format: OutputFormat,
                  // Hidden flag kept so `--json` is accepted; `run` never reads its value.
   45|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   46|       |    pub json: bool,
                  // Database path; clap falls back to the SQLITE_GRAPHRAG_DB_PATH environment
                  // variable, and `run` hands the value to `AppPaths::resolve`.
   47|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   48|       |    pub db: Option<String>,
   49|       |}
   50|       |
   51|       |#[derive(Serialize)]
              /// Summary of an `unlink` run, emitted as JSON or rendered as one text line.
   52|       |struct UnlinkResponse {
                  /// Operation marker; the handler always sets it to `"deleted"`.
   53|       |    action: String,
                  /// Source entity name, or the `--entity` name in `--entity --all` mode.
   54|       |    from_name: String,
                  /// Target entity name, or `"*"` in `--entity --all` mode.
   55|       |    to_name: String,
                  /// Relation that was removed, or `"*"` when no single relation was targeted.
   56|       |    relation: String,
                  /// Number of relationship rows removed by this invocation.
   57|       |    relationships_removed: u64,
                  /// Namespace the entities were looked up in.
   58|       |    namespace: String,
   59|       |    /// Total execution time in milliseconds from handler start to serialisation.
   60|       |    elapsed_ms: u64,
   61|       |}
   62|       |
   63|      0|pub fn run(args: UnlinkArgs) -> Result<(), AppError> {
   64|      0|    let inicio = std::time::Instant::now();
   65|      0|    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
   66|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
   67|       |
   68|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
   69|       |
   70|      0|    if let Some(relation_str) = &args.relation {
   71|      0|        crate::parsers::warn_if_non_canonical(relation_str);
   72|      0|    }
   73|       |
   74|      0|    let mut conn = open_rw(&paths.db)?;
   75|       |
   76|       |    // Mode: --entity --all → delete every relationship for that entity.
   77|      0|    if args.all {
   78|      0|        let entity_name = args.entity.as_deref().unwrap_or("");
   79|      0|        let entity_id =
   80|      0|            entities::find_entity_id(&conn, &namespace, entity_name)?.ok_or_else(|| {
   81|      0|                AppError::NotFound(errors_msg::entity_not_found(entity_name, &namespace))
   82|      0|            })?;
   83|       |
   84|      0|        let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
   85|      0|        let removed = delete_all_entity_relationships(&tx, entity_id)?;
   86|      0|        entities::recalculate_degree(&tx, entity_id)?;
   87|      0|        tx.commit()?;
   88|       |
   89|      0|        conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
   90|       |
   91|      0|        let response = UnlinkResponse {
   92|      0|            action: "deleted".to_string(),
   93|      0|            from_name: entity_name.to_string(),
   94|      0|            to_name: "*".to_string(),
   95|      0|            relation: "*".to_string(),
   96|      0|            relationships_removed: removed,
   97|      0|            namespace: namespace.clone(),
   98|      0|            elapsed_ms: inicio.elapsed().as_millis() as u64,
   99|      0|        };
  100|       |
  101|      0|        match args.format {
  102|      0|            OutputFormat::Json => output::emit_json(&response)?,
  103|      0|            OutputFormat::Text | OutputFormat::Markdown => {
  104|      0|                output::emit_text(&format!(
  105|      0|                    "deleted: {} --[*]--> * removed {} relationship(s) [{}]",
  106|      0|                    response.from_name, response.relationships_removed, response.namespace
  107|      0|                ));
  108|      0|            }
  109|       |        }
  110|      0|        return Ok(());
  111|      0|    }
  112|       |
  113|       |    // Mode: --from/--to (with optional --relation).
  114|      0|    let from_name = args.from.as_deref().ok_or_else(|| {
  115|      0|        AppError::Validation("--from is required when --entity/--all is not used".to_string())
  116|      0|    })?;
  117|      0|    let to_name = args.to.as_deref().ok_or_else(|| {
  118|      0|        AppError::Validation("--to is required when --entity/--all is not used".to_string())
  119|      0|    })?;
  120|       |
  121|      0|    let source_id = entities::find_entity_id(&conn, &namespace, from_name)?
  122|      0|        .ok_or_else(|| AppError::NotFound(errors_msg::entity_not_found(from_name, &namespace)))?;
  123|      0|    let target_id = entities::find_entity_id(&conn, &namespace, to_name)?
  124|      0|        .ok_or_else(|| AppError::NotFound(errors_msg::entity_not_found(to_name, &namespace)))?;
  125|       |
  126|      0|    let (removed, relation_display) = if let Some(rel) = args.relation.as_deref() {
  127|       |        // Single-relation mode: exact match required.
  128|      0|        let row =
  129|      0|            entities::find_relationship(&conn, source_id, target_id, rel)?.ok_or_else(|| {
  130|      0|                AppError::NotFound(errors_msg::relationship_not_found(
  131|      0|                    from_name, rel, to_name, &namespace,
  132|      0|                ))
  133|      0|            })?;
  134|       |
  135|      0|        let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
  136|      0|        entities::delete_relationship_by_id(&tx, row.id)?;
  137|      0|        entities::recalculate_degree(&tx, source_id)?;
  138|      0|        entities::recalculate_degree(&tx, target_id)?;
  139|      0|        tx.commit()?;
  140|       |
  141|      0|        (1u64, rel.to_string())
  142|       |    } else {
  143|       |        // Bulk mode: delete all relationships between from and to.
  144|      0|        let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
  145|      0|        let count = delete_relationships_between(&tx, source_id, target_id)?;
  146|      0|        entities::recalculate_degree(&tx, source_id)?;
  147|      0|        entities::recalculate_degree(&tx, target_id)?;
  148|      0|        tx.commit()?;
  149|       |
  150|      0|        (count, "*".to_string())
  151|       |    };
  152|       |
  153|      0|    conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
  154|       |
  155|      0|    let response = UnlinkResponse {
  156|      0|        action: "deleted".to_string(),
  157|      0|        from_name: from_name.to_string(),
  158|      0|        to_name: to_name.to_string(),
  159|      0|        relation: relation_display.clone(),
  160|      0|        relationships_removed: removed,
  161|      0|        namespace: namespace.clone(),
  162|      0|        elapsed_ms: inicio.elapsed().as_millis() as u64,
  163|      0|    };
  164|       |
  165|      0|    match args.format {
  166|      0|        OutputFormat::Json => output::emit_json(&response)?,
  167|      0|        OutputFormat::Text | OutputFormat::Markdown => {
  168|      0|            output::emit_text(&format!(
  169|      0|                "deleted: {} --[{}]--> {} removed {} relationship(s) [{}]",
  170|      0|                response.from_name,
  171|      0|                response.relation,
  172|      0|                response.to_name,
  173|      0|                response.relationships_removed,
  174|      0|                response.namespace
  175|      0|            ));
  176|      0|        }
  177|       |    }
  178|       |
  179|      0|    Ok(())
  180|      0|}
  181|       |
  182|       |/// Deletes all relationships where `entity_id` is source or target.
  183|       |/// Returns the number of rows removed.
  184|      0|fn delete_all_entity_relationships(
  185|      0|    conn: &rusqlite::Connection,
  186|      0|    entity_id: i64,
  187|      0|) -> Result<u64, AppError> {
  188|       |    // Collect IDs first to clean up memory_relationships junction.
  189|      0|    let mut stmt =
  190|      0|        conn.prepare_cached("SELECT id FROM relationships WHERE source_id = ?1 OR target_id = ?1")?;
  191|      0|    let ids: Vec<i64> = stmt
  192|      0|        .query_map(rusqlite::params![entity_id], |r| r.get(0))?
  193|      0|        .collect::<rusqlite::Result<Vec<_>>>()?;
  194|       |
  195|      0|    let count = ids.len() as u64;
  196|      0|    for rel_id in ids {
  197|      0|        conn.execute(
  198|      0|            "DELETE FROM memory_relationships WHERE relationship_id = ?1",
  199|      0|            rusqlite::params![rel_id],
  200|      0|        )?;
  201|      0|        conn.execute(
  202|      0|            "DELETE FROM relationships WHERE id = ?1",
  203|      0|            rusqlite::params![rel_id],
  204|      0|        )?;
  205|       |    }
  206|      0|    Ok(count)
  207|      0|}
  208|       |
  209|       |/// Deletes all relationships between `source_id` and `target_id` (any relation type).
  210|       |/// Returns the number of rows removed.
  211|      0|fn delete_relationships_between(
  212|      0|    conn: &rusqlite::Connection,
  213|      0|    source_id: i64,
  214|      0|    target_id: i64,
  215|      0|) -> Result<u64, AppError> {
  216|      0|    let mut stmt = conn
  217|      0|        .prepare_cached("SELECT id FROM relationships WHERE source_id = ?1 AND target_id = ?2")?;
  218|      0|    let ids: Vec<i64> = stmt
  219|      0|        .query_map(rusqlite::params![source_id, target_id], |r| r.get(0))?
  220|      0|        .collect::<rusqlite::Result<Vec<_>>>()?;
  221|       |
  222|      0|    let count = ids.len() as u64;
  223|      0|    for rel_id in ids {
  224|      0|        conn.execute(
  225|      0|            "DELETE FROM memory_relationships WHERE relationship_id = ?1",
  226|      0|            rusqlite::params![rel_id],
  227|      0|        )?;
  228|      0|        conn.execute(
  229|      0|            "DELETE FROM relationships WHERE id = ?1",
  230|      0|            rusqlite::params![rel_id],
  231|      0|        )?;
  232|       |    }
  233|      0|    Ok(count)
  234|      0|}
  235|       |
  236|       |#[cfg(test)]
  237|       |mod tests {
  238|       |    use super::*;
  239|       |
  240|       |    #[test]
  241|      1|    fn unlink_response_serializes_all_fields() {
  242|      1|        let resp = UnlinkResponse {
  243|      1|            action: "deleted".to_string(),
  244|      1|            from_name: "entity-a".to_string(),
  245|      1|            to_name: "entity-b".to_string(),
  246|      1|            relation: "uses".to_string(),
  247|      1|            relationships_removed: 1,
  248|      1|            namespace: "global".to_string(),
  249|      1|            elapsed_ms: 5,
  250|      1|        };
  251|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  252|      1|        assert_eq!(json["action"], "deleted");
  253|      1|        assert_eq!(json["from_name"], "entity-a");
  254|      1|        assert_eq!(json["to_name"], "entity-b");
  255|      1|        assert_eq!(json["relation"], "uses");
  256|      1|        assert_eq!(json["relationships_removed"], 1u64);
  257|      1|        assert_eq!(json["namespace"], "global");
  258|      1|        assert_eq!(json["elapsed_ms"], 5u64);
  259|      1|    }
  260|       |
  261|       |    #[test]
  262|      1|    fn unlink_response_action_must_be_deleted() {
  263|      1|        let resp = UnlinkResponse {
  264|      1|            action: "deleted".to_string(),
  265|      1|            from_name: "a".to_string(),
  266|      1|            to_name: "b".to_string(),
  267|      1|            relation: "related".to_string(),
  268|      1|            relationships_removed: 1,
  269|      1|            namespace: "global".to_string(),
  270|      1|            elapsed_ms: 0,
  271|      1|        };
  272|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  273|      1|        assert_eq!(
  274|      1|            json["action"], "deleted",
  275|      0|            "unlink action must always be 'deleted'"
  276|       |        );
  277|      1|    }
  278|       |
  279|       |    #[test]
  280|      1|    fn unlink_response_bulk_uses_wildcard_relation() {
  281|      1|        let resp = UnlinkResponse {
  282|      1|            action: "deleted".to_string(),
  283|      1|            from_name: "origin".to_string(),
  284|      1|            to_name: "destination".to_string(),
  285|      1|            relation: "*".to_string(),
  286|      1|            relationships_removed: 3,
  287|      1|            namespace: "project".to_string(),
  288|      1|            elapsed_ms: 3,
  289|      1|        };
  290|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  291|      1|        assert_eq!(json["relation"], "*");
  292|      1|        assert_eq!(json["relationships_removed"], 3u64);
  293|      1|    }
  294|       |
  295|       |    #[test]
  296|      1|    fn unlink_response_entity_all_uses_wildcard_to() {
  297|      1|        let resp = UnlinkResponse {
  298|      1|            action: "deleted".to_string(),
  299|      1|            from_name: "oauth-flow".to_string(),
  300|      1|            to_name: "*".to_string(),
  301|      1|            relation: "*".to_string(),
  302|      1|            relationships_removed: 5,
  303|      1|            namespace: "global".to_string(),
  304|      1|            elapsed_ms: 2,
  305|      1|        };
  306|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  307|      1|        assert_eq!(json["to_name"], "*");
  308|      1|        assert_eq!(json["relation"], "*");
  309|      1|        assert_eq!(json["relationships_removed"], 5u64);
  310|      1|    }
  311|       |
  312|       |    #[test]
  313|      1|    fn unlink_response_relationships_removed_field_present() {
  314|      1|        let resp = UnlinkResponse {
  315|      1|            action: "deleted".to_string(),
  316|      1|            from_name: "a".to_string(),
  317|      1|            to_name: "b".to_string(),
  318|      1|            relation: "uses".to_string(),
  319|      1|            relationships_removed: 0,
  320|      1|            namespace: "global".to_string(),
  321|      1|            elapsed_ms: 0,
  322|      1|        };
  323|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  324|      1|        assert!(
  325|      1|            json.get("relationships_removed").is_some(),
  326|      0|            "relationships_removed field must be present"
  327|       |        );
  328|      1|    }
  329|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/vacuum.rs:
    1|       |//! Handler for the `vacuum` CLI subcommand.
    2|       |
    3|       |use crate::errors::AppError;
    4|       |use crate::output;
    5|       |use crate::output::JsonOutputFormat;
    6|       |use crate::paths::AppPaths;
    7|       |use crate::storage::connection::open_rw;
    8|       |use serde::Serialize;
    9|       |
   10|       |#[derive(clap::Args)]
   11|       |#[command(after_long_help = "EXAMPLES:\n  \
   12|       |    # Run VACUUM after WAL checkpoint (default)\n  \
   13|       |    sqlite-graphrag vacuum\n\n  \
   14|       |    # Vacuum a database at a custom path\n  \
   15|       |    sqlite-graphrag vacuum --db /path/to/graphrag.sqlite\n\n  \
   16|       |    # Vacuum via SQLITE_GRAPHRAG_DB_PATH env var\n  \
   17|       |    SQLITE_GRAPHRAG_DB_PATH=/data/graphrag.sqlite sqlite-graphrag vacuum\n\n\
   18|       |NOTE:\n  \
   19|       |    reclaimed_bytes may report 0 even after `purge` if removed memories did not\n  \
   20|       |    span entire SQLite pages (page size = 4 KB). Run `vacuum` regularly only on\n  \
   21|       |    large databases (> 10 MB) for measurable gains.")]
   22|       |pub struct VacuumArgs {
   23|       |    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
   24|       |    pub json: bool,
   25|       |    /// Run a WAL checkpoint before and after `VACUUM`.
   26|       |    #[arg(long, default_value_t = true)]
   27|       |    pub checkpoint: bool,
   28|       |    /// Output format.
   29|       |    #[arg(long, value_enum, default_value_t = JsonOutputFormat::Json)]
   30|       |    pub format: JsonOutputFormat,
   31|       |    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
   32|       |    pub db: Option<String>,
   33|       |}
   34|       |
/// JSON payload emitted on stdout by the `vacuum` subcommand.
#[derive(Serialize)]
struct VacuumResponse {
    /// Database file path, rendered through `display()`.
    db_path: String,
    /// File size sampled before `VACUUM`; 0 when the file metadata could not be read.
    size_before_bytes: u64,
    /// File size sampled after `VACUUM` (and after the connection was dropped);
    /// 0 when the file metadata could not be read.
    size_after_bytes: u64,
    /// Bytes reclaimed by VACUUM (size_before_bytes - size_after_bytes), saturating to zero.
    /// Derived field added in v1.0.34 so callers do not have to compute the delta themselves.
    reclaimed_bytes: u64,
    /// Outcome marker; the handler in this module always sets it to `"ok"`.
    status: String,
    /// Total execution time in milliseconds from handler start to serialisation.
    elapsed_ms: u64,
}
   47|       |
   48|      0|pub fn run(args: VacuumArgs) -> Result<(), AppError> {
   49|      0|    let start = std::time::Instant::now();
   50|      0|    let _ = args.format;
   51|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
   52|       |
   53|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
   54|       |
   55|      0|    let size_before_bytes = std::fs::metadata(&paths.db)
   56|      0|        .map(|meta| meta.len())
   57|      0|        .unwrap_or(0);
   58|      0|    let conn = open_rw(&paths.db)?;
   59|      0|    if args.checkpoint {
   60|      0|        conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
   61|      0|    }
   62|      0|    conn.execute_batch("VACUUM;")?;
   63|      0|    if args.checkpoint {
   64|      0|        conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
   65|      0|    }
   66|      0|    drop(conn);
   67|      0|    let size_after_bytes = std::fs::metadata(&paths.db)
   68|      0|        .map(|meta| meta.len())
   69|      0|        .unwrap_or(0);
   70|       |
   71|      0|    output::emit_json(&VacuumResponse {
   72|      0|        db_path: paths.db.display().to_string(),
   73|      0|        size_before_bytes,
   74|      0|        size_after_bytes,
   75|      0|        reclaimed_bytes: size_before_bytes.saturating_sub(size_after_bytes),
   76|      0|        status: "ok".to_string(),
   77|      0|        elapsed_ms: start.elapsed().as_millis() as u64,
   78|      0|    })?;
   79|       |
   80|      0|    Ok(())
   81|      0|}
   82|       |
   83|       |#[cfg(test)]
   84|       |mod tests {
   85|       |    use super::*;
   86|       |
   87|       |    #[test]
   88|      1|    fn vacuum_response_serializes_all_fields() {
   89|      1|        let resp = VacuumResponse {
   90|      1|            db_path: "/home/user/.local/share/sqlite-graphrag/db.sqlite".to_string(),
   91|      1|            size_before_bytes: 32768,
   92|      1|            size_after_bytes: 16384,
   93|      1|            reclaimed_bytes: 16384,
   94|      1|            status: "ok".to_string(),
   95|      1|            elapsed_ms: 55,
   96|      1|        };
   97|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
   98|      1|        assert_eq!(
   99|      1|            json["db_path"],
  100|       |            "/home/user/.local/share/sqlite-graphrag/db.sqlite"
  101|       |        );
  102|      1|        assert_eq!(json["size_before_bytes"], 32768u64);
  103|      1|        assert_eq!(json["size_after_bytes"], 16384u64);
  104|      1|        assert_eq!(json["reclaimed_bytes"], 16384u64);
  105|      1|        assert_eq!(json["status"], "ok");
  106|      1|        assert_eq!(json["elapsed_ms"], 55u64);
  107|      1|    }
  108|       |
  109|       |    #[test]
  110|      1|    fn vacuum_response_size_after_less_than_or_equal_to_before() {
  111|      1|        let resp = VacuumResponse {
  112|      1|            db_path: "/data/db.sqlite".to_string(),
  113|      1|            size_before_bytes: 65536,
  114|      1|            size_after_bytes: 32768,
  115|      1|            reclaimed_bytes: 32768,
  116|      1|            status: "ok".to_string(),
  117|      1|            elapsed_ms: 100,
  118|      1|        };
  119|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  120|      1|        let before = json["size_before_bytes"].as_u64().unwrap();
  121|      1|        let after = json["size_after_bytes"].as_u64().unwrap();
  122|      1|        let reclaimed = json["reclaimed_bytes"].as_u64().unwrap();
  123|      1|        assert!(
  124|      1|            after <= before,
  125|      0|            "size_after_bytes must be <= size_before_bytes after VACUUM"
  126|       |        );
  127|      1|        assert_eq!(
  128|       |            reclaimed,
  129|      1|            before - after,
  130|      0|            "reclaimed_bytes must equal size_before_bytes - size_after_bytes"
  131|       |        );
  132|      1|    }
  133|       |
  134|       |    #[test]
  135|      1|    fn vacuum_response_status_ok() {
  136|      1|        let resp = VacuumResponse {
  137|      1|            db_path: "/data/db.sqlite".to_string(),
  138|      1|            size_before_bytes: 0,
  139|      1|            size_after_bytes: 0,
  140|      1|            reclaimed_bytes: 0,
  141|      1|            status: "ok".to_string(),
  142|      1|            elapsed_ms: 0,
  143|      1|        };
  144|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  145|      1|        assert_eq!(json["status"], "ok");
  146|      1|    }
  147|       |
  148|       |    #[test]
  149|      1|    fn vacuum_response_elapsed_ms_present_and_non_negative() {
  150|      1|        let resp = VacuumResponse {
  151|      1|            db_path: "/data/db.sqlite".to_string(),
  152|      1|            size_before_bytes: 1024,
  153|      1|            size_after_bytes: 1024,
  154|      1|            reclaimed_bytes: 0,
  155|      1|            status: "ok".to_string(),
  156|      1|            elapsed_ms: 0,
  157|      1|        };
  158|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  159|      1|        assert!(
  160|      1|            json.get("elapsed_ms").is_some(),
  161|      0|            "elapsed_ms field must be present"
  162|       |        );
  163|      1|        assert!(
  164|      1|            json["elapsed_ms"].as_u64().is_some(),
  165|      0|            "elapsed_ms must be a non-negative integer"
  166|       |        );
  167|      1|    }
  168|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/vec.rs:
    1|       |//! Handler for the `vec` CLI subcommand family.
    2|       |//!
    3|       |//! Provides three maintenance operations for the `vec_memories` virtual
    4|       |//! table that backs the embedding KNN search:
    5|       |//!
    6|       |//! - `orphan-list`: lists `vec_memories` rows whose `memory_id` no longer
    7|       |//!   references a live (non-soft-deleted) memory.
    8|       |//! - `purge-orphan`: deletes those orphan rows in a single transaction.
    9|       |//! - `stats`: surfaces total rows, orphan count, and coverage percentage.
   10|       |//!
   11|       |//! G39 (v1.0.69): before v1.0.69, the only way to detect a vec-orphan was
   12|       |//! `health --json` which reported `vec_memories_orphaned > 0` with no
   13|       |//! remediation path. This module closes the loop.
   14|       |
   15|       |use crate::errors::AppError;
   16|       |use crate::output;
   17|       |use crate::paths::AppPaths;
   18|       |use crate::storage::connection::{open_ro, open_rw};
   19|       |use serde::Serialize;
   20|       |
/// Arguments for the `vec` subcommand family.
// Thin wrapper: the only field is the selected `VecSubcommand`, which `run`
// matches on to pick the handler. New notes use `//` so clap help is unchanged.
#[derive(clap::Args)]
#[command(
    about = "Vector index maintenance (orphan detection, purge, stats)",
    after_long_help = "EXAMPLES:\n  \
        # List orphan vec_memories rows whose memory_id is gone\n  \
        sqlite-graphrag vec orphan-list\n\n  \
        # Dry-run the purge (does not delete)\n  \
        sqlite-graphrag vec purge-orphan --dry-run\n\n  \
        # Actually purge orphans\n  \
        sqlite-graphrag vec purge-orphan --yes\n\n  \
        # Show stats for all vec0 tables\n  \
        sqlite-graphrag vec stats --json"
)]
pub struct VecArgs {
    // Which `vec` operation to execute.
    #[command(subcommand)]
    pub command: VecSubcommand,
}
   39|       |
/// Subcommands nested under `vec`.
// Each variant carries its own argument struct and is dispatched by `run`
// to the handler named in the comment next to it.
#[derive(clap::Subcommand)]
pub enum VecSubcommand {
    /// List orphan vec_memories rows.
    // Handled by `run_orphan_list`.
    OrphanList(VecOrphanListArgs),
    /// Delete orphan vec_memories rows. Requires `--yes` to confirm.
    // Handled by `run_purge_orphan`.
    PurgeOrphan(VecPurgeOrphanArgs),
    /// Show statistics for vec_memories, vec_entities, vec_chunks.
    // Handled by `run_stats`.
    Stats(VecStatsArgs),
}
   50|       |
/// Arguments for `vec orphan-list`.
#[derive(clap::Args)]
pub struct VecOrphanListArgs {
    /// No-op; JSON is always emitted on stdout.
    // Hidden from help output (`hide = true`); the handler never reads it.
    #[arg(long, hide = true)]
    pub json: bool,
    /// Path to the SQLite database file.
    // Falls back to the `SQLITE_GRAPHRAG_DB_PATH` environment variable; `None` is
    // passed through to `AppPaths::resolve`.
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,
}
   61|       |
/// Attribute-free variant of the `vec orphan-list` arguments (`json`, `db`).
// NOTE(review): no handler in the visible part of this module references this
// struct. Its fields carry no `#[arg(...)]` attributes, so clap would presumably
// treat them as positional arguments — confirm whether the type is still needed.
#[derive(clap::Args)]
pub struct VecOrphanListInner {
    pub json: bool,
    pub db: Option<String>,
}
   68|       |
/// Arguments for `vec purge-orphan`.
// Flag precedence in `run_purge_orphan`: `--dry-run` is checked first and
// returns without deleting; otherwise `--yes` must be present or the handler
// fails with a validation error.
#[derive(clap::Args)]
pub struct VecPurgeOrphanArgs {
    /// No-op; JSON is always emitted on stdout.
    #[arg(long, hide = true)]
    pub json: bool,
    /// Path to the SQLite database file.
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,
    /// Skip the interactive confirmation; required for automation.
    // No prompt is implemented in this module: without `--yes` (and without
    // `--dry-run`) the command refuses to delete and returns an error.
    #[arg(long, default_value_t = false)]
    pub yes: bool,
    /// Report what would be purged without writing.
    #[arg(long, default_value_t = false)]
    pub dry_run: bool,
}
   85|       |
/// Arguments for `vec stats`.
#[derive(clap::Args)]
pub struct VecStatsArgs {
    /// No-op; JSON is always emitted on stdout.
    // Hidden from help output (`hide = true`); the handler never reads it.
    #[arg(long, hide = true)]
    pub json: bool,
    /// Path to the SQLite database file.
    // Falls back to the `SQLITE_GRAPHRAG_DB_PATH` environment variable.
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,
}
   96|       |
/// One orphan row reported by `vec orphan-list`.
#[derive(Serialize)]
struct VecOrphanListItem {
    /// The orphan `memory_id` value stored in `vec_memories`.
    memory_id: i64,
    /// Hash of the float vector blob, for fingerprinting.
    /// Computed as the hex-encoded BLAKE3 digest of the raw `embedding` column bytes.
    vector_hash: String,
    /// When the orphan row was originally inserted.
    /// Read verbatim from the `created_at` column; the unit is not visible in this module.
    created_at: i64,
}
  106|       |
/// JSON payload emitted by `vec orphan-list`.
#[derive(Serialize)]
struct VecOrphanListResponse {
    /// Always `"orphan_list"` for responses built by `run_orphan_list`.
    action: String,
    /// Number of entries in `items` (0 when `vec_memories` does not exist).
    count: i64,
    /// Orphan rows, ordered by `memory_id`.
    items: Vec<VecOrphanListItem>,
    /// Milliseconds elapsed since the handler started.
    elapsed_ms: u64,
}
  114|       |
/// JSON payload emitted by `vec purge-orphan`.
#[derive(Serialize)]
struct VecPurgeOrphanResponse {
    /// `"purge_orphan"` when `vec_memories` is missing, `"purge_orphan_dry_run"`
    /// for dry runs, `"purged_orphan"` after an actual purge.
    action: String,
    /// Number of orphan rows removed from `vec_memories`; 0 on dry runs.
    deleted: i64,
    /// Number of orphan rows in `vec_entities` that were also removed (G39).
    deleted_entities: i64,
    /// Number of orphan rows in `vec_chunks` that were also removed (G39).
    deleted_chunks: i64,
    /// Whether this response describes a dry run (nothing written).
    dry_run: bool,
    /// Milliseconds elapsed since the handler started.
    elapsed_ms: u64,
}
  126|       |
/// JSON payload emitted by `vec stats`.
#[derive(Serialize)]
struct VecStatsResponse {
    /// Row count of `vec_memories`; 0 when the table is missing or the count fails.
    total_rows: i64,
    /// `vec_memories` rows without a matching `memories.id`.
    orphaned: i64,
    /// `(total_rows - orphaned) / total_rows * 100`; reported as 100.0 when `total_rows` is 0.
    coverage_percent: f64,
    /// Row count of `vec_entities`; omitted from the JSON when the table is
    /// missing or the count query fails.
    #[serde(skip_serializing_if = "Option::is_none")]
    vec_entities_rows: Option<i64>,
    /// Row count of `vec_chunks`; omitted from the JSON when the table is
    /// missing or the count query fails.
    #[serde(skip_serializing_if = "Option::is_none")]
    vec_chunks_rows: Option<i64>,
    /// Row count of `fts_memories`; 0 when the count query fails.
    fts_memories_rows: i64,
    /// Milliseconds elapsed since the handler started.
    elapsed_ms: u64,
}
  139|       |
  140|       |/// Dispatch entry point called from `main`.
  141|       |///
  142|       |/// # Errors
  143|       |/// Propagates any [`AppError`] raised by the underlying subcommand.
  144|      0|pub fn run(args: VecArgs) -> Result<(), AppError> {
  145|      0|    match args.command {
  146|      0|        VecSubcommand::OrphanList(a) => run_orphan_list(a),
  147|      0|        VecSubcommand::PurgeOrphan(a) => run_purge_orphan(a),
  148|      0|        VecSubcommand::Stats(a) => run_stats(a),
  149|       |    }
  150|      0|}
  151|       |
  152|      0|fn run_orphan_list(args: VecOrphanListArgs) -> Result<(), AppError> {
  153|      0|    let start = std::time::Instant::now();
  154|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
  155|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
  156|      0|    let conn = open_ro(&paths.db)?;
  157|       |
  158|       |    // FTS5-style table existence gate so the command is a no-op on
  159|       |    // databases that were created before vec_memories existed.
  160|      0|    let table_exists: bool = conn
  161|      0|        .query_row(
  162|      0|            "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='vec_memories'",
  163|      0|            [],
  164|      0|            |r| r.get::<_, i64>(0).map(|v| v > 0),
  165|       |        )
  166|      0|        .unwrap_or(false);
  167|      0|    if !table_exists {
  168|      0|        return output::emit_json(&VecOrphanListResponse {
  169|      0|            action: "orphan_list".to_string(),
  170|      0|            count: 0,
  171|      0|            items: Vec::new(),
  172|      0|            elapsed_ms: start.elapsed().as_millis() as u64,
  173|      0|        });
  174|      0|    }
  175|       |
  176|       |    // List vec_memories rows that have no corresponding live memory row.
  177|       |    // We use a hash of the float[] blob (BLAKE3) as a fingerprint so the
  178|       |    // operator can detect duplicate embeddings even after the parent
  179|       |    // memory has been re-embedded with new content.
  180|      0|    let mut stmt = conn.prepare(
  181|      0|        "SELECT v.memory_id, v.embedding, v.created_at
  182|      0|         FROM vec_memories v
  183|      0|         LEFT JOIN memories m ON m.id = v.memory_id
  184|      0|         WHERE m.id IS NULL
  185|      0|         ORDER BY v.memory_id",
  186|      0|    )?;
  187|      0|    let rows: Vec<VecOrphanListItem> = stmt
  188|      0|        .query_map([], |r| {
  189|      0|            let memory_id: i64 = r.get(0)?;
  190|      0|            let blob: Vec<u8> = r.get(1)?;
  191|      0|            let created_at: i64 = r.get(2)?;
  192|      0|            let vector_hash = blake3::hash(&blob).to_hex().to_string();
  193|      0|            Ok(VecOrphanListItem {
  194|      0|                memory_id,
  195|      0|                vector_hash,
  196|      0|                created_at,
  197|      0|            })
  198|      0|        })?
  199|      0|        .collect::<Result<Vec<_>, _>>()?;
  200|      0|    let count = rows.len() as i64;
  201|       |
  202|      0|    output::emit_json(&VecOrphanListResponse {
  203|      0|        action: "orphan_list".to_string(),
  204|      0|        count,
  205|      0|        items: rows,
  206|      0|        elapsed_ms: start.elapsed().as_millis() as u64,
  207|      0|    })?;
  208|      0|    Ok(())
  209|      0|}
  210|       |
  211|      0|fn run_purge_orphan(args: VecPurgeOrphanArgs) -> Result<(), AppError> {
  212|      0|    let start = std::time::Instant::now();
  213|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
  214|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
  215|      0|    let conn = open_rw(&paths.db)?;
  216|       |
  217|       |    // Count first so we can return a deterministic response even on dry-run.
  218|      0|    let table_exists: bool = conn
  219|      0|        .query_row(
  220|      0|            "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='vec_memories'",
  221|      0|            [],
  222|      0|            |r| r.get::<_, i64>(0).map(|v| v > 0),
  223|       |        )
  224|      0|        .unwrap_or(false);
  225|      0|    if !table_exists {
  226|      0|        return output::emit_json(&VecPurgeOrphanResponse {
  227|      0|            action: "purge_orphan".to_string(),
  228|      0|            deleted: 0,
  229|      0|            deleted_entities: 0,
  230|      0|            deleted_chunks: 0,
  231|      0|            dry_run: args.dry_run,
  232|      0|            elapsed_ms: start.elapsed().as_millis() as u64,
  233|      0|        });
  234|      0|    }
  235|       |
  236|      0|    let orphan_count: i64 = conn
  237|      0|        .query_row(
  238|      0|            "SELECT COUNT(*) FROM vec_memories v
  239|      0|             LEFT JOIN memories m ON m.id = v.memory_id
  240|      0|             WHERE m.id IS NULL",
  241|      0|            [],
  242|      0|            |r| r.get(0),
  243|       |        )
  244|      0|        .unwrap_or(0);
  245|       |
  246|       |    // G39: also count orphans in vec_entities and vec_chunks. These
  247|       |    // tables follow the same `memory_id` foreign key convention and
  248|       |    // accumulate orphans on the same paths as vec_memories.
  249|      0|    let orphan_entities_count: i64 = if vec_table_exists(&conn, "vec_entities") {
  250|      0|        conn.query_row(
  251|      0|            "SELECT COUNT(*) FROM vec_entities v
  252|      0|             LEFT JOIN memories m ON m.id = v.memory_id
  253|      0|             WHERE m.id IS NULL",
  254|      0|            [],
  255|      0|            |r| r.get(0),
  256|       |        )
  257|      0|        .unwrap_or(0)
  258|       |    } else {
  259|      0|        0
  260|       |    };
  261|      0|    let orphan_chunks_count: i64 = if vec_table_exists(&conn, "vec_chunks") {
  262|      0|        conn.query_row(
  263|      0|            "SELECT COUNT(*) FROM vec_chunks v
  264|      0|             LEFT JOIN memories m ON m.id = v.memory_id
  265|      0|             WHERE m.id IS NULL",
  266|      0|            [],
  267|      0|            |r| r.get(0),
  268|       |        )
  269|      0|        .unwrap_or(0)
  270|       |    } else {
  271|      0|        0
  272|       |    };
  273|       |
  274|      0|    if args.dry_run {
  275|      0|        tracing::info!(target: "vec", orphan_count, orphan_entities_count, orphan_chunks_count, "dry-run: would delete orphans");
  276|      0|        return output::emit_json(&VecPurgeOrphanResponse {
  277|      0|            action: "purge_orphan_dry_run".to_string(),
  278|      0|            deleted: 0,
  279|      0|            deleted_entities: 0,
  280|      0|            deleted_chunks: 0,
  281|      0|            dry_run: true,
  282|      0|            elapsed_ms: start.elapsed().as_millis() as u64,
  283|      0|        });
  284|      0|    }
  285|       |
  286|      0|    if !args.yes {
  287|      0|        return Err(AppError::Validation(format!(
  288|      0|            "refusing to delete {orphan_count} vec_memories + {orphan_entities_count} vec_entities + {orphan_chunks_count} vec_chunks orphan rows without --yes (use --dry-run to preview)"
  289|      0|        )));
  290|      0|    }
  291|       |
  292|      0|    let deleted: i64 = conn.execute(
  293|      0|        "DELETE FROM vec_memories
  294|      0|         WHERE memory_id NOT IN (SELECT id FROM memories)",
  295|      0|        [],
  296|      0|    )? as i64;
  297|       |
  298|      0|    let deleted_entities: i64 = if vec_table_exists(&conn, "vec_entities") {
  299|      0|        conn.execute(
  300|      0|            "DELETE FROM vec_entities
  301|      0|             WHERE memory_id NOT IN (SELECT id FROM memories)",
  302|      0|            [],
  303|      0|        )
  304|      0|        .unwrap_or(0) as i64
  305|       |    } else {
  306|      0|        0
  307|       |    };
  308|      0|    let deleted_chunks: i64 = if vec_table_exists(&conn, "vec_chunks") {
  309|      0|        conn.execute(
  310|      0|            "DELETE FROM vec_chunks
  311|      0|             WHERE memory_id NOT IN (SELECT id FROM memories)",
  312|      0|            [],
  313|      0|        )
  314|      0|        .unwrap_or(0) as i64
  315|       |    } else {
  316|      0|        0
  317|       |    };
  318|       |
  319|      0|    tracing::info!(target: "vec", deleted, deleted_entities, deleted_chunks, "purged orphan vec rows");
  320|       |
  321|      0|    output::emit_json(&VecPurgeOrphanResponse {
  322|      0|        action: "purged_orphan".to_string(),
  323|      0|        deleted,
  324|      0|        deleted_entities,
  325|      0|        deleted_chunks,
  326|      0|        dry_run: false,
  327|      0|        elapsed_ms: start.elapsed().as_millis() as u64,
  328|      0|    })?;
  329|      0|    Ok(())
  330|      0|}
  331|       |
  332|      0|fn run_stats(args: VecStatsArgs) -> Result<(), AppError> {
  333|      0|    let start = std::time::Instant::now();
  334|      0|    let paths = AppPaths::resolve(args.db.as_deref())?;
  335|      0|    crate::storage::connection::ensure_db_ready(&paths)?;
  336|      0|    let conn = open_ro(&paths.db)?;
  337|       |
  338|      0|    let vec_memories_exists: bool = conn
  339|      0|        .query_row(
  340|      0|            "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='vec_memories'",
  341|      0|            [],
  342|      0|            |r| r.get::<_, i64>(0).map(|v| v > 0),
  343|       |        )
  344|      0|        .unwrap_or(false);
  345|      0|    let (total_rows, orphaned) = if vec_memories_exists {
  346|      0|        let total: i64 = conn
  347|      0|            .query_row("SELECT COUNT(*) FROM vec_memories", [], |r| r.get(0))
  348|      0|            .unwrap_or(0);
  349|      0|        let orph: i64 = conn
  350|      0|            .query_row(
  351|      0|                "SELECT COUNT(*) FROM vec_memories v
  352|      0|                 LEFT JOIN memories m ON m.id = v.memory_id
  353|      0|                 WHERE m.id IS NULL",
  354|      0|                [],
  355|      0|                |r| r.get(0),
  356|       |            )
  357|      0|            .unwrap_or(0);
  358|      0|        (total, orph)
  359|       |    } else {
  360|      0|        (0, 0)
  361|       |    };
  362|      0|    let coverage_percent = if total_rows > 0 {
  363|      0|        ((total_rows - orphaned) as f64 / total_rows as f64) * 100.0
  364|       |    } else {
  365|      0|        100.0
  366|       |    };
  367|       |
  368|      0|    let vec_entities_rows = if vec_table_exists(&conn, "vec_entities") {
  369|      0|        conn.query_row("SELECT COUNT(*) FROM vec_entities", [], |r| r.get(0))
  370|      0|            .ok()
  371|       |    } else {
  372|      0|        None
  373|       |    };
  374|      0|    let vec_chunks_rows = if vec_table_exists(&conn, "vec_chunks") {
  375|      0|        conn.query_row("SELECT COUNT(*) FROM vec_chunks", [], |r| r.get(0))
  376|      0|            .ok()
  377|       |    } else {
  378|      0|        None
  379|       |    };
  380|      0|    let fts_memories_rows = conn
  381|      0|        .query_row("SELECT COUNT(*) FROM fts_memories", [], |r| r.get(0))
  382|      0|        .unwrap_or(0);
  383|       |
  384|      0|    output::emit_json(&VecStatsResponse {
  385|      0|        total_rows,
  386|      0|        orphaned,
  387|      0|        coverage_percent,
  388|      0|        vec_entities_rows,
  389|      0|        vec_chunks_rows,
  390|      0|        fts_memories_rows,
  391|      0|        elapsed_ms: start.elapsed().as_millis() as u64,
  392|      0|    })?;
  393|      0|    Ok(())
  394|      0|}
  395|       |
  396|      0|fn vec_table_exists(conn: &rusqlite::Connection, name: &str) -> bool {
  397|      0|    conn.query_row(
  398|      0|        "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name=?1",
  399|      0|        rusqlite::params![name],
  400|      0|        |r| r.get::<_, i64>(0).map(|v| v > 0),
  401|       |    )
  402|      0|    .unwrap_or(false)
  403|      0|}
  404|       |
  405|       |#[cfg(test)]
  406|       |mod tests {
  407|       |    use super::*;
  408|       |
  409|       |    #[test]
  410|      1|    fn vec_orphan_list_response_serializes_all_fields() {
  411|      1|        let resp = VecOrphanListResponse {
  412|      1|            action: "orphan_list".into(),
  413|      1|            count: 0,
  414|      1|            items: Vec::new(),
  415|      1|            elapsed_ms: 5,
  416|      1|        };
  417|      1|        let v = serde_json::to_value(&resp).unwrap();
  418|      1|        assert_eq!(v["action"], "orphan_list");
  419|      1|        assert_eq!(v["count"], 0i64);
  420|      1|        assert_eq!(v["elapsed_ms"], 5u64);
  421|      1|        assert!(v["items"].is_array());
  422|      1|    }
  423|       |
  424|       |    #[test]
  425|      1|    fn vec_purge_orphan_response_serializes_dry_run_flag() {
  426|      1|        let resp = VecPurgeOrphanResponse {
  427|      1|            action: "purge_orphan_dry_run".into(),
  428|      1|            deleted: 0,
  429|      1|            deleted_entities: 0,
  430|      1|            deleted_chunks: 0,
  431|      1|            dry_run: true,
  432|      1|            elapsed_ms: 1,
  433|      1|        };
  434|      1|        let v = serde_json::to_value(&resp).unwrap();
  435|      1|        assert_eq!(v["dry_run"], true);
  436|      1|        assert_eq!(v["deleted"], 0i64);
  437|      1|    }
  438|       |
  439|       |    #[test]
  440|      1|    fn vec_stats_response_computes_coverage() {
  441|      1|        let resp = VecStatsResponse {
  442|      1|            total_rows: 100,
  443|      1|            orphaned: 25,
  444|      1|            coverage_percent: 75.0,
  445|      1|            vec_entities_rows: Some(50),
  446|      1|            vec_chunks_rows: None,
  447|      1|            fts_memories_rows: 100,
  448|      1|            elapsed_ms: 10,
  449|      1|        };
  450|      1|        let v = serde_json::to_value(&resp).unwrap();
  451|      1|        assert_eq!(v["coverage_percent"], 75.0);
  452|      1|        assert_eq!(v["vec_entities_rows"], 50i64);
  453|      1|        assert!(v.get("vec_chunks_rows").is_none());
  454|      1|    }
  455|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/constants.rs:
    1|       |//! Compile-time constants shared across the crate.
    2|       |//!
    3|       |//! Grouped into embedding configuration, length and size limits, SQLite
    4|       |//! pragmas and retrieval tuning knobs. Values are taken from the PRD and
    5|       |//! must stay in sync with the migrations under `migrations/`.
    6|       |//!
    7|       |//! ## Dynamic concurrency permit calculation
    8|       |//!
    9|       |//! The maximum number of simultaneous instances can be adjusted at runtime
   10|       |//! using the formula:
   11|       |//!
   12|       |//! ```text
   13|       |//! permits = min(cpus, available_memory_mb / EMBEDDING_LOAD_EXPECTED_RSS_MB) * 0.5
   14|       |//! ```
   15|       |//!
   16|       |//! where `available_memory_mb` is obtained via `sysinfo::System::available_memory()`
   17|       |//! converted to MiB. The result is capped at `MAX_CONCURRENT_CLI_INSTANCES`
   18|       |//! and floored at 1.
   19|       |
/// Embedding vector dimensionality produced by `multilingual-e5-small`.
pub const EMBEDDING_DIM: usize = 384;

/// Default `fastembed` model identifier used by `remember` and `recall`.
pub const FASTEMBED_MODEL_DEFAULT: &str = "multilingual-e5-small";

/// Batch size for `fastembed` encoding calls.
pub const FASTEMBED_BATCH_SIZE: usize = 32;

/// Maximum byte length for a memory `name` field in kebab-case.
pub const MAX_MEMORY_NAME_LEN: usize = 80;

/// Maximum byte length for an `ingest`-derived kebab-case name.
///
/// Stricter than `MAX_MEMORY_NAME_LEN` (80) to leave headroom for collision
/// suffixes (`-2`, `-10`, ...) when multiple files derive to the same base.
/// Used exclusively by `src/commands/ingest.rs`.
pub const DERIVED_NAME_MAX_LEN: usize = 60;

/// Maximum character length for a memory `description` field.
pub const MAX_MEMORY_DESCRIPTION_LEN: usize = 500;

/// Hard upper bound on memory `body` length in bytes.
///
/// The value is decimal (512 000 bytes), not 512 KiB (524 288 bytes).
pub const MAX_MEMORY_BODY_LEN: usize = 512_000;

/// Body character count above which the body is split into chunks.
pub const MAX_BODY_CHARS_BEFORE_CHUNK: usize = 8_000;

/// Maximum attempts when a statement returns `SQLITE_BUSY`.
pub const MAX_SQLITE_BUSY_RETRIES: u32 = 5;

/// Base delay in milliseconds for the first SQLITE_BUSY retry.
///
/// Each subsequent attempt doubles the delay (exponential backoff):
/// 300 ms → 600 ms → 1200 ms → 2400 ms → 4800 ms (≈ 9.3 s total).
///
/// The five delays listed match `MAX_SQLITE_BUSY_RETRIES` = 5; revisit the
/// series above if either constant changes.
pub const SQLITE_BUSY_BASE_DELAY_MS: u64 = 300;

/// Query timeout applied to statements in milliseconds.
///
/// 5 000 ms = 5 seconds.
pub const QUERY_TIMEOUT_MILLIS: u64 = 5_000;

/// Jaccard threshold above which two memories are considered fuzzy duplicates.
pub const DEDUP_FUZZY_THRESHOLD: f64 = 0.8;

/// Cosine distance threshold below which two memories are semantic duplicates.
pub const DEDUP_SEMANTIC_THRESHOLD: f32 = 0.1;

/// Maximum number of hops allowed in graph traversals.
pub const MAX_GRAPH_HOPS: u32 = 2;

/// Minimum relationship weight required for traversal inclusion.
pub const MIN_RELATION_WEIGHT: f64 = 0.3;
   71|       |
   72|       |/// Default traversal depth for `related` when `--hops` is omitted.
   73|       |pub const DEFAULT_MAX_HOPS: u32 = 2;
   74|       |
   75|       |/// Default minimum weight filter applied during graph traversal.
   76|       |pub const DEFAULT_MIN_WEIGHT: f64 = 0.3;
   77|       |
   78|       |/// Default weight assigned to newly created relationships.
   79|       |pub const DEFAULT_RELATION_WEIGHT: f64 = 0.5;
   80|       |
   81|       |/// Default `k` used by `recall` when the caller omits `--k`.
   82|       |pub const DEFAULT_K_RECALL: usize = 10;
   83|       |
   84|       |/// Default `k` for memory KNN searches when the caller omits `--k`.
   85|       |pub const K_MEMORIES_DEFAULT: usize = 10;
   86|       |
   87|       |/// Default `k` for entity KNN searches during graph expansion.
   88|       |pub const K_ENTITIES_SEARCH: usize = 5;
   89|       |
   90|       |/// Default upper bound on distinct entities persisted per memory.
   91|       |///
   92|       |/// Bumped from 30 → 50 in v1.0.43 to reduce semantic loss on rich documents.
   93|       |/// Configurable at runtime via `SQLITE_GRAPHRAG_MAX_ENTITIES_PER_MEMORY`.
   94|       |pub const MAX_ENTITIES_PER_MEMORY: usize = 50;
   95|       |
   96|       |/// Resolves the per-memory entity cap, honouring the env-var override.
   97|       |///
   98|       |/// v1.0.43: makes the cap (default 50) configurable via `SQLITE_GRAPHRAG_MAX_ENTITIES_PER_MEMORY`.
   99|       |/// Stress tests showed inputs with 33-46 candidates being truncated at the old cap of 30.
  100|       |/// Values outside [1, 1000] fall back to the default.
  101|      0|pub fn max_entities_per_memory() -> usize {
  102|      0|    std::env::var("SQLITE_GRAPHRAG_MAX_ENTITIES_PER_MEMORY")
  103|      0|        .ok()
  104|      0|        .and_then(|v| v.parse::<usize>().ok())
  105|      0|        .filter(|&n| (1..=1_000).contains(&n))
  106|      0|        .unwrap_or(MAX_ENTITIES_PER_MEMORY)
  107|      0|}
  108|       |
  109|       |/// Upper bound on distinct relationships persisted per memory.
  110|       |pub const MAX_RELATIONSHIPS_PER_MEMORY: usize = 50;
  111|       |
  112|       |/// Resolves the per-memory relationship cap, honouring the env-var override.
  113|       |///
  114|       |/// v1.0.22: makes the cap (default 50) configurable via `SQLITE_GRAPHRAG_MAX_RELATIONS_PER_MEMORY`.
  115|       |/// Audit found that rich documents silently hit the cap; users with dense technical corpora
  116|       |/// can raise it via env. Values outside [1, 10000] fall back to the default.
  117|      8|pub fn max_relationships_per_memory() -> usize {
  118|      8|    std::env::var("SQLITE_GRAPHRAG_MAX_RELATIONS_PER_MEMORY")
  119|      8|        .ok()
  120|      8|        .and_then(|v| v.parse::<usize>().ok())
                                    ^0                 ^0
  121|      8|        .filter(|&n| (1..=10_000).contains(&n))
                                   ^0           ^0       ^0
  122|      8|        .unwrap_or(MAX_RELATIONSHIPS_PER_MEMORY)
  123|      8|}
  124|       |
  125|       |/// Character length of the description preview shown in `list` output.
  126|       |pub const TEXT_DESCRIPTION_PREVIEW_LEN: usize = 100;
  127|       |
  128|       |/// `PRAGMA busy_timeout` value applied on every connection.
  129|       |pub const BUSY_TIMEOUT_MILLIS: i32 = 5_000;
  130|       |
  131|       |/// `PRAGMA cache_size` value in kibibytes (negative means KiB).
  132|       |pub const CACHE_SIZE_KB: i32 = -64_000;
  133|       |
  134|       |/// `PRAGMA mmap_size` value in bytes applied to each connection.
  135|       |pub const MMAP_SIZE_BYTES: i64 = 268_435_456;
  136|       |
  137|       |/// `PRAGMA wal_autocheckpoint` threshold in pages.
  138|       |pub const WAL_AUTOCHECKPOINT_PAGES: i32 = 1_000;
  139|       |
  140|       |/// Default `k` constant used by Reciprocal Rank Fusion in `hybrid-search`.
  141|       |pub const RRF_K_DEFAULT: u32 = 60;
  142|       |
  143|       |/// Chunk size expressed in tokens for body splitting.
  144|       |pub const CHUNK_SIZE_TOKENS: usize = 400;
  145|       |
  146|       |/// Token overlap between consecutive chunks.
  147|       |pub const CHUNK_OVERLAP_TOKENS: usize = 50;
  148|       |
  149|       |/// Explicit operational guard for multi-chunk documents in `remember`.
  150|       |///
  151|       |/// The multi-chunk path uses serial embeddings to avoid ONNX memory amplification.
  152|       |/// This limit preserves a clear operational ceiling for agents and scripts.
  153|       |pub const REMEMBER_MAX_SAFE_MULTI_CHUNKS: usize = 512;
  154|       |
  155|       |/// Ceiling on chunks per controlled micro-batch in `remember`.
  156|       |///
  157|       |/// The `fastembed` runtime uses `BatchLongest` padding, so oversized batches amplify
  158|       |/// the cost of the longest chunk. This ceiling keeps batches small even when chunks are short.
  159|       |pub const REMEMBER_MAX_CONTROLLED_BATCH_CHUNKS: usize = 4;
  160|       |
  161|       |/// Maximum padded-token budget per controlled micro-batch in `remember`.
  162|       |///
  163|       |/// The budget uses `max_tokens_no_batch * batch_size`, approximating the real cost of
  164|       |/// `BatchLongest` padding. Values exceeding this fall back to smaller batches or serialisation.
  165|       |pub const REMEMBER_MAX_CONTROLLED_BATCH_PADDED_TOKENS: usize = 512;
  166|       |
  167|       |/// Timeout in milliseconds for a single ping probe against the daemon socket.
  168|       |pub const DAEMON_PING_TIMEOUT_MS: u64 = 10;
  169|       |
  170|       |/// Idle duration in seconds before the daemon shuts itself down.
  171|       |pub const DAEMON_IDLE_SHUTDOWN_SECS: u64 = 600;
  172|       |
  173|       |/// Maximum wait time for the daemon to become healthy after auto-start.
  174|       |pub const DAEMON_AUTO_START_MAX_WAIT_MS: u64 = 5_000;
  175|       |
  176|       |/// Maximum wait time (ms) for a stale daemon to exit after a version-mismatch shutdown.
  177|       |pub const DAEMON_VERSION_RESTART_WAIT_MS: u64 = 5_000;
  178|       |
  179|       |/// Initial polling interval to check whether the daemon became healthy.
  180|       |pub const DAEMON_AUTO_START_INITIAL_BACKOFF_MS: u64 = 50;
  181|       |
  182|       |/// Ceiling on backoff between automatic daemon spawn attempts.
  183|       |pub const DAEMON_AUTO_START_MAX_BACKOFF_MS: u64 = 30_000;
  184|       |
  185|       |/// Base backoff used after daemon spawn/health failures.
  186|       |pub const DAEMON_SPAWN_BACKOFF_BASE_MS: u64 = 500;
  187|       |
  188|       |/// Maximum wait time to acquire the daemon spawn lock.
  189|       |pub const DAEMON_SPAWN_LOCK_WAIT_MS: u64 = 2_000;
  190|       |
  191|       |/// Prefix prepended to bodies before embedding as required by E5 models.
  192|       |pub const PASSAGE_PREFIX: &str = "passage: ";
  193|       |
  194|       |/// Prefix prepended to queries before embedding as required by E5 models.
  195|       |pub const QUERY_PREFIX: &str = "query: ";
  196|       |
  197|       |/// Crate version string sourced from `CARGO_PKG_VERSION` at build time.
  198|       |pub const SQLITE_GRAPHRAG_VERSION: &str = env!("CARGO_PKG_VERSION");
  199|       |
  200|       |/// Batch size for GLiNER NER forward passes.
  201|       |///
  202|       |/// Larger values amortise fixed forward-pass overhead but increase peak RAM.
  203|       |/// Memory guide (CPU only, max 512-token windows):
  204|       |///   N=4  → ~54 MiB peak
  205|       |///   N=8  → ~108 MiB peak  ← default
  206|       |///   N=16 → ~216 MiB peak
  207|       |///   N=32 → ~432 MiB peak  (not recommended without 16+ GiB RAM)
  208|       |///
  209|       |/// Override via `GRAPHRAG_NER_BATCH_SIZE` env var. Values outside [1, 32] are
  210|       |/// clamped silently.
  211|      0|pub fn ner_batch_size() -> usize {
  212|      0|    std::env::var("GRAPHRAG_NER_BATCH_SIZE")
  213|      0|        .ok()
  214|      0|        .and_then(|v| v.parse::<usize>().ok())
  215|      0|        .unwrap_or(8)
  216|      0|        .clamp(1, 32)
  217|      0|}
  218|       |
  219|       |/// Default cap on tokens fed to GLiNER NER per memory body.
  220|       |///
  221|       |/// v1.0.31: large markdown documents (>50 KB) tokenise into thousands of
  222|       |/// 512-token windows, each requiring a CPU forward pass that takes hundreds
  223|       |/// of milliseconds. A 68 KB document was observed taking 5+ minutes.
  224|       |/// Truncating the input before sliding-window construction caps the worst-case
  225|       |/// latency while preserving extraction quality for the leading body region.
  226|       |///
  227|       |/// Regex prefilter still runs on the full body, so URLs, emails, UUIDs,
  228|       |/// all-caps identifiers and CamelCase brand names are extracted regardless.
  229|       |pub const EXTRACTION_MAX_TOKENS_DEFAULT: usize = 5_000;
  230|       |
  231|       |/// Resolves the per-body NER token cap, honouring the env-var override.
  232|       |///
  233|       |/// Override via `SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS` env var. Values outside
  234|       |/// [512, 100_000] fall back to [`EXTRACTION_MAX_TOKENS_DEFAULT`].
  235|      4|pub fn extraction_max_tokens() -> usize {
  236|      4|    std::env::var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS")
  237|      4|        .ok()
  238|      4|        .and_then(|v| v.parse::<usize>().ok())
                                    ^3                 ^3
  239|      4|        .filter(|&n| (512..=100_000).contains(&n))
                                   ^3              ^3       ^3
  240|      4|        .unwrap_or(EXTRACTION_MAX_TOKENS_DEFAULT)
  241|      4|}
  242|       |
  243|       |/// GLiNER confidence threshold for span scoring.
  244|       |///
  245|       |/// Override via `SQLITE_GRAPHRAG_GLINER_THRESHOLD` env var. Values outside
  246|       |/// `[0.0, 1.0]` are ignored and the default `0.5` is used.
  247|      3|pub fn gliner_confidence_threshold() -> f32 {
  248|      3|    std::env::var("SQLITE_GRAPHRAG_GLINER_THRESHOLD")
  249|      3|        .ok()
  250|      3|        .and_then(|v| v.parse::<f32>().ok())
                                    ^0               ^0
  251|      3|        .filter(|&v| (0.0..=1.0).contains(&v))
                                   ^0          ^0       ^0
  252|      3|        .unwrap_or(0.5)
  253|      3|}
  254|       |
  255|       |/// HuggingFace repository for the GLiNER ONNX model.
  256|       |///
  257|       |/// Override via `SQLITE_GRAPHRAG_GLINER_MODEL` env var.
  258|      0|pub fn gliner_model_repo() -> String {
  259|      0|    std::env::var("SQLITE_GRAPHRAG_GLINER_MODEL")
  260|      0|        .unwrap_or_else(|_| "onnx-community/gliner_multi-v2.1".to_string())
  261|      0|}
  262|       |
  263|       |/// PRD-canonical regex that validates names and namespaces. Allows 1 char `[a-z0-9]`
  264|       |/// OR a 2-80 char string starting with a letter and ending with a letter/digit,
  265|       |/// containing only `[a-z0-9-]`. Rejects the `__` prefix (internal reserved).
  266|       |pub const NAME_SLUG_REGEX: &str = r"^[a-z][a-z0-9-]{0,78}[a-z0-9]$|^[a-z0-9]$";
  267|       |
  268|       |static NAME_SLUG_RE: std::sync::OnceLock<regex::Regex> = std::sync::OnceLock::new();
  269|       |
  270|       |/// Returns a reference to the compiled [`NAME_SLUG_REGEX`] pattern.
  271|       |/// Compiled once on first call, cached via `OnceLock`.
  272|      0|pub fn name_slug_regex() -> &'static regex::Regex {
  273|      0|    NAME_SLUG_RE.get_or_init(|| {
  274|      0|        regex::Regex::new(NAME_SLUG_REGEX).expect("NAME_SLUG_REGEX is a valid pattern")
  275|      0|    })
  276|      0|}
  277|       |
  278|       |/// Default retention period (days) used by `purge` when `--retention-days` is omitted.
  279|       |pub const PURGE_RETENTION_DAYS_DEFAULT: u32 = 90;
  280|       |
  281|       |/// Maximum number of simultaneously active namespaces (deleted_at IS NULL). Exit 5 when exceeded.
  282|       |pub const MAX_NAMESPACES_ACTIVE: u32 = 100;
  283|       |
  284|       |/// Maximum tokens accepted by an embedding input before chunking.
  285|       |pub const EMBEDDING_MAX_TOKENS: usize = 512;
  286|       |
  287|       |/// Maximum result count from the recursive graph CTE in `recall`.
  288|       |pub const K_GRAPH_MATCHES_LIMIT: usize = 20;
  289|       |
  290|       |/// Default `--limit` for `list` when omitted.
  291|       |pub const K_LIST_DEFAULT_LIMIT: usize = 100;
  292|       |
  293|       |/// Default `--limit` for `graph entities` when omitted.
  294|       |pub const K_GRAPH_ENTITIES_DEFAULT_LIMIT: usize = 50;
  295|       |
  296|       |/// Default `--limit` for `related` when omitted.
  297|       |pub const K_RELATED_DEFAULT_LIMIT: usize = 10;
  298|       |
  299|       |/// Default `--limit` for `history` when omitted.
  300|       |pub const K_HISTORY_DEFAULT_LIMIT: usize = 20;
  301|       |
  302|       |/// Default weight for the vector contribution in the `hybrid-search` RRF formula.
  303|       |pub const WEIGHT_VEC_DEFAULT: f64 = 1.0;
  304|       |
  305|       |/// Default weight for the BM25 text contribution in the `hybrid-search` RRF formula.
  306|       |pub const WEIGHT_FTS_DEFAULT: f64 = 1.0;
  307|       |
  308|       |/// Character size of the body preview emitted in text/markdown formats.
  309|       |pub const TEXT_BODY_PREVIEW_LEN: usize = 200;
  310|       |
  311|       |/// Default value injected into ORT_NUM_THREADS when not set by the user.
  312|       |pub const ORT_NUM_THREADS_DEFAULT: &str = "1";
  313|       |
  314|       |/// Default value injected into ORT_INTRA_OP_NUM_THREADS when not set.
  315|       |pub const ORT_INTRA_OP_NUM_THREADS_DEFAULT: &str = "1";
  316|       |
  317|       |/// Default value injected into OMP_NUM_THREADS when not set by the user.
  318|       |pub const OMP_NUM_THREADS_DEFAULT: &str = "1";
  319|       |
  320|       |/// Exit code for partial batch failure (PRD line 1822). Conflicts with DbBusy in v1.x;
  321|       |/// in v2.0.0 DbBusy migrates to 15 and this code takes 13 per PRD.
  322|       |pub const BATCH_PARTIAL_FAILURE_EXIT_CODE: i32 = 13;
  323|       |
  324|       |/// Exit code for DbBusy in v2.0.0 (migrated from 13 to free 13 for batch failure).
  325|       |pub const DB_BUSY_EXIT_CODE: i32 = 15;
  326|       |
  327|       |/// Filename used for the advisory exclusive lock that prevents parallel invocations.
  328|       |pub const CLI_LOCK_FILE: &str = "cli.lock";
  329|       |
  330|       |/// Polling interval in milliseconds used by `--wait-lock` between `try_lock_exclusive` attempts.
  331|       |pub const CLI_LOCK_POLL_INTERVAL_MS: u64 = 500;
  332|       |
  333|       |/// Process exit code returned when the lock is busy and no wait was requested (EX_TEMPFAIL).
  334|       |pub const CLI_LOCK_EXIT_CODE: i32 = 75;
  335|       |
  336|       |/// Maximum number of CLI instances running simultaneously.
  337|       |///
  338|       |/// Aligned with `DAEMON_MAX_CONCURRENT_CLIENTS` from the PRD. Limits the counting
  339|       |/// semaphore in [`crate::lock`] to prevent memory overload when multiple parallel
  340|       |/// invocations attempt to load the ONNX model simultaneously.
  341|       |pub const MAX_CONCURRENT_CLI_INSTANCES: usize = 4;
  342|       |
  343|       |/// G28-B (v1.0.68): polling interval in milliseconds used by
  344|       |/// `acquire_job_singleton` between retry attempts when another invocation
  345|       |/// already holds the singleton for `(job_type, namespace)`.
  346|       |pub const JOB_SINGLETON_POLL_INTERVAL_MS: u64 = 1000;
  347|       |
  348|       |/// Minimum available memory in MiB required before starting model loading.
  349|       |///
  350|       |/// If `sysinfo::System::available_memory() / 1_048_576` falls below this value,
  351|       |/// the invocation is aborted with [`crate::errors::AppError::LowMemory`]
  352|       |/// (exit code [`LOW_MEMORY_EXIT_CODE`]).
  353|       |pub const MIN_AVAILABLE_MEMORY_MB: u64 = 2_048;
  354|       |
  355|       |/// Maximum process RSS in MiB before aborting embedding operations.
  356|       |/// Users can override via `--max-rss-mb`. Set to 8 GiB by default.
  357|       |pub const DEFAULT_MAX_RSS_MB: u64 = 8_192;
  358|       |
  359|       |/// Maximum time in seconds an instance waits to acquire a concurrency slot.
  360|       |///
  361|       |/// Passed as the default for `--max-wait-secs` in the CLI. After exhausting this limit,
  362|       |/// the invocation returns [`crate::errors::AppError::AllSlotsFull`] with exit code
  363|       |/// [`CLI_LOCK_EXIT_CODE`] (75).
  364|       |pub const CLI_LOCK_DEFAULT_WAIT_SECS: u64 = 300;
  365|       |
  366|       |/// Expected RSS in MiB for a single instance with the ONNX model loaded via fastembed.
  367|       |///
  368|       |/// Used in the formula `min(cpus, available_memory_mb / EMBEDDING_LOAD_EXPECTED_RSS_MB) * 0.5`
  369|       |/// to compute the dynamic permit count.
  370|       |///
  371|       |/// Value calibrated on 2026-04-23 with `/usr/bin/time -v` against `sqlite-graphrag v1.0.3`
  372|       |/// on the heavy commands `remember`, `recall`, and `hybrid-search`, all peaking near
  373|       |/// 1.03 GiB RSS per process. The constant below rounds up with a defensive margin.
  374|       |pub const EMBEDDING_LOAD_EXPECTED_RSS_MB: u64 = 1_100;
  375|       |
  376|       |/// Process exit code returned when available memory is below [`MIN_AVAILABLE_MEMORY_MB`].
  377|       |///
  378|       |/// Value `77` is `EX_NOPERM` in glibc sysexits, reused here to indicate
  379|       |/// "insufficient system resource to proceed".
  380|       |pub const LOW_MEMORY_EXIT_CODE: i32 = 77;
  381|       |
  382|       |/// Process exit code returned when a duplicate memory or entity is detected (exit 9).
  383|       |///
  384|       |/// Moved from `2` to `9` in v1.0.52 to free exit code `2` for future use and align
  385|       |/// with the PRD exit code contract. Shell callers and LLM agents must use `9` from
  386|       |/// this version onwards.
  387|       |pub const DUPLICATE_EXIT_CODE: i32 = 9;
  388|       |
  389|       |/// Canonical value of `PRAGMA user_version` written after migrations.
  390|       |///
  391|       |/// **Why 49 instead of `CURRENT_SCHEMA_VERSION` (9)?**
  392|       |/// `user_version` is a 32-bit integer that SQLite reserves for application use.
  393|       |/// We deliberately set it to a project-specific marker (49 = decimal) so external
  394|       |/// inspection tools (`sqlite3 db.sqlite "PRAGMA user_version"`, the `file` command,
  395|       |/// SQLite browser GUIs) can distinguish a sqlite-graphrag database from a generic
  396|       |/// SQLite file at a glance. The application-level schema version (9, matching
  397|       |/// `CURRENT_SCHEMA_VERSION`) is stored in the `schema_meta` table and exposed via
  398|       |/// `health --json`/`stats --json`. Bumping migrations does NOT change this constant.
  399|       |/// Refinery uses its own `refinery_schema_history` table for migration bookkeeping.
  400|       |pub const SCHEMA_USER_VERSION: i64 = 49;
  401|       |
  402|       |/// Current schema version, equal to the highest migration number in `migrations/Vnnn__*.sql`.
  403|       |///
  404|       |/// Added in v1.0.27 as a runtime and test sanity check.
  405|       |/// Must be bumped in sync with new Refinery migrations; the unit test
  406|       |/// `schema_version_matches_migrations_count` validates this automatically.
  407|       |pub const CURRENT_SCHEMA_VERSION: u32 = 12;
  408|       |
  409|       |#[cfg(test)]
  410|       |mod tests_schema_version {
  411|       |    use super::CURRENT_SCHEMA_VERSION;
  412|       |
  413|       |    #[test]
  414|      1|    fn schema_version_matches_migrations_count() {
  415|      1|        let manifest_dir = env!("CARGO_MANIFEST_DIR");
  416|      1|        let migrations_dir = std::path::Path::new(manifest_dir).join("migrations");
  417|      1|        let count = std::fs::read_dir(&migrations_dir)
  418|      1|            .expect("migrations directory must exist")
  419|     12|            .filter_map(|entry| entry.ok())
                           ^1
  420|     12|            .filter(|entry| entry.file_name().to_string_lossy().starts_with('V'))
                           ^1
  421|      1|            .count() as u32;
  422|      1|        assert_eq!(
  423|       |            CURRENT_SCHEMA_VERSION, count,
  424|      0|            "CURRENT_SCHEMA_VERSION ({CURRENT_SCHEMA_VERSION}) must equal the number of V*.sql migrations ({count})"
  425|       |        );
  426|      1|    }
  427|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/daemon.rs:
    1|       |//! IPC daemon: keeps the embedding model warm across CLI invocations.
    2|       |//!
    3|       |//! Manages the background process lifecycle, Unix-socket IPC protocol, and
    4|       |//! auto-start/backoff logic so embeddings are served without cold-start cost.
    5|       |
    6|       |use crate::constants::{
    7|       |    DAEMON_AUTO_START_INITIAL_BACKOFF_MS, DAEMON_AUTO_START_MAX_BACKOFF_MS,
    8|       |    DAEMON_AUTO_START_MAX_WAIT_MS, DAEMON_IDLE_SHUTDOWN_SECS, DAEMON_PING_TIMEOUT_MS,
    9|       |    DAEMON_SPAWN_BACKOFF_BASE_MS, DAEMON_SPAWN_LOCK_WAIT_MS, DAEMON_VERSION_RESTART_WAIT_MS,
   10|       |    SQLITE_GRAPHRAG_VERSION,
   11|       |};
   12|       |use crate::errors::AppError;
   13|       |use crate::{embedder, shutdown_requested};
   14|       |use fs4::fs_std::FileExt;
   15|       |use interprocess::local_socket::{
   16|       |    prelude::LocalSocketStream,
   17|       |    traits::{Listener as _, Stream as _},
   18|       |    GenericFilePath, GenericNamespaced, ListenerNonblockingMode, ListenerOptions, ToFsName,
   19|       |    ToNsName,
   20|       |};
   21|       |use serde::{Deserialize, Serialize};
   22|       |use std::fs::{File, OpenOptions};
   23|       |use std::io::{BufRead, BufReader, Write};
   24|       |use std::path::{Path, PathBuf};
   25|       |use std::process::Stdio;
   26|       |use std::sync::atomic::{AtomicU64, AtomicU8, Ordering};
   27|       |use std::sync::Arc;
   28|       |use std::thread;
   29|       |use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
   30|       |
   31|       |const VERSION_NOT_CHECKED: u8 = 0;
   32|       |const VERSION_COMPATIBLE: u8 = 1;
   33|       |const VERSION_RESTART_ATTEMPTED: u8 = 2;
   34|       |
   35|       |/// Guards against restart loops: tracks version check state per process lifetime.
   36|       |static DAEMON_VERSION_STATE: AtomicU8 = AtomicU8::new(VERSION_NOT_CHECKED);
   37|       |
   38|       |#[derive(Debug, Serialize, Deserialize)]
   39|       |#[serde(tag = "request", rename_all = "snake_case")]
   40|       |pub enum DaemonRequest {
   41|       |    Ping,
   42|       |    Shutdown,
   43|       |    EmbedPassage {
   44|       |        text: String,
   45|       |    },
   46|       |    EmbedQuery {
   47|       |        text: String,
   48|       |    },
   49|       |    EmbedPassages {
   50|       |        texts: Vec<String>,
   51|       |        token_counts: Vec<usize>,
   52|       |    },
   53|       |}
   54|       |
   55|       |#[derive(Debug, Serialize, Deserialize)]
   56|       |#[serde(tag = "status", rename_all = "snake_case")]
   57|       |pub enum DaemonResponse {
   58|       |    Listening {
   59|       |        pid: u32,
   60|       |        socket: String,
   61|       |        idle_shutdown_secs: u64,
   62|       |    },
   63|       |    Ok {
   64|       |        pid: u32,
   65|       |        version: String,
   66|       |        handled_embed_requests: u64,
   67|       |        model_name: String,
   68|       |        model_variant: String,
   69|       |    },
   70|       |    PassageEmbedding {
   71|       |        embedding: Vec<f32>,
   72|       |        handled_embed_requests: u64,
   73|       |    },
   74|       |    QueryEmbedding {
   75|       |        embedding: Vec<f32>,
   76|       |        handled_embed_requests: u64,
   77|       |    },
   78|       |    PassageEmbeddings {
   79|       |        embeddings: Vec<Vec<f32>>,
   80|       |        handled_embed_requests: u64,
   81|       |    },
   82|       |    ShuttingDown {
   83|       |        handled_embed_requests: u64,
   84|       |    },
   85|       |    Error {
   86|       |        message: String,
   87|       |    },
   88|       |}
   89|       |
   90|       |#[derive(Debug, Default, Serialize, Deserialize)]
   91|       |struct DaemonSpawnState {
   92|       |    consecutive_failures: u32,
   93|       |    not_before_epoch_ms: u64,
   94|       |    last_error: Option<String>,
   95|       |}
   96|       |
   97|      1|pub fn daemon_label(models_dir: &Path) -> String {
   98|      1|    let hash = blake3::hash(models_dir.to_string_lossy().as_bytes())
   99|      1|        .to_hex()
  100|      1|        .to_string();
  101|      1|    format!("sqlite-graphrag-daemon-{}", &hash[..16])
  102|      1|}
  103|       |
  104|      1|pub fn try_ping(models_dir: &Path) -> Result<Option<DaemonResponse>, AppError> {
  105|      1|    request_if_available(models_dir, &DaemonRequest::Ping)
  106|      1|}
  107|       |
  108|      0|pub fn try_shutdown(models_dir: &Path) -> Result<Option<DaemonResponse>, AppError> {
  109|      0|    request_if_available(models_dir, &DaemonRequest::Shutdown)
  110|      0|}
  111|       |
  112|      0|pub fn embed_passage_or_local(models_dir: &Path, text: &str) -> Result<Vec<f32>, AppError> {
  113|      0|    match request_or_autostart(
  114|      0|        models_dir,
  115|      0|        &DaemonRequest::EmbedPassage {
  116|      0|            text: text.to_string(),
  117|      0|        },
  118|       |        true,
  119|      0|    )? {
  120|      0|        Some(DaemonResponse::PassageEmbedding { embedding, .. }) => Ok(embedding),
  121|      0|        Some(DaemonResponse::Error { message }) => Err(AppError::Embedding(message)),
  122|      0|        Some(other) => Err(AppError::Internal(anyhow::anyhow!(
  123|      0|            "unexpected daemon response for passage embedding: {other:?}"
  124|      0|        ))),
  125|       |        None => {
  126|      0|            let embedder = embedder::get_embedder(models_dir)?;
  127|      0|            embedder::embed_passage(embedder, text)
  128|       |        }
  129|       |    }
  130|      0|}
  131|       |
  132|      0|pub fn embed_query_or_local(
  133|      0|    models_dir: &Path,
  134|      0|    text: &str,
  135|      0|    cli_autostart: bool,
  136|      0|) -> Result<Vec<f32>, AppError> {
  137|      0|    match request_or_autostart(
  138|      0|        models_dir,
  139|      0|        &DaemonRequest::EmbedQuery {
  140|      0|            text: text.to_string(),
  141|      0|        },
  142|      0|        cli_autostart,
  143|      0|    )? {
  144|      0|        Some(DaemonResponse::QueryEmbedding { embedding, .. }) => Ok(embedding),
  145|      0|        Some(DaemonResponse::Error { message }) => Err(AppError::Embedding(message)),
  146|      0|        Some(other) => Err(AppError::Internal(anyhow::anyhow!(
  147|      0|            "unexpected daemon response for query embedding: {other:?}"
  148|      0|        ))),
  149|       |        None => {
  150|      0|            let embedder = embedder::get_embedder(models_dir)?;
  151|      0|            embedder::embed_query(embedder, text)
  152|       |        }
  153|       |    }
  154|      0|}
  155|       |
  156|      0|pub fn embed_passages_controlled_or_local(
  157|      0|    models_dir: &Path,
  158|      0|    texts: &[&str],
  159|      0|    token_counts: &[usize],
  160|      0|) -> Result<Vec<Vec<f32>>, AppError> {
  161|      0|    let request = DaemonRequest::EmbedPassages {
  162|      0|        texts: texts.iter().map(|t| (*t).to_string()).collect(),
  163|      0|        token_counts: token_counts.to_vec(),
  164|       |    };
  165|       |
  166|      0|    match request_or_autostart(models_dir, &request, true)? {
  167|      0|        Some(DaemonResponse::PassageEmbeddings { embeddings, .. }) => Ok(embeddings),
  168|      0|        Some(DaemonResponse::Error { message }) => Err(AppError::Embedding(message)),
  169|      0|        Some(other) => Err(AppError::Internal(anyhow::anyhow!(
  170|      0|            "unexpected daemon response for passage embedding batch: {other:?}"
  171|      0|        ))),
  172|       |        None => {
  173|      0|            let embedder = embedder::get_embedder(models_dir)?;
  174|      0|            embedder::embed_passages_controlled(embedder, texts, token_counts)
  175|       |        }
  176|       |    }
  177|      0|}
  178|       |
  179|       |struct DaemonSpawnGuard {
  180|       |    models_dir: PathBuf,
  181|       |}
  182|       |
  183|       |impl DaemonSpawnGuard {
  184|      0|    fn new(models_dir: &Path) -> Self {
  185|      0|        Self {
  186|      0|            models_dir: models_dir.to_path_buf(),
  187|      0|        }
  188|      0|    }
  189|       |}
  190|       |
  191|       |impl Drop for DaemonSpawnGuard {
  192|      0|    fn drop(&mut self) {
  193|      0|        let lock_path = spawn_lock_path(&self.models_dir);
  194|      0|        if lock_path.exists() {
  195|      0|            match std::fs::remove_file(&lock_path) {
  196|       |                Ok(()) => {
  197|      0|                    tracing::debug!(
  198|       |                        target: "daemon",
  199|      0|                        path = %lock_path.display(),
  200|      0|                        "spawn lock file removed during graceful daemon shutdown"
  201|       |                    );
  202|       |                }
  203|      0|                Err(err) if err.kind() == std::io::ErrorKind::NotFound => {}
  204|      0|                Err(err) => {
  205|      0|                    tracing::warn!(
  206|       |                        target: "daemon",
  207|       |                        error = %err,
  208|      0|                        path = %lock_path.display(),
  209|      0|                        "failed to remove spawn lock file while shutting down daemon"
  210|       |                    );
  211|       |                }
  212|       |            }
  213|      0|        }
  214|      0|        let pid_path = pid_file_path(&self.models_dir);
  215|      0|        let _ = std::fs::remove_file(&pid_path);
  216|       |
  217|      0|        tracing::info!(
  218|       |            target: "daemon",
  219|      0|            "daemon shut down gracefully; socket will be cleaned up by OS or by the next daemon via try_overwrite"
  220|       |        );
  221|      0|    }
  222|       |}
  223|       |
  224|      0|pub fn run(
  225|      0|    models_dir: &Path,
  226|      0|    idle_shutdown_secs: u64,
  227|      0|    shutdown_timeout_secs: u64,
  228|      0|) -> Result<(), AppError> {
  229|       |    // Scale worker threads to available parallelism so embedding tasks saturate CPU cores.
  230|       |    // Clamped to [2, 8] to avoid excessive threads on high-core machines.
  231|      0|    let permits = std::thread::available_parallelism()
  232|      0|        .map(|n| n.get())
  233|      0|        .unwrap_or(2)
  234|      0|        .clamp(2, 8);
  235|      0|    let rt = tokio::runtime::Builder::new_multi_thread()
  236|      0|        .worker_threads(permits)
  237|      0|        .thread_name("daemon-worker")
  238|      0|        .enable_all()
  239|      0|        .build()
  240|      0|        .map_err(AppError::Io)?;
  241|       |
  242|      0|    let result = rt.block_on(run_async(models_dir, idle_shutdown_secs, permits));
  243|      0|    rt.shutdown_timeout(std::time::Duration::from_secs(shutdown_timeout_secs));
  244|      0|    result
  245|      0|}
  246|       |
  247|       |#[tracing::instrument(skip_all, fields(idle_secs = idle_shutdown_secs, permits))]
  248|      0|async fn run_async(
  249|      0|    models_dir: &Path,
  250|      0|    idle_shutdown_secs: u64,
  251|      0|    permits: usize,
  252|      0|) -> Result<(), AppError> {
  253|       |    let socket = daemon_label(models_dir);
  254|       |    let name = to_local_socket_name(&socket)?;
  255|       |    let listener = ListenerOptions::new()
  256|       |        .name(name)
  257|       |        .nonblocking(ListenerNonblockingMode::Accept)
  258|       |        .try_overwrite(true)
  259|       |        .create_sync()
  260|       |        .map_err(AppError::Io)?;
  261|       |
  262|       |    // Guard that cleans up the spawn lock file on graceful shutdown.
  263|       |    // SIGKILL does not trigger Drop; in that case try_overwrite(true) above is the fallback.
  264|       |    let _spawn_guard = DaemonSpawnGuard::new(models_dir);
  265|       |
  266|       |    // Warm the model once per daemon process inside spawn_blocking so the
  267|       |    // ONNX session initialisation (CPU-bound, may take several seconds) does
  268|       |    // not block a tokio worker thread.
  269|       |    let models_dir_warm = models_dir.to_path_buf();
  270|      0|    tokio::task::spawn_blocking(move || embedder::get_embedder(&models_dir_warm).map(|_| ()))
  271|       |        .await
  272|      0|        .map_err(|e| AppError::Internal(anyhow::anyhow!("model warm-up panicked: {e}")))??;
  273|       |
  274|       |    let pid_path = pid_file_path(models_dir);
  275|       |    let _ = std::fs::write(&pid_path, std::process::id().to_string());
  276|       |
  277|       |    crate::output::emit_json(&DaemonResponse::Listening {
  278|       |        pid: std::process::id(),
  279|       |        socket,
  280|       |        idle_shutdown_secs,
  281|       |    })?;
  282|       |
  283|       |    let handled_embed_requests = Arc::new(AtomicU64::new(0));
  284|       |    let mut last_activity = Instant::now();
  285|       |    let models_dir = models_dir.to_path_buf();
  286|       |    // Bound concurrent spawn_blocking tasks to the same thread count as the runtime.
  287|       |    let permit_pool = Arc::new(tokio::sync::Semaphore::new(permits));
  288|       |
  289|       |    let token = crate::cancel_token();
  290|       |    loop {
  291|       |        if shutdown_requested() || token.is_cancelled() {
  292|       |            break;
  293|       |        }
  294|       |
  295|       |        if !daemon_control_dir(&models_dir).exists() {
  296|       |            tracing::info!(target: "daemon", "daemon control directory disappeared; shutting down");
  297|       |            break;
  298|       |        }
  299|       |
  300|       |        match listener.accept() {
  301|       |            Ok(stream) => {
  302|       |                last_activity = Instant::now();
  303|       |                let models_dir_clone = models_dir.clone();
  304|       |                let counter = Arc::clone(&handled_embed_requests);
  305|       |                let permit =
  306|      0|                    permit_pool.clone().acquire_owned().await.map_err(|e| {
  307|      0|                        AppError::Internal(anyhow::anyhow!("semaphore closed: {e}"))
  308|      0|                    })?;
  309|      0|                let should_exit = tokio::task::spawn_blocking(move || {
  310|      0|                    let _permit = permit; // hold until end of scope
  311|      0|                    handle_client(stream, &models_dir_clone, &counter)
  312|      0|                })
  313|       |                .await
  314|      0|                .map_err(|e| {
  315|      0|                    AppError::Internal(anyhow::anyhow!("spawn_blocking panicked: {e}"))
  316|      0|                })??;
  317|       |
  318|       |                if should_exit {
  319|       |                    break;
  320|       |                }
  321|       |            }
  322|       |            Err(err) if err.kind() == std::io::ErrorKind::WouldBlock => {
  323|       |                if last_activity.elapsed() >= Duration::from_secs(idle_shutdown_secs) {
  324|       |                    tracing::info!(
  325|       |                        target: "daemon",
  326|       |                        idle_shutdown_secs,
  327|       |                        handled_embed_requests = handled_embed_requests.load(Ordering::Relaxed),
  328|       |                        "daemon idle timeout reached"
  329|       |                    );
  330|       |                    break;
  331|       |                }
  332|       |                tokio::select! {
  333|       |                    () = tokio::time::sleep(Duration::from_millis(50)) => {}
  334|       |                    () = token.cancelled() => { break; }
  335|       |                }
  336|       |            }
  337|       |            Err(err) => return Err(AppError::Io(err)),
  338|       |        }
  339|       |    }
  340|       |
  341|       |    Ok(())
  342|      0|}
  343|       |
  344|      0|fn handle_client(
  345|      0|    stream: LocalSocketStream,
  346|      0|    models_dir: &Path,
  347|      0|    handled_embed_requests: &AtomicU64,
  348|      0|) -> Result<bool, AppError> {
  349|      0|    let mut reader = BufReader::new(stream);
  350|      0|    let mut line = String::new();
  351|      0|    reader.read_line(&mut line).map_err(AppError::Io)?;
  352|       |
  353|      0|    if line.trim().is_empty() {
  354|      0|        write_response(
  355|      0|            reader.get_mut(),
  356|      0|            &DaemonResponse::Error {
  357|      0|                message: "empty request to daemon".to_string(),
  358|      0|            },
  359|      0|        )?;
  360|      0|        return Ok(false);
  361|      0|    }
  362|       |
  363|      0|    let request: DaemonRequest = serde_json::from_str(line.trim()).map_err(AppError::Json)?;
  364|      0|    let (response, should_exit) = match request {
  365|      0|        DaemonRequest::Ping => (
  366|      0|            DaemonResponse::Ok {
  367|      0|                pid: std::process::id(),
  368|      0|                version: SQLITE_GRAPHRAG_VERSION.to_string(),
  369|      0|                handled_embed_requests: handled_embed_requests.load(Ordering::Relaxed),
  370|      0|                model_name: crate::constants::FASTEMBED_MODEL_DEFAULT.to_string(),
  371|      0|                model_variant: gliner_variant_from_env(),
  372|      0|            },
  373|      0|            false,
  374|      0|        ),
  375|      0|        DaemonRequest::Shutdown => (
  376|      0|            DaemonResponse::ShuttingDown {
  377|      0|                handled_embed_requests: handled_embed_requests.load(Ordering::Relaxed),
  378|      0|            },
  379|      0|            true,
  380|      0|        ),
  381|      0|        DaemonRequest::EmbedPassage { text } => {
  382|      0|            let embedder = embedder::get_embedder(models_dir)?;
  383|      0|            let embedding = embedder::embed_passage(embedder, &text)?;
  384|      0|            let count = handled_embed_requests.fetch_add(1, Ordering::Relaxed) + 1;
  385|      0|            (
  386|      0|                DaemonResponse::PassageEmbedding {
  387|      0|                    embedding,
  388|      0|                    handled_embed_requests: count,
  389|      0|                },
  390|      0|                false,
  391|      0|            )
  392|       |        }
  393|      0|        DaemonRequest::EmbedQuery { text } => {
  394|      0|            let embedder = embedder::get_embedder(models_dir)?;
  395|      0|            let embedding = embedder::embed_query(embedder, &text)?;
  396|      0|            let count = handled_embed_requests.fetch_add(1, Ordering::Relaxed) + 1;
  397|      0|            (
  398|      0|                DaemonResponse::QueryEmbedding {
  399|      0|                    embedding,
  400|      0|                    handled_embed_requests: count,
  401|      0|                },
  402|      0|                false,
  403|      0|            )
  404|       |        }
  405|       |        DaemonRequest::EmbedPassages {
  406|      0|            texts,
  407|      0|            token_counts,
  408|       |        } => {
  409|      0|            let embedder = embedder::get_embedder(models_dir)?;
  410|      0|            let text_refs: Vec<&str> = texts.iter().map(String::as_str).collect();
  411|      0|            let embeddings =
  412|      0|                embedder::embed_passages_controlled(embedder, &text_refs, &token_counts)?;
  413|      0|            let count = handled_embed_requests.fetch_add(1, Ordering::Relaxed) + 1;
  414|      0|            (
  415|      0|                DaemonResponse::PassageEmbeddings {
  416|      0|                    embeddings,
  417|      0|                    handled_embed_requests: count,
  418|      0|                },
  419|      0|                false,
  420|      0|            )
  421|       |        }
  422|       |    };
  423|       |
  424|      0|    write_response(reader.get_mut(), &response)?;
  425|      0|    Ok(should_exit)
  426|      0|}
  427|       |
  428|      0|fn write_response(
  429|      0|    stream: &mut LocalSocketStream,
  430|      0|    response: &DaemonResponse,
  431|      0|) -> Result<(), AppError> {
  432|      0|    serde_json::to_writer(&mut *stream, response).map_err(AppError::Json)?;
  433|      0|    stream.write_all(b"\n").map_err(AppError::Io)?;
  434|      0|    stream.flush().map_err(AppError::Io)?;
  435|      0|    Ok(())
  436|      0|}
  437|       |
  438|      1|fn request_if_available(
  439|      1|    models_dir: &Path,
  440|      1|    request: &DaemonRequest,
  441|      1|) -> Result<Option<DaemonResponse>, AppError> {
  442|      1|    let socket = daemon_label(models_dir);
  443|      1|    let name = match to_local_socket_name(&socket) {
  444|      1|        Ok(name) => name,
  445|      0|        Err(err) => return Err(AppError::Io(err)),
  446|       |    };
  447|       |
  448|      1|    let mut stream = match LocalSocketStream::connect(name) {
                      ^0
  449|      0|        Ok(stream) => stream,
  450|      1|        Err(err)
  451|      0|            if matches!(
  452|      1|                err.kind(),
  453|       |                std::io::ErrorKind::NotFound
  454|       |                    | std::io::ErrorKind::ConnectionRefused
  455|       |                    | std::io::ErrorKind::AddrNotAvailable
  456|       |                    | std::io::ErrorKind::TimedOut
  457|       |            ) =>
  458|       |        {
  459|      1|            return Ok(None);
  460|       |        }
  461|      0|        Err(err) => return Err(AppError::Io(err)),
  462|       |    };
  463|       |
  464|      0|    serde_json::to_writer(&mut stream, request).map_err(AppError::Json)?;
  465|      0|    stream.write_all(b"\n").map_err(AppError::Io)?;
  466|      0|    stream.flush().map_err(AppError::Io)?;
  467|       |
  468|      0|    let mut reader = BufReader::new(stream);
  469|      0|    let mut line = String::new();
  470|      0|    reader.read_line(&mut line).map_err(AppError::Io)?;
  471|      0|    if line.trim().is_empty() {
  472|      0|        return Err(AppError::Embedding(
  473|      0|            "daemon returned an empty response".into(),
  474|      0|        ));
  475|      0|    }
  476|       |
  477|      0|    let response = serde_json::from_str(line.trim()).map_err(AppError::Json)?;
  478|      0|    Ok(Some(response))
  479|      1|}
  480|       |
  481|      0|fn should_autostart(cli_flag: bool) -> bool {
  482|      0|    if !cli_flag {
  483|      0|        return false; // explicit CLI override wins
  484|      0|    }
  485|      0|    !autostart_disabled_by_env()
  486|      0|}
  487|       |
  488|       |/// Checks whether a running daemon has a different version from the current CLI binary.
  489|       |/// If a mismatch is detected, shuts down the stale daemon, waits for it to exit, and
  490|       |/// re-spawns a fresh one. The `VERSION_RESTART_ATTEMPTED` state prevents infinite loops:
  491|       |/// this function is a no-op after the first attempt regardless of outcome.
  492|      0|fn maybe_restart_for_version_mismatch(models_dir: &Path) -> Result<(), AppError> {
  493|       |    // ORDERING: Acquire on success synchronizes-with the Release store at line ~505.
  494|       |    // Relaxed on failure: no dependent memory is read on the CAS failure path.
  495|      0|    if DAEMON_VERSION_STATE
  496|      0|        .compare_exchange(
  497|      0|            VERSION_NOT_CHECKED,
  498|      0|            VERSION_COMPATIBLE,
  499|      0|            Ordering::Acquire,
  500|      0|            Ordering::Relaxed,
  501|      0|        )
  502|      0|        .is_err()
  503|       |    {
  504|       |        // Already checked (compatible) or already attempted a restart — skip.
  505|      0|        return Ok(());
  506|      0|    }
  507|       |
  508|      0|    let response = match try_ping(models_dir)? {
  509|      0|        Some(r) => r,
  510|      0|        None => return Ok(()), // no daemon running, nothing to check
  511|       |    };
  512|       |
  513|      0|    let daemon_version = match &response {
  514|      0|        DaemonResponse::Ok { version, .. } => version.as_str(),
  515|      0|        _ => return Ok(()), // unexpected response shape, skip
  516|       |    };
  517|       |
  518|      0|    if daemon_version == SQLITE_GRAPHRAG_VERSION {
  519|      0|        return Ok(()); // versions match, state already set to COMPATIBLE
  520|      0|    }
  521|       |
  522|       |    // Mismatch detected — mark as restart-attempted so we never loop.
  523|       |    // ORDERING: Release pairs with the Acquire in compare_exchange and load.
  524|      0|    DAEMON_VERSION_STATE.store(VERSION_RESTART_ATTEMPTED, Ordering::Release);
  525|       |
  526|      0|    tracing::warn!(
  527|       |        target: "daemon",
  528|       |        daemon_version = %daemon_version,
  529|       |        cli_version = SQLITE_GRAPHRAG_VERSION,
  530|      0|        "daemon version mismatch detected; auto-restarting daemon"
  531|       |    );
  532|       |
  533|       |    // Send shutdown request.
  534|      0|    try_shutdown(models_dir)?;
  535|       |
  536|       |    // Wait for the stale daemon to exit.
  537|      0|    wait_for_daemon_exit(models_dir)?;
  538|       |
  539|       |    // Re-spawn the daemon via the existing mechanism.
  540|      0|    ensure_daemon_running(models_dir)?;
  541|       |
  542|      0|    Ok(())
  543|      0|}
  544|       |
  545|       |/// Polls until the daemon stops responding to pings, with exponential backoff.
  546|       |/// Starts at 50 ms, doubles each iteration, caps at 500 ms per sleep.
  547|       |/// Returns `Ok(())` once the daemon is gone or the timeout is reached.
  548|       |#[cold]
  549|       |#[inline(never)]
  550|      1|fn wait_for_daemon_exit(models_dir: &Path) -> Result<(), AppError> {
  551|      1|    let deadline = Instant::now() + Duration::from_millis(DAEMON_VERSION_RESTART_WAIT_MS);
  552|      1|    let mut sleep_ms: u64 = 50;
  553|       |
  554|      1|    while Instant::now() < deadline {
  555|      1|        if try_ping(models_dir)?.is_none() {
                                             ^0
  556|      1|            tracing::debug!(target: "daemon", "stale daemon exited after version-mismatch shutdown");
                                                            ^0
  557|      1|            return Ok(());
  558|      0|        }
  559|      0|        thread::sleep(Duration::from_millis(sleep_ms));
  560|      0|        sleep_ms = (sleep_ms * 2).min(500);
  561|       |    }
  562|       |
  563|      0|    tracing::warn!(
  564|       |        target: "daemon",
  565|       |        timeout_ms = DAEMON_VERSION_RESTART_WAIT_MS,
  566|      0|        "timed out waiting for stale daemon to exit after version-mismatch shutdown"
  567|       |    );
  568|      0|    Ok(())
  569|      1|}
  570|       |
  571|      0|fn request_or_autostart(
  572|      0|    models_dir: &Path,
  573|      0|    request: &DaemonRequest,
  574|      0|    cli_autostart: bool,
  575|      0|) -> Result<Option<DaemonResponse>, AppError> {
  576|       |    // ORDERING: Acquire pairs with the Release store in maybe_restart_for_version_mismatch.
  577|      0|    if DAEMON_VERSION_STATE.load(Ordering::Acquire) == VERSION_NOT_CHECKED {
  578|      0|        maybe_restart_for_version_mismatch(models_dir)?;
  579|      0|    }
  580|       |
  581|      0|    if let Some(response) = request_if_available(models_dir, request)? {
  582|      0|        clear_spawn_backoff_state(models_dir).ok();
  583|      0|        return Ok(Some(response));
  584|      0|    }
  585|       |
  586|      0|    if !should_autostart(cli_autostart) {
  587|      0|        return Ok(None);
  588|      0|    }
  589|       |
  590|      0|    if !ensure_daemon_running(models_dir)? {
  591|      0|        return Ok(None);
  592|      0|    }
  593|       |
  594|      0|    request_if_available(models_dir, request)
  595|      0|}
  596|       |
  597|      0|fn ensure_daemon_running(models_dir: &Path) -> Result<bool, AppError> {
  598|      0|    if (try_ping(models_dir)?).is_some() {
  599|      0|        clear_spawn_backoff_state(models_dir).ok();
  600|      0|        return Ok(true);
  601|      0|    }
  602|       |
  603|      0|    if spawn_backoff_active(models_dir)? {
  604|      0|        tracing::warn!(target: "daemon", "daemon autostart suppressed by backoff window");
  605|      0|        return Ok(false);
  606|      0|    }
  607|       |
  608|      0|    let spawn_lock = match try_acquire_spawn_lock(models_dir)? {
  609|      0|        Some(lock) => lock,
  610|      0|        None => return wait_for_daemon_ready(models_dir),
  611|       |    };
  612|       |
  613|      0|    if (try_ping(models_dir)?).is_some() {
  614|      0|        clear_spawn_backoff_state(models_dir).ok();
  615|      0|        drop(spawn_lock);
  616|      0|        return Ok(true);
  617|      0|    }
  618|       |
  619|      0|    let exe = match std::env::current_exe() {
  620|      0|        Ok(path) => path,
  621|      0|        Err(err) => {
  622|      0|            record_spawn_failure(models_dir, &format!("current_exe failed: {err}"))?;
  623|      0|            drop(spawn_lock);
  624|      0|            return Ok(false);
  625|       |        }
  626|       |    };
  627|       |
  628|      0|    let mut child = std::process::Command::new(exe);
  629|      0|    child
  630|      0|        .arg("daemon")
  631|      0|        .arg("--idle-shutdown-secs")
  632|      0|        .arg(DAEMON_IDLE_SHUTDOWN_SECS.to_string())
  633|      0|        .env("SQLITE_GRAPHRAG_DAEMON_CHILD", "1")
  634|      0|        .env_remove("LD_PRELOAD")
  635|      0|        .env_remove("LD_LIBRARY_PATH")
  636|      0|        .env_remove("LD_AUDIT")
  637|      0|        .env_remove("DYLD_INSERT_LIBRARIES")
  638|      0|        .env_remove("DYLD_LIBRARY_PATH")
  639|      0|        .stdin(Stdio::null())
  640|      0|        .stdout(Stdio::null())
  641|      0|        .stderr(Stdio::null());
  642|       |
  643|      0|    match crate::commands::claude_runner::spawn_with_memory_limit(&mut child) {
  644|      0|        Ok(child_handle) => {
  645|       |            // SAFETY: deliberate orphan daemon detach. The Child handle is intentionally
  646|       |            // dropped without a corresponding `.wait()` call because the daemon owns its
  647|       |            // own lifecycle: `Stdio::null()` is set on stdin/stdout/stderr (above) so the
  648|       |            // child does not inherit terminal handles, the spawn lock file at
  649|       |            // `<models_dir>/.daemon.spawn.lock` prevents concurrent spawns, and the
  650|       |            // daemon shuts itself down via `DAEMON_IDLE_SHUTDOWN_SECS` (or an explicit
  651|       |            // `daemon stop`/SIGTERM). Keeping the handle here would block the parent
  652|       |            // CLI in the foreground until the daemon exited, defeating the autostart
  653|       |            // contract that callers expect.
  654|       |            // See: docs_rules/rules_rust_processos_externos.md section "Child detach justificado"
  655|       |            //      AND docs/adr/0001-daemon-warmup-exception.md (authorized exception to no-daemon rule)
  656|      0|            let pid = child_handle.id();
  657|      0|            drop(child_handle);
  658|      0|            tracing::debug!(
  659|       |                target: "daemon",
  660|       |                pid,
  661|      0|                "daemon detached; lifecycle managed via spawn lock + readiness file"
  662|       |            );
  663|      0|            let ready = wait_for_daemon_ready(models_dir)?;
  664|      0|            if ready {
  665|      0|                clear_spawn_backoff_state(models_dir).ok();
  666|      0|            } else {
  667|      0|                record_spawn_failure(models_dir, "daemon did not become healthy after autostart")?;
  668|       |            }
  669|      0|            drop(spawn_lock);
  670|      0|            Ok(ready)
  671|       |        }
  672|      0|        Err(err) => {
  673|      0|            record_spawn_failure(models_dir, &format!("daemon spawn failed: {err}"))?;
  674|      0|            drop(spawn_lock);
  675|      0|            Ok(false)
  676|       |        }
  677|       |    }
  678|      0|}
  679|       |
  680|      0|fn wait_for_daemon_ready(models_dir: &Path) -> Result<bool, AppError> {
  681|      0|    let deadline = Instant::now() + Duration::from_millis(DAEMON_AUTO_START_MAX_WAIT_MS);
  682|      0|    let mut sleep_ms = DAEMON_AUTO_START_INITIAL_BACKOFF_MS.max(DAEMON_PING_TIMEOUT_MS);
  683|       |
  684|      0|    while Instant::now() < deadline {
  685|      0|        if (try_ping(models_dir)?).is_some() {
  686|      0|            return Ok(true);
  687|      0|        }
  688|      0|        thread::sleep(Duration::from_millis(sleep_ms));
  689|      0|        sleep_ms = (sleep_ms * 2).min(DAEMON_AUTO_START_MAX_BACKOFF_MS);
  690|       |    }
  691|       |
  692|      0|    Ok(false)
  693|      0|}
  694|       |
  695|      0|fn autostart_disabled_by_env() -> bool {
  696|      0|    std::env::var("SQLITE_GRAPHRAG_DAEMON_CHILD").as_deref() == Ok("1")
  697|      0|        || std::env::var("SQLITE_GRAPHRAG_DAEMON_FORCE_AUTOSTART").as_deref() != Ok("1")
  698|      0|            && std::env::var("SQLITE_GRAPHRAG_DAEMON_DISABLE_AUTOSTART").as_deref() == Ok("1")
  699|      0|}
  700|       |
  701|     31|fn daemon_control_dir(models_dir: &Path) -> PathBuf {
  702|     31|    models_dir
  703|     31|        .parent()
  704|     31|        .map(Path::to_path_buf)
  705|     31|        .unwrap_or_else(|| models_dir.to_path_buf())
                                         ^0         ^0
  706|     31|}
  707|       |
  708|      0|fn spawn_lock_path(models_dir: &Path) -> PathBuf {
  709|      0|    daemon_control_dir(models_dir).join("daemon-spawn.lock")
  710|      0|}
  711|       |
  712|     30|fn spawn_state_path(models_dir: &Path) -> PathBuf {
  713|     30|    daemon_control_dir(models_dir).join("daemon-spawn-state.json")
  714|     30|}
  715|       |
  716|      0|fn pid_file_path(models_dir: &Path) -> PathBuf {
  717|      0|    daemon_control_dir(models_dir).join("daemon.pid")
  718|      0|}
  719|       |
  720|      0|fn try_acquire_spawn_lock(models_dir: &Path) -> Result<Option<File>, AppError> {
  721|      0|    let path = spawn_lock_path(models_dir);
  722|      0|    std::fs::create_dir_all(crate::paths::parent_or_err(&path)?).map_err(AppError::Io)?;
  723|      0|    let file = OpenOptions::new()
  724|      0|        .read(true)
  725|      0|        .write(true)
  726|      0|        .create(true)
  727|      0|        .truncate(false)
  728|      0|        .open(path)
  729|      0|        .map_err(AppError::Io)?;
  730|       |
  731|      0|    let deadline = Instant::now() + Duration::from_millis(DAEMON_SPAWN_LOCK_WAIT_MS);
  732|       |    loop {
  733|      0|        match file.try_lock_exclusive() {
  734|      0|            Ok(()) => return Ok(Some(file)),
  735|      0|            Err(err) if err.kind() == std::io::ErrorKind::WouldBlock => {
  736|      0|                if Instant::now() >= deadline {
  737|      0|                    return Ok(None);
  738|      0|                }
  739|      0|                thread::sleep(Duration::from_millis(50));
  740|       |            }
  741|      0|            Err(err) => return Err(AppError::Io(err)),
  742|       |        }
  743|       |    }
  744|      0|}
  745|       |
  746|      3|fn spawn_backoff_active(models_dir: &Path) -> Result<bool, AppError> {
  747|      3|    let state = load_spawn_state(models_dir)?;
                                                          ^0
  748|      3|    Ok(now_epoch_ms() < state.not_before_epoch_ms)
  749|      3|}
  750|       |
  751|       |#[cold]
  752|       |#[inline(never)]
  753|     11|fn record_spawn_failure(models_dir: &Path, message: &str) -> Result<(), AppError> {
  754|     11|    let mut state = load_spawn_state(models_dir)?;
                                                              ^0
  755|     11|    state.consecutive_failures = state.consecutive_failures.saturating_add(1);
  756|     11|    let exponent = state.consecutive_failures.saturating_sub(1).min(6);
  757|     11|    let base_ms =
  758|     11|        (DAEMON_SPAWN_BACKOFF_BASE_MS * (1_u64 << exponent)).min(DAEMON_AUTO_START_MAX_BACKOFF_MS);
  759|       |    // v1.0.36 (L2) + v1.0.43 (H7): half-jitter via fastrand (replaces SystemTime nanoseconds
  760|       |    // which violated rules_rust_retry_com_backoff.md). Effective backoff range: [base/2, base).
  761|     11|    let half = base_ms / 2;
  762|     11|    let jitter = if half == 0 { 0 } else { fastrand::u64(0..half) };
                                              ^0
  763|     11|    let backoff_ms = half + jitter;
  764|     11|    state.not_before_epoch_ms = now_epoch_ms() + backoff_ms;
  765|     11|    state.last_error = Some(message.to_string());
  766|     11|    save_spawn_state(models_dir, &state)
  767|     11|}
  768|       |
  769|      1|fn clear_spawn_backoff_state(models_dir: &Path) -> Result<(), AppError> {
  770|      1|    let path = spawn_state_path(models_dir);
  771|      1|    if path.exists() {
  772|      1|        std::fs::remove_file(path).map_err(AppError::Io)?;
                                                                      ^0
  773|      0|    }
  774|      1|    Ok(())
  775|      1|}
  776|       |
  777|     17|fn load_spawn_state(models_dir: &Path) -> Result<DaemonSpawnState, AppError> {
  778|     17|    let path = spawn_state_path(models_dir);
  779|     17|    if !path.exists() {
  780|      4|        return Ok(DaemonSpawnState::default());
  781|     13|    }
  782|       |
  783|     13|    let bytes = std::fs::read(path).map_err(AppError::Io)?;
                                                                       ^0
  784|     13|    serde_json::from_slice(&bytes).map_err(AppError::Json)
  785|     17|}
  786|       |
  787|     12|fn save_spawn_state(models_dir: &Path, state: &DaemonSpawnState) -> Result<(), AppError> {
  788|     12|    let path = spawn_state_path(models_dir);
  789|     12|    std::fs::create_dir_all(crate::paths::parent_or_err(&path)?).map_err(AppError::Io)?;
                                                                            ^0                      ^0
  790|     12|    let bytes = serde_json::to_vec(state).map_err(AppError::Json)?;
                                                                               ^0
  791|     12|    std::fs::write(path, bytes).map_err(AppError::Io)
  792|     12|}
  793|       |
  794|       |/// Returns the GLiNER model variant string based on the environment variable
  795|       |/// `SQLITE_GRAPHRAG_GLINER_VARIANT`, defaulting to `"fp32"`.
  796|      2|fn gliner_variant_from_env() -> String {
  797|      2|    std::env::var("SQLITE_GRAPHRAG_GLINER_VARIANT").unwrap_or_else(|_| "fp32".to_string())
                                                                                     ^1     ^1
  798|      2|}
  799|       |
  800|     15|fn now_epoch_ms() -> u64 {
  801|     15|    SystemTime::now()
  802|     15|        .duration_since(UNIX_EPOCH)
  803|     15|        .unwrap_or_else(|_| Duration::from_secs(0))
                                          ^0
  804|     15|        .as_millis() as u64
  805|     15|}
  806|       |
  807|      2|fn to_local_socket_name(name: &str) -> std::io::Result<interprocess::local_socket::Name<'static>> {
  808|      2|    if let Ok(ns_name) = name.to_string().to_ns_name::<GenericNamespaced>() {
  809|      2|        return Ok(ns_name);
  810|      0|    }
  811|       |
  812|       |    // Fallback when abstract namespaces are unavailable. Honours XDG_RUNTIME_DIR
  813|       |    // (Linux user-private runtime dir) or SQLITE_GRAPHRAG_HOME (project override)
  814|       |    // before falling back to /tmp, which can collide when the same name is used
  815|       |    // by another user/project on a multi-tenant host. Added in v1.0.35.
  816|      0|    let path = if cfg!(unix) {
  817|      0|        let base = std::env::var_os("XDG_RUNTIME_DIR")
  818|      0|            .or_else(|| std::env::var_os("SQLITE_GRAPHRAG_HOME"))
  819|      0|            .map(std::path::PathBuf::from)
  820|      0|            .unwrap_or_else(std::env::temp_dir);
  821|      0|        base.join(format!("{name}.sock"))
  822|      0|            .to_string_lossy()
  823|      0|            .into_owned()
  824|       |    } else {
  825|      0|        format!(r"\\.\pipe\{name}")
  826|       |    };
  827|      0|    path.to_fs_name::<GenericFilePath>()
  828|      2|}
  829|       |
  830|       |#[cfg(test)]
  831|       |mod tests {
  832|       |    use super::*;
  833|       |
  834|       |    #[test]
  835|      1|    fn record_and_clear_spawn_backoff_state() {
  836|      1|        let tmp = tempfile::tempdir().unwrap();
  837|      1|        let models_dir = tmp.path().join("cache").join("models");
  838|      1|        std::fs::create_dir_all(&models_dir).unwrap();
  839|       |
  840|      1|        assert!(!spawn_backoff_active(&models_dir).unwrap());
  841|       |
  842|      1|        record_spawn_failure(&models_dir, "spawn failed").unwrap();
  843|      1|        assert!(spawn_backoff_active(&models_dir).unwrap());
  844|       |
  845|      1|        let state = load_spawn_state(&models_dir).unwrap();
  846|      1|        assert_eq!(state.consecutive_failures, 1);
  847|      1|        assert_eq!(state.last_error.as_deref(), Some("spawn failed"));
  848|       |
  849|      1|        clear_spawn_backoff_state(&models_dir).unwrap();
  850|      1|        assert!(!spawn_backoff_active(&models_dir).unwrap());
  851|      1|    }
  852|       |
  853|       |    #[test]
  854|      1|    fn daemon_control_dir_uses_models_parent() {
  855|      1|        let base = PathBuf::from("/tmp/sqlite-graphrag-cache-test");
  856|      1|        let models_dir = base.join("models");
  857|      1|        assert_eq!(daemon_control_dir(&models_dir), base);
  858|      1|    }
  859|       |
  860|       |    #[test]
  861|      1|    fn version_state_constants_are_distinct() {
  862|      1|        assert_ne!(VERSION_NOT_CHECKED, VERSION_COMPATIBLE);
  863|      1|        assert_ne!(VERSION_NOT_CHECKED, VERSION_RESTART_ATTEMPTED);
  864|      1|        assert_ne!(VERSION_COMPATIBLE, VERSION_RESTART_ATTEMPTED);
  865|      1|    }
  866|       |
  867|       |    #[test]
  868|      1|    fn wait_for_daemon_exit_immediate_when_not_running() {
  869|      1|        let tmp = tempfile::tempdir().unwrap();
  870|      1|        let models_dir = tmp.path().join("cache").join("models");
  871|      1|        std::fs::create_dir_all(&models_dir).unwrap();
  872|       |
  873|      1|        let start = Instant::now();
  874|      1|        wait_for_daemon_exit(&models_dir).unwrap();
  875|       |        // Without a daemon, the first ping returns None and the function exits immediately.
  876|      1|        assert!(start.elapsed() < Duration::from_millis(500));
  877|      1|    }
  878|       |
  879|       |    #[test]
  880|      1|    fn spawn_backoff_exponent_caps_at_six() {
  881|      1|        let tmp = tempfile::tempdir().unwrap();
  882|      1|        let models_dir = tmp.path().join("cache").join("models");
  883|      1|        std::fs::create_dir_all(&models_dir).unwrap();
  884|       |
  885|       |        // Record 10 consecutive failures to force exponent saturation.
  886|     11|        for i in 0..10 {
                          ^10
  887|     10|            record_spawn_failure(&models_dir, &format!("failure {i}")).unwrap();
  888|     10|        }
  889|       |
  890|      1|        let state = load_spawn_state(&models_dir).unwrap();
  891|      1|        assert_eq!(state.consecutive_failures, 10);
  892|       |
  893|       |        // Exponent is clamped at 6, so max base_ms is base * 2^6.
  894|       |        // Effective backoff range is [base/2, base), where base <= base_ms * 64.
  895|      1|        let max_base =
  896|      1|            (DAEMON_SPAWN_BACKOFF_BASE_MS * (1_u64 << 6)).min(DAEMON_AUTO_START_MAX_BACKOFF_MS);
  897|       |        // The not_before_epoch_ms must not exceed now + max_base (upper bound with jitter < half).
  898|      1|        let now = now_epoch_ms();
  899|      1|        assert!(state.not_before_epoch_ms <= now + max_base);
  900|      1|    }
  901|       |
  902|       |    #[test]
  903|      1|    fn spawn_backoff_half_jitter_in_range() {
  904|       |        // Verify the half-jitter formula: result = half + fastrand::u64(0..half)
  905|       |        // produces values in [half, half + half) == [base/2, base).
  906|      1|        let base_ms: u64 = 100;
  907|      1|        let half = base_ms / 2;
  908|    101|        for _ in 0..100 {
  909|    100|            let jitter = fastrand::u64(0..half);
  910|    100|            let result = half + jitter;
  911|    100|            assert!(result >= half, "result {result} below half {half}");
                                                  ^0
  912|    100|            assert!(result < base_ms, "result {result} not below base {base_ms}");
                                                    ^0
  913|       |        }
  914|      1|    }
  915|       |
  916|       |    #[test]
  917|      1|    fn to_local_socket_name_produces_valid_result() {
  918|      1|        let result = to_local_socket_name("sqlite-graphrag-test-daemon");
  919|      1|        assert!(result.is_ok(), "expected Ok, got {result:?}");
                                              ^0
  920|       |        // The name string representation must be non-empty.
  921|      1|        let name = result.unwrap();
  922|      1|        let display = format!("{name:?}");
  923|      1|        assert!(!display.is_empty());
  924|      1|    }
  925|       |
  926|       |    #[test]
  927|      1|    fn version_cas_not_checked_to_compatible() {
  928|      1|        let state = AtomicU8::new(VERSION_NOT_CHECKED);
  929|      1|        let result = state.compare_exchange(
  930|       |            VERSION_NOT_CHECKED,
  931|       |            VERSION_COMPATIBLE,
  932|      1|            Ordering::SeqCst,
  933|      1|            Ordering::SeqCst,
  934|       |        );
  935|      1|        assert!(result.is_ok());
  936|      1|        assert_eq!(state.load(Ordering::SeqCst), VERSION_COMPATIBLE);
  937|      1|    }
  938|       |
  939|       |    #[test]
  940|      1|    fn version_cas_prevents_double_restart() {
  941|      1|        let state = AtomicU8::new(VERSION_NOT_CHECKED);
  942|       |
  943|       |        // First CAS: NOT_CHECKED → RESTART_ATTEMPTED succeeds.
  944|      1|        let first = state.compare_exchange(
  945|       |            VERSION_NOT_CHECKED,
  946|       |            VERSION_RESTART_ATTEMPTED,
  947|      1|            Ordering::SeqCst,
  948|      1|            Ordering::SeqCst,
  949|       |        );
  950|      1|        assert!(first.is_ok());
  951|       |
  952|       |        // Second CAS from NOT_CHECKED must fail — state is already RESTART_ATTEMPTED.
  953|      1|        let second = state.compare_exchange(
  954|       |            VERSION_NOT_CHECKED,
  955|       |            VERSION_RESTART_ATTEMPTED,
  956|      1|            Ordering::SeqCst,
  957|      1|            Ordering::SeqCst,
  958|       |        );
  959|      1|        assert!(second.is_err());
  960|      1|        assert_eq!(state.load(Ordering::SeqCst), VERSION_RESTART_ATTEMPTED);
  961|      1|    }
  962|       |
  963|       |    #[test]
  964|      1|    fn ping_response_includes_model_fields() {
  965|      1|        let resp = DaemonResponse::Ok {
  966|      1|            pid: 42,
  967|      1|            version: "1.0.0".to_string(),
  968|      1|            handled_embed_requests: 7,
  969|      1|            model_name: "multilingual-e5-small".to_string(),
  970|      1|            model_variant: "fp32".to_string(),
  971|      1|        };
  972|      1|        let json = serde_json::to_value(&resp).expect("serialization failed");
  973|      1|        assert_eq!(json["model_name"], "multilingual-e5-small");
  974|      1|        assert_eq!(json["model_variant"], "fp32");
  975|      1|        assert_eq!(json["status"], "ok");
  976|      1|        assert_eq!(json["handled_embed_requests"], 7u64);
  977|      1|    }
  978|       |
  979|       |    #[test]
  980|      1|    fn gliner_variant_defaults_to_fp32() {
  981|       |        // Ensure the default is fp32 when env var is not set.
  982|      1|        std::env::remove_var("SQLITE_GRAPHRAG_GLINER_VARIANT");
  983|      1|        let variant = gliner_variant_from_env();
  984|      1|        assert_eq!(variant, "fp32");
  985|      1|    }
  986|       |
  987|       |    #[test]
  988|      1|    fn gliner_variant_reads_env_var() {
  989|      1|        std::env::set_var("SQLITE_GRAPHRAG_GLINER_VARIANT", "int8");
  990|      1|        let variant = gliner_variant_from_env();
  991|      1|        std::env::remove_var("SQLITE_GRAPHRAG_GLINER_VARIANT");
  992|      1|        assert_eq!(variant, "int8");
  993|      1|    }
  994|       |
  995|       |    #[test]
  996|      1|    fn spawn_state_serialization_roundtrip() {
  997|      1|        let tmp = tempfile::tempdir().unwrap();
  998|      1|        let models_dir = tmp.path().join("cache").join("models");
  999|      1|        std::fs::create_dir_all(&models_dir).unwrap();
 1000|       |
 1001|      1|        let original = DaemonSpawnState {
 1002|      1|            consecutive_failures: 3,
 1003|      1|            not_before_epoch_ms: 9_999_999_999,
 1004|      1|            last_error: Some("test error message".to_string()),
 1005|      1|        };
 1006|      1|        save_spawn_state(&models_dir, &original).unwrap();
 1007|       |
 1008|      1|        let loaded = load_spawn_state(&models_dir).unwrap();
 1009|      1|        assert_eq!(loaded.consecutive_failures, original.consecutive_failures);
 1010|      1|        assert_eq!(loaded.not_before_epoch_ms, original.not_before_epoch_ms);
 1011|      1|        assert_eq!(loaded.last_error, original.last_error);
 1012|      1|    }
 1013|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/embedder.rs:
    1|       |//! fastembed wrapper and per-process embedding cache.
    2|       |//!
    3|       |//! Owns the in-process `TextEmbedding` model and exposes batch encode/query
    4|       |//! helpers used by remember, recall, and related commands.
    5|       |// Workload: CPU-bound (ONNX inference, matrix multiplication via fastembed)
    6|       |
    7|       |use crate::constants::{
    8|       |    EMBEDDING_DIM, EMBEDDING_MAX_TOKENS, FASTEMBED_BATCH_SIZE, PASSAGE_PREFIX, QUERY_PREFIX,
    9|       |    REMEMBER_MAX_CONTROLLED_BATCH_CHUNKS, REMEMBER_MAX_CONTROLLED_BATCH_PADDED_TOKENS,
   10|       |};
   11|       |use crate::errors::AppError;
   12|       |use fastembed::{EmbeddingModel, ExecutionProviderDispatch, TextEmbedding, TextInitOptions};
   13|       |use ort::ep::CPU;
   14|       |use parking_lot::Mutex;
   15|       |use std::path::Path;
   16|       |use std::sync::OnceLock;
   17|       |
   18|       |/// Process-wide singleton embedding model behind a `Mutex`.
   19|       |///
   20|       |/// ONNX Runtime's `Session` is not guaranteed thread-safe for concurrent
   21|       |/// inference; `Mutex` serialises all embedding calls.  This is correct by
   22|       |/// design — without the daemon, embedding throughput is intentionally serial.
   23|       |///
   24|       |/// For parallel workloads (enrich, ingest) start the daemon first:
   25|       |/// `sqlite-graphrag daemon` — the model is loaded once and served via UDS,
   26|       |/// eliminating Mutex contention across CLI invocations.
   27|       |static EMBEDDER: OnceLock<Mutex<TextEmbedding>> = OnceLock::new();
   28|       |
   29|       |/// Returns the process-wide singleton embedder, initializing it on first call.
   30|       |/// Subsequent calls return the cached instance regardless of `models_dir`.
   31|       |///
   32|       |/// # Errors
   33|       |///
   34|       |/// - [`AppError::Embedding`] — ONNX model load failure or runtime initialisation error.
   35|       |/// - [`AppError::Io`] — cache directory is inaccessible or cannot be created.
   36|      0|pub fn get_embedder(models_dir: &Path) -> Result<&'static Mutex<TextEmbedding>, AppError> {
   37|      0|    if let Some(m) = EMBEDDER.get() {
   38|      0|        return Ok(m);
   39|      0|    }
   40|       |
   41|      0|    maybe_init_dynamic_ort(models_dir)?;
   42|       |
   43|       |    // Multi-layer mitigation of the explosive RSS observed with variable-shape
   44|       |    // payloads. The three current layers are:
   45|       |    //   1. `with_arena_allocator(false)` on the CPU execution provider (line below)
   46|       |    //   2. env var `ORT_DISABLE_CPU_MEM_ARENA=1` in `main.rs` (default since v1.0.18)
   47|       |    //   3. env var `ORT_NUM_THREADS=1` + `ORT_INTRA_OP_NUM_THREADS=1` in `main.rs`
   48|       |    // The `with_memory_pattern(false)` flag exists in ort 2.0 (`SessionBuilder`)
   49|       |    // but fastembed 5.13.2 does NOT expose access to a custom SessionBuilder via
   50|       |    // `TextInitOptions`. If RSS grows again in real corpora, the next
   51|       |    // mitigation requires one of the following paths:
   52|       |    //   - Fork fastembed to expose `SessionBuilder::with_memory_pattern(false)`
   53|       |    //   - Bypass fastembed and use ort directly with a custom SessionBuilder
   54|       |    //   - Fixed padding in `plan_controlled_batches` to eliminate variable shapes
   55|       |    // References:
   56|       |    //   https://onnxruntime.ai/docs/performance/tune-performance/memory.html
   57|       |    //   https://github.com/qdrant/fastembed/issues/570
   58|      0|    let cpu_ep: ExecutionProviderDispatch = CPU::default().with_arena_allocator(false).build();
   59|       |
   60|      0|    let model = TextEmbedding::try_new(
   61|      0|        TextInitOptions::new(EmbeddingModel::MultilingualE5Small)
   62|      0|            .with_execution_providers(vec![cpu_ep])
   63|      0|            .with_max_length(EMBEDDING_MAX_TOKENS)
   64|      0|            .with_show_download_progress(true)
   65|      0|            .with_cache_dir(models_dir.to_path_buf()),
   66|       |    )
   67|      0|    .map_err(|e| AppError::Embedding(e.to_string()))?;
   68|       |    // If another thread raced and won, discard our instance and return theirs.
   69|      0|    let _ = EMBEDDER.set(Mutex::new(model));
   70|      0|    EMBEDDER.get().ok_or_else(|| {
   71|      0|        AppError::Embedding(
   72|      0|            "embedder OnceLock unexpectedly empty after set() (likely a racing initializer aborted before completion)"
   73|      0|                .into(),
   74|      0|        )
   75|      0|    })
   76|      0|}
   77|       |
   78|       |#[cfg(all(target_arch = "aarch64", target_os = "linux", target_env = "gnu"))]
   79|       |fn maybe_init_dynamic_ort(models_dir: &Path) -> Result<(), AppError> {
   80|       |    let mut candidates = Vec::with_capacity(4);
   81|       |
   82|       |    if let Ok(path) = std::env::var("ORT_DYLIB_PATH") {
   83|       |        if !path.is_empty() {
   84|       |            candidates.push(std::path::PathBuf::from(path));
   85|       |        }
   86|       |    }
   87|       |
   88|       |    if let Ok(exe) = std::env::current_exe() {
   89|       |        if let Some(dir) = exe.parent() {
   90|       |            candidates.push(dir.join("libonnxruntime.so"));
   91|       |            candidates.push(dir.join("lib").join("libonnxruntime.so"));
   92|       |        }
   93|       |    }
   94|       |
   95|       |    candidates.push(models_dir.join("libonnxruntime.so"));
   96|       |
   97|       |    for path in candidates {
   98|       |        if !path.exists() {
   99|       |            continue;
  100|       |        }
  101|       |
  102|       |        std::env::set_var("ORT_DYLIB_PATH", &path);
  103|       |        let _ = ort::init_from(&path)
  104|       |            .map_err(|e| AppError::Embedding(e.to_string()))?
  105|       |            .commit();
  106|       |        return Ok(());
  107|       |    }
  108|       |
  109|       |    Ok(())
  110|       |}
  111|       |
  112|       |#[cfg(not(all(target_arch = "aarch64", target_os = "linux", target_env = "gnu")))]
  113|      0|fn maybe_init_dynamic_ort(_models_dir: &Path) -> Result<(), AppError> {
  114|      0|    Ok(())
  115|      0|}
  116|       |
  117|       |/// Embeds a single passage using the `passage:` prefix required by E5 models.
  118|       |///
  119|       |/// # Errors
  120|       |/// Returns `Err` when the model returns an unexpected result.
  121|       |#[tracing::instrument(skip(embedder, text), fields(text_len = text.len()))]
  122|      0|pub fn embed_passage(embedder: &Mutex<TextEmbedding>, text: &str) -> Result<Vec<f32>, AppError> {
  123|      0|    let prefixed = format!("{PASSAGE_PREFIX}{text}");
  124|      0|    let results = embedder
  125|      0|        .lock()
  126|      0|        .embed(vec![prefixed.as_str()], Some(1))
  127|      0|        .map_err(|e| AppError::Embedding(e.to_string()))?;
  128|      0|    let emb = results
  129|      0|        .into_iter()
  130|      0|        .next()
  131|      0|        .ok_or_else(|| AppError::Embedding("empty embedding result".into()))?;
  132|      0|    assert_eq!(emb.len(), EMBEDDING_DIM, "unexpected embedding dimension");
  133|      0|    Ok(emb)
  134|      0|}
  135|       |
  136|       |/// Embeds a search query using the `query:` prefix required by E5 models.
  137|       |///
  138|       |/// # Errors
  139|       |/// Returns `Err` when the model returns an unexpected result.
  140|       |#[tracing::instrument(skip(embedder, text), fields(text_len = text.len()))]
  141|      0|pub fn embed_query(embedder: &Mutex<TextEmbedding>, text: &str) -> Result<Vec<f32>, AppError> {
  142|      0|    let prefixed = format!("{QUERY_PREFIX}{text}");
  143|      0|    let results = embedder
  144|      0|        .lock()
  145|      0|        .embed(vec![prefixed.as_str()], Some(1))
  146|      0|        .map_err(|e| AppError::Embedding(e.to_string()))?;
  147|      0|    let emb = results
  148|      0|        .into_iter()
  149|      0|        .next()
  150|      0|        .ok_or_else(|| AppError::Embedding("empty embedding result".into()))?;
  151|      0|    Ok(emb)
  152|      0|}
  153|       |
  154|       |/// Embeds multiple passages in a single ONNX batch call.
  155|       |///
  156|       |/// `batch_size` is capped at `FASTEMBED_BATCH_SIZE`. All texts receive the `passage:` prefix.
  157|       |///
  158|       |/// # Errors
  159|       |/// Returns `Err` when the model inference fails.
  160|       |#[tracing::instrument(skip(embedder, texts), fields(batch_size = texts.len()))]
  161|      0|pub fn embed_passages_batch(
  162|      0|    embedder: &Mutex<TextEmbedding>,
  163|      0|    texts: &[&str],
  164|      0|    batch_size: usize,
  165|      0|) -> Result<Vec<Vec<f32>>, AppError> {
  166|      0|    let prefixed: Vec<String> = texts
  167|      0|        .iter()
  168|      0|        .map(|t| format!("{PASSAGE_PREFIX}{t}"))
  169|      0|        .collect();
  170|      0|    let strs: Vec<&str> = prefixed.iter().map(String::as_str).collect();
  171|      0|    let results = embedder
  172|      0|        .lock()
  173|      0|        .embed(strs, Some(batch_size.min(FASTEMBED_BATCH_SIZE)))
  174|      0|        .map_err(|e| AppError::Embedding(e.to_string()))?;
  175|      0|    for emb in &results {
  176|      0|        assert_eq!(emb.len(), EMBEDDING_DIM, "unexpected embedding dimension");
  177|       |    }
  178|      0|    Ok(results)
  179|      0|}
  180|       |
  181|       |/// Returns the number of batches that [`embed_passages_controlled`] would produce
  182|       |/// for the given `token_counts` slice without running inference.
  183|      1|pub fn controlled_batch_count(token_counts: &[usize]) -> usize {
  184|      1|    plan_controlled_batches(token_counts).len()
  185|      1|}
  186|       |
  187|       |/// Embeds passages grouped into token-budget-aware batches to avoid OOM on variable-length inputs.
  188|       |///
  189|       |/// `texts` and `token_counts` must have the same length. Batches are planned using an
  190|       |/// internal budget algorithm and single-item batches fall back to [`embed_passage`].
  191|       |///
  192|       |/// # Errors
  193|       |/// Returns `Err` when lengths differ or inference fails.
  194|      0|pub fn embed_passages_controlled(
  195|      0|    embedder: &Mutex<TextEmbedding>,
  196|      0|    texts: &[&str],
  197|      0|    token_counts: &[usize],
  198|      0|) -> Result<Vec<Vec<f32>>, AppError> {
  199|      0|    if texts.len() != token_counts.len() {
  200|      0|        return Err(AppError::Internal(anyhow::anyhow!(
  201|      0|            "texts/token_counts length mismatch in controlled embedding"
  202|      0|        )));
  203|      0|    }
  204|       |
  205|      0|    let mut results = Vec::with_capacity(texts.len());
  206|      0|    for (start, end) in plan_controlled_batches(token_counts) {
  207|      0|        if end - start == 1 {
  208|      0|            results.push(embed_passage(embedder, texts[start])?);
  209|      0|            continue;
  210|      0|        }
  211|       |
  212|      0|        results.extend(embed_passages_batch(
  213|      0|            embedder,
  214|      0|            &texts[start..end],
  215|      0|            end - start,
  216|      0|        )?);
  217|       |    }
  218|       |
  219|      0|    Ok(results)
  220|      0|}
  221|       |
  222|       |/// Embed multiple passages one-by-one (serial ONNX inference).
  223|       |///
  224|       |/// Serialization is **intentional**: ONNX batch inference can trigger pathological
  225|       |/// runtime behaviour on real-world Markdown chunks (variable token lengths cause
  226|       |/// extreme padding overhead). Callers that need parallelism should use the rayon
  227|       |/// `ThreadPool` in `src/commands/ingest.rs::run`, which partitions work across
  228|       |/// CPU threads and calls this function per shard.
  229|       |///
  230|       |/// # Errors
  231|       |///
  232|       |/// Returns [`AppError::Embedding`] when the ONNX encoder fails on any passage.
  233|      0|pub fn embed_passages_serial<'a, I>(
  234|      0|    embedder: &Mutex<TextEmbedding>,
  235|      0|    texts: I,
  236|      0|) -> Result<Vec<Vec<f32>>, AppError>
  237|      0|where
  238|      0|    I: IntoIterator<Item = &'a str>,
  239|       |{
  240|      0|    let iter = texts.into_iter();
  241|      0|    let (lower, _) = iter.size_hint();
  242|      0|    let mut results = Vec::with_capacity(lower);
  243|      0|    for text in iter {
  244|      0|        results.push(embed_passage(embedder, text)?);
  245|       |    }
  246|      0|    Ok(results)
  247|      0|}
  248|       |
  249|      2|fn plan_controlled_batches(token_counts: &[usize]) -> Vec<(usize, usize)> {
  250|      2|    let mut batches =
  251|      2|        Vec::with_capacity((token_counts.len() / REMEMBER_MAX_CONTROLLED_BATCH_CHUNKS).max(1));
  252|      2|    let mut start = 0usize;
  253|       |
  254|      6|    while start < token_counts.len() {
  255|      4|        let mut end = start + 1;
  256|      4|        let mut max_tokens = token_counts[start].max(1);
  257|       |
  258|      7|        while end < token_counts.len() && end - start < REMEMBER_MAX_CONTROLLED_BATCH_CHUNKS {
                                                        ^5
  259|      4|            let candidate_max = max_tokens.max(token_counts[end].max(1));
  260|      4|            let candidate_len = end + 1 - start;
  261|      4|            if candidate_max * candidate_len > REMEMBER_MAX_CONTROLLED_BATCH_PADDED_TOKENS {
  262|      1|                break;
  263|      3|            }
  264|      3|            max_tokens = candidate_max;
  265|      3|            end += 1;
  266|       |        }
  267|       |
  268|      4|        batches.push((start, end));
  269|      4|        start = end;
  270|       |    }
  271|       |
  272|      2|    batches
  273|      2|}
  274|       |
  275|       |/// Convert `&[f32]` to `&[u8]` for sqlite-vec storage.
  276|       |///
  277|       |/// # Safety
  278|       |///
  279|       |/// This function is sound when the following invariants hold:
  280|       |/// 1. `f32` has no padding bytes per the Rust reference
  281|       |///    (<https://doc.rust-lang.org/reference/types/numeric.html>);
  282|       |///    `[f32]` has the same byte representation as `[u8; size_of_val(v)]`.
  283|       |/// 2. The returned `&[u8]` borrows from `v`; its lifetime is tied to the input slice.
  284|       |/// 3. Endianness matches sqlite-vec on supported platforms (x86_64, aarch64 little-endian).
  285|       |///    Targets with big-endian `f32` storage are not supported by sqlite-vec.
  286|       |#[cfg(target_endian = "big")]
  287|       |compile_error!(
  288|       |    "sqlite-graphrag requires little-endian f32 layout for sqlite-vec compatibility. \
  289|       |     Big-endian targets (PPC64, S390x) are not supported."
  290|       |);
  291|       |
  292|     25|pub fn f32_to_bytes(v: &[f32]) -> &[u8] {
  293|       |    // SAFETY: see invariants above. f32→u8 transmute via from_raw_parts is sound.
  294|     25|    unsafe { std::slice::from_raw_parts(v.as_ptr() as *const u8, std::mem::size_of_val(v)) }
  295|     25|}
  296|       |
  297|       |#[cfg(test)]
  298|       |mod tests {
  299|       |    use super::*;
  300|       |    use crate::constants::{EMBEDDING_DIM, PASSAGE_PREFIX, QUERY_PREFIX};
  301|       |
  302|       |    // --- f32_to_bytes tests (pure function, no model) ---
  303|       |
  304|       |    #[test]
  305|      1|    fn f32_to_bytes_empty_slice_returns_empty() {
  306|      1|        let v: Vec<f32> = vec![];
  307|      1|        assert_eq!(f32_to_bytes(&v), &[] as &[u8]);
  308|      1|    }
  309|       |
  310|       |    #[test]
  311|      1|    fn f32_to_bytes_one_element_returns_4_bytes() {
  312|      1|        let v = vec![1.0_f32];
  313|      1|        let bytes = f32_to_bytes(&v);
  314|      1|        assert_eq!(bytes.len(), 4);
  315|       |        // roundtrip: the 4 bytes must reconstruct the original f32
  316|      1|        let recovered = f32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]);
  317|      1|        assert_eq!(recovered, 1.0_f32);
  318|      1|    }
  319|       |
  320|       |    #[test]
  321|      1|    fn f32_to_bytes_length_is_4x_elements() {
  322|      1|        let v = vec![0.0_f32, 1.0, 2.0, 3.0];
  323|      1|        assert_eq!(f32_to_bytes(&v).len(), v.len() * 4);
  324|      1|    }
  325|       |
  326|       |    #[test]
  327|      1|    fn f32_to_bytes_zero_encoded_as_4_zeros() {
  328|      1|        let v = vec![0.0_f32];
  329|      1|        assert_eq!(f32_to_bytes(&v), &[0u8, 0, 0, 0]);
  330|      1|    }
  331|       |
  332|       |    #[test]
  333|      1|    fn f32_to_bytes_roundtrip_vector_embedding_dim() {
  334|    384|        let v: Vec<f32> = (0..EMBEDDING_DIM).map(|i| i as f32 * 0.001).collect();
                          ^1 ^1         ^1                 ^1                        ^1
  335|      1|        let bytes = f32_to_bytes(&v);
  336|      1|        assert_eq!(bytes.len(), EMBEDDING_DIM * 4);
  337|       |        // reconstructs and compares first and last element
  338|      1|        let first = f32::from_le_bytes(bytes[0..4].try_into().unwrap());
  339|      1|        assert!((first - 0.0_f32).abs() < 1e-6);
  340|      1|        let last_start = (EMBEDDING_DIM - 1) * 4;
  341|      1|        let last = f32::from_le_bytes(bytes[last_start..last_start + 4].try_into().unwrap());
  342|      1|        assert!((last - (EMBEDDING_DIM - 1) as f32 * 0.001).abs() < 1e-4);
  343|      1|    }
  344|       |
  345|       |    // --- verifies prefixes used by the embedder (no model) ---
  346|       |
  347|       |    #[test]
  348|      1|    fn passage_prefix_not_empty() {
  349|      1|        assert_eq!(PASSAGE_PREFIX, "passage: ");
  350|      1|    }
  351|       |
  352|       |    #[test]
  353|      1|    fn query_prefix_not_empty() {
  354|      1|        assert_eq!(QUERY_PREFIX, "query: ");
  355|      1|    }
  356|       |
  357|       |    #[test]
  358|      1|    fn embedding_dim_is_384() {
  359|      1|        assert_eq!(EMBEDDING_DIM, 384);
  360|      1|    }
  361|       |
  362|       |    // --- tests with the real model (ignored in normal CI) ---
  363|       |
  364|       |    #[test]
  365|       |    #[ignore = "requires ~600 MB model on disk; run with --include-ignored"]
  366|      0|    fn embed_passage_returns_vector_with_correct_dimension() {
  367|      0|        let dir = tempfile::tempdir().unwrap();
  368|      0|        let embedder = get_embedder(dir.path()).unwrap();
  369|      0|        let result = embed_passage(embedder, "test text").unwrap();
  370|      0|        assert_eq!(result.len(), EMBEDDING_DIM);
  371|      0|    }
  372|       |
  373|       |    #[test]
  374|       |    #[ignore = "requires ~600 MB model on disk; run with --include-ignored"]
  375|      0|    fn embed_query_returns_vector_with_correct_dimension() {
  376|      0|        let dir = tempfile::tempdir().unwrap();
  377|      0|        let embedder = get_embedder(dir.path()).unwrap();
  378|      0|        let result = embed_query(embedder, "test query").unwrap();
  379|      0|        assert_eq!(result.len(), EMBEDDING_DIM);
  380|      0|    }
  381|       |
  382|       |    #[test]
  383|       |    #[ignore = "requires ~600 MB model on disk; run with --include-ignored"]
  384|      0|    fn embed_passages_batch_returns_one_vector_per_text() {
  385|      0|        let dir = tempfile::tempdir().unwrap();
  386|      0|        let embedder = get_embedder(dir.path()).unwrap();
  387|      0|        let textos = ["primeiro", "segundo"];
  388|      0|        let results = embed_passages_batch(embedder, &textos, 2).unwrap();
  389|      0|        assert_eq!(results.len(), 2);
  390|      0|        for emb in &results {
  391|      0|            assert_eq!(emb.len(), EMBEDDING_DIM);
  392|       |        }
  393|      0|    }
  394|       |
  395|       |    #[test]
  396|      1|    fn controlled_batch_plan_respects_budget() {
  397|      1|        assert_eq!(
  398|      1|            plan_controlled_batches(&[100, 100, 100, 100, 300, 300]),
  399|      1|            vec![(0, 4), (4, 5), (5, 6)]
  400|       |        );
  401|      1|    }
  402|       |
  403|       |    #[test]
  404|      1|    fn controlled_batch_count_returns_one_for_single_chunk() {
  405|      1|        assert_eq!(controlled_batch_count(&[350]), 1);
  406|      1|    }
  407|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/entity_type.rs:
    1|       |//! Canonical entity type taxonomy used across extraction, storage and CLI.
    2|       |//!
    3|       |//! `EntityType` is the single source of truth for the 13 graph entity kinds.
    4|       |//! It derives `clap::ValueEnum` so CLI flags can use it directly, and derives
    5|       |//! `serde::{Serialize, Deserialize}` with `rename_all = "snake_case"` so JSON
    6|       |//! round-trips remain backward-compatible with the pre-enum string format.
    7|       |
    8|       |use crate::errors::AppError;
    9|       |
   10|       |/// The 13 canonical graph entity classifications.
   11|       |///
   12|       |/// Values are serialized as lowercase snake_case strings (`"person"`, `"issue_tracker"`,
   13|       |/// etc.) matching the pre-enum wire format and the SQLite `type` column.
   14|       |#[derive(
   15|       |    Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize, clap::ValueEnum,
   16|       |)]
   17|       |#[serde(rename_all = "snake_case")]
   18|       |#[clap(rename_all = "snake_case")]
   19|       |pub enum EntityType {
   20|       |    Concept,
   21|       |    Date,
   22|       |    Dashboard,
   23|       |    Decision,
   24|       |    File,
   25|       |    Incident,
   26|       |    IssueTracker,
   27|       |    Location,
   28|       |    Memory,
   29|       |    Organization,
   30|       |    Person,
   31|       |    Project,
   32|       |    Tool,
   33|       |}
   34|       |
   35|       |impl EntityType {
   36|       |    /// Returns the canonical lowercase string representation stored in SQLite.
   37|     96|    pub fn as_str(self) -> &'static str {
   38|     96|        match self {
   39|     24|            EntityType::Concept => "concept",
   40|      0|            EntityType::Date => "date",
   41|      0|            EntityType::Dashboard => "dashboard",
   42|      0|            EntityType::Decision => "decision",
   43|      0|            EntityType::File => "file",
   44|      0|            EntityType::Incident => "incident",
   45|      1|            EntityType::IssueTracker => "issue_tracker",
   46|      0|            EntityType::Location => "location",
   47|      0|            EntityType::Memory => "memory",
   48|     11|            EntityType::Organization => "organization",
   49|      5|            EntityType::Person => "person",
   50|     53|            EntityType::Project => "project",
   51|      2|            EntityType::Tool => "tool",
   52|       |        }
   53|     96|    }
   54|       |}
   55|       |
   56|       |impl std::fmt::Display for EntityType {
   57|      0|    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
   58|      0|        f.write_str(self.as_str())
   59|      0|    }
   60|       |}
   61|       |
   62|       |impl std::str::FromStr for EntityType {
   63|       |    type Err = AppError;
   64|       |
   65|     16|    fn from_str(s: &str) -> Result<Self, Self::Err> {
   66|     16|        match s.to_lowercase().as_str() {
   67|     16|            "concept" => Ok(EntityType::Concept),
                                       ^1
   68|     15|            "date" => Ok(EntityType::Date),
                                    ^1
   69|     14|            "dashboard" => Ok(EntityType::Dashboard),
                                         ^0
   70|     14|            "decision" => Ok(EntityType::Decision),
                                        ^1
   71|     13|            "file" => Ok(EntityType::File),
                                    ^1
   72|     12|            "incident" => Ok(EntityType::Incident),
                                        ^1
   73|     11|            "issue_tracker" => Ok(EntityType::IssueTracker),
                                             ^1
   74|     10|            "location" => Ok(EntityType::Location),
                                        ^1
   75|      9|            "memory" => Ok(EntityType::Memory),
                                      ^0
   76|      9|            "organization" => Ok(EntityType::Organization),
                                            ^3
   77|      6|            "person" => Ok(EntityType::Person),
                                      ^3
   78|      3|            "project" => Ok(EntityType::Project),
                                       ^1
   79|      2|            "tool" => Ok(EntityType::Tool),
                                    ^1
   80|      1|            other => {
   81|      1|                let hint = match other {
   82|      1|                    "reference" | "skill" | "note" | "feedback" => Some("concept"),
                                                                                 ^0
   83|      1|                    "document" => Some("file"),
                                                ^0
   84|      1|                    "user" => Some("person"),
                                            ^0
   85|      1|                    _ => None,
   86|       |                };
   87|      1|                let msg = if let Some(suggested) = hint {
                                                    ^0
   88|      0|                    format!(
   89|      0|                        "invalid entity_type '{other}'; '{other}' is a MEMORY type, not an entity type. \
   90|      0|                         Try '{suggested}' instead. Valid entity types: concept, date, dashboard, \
   91|      0|                         decision, file, incident, issue_tracker, location, memory, organization, \
   92|      0|                         person, project, tool"
   93|       |                    )
   94|       |                } else {
   95|      1|                    format!(
   96|      1|                        "invalid entity type: {other}; expected one of: concept, date, dashboard, \
   97|      1|                         decision, file, incident, issue_tracker, location, memory, organization, \
   98|      1|                         person, project, tool"
   99|       |                    )
  100|       |                };
  101|      1|                Err(AppError::Validation(msg))
  102|       |            }
  103|       |        }
  104|     16|    }
  105|       |}
  106|       |
  107|       |impl rusqlite::types::FromSql for EntityType {
  108|      0|    fn column_result(value: rusqlite::types::ValueRef<'_>) -> rusqlite::types::FromSqlResult<Self> {
  109|      0|        let s = String::column_result(value)?;
  110|      0|        s.parse::<EntityType>().map_err(|e| {
  111|      0|            rusqlite::types::FromSqlError::Other(Box::new(std::io::Error::other(e.to_string())))
  112|      0|        })
  113|      0|    }
  114|       |}
  115|       |
  116|       |impl rusqlite::types::ToSql for EntityType {
  117|     55|    fn to_sql(&self) -> rusqlite::Result<rusqlite::types::ToSqlOutput<'_>> {
  118|     55|        Ok(rusqlite::types::ToSqlOutput::from(self.as_str()))
  119|     55|    }
  120|       |}
  121|       |
  122|       |#[cfg(test)]
  123|       |mod tests {
  124|       |    use super::*;
  125|       |
  126|       |    #[test]
  127|      1|    fn from_str_lowercase_roundtrip() {
  128|      1|        assert_eq!("person".parse::<EntityType>().unwrap(), EntityType::Person);
  129|      1|        assert_eq!(
  130|      1|            "organization".parse::<EntityType>().unwrap(),
  131|       |            EntityType::Organization
  132|       |        );
  133|      1|        assert_eq!(
  134|      1|            "issue_tracker".parse::<EntityType>().unwrap(),
  135|       |            EntityType::IssueTracker
  136|       |        );
  137|      1|    }
  138|       |
  139|       |    #[test]
  140|      1|    fn from_str_uppercase_is_case_insensitive() {
  141|      1|        assert_eq!("PERSON".parse::<EntityType>().unwrap(), EntityType::Person);
  142|      1|        assert_eq!(
  143|      1|            "Organization".parse::<EntityType>().unwrap(),
  144|       |            EntityType::Organization
  145|       |        );
  146|      1|    }
  147|       |
  148|       |    #[test]
  149|      1|    fn from_str_invalid_returns_err() {
  150|      1|        let result = "invalid".parse::<EntityType>();
  151|      1|        assert!(result.is_err());
  152|      1|        let msg = result.unwrap_err().to_string();
  153|      1|        assert!(msg.contains("invalid entity type"));
  154|      1|    }
  155|       |
  156|       |    #[test]
  157|      1|    fn as_str_returns_canonical_lowercase() {
  158|      1|        assert_eq!(EntityType::Person.as_str(), "person");
  159|      1|        assert_eq!(EntityType::IssueTracker.as_str(), "issue_tracker");
  160|      1|    }
  161|       |
  162|       |    #[test]
  163|      1|    fn serde_json_serializes_as_lowercase_string() {
  164|      1|        let json = serde_json::to_string(&EntityType::Person).unwrap();
  165|      1|        assert_eq!(json, "\"person\"");
  166|      1|        let json = serde_json::to_string(&EntityType::IssueTracker).unwrap();
  167|      1|        assert_eq!(json, "\"issue_tracker\"");
  168|      1|    }
  169|       |
  170|       |    #[test]
  171|      1|    fn serde_json_deserializes_from_lowercase_string() {
  172|      1|        let et: EntityType = serde_json::from_str("\"person\"").unwrap();
  173|      1|        assert_eq!(et, EntityType::Person);
  174|      1|    }
  175|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/errors.rs:
    1|       |//! Library-wide error type.
    2|       |//!
    3|       |//! `AppError` is the single error type returned by every public API in the
    4|       |//! crate. Each variant maps to a deterministic exit code through
    5|       |//! `AppError::exit_code`, which the binary propagates to the shell on
    6|       |//! failure. See the README for the full exit code contract.
    7|       |
    8|       |use crate::i18n::{current, Language};
    9|       |use thiserror::Error;
   10|       |
   11|       |/// Unified error type for all CLI and library operations.
   12|       |///
   13|       |/// Each variant corresponds to a distinct failure category. The
   14|       |/// [`AppError::exit_code`] method converts a variant into a stable numeric
   15|       |/// code so that shell callers and LLM agents can route on it.
   16|       |///
   17|       |/// # SemVer Policy
   18|       |///
   19|       |/// This enum is `#[non_exhaustive]`. New variants may be added in minor
   20|       |/// releases without breaking downstream match arms (use a wildcard `_`).
   21|       |#[derive(Error, Debug)]
   22|       |#[non_exhaustive]
   23|       |pub enum AppError {
   24|       |    /// Input failed schema, length or format validation. Maps to exit code `1`.
   25|       |    ///
   26|       |    /// This variant groups multiple validation failure causes. Callers that need
   27|       |    /// programmatic retry decisions should use [`AppError::is_retryable`] instead
   28|       |    /// of parsing the message string.
   29|       |    #[error("validation error: {0}")]
   30|       |    Validation(String),
   31|       |
   32|       |    /// External binary required for operation was not found in PATH. Maps to exit code `1`.
   33|       |    #[error("binary not found: {name} — ensure it is installed and in PATH")]
   34|       |    BinaryNotFound { name: String },
   35|       |
   36|       |    /// Remote service signaled rate limiting; caller should retry with backoff. Maps to exit code `1`.
   37|       |    #[error("rate limited: {detail}")]
   38|       |    RateLimited { detail: String },
   39|       |
   40|       |    /// Operation exceeded its time budget. Maps to exit code `1`.
   41|       |    #[error("timeout after {duration_secs}s: {operation}")]
   42|       |    Timeout {
   43|       |        operation: String,
   44|       |        duration_secs: u64,
   45|       |    },
   46|       |
   47|       |    /// A memory or entity with the same `(namespace, name)` already exists. Maps to exit code `9`.
   48|       |    #[error("duplicate detected: {0}")]
   49|       |    Duplicate(String),
   50|       |
   51|       |    /// Optimistic update lost the race because `updated_at` changed. Maps to exit code `3`.
   52|       |    #[error("conflict: {0}")]
   53|       |    Conflict(String),
   54|       |
   55|       |    /// The requested record does not exist or was soft-deleted. Maps to exit code `4`.
   56|       |    #[error("not found: {0}")]
   57|       |    NotFound(String),
   58|       |
   59|       |    /// Namespace could not be resolved from flag, environment or markers. Maps to exit code `5`.
   60|       |    #[error("namespace not resolved: {0}")]
   61|       |    NamespaceError(String),
   62|       |
   63|       |    /// Payload exceeded one of the configured body, name or batch limits. Maps to exit code `6`.
   64|       |    #[error("limit exceeded: {0}")]
   65|       |    LimitExceeded(String),
   66|       |
   67|       |    /// Low-level SQLite error propagated from `rusqlite`. Maps to exit code `10`.
   68|       |    #[error("database error: {0}")]
   69|       |    Database(#[from] rusqlite::Error),
   70|       |
   71|       |    /// Embedding generation via `fastembed` failed or produced the wrong shape. Maps to exit code `11`.
   72|       |    #[error("embedding error: {0}")]
   73|       |    Embedding(String),
   74|       |
   75|       |    /// The `sqlite-vec` extension could not load or register its virtual table. Maps to exit code `12`.
   76|       |    #[error("sqlite-vec extension failed: {0}")]
   77|       |    VecExtension(String),
   78|       |
   79|       |    /// SQLite returned `SQLITE_BUSY` after exhausting retries. Maps to exit code `15` (was `13` before v2.0.0; relocated to free `13` for BatchPartialFailure per PRD).
   80|       |    #[error("database busy: {0}")]
   81|       |    DbBusy(String),
   82|       |
   83|       |    /// Batch operation failed partially — N of M items failed. Maps to exit code `13` (PRD 1822).
   84|       |    ///
   85|       |    /// Reserved for use in `import`, `reindex` and batch stdin (BLOCK 3/4). Variant present
   86|       |    /// since v2.0.0 even if call-sites do not yet exist — stable exit code mapping.
   87|       |    #[error("batch partial failure: {failed} of {total} items failed")]
   88|       |    BatchPartialFailure { total: usize, failed: usize },
   89|       |
   90|       |    /// Filesystem I/O error while reading or writing the database or cache. Maps to exit code `14`.
   91|       |    #[error("IO error: {0}")]
   92|       |    Io(#[from] std::io::Error),
   93|       |
   94|       |    /// Unexpected internal error surfaced through `anyhow`. Maps to exit code `20`.
   95|       |    #[error(transparent)]
   96|       |    Internal(#[from] anyhow::Error),
   97|       |
   98|       |    /// JSON serialization or deserialization failure. Maps to exit code `20`.
   99|       |    #[error("json error: {0}")]
  100|       |    Json(#[from] serde_json::Error),
  101|       |
  102|       |    /// Another instance is already running and holds the advisory lock. Maps to exit code `75`.
  103|       |    ///
  104|       |    /// Use `--allow-parallel` to skip the lock or `--wait-lock SECONDS` to retry.
  105|       |    #[error("lock busy: {0}")]
  106|       |    LockBusy(String),
  107|       |
  108|       |    /// All concurrency slots are occupied after the wait timeout. Maps to exit code `75`.
  109|       |    ///
  110|       |    /// Occurs when [`crate::constants::MAX_CONCURRENT_CLI_INSTANCES`] instances are already
  111|       |    /// active and the wait limit [`crate::constants::CLI_LOCK_DEFAULT_WAIT_SECS`] is exhausted.
  112|       |    #[error(
  113|       |        "all {max} concurrency slots occupied after waiting {waited_secs}s (exit 75); \
  114|       |         use --max-concurrency or wait for other invocations to finish"
  115|       |    )]
  116|       |    AllSlotsFull { max: usize, waited_secs: u64 },
  117|       |
  118|       |    /// A heavy long-running job is already running for this job_type/namespace
  119|       |    /// pair. Maps to exit code `75` (the same `EX_TEMPFAIL` code used by the
  120|       |    /// CLI semaphore).
  121|       |    ///
  122|       |    /// G28-B (v1.0.68): ensures at most one `enrich`, `ingest --mode
  123|       |    /// claude-code`, or `ingest --mode codex` runs at a time per namespace.
  124|       |    /// Use `--wait-job-singleton <SECONDS>` (per-command) to poll until the
  125|       |    /// other invocation finishes.
  126|       |    #[error(
  127|       |        "job {job_type} for namespace '{namespace}' is already running (exit 75); \
  128|       |         wait for it to finish or pass --wait-job-singleton <SECONDS>"
  129|       |    )]
  130|       |    JobSingletonLocked { job_type: String, namespace: String },
  131|       |
  132|       |    /// Available memory is below the minimum required to load the model. Maps to exit code `77`.
  133|       |    ///
  134|       |    /// Returned when `sysinfo` reports available memory below
  135|       |    /// [`crate::constants::MIN_AVAILABLE_MEMORY_MB`] MiB before starting the ONNX model load.
  136|       |    #[error(
  137|       |        "available memory ({available_mb}MB) below required minimum ({required_mb}MB) \
  138|       |         to load the model; abort other loads or use --skip-memory-guard (exit 77)"
  139|       |    )]
  140|       |    LowMemory { available_mb: u64, required_mb: u64 },
  141|       |}
  142|       |
  143|       |impl AppError {
  144|       |    /// Returns the deterministic process exit code for this error variant.
  145|       |    ///
  146|       |    /// The codes follow the contract documented in the README: `1` for validation,
  147|       |    /// missing binaries, rate limits and timeouts, `9` for duplicates (moved from `2`
  148|       |    /// in v1.0.52), `3` for conflicts, `4` for missing records, `5` for namespace errors,
  149|       |    /// `6` for limit violations, `10`, `11`, `12` and `14` for infrastructure failures,
  150|       |    /// `13` for BatchPartialFailure (PRD 1822), `15` for DbBusy (migrated from `13` in
  151|       |    /// v2.0.0), `20` for internal and JSON errors, `75` (EX_TEMPFAIL) when the advisory
  152|       |    /// CLI lock, all concurrency slots or a job singleton lock are held, and `77` when
  153|       |    /// available memory is insufficient to load the embedding model.
  154|       |    ///
  155|       |    /// # Examples
  156|       |    ///
  157|       |    /// ```
  158|       |    /// use sqlite_graphrag::errors::AppError;
  159|       |    ///
  160|       |    /// assert_eq!(AppError::Validation("invalid field".into()).exit_code(), 1);
  161|       |    /// assert_eq!(AppError::Duplicate("ns/mem".into()).exit_code(), 9);
  162|       |    /// assert_eq!(AppError::Conflict("ts changed".into()).exit_code(), 3);
  163|       |    /// assert_eq!(AppError::NotFound("id 42".into()).exit_code(), 4);
  164|       |    /// assert_eq!(AppError::NamespaceError("no marker".into()).exit_code(), 5);
  165|       |    /// assert_eq!(AppError::LimitExceeded("body too large".into()).exit_code(), 6);
  166|       |    /// assert_eq!(AppError::Embedding("wrong dim".into()).exit_code(), 11);
  167|       |    /// assert_eq!(AppError::DbBusy("retries exhausted".into()).exit_code(), 15);
  168|       |    /// assert_eq!(AppError::LockBusy("another instance".into()).exit_code(), 75);
  169|       |    /// ```
  170|       |    #[inline]
  171|       |    #[must_use]
  172|     26|    pub fn exit_code(&self) -> i32 {
  173|     26|        match self {
  174|      4|            Self::Validation(_) => 1,
  175|      1|            Self::BinaryNotFound { .. } => 1,
  176|      1|            Self::RateLimited { .. } => 1,
  177|      1|            Self::Timeout { .. } => 1,
  178|      1|            Self::Duplicate(_) => crate::constants::DUPLICATE_EXIT_CODE,
  179|      3|            Self::Conflict(_) => 3,
  180|      1|            Self::NotFound(_) => 4,
  181|      1|            Self::NamespaceError(_) => 5,
  182|      1|            Self::LimitExceeded(_) => 6,
  183|      0|            Self::Database(_) => 10,
  184|      1|            Self::Embedding(_) => 11,
  185|      1|            Self::VecExtension(_) => 12,
  186|      1|            Self::BatchPartialFailure { .. } => crate::constants::BATCH_PARTIAL_FAILURE_EXIT_CODE,
  187|      1|            Self::DbBusy(_) => crate::constants::DB_BUSY_EXIT_CODE,
  188|      2|            Self::Io(_) => 14,
  189|      2|            Self::Internal(_) => 20,
  190|      2|            Self::Json(_) => 20,
  191|      2|            Self::LockBusy(_) => crate::constants::CLI_LOCK_EXIT_CODE,
  192|      0|            Self::AllSlotsFull { .. } => crate::constants::CLI_LOCK_EXIT_CODE,
  193|      0|            Self::JobSingletonLocked { .. } => crate::constants::CLI_LOCK_EXIT_CODE,
  194|      0|            Self::LowMemory { .. } => crate::constants::LOW_MEMORY_EXIT_CODE,
  195|       |        }
  196|     26|    }
  197|       |
  198|       |    /// Returns `true` when the error is transient and the operation may
  199|       |    /// succeed on retry with backoff.
  200|       |    ///
  201|       |    /// # Examples
  202|       |    ///
  203|       |    /// ```
  204|       |    /// use sqlite_graphrag::errors::AppError;
  205|       |    ///
  206|       |    /// assert!(AppError::DbBusy("busy".into()).is_retryable());
  207|       |    /// assert!(AppError::LockBusy("held".into()).is_retryable());
  208|       |    /// assert!(!AppError::NotFound("x".into()).is_retryable());
  209|       |    /// assert!(!AppError::Validation("bad".into()).is_retryable());
  210|       |    /// ```
  211|       |    #[inline]
  212|       |    #[must_use]
  213|     11|    pub fn is_retryable(&self) -> bool {
  214|      5|        matches!(
  215|     11|            self,
  216|       |            Self::DbBusy(_)
  217|       |                | Self::LockBusy(_)
  218|       |                | Self::AllSlotsFull { .. }
  219|       |                | Self::JobSingletonLocked { .. }
  220|       |                | Self::LowMemory { .. }
  221|       |                | Self::RateLimited { .. }
  222|       |                | Self::Timeout { .. }
  223|       |        )
  224|     11|    }
  225|       |
  226|       |    /// Returns `true` when the error is permanent and must NOT be retried.
  227|       |    ///
  228|       |    /// Complement to [`Self::is_retryable`]. Errors not classified by either
  229|       |    /// method (e.g. `Database`, `Io`, `Internal`) are ambiguous — the caller
  230|       |    /// decides based on context.
  231|       |    ///
  232|       |    /// # Examples
  233|       |    ///
  234|       |    /// ```
  235|       |    /// use sqlite_graphrag::errors::AppError;
  236|       |    ///
  237|       |    /// assert!(AppError::Validation("bad".into()).is_permanent());
  238|       |    /// assert!(!AppError::DbBusy("busy".into()).is_permanent());
  239|       |    /// ```
  240|       |    #[inline]
  241|       |    #[must_use]
  242|      0|    pub fn is_permanent(&self) -> bool {
  243|      0|        matches!(
  244|      0|            self,
  245|       |            Self::Validation(_)
  246|       |                | Self::BinaryNotFound { .. }
  247|       |                | Self::Duplicate(_)
  248|       |                | Self::NotFound(_)
  249|       |                | Self::NamespaceError(_)
  250|       |                | Self::LimitExceeded(_)
  251|       |                | Self::VecExtension(_)
  252|       |        )
  253|      0|    }
  254|       |
  255|       |    /// Returns the localized error message in the active language (`--lang` / `SQLITE_GRAPHRAG_LANG`).
  256|       |    ///
  257|       |    /// In English the text is identical to the `Display` generated by thiserror.
  258|       |    /// In Portuguese the prefixes and messages are translated to PT-BR.
  259|      0|    pub fn localized_message(&self) -> String {
  260|      0|        self.localized_message_for(current())
  261|      0|    }
  262|       |
  263|       |    /// Returns the localized message for the explicitly provided language.
  264|       |    /// Useful in tests that cannot depend on the global `OnceLock`.
  265|       |    ///
  266|       |    /// # Examples
  267|       |    ///
  268|       |    /// ```
  269|       |    /// use sqlite_graphrag::errors::AppError;
  270|       |    /// use sqlite_graphrag::i18n::Language;
  271|       |    ///
  272|       |    /// let err = AppError::NotFound("mem-xyz".into());
  273|       |    ///
  274|       |    /// let en = err.localized_message_for(Language::English);
  275|       |    /// assert!(en.contains("not found"));
  276|       |    ///
  277|       |    /// let pt = err.localized_message_for(Language::Portuguese);
  278|       |    /// assert!(pt.contains("n\u{e3}o encontrado"));
  279|       |    /// ```
  280|     21|    pub fn localized_message_for(&self, lang: Language) -> String {
  281|     21|        match lang {
  282|      2|            Language::English => self.to_string(),
  283|     19|            Language::Portuguese => self.to_string_pt(),
  284|       |        }
  285|     21|    }
  286|       |
  287|     19|    fn to_string_pt(&self) -> String {
  288|       |        use crate::i18n::validation::app_error_pt as pt;
  289|     19|        match self {
  290|      1|            Self::Validation(msg) => pt::validation(msg),
  291|      1|            Self::BinaryNotFound { name } => pt::binary_not_found(name),
  292|      1|            Self::RateLimited { detail } => pt::rate_limited(detail),
  293|       |            Self::Timeout {
  294|      1|                operation,
  295|      1|                duration_secs,
  296|      1|            } => pt::timeout(operation, *duration_secs),
  297|      2|            Self::Duplicate(msg) => pt::duplicate(msg),
  298|      1|            Self::Conflict(msg) => pt::conflict(msg),
  299|      3|            Self::NotFound(msg) => pt::not_found(msg),
  300|      1|            Self::NamespaceError(msg) => pt::namespace_error(msg),
  301|      1|            Self::LimitExceeded(msg) => pt::limit_exceeded(msg),
  302|      0|            Self::Database(e) => pt::database(&e.to_string()),
  303|      1|            Self::Embedding(msg) => pt::embedding(msg),
  304|      1|            Self::VecExtension(msg) => pt::vec_extension(msg),
  305|      1|            Self::DbBusy(msg) => pt::db_busy(msg),
  306|      1|            Self::BatchPartialFailure { total, failed } => {
  307|      1|                pt::batch_partial_failure(*total, *failed)
  308|       |            }
  309|      0|            Self::Io(e) => pt::io(&e.to_string()),
  310|      0|            Self::Internal(e) => pt::internal(&e.to_string()),
  311|      0|            Self::Json(e) => pt::json(&e.to_string()),
  312|      1|            Self::LockBusy(msg) => pt::lock_busy(msg),
  313|      1|            Self::AllSlotsFull { max, waited_secs } => pt::all_slots_full(*max, *waited_secs),
  314|       |            Self::JobSingletonLocked {
  315|      0|                job_type,
  316|      0|                namespace,
  317|      0|            } => pt::job_singleton_locked(job_type, namespace),
  318|       |            Self::LowMemory {
  319|      1|                available_mb,
  320|      1|                required_mb,
  321|      1|            } => pt::low_memory(*available_mb, *required_mb),
  322|       |        }
  323|     19|    }
  324|       |}
  325|       |
  326|       |#[cfg(test)]
  327|       |mod tests {
  328|       |    use super::*;
  329|       |    use std::io;
  330|       |
  331|       |    #[test]
  332|      1|    fn exit_code_validation_returns_1() {
  333|      1|        assert_eq!(AppError::Validation("invalid field".into()).exit_code(), 1);
  334|      1|    }
  335|       |
  336|       |    #[test]
  337|      1|    fn exit_code_duplicate_returns_9() {
  338|      1|        assert_eq!(AppError::Duplicate("namespace/name".into()).exit_code(), 9);
  339|      1|    }
  340|       |
  341|       |    #[test]
  342|      1|    fn exit_code_conflict_returns_3() {
  343|      1|        assert_eq!(
  344|      1|            AppError::Conflict("updated_at changed".into()).exit_code(),
  345|       |            3
  346|       |        );
  347|      1|    }
  348|       |
  349|       |    #[test]
  350|      1|    fn exit_code_not_found_returns_4() {
  351|      1|        assert_eq!(AppError::NotFound("memory missing".into()).exit_code(), 4);
  352|      1|    }
  353|       |
  354|       |    #[test]
  355|      1|    fn exit_code_namespace_error_returns_5() {
  356|      1|        assert_eq!(
  357|      1|            AppError::NamespaceError("not resolved".into()).exit_code(),
  358|       |            5
  359|       |        );
  360|      1|    }
  361|       |
  362|       |    #[test]
  363|      1|    fn exit_code_limit_exceeded_returns_6() {
  364|      1|        assert_eq!(
  365|      1|            AppError::LimitExceeded("body too large".into()).exit_code(),
  366|       |            6
  367|       |        );
  368|      1|    }
  369|       |
  370|       |    #[test]
  371|      1|    fn exit_code_embedding_returns_11() {
  372|      1|        assert_eq!(AppError::Embedding("model failure".into()).exit_code(), 11);
  373|      1|    }
  374|       |
  375|       |    #[test]
  376|      1|    fn exit_code_vec_extension_returns_12() {
  377|      1|        assert_eq!(
  378|      1|            AppError::VecExtension("extension did not load".into()).exit_code(),
  379|       |            12
  380|       |        );
  381|      1|    }
  382|       |
  383|       |    #[test]
  384|      1|    fn exit_code_db_busy_returns_15() {
  385|      1|        assert_eq!(AppError::DbBusy("retries exhausted".into()).exit_code(), 15);
  386|      1|    }
  387|       |
  388|       |    #[test]
  389|      1|    fn exit_code_batch_partial_failure_returns_13() {
  390|      1|        assert_eq!(
  391|      1|            AppError::BatchPartialFailure {
  392|      1|                total: 10,
  393|      1|                failed: 3
  394|      1|            }
  395|      1|            .exit_code(),
  396|       |            13
  397|       |        );
  398|      1|    }
  399|       |
  400|       |    #[test]
  401|      1|    fn display_batch_partial_failure_includes_counts() {
  402|      1|        let err = AppError::BatchPartialFailure {
  403|      1|            total: 50,
  404|      1|            failed: 7,
  405|      1|        };
  406|      1|        let msg = err.to_string();
  407|      1|        assert!(msg.contains("7"));
  408|      1|        assert!(msg.contains("50"));
  409|       |        // to_string() uses the English #[error] attr; PT is in localized_message_for
  410|      1|        assert!(msg.contains("batch partial failure"));
  411|      1|    }
  412|       |
  413|       |    #[test]
  414|      1|    fn exit_code_io_returns_14() {
  415|      1|        let io_err = io::Error::new(io::ErrorKind::NotFound, "file missing");
  416|      1|        assert_eq!(AppError::Io(io_err).exit_code(), 14);
  417|      1|    }
  418|       |
  419|       |    #[test]
  420|      1|    fn exit_code_internal_returns_20() {
  421|      1|        let anyhow_err = anyhow::anyhow!("unexpected internal error");
  422|      1|        assert_eq!(AppError::Internal(anyhow_err).exit_code(), 20);
  423|      1|    }
  424|       |
  425|       |    #[test]
  426|      1|    fn exit_code_json_returns_20() {
  427|      1|        let json_err = serde_json::from_str::<serde_json::Value>("invalid json {{").unwrap_err();
  428|      1|        assert_eq!(AppError::Json(json_err).exit_code(), 20);
  429|      1|    }
  430|       |
  431|       |    #[test]
  432|      1|    fn exit_code_lock_busy_returns_75() {
  433|      1|        assert_eq!(
  434|      1|            AppError::LockBusy("another active instance".into()).exit_code(),
  435|       |            75
  436|       |        );
  437|      1|    }
  438|       |
  439|       |    #[test]
  440|      1|    fn display_validation_includes_message() {
  441|      1|        let err = AppError::Validation("invalid id".into());
  442|      1|        assert!(err.to_string().contains("invalid id"));
  443|      1|        assert!(err.to_string().contains("validation error"));
  444|      1|    }
  445|       |
  446|       |    #[test]
  447|      1|    fn display_duplicate_includes_message() {
  448|      1|        let err = AppError::Duplicate("proj/mem".into());
  449|      1|        assert!(err.to_string().contains("proj/mem"));
  450|      1|        assert!(err.to_string().contains("duplicate detected"));
  451|      1|    }
  452|       |
  453|       |    #[test]
  454|      1|    fn display_not_found_includes_message() {
  455|      1|        let err = AppError::NotFound("id 42".into());
  456|      1|        assert!(err.to_string().contains("id 42"));
  457|      1|        assert!(err.to_string().contains("not found"));
  458|      1|    }
  459|       |
  460|       |    #[test]
  461|      1|    fn display_embedding_includes_message() {
  462|      1|        let err = AppError::Embedding("wrong dimension".into());
  463|      1|        assert!(err.to_string().contains("wrong dimension"));
  464|      1|        assert!(err.to_string().contains("embedding error"));
  465|      1|    }
  466|       |
  467|       |    #[test]
  468|      1|    fn display_lock_busy_includes_message() {
  469|      1|        let err = AppError::LockBusy("pid 1234".into());
  470|      1|        assert!(err.to_string().contains("pid 1234"));
  471|      1|        assert!(err.to_string().contains("lock busy"));
  472|      1|    }
  473|       |
  474|       |    #[test]
  475|      1|    fn from_io_error_converts_correctly() {
  476|      1|        let io_err = io::Error::new(io::ErrorKind::PermissionDenied, "permission denied");
  477|      1|        let app_err: AppError = io_err.into();
  478|      1|        assert_eq!(app_err.exit_code(), 14);
  479|      1|        assert!(app_err.to_string().contains("IO error"));
  480|      1|    }
  481|       |
  482|       |    #[test]
  483|      1|    fn from_anyhow_error_converts_correctly() {
  484|      1|        let anyhow_err = anyhow::anyhow!("internal detail");
  485|      1|        let app_err: AppError = anyhow_err.into();
  486|      1|        assert_eq!(app_err.exit_code(), 20);
  487|      1|        assert!(app_err.to_string().contains("internal detail"));
  488|      1|    }
  489|       |
  490|       |    #[test]
  491|      1|    fn from_serde_json_error_converts_correctly() {
  492|      1|        let json_err = serde_json::from_str::<serde_json::Value>("{bad_field}").unwrap_err();
  493|      1|        let app_err: AppError = json_err.into();
  494|      1|        assert_eq!(app_err.exit_code(), 20);
  495|      1|        assert!(app_err.to_string().contains("json error"));
  496|      1|    }
  497|       |
  498|       |    #[test]
  499|      1|    fn exit_code_lock_busy_matches_constant() {
  500|      1|        assert_eq!(
  501|      1|            AppError::LockBusy("test".into()).exit_code(),
  502|       |            crate::constants::CLI_LOCK_EXIT_CODE
  503|       |        );
  504|      1|    }
  505|       |
  506|       |    #[test]
  507|      1|    fn localized_message_en_equals_to_string() {
  508|      1|        let err = AppError::NotFound("mem-x".into());
  509|      1|        assert_eq!(
  510|      1|            err.localized_message_for(crate::i18n::Language::English),
  511|      1|            err.to_string()
  512|       |        );
  513|      1|    }
  514|       |
  515|       |    // Detailed Portuguese-specific assertions live in `src/i18n.rs`
  516|       |    // (the bilingual module). Here we only verify that delegation is wired
  517|       |    // correctly, without embedding PT strings in this English-only file.
  518|       |
  519|       |    #[test]
  520|      1|    fn localized_message_pt_differs_from_en() {
  521|      1|        let err = AppError::NotFound("mem-x".into());
  522|      1|        let en = err.localized_message_for(crate::i18n::Language::English);
  523|      1|        let pt = err.localized_message_for(crate::i18n::Language::Portuguese);
  524|      1|        assert_ne!(en, pt, "PT and EN must produce distinct messages");
                                         ^0
  525|      1|        assert!(pt.contains("mem-x"), "PT must include the variant payload");
                                                    ^0
  526|      1|    }
  527|       |
  528|       |    #[test]
  529|      1|    fn localized_message_pt_delegates_to_app_error_pt_helper() {
  530|       |        use crate::i18n::validation::app_error_pt as pt;
  531|       |
  532|      1|        let cases: Vec<(AppError, String)> = vec![
  533|      1|            (AppError::Validation("x".into()), pt::validation("x")),
  534|      1|            (AppError::Duplicate("x".into()), pt::duplicate("x")),
  535|      1|            (AppError::Conflict("x".into()), pt::conflict("x")),
  536|      1|            (AppError::NotFound("x".into()), pt::not_found("x")),
  537|      1|            (
  538|      1|                AppError::NamespaceError("x".into()),
  539|      1|                pt::namespace_error("x"),
  540|      1|            ),
  541|      1|            (AppError::LimitExceeded("x".into()), pt::limit_exceeded("x")),
  542|      1|            (AppError::Embedding("x".into()), pt::embedding("x")),
  543|      1|            (AppError::VecExtension("x".into()), pt::vec_extension("x")),
  544|      1|            (AppError::DbBusy("x".into()), pt::db_busy("x")),
  545|      1|            (
  546|      1|                AppError::BatchPartialFailure {
  547|      1|                    total: 10,
  548|      1|                    failed: 3,
  549|      1|                },
  550|      1|                pt::batch_partial_failure(10, 3),
  551|      1|            ),
  552|      1|            (AppError::LockBusy("x".into()), pt::lock_busy("x")),
  553|      1|            (
  554|      1|                AppError::AllSlotsFull {
  555|      1|                    max: 4,
  556|      1|                    waited_secs: 60,
  557|      1|                },
  558|      1|                pt::all_slots_full(4, 60),
  559|      1|            ),
  560|      1|            (
  561|      1|                AppError::LowMemory {
  562|      1|                    available_mb: 100,
  563|      1|                    required_mb: 500,
  564|      1|                },
  565|      1|                pt::low_memory(100, 500),
  566|      1|            ),
  567|      1|            (
  568|      1|                AppError::BinaryNotFound {
  569|      1|                    name: "claude".into(),
  570|      1|                },
  571|      1|                pt::binary_not_found("claude"),
  572|      1|            ),
  573|      1|            (
  574|      1|                AppError::RateLimited {
  575|      1|                    detail: "429".into(),
  576|      1|                },
  577|      1|                pt::rate_limited("429"),
  578|      1|            ),
  579|      1|            (
  580|      1|                AppError::Timeout {
  581|      1|                    operation: "op".into(),
  582|      1|                    duration_secs: 30,
  583|      1|                },
  584|      1|                pt::timeout("op", 30),
  585|      1|            ),
  586|       |        ];
  587|       |
  588|     17|        for (err, expected) in cases {
                           ^16  ^16
  589|     16|            let actual = err.localized_message_for(crate::i18n::Language::Portuguese);
  590|     16|            assert_eq!(actual, expected, "delegation mismatch");
                                                       ^0
  591|       |        }
  592|      1|    }
  593|       |
  594|       |    #[test]
  595|      1|    fn is_retryable_transient_errors() {
  596|      1|        assert!(AppError::DbBusy("x".into()).is_retryable());
  597|      1|        assert!(AppError::LockBusy("x".into()).is_retryable());
  598|      1|        assert!(AppError::AllSlotsFull {
  599|      1|            max: 4,
  600|      1|            waited_secs: 60
  601|      1|        }
  602|      1|        .is_retryable());
  603|      1|        assert!(AppError::LowMemory {
  604|      1|            available_mb: 100,
  605|      1|            required_mb: 500
  606|      1|        }
  607|      1|        .is_retryable());
  608|      1|        assert!(AppError::RateLimited {
  609|      1|            detail: "429".into()
  610|      1|        }
  611|      1|        .is_retryable());
  612|      1|        assert!(AppError::Timeout {
  613|      1|            operation: "op".into(),
  614|      1|            duration_secs: 30
  615|      1|        }
  616|      1|        .is_retryable());
  617|      1|    }
  618|       |
  619|       |    #[test]
  620|      1|    fn is_retryable_permanent_errors() {
  621|      1|        assert!(!AppError::Validation("x".into()).is_retryable());
  622|      1|        assert!(!AppError::NotFound("x".into()).is_retryable());
  623|      1|        assert!(!AppError::Duplicate("x".into()).is_retryable());
  624|      1|        assert!(!AppError::Conflict("x".into()).is_retryable());
  625|      1|        assert!(!AppError::BinaryNotFound { name: "x".into() }.is_retryable());
  626|      1|    }
  627|       |
  628|       |    #[test]
  629|      1|    fn exit_code_new_variants() {
  630|      1|        assert_eq!(AppError::BinaryNotFound { name: "x".into() }.exit_code(), 1);
  631|      1|        assert_eq!(AppError::RateLimited { detail: "x".into() }.exit_code(), 1);
  632|      1|        assert_eq!(
  633|      1|            AppError::Timeout {
  634|      1|                operation: "x".into(),
  635|      1|                duration_secs: 5
  636|      1|            }
  637|      1|            .exit_code(),
  638|       |            1
  639|       |        );
  640|      1|    }
  641|       |
  642|       |    #[test]
  643|      1|    fn app_error_size_does_not_exceed_budget() {
  644|      1|        let size = std::mem::size_of::<AppError>();
  645|      1|        assert!(
  646|      1|            size <= 128,
  647|      0|            "AppError is {size} bytes — exceeds 128-byte budget; \
  648|      0|             consider boxing large variants to reduce memcpy cost in Result propagation"
  649|       |        );
  650|      1|    }
  651|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/extraction.rs:
    1|       |//! Entity and URL extraction pipeline (NER + regex prefilter).
    2|       |//!
    3|       |//! Runs named-entity recognition and regex heuristics to extract structured
    4|       |//! entities and hyperlinks from raw memory bodies before embedding.
    5|       |
    6|       |use std::path::{Path, PathBuf};
    7|       |use std::sync::OnceLock;
    8|       |
    9|       |use anyhow::{Context, Result};
   10|       |use ort::session::{builder::GraphOptimizationLevel, Session};
   11|       |use regex::Regex;
   12|       |use serde::{Deserialize, Serialize};
   13|       |use unicode_normalization::UnicodeNormalization;
   14|       |
   15|       |use crate::entity_type::EntityType;
   16|       |use crate::paths::AppPaths;
   17|       |use crate::storage::entities::{NewEntity, NewRelationship};
   18|       |
   19|       |const MAX_ENTS: usize = 30;
   20|       |// v1.0.31 A9: only consumed by the legacy `build_relationships`, which is
   21|       |// kept for unit tests pinning the cap behaviour.
   22|       |#[cfg(test)]
   23|       |const TOP_K_RELATIONS: usize = 5;
   24|       |const DEFAULT_RELATION: &str = "mentions";
   25|       |const MIN_ENTITY_CHARS: usize = 2;
   26|       |
   27|       |static REGEX_EMAIL: OnceLock<Regex> = OnceLock::new();
   28|       |static REGEX_URL: OnceLock<Regex> = OnceLock::new();
   29|       |static REGEX_UUID: OnceLock<Regex> = OnceLock::new();
   30|       |static REGEX_ALL_CAPS: OnceLock<Regex> = OnceLock::new();
   31|       |// v1.0.25 P0-4: filters section-structure markers like "Etapa 3", "Fase 1", "Passo 2".
   32|       |static REGEX_SECTION_MARKER: OnceLock<Regex> = OnceLock::new();
   33|       |// v1.0.25 P0-2: captures CamelCase brand names that NER model often misses (e.g. "OpenAI", "PostgreSQL").
   34|       |static REGEX_BRAND_CAMEL: OnceLock<Regex> = OnceLock::new();
   35|       |
   36|       |// v1.0.20: stopwords to filter common PT-BR/EN rule words captured as ALL_CAPS.
   37|       |// Without this filter, technical PT-BR corpora containing CAPS-formatted rules (NUNCA, PROIBIDO, DEVE)
   38|       |// generated ~70% of "garbage entities". We keep identifiers like MAX_RETRY (with underscore).
   39|       |// v1.0.22: expanded list with terms observed in 495-file flowaiper stress test.
   40|       |// Includes verbs (ADICIONAR, VALIDAR), adjectives (ALTA, BAIXA), common nouns (BANCO, CASO),
   41|       |// HTTP methods (GET, POST, DELETE) and generic data formats (JSON, XML).
   42|       |// v1.0.24: added 17 new terms observed in audit v1.0.23: generic status words (COMPLETED, DONE,
   43|       |// FIXED, PENDING), PT-BR imperative verbs (ACEITE, CONFIRME, NEGUE, RECUSE), PT-BR modal/
   44|       |// common verbs (DEVEMOS, PODEMOS, VAMOS), generic nouns (BORDA, CHECKLIST, PLAN, TOKEN),
   45|       |// and common abbreviations (ACK, ACL).
   46|       |// v1.0.25 P0-4: added technology/protocol acronyms (API, CLI, HTTP, HTTPS, JWT, LLM, REST, UI, URL)
   47|       |// and PT-BR section-label stems (CAPÍTULO, ETAPA, FASE, PASSO, SEÇÃO) to prevent section markers
   48|       |// and generic tech terms from being extracted as entities.
   49|       |// v1.0.31 A11: added PT-BR uppercase noise observed during ingest of technical Portuguese
   50|       |// rule documents — common nouns/adjectives written in caps as visual emphasis (ADAPTER, PROJETO,
   51|       |// PASSIVA, ATIVA, SOMENTE, LEITURA, ESCRITA, OBRIGATORIA, EXEMPLO, REGRA, DEFAULT). Each one
   52|       |// kept leaking as a "concept" entity and inflating the graph with non-entities.
   53|       |const ALL_CAPS_STOPWORDS: &[&str] = &[
   54|       |    "ACEITE",
   55|       |    "ACID",
   56|       |    "ACK",
   57|       |    "ACL",
   58|       |    "ACRESCENTADO",
   59|       |    "ADAPTER",
   60|       |    "ADICIONADA",
   61|       |    "ADICIONADAS",
   62|       |    "ADICIONADO",
   63|       |    "ADICIONADOS",
   64|       |    "ADICIONAR",
   65|       |    "AGENTS",
   66|       |    "AINDA",
   67|       |    "ALL",
   68|       |    "ALTA",
   69|       |    "ALWAYS",
   70|       |    "APENAS",
   71|       |    "API",
   72|       |    "ARTEFATOS",
   73|       |    "ATIVA",
   74|       |    "ATIVO",
   75|       |    "BAIXA",
   76|       |    "BANCO",
   77|       |    "BLOQUEAR",
   78|       |    "BORDA",
   79|       |    "BUG",
   80|       |    "CAPÍTULO",
   81|       |    "CASO",
   82|       |    "CEO",
   83|       |    "CHECKLIST",
   84|       |    "CLARO",
   85|       |    "CLAUDE_STREAM_IDLE_TIMEOUT_MS",
   86|       |    "CLI",
   87|       |    "COMPLETED",
   88|       |    "CONFIRMADO",
   89|       |    "CONFIRMARAM",
   90|       |    "CONFIRME",
   91|       |    "CONFIRMEI",
   92|       |    "CONFIRMOU",
   93|       |    "CONTRATO",
   94|       |    "CRIE",
   95|       |    "CRÍTICO",
   96|       |    "CRITICAL",
   97|       |    "CSV",
   98|       |    "DDL",
   99|       |    "DEFAULT",
  100|       |    "DEFINIR",
  101|       |    "DEPARTMENT",
  102|       |    "DESC",
  103|       |    "DEVE",
  104|       |    "DEVEMOS",
  105|       |    "DISCO",
  106|       |    "DONE",
  107|       |    "DSL",
  108|       |    "DTO",
  109|       |    "EFEITO",
  110|       |    "ENTRADA",
  111|       |    "EOF",
  112|       |    "EPERM",
  113|       |    "ERROR",
  114|       |    "ESCREVA",
  115|       |    "ESCRITA",
  116|       |    "ESRCH",
  117|       |    "ESSA",
  118|       |    "ESSE",
  119|       |    "ESSENCIAL",
  120|       |    "ESTA",
  121|       |    "ESTADO",
  122|       |    "ESTE",
  123|       |    "ETAPA",
  124|       |    "EVITAR",
  125|       |    "EXEMPLO",
  126|       |    "EXPANDIR",
  127|       |    "EXPOR",
  128|       |    "FALHA",
  129|       |    "FASE",
  130|       |    "FATO",
  131|       |    "FIFO",
  132|       |    "FIXED",
  133|       |    "FIXME",
  134|       |    "FLUXO",
  135|       |    "FONTES",
  136|       |    "FORBIDDEN",
  137|       |    "FUNCIONA",
  138|       |    "GNU",
  139|       |    "HACK",
  140|       |    "HEARTBEAT",
  141|       |    "HTTP",
  142|       |    "HTTPS",
  143|       |    "INATIVO",
  144|       |    "JAMAIS",
  145|       |    "JSON",
  146|       |    "JWT",
  147|       |    "LEITURA",
  148|       |    "LLM",
  149|       |    "MCP",
  150|       |    "MESMO",
  151|       |    "METADADOS",
  152|       |    "MUST",
  153|       |    "NDJSON",
  154|       |    "NEGUE",
  155|       |    "NEVER",
  156|       |    "NOTE",
  157|       |    "NUNCA",
  158|       |    "OBRIGATORIA",
  159|       |    "OBRIGATÓRIO",
  160|       |    "OBSERVEI",
  161|       |    "PADRÃO",
  162|       |    "PASSIVA",
  163|       |    "PASSO",
  164|       |    "PENDING",
  165|       |    "PGID",
  166|       |    "PID",
  167|       |    "PLAN",
  168|       |    "PODEMOS",
  169|       |    "PONTEIROS",
  170|       |    "PREFERIR",
  171|       |    "PROIBIDO",
  172|       |    "PROJETO",
  173|       |    "RECUSE",
  174|       |    "REGRA",
  175|       |    "REGRAS",
  176|       |    "REMOVIDAS",
  177|       |    "REQUIRED",
  178|       |    "REQUISITO",
  179|       |    "REST",
  180|       |    "SEÇÃO",
  181|       |    "SEMPRE",
  182|       |    "SHALL",
  183|       |    "SHOULD",
  184|       |    "SIGTERM",
  185|       |    "SOMENTE",
  186|       |    "SOUL",
  187|       |    "TODAS",
  188|       |    "TODO",
  189|       |    "TODOS",
  190|       |    "TOKEN",
  191|       |    "TOOLS",
  192|       |    "TSV",
  193|       |    "TUI",
  194|       |    "UI",
  195|       |    "URL",
  196|       |    "USAR",
  197|       |    "VALIDAR",
  198|       |    "VAMOS",
  199|       |    "VOCÊ",
  200|       |    "WARNING",
  201|       |    "XML",
  202|       |    "YAML",
  203|       |];
  204|       |
  205|       |// v1.0.22: HTTP methods are protocol verbs, not semantically useful entities.
  206|       |// Filtered in apply_regex_prefilter (regex_all_caps path).
  207|       |const HTTP_METHODS: &[&str] = &[
  208|       |    "GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS", "CONNECT", "TRACE",
  209|       |];
  210|       |
  211|     51|fn is_filtered_all_caps(token: &str) -> bool {
  212|       |    // Identifiers containing underscore are preserved (e.g. MAX_RETRY, FLOWAIPER_API_KEY)
  213|     51|    let is_identifier = token.contains('_');
  214|     51|    if is_identifier {
  215|      8|        return false;
  216|     43|    }
  217|     43|    ALL_CAPS_STOPWORDS.contains(&token) || HTTP_METHODS.contains(&token)
                                                         ^6           ^6       ^6
  218|     51|}
  219|       |
  220|     21|fn regex_email() -> &'static Regex {
  221|       |    // SAFETY: regex literal validated at compile-time via test::regex_literals_compile
  222|     21|    REGEX_EMAIL.get_or_init(|| {
                                             ^1
  223|      1|        Regex::new(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}")
  224|      1|            .expect("compile-time validated email regex literal")
  225|      1|    })
  226|     21|}
  227|       |
  228|      8|fn regex_url() -> &'static Regex {
  229|       |    // SAFETY: regex literal validated at compile-time via test::regex_literals_compile
  230|      8|    REGEX_URL.get_or_init(|| {
                                           ^1
  231|      1|        Regex::new(r#"https?://[^\s\)\]\}"'<>]+"#)
  232|      1|            .expect("compile-time validated URL regex literal")
  233|      1|    })
  234|      8|}
  235|       |
  236|     21|fn regex_uuid() -> &'static Regex {
  237|       |    // SAFETY: regex literal validated at compile-time via test::regex_literals_compile
  238|     21|    REGEX_UUID.get_or_init(|| {
                                            ^1
  239|      1|        Regex::new(r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}")
  240|      1|            .expect("compile-time validated UUID regex literal")
  241|      1|    })
  242|     21|}
  243|       |
  244|     21|fn regex_all_caps() -> &'static Regex {
  245|     21|    REGEX_ALL_CAPS.get_or_init(|| {
                                                ^1
  246|      1|        Regex::new(r"\b[A-Z][A-Z0-9_]{2,}\b")
  247|      1|            .expect("compile-time validated all-caps regex literal")
  248|      1|    })
  249|     21|}
  250|       |
  251|     25|fn regex_section_marker() -> &'static Regex {
  252|     25|    REGEX_SECTION_MARKER.get_or_init(|| {
                                                      ^1
  253|       |        // Matches PT-BR document-structure labels followed by a number: "Etapa 3", "Fase 1",
  254|       |        // "Camada 5", "Passo 2", etc. v1.0.36 (H5): added "Camada" after audit found
  255|       |        // "Camada 1".."Camada 5" leaking through into entity extraction with degree>=3.
  256|       |        // Accented characters expressed as escapes to keep this source file ASCII-only
  257|       |        // per the project language policy. Pattern is equivalent to:
  258|       |        //   \b(?:Etapa|Fase|Passo|Camada|Se\xe7\xe3o|Cap\xedtulo)\s+\d+\b
  259|      1|        Regex::new("\\b(?:Etapa|Fase|Passo|Camada|Se\u{00e7}\u{00e3}o|Cap\u{00ed}tulo)\\s+\\d+\\b")
  260|      1|            .expect("compile-time validated section marker regex literal")
  261|      1|    })
  262|     25|}
  263|       |
  264|     21|fn regex_brand_camel() -> &'static Regex {
  265|     21|    REGEX_BRAND_CAMEL.get_or_init(|| {
                                                   ^1
  266|       |        // Matches CamelCase brand names: one or more lowercase letters after an uppercase, then
  267|       |        // another uppercase followed by more letters. Covers "OpenAI", "PostgreSQL", "ChatGPT".
  268|      1|        Regex::new(r"\b[A-Z][a-z]+[A-Z][A-Za-z]+\b")
  269|      1|            .expect("compile-time validated CamelCase brand regex literal")
  270|      1|    })
  271|     21|}
  272|       |
  273|       |#[derive(Debug, Clone, PartialEq)]
  274|       |pub struct ExtractedEntity {
  275|       |    pub name: String,
  276|       |    pub entity_type: EntityType,
  277|       |}
  278|       |
  279|       |/// URL with source offset extracted from the memory body.
  280|       |#[derive(Debug, Clone)]
  281|       |pub struct ExtractedUrl {
  282|       |    pub url: String,
  283|       |    /// Byte position in the body where the URL was found.
  284|       |    pub offset: usize,
  285|       |}
  286|       |
  287|       |#[derive(Debug, Clone)]
  288|       |pub struct ExtractionResult {
  289|       |    pub entities: Vec<NewEntity>,
  290|       |    pub relationships: Vec<NewRelationship>,
  291|       |    /// True when build_relationships hit the cap before covering all entity pairs.
  292|       |    /// Exposed in RememberResponse so callers can detect when relationships were cut.
  293|       |    pub relationships_truncated: bool,
  294|       |    /// Extraction method used: `"gliner-<variant>+regex"` or `"regex-only"`.
  295|       |    /// Useful for auditing, metrics and user reports.
  296|       |    pub extraction_method: String,
  297|       |    /// URLs extracted from the body — stored separately from graph entities.
  298|       |    pub urls: Vec<ExtractedUrl>,
  299|       |}
  300|       |
  301|       |pub trait Extractor: Send + Sync {
  302|       |    fn extract(&self, body: &str) -> Result<ExtractionResult>;
  303|       |}
  304|       |
  305|       |/// GLiNER ONNX model quantization variant.
  306|       |#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
  307|       |pub enum GlinerVariant {
  308|       |    Fp32,
  309|       |    Fp16,
  310|       |    Int8,
  311|       |    Q4,
  312|       |    Q4f16,
  313|       |}
  314|       |
  315|       |impl GlinerVariant {
  316|       |    /// ONNX filename for this variant in the HuggingFace repository.
  317|      7|    pub fn as_filename(self) -> &'static str {
  318|      7|        match self {
  319|      1|            Self::Fp32 => "model.onnx",
  320|      1|            Self::Fp16 => "model_fp16.onnx",
  321|      3|            Self::Int8 => "model_quantized.onnx",
  322|      1|            Self::Q4 => "model_q4.onnx",
  323|      1|            Self::Q4f16 => "model_q4f16.onnx",
  324|       |        }
  325|      7|    }
  326|       |
  327|       |    /// Approximate model size for user-facing messages.
  328|      2|    pub fn display_size(self) -> &'static str {
  329|      2|        match self {
  330|      1|            Self::Fp32 => "1.1 GB",
  331|      0|            Self::Fp16 => "580 MB",
  332|      1|            Self::Int8 => "349 MB",
  333|      0|            Self::Q4 => "894 MB",
  334|      0|            Self::Q4f16 => "472 MB",
  335|       |        }
  336|      2|    }
  337|       |}
  338|       |
  339|       |impl std::fmt::Display for GlinerVariant {
  340|     14|    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
  341|     14|        match self {
  342|      3|            Self::Fp32 => f.write_str("fp32"),
  343|      2|            Self::Fp16 => f.write_str("fp16"),
  344|      5|            Self::Int8 => f.write_str("int8"),
  345|      2|            Self::Q4 => f.write_str("q4"),
  346|      2|            Self::Q4f16 => f.write_str("q4f16"),
  347|       |        }
  348|     14|    }
  349|       |}
  350|       |
  351|       |impl std::str::FromStr for GlinerVariant {
  352|       |    type Err = anyhow::Error;
  353|     15|    fn from_str(s: &str) -> Result<Self> {
  354|     15|        match s.to_lowercase().as_str() {
  355|     15|            "fp32" => Ok(Self::Fp32),
                                    ^3
  356|     12|            "fp16" => Ok(Self::Fp16),
                                    ^2
  357|     10|            "int8" => Ok(Self::Int8),
                                    ^3
  358|      7|            "q4" => Ok(Self::Q4),
                                  ^2
  359|      5|            "q4f16" => Ok(Self::Q4f16),
                                     ^2
  360|      3|            other => {
  361|      3|                anyhow::bail!("unknown GLiNER variant: {other}. Valid: fp32, fp16, int8, q4, q4f16")
  362|       |            }
  363|       |        }
  364|     15|    }
  365|       |}
  366|       |
  367|       |const GLINER_MAX_WIDTH: usize = 12;
  368|       |const GLINER_MAX_SEQ_LEN: usize = 384;
  369|       |const GLINER_ENT_TOKEN: &str = "<<ENT>>";
  370|       |const GLINER_SEP_TOKEN: &str = "<<SEP>>";
  371|       |
  372|       |const GLINER_ENTITY_LABELS: &[(&str, EntityType)] = &[
  373|       |    ("person", EntityType::Person),
  374|       |    ("organization", EntityType::Organization),
  375|       |    ("location", EntityType::Location),
  376|       |    ("date", EntityType::Date),
  377|       |    ("project", EntityType::Project),
  378|       |    ("tool", EntityType::Tool),
  379|       |    ("file", EntityType::File),
  380|       |    ("concept", EntityType::Concept),
  381|       |    ("decision", EntityType::Decision),
  382|       |    ("incident", EntityType::Incident),
  383|       |    ("dashboard", EntityType::Dashboard),
  384|       |    ("issue tracker", EntityType::IssueTracker),
  385|       |    ("memory", EntityType::Memory),
  386|       |];
  387|       |
  388|       |struct GlinerModel {
  389|       |    session: parking_lot::Mutex<Session>,
  390|       |    tokenizer: tokenizers::Tokenizer,
  391|       |    #[allow(dead_code)]
  392|       |    variant: GlinerVariant,
  393|       |}
  394|       |
  395|       |impl GlinerModel {
  396|      1|    fn load(model_dir: &Path, variant: GlinerVariant) -> Result<Self> {
  397|      1|        let model_path = model_dir.join(variant.as_filename());
  398|      1|        let tokenizer_path = model_dir.join("tokenizer.json");
  399|       |
  400|      1|        let session = Session::builder()
  401|      1|            .map_err(|e| anyhow::anyhow!("creating GLiNER session builder: {e}"))?
                                                       ^0                                      ^0
  402|      1|            .with_optimization_level(GraphOptimizationLevel::Level3)
  403|      1|            .map_err(|e| anyhow::anyhow!("setting optimization level: {e}"))?
                                                       ^0                                 ^0
  404|      1|            .commit_from_file(&model_path)
  405|      1|            .map_err(|e| anyhow::anyhow!("loading GLiNER ONNX model from {model_path:?}: {e}"))?;
                                                       ^0                                                    ^0
  406|       |
  407|      1|        let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path)
  408|      1|            .map_err(|e| anyhow::anyhow!("loading GLiNER tokenizer: {e}"))?;
                                                       ^0                               ^0
  409|       |
  410|      1|        Ok(Self {
  411|      1|            session: parking_lot::Mutex::new(session),
  412|      1|            tokenizer,
  413|      1|            variant,
  414|      1|        })
  415|      1|    }
  416|       |
  417|      3|    fn predict(
  418|      3|        &self,
  419|      3|        body: &str,
  420|      3|        entity_labels: &[(&str, EntityType)],
  421|      3|        threshold: f32,
  422|      3|    ) -> Result<Vec<ExtractedEntity>> {
  423|      3|        let label_names: Vec<&str> = entity_labels.iter().map(|(name, _)| *name).collect();
  424|      3|        let words: Vec<&str> = body.split_whitespace().collect();
  425|      3|        if words.is_empty() {
  426|      0|            return Ok(Vec::new());
  427|      3|        }
  428|       |
  429|       |        // Cap words to fit within model sequence length (accounting for label tokens)
  430|      3|        let label_token_count = label_names.len() * 2 + 1;
  431|      3|        let max_words = GLINER_MAX_SEQ_LEN.saturating_sub(label_token_count + 2);
  432|      3|        let words = if words.len() > max_words {
  433|      1|            tracing::warn!(target: "extraction",
  434|      0|                original_words = words.len(),
  435|       |                capped_words = max_words,
  436|      0|                "GLiNER input truncated to fit model sequence length"
  437|       |            );
  438|      1|            &words[..max_words]
  439|       |        } else {
  440|      2|            &words[..]
  441|       |        };
  442|      3|        let num_words = words.len();
  443|       |
  444|       |        // Build prompt: [<<ENT>>, label1, <<ENT>>, label2, ..., <<SEP>>, word1, word2, ...]
  445|      3|        let prompt_cap = label_names.len() * 2 + 1 + num_words;
  446|      3|        let mut prompt_tokens: Vec<String> = Vec::new();
  447|      3|        prompt_tokens.try_reserve(prompt_cap).map_err(|_| {
                                                                        ^0
  448|      0|            anyhow::anyhow!(
  449|      0|                "allocation of {prompt_cap} prompt tokens would exceed available memory"
  450|       |            )
  451|      0|        })?;
  452|     42|        for label in &label_names {
                          ^39
  453|     39|            prompt_tokens.push(GLINER_ENT_TOKEN.to_string());
  454|     39|            prompt_tokens.push((*label).to_string());
  455|     39|        }
  456|      3|        prompt_tokens.push(GLINER_SEP_TOKEN.to_string());
  457|    367|        for word in words {
                          ^364
  458|    364|            prompt_tokens.push((*word).to_string());
  459|    364|        }
  460|       |
  461|       |        // Encode each token individually (word-by-word encoding per GLiNER protocol)
  462|      3|        let seq_estimate = prompt_tokens.len() * 3;
  463|      3|        let mut all_ids: Vec<i64> = Vec::new();
  464|      3|        all_ids.try_reserve(seq_estimate).map_err(|_| {
                                                                    ^0
  465|      0|            anyhow::anyhow!("allocation of {seq_estimate} token IDs would exceed available memory")
  466|      0|        })?;
  467|      3|        let mut all_attention: Vec<i64> = Vec::new();
  468|      3|        all_attention.try_reserve(seq_estimate).map_err(|_| {
                                                                          ^0
  469|      0|            anyhow::anyhow!(
  470|      0|                "allocation of {seq_estimate} attention masks would exceed available memory"
  471|       |            )
  472|      0|        })?;
  473|      3|        let mut all_word_mask: Vec<i64> = Vec::new();
  474|      3|        all_word_mask.try_reserve(seq_estimate).map_err(|_| {
                                                                          ^0
  475|      0|            anyhow::anyhow!("allocation of {seq_estimate} word masks would exceed available memory")
  476|      0|        })?;
  477|       |
  478|       |        // BOS token
  479|      3|        all_ids.push(1);
  480|      3|        all_attention.push(1);
  481|      3|        all_word_mask.push(0);
  482|       |
  483|      3|        let text_offset = label_names.len() * 2 + 1;
  484|      3|        let mut word_id: i64 = 0;
  485|       |
  486|    445|        for (pos, token_str) in prompt_tokens.iter().enumerate() {
                                              ^3                   ^3
  487|    445|            let encoding = self
  488|    445|                .tokenizer
  489|    445|                .encode(token_str.as_str(), false)
  490|    445|                .map_err(|e| anyhow::anyhow!("GLiNER tokenizer encode error: {e}"))?;
                                                           ^0                                    ^0
  491|    445|            let ids = encoding.get_ids();
  492|    445|            let is_text_token = pos >= text_offset;
  493|       |
  494|    827|            for (sub_idx, &id) in ids.iter().enumerate() {
                                                ^445^445   ^445
  495|    827|                all_ids.push(id as i64);
  496|    827|                all_attention.push(1);
  497|    827|                if is_text_token && sub_idx == 0 {
                                                  ^737
  498|    364|                    word_id += 1;
  499|    364|                    all_word_mask.push(word_id);
  500|    463|                } else {
  501|    463|                    all_word_mask.push(0);
  502|    463|                }
  503|       |            }
  504|       |        }
  505|       |
  506|       |        // EOS token
  507|      3|        all_ids.push(2);
  508|      3|        all_attention.push(1);
  509|      3|        all_word_mask.push(0);
  510|       |
  511|      3|        let seq_len = all_ids.len();
  512|       |
  513|       |        // Build ORT tensors using Tensor::from_array((shape, data)) API
  514|      3|        let t_input_ids = ort::value::Tensor::<i64>::from_array(([1usize, seq_len], all_ids))
  515|      3|            .map_err(|e| anyhow::anyhow!("building input_ids tensor: {e}"))?;
                                                       ^0                                ^0
  516|      3|        let t_attention = ort::value::Tensor::<i64>::from_array(([1usize, seq_len], all_attention))
  517|      3|            .map_err(|e| anyhow::anyhow!("building attention_mask tensor: {e}"))?;
                                                       ^0                                     ^0
  518|      3|        let t_words_mask =
  519|      3|            ort::value::Tensor::<i64>::from_array(([1usize, seq_len], all_word_mask))
  520|      3|                .map_err(|e| anyhow::anyhow!("building words_mask tensor: {e}"))?;
                                                           ^0                                 ^0
  521|      3|        let t_text_lengths =
  522|      3|            ort::value::Tensor::<i64>::from_array(([1usize, 1usize], vec![num_words as i64]))
  523|      3|                .map_err(|e| anyhow::anyhow!("building text_lengths tensor: {e}"))?;
                                                           ^0                                   ^0
  524|       |
  525|       |        // Build span tensors
  526|      3|        let num_spans = num_words * GLINER_MAX_WIDTH;
  527|      3|        let mut span_idx_data = vec![0i64; num_spans * 2];
  528|      3|        let mut span_mask_data = vec![false; num_spans];
  529|       |
  530|    364|        for start in 0..num_words {
                                      ^3
  531|    364|            let remaining = num_words - start;
  532|    364|            let actual_max_width = GLINER_MAX_WIDTH.min(remaining);
  533|  4.21k|            for width in 0..actual_max_width {
                                          ^364
  534|  4.21k|                let dim = start * GLINER_MAX_WIDTH + width;
  535|  4.21k|                span_idx_data[dim * 2] = start as i64;
  536|  4.21k|                span_idx_data[dim * 2 + 1] = (start + width) as i64;
  537|  4.21k|                span_mask_data[dim] = true;
  538|  4.21k|            }
  539|       |        }
  540|       |
  541|      3|        let t_span_idx =
  542|      3|            ort::value::Tensor::<i64>::from_array(([1usize, num_spans, 2usize], span_idx_data))
  543|      3|                .map_err(|e| anyhow::anyhow!("building span_idx tensor: {e}"))?;
                                                           ^0                               ^0
  544|      3|        let t_span_mask =
  545|      3|            ort::value::Tensor::<bool>::from_array(([1usize, num_spans], span_mask_data))
  546|      3|                .map_err(|e| anyhow::anyhow!("building span_mask tensor: {e}"))?;
                                                           ^0                                ^0
  547|       |
  548|       |        // Run inference — Session::run requires &mut Session; bind guard first.
  549|      3|        let mut session_guard = self.session.lock();
  550|      3|        let outputs = session_guard
  551|      3|            .run(ort::inputs![
  552|      3|                "input_ids" => t_input_ids,
  553|      3|                "attention_mask" => t_attention,
  554|      3|                "words_mask" => t_words_mask,
  555|      3|                "text_lengths" => t_text_lengths,
  556|      3|                "span_idx" => t_span_idx,
  557|      3|                "span_mask" => t_span_mask
  558|      3|            ])
  559|      3|            .map_err(|e| anyhow::anyhow!("GLiNER inference forward pass: {e}"))?;
                                                       ^0                                    ^0
  560|       |
  561|       |        // Extract logits: [1, num_words, max_width, num_classes]
  562|       |        // try_extract_tensor returns (&Shape, &[f32]); index manually.
  563|      3|        let (logits_shape, logits_data) = outputs["logits"]
  564|      3|            .try_extract_tensor::<f32>()
  565|      3|            .map_err(|e| anyhow::anyhow!("extracting logits tensor: {e}"))?;
                                                       ^0                               ^0
  566|       |
  567|      3|        let num_classes = label_names.len();
  568|       |        // Expected shape: [1, num_words, GLINER_MAX_WIDTH, num_classes]
  569|       |        // Shape derefs to &[i64] so we can index directly.
  570|      3|        let max_width = logits_shape
  571|      3|            .get(2)
  572|      3|            .copied()
  573|      3|            .unwrap_or(GLINER_MAX_WIDTH as i64) as usize;
  574|      3|        let nc = logits_shape.get(3).copied().unwrap_or(num_classes as i64) as usize;
  575|       |
  576|      3|        let candidates_cap = num_words * max_width;
  577|      3|        let mut candidates: Vec<(usize, usize, usize, f32)> = Vec::new();
  578|      3|        candidates.try_reserve(candidates_cap).map_err(|_| {
                                                                         ^0
  579|      0|            anyhow::anyhow!(
  580|      0|                "allocation of {candidates_cap} candidates would exceed available memory"
  581|       |            )
  582|      0|        })?;
  583|       |
  584|    364|        for start in 0..num_words {
                                      ^3
  585|  4.23k|            for width in 0..max_width {
                                          ^364
  586|  4.23k|                let end = start + width;
  587|  4.23k|                if end >= num_words {
  588|     20|                    break;
  589|  4.21k|                }
  590|  54.8k|                for class_idx in 0..nc.min(num_classes) {
                                                  ^4.21k^4.21k^4.21k
  591|       |                    // flat index: batch=0 * (num_words*max_width*nc) + start*(max_width*nc) + width*nc + class_idx
  592|  54.8k|                    let flat = start * (max_width * nc) + width * nc + class_idx;
  593|  54.8k|                    if flat >= logits_data.len() {
  594|      0|                        break;
  595|  54.8k|                    }
  596|  54.8k|                    let raw = logits_data[flat];
  597|  54.8k|                    let score = 1.0 / (1.0 + (-raw).exp());
  598|  54.8k|                    if score >= threshold {
  599|      0|                        candidates.push((start, end, class_idx, score));
  600|  54.8k|                    }
  601|       |                }
  602|       |            }
  603|       |        }
  604|       |
  605|       |        // Sort by score descending for greedy NMS
  606|      3|        candidates.sort_by(|a, b| b.3.partial_cmp(&a.3).unwrap_or(std::cmp::Ordering::Equal));
                                                ^0  ^0          ^0    ^0        ^0
  607|       |
  608|       |        // Greedy non-maximum suppression
  609|      3|        let mut used = vec![false; num_words];
  610|      3|        let mut entities: Vec<ExtractedEntity> = Vec::with_capacity(candidates.len().min(MAX_ENTS));
  611|       |
  612|      3|        for (start, end, class_idx, _score) in &candidates {
                           ^0     ^0   ^0         ^0
  613|      0|            let overlap = (*start..=*end).any(|i| used[i]);
  614|      0|            if overlap {
  615|      0|                continue;
  616|      0|            }
  617|      0|            for flag in used.iter_mut().take(*end + 1).skip(*start) {
  618|      0|                *flag = true;
  619|      0|            }
  620|      0|            let text = words[*start..=*end].join(" ");
  621|      0|            if text.len() < MIN_ENTITY_CHARS {
  622|      0|                continue;
  623|      0|            }
  624|      0|            let entity_type = entity_labels[*class_idx].1;
  625|      0|            entities.push(ExtractedEntity {
  626|      0|                name: text,
  627|      0|                entity_type,
  628|      0|            });
  629|      0|            if entities.len() >= MAX_ENTS {
  630|      0|                break;
  631|      0|            }
  632|       |        }
  633|       |
  634|      3|        Ok(entities)
  635|      3|    }
  636|       |}
  637|       |
  638|       |static GLINER_MODEL: OnceLock<Option<GlinerModel>> = OnceLock::new();
  639|       |
  640|      1|fn gliner_model_dir(paths: &AppPaths, variant: GlinerVariant) -> PathBuf {
  641|      1|    paths.models.join(format!("gliner-multi-v2.1/{variant}"))
  642|      1|}
  643|       |
  644|      1|fn ensure_gliner_model_files(paths: &AppPaths, variant: GlinerVariant) -> Result<PathBuf> {
  645|      1|    let dir = gliner_model_dir(paths, variant);
  646|      1|    std::fs::create_dir_all(&dir)
  647|      1|        .with_context(|| format!("creating GLiNER model directory: {dir:?}"))?;
                                               ^0                                          ^0
  648|       |
  649|      1|    let model_file = dir.join(variant.as_filename());
  650|      1|    let tokenizer_file = dir.join("tokenizer.json");
  651|       |
  652|      1|    if model_file.exists() && tokenizer_file.exists() {
  653|      1|        return Ok(dir);
  654|      0|    }
  655|       |
  656|      0|    let repo = crate::constants::gliner_model_repo();
  657|      0|    tracing::info!(target: "extraction",
  658|      0|        "Downloading GLiNER model ({variant}, ~{})...",
  659|      0|        variant.display_size()
  660|       |    );
  661|      0|    crate::output::emit_progress_i18n(
  662|      0|        &format!(
  663|      0|            "Downloading GLiNER model ({variant}, ~{})...",
  664|      0|            variant.display_size()
  665|      0|        ),
  666|      0|        &format!(
  667|      0|            "Baixando modelo GLiNER ({variant}, ~{})...",
  668|      0|            variant.display_size()
  669|      0|        ),
  670|       |    );
  671|       |
  672|      0|    let api = huggingface_hub::api::sync::Api::new().with_context(|| "creating HF Hub client")?;
  673|      0|    let hf_repo = api.model(repo);
  674|       |
  675|      0|    let remote_model = format!("onnx/{}", variant.as_filename());
  676|      0|    if !model_file.exists() {
  677|      0|        let src = hf_repo
  678|      0|            .get(&remote_model)
  679|      0|            .with_context(|| format!("downloading {remote_model} from HF Hub"))?;
  680|      0|        std::fs::copy(&src, &model_file)
  681|      0|            .with_context(|| format!("copying {} to cache", variant.as_filename()))?;
  682|      0|    }
  683|       |
  684|      0|    if !tokenizer_file.exists() {
  685|      0|        let src = hf_repo
  686|      0|            .get("tokenizer.json")
  687|      0|            .with_context(|| "downloading tokenizer.json from HF Hub")?;
  688|      0|        std::fs::copy(&src, &tokenizer_file).with_context(|| "copying tokenizer.json to cache")?;
  689|      0|    }
  690|       |
  691|      0|    Ok(dir)
  692|      1|}
  693|       |
  694|      1|fn load_gliner_model(paths: &AppPaths, variant: GlinerVariant) -> Result<GlinerModel> {
  695|      1|    let dir = ensure_gliner_model_files(paths, variant)?;
                                                                     ^0
  696|      1|    GlinerModel::load(&dir, variant)
  697|      1|}
  698|       |
  699|      3|fn get_or_init_gliner(paths: &AppPaths, variant: GlinerVariant) -> Option<&'static GlinerModel> {
  700|      3|    GLINER_MODEL
  701|      3|        .get_or_init(|| match load_gliner_model(paths, variant) {
                                            ^1
  702|      1|            Ok(m) => Some(m),
  703|      0|            Err(e) => {
  704|      0|                tracing::warn!(target: "extraction", error = %e, "GLiNER model unavailable, graceful degradation");
  705|      0|                None
  706|       |            }
  707|      1|        })
  708|      3|        .as_ref()
  709|      3|}
  710|       |
  711|     21|fn apply_regex_prefilter(body: &str) -> Vec<ExtractedEntity> {
  712|     21|    let mut entities = Vec::with_capacity(16);
  713|     21|    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::with_capacity(32);
  714|       |
  715|     21|    let add = |entities: &mut Vec<ExtractedEntity>,
  716|       |               seen: &mut std::collections::HashSet<String>,
  717|       |               name: &str,
  718|     24|               entity_type: EntityType| {
  719|     24|        let name = name.trim().to_string();
  720|     24|        if name.len() >= MIN_ENTITY_CHARS && seen.insert(name.clone()) {
  721|     24|            entities.push(ExtractedEntity { name, entity_type });
  722|     24|        }
                      ^0
  723|     24|    };
  724|       |
  725|       |    // v1.0.25 P0-4: strip section-structure markers before any other processing so that
  726|       |    // "Etapa 3", "Fase 1", "Passo 2" are not fed to downstream regex passes.
  727|     21|    let cleaned = regex_section_marker().replace_all(body, " ");
  728|     21|    let cleaned = cleaned.as_ref();
  729|       |
  730|     21|    for m in regex_email().find_iter(cleaned) {
                      ^5
  731|      5|        // v1.0.20: email is "concept" (regex alone cannot distinguish person from mailing list/role).
  732|      5|        add(&mut entities, &mut seen, m.as_str(), EntityType::Concept);
  733|      5|    }
  734|     21|    for m in regex_uuid().find_iter(cleaned) {
                      ^1
  735|      1|        add(&mut entities, &mut seen, m.as_str(), EntityType::Concept);
  736|      1|    }
  737|     51|    for m in regex_all_caps().find_iter(cleaned) {
                           ^21              ^21       ^21
  738|     51|        let candidate = m.as_str();
  739|       |        // v1.0.22: filtro consolidado (stopwords + HTTP methods); preserva identificadores com underscore.
  740|     51|        if !is_filtered_all_caps(candidate) {
  741|     14|            add(&mut entities, &mut seen, candidate, EntityType::Concept);
  742|     37|        }
  743|       |    }
  744|       |    // v1.0.25 P0-2: capture CamelCase brand names that NER model often misses.
  745|       |    // Maps to "organization" (V008 schema) because brand names are typically organisations.
  746|     21|    for m in regex_brand_camel().find_iter(cleaned) {
                      ^4
  747|      4|        let name = m.as_str();
  748|       |        // Skip if the uppercased form is a known stopword (e.g. "JsonSchema" → "JSONSCHEMA").
  749|      4|        if !ALL_CAPS_STOPWORDS.contains(&name.to_uppercase().as_str()) {
  750|      4|            add(&mut entities, &mut seen, name, EntityType::Organization);
  751|      4|        }
                      ^0
  752|       |    }
  753|       |
  754|     21|    entities
  755|     21|}
  756|       |
  757|       |/// Extracts URLs from a memory body, deduplicated by text.
  758|       |/// URLs are stored in the `memory_urls` table separately from graph entities.
  759|       |/// v1.0.24: split of the URL block that polluted apply_regex_prefilter with entity_type='concept'.
  760|      8|pub fn extract_urls(body: &str) -> Vec<ExtractedUrl> {
  761|      8|    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::with_capacity(8);
  762|      8|    let mut result = Vec::with_capacity(4);
  763|      8|    for m in regex_url().find_iter(body) {
                      ^4
  764|      4|        let raw = m.as_str();
  765|      4|        let cleaned = raw
  766|      4|            .trim_end_matches('`')
  767|      4|            .trim_end_matches(',')
  768|      4|            .trim_end_matches('.')
  769|      4|            .trim_end_matches(';')
  770|      4|            .trim_end_matches(')')
  771|      4|            .trim_end_matches(']')
  772|      4|            .trim_end_matches('}');
  773|      4|        if cleaned.len() >= 10 && seen.insert(cleaned.to_string()) {
  774|      3|            result.push(ExtractedUrl {
  775|      3|                url: cleaned.to_string(),
  776|      3|                offset: m.start(),
  777|      3|            });
  778|      3|        }
                      ^1
  779|       |    }
  780|      8|    result
  781|      8|}
  782|       |
  783|       |/// Returns (relationships, truncated) where truncated is true when the cap was hit
  784|       |/// before all entity pairs were covered. Exposed in RememberResponse as
  785|       |/// `relationships_truncated` so callers can decide whether to increase the cap.
  786|       |///
  787|       |/// v1.0.31 A9: superseded by `build_relationships_by_sentence_cooccurrence` for
  788|       |/// the auto-extraction pipeline because the legacy pairwise scheme produces a
  789|       |/// dense C(N,2) graph polluted with co-mentions across unrelated paragraphs.
  790|       |/// Kept for unit tests that pin the cap behaviour and for callers that lack a
  791|       |/// body string.
  792|       |#[cfg(test)]
  793|      2|fn build_relationships(entities: &[NewEntity]) -> (Vec<NewRelationship>, bool) {
  794|      2|    if entities.len() < 2 {
  795|      0|        return (Vec::new(), false);
  796|      2|    }
  797|       |
  798|       |    // v1.0.22: cap configurable via env var (constants::max_relationships_per_memory).
  799|       |    // Allows users with dense corpora to increase beyond the default 50.
  800|      2|    let max_rels = crate::constants::max_relationships_per_memory();
  801|      2|    let n = entities.len().min(MAX_ENTS);
  802|      2|    let mut rels: Vec<NewRelationship> = Vec::with_capacity(n.min(max_rels));
  803|      2|    let mut seen: std::collections::HashSet<(usize, usize)> =
  804|      2|        std::collections::HashSet::with_capacity(n.min(max_rels));
  805|       |
  806|      2|    let mut hit_cap = false;
  807|     16|    'outer: for i in 0..n {
                                      ^2
  808|     16|        if rels.len() >= max_rels {
  809|      1|            hit_cap = true;
  810|      1|            break;
  811|     15|        }
  812|       |
  813|     15|        let mut for_entity = 0usize;
  814|     70|        for j in (i + 1)..n {
                               ^15      ^15
  815|     70|            if for_entity >= TOP_K_RELATIONS {
  816|     10|                break;
  817|     60|            }
  818|     60|            if rels.len() >= max_rels {
  819|      0|                hit_cap = true;
  820|      0|                break 'outer;
  821|     60|            }
  822|       |
  823|     60|            let key = (i.min(j), i.max(j));
  824|     60|            if !seen.insert(key) {
  825|      0|                continue;
  826|     60|            }
  827|       |
  828|     60|            rels.push(NewRelationship {
  829|     60|                // clone needed: NewRelationship requires owned String for source/target
  830|     60|                source: entities[i].name.clone(),
  831|     60|                target: entities[j].name.clone(),
  832|     60|                relation: DEFAULT_RELATION.to_string(),
  833|     60|                strength: 0.5,
  834|     60|                description: None,
  835|     60|            });
  836|     60|            for_entity += 1;
  837|       |        }
  838|       |    }
  839|       |
  840|       |    // v1.0.20: warn when relationships were truncated before covering all possible pairs.
  841|      2|    if hit_cap {
  842|      1|        tracing::warn!(target: "extraction",
  843|      0|            "relationships truncated to {max_rels} (with {n} entities, theoretical max was ~{}x combinations)",
  844|      0|            n.saturating_sub(1)
  845|       |        );
  846|      1|    }
  847|       |
  848|      2|    (rels, hit_cap)
  849|      2|}
  850|       |
  851|       |/// v1.0.31 A9: build relationships only between entities that actually
  852|       |/// co-occur within the same sentence (split on `.`, `!`, `?`, newline).
  853|       |///
  854|       |/// The legacy `build_relationships` pairs every entity with every other,
  855|       |/// yielding a dense C(N,2) graph dominated by spurious "mentions" edges
  856|       |/// across unrelated sections. Restricting to sentence-level co-occurrence
  857|       |/// keeps the edges semantically meaningful while still respecting the
  858|       |/// configurable `max_relationships_per_memory` cap.
  859|       |///
  860|       |/// Returns `(relationships, truncated)` mirroring `build_relationships`.
  861|      8|fn build_relationships_by_sentence_cooccurrence(
  862|      8|    body: &str,
  863|      8|    entities: &[NewEntity],
  864|      8|) -> (Vec<NewRelationship>, bool) {
  865|      8|    if entities.len() < 2 {
  866|      3|        return (Vec::new(), false);
  867|      5|    }
  868|       |
  869|      5|    let max_rels = crate::constants::max_relationships_per_memory();
  870|      5|    let lower_names: Vec<(usize, String)> = entities
  871|      5|        .iter()
  872|      5|        .take(MAX_ENTS)
  873|      5|        .enumerate()
  874|     11|        .map(|(i, e)| (i, e.name.to_lowercase()))
                       ^5
  875|      5|        .collect();
  876|       |
  877|      5|    let mut rels: Vec<NewRelationship> = Vec::with_capacity(max_rels);
  878|      5|    let mut seen: std::collections::HashSet<(usize, usize)> =
  879|      5|        std::collections::HashSet::with_capacity(max_rels);
  880|      5|    let mut hit_cap = false;
  881|       |
  882|     12|    for sentence in body.split(['.', '!', '?', '\n']) {
                                  ^5   ^5    ^5
  883|     12|        if sentence.trim().is_empty() {
  884|      2|            continue;
  885|     10|        }
  886|     10|        let lower_sentence = sentence.to_lowercase();
  887|     10|        let present: Vec<usize> = lower_names
  888|     10|            .iter()
  889|     22|            .filter(|(_, name)| !name.is_empty() && lower_sentence.contains(name.as_str()))
                           ^10
  890|     10|            .map(|(i, _)| *i)
  891|     10|            .collect();
  892|       |
  893|     10|        if present.len() < 2 {
  894|      7|            continue;
  895|      3|        }
  896|       |
  897|      3|        let n = present.len();
  898|      6|        for i in 0..n {
                                  ^3
  899|      6|            for j in (i + 1)..n {
                              ^3
  900|      3|                if rels.len() >= max_rels {
  901|      0|                    hit_cap = true;
  902|      0|                    tracing::warn!(target: "extraction",
  903|      0|                        "relationships truncated to {max_rels} during sentence-level pairing"
  904|       |                    );
  905|      0|                    return (rels, hit_cap);
  906|      3|                }
  907|      3|                let ei = present[i];
  908|      3|                let ej = present[j];
  909|      3|                let key = (ei.min(ej), ei.max(ej));
  910|      3|                if seen.insert(key) {
  911|      2|                    rels.push(NewRelationship {
  912|      2|                        source: entities[ei].name.clone(),
  913|      2|                        target: entities[ej].name.clone(),
  914|      2|                        relation: DEFAULT_RELATION.to_string(),
  915|      2|                        strength: 0.5,
  916|      2|                        description: None,
  917|      2|                    });
  918|      2|                }
                              ^1
  919|       |            }
  920|       |        }
  921|       |    }
  922|       |
  923|      5|    (rels, hit_cap)
  924|      8|}
  925|       |
  926|       |/// v1.0.22 P1: extends entities with hyphenated or space-separated numeric suffixes.
  927|       |/// Cases: GPT extracted but body contains "GPT-5" → rewrites to "GPT-5".
  928|       |/// Cases: Claude extracted but body contains "Claude 4" → rewrites to "Claude 4".
  929|       |/// Conservative: only extends when the suffix is at most 7 characters.
  930|       |/// v1.0.24 P2-E: suffix accepts an optional lowercase ASCII letter after digits to cover
  931|       |/// models such as "GPT-4o", "Llama-5b", "Mistral-8x" (digits + [a-z]? + [x\d+]?).
  932|      7|fn extend_with_numeric_suffix(entities: Vec<ExtractedEntity>, body: &str) -> Vec<ExtractedEntity> {
  933|       |    static SUFFIX_RE: OnceLock<Regex> = OnceLock::new();
  934|       |    // Matches: separator + digits + optional decimal + optional lowercase letter
  935|       |    // Examples: "-4", " 5", "-4o", " 5b", "-8x", " 3.5", "-3.5-turbo" (capped by len)
  936|      7|    let suffix_re = SUFFIX_RE.get_or_init(|| {
                                                           ^1
  937|      1|        Regex::new(r"^([\-\s]+\d+(?:\.\d+)?[a-z]?)")
  938|      1|            .expect("compile-time validated numeric suffix regex literal")
  939|      1|    });
  940|       |
  941|      7|    entities
  942|      7|        .into_iter()
  943|      8|        .map(|ent| {
                       ^7
  944|       |            // Finds the first case-sensitive occurrence of the entity in the body
  945|      8|            if let Some(pos) = body.find(&ent.name) {
  946|      8|                let after_pos = pos + ent.name.len();
  947|      8|                if after_pos < body.len() {
  948|      8|                    let after = &body[after_pos..];
  949|      8|                    if let Some(m) = suffix_re.find(after) {
                                              ^4
  950|      4|                        let suffix = m.as_str();
  951|       |                        // Conservative: cap suffix length to 7 chars to avoid grabbing
  952|       |                        // long hyphenated phrases while allowing "4o", "5b", "3.5b".
  953|      4|                        if suffix.len() <= 7 {
  954|      4|                            let mut extended = String::with_capacity(ent.name.len() + suffix.len());
  955|      4|                            extended.push_str(&ent.name);
  956|      4|                            extended.push_str(suffix);
  957|      4|                            return ExtractedEntity {
  958|      4|                                name: extended,
  959|      4|                                entity_type: ent.entity_type,
  960|      4|                            };
  961|      0|                        }
  962|      4|                    }
  963|      0|                }
  964|      0|            }
  965|      4|            ent
  966|      8|        })
  967|      7|        .collect()
  968|      7|}
  969|       |
  970|       |/// Captures versioned model names that NER model consistently misses.
  971|       |///
  972|       |/// NER model often classifies tokens like "Claude" or "Llama" as common nouns,
  973|       |/// failing to emit a B-PER/B-ORG tag. As a result, `extend_with_numeric_suffix`
  974|       |/// never sees these candidates and the version suffix gets lost.
  975|       |///
  976|       |/// This function scans the body with a conservative regex, matching capitalised
  977|       |/// words followed by a space-or-hyphen and a small integer. Matches that are not
  978|       |/// already covered by an existing entity (case-insensitive) are appended with the
  979|       |/// `concept` type, mirroring how `extend_with_numeric_suffix` represents these
  980|       |/// items downstream.
  981|       |///
  982|       |/// v1.0.24 P2-D: regex extended to cover:
  983|       |/// - Alphanumeric version suffixes: "GPT-4o", "Llama-3b", "Mistral-8x"
  984|       |/// - Composite versions: "Mixtral 8x7B" (digit × digit + uppercase letter)
  985|       |/// - Named release tiers after version: "Claude 4 Sonnet", "Llama 3 Pro"
  986|       |///
  987|       |/// Examples covered: "Claude 4", "Llama 3", "GPT-4o", "Claude 4 Sonnet", "Mixtral 8x7B".
  988|       |/// Examples already handled upstream and skipped here: plain "Apple" without a suffix.
  989|      8|fn augment_versioned_model_names(
  990|      8|    entities: Vec<ExtractedEntity>,
  991|      8|    body: &str,
  992|      8|) -> Vec<ExtractedEntity> {
  993|       |    static VERSIONED_MODEL_RE: OnceLock<Regex> = OnceLock::new();
  994|       |    // Pattern breakdown:
  995|       |    //   [A-Z][A-Za-z]{2,15}   — capitalised model name (3-16 chars)
  996|       |    //   [\s\-]+               — separator: space(s) or hyphen(s)
  997|       |    //   \d+(?:\.\d+)?         — version number, optional decimal
  998|       |    //   (?:[a-z]|x\d+[A-Za-z]?)? — optional alphanumeric suffix: "o", "b", "x7B"
  999|       |    //   (?:\s+(?:Sonnet|Opus|Haiku|Turbo|Pro|Lite|Mini|Nano|Flash|Ultra))? — optional release tier
 1000|      8|    let model_re = VERSIONED_MODEL_RE.get_or_init(|| {
                                                                   ^1
 1001|      1|        Regex::new(
 1002|      1|            r"\b([A-Z][A-Za-z]{2,15})[\s\-]+(\d+(?:\.\d+)?(?:[a-z]|x\d+[A-Za-z]?)?)(?:\s+(?:Sonnet|Opus|Haiku|Turbo|Pro|Lite|Mini|Nano|Flash|Ultra))?\b",
 1003|       |        )
 1004|      1|        .expect("compile-time validated versioned model regex literal")
 1005|      1|    });
 1006|       |
 1007|      8|    let mut existing_lc: std::collections::HashSet<String> =
 1008|      8|        entities.iter().map(|ent| ent.name.to_lowercase()).collect();
                                                ^5       ^5
 1009|      8|    let mut result = entities;
 1010|       |
 1011|      8|    for caps in model_re.captures_iter(body) {
                      ^5
 1012|      5|        let full_match = caps.get(0).map(|m| m.as_str()).unwrap_or("");
 1013|       |        // Conservative cap: avoid harvesting multi-word noise like "section 12" inside
 1014|       |        // long passages. A model name plus a one or two digit suffix fits in 24 chars.
 1015|      5|        if full_match.is_empty() || full_match.len() > 24 {
 1016|      0|            continue;
 1017|      5|        }
 1018|      5|        let normalized_lc = full_match.to_lowercase();
 1019|      5|        if existing_lc.contains(&normalized_lc) {
 1020|      1|            continue;
 1021|      4|        }
 1022|       |        // Stop appending once the global entity cap is reached to keep parity with
 1023|       |        // `merge_and_deduplicate` truncation semantics.
 1024|      4|        if result.len() >= MAX_ENTS {
 1025|      0|            break;
 1026|      4|        }
 1027|      4|        existing_lc.insert(normalized_lc);
 1028|      4|        result.push(ExtractedEntity {
 1029|      4|            name: full_match.to_string(),
 1030|      4|            entity_type: EntityType::Concept,
 1031|      4|        });
 1032|       |    }
 1033|       |
 1034|      8|    result
 1035|      8|}
 1036|       |
 1037|     10|fn merge_and_deduplicate(
 1038|     10|    regex_ents: Vec<ExtractedEntity>,
 1039|     10|    ner_ents: Vec<ExtractedEntity>,
 1040|     10|) -> Vec<ExtractedEntity> {
 1041|       |    // v1.0.25 P0-3: Collision detection uses substring containment (not starts_with)
 1042|       |    // and is scoped per entity_type. This fixes two bugs from prior versions:
 1043|       |    //
 1044|       |    // 1. starts_with was not symmetric for non-prefix substrings. "sonne" does not
 1045|       |    //    start_with "sonnet", so the pair could survive dedup depending on insertion
 1046|       |    //    order. contains() catches both directions unconditionally.
 1047|       |    //
 1048|       |    // 2. The lookup key omitted entity_type, so "Apple/organization" and
 1049|       |    //    "Apple/concept" collapsed into one. Key is now "type\0name_lc".
 1050|       |    //
 1051|       |    // Earlier invariants preserved:
 1052|       |    // - NFKC normalization before lowercasing (v1.0.24).
 1053|       |    // - Longest-wins: on collision keep the entity with the longer name.
 1054|       |    // - Truncation warning at MAX_ENTS.
 1055|     10|    let mut by_lc: std::collections::HashMap<String, usize> =
 1056|     10|        std::collections::HashMap::with_capacity(regex_ents.len() + ner_ents.len());
 1057|     10|    let mut result: Vec<ExtractedEntity> = Vec::with_capacity(MAX_ENTS);
 1058|     10|    let mut truncated = false;
 1059|       |
 1060|     10|    let total_input = regex_ents.len() + ner_ents.len();
 1061|     18|    for ent in regex_ents.into_iter().chain(ner_ents) {
                             ^10        ^10         ^10   ^10
 1062|     18|        let name_lc = ent.name.nfkc().collect::<String>().to_lowercase();
 1063|       |        // Composite key: entity_type + NUL + normalised lowercase name.
 1064|       |        // Collision search is scoped to the same type so that e.g.
 1065|       |        // "Apple/organization" and "Apple/concept" are kept separately.
 1066|     18|        let key = {
 1067|     18|            let et = ent.entity_type.as_str();
 1068|     18|            let mut k = String::with_capacity(et.len() + 1 + name_lc.len());
 1069|     18|            k.push_str(et);
 1070|     18|            k.push('\0');
 1071|     18|            k.push_str(&name_lc);
 1072|     18|            k
 1073|       |        };
 1074|       |
 1075|       |        // Scan stored entries for substring containment within the same type.
 1076|       |        // Two names collide when one is a case-insensitive substring of the other:
 1077|       |        //   "sonne" ⊂ "sonnet"  → collision, keep "sonnet" (longest-wins)
 1078|       |        //   "open"  ⊂ "openai"  → collision, keep "openai" (longest-wins)
 1079|     18|        let type_prefix = {
 1080|     18|            let et = ent.entity_type.as_str();
 1081|     18|            let mut p = String::with_capacity(et.len() + 1);
 1082|     18|            p.push_str(et);
 1083|     18|            p.push('\0');
 1084|     18|            p
 1085|       |        };
 1086|     18|        let mut collision_idx: Option<usize> = None;
 1087|     22|        for (existing_key, idx) in &by_lc {
                           ^9            ^9
 1088|       |            // Fast-path: check type prefix matches before scanning the name.
 1089|      9|            if !existing_key.starts_with(&type_prefix) {
 1090|      1|                continue;
 1091|      8|            }
 1092|      8|            let existing_name_lc = &existing_key[type_prefix.len()..];
 1093|      8|            if existing_name_lc == name_lc
 1094|      5|                || existing_name_lc.contains(name_lc.as_str())
 1095|      5|                || name_lc.contains(existing_name_lc)
 1096|       |            {
 1097|      5|                collision_idx = Some(*idx);
 1098|      5|                break;
 1099|      3|            }
 1100|       |        }
 1101|     18|        match collision_idx {
 1102|      5|            Some(idx) => {
 1103|       |                // Replace stored entity only when the new candidate is strictly
 1104|       |                // longer; otherwise drop the new one.
 1105|      5|                if ent.name.len() > result[idx].name.len() {
 1106|      3|                    let old_name_lc = result[idx].name.nfkc().collect::<String>().to_lowercase();
 1107|      3|                    let old_key = {
 1108|      3|                        let et = result[idx].entity_type.as_str();
 1109|      3|                        let mut k = String::with_capacity(et.len() + 1 + old_name_lc.len());
 1110|      3|                        k.push_str(et);
 1111|      3|                        k.push('\0');
 1112|      3|                        k.push_str(&old_name_lc);
 1113|      3|                        k
 1114|      3|                    };
 1115|      3|                    by_lc.remove(&old_key);
 1116|      3|                    result[idx] = ent;
 1117|      3|                    by_lc.insert(key, idx);
 1118|      3|                }
                              ^2
 1119|       |            }
 1120|     13|            None => {
 1121|     13|                by_lc.insert(key, result.len());
 1122|     13|                result.push(ent);
 1123|     13|            }
 1124|       |        }
 1125|     18|        if result.len() >= MAX_ENTS {
 1126|      0|            truncated = true;
 1127|      0|            break;
 1128|     18|        }
 1129|       |    }
 1130|       |
 1131|       |    // v1.0.20: warn when silent truncation discards entities above MAX_ENTS.
 1132|     10|    if truncated {
 1133|      0|        tracing::warn!(target: "extraction",
 1134|      0|            "extraction truncated at {MAX_ENTS} entities (input had {total_input} candidates before deduplication)"
 1135|       |        );
 1136|     10|    }
 1137|       |
 1138|     10|    result
 1139|     10|}
 1140|       |
 1141|      5|fn to_new_entities(extracted: Vec<ExtractedEntity>) -> Vec<NewEntity> {
 1142|      5|    extracted
 1143|      5|        .into_iter()
 1144|      5|        .map(|e| NewEntity {
 1145|      7|            name: e.name,
 1146|      7|            entity_type: e.entity_type,
 1147|      7|            description: None,
 1148|      7|        })
 1149|      5|        .collect()
 1150|      5|}
 1151|       |
 1152|      3|pub fn extract_graph_auto(
 1153|      3|    body: &str,
 1154|      3|    paths: &AppPaths,
 1155|      3|    variant: GlinerVariant,
 1156|      3|) -> Result<ExtractionResult> {
 1157|      3|    let regex_entities = apply_regex_prefilter(body);
 1158|      3|    let threshold = crate::constants::gliner_confidence_threshold();
 1159|       |
 1160|      3|    let mut gliner_used = false;
 1161|      3|    let ner_entities = match get_or_init_gliner(paths, variant) {
 1162|      3|        Some(model) => match model.predict(body, GLINER_ENTITY_LABELS, threshold) {
 1163|      3|            Ok(ents) => {
 1164|      3|                gliner_used = true;
 1165|      3|                ents
 1166|       |            }
 1167|      0|            Err(e) => {
 1168|      0|                tracing::warn!(target: "extraction", error = %e, "GLiNER NER failed, falling back to regex-only");
 1169|      0|                Vec::new()
 1170|       |            }
 1171|       |        },
 1172|      0|        None => Vec::new(),
 1173|       |    };
 1174|       |
 1175|      3|    let merged = merge_and_deduplicate(regex_entities, ner_entities);
 1176|      3|    let extended = extend_with_numeric_suffix(merged, body);
 1177|      3|    let with_models = augment_versioned_model_names(extended, body);
 1178|      3|    let with_models: Vec<ExtractedEntity> = with_models
 1179|      3|        .into_iter()
 1180|      4|        .filter(|e| !regex_section_marker().is_match(&e.name))
                       ^3
 1181|      3|        .collect();
 1182|      3|    let entities = to_new_entities(with_models);
 1183|      3|    let (relationships, relationships_truncated) =
 1184|      3|        build_relationships_by_sentence_cooccurrence(body, &entities);
 1185|       |
 1186|      3|    let extraction_method = if gliner_used {
 1187|      3|        format!("gliner-{variant}+regex")
 1188|       |    } else {
 1189|      0|        "regex-only".to_string()
 1190|       |    };
 1191|       |
 1192|      3|    let urls = extract_urls(body);
 1193|       |
 1194|      3|    Ok(ExtractionResult {
 1195|      3|        entities,
 1196|      3|        relationships,
 1197|      3|        relationships_truncated,
 1198|      3|        extraction_method,
 1199|      3|        urls,
 1200|      3|    })
 1201|      3|}
 1202|       |
 1203|       |pub struct RegexExtractor;
 1204|       |
 1205|       |impl Extractor for RegexExtractor {
 1206|      2|    fn extract(&self, body: &str) -> Result<ExtractionResult> {
 1207|      2|        let regex_entities = apply_regex_prefilter(body);
 1208|      2|        let entities = to_new_entities(regex_entities);
 1209|      2|        let (relationships, relationships_truncated) =
 1210|      2|            build_relationships_by_sentence_cooccurrence(body, &entities);
 1211|      2|        let urls = extract_urls(body);
 1212|      2|        Ok(ExtractionResult {
 1213|      2|            entities,
 1214|      2|            relationships,
 1215|      2|            relationships_truncated,
 1216|      2|            extraction_method: "regex-only".to_string(),
 1217|      2|            urls,
 1218|      2|        })
 1219|      2|    }
 1220|       |}
 1221|       |
 1222|       |#[cfg(test)]
 1223|       |mod tests {
 1224|       |    use super::*;
 1225|       |    use crate::entity_type::EntityType;
 1226|       |
 1227|      3|    fn make_paths() -> AppPaths {
 1228|       |        use std::path::PathBuf;
 1229|      3|        AppPaths {
 1230|      3|            db: PathBuf::from("/tmp/test.sqlite"),
 1231|      3|            models: PathBuf::from("/tmp/test_models"),
 1232|      3|        }
 1233|      3|    }
 1234|       |
 1235|       |    #[test]
 1236|      1|    fn regex_email_captures_address() {
 1237|      1|        let ents = apply_regex_prefilter("contact: someone@company.com for more info");
 1238|       |        // v1.0.20: emails are classified as "concept" (regex alone cannot distinguish person from role).
 1239|      1|        assert!(ents
 1240|      1|            .iter()
 1241|      1|            .any(|e| e.name == "someone@company.com" && e.entity_type == EntityType::Concept));
 1242|      1|    }
 1243|       |
 1244|       |    #[test]
 1245|      1|    fn regex_all_caps_filters_pt_rule_word() {
 1246|       |        // v1.0.20 fix P1: NUNCA, PROIBIDO, DEVE must not become "entities".
 1247|      1|        let ents = apply_regex_prefilter("NUNCA do this. PROIBIDO use X. DEVE follow Y.");
 1248|      1|        assert!(
 1249|      1|            !ents.iter().any(|e| e.name == "NUNCA"),
                                               ^0        ^0
 1250|      0|            "NUNCA must be filtered as a stopword"
 1251|       |        );
 1252|      1|        assert!(
 1253|      1|            !ents.iter().any(|e| e.name == "PROIBIDO"),
                                               ^0        ^0
 1254|      0|            "PROIBIDO must be filtered"
 1255|       |        );
 1256|      1|        assert!(
 1257|      1|            !ents.iter().any(|e| e.name == "DEVE"),
                                               ^0        ^0
 1258|      0|            "DEVE must be filtered"
 1259|       |        );
 1260|      1|    }
 1261|       |
 1262|       |    #[test]
 1263|      1|    fn regex_all_caps_accepts_underscored_constant() {
 1264|       |        // Technical constants like MAX_RETRY, TIMEOUT_MS must always be accepted.
 1265|      1|        let ents = apply_regex_prefilter("configure MAX_RETRY=3 and API_TIMEOUT=30");
 1266|      1|        assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
 1267|      2|        assert!(ents.iter().any(|e| e.name == "API_TIMEOUT"));
                      ^1      ^1          ^1
 1268|      1|    }
 1269|       |
 1270|       |    #[test]
 1271|      1|    fn regex_all_caps_accepts_domain_acronym() {
 1272|       |        // Legitimate (non-stopword) acronyms must pass: OPENAI, NVIDIA, GOOGLE.
 1273|      1|        let ents = apply_regex_prefilter("OPENAI launched GPT-5 with NVIDIA H100");
 1274|      1|        assert!(ents.iter().any(|e| e.name == "OPENAI"));
 1275|      3|        assert!(ents.iter().any(|e| e.name == "NVIDIA"));
                      ^1      ^1          ^1
 1276|      1|    }
 1277|       |
 1278|       |    #[test]
 1279|      1|    fn regex_url_does_not_appear_in_apply_regex_prefilter() {
 1280|       |        // v1.0.24 P0-2: URLs were removed from apply_regex_prefilter and now go through extract_urls.
 1281|      1|        let ents = apply_regex_prefilter("see https://docs.rs/crate for details");
 1282|      1|        assert!(
 1283|      1|            !ents.iter().any(|e| e.name.starts_with("https://")),
                                               ^0     ^0
 1284|      0|            "URLs must not appear as entities after the P0-2 split"
 1285|       |        );
 1286|      1|    }
 1287|       |
 1288|       |    #[test]
 1289|      1|    fn extract_urls_captures_https() {
 1290|      1|        let urls = extract_urls("see https://docs.rs/crate for details");
 1291|      1|        assert_eq!(urls.len(), 1);
 1292|      1|        assert_eq!(urls[0].url, "https://docs.rs/crate");
 1293|      1|        assert!(urls[0].offset > 0);
 1294|      1|    }
 1295|       |
 1296|       |    #[test]
 1297|      1|    fn extract_urls_trim_sufixo_pontuacao() {
 1298|      1|        let urls = extract_urls("link: https://example.com/path. fim");
 1299|      1|        assert!(!urls.is_empty());
 1300|      1|        assert!(
 1301|      1|            !urls[0].url.ends_with('.'),
 1302|      0|            "sufixo ponto deve ser removido"
 1303|       |        );
 1304|      1|    }
 1305|       |
 1306|       |    #[test]
 1307|      1|    fn extract_urls_dedupes_repeated() {
 1308|      1|        let body = "https://example.com referenciado aqui e depois aqui https://example.com";
 1309|      1|        let urls = extract_urls(body);
 1310|      1|        assert_eq!(urls.len(), 1, "URLs repetidas devem ser deduplicadas");
                                                ^0
 1311|      1|    }
 1312|       |
 1313|       |    #[test]
 1314|      1|    fn regex_uuid_captura_identificador() {
 1315|      1|        let ents = apply_regex_prefilter("id=550e8400-e29b-41d4-a716-446655440000 no sistema");
 1316|      1|        assert!(ents.iter().any(|e| e.entity_type == EntityType::Concept));
 1317|      1|    }
 1318|       |
 1319|       |    #[test]
 1320|      1|    fn regex_all_caps_captura_constante() {
 1321|      1|        let ents = apply_regex_prefilter("configure MAX_RETRY e TIMEOUT_MS");
 1322|      1|        assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
 1323|      2|        assert!(ents.iter().any(|e| e.name == "TIMEOUT_MS"));
                      ^1      ^1          ^1
 1324|      1|    }
 1325|       |
 1326|       |    #[test]
 1327|      1|    fn regex_all_caps_ignores_short_words() {
 1328|      1|        let ents = apply_regex_prefilter("use AI em seu projeto");
 1329|      1|        assert!(
 1330|      1|            !ents.iter().any(|e| e.name == "AI"),
                                               ^0        ^0
 1331|      0|            "AI tem apenas 2 chars, deve ser ignorado"
 1332|       |        );
 1333|      1|    }
 1334|       |
 1335|       |    #[test]
 1336|      1|    fn build_relationships_respeitam_max_rels() {
 1337|      1|        let entities: Vec<NewEntity> = (0..20)
 1338|      1|            .map(|i| NewEntity {
 1339|     20|                name: format!("entidade_{i}"),
 1340|     20|                entity_type: EntityType::Concept,
 1341|     20|                description: None,
 1342|     20|            })
 1343|      1|            .collect();
 1344|      1|        let (rels, truncated) = build_relationships(&entities);
 1345|      1|        let max_rels = crate::constants::max_relationships_per_memory();
 1346|      1|        assert!(rels.len() <= max_rels, "deve respeitar max_rels={max_rels}");
                                                      ^0
 1347|      1|        if rels.len() == max_rels {
 1348|      1|            assert!(truncated, "truncated deve ser true quando atingiu o cap");
                                             ^0
 1349|      0|        }
 1350|      1|    }
 1351|       |
 1352|       |    #[test]
 1353|      1|    fn build_relationships_without_duplicates() {
 1354|      1|        let entities: Vec<NewEntity> = (0..5)
 1355|      1|            .map(|i| NewEntity {
 1356|      5|                name: format!("ent_{i}"),
 1357|      5|                entity_type: EntityType::Concept,
 1358|      5|                description: None,
 1359|      5|            })
 1360|      1|            .collect();
 1361|      1|        let (rels, _truncated) = build_relationships(&entities);
 1362|      1|        let mut pares: std::collections::HashSet<(String, String)> =
 1363|      1|            std::collections::HashSet::new();
 1364|     11|        for r in &rels {
                          ^10
 1365|     10|            let par = (r.source.clone(), r.target.clone());
 1366|     10|            assert!(pares.insert(par), "par duplicado encontrado");
                                                     ^0
 1367|       |        }
 1368|      1|    }
 1369|       |
 1370|       |    #[test]
 1371|      1|    fn merge_dedupes_by_lowercase_name() {
 1372|       |        // v1.0.25: collision detection is scoped per entity_type; same name + same type
 1373|       |        // must deduplicate to one entry. Different types are kept separately.
 1374|      1|        let a = vec![ExtractedEntity {
 1375|      1|            name: "Rust".to_string(),
 1376|      1|            entity_type: EntityType::Concept,
 1377|      1|        }];
 1378|      1|        let b = vec![ExtractedEntity {
 1379|      1|            name: "rust".to_string(),
 1380|      1|            entity_type: EntityType::Concept,
 1381|      1|        }];
 1382|      1|        let merged = merge_and_deduplicate(a, b);
 1383|      1|        assert_eq!(
 1384|      1|            merged.len(),
 1385|       |            1,
 1386|      0|            "rust and Rust with the same type are the same entity"
 1387|       |        );
 1388|      1|    }
 1389|       |
 1390|       |    #[test]
 1391|      1|    fn regex_extractor_implements_trait() {
 1392|      1|        let extractor = RegexExtractor;
 1393|      1|        let result = extractor
 1394|      1|            .extract("contato: dev@empresa.io e MAX_TIMEOUT configurado")
 1395|      1|            .unwrap();
 1396|      1|        assert!(!result.entities.is_empty());
 1397|      1|    }
 1398|       |
 1399|       |    #[test]
 1400|      1|    fn extract_returns_ok_without_model() {
 1401|       |        // Without a downloaded model, must return Ok with regex-only entities.
 1402|      1|        let paths = make_paths();
 1403|      1|        let body = "contato: teste@exemplo.com com MAX_RETRY=3";
 1404|      1|        let result = extract_graph_auto(body, &paths, GlinerVariant::Int8).unwrap();
 1405|      1|        assert!(result
 1406|      1|            .entities
 1407|      1|            .iter()
 1408|      1|            .any(|e| e.name.contains("teste@exemplo.com")));
 1409|      1|    }
 1410|       |
 1411|       |    #[test]
 1412|      1|    fn stopwords_filter_v1024_terms() {
 1413|       |        // v1.0.24: verify that all 17 new stopwords added in P0-3 are filtered
 1414|       |        // by apply_regex_prefilter so they do not appear as entities.
 1415|      1|        let body = "ACEITE ACK ACL BORDA CHECKLIST COMPLETED CONFIRME \
 1416|      1|                    DEVEMOS DONE FIXED NEGUE PENDING PLAN PODEMOS RECUSE TOKEN VAMOS";
 1417|      1|        let ents = apply_regex_prefilter(body);
 1418|      1|        let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
                                                                 ^0     ^0
 1419|     18|        for word in &[
                          ^17
 1420|     18|            "ACEITE",
 1421|     18|            "ACK",
 1422|     18|            "ACL",
 1423|     18|            "BORDA",
 1424|     18|            "CHECKLIST",
 1425|     18|            "COMPLETED",
 1426|     18|            "CONFIRME",
 1427|     18|            "DEVEMOS",
 1428|     18|            "DONE",
 1429|     18|            "FIXED",
 1430|     18|            "NEGUE",
 1431|     18|            "PENDING",
 1432|     18|            "PLAN",
 1433|     18|            "PODEMOS",
 1434|     18|            "RECUSE",
 1435|     18|            "TOKEN",
 1436|     18|            "VAMOS",
 1437|     18|        ] {
 1438|     17|            assert!(
 1439|     17|                !names.contains(word),
 1440|      0|                "v1.0.24 stopword {word} should be filtered but was found in entities"
 1441|       |            );
 1442|       |        }
 1443|      1|    }
 1444|       |
 1445|       |    #[test]
 1446|      1|    fn dedup_normalizes_unicode_combining_marks() {
 1447|       |        // v1.0.24 P1-E: "Caf\u{e9}" (NFC precomposed) and "Cafe\u{301}" (NFD with
 1448|       |        // combining acute accent) must deduplicate to a single entity after NFKC
 1449|       |        // normalization.
 1450|      1|        let nfc = vec![ExtractedEntity {
 1451|      1|            name: "Caf\u{e9}".to_string(),
 1452|      1|            entity_type: EntityType::Concept,
 1453|      1|        }];
 1454|       |        // Build the NFD form: 'e' followed by combining acute accent U+0301
 1455|      1|        let nfd_name = "Cafe\u{301}".to_string();
 1456|      1|        let nfd = vec![ExtractedEntity {
 1457|      1|            name: nfd_name,
 1458|      1|            entity_type: EntityType::Concept,
 1459|      1|        }];
 1460|      1|        let merged = merge_and_deduplicate(nfc, nfd);
 1461|      1|        assert_eq!(
 1462|      1|            merged.len(),
 1463|       |            1,
 1464|      0|            "NFC 'Caf\\u{{e9}}' and NFD 'Cafe\\u{{301}}' must deduplicate to 1 entity after NFKC normalization"
 1465|       |        );
 1466|      1|    }
 1467|       |
 1468|       |    #[test]
 1469|      1|    fn extraction_method_regex_only_unchanged() {
 1470|       |        // RegexExtractor always returns "regex-only" regardless of GLINER_MODEL state.
 1471|       |        // This guards against accidentally changing the regex-only fallback string.
 1472|      1|        let result = RegexExtractor.extract("contact: dev@acme.io").unwrap();
 1473|      1|        assert_eq!(
 1474|       |            result.extraction_method, "regex-only",
 1475|      0|            "RegexExtractor must return regex-only"
 1476|       |        );
 1477|      1|    }
 1478|       |
 1479|       |    // --- P2-E: extend_with_numeric_suffix alphanumeric suffix ---
 1480|       |
 1481|       |    #[test]
 1482|      1|    fn extend_suffix_pure_numeric_unchanged() {
 1483|       |        // Existing behaviour: pure-numeric suffix must still work after P2-E.
 1484|      1|        let ents = vec![ExtractedEntity {
 1485|      1|            name: "GPT".to_string(),
 1486|      1|            entity_type: EntityType::Concept,
 1487|      1|        }];
 1488|      1|        let result = extend_with_numeric_suffix(ents, "using GPT-5 in the project");
 1489|      1|        assert_eq!(
 1490|      1|            result[0].name, "GPT-5",
 1491|      0|            "purely numeric suffix must be extended"
 1492|       |        );
 1493|      1|    }
 1494|       |
 1495|       |    #[test]
 1496|      1|    fn extend_suffix_alphanumeric_letter_after_digit() {
 1497|       |        // P2-E: "4o" suffix (digit + lowercase letter) must be captured.
 1498|      1|        let ents = vec![ExtractedEntity {
 1499|      1|            name: "GPT".to_string(),
 1500|      1|            entity_type: EntityType::Concept,
 1501|      1|        }];
 1502|      1|        let result = extend_with_numeric_suffix(ents, "using GPT-4o for advanced tasks");
 1503|      1|        assert_eq!(result[0].name, "GPT-4o", "suffix '4o' must be accepted");
                                                           ^0
 1504|      1|    }
 1505|       |
 1506|       |    #[test]
 1507|      1|    fn extend_suffix_alphanumeric_b_suffix() {
 1508|       |        // P2-E: "5b" suffix (digit + 'b') must be captured.
 1509|      1|        let ents = vec![ExtractedEntity {
 1510|      1|            name: "Llama".to_string(),
 1511|      1|            entity_type: EntityType::Concept,
 1512|      1|        }];
 1513|      1|        let result = extend_with_numeric_suffix(ents, "Llama-5b open-weight model");
 1514|      1|        assert_eq!(result[0].name, "Llama-5b", "suffix '5b' must be accepted");
                                                             ^0
 1515|      1|    }
 1516|       |
 1517|       |    #[test]
 1518|      1|    fn extend_suffix_alphanumeric_x_suffix() {
 1519|       |        // P2-E: "8x" suffix (digit + 'x') must be captured.
 1520|      1|        let ents = vec![ExtractedEntity {
 1521|      1|            name: "Mistral".to_string(),
 1522|      1|            entity_type: EntityType::Concept,
 1523|      1|        }];
 1524|      1|        let result = extend_with_numeric_suffix(ents, "testing Mistral-8x in production");
 1525|      1|        assert_eq!(result[0].name, "Mistral-8x", "suffix '8x' must be accepted");
                                                               ^0
 1526|      1|    }
 1527|       |
 1528|       |    // --- P2-D: augment_versioned_model_names extended regex ---
 1529|       |
 1530|       |    #[test]
 1531|      1|    fn augment_versioned_gpt4o() {
 1532|       |        // P2-D: "GPT-4o" must be captured with alphanumeric suffix.
 1533|      1|        let result = augment_versioned_model_names(vec![], "using GPT-4o for analysis");
 1534|      1|        assert!(
 1535|      1|            result.iter().any(|e| e.name == "GPT-4o"),
 1536|      0|            "GPT-4o must be captured by augment, found: {:?}",
 1537|      0|            result.iter().map(|e| &e.name).collect::<Vec<_>>()
 1538|       |        );
 1539|      1|    }
 1540|       |
 1541|       |    #[test]
 1542|      1|    fn augment_versioned_claude_4_sonnet() {
 1543|       |        // P2-D: "Claude 4 Sonnet" must be captured with release tier.
 1544|      1|        let result =
 1545|      1|            augment_versioned_model_names(vec![], "best model: Claude 4 Sonnet released today");
 1546|      1|        assert!(
 1547|      1|            result.iter().any(|e| e.name == "Claude 4 Sonnet"),
 1548|      0|            "Claude 4 Sonnet must be captured, found: {:?}",
 1549|      0|            result.iter().map(|e| &e.name).collect::<Vec<_>>()
 1550|       |        );
 1551|      1|    }
 1552|       |
 1553|       |    #[test]
 1554|      1|    fn augment_versioned_llama_3_pro() {
 1555|       |        // P2-D: "Llama 3 Pro" must be captured with release tier.
 1556|      1|        let result =
 1557|      1|            augment_versioned_model_names(vec![], "fine-tuning com Llama 3 Pro localmente");
 1558|      1|        assert!(
 1559|      1|            result.iter().any(|e| e.name == "Llama 3 Pro"),
 1560|      0|            "Llama 3 Pro deve ser capturado, achados: {:?}",
 1561|      0|            result.iter().map(|e| &e.name).collect::<Vec<_>>()
 1562|       |        );
 1563|      1|    }
 1564|       |
 1565|       |    #[test]
 1566|      1|    fn augment_versioned_mixtral_8x7b() {
 1567|       |        // P2-D: "Mixtral 8x7B" composite version must be captured.
 1568|      1|        let result =
 1569|      1|            augment_versioned_model_names(vec![], "executando Mixtral 8x7B no servidor local");
 1570|      1|        assert!(
 1571|      1|            result.iter().any(|e| e.name == "Mixtral 8x7B"),
 1572|      0|            "Mixtral 8x7B deve ser capturado, achados: {:?}",
 1573|      0|            result.iter().map(|e| &e.name).collect::<Vec<_>>()
 1574|       |        );
 1575|      1|    }
 1576|       |
 1577|       |    #[test]
 1578|      1|    fn augment_versioned_does_not_duplicate_existing() {
 1579|       |        // P2-D back-compat: entities already present must not be duplicated.
 1580|      1|        let existing = vec![ExtractedEntity {
 1581|      1|            name: "Claude 4".to_string(),
 1582|      1|            entity_type: EntityType::Concept,
 1583|      1|        }];
 1584|      1|        let result = augment_versioned_model_names(existing, "using Claude 4 in the project");
 1585|      1|        let count = result.iter().filter(|e| e.name == "Claude 4").count();
 1586|      1|        assert_eq!(count, 1, "Claude 4 must not be duplicated");
                                           ^0
 1587|      1|    }
 1588|       |
 1589|       |    // ── v1.0.25 P0-4: new stopwords (API, CLI, HTTP, HTTPS, JWT, LLM, REST, UI, URL) ──
 1590|       |
 1591|       |    #[test]
 1592|      1|    fn stopwords_filter_url_jwt_api_v1025() {
 1593|       |        // Verify that v1.0.25 tech-acronym stopwords do not leak as entities.
 1594|      1|        let body = "We use URL, JWT, and API REST in our LLM-powered CLI via HTTP/HTTPS and UI.";
 1595|      1|        let ents = apply_regex_prefilter(body);
 1596|      1|        let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
                                                                 ^0     ^0
 1597|     10|        for blocked in &[
                          ^9
 1598|     10|            "URL", "JWT", "API", "REST", "LLM", "CLI", "HTTP", "HTTPS", "UI",
 1599|     10|        ] {
 1600|      9|            assert!(
 1601|      9|                !names.contains(blocked),
 1602|      0|                "v1.0.25 stopword {blocked} leaked as entity; found names: {names:?}"
 1603|       |            );
 1604|       |        }
 1605|      1|    }
 1606|       |
 1607|       |    // ── v1.0.25 P0-4: section-marker regex strips "Etapa N", "Fase N", etc. ──
 1608|       |
 1609|       |    #[test]
 1610|      1|    fn section_markers_etapa_fase_filtered_v1025() {
 1611|       |        // "Etapa 3" and "Fase 1" are document-structure labels, not entities.
 1612|       |        // Body intentionally uses PT-BR section keywords (Etapa/Fase/Migra\u{e7}\u{e3}o) to
 1613|       |        // exercise the PT-BR section-marker filter. ASCII-escaped per the project policy.
 1614|      1|        let body = "Etapa 3 do plano: implementar Fase 1 da Migra\u{e7}\u{e3}o.";
 1615|      1|        let ents = apply_regex_prefilter(body);
 1616|      1|        assert!(
 1617|      1|            !ents
 1618|      1|                .iter()
 1619|      1|                .any(|e| e.name.contains("Etapa") || e.name.contains("Fase")),
                                       ^0                          ^0
 1620|      0|            "section markers must be stripped; entities: {:?}",
 1621|      0|            ents.iter().map(|e| &e.name).collect::<Vec<_>>()
 1622|       |        );
 1623|      1|    }
 1624|       |
 1625|       |    #[test]
 1626|      1|    fn section_markers_passo_secao_filtered_v1025() {
 1627|       |        // PT-BR keywords Passo/Se\u{e7}\u{e3}o written with Unicode escapes per the
 1628|       |        // project language policy.
 1629|      1|        let body = "Siga Passo 2 conforme Se\u{e7}\u{e3}o 3 do manual.";
 1630|      1|        let ents = apply_regex_prefilter(body);
 1631|      1|        assert!(
 1632|      1|            !ents
 1633|      1|                .iter()
 1634|      1|                .any(|e| e.name.contains("Passo") || e.name.contains("Se\u{e7}\u{e3}o")),
                                       ^0                          ^0
 1635|      0|            "Passo/Se\\u{{e7}}\\u{{e3}}o section markers must be stripped; entities: {:?}",
 1636|      0|            ents.iter().map(|e| &e.name).collect::<Vec<_>>()
 1637|       |        );
 1638|      1|    }
 1639|       |
 1640|       |    // ── v1.0.25 P0-2: CamelCase brand names extracted as organization ──
 1641|       |
 1642|       |    #[test]
 1643|      1|    fn brand_camelcase_extracted_as_organization_v1025() {
 1644|       |        // "OpenAI" is a CamelCase brand that NER model often misses.
 1645|      1|        let body = "OpenAI launched GPT-4 and PostgreSQL added pgvector.";
 1646|      1|        let ents = apply_regex_prefilter(body);
 1647|      2|        let openai = ents.iter().find(|e| e.name == "OpenAI");
                          ^1       ^1          ^1
 1648|      1|        assert!(
 1649|      1|            openai.is_some(),
 1650|      0|            "OpenAI must be extracted by CamelCase brand regex; entities: {:?}",
 1651|      0|            ents.iter().map(|e| &e.name).collect::<Vec<_>>()
 1652|       |        );
 1653|      1|        assert_eq!(
 1654|      1|            openai.unwrap().entity_type,
 1655|       |            EntityType::Organization,
 1656|      0|            "brand CamelCase must map to organization (V008)"
 1657|       |        );
 1658|      1|    }
 1659|       |
 1660|       |    #[test]
 1661|      1|    fn brand_postgresql_extracted_as_organization_v1025() {
 1662|      1|        let body = "migrating from MySQL to PostgreSQL for better performance.";
 1663|      1|        let ents = apply_regex_prefilter(body);
 1664|      1|        assert!(
 1665|      1|            ents.iter()
 1666|      2|                .any(|e| e.name == "PostgreSQL" && e.entity_type == EntityType::Organization),
                               ^1                                ^1
 1667|      0|            "PostgreSQL must be extracted as organization; entities: {:?}",
 1668|      0|            ents.iter()
 1669|      0|                .map(|e| (&e.name, &e.entity_type))
 1670|      0|                .collect::<Vec<_>>()
 1671|       |        );
 1672|      1|    }
 1673|       |
 1674|       |    // --- P0-3 longest-wins v1.0.25 ---
 1675|       |
 1676|     10|    fn entity(name: &str, entity_type: EntityType) -> ExtractedEntity {
 1677|     10|        ExtractedEntity {
 1678|     10|            name: name.to_string(),
 1679|     10|            entity_type,
 1680|     10|        }
 1681|     10|    }
 1682|       |
 1683|       |    #[test]
 1684|      1|    fn merge_resolves_sonne_vs_sonnet_keeps_longest_v1025() {
 1685|       |        // "Sonne" is a substring of "Sonnet" — longest-wins must keep "Sonnet".
 1686|      1|        let regex = vec![entity("Sonne", EntityType::Concept)];
 1687|      1|        let ner = vec![entity("Sonnet", EntityType::Concept)];
 1688|      1|        let result = merge_and_deduplicate(regex, ner);
 1689|      1|        assert_eq!(result.len(), 1, "expected 1 entity, got: {result:?}");
                                                  ^0
 1690|      1|        assert_eq!(result[0].name, "Sonnet");
 1691|      1|    }
 1692|       |
 1693|       |    #[test]
 1694|      1|    fn merge_resolves_open_vs_openai_keeps_longest_v1025() {
 1695|       |        // "Open" is a substring of "OpenAI" — longest-wins must keep "OpenAI".
 1696|      1|        let regex = vec![
 1697|      1|            entity("Open", EntityType::Organization),
 1698|      1|            entity("OpenAI", EntityType::Organization),
 1699|       |        ];
 1700|      1|        let result = merge_and_deduplicate(regex, vec![]);
 1701|      1|        assert_eq!(result.len(), 1, "expected 1 entity, got: {result:?}");
                                                  ^0
 1702|      1|        assert_eq!(result[0].name, "OpenAI");
 1703|      1|    }
 1704|       |
 1705|       |    #[test]
 1706|      1|    fn merge_keeps_both_when_no_containment_v1025() {
 1707|       |        // "Alice" and "Bob" share no containment — both must be preserved.
 1708|      1|        let regex = vec![
 1709|      1|            entity("Alice", EntityType::Person),
 1710|      1|            entity("Bob", EntityType::Person),
 1711|       |        ];
 1712|      1|        let result = merge_and_deduplicate(regex, vec![]);
 1713|      1|        assert_eq!(result.len(), 2, "expected 2 entities, got: {result:?}");
                                                  ^0
 1714|      1|    }
 1715|       |
 1716|       |    #[test]
 1717|      1|    fn merge_respects_entity_type_boundary_v1025() {
 1718|       |        // Same name "Apple" but different types: both must survive independently.
 1719|      1|        let regex = vec![
 1720|      1|            entity("Apple", EntityType::Organization),
 1721|      1|            entity("Apple", EntityType::Concept),
 1722|       |        ];
 1723|      1|        let result = merge_and_deduplicate(regex, vec![]);
 1724|      1|        assert_eq!(
 1725|      1|            result.len(),
 1726|       |            2,
 1727|      0|            "expected 2 entities (different types), got: {result:?}"
 1728|       |        );
 1729|      1|    }
 1730|       |
 1731|       |    #[test]
 1732|      1|    fn merge_case_insensitive_dedup_v1025() {
 1733|       |        // "OpenAI" and "openai" are the same entity — deduplicate to exactly one.
 1734|      1|        let regex = vec![
 1735|      1|            entity("OpenAI", EntityType::Organization),
 1736|      1|            entity("openai", EntityType::Organization),
 1737|       |        ];
 1738|      1|        let result = merge_and_deduplicate(regex, vec![]);
 1739|      1|        assert_eq!(
 1740|      1|            result.len(),
 1741|       |            1,
 1742|      0|            "expected 1 entity after case-insensitive dedup, got: {result:?}"
 1743|       |        );
 1744|      1|    }
 1745|       |
 1746|       |    // ── v1.0.31 A1: NER cap protects against pathological body sizes ──
 1747|       |
 1748|       |    #[test]
 1749|      1|    fn extract_graph_auto_handles_large_body_under_30s() {
 1750|       |        // Regression guard for the v1.0.31 A1 fix. A 80 KB body without real
 1751|       |        // entities must complete in under 30 s; before the cap it took 5+ minutes.
 1752|      1|        let body = "x ".repeat(40_000);
 1753|      1|        let paths = make_paths();
 1754|      1|        let start = std::time::Instant::now();
 1755|      1|        let result = extract_graph_auto(&body, &paths, GlinerVariant::Int8)
 1756|      1|            .expect("extraction must not error");
 1757|      1|        let elapsed = start.elapsed();
 1758|      1|        assert!(
 1759|      1|            elapsed.as_secs() < 30,
 1760|      0|            "extract_graph_auto took {}s for 80 KB body (cap should keep it well under 30s)",
 1761|      0|            elapsed.as_secs()
 1762|       |        );
 1763|       |        // No real entities expected in synthetic body, but the call must succeed.
 1764|      1|        let _ = result.entities;
 1765|      1|    }
 1766|       |
 1767|       |    // ── v1.0.31 A11: PT-BR uppercase noise must not leak as entities ──
 1768|       |
 1769|       |    #[test]
 1770|      1|    fn pt_uppercase_stopwords_filtered_v1031() {
 1771|      1|        let body = "Para o ADAPTER funcionar com PROJETO em modo PASSIVA, devemos usar \
 1772|      1|                    SOMENTE LEITURA conforme a REGRA OBRIGATORIA do EXEMPLO DEFAULT.";
 1773|      1|        let ents = apply_regex_prefilter(body);
 1774|      1|        let names: Vec<String> = ents.iter().map(|e| e.name.to_uppercase()).collect();
                                                                   ^0     ^0
 1775|     10|        for stop in &[
                          ^9
 1776|     10|            "ADAPTER",
 1777|     10|            "PROJETO",
 1778|     10|            "PASSIVA",
 1779|     10|            "SOMENTE",
 1780|     10|            "LEITURA",
 1781|     10|            "REGRA",
 1782|     10|            "OBRIGATORIA",
 1783|     10|            "EXEMPLO",
 1784|     10|            "DEFAULT",
 1785|     10|        ] {
 1786|      9|            assert!(
 1787|      9|                !names.contains(&stop.to_string()),
 1788|      0|                "v1.0.31 A11 stoplist failed: {stop} leaked as entity; got names: {names:?}"
 1789|       |            );
 1790|       |        }
 1791|      1|    }
 1792|       |
 1793|       |    #[test]
 1794|      1|    fn pt_underscored_identifier_preserved_v1031() {
 1795|       |        // Identifiers with underscore must still pass through (FLOWAIPER_API_KEY,
 1796|       |        // MAX_RETRY etc. are intentional entities, not noise).
 1797|      1|        let ents = apply_regex_prefilter("configure FLOWAIPER_API_KEY=foo and MAX_TIMEOUT=30");
 1798|      2|        let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
                          ^1     ^1          ^1          ^1                       ^1
 1799|      1|        assert!(names.contains(&"FLOWAIPER_API_KEY"));
 1800|      1|        assert!(names.contains(&"MAX_TIMEOUT"));
 1801|      1|    }
 1802|       |
 1803|       |    // ── v1.0.31 A9: relationships only between entities co-occurring in same sentence ──
 1804|       |
 1805|       |    #[test]
 1806|      1|    fn build_relationships_by_sentence_only_links_co_occurring_entities() {
 1807|      1|        let body = "Alice met Bob at the conference. Carol works alone in another room.";
 1808|      1|        let entities = vec![
 1809|      1|            NewEntity {
 1810|      1|                name: "Alice".to_string(),
 1811|      1|                entity_type: EntityType::Person,
 1812|      1|                description: None,
 1813|      1|            },
 1814|      1|            NewEntity {
 1815|      1|                name: "Bob".to_string(),
 1816|      1|                entity_type: EntityType::Person,
 1817|      1|                description: None,
 1818|      1|            },
 1819|      1|            NewEntity {
 1820|      1|                name: "Carol".to_string(),
 1821|      1|                entity_type: EntityType::Person,
 1822|      1|                description: None,
 1823|      1|            },
 1824|       |        ];
 1825|      1|        let (rels, truncated) = build_relationships_by_sentence_cooccurrence(body, &entities);
 1826|      1|        assert!(!truncated);
 1827|      1|        assert_eq!(
 1828|      1|            rels.len(),
 1829|       |            1,
 1830|      0|            "only Alice/Bob should pair (same sentence); Carol is isolated"
 1831|       |        );
 1832|      1|        let pair = (rels[0].source.as_str(), rels[0].target.as_str());
 1833|      1|        assert!(
 1834|      1|            matches!(pair, ("Alice", "Bob") | ("Bob", "Alice")),
                                                             ^0     ^0
 1835|      0|            "unexpected pair {pair:?}"
 1836|       |        );
 1837|      1|    }
 1838|       |
 1839|       |    #[test]
 1840|      1|    fn build_relationships_by_sentence_returns_empty_for_single_entity() {
 1841|      1|        let body = "Alice is here.";
 1842|      1|        let entities = vec![NewEntity {
 1843|      1|            name: "Alice".to_string(),
 1844|      1|            entity_type: EntityType::Person,
 1845|      1|            description: None,
 1846|      1|        }];
 1847|      1|        let (rels, truncated) = build_relationships_by_sentence_cooccurrence(body, &entities);
 1848|      1|        assert!(rels.is_empty());
 1849|      1|        assert!(!truncated);
 1850|      1|    }
 1851|       |
 1852|       |    #[test]
 1853|      1|    fn build_relationships_by_sentence_dedupes_pairs_across_sentences() {
 1854|      1|        let body = "Alice met Bob. Bob saw Alice again.";
 1855|      1|        let entities = vec![
 1856|      1|            NewEntity {
 1857|      1|                name: "Alice".to_string(),
 1858|      1|                entity_type: EntityType::Person,
 1859|      1|                description: None,
 1860|      1|            },
 1861|      1|            NewEntity {
 1862|      1|                name: "Bob".to_string(),
 1863|      1|                entity_type: EntityType::Person,
 1864|      1|                description: None,
 1865|      1|            },
 1866|       |        ];
 1867|      1|        let (rels, _) = build_relationships_by_sentence_cooccurrence(body, &entities);
 1868|      1|        assert_eq!(
 1869|      1|            rels.len(),
 1870|       |            1,
 1871|      0|            "Alice/Bob pair must be emitted only once even when co-occurring in multiple sentences"
 1872|       |        );
 1873|      1|    }
 1874|       |
 1875|       |    #[test]
 1876|      1|    fn extraction_max_tokens_default_is_5000() {
 1877|      1|        std::env::remove_var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS");
 1878|      1|        assert_eq!(crate::constants::extraction_max_tokens(), 5_000);
 1879|      1|    }
 1880|       |
 1881|       |    #[test]
 1882|      1|    fn extraction_max_tokens_env_override_clamped() {
 1883|      1|        std::env::set_var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS", "200");
 1884|      1|        assert_eq!(
 1885|      1|            crate::constants::extraction_max_tokens(),
 1886|       |            5_000,
 1887|      0|            "value below 512 must fall back to default"
 1888|       |        );
 1889|       |
 1890|      1|        std::env::set_var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS", "200000");
 1891|      1|        assert_eq!(
 1892|      1|            crate::constants::extraction_max_tokens(),
 1893|       |            5_000,
 1894|      0|            "value above 100_000 must fall back to default"
 1895|       |        );
 1896|       |
 1897|      1|        std::env::set_var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS", "8000");
 1898|      1|        assert_eq!(
 1899|      1|            crate::constants::extraction_max_tokens(),
 1900|       |            8_000,
 1901|      0|            "valid value must be honoured"
 1902|       |        );
 1903|       |
 1904|      1|        std::env::remove_var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS");
 1905|      1|    }
 1906|       |
 1907|       |    #[test]
 1908|      1|    fn gliner_variant_from_str_valid() {
 1909|      1|        assert_eq!(
 1910|      1|            "fp32".parse::<GlinerVariant>().unwrap(),
 1911|       |            GlinerVariant::Fp32
 1912|       |        );
 1913|      1|        assert_eq!(
 1914|      1|            "fp16".parse::<GlinerVariant>().unwrap(),
 1915|       |            GlinerVariant::Fp16
 1916|       |        );
 1917|      1|        assert_eq!(
 1918|      1|            "int8".parse::<GlinerVariant>().unwrap(),
 1919|       |            GlinerVariant::Int8
 1920|       |        );
 1921|      1|        assert_eq!("q4".parse::<GlinerVariant>().unwrap(), GlinerVariant::Q4);
 1922|      1|        assert_eq!(
 1923|      1|            "q4f16".parse::<GlinerVariant>().unwrap(),
 1924|       |            GlinerVariant::Q4f16
 1925|       |        );
 1926|       |        // Case-insensitive
 1927|      1|        assert_eq!(
 1928|      1|            "FP32".parse::<GlinerVariant>().unwrap(),
 1929|       |            GlinerVariant::Fp32
 1930|       |        );
 1931|      1|        assert_eq!(
 1932|      1|            "INT8".parse::<GlinerVariant>().unwrap(),
 1933|       |            GlinerVariant::Int8
 1934|       |        );
 1935|      1|    }
 1936|       |
 1937|       |    #[test]
 1938|      1|    fn gliner_variant_from_str_invalid() {
 1939|      1|        assert!("invalid".parse::<GlinerVariant>().is_err());
 1940|      1|        assert!("fp64".parse::<GlinerVariant>().is_err());
 1941|      1|        assert!("".parse::<GlinerVariant>().is_err());
 1942|      1|    }
 1943|       |
 1944|       |    #[test]
 1945|      1|    fn gliner_variant_filename_mapping() {
 1946|      1|        assert_eq!(GlinerVariant::Fp32.as_filename(), "model.onnx");
 1947|      1|        assert_eq!(GlinerVariant::Fp16.as_filename(), "model_fp16.onnx");
 1948|      1|        assert_eq!(GlinerVariant::Int8.as_filename(), "model_quantized.onnx");
 1949|      1|        assert_eq!(GlinerVariant::Q4.as_filename(), "model_q4.onnx");
 1950|      1|        assert_eq!(GlinerVariant::Q4f16.as_filename(), "model_q4f16.onnx");
 1951|      1|    }
 1952|       |
 1953|       |    #[test]
 1954|      1|    fn gliner_variant_display() {
 1955|      1|        assert_eq!(format!("{}", GlinerVariant::Fp32), "fp32");
 1956|      1|        assert_eq!(format!("{}", GlinerVariant::Fp16), "fp16");
 1957|      1|        assert_eq!(format!("{}", GlinerVariant::Int8), "int8");
 1958|      1|        assert_eq!(format!("{}", GlinerVariant::Q4), "q4");
 1959|      1|        assert_eq!(format!("{}", GlinerVariant::Q4f16), "q4f16");
 1960|      1|    }
 1961|       |
 1962|       |    #[test]
 1963|      1|    fn gliner_variant_display_size() {
 1964|      1|        assert_eq!(GlinerVariant::Fp32.display_size(), "1.1 GB");
 1965|      1|        assert_eq!(GlinerVariant::Int8.display_size(), "349 MB");
 1966|      1|    }
 1967|       |
 1968|       |    #[test]
 1969|      1|    fn gliner_entity_labels_covers_all_types() {
 1970|      1|        let label_types: Vec<EntityType> = GLINER_ENTITY_LABELS.iter().map(|(_, t)| *t).collect();
 1971|      1|        assert!(label_types.contains(&EntityType::Person));
 1972|      1|        assert!(label_types.contains(&EntityType::Organization));
 1973|      1|        assert!(label_types.contains(&EntityType::Location));
 1974|      1|        assert!(label_types.contains(&EntityType::Date));
 1975|      1|        assert!(label_types.contains(&EntityType::Project));
 1976|      1|        assert!(label_types.contains(&EntityType::Tool));
 1977|      1|        assert!(label_types.contains(&EntityType::File));
 1978|      1|        assert!(label_types.contains(&EntityType::Concept));
 1979|      1|        assert!(label_types.contains(&EntityType::Decision));
 1980|      1|        assert!(label_types.contains(&EntityType::Incident));
 1981|      1|        assert!(label_types.contains(&EntityType::Dashboard));
 1982|      1|        assert!(label_types.contains(&EntityType::IssueTracker));
 1983|      1|        assert!(label_types.contains(&EntityType::Memory));
 1984|      1|        assert_eq!(GLINER_ENTITY_LABELS.len(), 13);
 1985|      1|    }
 1986|       |
 1987|       |    #[test]
 1988|      1|    fn gliner_entity_labels_no_duplicates() {
 1989|      1|        let mut seen = std::collections::HashSet::new();
 1990|     14|        for (label, _) in GLINER_ENTITY_LABELS {
                           ^13
 1991|     13|            assert!(seen.insert(*label), "duplicate label: {label}");
                                                       ^0
 1992|       |        }
 1993|      1|    }
 1994|       |
 1995|       |    #[test]
 1996|      1|    fn extract_graph_auto_regex_only_fallback() {
 1997|       |        // extract_graph_auto must succeed and capture regex entities regardless of whether
 1998|       |        // GLiNER model files exist in the test environment (GLINER_MODEL is a global OnceLock
 1999|       |        // that may already be initialised by a sibling test, so we cannot assert on
 2000|       |        // extraction_method; use RegexExtractor for that invariant).
 2001|      1|        let result = extract_graph_auto(
 2002|      1|            "Contact someone@test.com about OPENAI project",
 2003|      1|            &make_paths(),
 2004|      1|            GlinerVariant::Fp32,
 2005|       |        );
 2006|      1|        assert!(result.is_ok());
 2007|      1|        let res = result.unwrap();
 2008|       |        // Regex prefilter must always capture the email entity
 2009|      1|        assert!(res.entities.iter().any(|e| e.name == "someone@test.com"));
 2010|       |        // extraction_method must be one of the two valid values
 2011|      1|        assert!(
 2012|      1|            res.extraction_method == "regex-only" || res.extraction_method.starts_with("gliner-"),
 2013|      0|            "unexpected extraction_method: {}",
 2014|       |            res.extraction_method
 2015|       |        );
 2016|      1|    }
 2017|       |
 2018|       |    #[test]
 2019|      1|    fn gliner_variant_roundtrip() {
 2020|      6|        for variant in &[
                          ^5
 2021|      6|            GlinerVariant::Fp32,
 2022|      6|            GlinerVariant::Fp16,
 2023|      6|            GlinerVariant::Int8,
 2024|      6|            GlinerVariant::Q4,
 2025|      6|            GlinerVariant::Q4f16,
 2026|      6|        ] {
 2027|      5|            let s = format!("{variant}");
 2028|      5|            let parsed: GlinerVariant = s.parse().unwrap();
 2029|      5|            assert_eq!(*variant, parsed);
 2030|       |        }
 2031|      1|    }
 2032|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/graph.rs:
    1|       |//! Entity graph traversal (BFS over memory_entities + relations).
    2|       |//!
    3|       |//! Queries the SQLite entity and relation tables to expand neighbourhood
    4|       |//! sets used by the `related` and `recall` commands.
    5|       |
    6|       |// src/graph.rs
    7|       |
    8|       |use crate::errors::AppError;
    9|       |use rusqlite::{params, Connection};
   10|       |
   11|       |/// Traverses the entity graph by BFS from seed memories.
   12|       |///
   13|       |/// Returns `memory_id`s reachable through entity and relationship edges,
   14|       |/// excluding the seeds themselves. The algorithm:
   15|       |/// 1. Collects entities associated with seeds via `memory_entities`.
   16|       |/// 2. Runs BFS over `relationships` filtered by `weight >= min_weight` and `namespace`.
   17|       |/// 3. Returns memories linked to discovered entities (excluding soft-deleted).
   18|       |///
   19|       |/// # Errors
   20|       |///
   21|       |/// Propagates [`AppError::Database`] (exit 10) on SQLite query failures.
   22|       |///
   23|       |/// # Examples
   24|       |///
   25|       |/// ```
   26|       |/// use rusqlite::Connection;
   27|       |/// use sqlite_graphrag::graph::traverse_from_memories;
   28|       |///
   29|       |/// // Empty seed list returns immediately without querying the database.
   30|       |/// let conn = Connection::open_in_memory().unwrap();
   31|       |/// let ids = traverse_from_memories(&conn, &[], "global", 0.5, 3).unwrap();
   32|       |/// assert!(ids.is_empty());
   33|       |/// ```
   34|       |///
   35|       |/// ```
   36|       |/// use rusqlite::Connection;
   37|       |/// use sqlite_graphrag::graph::traverse_from_memories;
   38|       |///
   39|       |/// // max_hops == 0 returns immediately without traversal.
   40|       |/// let conn = Connection::open_in_memory().unwrap();
   41|       |/// let ids = traverse_from_memories(&conn, &[1, 2], "global", 0.5, 0).unwrap();
   42|       |/// assert!(ids.is_empty());
   43|       |/// ```
   44|     16|pub fn traverse_from_memories(
   45|     16|    conn: &Connection,
   46|     16|    seed_memory_ids: &[i64],
   47|     16|    namespace: &str,
   48|     16|    min_weight: f64,
   49|     16|    max_hops: u32,
   50|     16|) -> Result<Vec<i64>, AppError> {
   51|     16|    if seed_memory_ids.is_empty() || max_hops == 0 {
                                                   ^15
   52|      2|        return Ok(vec![]);
   53|     14|    }
   54|       |
   55|       |    // Step 1: collect seed entity IDs from seed memories
   56|     14|    let mut seed_entities: Vec<i64> = Vec::with_capacity(seed_memory_ids.len());
   57|     29|    for &mem_id in seed_memory_ids {
                       ^15
   58|     15|        let mut stmt =
   59|     15|            conn.prepare_cached("SELECT entity_id FROM memory_entities WHERE memory_id = ?1")?;
                                                                                                           ^0
   60|     15|        let ids: Vec<i64> = stmt
   61|     15|            .query_map(params![mem_id], |r| r.get(0))?
                                                                   ^0
   62|     15|            .filter_map(|r| r.ok())
   63|     15|            .collect();
   64|     15|        seed_entities.extend(ids);
   65|       |    }
   66|     14|    seed_entities.sort_unstable();
   67|     14|    seed_entities.dedup();
   68|       |
   69|     14|    if seed_entities.is_empty() {
   70|      1|        return Ok(vec![]);
   71|     13|    }
   72|       |
   73|       |    // Step 2: BFS over relationships
   74|       |    use std::collections::HashSet;
   75|     13|    let mut visited: HashSet<i64> = seed_entities.iter().copied().collect();
   76|     13|    let mut frontier: Vec<i64> = seed_entities.to_vec();
   77|       |
   78|     13|    for _ in 0..max_hops {
   79|     25|        if frontier.is_empty() {
   80|      7|            break;
   81|     18|        }
   82|     18|        let mut next_frontier = Vec::with_capacity(frontier.len() * 2);
   83|       |
   84|     38|        for &entity_id in &frontier {
                           ^20
   85|     20|            let mut stmt = conn.prepare_cached(
   86|     20|                "SELECT target_id FROM relationships
   87|     20|                 WHERE source_id = ?1 AND weight >= ?2 AND namespace = ?3",
   88|      0|            )?;
   89|     20|            let neighbors: Vec<i64> = stmt
   90|     20|                .query_map(params![entity_id, min_weight, namespace], |r| r.get(0))?
                                                                                        ^15^15    ^0
   91|     20|                .filter_map(|r| r.ok())
                                              ^15^15
   92|     20|                .filter(|id| !visited.contains(id))
                                            ^15     ^15      ^15
   93|     20|                .collect();
   94|       |
   95|     32|            for id in neighbors {
                              ^12
   96|     12|                visited.insert(id);
   97|     12|                next_frontier.push(id);
   98|     12|            }
   99|       |        }
  100|     18|        frontier = next_frontier;
  101|       |    }
  102|       |
  103|       |    // Step 3: find memories connected to traversed entities (excluding seeds)
  104|     13|    let seed_set: HashSet<i64> = seed_memory_ids.iter().copied().collect();
  105|     13|    let graph_only_entities: Vec<i64> = visited
  106|     13|        .into_iter()
  107|     27|        .filter(|id| !seed_entities.contains(id))
                       ^13
  108|     13|        .collect();
  109|       |
  110|     13|    let mut result_ids: Vec<i64> = Vec::with_capacity(graph_only_entities.len());
  111|     25|    for &entity_id in &graph_only_entities {
                       ^12
  112|     12|        let mut stmt = conn.prepare_cached(
  113|     12|            "SELECT DISTINCT me.memory_id
  114|     12|             FROM memory_entities me
  115|     12|             JOIN memories m ON m.id = me.memory_id
  116|     12|             WHERE me.entity_id = ?1 AND m.deleted_at IS NULL",
  117|      0|        )?;
  118|     12|        let mem_ids: Vec<i64> = stmt
  119|     12|            .query_map(params![entity_id], |r| r.get(0))?
                                                             ^11^11    ^0
  120|     12|            .filter_map(|r| r.ok())
                                          ^11^11
  121|     12|            .filter(|id| !seed_set.contains(id))
                                        ^11      ^11      ^11
  122|     12|            .collect();
  123|     12|        result_ids.extend(mem_ids);
  124|       |    }
  125|       |
  126|     13|    result_ids.sort_unstable();
  127|     13|    result_ids.dedup();
  128|     13|    Ok(result_ids)
  129|     16|}
  130|       |
  131|       |/// BFS graph traversal that also returns the hop distance for each reached memory.
  132|       |///
  133|       |/// Identical to [`traverse_from_memories`] but returns `(memory_id, hop_count)` tuples
  134|       |/// instead of bare IDs. `hop_count` is the BFS depth at which the entity was first
  135|       |/// discovered, starting from 1 for direct neighbours of the seed entities.
  136|       |///
  137|       |/// When `max_neighbors_per_hop` is `Some(k)`, only the top-`k` neighbours by
  138|       |/// `weight DESC` are followed at each entity expansion.  Pass `None` to retain
  139|       |/// the original behaviour (all neighbours above `min_weight` are followed).
  140|       |///
  141|       |/// # Errors
  142|       |///
  143|       |/// Propagates [`AppError::Database`] (exit 10) on SQLite query failures.
  144|      0|pub fn traverse_from_memories_with_hops(
  145|      0|    conn: &Connection,
  146|      0|    seed_memory_ids: &[i64],
  147|      0|    namespace: &str,
  148|      0|    min_weight: f64,
  149|      0|    max_hops: u32,
  150|      0|) -> Result<Vec<(i64, u32)>, AppError> {
  151|      0|    traverse_from_memories_with_hops_inner(
  152|      0|        conn,
  153|      0|        seed_memory_ids,
  154|      0|        namespace,
  155|      0|        min_weight,
  156|      0|        max_hops,
  157|      0|        None,
  158|       |    )
  159|      0|}
  160|       |
  161|       |/// Extended variant that accepts an optional neighbour cap per hop.
  162|       |///
  163|       |/// Pass `max_neighbors_per_hop = Some(k)` to prune each entity's expansion to
  164|       |/// its top-`k` neighbours by edge weight, limiting combinatorial blow-up in
  165|       |/// dense graphs.  `None` is equivalent to the public
  166|       |/// [`traverse_from_memories_with_hops`] function.
  167|       |///
  168|       |/// # Errors
  169|       |///
  170|       |/// Propagates [`AppError::Database`] (exit 10) on SQLite query failures.
  171|      0|pub fn traverse_from_memories_with_hops_capped(
  172|      0|    conn: &Connection,
  173|      0|    seed_memory_ids: &[i64],
  174|      0|    namespace: &str,
  175|      0|    min_weight: f64,
  176|      0|    max_hops: u32,
  177|      0|    max_neighbors_per_hop: Option<usize>,
  178|      0|) -> Result<Vec<(i64, u32)>, AppError> {
  179|      0|    traverse_from_memories_with_hops_inner(
  180|      0|        conn,
  181|      0|        seed_memory_ids,
  182|      0|        namespace,
  183|      0|        min_weight,
  184|      0|        max_hops,
  185|      0|        max_neighbors_per_hop,
  186|       |    )
  187|      0|}
  188|       |
  189|      0|fn traverse_from_memories_with_hops_inner(
  190|      0|    conn: &Connection,
  191|      0|    seed_memory_ids: &[i64],
  192|      0|    namespace: &str,
  193|      0|    min_weight: f64,
  194|      0|    max_hops: u32,
  195|      0|    max_neighbors_per_hop: Option<usize>,
  196|      0|) -> Result<Vec<(i64, u32)>, AppError> {
  197|      0|    if seed_memory_ids.is_empty() || max_hops == 0 {
  198|      0|        return Ok(vec![]);
  199|      0|    }
  200|       |
  201|       |    // Collect seed entity IDs from seed memories
  202|      0|    let mut seed_entities: Vec<i64> = Vec::with_capacity(seed_memory_ids.len());
  203|      0|    for &mem_id in seed_memory_ids {
  204|      0|        let mut stmt =
  205|      0|            conn.prepare_cached("SELECT entity_id FROM memory_entities WHERE memory_id = ?1")?;
  206|      0|        let ids: Vec<i64> = stmt
  207|      0|            .query_map(params![mem_id], |r| r.get(0))?
  208|      0|            .filter_map(|r| r.ok())
  209|      0|            .collect();
  210|      0|        seed_entities.extend(ids);
  211|       |    }
  212|      0|    seed_entities.sort_unstable();
  213|      0|    seed_entities.dedup();
  214|       |
  215|      0|    if seed_entities.is_empty() {
  216|      0|        return Ok(vec![]);
  217|      0|    }
  218|       |
  219|       |    // BFS over relationships, tracking depth per entity
  220|       |    use std::collections::HashMap;
  221|      0|    let mut entity_depth: HashMap<i64, u32> = seed_entities.iter().map(|&id| (id, 0)).collect();
  222|      0|    let mut frontier: Vec<i64> = seed_entities.to_vec();
  223|       |
  224|      0|    for hop in 1..=max_hops {
  225|      0|        if frontier.is_empty() {
  226|      0|            break;
  227|      0|        }
  228|      0|        let mut next_frontier = Vec::with_capacity(frontier.len() * 2);
  229|       |
  230|      0|        for &entity_id in &frontier {
  231|       |            // Fetch neighbours ordered by weight DESC to support capping.
  232|      0|            let mut stmt = conn.prepare_cached(
  233|      0|                "SELECT target_id, weight FROM relationships
  234|      0|                 WHERE source_id = ?1 AND weight >= ?2 AND namespace = ?3
  235|      0|                 ORDER BY weight DESC",
  236|      0|            )?;
  237|      0|            let mut neighbors: Vec<i64> = stmt
  238|      0|                .query_map(params![entity_id, min_weight, namespace], |r| {
  239|      0|                    Ok((r.get::<_, i64>(0)?, r.get::<_, f64>(1)?))
  240|      0|                })?
  241|      0|                .filter_map(|r| r.ok())
  242|      0|                .filter(|(id, _)| !entity_depth.contains_key(id))
  243|      0|                .map(|(id, _)| id)
  244|      0|                .collect();
  245|       |
  246|       |            // Apply the optional neighbour cap to this entity's expansion.
  247|      0|            if let Some(cap) = max_neighbors_per_hop {
  248|      0|                neighbors.truncate(cap);
  249|      0|            }
  250|       |
  251|      0|            for id in neighbors {
  252|      0|                entity_depth.insert(id, hop);
  253|      0|                next_frontier.push(id);
  254|      0|            }
  255|       |        }
  256|      0|        frontier = next_frontier;
  257|       |    }
  258|       |
  259|       |    // Find memories connected to traversed entities (excluding seeds), preserving hop depth
  260|      0|    let seed_set: std::collections::HashSet<i64> = seed_memory_ids.iter().copied().collect();
  261|      0|    let seed_entity_set: std::collections::HashSet<i64> = seed_entities.iter().copied().collect();
  262|       |
  263|      0|    let mut result: Vec<(i64, u32)> = Vec::with_capacity(entity_depth.len());
  264|      0|    let mut seen_memories: std::collections::HashSet<i64> =
  265|      0|        std::collections::HashSet::with_capacity(entity_depth.len());
  266|       |
  267|      0|    for (&entity_id, &hop) in &entity_depth {
  268|      0|        if seed_entity_set.contains(&entity_id) {
  269|      0|            continue;
  270|      0|        }
  271|      0|        let mut stmt = conn.prepare_cached(
  272|      0|            "SELECT DISTINCT me.memory_id
  273|      0|             FROM memory_entities me
  274|      0|             JOIN memories m ON m.id = me.memory_id
  275|      0|             WHERE me.entity_id = ?1 AND m.deleted_at IS NULL",
  276|      0|        )?;
  277|      0|        let mem_ids: Vec<i64> = stmt
  278|      0|            .query_map(params![entity_id], |r| r.get(0))?
  279|      0|            .filter_map(|r| r.ok())
  280|      0|            .filter(|id| !seed_set.contains(id) && !seen_memories.contains(id))
  281|      0|            .collect();
  282|       |
  283|      0|        for mem_id in mem_ids {
  284|      0|            seen_memories.insert(mem_id);
  285|      0|            result.push((mem_id, hop));
  286|      0|        }
  287|       |    }
  288|       |
  289|      0|    result.sort_unstable_by_key(|&(id, _)| id);
  290|      0|    Ok(result)
  291|      0|}
  292|       |
  293|       |/// Depth map from BFS: entity_id → hop distance from seeds.
  294|       |pub type EntityDepthMap = std::collections::HashMap<i64, u32>;
  295|       |
  296|       |/// Predecessor map from BFS: entity_id → (parent_entity_id, relation_type, edge_weight).
  297|       |///
  298|       |/// Enables path reconstruction from any discovered entity back to a seed.
  299|       |pub type PredecessorMap = std::collections::HashMap<i64, (i64, String, f64)>;
  300|       |
  301|       |/// BFS that also returns a predecessor map for path reconstruction.
  302|       |///
  303|       |/// Used by `deep-research` to reconstruct directed evidence chains from
  304|       |/// discovered entities back to their seeds.
  305|       |///
  306|       |/// Returns `(entity_depth, predecessor)` where:
  307|       |/// - `entity_depth`: depth of each reached entity (0 = seed).
  308|       |/// - `predecessor`: the BFS tree edge that first reached each non-seed entity.
  309|       |///
  310|       |/// # Errors
  311|       |///
  312|       |/// Propagates [`AppError::Database`] (exit 10) on SQLite query failures.
  313|      2|pub fn bfs_with_predecessors(
  314|      2|    conn: &Connection,
  315|      2|    seed_entity_ids: &[i64],
  316|      2|    namespace: &str,
  317|      2|    min_weight: f64,
  318|      2|    max_hops: u32,
  319|      2|    max_neighbors_per_hop: Option<usize>,
  320|      2|) -> Result<(EntityDepthMap, PredecessorMap), AppError> {
  321|       |    use std::collections::HashMap;
  322|       |
  323|      2|    let mut entity_depth: HashMap<i64, u32> = seed_entity_ids.iter().map(|&id| (id, 0)).collect();
  324|      2|    let mut predecessor: HashMap<i64, (i64, String, f64)> =
  325|      2|        HashMap::with_capacity(max_hops as usize * 10);
  326|      2|    let mut frontier: Vec<i64> = seed_entity_ids.to_vec();
  327|       |
  328|      2|    for hop in 1..=max_hops {
  329|      2|        if frontier.is_empty() {
  330|      0|            break;
  331|      2|        }
  332|      2|        let mut next_frontier = Vec::with_capacity(frontier.len() * 2);
  333|       |
  334|      4|        for &entity_id in &frontier {
                           ^2
  335|      2|            let mut stmt = conn.prepare_cached(
  336|      2|                "SELECT target_id, relation, weight FROM relationships
  337|      2|                 WHERE source_id = ?1 AND weight >= ?2 AND namespace = ?3
  338|      2|                 ORDER BY weight DESC",
  339|      0|            )?;
  340|      2|            let mut neighbors: Vec<(i64, String, f64)> = stmt
  341|     10|                .query_map(params![entity_id, min_weight, namespace], |r| {
                               ^2        ^2
  342|       |                    Ok((
  343|     10|                        r.get::<_, i64>(0)?,
                                                        ^0
  344|     10|                        r.get::<_, String>(1)?,
                                                           ^0
  345|     10|                        r.get::<_, f64>(2)?,
                                                        ^0
  346|       |                    ))
  347|     10|                })?
                                ^0
  348|     10|                .filter_map(|r| r.ok())
                               ^2
  349|     10|                .filter(|(id, _, _)| !entity_depth.contains_key(id))
                               ^2
  350|      2|                .collect();
  351|       |
  352|      2|            if let Some(cap) = max_neighbors_per_hop {
                                      ^1
  353|      1|                neighbors.truncate(cap);
  354|      1|            }
  355|       |
  356|      9|            for (id, relation, weight) in neighbors {
                               ^7  ^7        ^7
  357|      7|                entity_depth.insert(id, hop);
  358|      7|                predecessor.insert(id, (entity_id, relation, weight));
  359|      7|                next_frontier.push(id);
  360|      7|            }
  361|       |        }
  362|      2|        frontier = next_frontier;
  363|       |    }
  364|       |
  365|      2|    Ok((entity_depth, predecessor))
  366|      2|}
  367|       |
  368|       |#[cfg(test)]
  369|       |mod tests {
  370|       |    use super::*;
  371|       |    use rusqlite::Connection;
  372|       |
  373|     16|    fn setup_db() -> Connection {
  374|     16|        let conn = Connection::open_in_memory().unwrap();
  375|     16|        conn.execute_batch(
  376|     16|            "CREATE TABLE memories (
  377|     16|                id INTEGER PRIMARY KEY,
  378|     16|                namespace TEXT NOT NULL,
  379|     16|                deleted_at TEXT
  380|     16|            );
  381|     16|            CREATE TABLE memory_entities (
  382|     16|                memory_id INTEGER NOT NULL,
  383|     16|                entity_id INTEGER NOT NULL
  384|     16|            );
  385|     16|            CREATE TABLE relationships (
  386|     16|                source_id INTEGER NOT NULL,
  387|     16|                target_id INTEGER NOT NULL,
  388|     16|                weight REAL NOT NULL,
  389|     16|                namespace TEXT NOT NULL
  390|     16|            );",
  391|       |        )
  392|     16|        .unwrap();
  393|     16|        conn
  394|     16|    }
  395|       |
  396|     31|    fn insert_memory(conn: &Connection, id: i64, namespace: &str, deleted: bool) {
  397|     31|        conn.execute(
  398|     31|            "INSERT INTO memories (id, namespace, deleted_at) VALUES (?1, ?2, ?3)",
  399|     31|            params![
  400|       |                id,
  401|       |                namespace,
  402|     31|                if deleted { Some("2024-01-01") } else { None }
                                           ^1                          ^30
  403|       |            ],
  404|       |        )
  405|     31|        .unwrap();
  406|     31|    }
  407|       |
  408|     31|    fn link_memory_entity(conn: &Connection, memory_id: i64, entity_id: i64) {
  409|     31|        conn.execute(
  410|     31|            "INSERT INTO memory_entities (memory_id, entity_id) VALUES (?1, ?2)",
  411|     31|            params![memory_id, entity_id],
  412|       |        )
  413|     31|        .unwrap();
  414|     31|    }
  415|       |
  416|     18|    fn insert_relationship(conn: &Connection, src: i64, tgt: i64, weight: f64, ns: &str) {
  417|     18|        conn.execute(
  418|     18|            "INSERT INTO relationships (source_id, target_id, weight, namespace) VALUES (?1, ?2, ?3, ?4)",
  419|     18|            params![src, tgt, weight, ns],
  420|       |        )
  421|     18|        .unwrap();
  422|     18|    }
  423|       |
  424|       |    // --- edge cases returning empty ---
  425|       |
  426|       |    #[test]
  427|      1|    fn returns_empty_when_seeds_empty() {
  428|      1|        let conn = setup_db();
  429|      1|        let result = traverse_from_memories(&conn, &[], "ns", 0.5, 3).unwrap();
  430|      1|        assert!(result.is_empty());
  431|      1|    }
  432|       |
  433|       |    #[test]
  434|      1|    fn returns_empty_when_max_hops_zero() {
  435|      1|        let conn = setup_db();
  436|      1|        insert_memory(&conn, 1, "ns", false);
  437|      1|        link_memory_entity(&conn, 1, 10);
  438|      1|        let result = traverse_from_memories(&conn, &[1], "ns", 0.5, 0).unwrap();
  439|      1|        assert!(result.is_empty());
  440|      1|    }
  441|       |
  442|       |    #[test]
  443|      1|    fn returns_empty_when_seed_has_no_entities() {
  444|      1|        let conn = setup_db();
  445|      1|        insert_memory(&conn, 1, "ns", false);
  446|       |        // memory exists but has no associated entities
  447|      1|        let result = traverse_from_memories(&conn, &[1], "ns", 0.5, 3).unwrap();
  448|      1|        assert!(result.is_empty());
  449|      1|    }
  450|       |
  451|       |    #[test]
  452|      1|    fn returns_empty_when_no_relationships() {
  453|      1|        let conn = setup_db();
  454|      1|        insert_memory(&conn, 1, "ns", false);
  455|      1|        link_memory_entity(&conn, 1, 10);
  456|       |        // entity 10 has no relationships
  457|      1|        let result = traverse_from_memories(&conn, &[1], "ns", 0.5, 3).unwrap();
  458|      1|        assert!(result.is_empty());
  459|      1|    }
  460|       |
  461|       |    // --- basic happy path ---
  462|       |
  463|       |    #[test]
  464|      1|    fn traversal_basic_one_hop() {
  465|      1|        let conn = setup_db();
  466|       |
  467|       |        // seed: memory 1 with entity 10
  468|      1|        insert_memory(&conn, 1, "ns", false);
  469|      1|        link_memory_entity(&conn, 1, 10);
  470|       |
  471|       |        // neighbour: entity 20 linked to memory 2
  472|      1|        insert_memory(&conn, 2, "ns", false);
  473|      1|        link_memory_entity(&conn, 2, 20);
  474|       |
  475|       |        // relationship 10 -> 20
  476|      1|        insert_relationship(&conn, 10, 20, 1.0, "ns");
  477|       |
  478|      1|        let result = traverse_from_memories(&conn, &[1], "ns", 0.5, 1).unwrap();
  479|      1|        assert_eq!(result, vec![2]);
  480|      1|    }
  481|       |
  482|       |    #[test]
  483|      1|    fn traversal_two_hops() {
  484|      1|        let conn = setup_db();
  485|       |
  486|      1|        insert_memory(&conn, 1, "ns", false);
  487|      1|        link_memory_entity(&conn, 1, 10);
  488|       |
  489|      1|        insert_memory(&conn, 2, "ns", false);
  490|      1|        link_memory_entity(&conn, 2, 20);
  491|       |
  492|      1|        insert_memory(&conn, 3, "ns", false);
  493|      1|        link_memory_entity(&conn, 3, 30);
  494|       |
  495|       |        // chain 10 -> 20 -> 30
  496|      1|        insert_relationship(&conn, 10, 20, 1.0, "ns");
  497|      1|        insert_relationship(&conn, 20, 30, 1.0, "ns");
  498|       |
  499|      1|        let mut result = traverse_from_memories(&conn, &[1], "ns", 0.5, 2).unwrap();
  500|      1|        result.sort_unstable();
  501|      1|        assert_eq!(result, vec![2, 3]);
  502|      1|    }
  503|       |
  504|       |    #[test]
  505|      1|    fn max_hops_limits_depth() {
  506|      1|        let conn = setup_db();
  507|       |
  508|      1|        insert_memory(&conn, 1, "ns", false);
  509|      1|        link_memory_entity(&conn, 1, 10);
  510|       |
  511|      1|        insert_memory(&conn, 2, "ns", false);
  512|      1|        link_memory_entity(&conn, 2, 20);
  513|       |
  514|      1|        insert_memory(&conn, 3, "ns", false);
  515|      1|        link_memory_entity(&conn, 3, 30);
  516|       |
  517|      1|        insert_relationship(&conn, 10, 20, 1.0, "ns");
  518|      1|        insert_relationship(&conn, 20, 30, 1.0, "ns");
  519|       |
  520|       |        // with only 1 hop, memory 3 must not appear
  521|      1|        let result = traverse_from_memories(&conn, &[1], "ns", 0.5, 1).unwrap();
  522|      1|        assert_eq!(result, vec![2]);
  523|      1|        assert!(!result.contains(&3));
  524|      1|    }
  525|       |
  526|       |    // --- weight filter ---
  527|       |
  528|       |    #[test]
  529|      1|    fn relationship_with_weight_below_min_ignored() {
  530|      1|        let conn = setup_db();
  531|       |
  532|      1|        insert_memory(&conn, 1, "ns", false);
  533|      1|        link_memory_entity(&conn, 1, 10);
  534|       |
  535|      1|        insert_memory(&conn, 2, "ns", false);
  536|      1|        link_memory_entity(&conn, 2, 20);
  537|       |
  538|       |        // weight 0.3 < min_weight 0.5
  539|      1|        insert_relationship(&conn, 10, 20, 0.3, "ns");
  540|       |
  541|      1|        let result = traverse_from_memories(&conn, &[1], "ns", 0.5, 3).unwrap();
  542|      1|        assert!(result.is_empty());
  543|      1|    }
  544|       |
  545|       |    #[test]
  546|      1|    fn relationship_with_weight_exactly_at_min_included() {
  547|      1|        let conn = setup_db();
  548|       |
  549|      1|        insert_memory(&conn, 1, "ns", false);
  550|      1|        link_memory_entity(&conn, 1, 10);
  551|       |
  552|      1|        insert_memory(&conn, 2, "ns", false);
  553|      1|        link_memory_entity(&conn, 2, 20);
  554|       |
  555|      1|        insert_relationship(&conn, 10, 20, 0.5, "ns");
  556|       |
  557|      1|        let result = traverse_from_memories(&conn, &[1], "ns", 0.5, 1).unwrap();
  558|      1|        assert_eq!(result, vec![2]);
  559|      1|    }
  560|       |
  561|       |    // --- namespace isolation ---
  562|       |
  563|       |    #[test]
  564|      1|    fn relationship_from_different_namespace_ignored() {
  565|      1|        let conn = setup_db();
  566|       |
  567|      1|        insert_memory(&conn, 1, "ns_a", false);
  568|      1|        link_memory_entity(&conn, 1, 10);
  569|       |
  570|      1|        insert_memory(&conn, 2, "ns_a", false);
  571|      1|        link_memory_entity(&conn, 2, 20);
  572|       |
  573|       |        // relationship in the wrong namespace
  574|      1|        insert_relationship(&conn, 10, 20, 1.0, "ns_b");
  575|       |
  576|      1|        let result = traverse_from_memories(&conn, &[1], "ns_a", 0.5, 3).unwrap();
  577|      1|        assert!(result.is_empty());
  578|      1|    }
  579|       |
  580|       |    // --- exclude seeds from result ---
  581|       |
  582|       |    #[test]
  583|      1|    fn seeds_do_not_appear_in_result() {
  584|      1|        let conn = setup_db();
  585|       |
  586|      1|        insert_memory(&conn, 1, "ns", false);
  587|      1|        link_memory_entity(&conn, 1, 10);
  588|       |
  589|      1|        insert_memory(&conn, 2, "ns", false);
  590|      1|        link_memory_entity(&conn, 2, 20);
  591|       |
  592|       |        // relationship from 20 back to 10 (cycle)
  593|      1|        insert_relationship(&conn, 10, 20, 1.0, "ns");
  594|      1|        insert_relationship(&conn, 20, 10, 1.0, "ns");
  595|       |
  596|      1|        let result = traverse_from_memories(&conn, &[1], "ns", 0.5, 3).unwrap();
  597|       |        // memory 1 must not appear even with a cycle
  598|      1|        assert!(!result.contains(&1));
  599|      1|        assert_eq!(result, vec![2]);
  600|      1|    }
  601|       |
  602|       |    // --- soft-deleted memories excluded ---
  603|       |
  604|       |    #[test]
  605|      1|    fn deleted_memories_not_included() {
  606|      1|        let conn = setup_db();
  607|       |
  608|      1|        insert_memory(&conn, 1, "ns", false);
  609|      1|        link_memory_entity(&conn, 1, 10);
  610|       |
  611|       |        // memory 2 was deleted
  612|      1|        insert_memory(&conn, 2, "ns", true);
  613|      1|        link_memory_entity(&conn, 2, 20);
  614|       |
  615|      1|        insert_relationship(&conn, 10, 20, 1.0, "ns");
  616|       |
  617|      1|        let result = traverse_from_memories(&conn, &[1], "ns", 0.5, 3).unwrap();
  618|      1|        assert!(result.is_empty());
  619|      1|    }
  620|       |
  621|       |    // --- multiple seeds ---
  622|       |
  623|       |    #[test]
  624|      1|    fn multiple_seeds_merged_in_result() {
  625|      1|        let conn = setup_db();
  626|       |
  627|      1|        insert_memory(&conn, 1, "ns", false);
  628|      1|        link_memory_entity(&conn, 1, 10);
  629|       |
  630|      1|        insert_memory(&conn, 2, "ns", false);
  631|      1|        link_memory_entity(&conn, 2, 20);
  632|       |
  633|      1|        insert_memory(&conn, 3, "ns", false);
  634|      1|        link_memory_entity(&conn, 3, 30);
  635|       |
  636|      1|        insert_memory(&conn, 4, "ns", false);
  637|      1|        link_memory_entity(&conn, 4, 40);
  638|       |
  639|      1|        insert_relationship(&conn, 10, 30, 1.0, "ns");
  640|      1|        insert_relationship(&conn, 20, 40, 1.0, "ns");
  641|       |
  642|      1|        let mut result = traverse_from_memories(&conn, &[1, 2], "ns", 0.5, 1).unwrap();
  643|      1|        result.sort_unstable();
  644|      1|        assert_eq!(result, vec![3, 4]);
  645|      1|    }
  646|       |
  647|       |    // --- result deduplication ---
  648|       |
  649|       |    #[test]
  650|      1|    fn result_without_duplicates() {
  651|      1|        let conn = setup_db();
  652|       |
  653|      1|        insert_memory(&conn, 1, "ns", false);
  654|      1|        link_memory_entity(&conn, 1, 10);
  655|      1|        link_memory_entity(&conn, 1, 11); // two seed entities on the same memory
  656|       |
  657|      1|        insert_memory(&conn, 2, "ns", false);
  658|      1|        link_memory_entity(&conn, 2, 20);
  659|       |
  660|       |        // both seed entities point to the same entity 20
  661|      1|        insert_relationship(&conn, 10, 20, 1.0, "ns");
  662|      1|        insert_relationship(&conn, 11, 20, 1.0, "ns");
  663|       |
  664|      1|        let result = traverse_from_memories(&conn, &[1], "ns", 0.5, 1).unwrap();
  665|       |        // memory 2 must appear only once
  666|      1|        assert_eq!(result.len(), 1);
  667|      1|        assert_eq!(result, vec![2]);
  668|      1|    }
  669|       |
  670|       |    // --- single node ---
  671|       |
  672|       |    #[test]
  673|      1|    fn single_node_without_neighbors_returns_empty() {
  674|      1|        let conn = setup_db();
  675|       |
  676|      1|        insert_memory(&conn, 1, "ns", false);
  677|      1|        link_memory_entity(&conn, 1, 10);
  678|       |        // entity 10 has no outgoing relationships
  679|       |
  680|      1|        let result = traverse_from_memories(&conn, &[1], "ns", 0.5, 5).unwrap();
  681|      1|        assert!(result.is_empty());
  682|      1|    }
  683|       |
  684|       |    // --- cycles in the graph ---
  685|       |
  686|       |    #[test]
  687|      1|    fn cycle_does_not_cause_infinite_loop() {
  688|      1|        let conn = setup_db();
  689|       |
  690|      1|        insert_memory(&conn, 1, "ns", false);
  691|      1|        link_memory_entity(&conn, 1, 10);
  692|       |
  693|      1|        insert_memory(&conn, 2, "ns", false);
  694|      1|        link_memory_entity(&conn, 2, 20);
  695|       |
  696|      1|        insert_memory(&conn, 3, "ns", false);
  697|      1|        link_memory_entity(&conn, 3, 30);
  698|       |
  699|       |        // triangle 10 -> 20 -> 30 -> 10
  700|      1|        insert_relationship(&conn, 10, 20, 1.0, "ns");
  701|      1|        insert_relationship(&conn, 20, 30, 1.0, "ns");
  702|      1|        insert_relationship(&conn, 30, 10, 1.0, "ns");
  703|       |
  704|      1|        let mut result = traverse_from_memories(&conn, &[1], "ns", 0.5, 10).unwrap();
  705|      1|        result.sort_unstable();
  706|       |        // must return 2 and 3 without looping forever
  707|      1|        assert_eq!(result, vec![2, 3]);
  708|      1|    }
  709|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/i18n.rs:
    1|       |//! Bilingual human-readable message layer.
    2|       |//!
    3|       |//! The CLI uses `--lang en|pt` (global flag) or `SQLITE_GRAPHRAG_LANG` (env var) to choose
    4|       |//! the language of stderr progress messages. JSON stdout is deterministic and identical
    5|       |//! across languages — only strings intended for humans pass through this module.
    6|       |//!
    7|       |//! Detection (highest to lowest priority):
    8|       |//! 1. Explicit `--lang` flag
    9|       |//! 2. Env var `SQLITE_GRAPHRAG_LANG`
   10|       |//! 3. POSIX locale env vars (`LC_ALL` > `LC_MESSAGES` > `LANG`), then the native OS locale; a `pt` prefix selects Portuguese
   11|       |//! 4. Fallback `English`
   12|       |
   13|       |use std::sync::OnceLock;
   14|       |
   15|       |#[derive(Copy, Clone, Debug, PartialEq, Eq, clap::ValueEnum)]
   16|       |pub enum Language {
   17|       |    #[value(name = "en", aliases = ["english", "EN"])]
   18|       |    English,
   19|       |    #[value(name = "pt", aliases = ["portugues", "portuguese", "pt-BR", "pt-br", "PT"])]
   20|       |    Portuguese,
   21|       |}
   22|       |
   23|       |impl Language {
   24|       |    /// Parses a command-line string into a `Language` without relying on clap.
   25|       |    /// Accepts the same aliases defined in `#[value(...)]`: "en", "pt", etc.
   26|      0|    pub fn from_str_opt(s: &str) -> Option<Self> {
   27|      0|        match s.to_lowercase().as_str() {
   28|      0|            "en" | "english" => Some(Language::English),
   29|      0|            "pt" | "pt-br" | "portugues" | "portuguese" => Some(Language::Portuguese),
   30|      0|            _ => None,
   31|       |        }
   32|      0|    }
   33|       |
   34|      8|    pub fn from_env_or_locale() -> Self {
   35|       |        // Priority 1: explicit SQLITE_GRAPHRAG_LANG env var (highest precedence).
   36|       |        // Empty string treated as unset per POSIX convention.
   37|      8|        if let Ok(v) = std::env::var("SQLITE_GRAPHRAG_LANG") {
                                ^3
   38|      3|            if !v.is_empty() {
   39|      3|                let lower = v.to_lowercase();
   40|      3|                if lower.starts_with("pt") {
   41|      3|                    return Language::Portuguese;
   42|      0|                }
   43|      0|                if lower.starts_with("en") {
   44|      0|                    return Language::English;
   45|      0|                }
   46|      0|                tracing::warn!(target: "i18n",
   47|       |                    value = %v,
   48|      0|                    "SQLITE_GRAPHRAG_LANG value not recognized, falling back to locale detection"
   49|       |                );
   50|      0|            }
   51|      5|        }
   52|       |        // Priority 2: POSIX locale precedence LC_ALL > LC_MESSAGES > LANG.
   53|       |        // We read these via std::env (not via sys_locale) because:
   54|       |        // (a) `sys_locale::get_locale()` calls into native OS APIs (CFLocaleCopyCurrent
   55|       |        //     on macOS, GetUserDefaultLocaleName on Windows) which cache the
   56|       |        //     system locale and IGNORE env vars set at runtime by tests;
   57|       |        // (b) POSIX specifies LC_ALL > LC_MESSAGES > LANG ordering and an
   58|       |        //     unrecognised LC_ALL value must stop iteration (fall back to
   59|       |        //     English default).
   60|      7|        for var in ["LC_ALL", "LC_MESSAGES", "LANG"] {
                                            ^5             ^5
   61|      7|            if let Ok(v) = std::env::var(var) {
                                    ^5
   62|      5|                if v.is_empty() {
   63|      0|                    continue;
   64|      5|                }
   65|      5|                let lower = v.to_lowercase();
   66|      5|                if lower.starts_with("pt") {
   67|      2|                    return Language::Portuguese;
   68|      3|                }
   69|      3|                if lower.starts_with("en") {
   70|      1|                    return Language::English;
   71|      2|                }
   72|       |                // Unrecognised value in a higher-precedence variable stops
   73|       |                // iteration per POSIX.1-2017 §8.2.
   74|      2|                if var == "LC_ALL" {
   75|      2|                    return Language::English;
   76|      0|                }
   77|      2|            }
   78|       |        }
   79|       |        // Priority 3: cross-platform locale detection via native OS APIs.
   80|       |        // Only reached when no POSIX env var is set.
   81|      0|        if let Some(locale) = sys_locale::get_locale() {
   82|      0|            let lower = locale.to_lowercase();
   83|      0|            if lower.starts_with("pt") {
   84|      0|                return Language::Portuguese;
   85|      0|            }
   86|      0|            if lower.starts_with("en") {
   87|      0|                return Language::English;
   88|      0|            }
   89|      0|        }
   90|      0|        Language::English
   91|      8|    }
   92|       |}
   93|       |
   94|       |static GLOBAL_LANGUAGE: OnceLock<Language> = OnceLock::new();
   95|       |
   96|       |/// Initializes the global language. Subsequent calls are silently ignored
   97|       |/// (OnceLock semantics) — guaranteeing thread-safety and determinism.
   98|       |///
   99|       |/// v1.0.36 (L6): early-return when already initialized so the env-fallback
  100|       |/// resolver (`from_env_or_locale`) does not run a second time. Without this
  101|       |/// guard, calling `init(None)` after `current()` already populated the
  102|       |/// OnceLock causes `from_env_or_locale` to fire its `tracing::warn!` twice
  103|       |/// for unrecognized `SQLITE_GRAPHRAG_LANG` values.
  104|      0|pub fn init(explicit: Option<Language>) {
  105|      0|    if GLOBAL_LANGUAGE.get().is_some() {
  106|      0|        return;
  107|      0|    }
  108|      0|    let resolved = explicit.unwrap_or_else(Language::from_env_or_locale);
  109|      0|    let _ = GLOBAL_LANGUAGE.set(resolved);
  110|      0|}
  111|       |
  112|       |/// Returns the active language, or fallback English if `init` was never called.
  113|      4|pub fn current() -> Language {
  114|      4|    *GLOBAL_LANGUAGE.get_or_init(Language::from_env_or_locale)
  115|      4|}
  116|       |
  117|       |/// Translates a bilingual message by selecting the active variant.
  118|       |///
  119|       |/// v1.0.36 (M4): inputs are constrained to `&'static str` so the function
  120|       |/// can return one of them directly without `Box::leak`. The previous
  121|       |/// implementation leaked one allocation per call which accumulated in
  122|       |/// long-running pipelines; this version is allocation-free. All in-tree
  123|       |/// callers already pass string literals, which are `&'static str`.
  124|      0|pub fn tr(en: &'static str, pt: &'static str) -> &'static str {
  125|      0|    match current() {
  126|      0|        Language::English => en,
  127|      0|        Language::Portuguese => pt,
  128|       |    }
  129|      0|}
  130|       |
  131|       |/// Progress message emitted after pruning relationships.
  132|       |///
  133|       |/// English-only: this string is emitted to stderr as a progress notice and
  134|       |/// does not vary by language because the prune-relations command targets
  135|       |/// agent-first pipelines where deterministic output matters.
  136|      0|pub fn relations_pruned(count: usize, relation: &str, namespace: &str) -> String {
  137|      0|    format!("pruned {count} '{relation}' relationships in namespace '{namespace}'")
  138|      0|}
  139|       |
  140|       |/// Progress message for dry-run preview of prune-relations.
  141|       |///
  142|       |/// English-only: emitted to stderr as a progress notice.
  143|      0|pub fn prune_dry_run(count: usize, relation: &str) -> String {
  144|      0|    format!("dry run: {count} '{relation}' relationships would be removed")
  145|      0|}
  146|       |
  147|       |/// Warning message when --yes is not passed for destructive prune-relations.
  148|       |///
  149|       |/// English-only: emitted to stderr as a progress notice.
  150|      0|pub fn prune_requires_yes() -> String {
  151|      0|    "destructive operation requires --yes flag; use --dry-run to preview".to_string()
  152|      0|}
  153|       |
  154|       |/// Localized prefix for error messages displayed to the end user.
  155|      0|pub fn error_prefix() -> &'static str {
  156|      0|    match current() {
  157|      0|        Language::English => "Error",
  158|      0|        Language::Portuguese => "Erro",
  159|       |    }
  160|      0|}
  161|       |
  162|       |/// Error messages for `AppError` variants — always English.
  163|       |///
  164|       |/// These strings end up inside `AppError` inner fields and may appear in
  165|       |/// deterministic JSON stdout (e.g. ingest NDJSON). Portuguese translations
  166|       |/// for stderr live in `pub mod app_error_pt` and are applied by
  167|       |/// `localized_message_for(Language::Portuguese)`.
  168|       |pub mod errors_msg {
  169|      0|    pub fn memory_not_found(nome: &str, namespace: &str) -> String {
  170|      0|        format!("memory '{nome}' not found in namespace '{namespace}'")
  171|      0|    }
  172|       |
  173|      0|    pub fn memory_or_entity_not_found(name: &str, namespace: &str) -> String {
  174|      0|        format!("memory or entity '{name}' not found in namespace '{namespace}'")
  175|      0|    }
  176|       |
  177|      0|    pub fn database_not_found(path: &str) -> String {
  178|      0|        format!("database not found at {path}. Run 'sqlite-graphrag init' first.")
  179|      0|    }
  180|       |
  181|      0|    pub fn entity_not_found(nome: &str, namespace: &str) -> String {
  182|      0|        format!("entity \"{nome}\" does not exist in namespace \"{namespace}\"")
  183|      0|    }
  184|       |
  185|      0|    pub fn relationship_not_found(de: &str, rel: &str, para: &str, namespace: &str) -> String {
  186|      0|        format!(
  187|      0|            "relationship \"{de}\" --[{rel}]--> \"{para}\" does not exist in namespace \"{namespace}\""
  188|       |        )
  189|      0|    }
  190|       |
  191|      0|    pub fn duplicate_memory(nome: &str, namespace: &str) -> String {
  192|      0|        format!(
  193|      0|            "memory '{nome}' already exists in namespace '{namespace}'. Use --force-merge to update."
  194|       |        )
  195|      0|    }
  196|       |
  197|      0|    pub fn duplicate_memory_soft_deleted(name: &str, namespace: &str) -> String {
  198|      0|        format!(
  199|      0|            "memory '{name}' exists but is soft-deleted in namespace '{namespace}'; \
  200|      0|             use --force-merge to restore and update, or `restore` to revive it"
  201|       |        )
  202|      0|    }
  203|       |
  204|      0|    pub fn optimistic_lock_conflict(expected: i64, current_ts: i64) -> String {
  205|      0|        format!(
  206|      0|            "optimistic lock conflict: expected updated_at={expected}, but current is {current_ts}"
  207|       |        )
  208|      0|    }
  209|       |
  210|      0|    pub fn version_not_found(versao: i64, nome: &str) -> String {
  211|      0|        format!("version {versao} not found for memory '{nome}'")
  212|      0|    }
  213|       |
  214|      0|    pub fn no_recall_results(max_distance: f32, query: &str, namespace: &str) -> String {
  215|      0|        format!(
  216|      0|            "no results within --max-distance {max_distance} for query '{query}' in namespace '{namespace}'"
  217|       |        )
  218|      0|    }
  219|       |
  220|      0|    pub fn soft_deleted_memory_not_found(nome: &str, namespace: &str) -> String {
  221|      0|        format!("soft-deleted memory '{nome}' not found in namespace '{namespace}'")
  222|      0|    }
  223|       |
  224|      0|    pub fn concurrent_process_conflict() -> String {
  225|      0|        "optimistic lock conflict: memory was modified by another process".to_string()
  226|      0|    }
  227|       |
  228|      0|    pub fn entity_limit_exceeded(max: usize) -> String {
  229|      0|        format!("entities exceed limit of {max}")
  230|      0|    }
  231|       |
  232|      0|    pub fn relationship_limit_exceeded(max: usize) -> String {
  233|      0|        format!("relationships exceed limit of {max}")
  234|      0|    }
  235|       |}
  236|       |
  237|       |/// Localized validation messages for memory fields.
  238|       |pub mod validation {
  239|       |    use super::current;
  240|       |    use crate::i18n::Language;
  241|       |
  242|      1|    pub fn name_length(max: usize) -> String {
  243|      1|        match current() {
  244|      0|            Language::English => format!("name must be 1-{max} chars"),
  245|      1|            Language::Portuguese => format!("nome deve ter entre 1 e {max} caracteres"),
  246|       |        }
  247|      1|    }
  248|       |
  249|      1|    pub fn reserved_name() -> String {
  250|      1|        match current() {
  251|       |            Language::English => {
  252|      0|                "names and namespaces starting with __ are reserved for internal use".to_string()
  253|       |            }
  254|       |            Language::Portuguese => {
  255|      1|                "nomes e namespaces iniciados com __ são reservados para uso interno".to_string()
  256|       |            }
  257|       |        }
  258|      1|    }
  259|       |
  260|      0|    pub fn name_kebab(nome: &str) -> String {
  261|      0|        match current() {
  262|      0|            Language::English => format!(
  263|      0|                "name must be kebab-case slug (lowercase letters, digits, hyphens): '{nome}'"
  264|       |            ),
  265|       |            Language::Portuguese => {
  266|      0|                format!("nome deve estar em kebab-case (minúsculas, dígitos, hífens): '{nome}'")
  267|       |            }
  268|       |        }
  269|      0|    }
  270|       |
  271|      0|    pub fn description_exceeds(max: usize) -> String {
  272|      0|        match current() {
  273|      0|            Language::English => format!("description must be <= {max} chars"),
  274|      0|            Language::Portuguese => format!("descrição deve ter no máximo {max} caracteres"),
  275|       |        }
  276|      0|    }
  277|       |
  278|      0|    pub fn body_exceeds(max: usize) -> String {
  279|      0|        match current() {
  280|      0|            Language::English => format!("body exceeds {max} bytes"),
  281|      0|            Language::Portuguese => format!("corpo excede {max} bytes"),
  282|       |        }
  283|      0|    }
  284|       |
  285|      0|    pub fn new_name_length(max: usize) -> String {
  286|      0|        match current() {
  287|      0|            Language::English => format!("new-name must be 1-{max} chars"),
  288|      0|            Language::Portuguese => format!("novo nome deve ter entre 1 e {max} caracteres"),
  289|       |        }
  290|      0|    }
  291|       |
  292|      0|    pub fn new_name_kebab(nome: &str) -> String {
  293|      0|        match current() {
  294|      0|            Language::English => format!(
  295|      0|                "new-name must be kebab-case slug (lowercase letters, digits, hyphens): '{nome}'"
  296|       |            ),
  297|      0|            Language::Portuguese => format!(
  298|      0|                "novo nome deve estar em kebab-case (minúsculas, dígitos, hífens): '{nome}'"
  299|       |            ),
  300|       |        }
  301|      0|    }
  302|       |
  303|      0|    pub fn namespace_length() -> String {
  304|      0|        match current() {
  305|      0|            Language::English => "namespace must be 1-80 chars".to_string(),
  306|      0|            Language::Portuguese => "namespace deve ter entre 1 e 80 caracteres".to_string(),
  307|       |        }
  308|      0|    }
  309|       |
  310|      0|    pub fn namespace_format() -> String {
  311|      0|        match current() {
  312|      0|            Language::English => "namespace must be alphanumeric + hyphens/underscores".to_string(),
  313|       |            Language::Portuguese => {
  314|      0|                "namespace deve ser alfanumérico com hífens/sublinhados".to_string()
  315|       |            }
  316|       |        }
  317|      0|    }
  318|       |
  319|      1|    pub fn path_traversal(p: &str) -> String {
  320|      1|        match current() {
  321|      0|            Language::English => format!("path traversal rejected: {p}"),
  322|      1|            Language::Portuguese => format!("traversal de caminho rejeitado: {p}"),
  323|       |        }
  324|      1|    }
  325|       |
  326|      1|    pub fn invalid_tz(v: &str) -> String {
  327|      1|        match current() {
  328|      0|            Language::English => format!(
  329|      0|                "SQLITE_GRAPHRAG_DISPLAY_TZ invalid: '{v}'; use an IANA name like 'America/Sao_Paulo'"
  330|       |            ),
  331|      1|            Language::Portuguese => format!(
  332|      1|                "SQLITE_GRAPHRAG_DISPLAY_TZ inválido: '{v}'; use um nome IANA como 'America/Sao_Paulo'"
  333|       |            ),
  334|       |        }
  335|      1|    }
  336|       |
  337|      0|    pub fn empty_query() -> String {
  338|      0|        match current() {
  339|      0|            Language::English => "query cannot be empty".to_string(),
  340|      0|            Language::Portuguese => "a consulta não pode estar vazia".to_string(),
  341|       |        }
  342|      0|    }
  343|       |
  344|      0|    pub fn empty_body() -> String {
  345|      0|        match current() {
  346|      0|            Language::English => "body cannot be empty: provide --body, --body-file, or --body-stdin with content, or supply a graph via --entities-file/--graph-stdin".to_string(),
  347|      0|            Language::Portuguese => "o corpo não pode estar vazio: forneça --body, --body-file ou --body-stdin com conteúdo, ou um grafo via --entities-file/--graph-stdin".to_string(),
  348|       |        }
  349|      0|    }
  350|       |
  351|      0|    pub fn invalid_namespace_config(path: &str, err: &str) -> String {
  352|      0|        match current() {
  353|       |            Language::English => {
  354|      0|                format!("invalid project namespace config '{path}': {err}")
  355|       |            }
  356|       |            Language::Portuguese => {
  357|      0|                format!("configuração de namespace de projeto inválida '{path}': {err}")
  358|       |            }
  359|       |        }
  360|      0|    }
  361|       |
  362|      0|    pub fn invalid_projects_mapping(path: &str, err: &str) -> String {
  363|      0|        match current() {
  364|      0|            Language::English => format!("invalid projects mapping '{path}': {err}"),
  365|      0|            Language::Portuguese => format!("mapeamento de projetos inválido '{path}': {err}"),
  366|       |        }
  367|      0|    }
  368|       |
  369|      0|    pub fn self_referential_link() -> String {
  370|      0|        match current() {
  371|      0|            Language::English => "--from and --to must be different entities — self-referential relationships are not supported".to_string(),
  372|      0|            Language::Portuguese => "--from e --to devem ser entidades diferentes — relacionamentos auto-referenciais não são suportados".to_string(),
  373|       |        }
  374|      0|    }
  375|       |
  376|      0|    pub fn invalid_link_weight(weight: f64) -> String {
  377|      0|        match current() {
  378|       |            Language::English => {
  379|      0|                format!("--weight: must be between 0.0 and 1.0 (actual: {weight})")
  380|       |            }
  381|       |            Language::Portuguese => {
  382|      0|                format!("--weight: deve estar entre 0.0 e 1.0 (atual: {weight})")
  383|       |            }
  384|       |        }
  385|      0|    }
  386|       |
  387|      0|    pub fn sync_destination_equals_source() -> String {
  388|      0|        match current() {
  389|       |            Language::English => {
  390|      0|                "destination path must differ from the source database path".to_string()
  391|       |            }
  392|       |            Language::Portuguese => {
  393|      0|                "caminho de destino deve ser diferente do caminho do banco de dados fonte"
  394|      0|                    .to_string()
  395|       |            }
  396|       |        }
  397|      0|    }
  398|       |
  399|       |    /// Portuguese translations for `AppError` Display messages.
  400|       |    ///
  401|       |    /// Each helper mirrors a single `AppError` variant's `#[error(...)]` text in
  402|       |    /// Portuguese, keeping the language barrier confined to this module. The
  403|       |    /// English source of truth lives in `src/errors.rs` via `thiserror`.
  404|       |    pub mod app_error_pt {
  405|      2|        pub fn validation(msg: &str) -> String {
  406|      2|            format!("erro de validação: {msg}")
  407|      2|        }
  408|       |
  409|      3|        pub fn duplicate(msg: &str) -> String {
  410|      3|            let translated = msg
  411|      3|                .replace("already exists in namespace", "já existe no namespace")
  412|      3|                .replace(
  413|      3|                    "exists but is soft-deleted in namespace",
  414|      3|                    "existe mas está excluída temporariamente no namespace",
  415|      3|                )
  416|      3|                .replace(
  417|      3|                    "Use --force-merge to update.",
  418|      3|                    "Use --force-merge para atualizar.",
  419|      3|                )
  420|      3|                .replace(
  421|      3|                    "use --force-merge to restore and update, or `restore` to revive it",
  422|      3|                    "use --force-merge para restaurar e atualizar, ou `restore` para revivê-la",
  423|      3|                )
  424|      3|                .replace("memory", "memória");
  425|      3|            format!("duplicata detectada: {translated}")
  426|      3|        }
  427|       |
  428|      2|        pub fn conflict(msg: &str) -> String {
  429|      2|            let translated = msg
  430|      2|                .replace("optimistic lock conflict", "conflito de lock otimista")
  431|      2|                .replace("but current is", "mas atual é")
  432|      2|                .replace(
  433|      2|                    "was modified by another process",
  434|      2|                    "foi modificada por outro processo",
  435|      2|                );
  436|      2|            format!("conflito: {translated}")
  437|      2|        }
  438|       |
  439|      4|        pub fn not_found(msg: &str) -> String {
  440|      4|            let translated = msg
  441|      4|                .replace("not found in namespace", "não encontrada no namespace")
  442|      4|                .replace("not found for memory", "não encontrada para memória")
  443|      4|                .replace("does not exist in namespace", "não existe no namespace")
  444|      4|                .replace("memory or entity", "memória ou entidade")
  445|      4|                .replace("memory", "memória")
  446|      4|                .replace("entity", "entidade")
  447|      4|                .replace("version", "versão")
  448|      4|                .replace("soft-deleted", "excluída temporariamente");
  449|      4|            format!("não encontrado: {translated}")
  450|      4|        }
  451|       |
  452|      2|        pub fn namespace_error(msg: &str) -> String {
  453|      2|            format!("namespace não resolvido: {msg}")
  454|      2|        }
  455|       |
  456|      2|        pub fn limit_exceeded(msg: &str) -> String {
  457|      2|            let translated = msg
  458|      2|                .replace("exceeds limit of", "excede limite de")
  459|      2|                .replace("body exceeds", "corpo excede")
  460|      2|                .replace("entities exceed limit", "entidades excedem limite")
  461|      2|                .replace(
  462|      2|                    "relationships exceed limit",
  463|      2|                    "relacionamentos excedem limite",
  464|      2|                );
  465|      2|            format!("limite excedido: {translated}")
  466|      2|        }
  467|       |
  468|      0|        pub fn database(err: &str) -> String {
  469|      0|            format!("erro de banco de dados: {err}")
  470|      0|        }
  471|       |
  472|      2|        pub fn embedding(msg: &str) -> String {
  473|      2|            format!("erro de embedding: {msg}")
  474|      2|        }
  475|       |
  476|      2|        pub fn vec_extension(msg: &str) -> String {
  477|      2|            format!("extensão sqlite-vec falhou: {msg}")
  478|      2|        }
  479|       |
  480|      2|        pub fn db_busy(msg: &str) -> String {
  481|      2|            format!("banco ocupado: {msg}")
  482|      2|        }
  483|       |
  484|      2|        pub fn batch_partial_failure(total: usize, failed: usize) -> String {
  485|      2|            format!("falha parcial em batch: {failed} de {total} itens falharam")
  486|      2|        }
  487|       |
  488|      0|        pub fn io(err: &str) -> String {
  489|      0|            format!("erro de I/O: {err}")
  490|      0|        }
  491|       |
  492|      0|        pub fn internal(err: &str) -> String {
  493|      0|            format!("erro interno: {err}")
  494|      0|        }
  495|       |
  496|      0|        pub fn json(err: &str) -> String {
  497|      0|            format!("erro de JSON: {err}")
  498|      0|        }
  499|       |
  500|      2|        pub fn lock_busy(msg: &str) -> String {
  501|      2|            format!("lock ocupado: {msg}")
  502|      2|        }
  503|       |
  504|      2|        pub fn all_slots_full(max: usize, waited_secs: u64) -> String {
  505|      2|            format!(
  506|      2|                "todos os {max} slots de concorrência ocupados após aguardar {waited_secs}s \
  507|      2|                 (exit 75); use --max-concurrency ou aguarde outras invocações terminarem"
  508|       |            )
  509|      2|        }
  510|       |
  511|      0|        pub fn job_singleton_locked(job_type: &str, namespace: &str) -> String {
  512|      0|            format!(
  513|      0|                "job {job_type} para o namespace '{namespace}' já está em execução (exit 75); \
  514|      0|                 aguarde a conclusão ou passe --wait-job-singleton <SEGUNDOS>"
  515|       |            )
  516|      0|        }
  517|       |
  518|      2|        pub fn low_memory(available_mb: u64, required_mb: u64) -> String {
  519|      2|            format!(
  520|      2|                "memória disponível ({available_mb}MB) abaixo do mínimo requerido ({required_mb}MB) \
  521|      2|                 para carregar o modelo; aborte outras cargas ou use --skip-memory-guard (exit 77)"
  522|       |            )
  523|      2|        }
  524|       |
  525|      2|        pub fn binary_not_found(name: &str) -> String {
  526|      2|            format!("binário não encontrado: {name} — instale e adicione ao PATH")
  527|      2|        }
  528|       |
  529|      2|        pub fn rate_limited(detail: &str) -> String {
  530|      2|            format!("taxa de requisição excedida: {detail}")
  531|      2|        }
  532|       |
  533|      2|        pub fn timeout(operation: &str, secs: u64) -> String {
  534|      2|            format!("timeout após {secs}s: {operation}")
  535|      2|        }
  536|       |    }
  537|       |
  538|       |    /// Portuguese translations for runtime startup messages emitted from `main.rs`.
  539|       |    ///
  540|       |    /// These mirror the English text supplied alongside each call to
  541|       |    /// `output::emit_progress_i18n` / `output::emit_error_i18n`, keeping the PT
  542|       |    /// strings confined to this module per the language policy.
  543|       |    pub mod runtime_pt {
  544|      0|        pub fn embedding_heavy_must_measure_ram() -> String {
  545|      0|            "comando intensivo em embedding precisa medir RAM disponível".to_string()
  546|      0|        }
  547|       |
  548|      0|        pub fn heavy_command_detected(available_mb: u64, safe_concurrency: usize) -> String {
  549|      0|            format!(
  550|      0|                "Comando pesado detectado; memória disponível: {available_mb} MB; \
  551|      0|                 concorrência segura: {safe_concurrency}"
  552|       |            )
  553|      0|        }
  554|       |
  555|      0|        pub fn reducing_concurrency(
  556|      0|            requested_concurrency: usize,
  557|      0|            effective_concurrency: usize,
  558|      0|        ) -> String {
  559|      0|            format!(
  560|      0|                "Reduzindo a concorrência solicitada de {requested_concurrency} para \
  561|      0|                 {effective_concurrency} para evitar oversubscription de memória"
  562|       |            )
  563|      0|        }
  564|       |
  565|      0|        pub fn initializing_embedding_model() -> &'static str {
  566|      0|            "Inicializando modelo de embedding (pode baixar na primeira execução)..."
  567|      0|        }
  568|       |
  569|      0|        pub fn embedding_chunks_serially(count: usize) -> String {
  570|      0|            format!("Embedando {count} chunks serialmente para manter memória limitada...")
  571|      0|        }
  572|       |
  573|      0|        pub fn remember_step_input_validated(available_mb: u64) -> String {
  574|      0|            format!("Etapa remember: entrada validada; memória disponível {available_mb} MB")
  575|      0|        }
  576|       |
  577|      0|        pub fn remember_step_chunking_completed(
  578|      0|            total_passage_tokens: usize,
  579|      0|            model_max_length: usize,
  580|      0|            chunks_count: usize,
  581|      0|            rss_mb: u64,
  582|      0|        ) -> String {
  583|      0|            format!(
  584|      0|                "Etapa remember: tokenizer contou {total_passage_tokens} tokens de passagem \
  585|      0|                 (máximo do modelo {model_max_length}); chunking gerou {chunks_count} chunks; \
  586|      0|                 RSS do processo {rss_mb} MB"
  587|       |            )
  588|      0|        }
  589|       |
  590|      0|        pub fn remember_step_embeddings_completed(rss_mb: u64) -> String {
  591|      0|            format!("Etapa remember: embeddings dos chunks concluídos; RSS do processo {rss_mb} MB")
  592|      0|        }
  593|       |
  594|      0|        pub fn restore_recomputing_embedding() -> &'static str {
  595|      0|            "Recalculando embedding da memória restaurada..."
  596|      0|        }
  597|       |
  598|      0|        pub fn edit_recomputing_embedding() -> &'static str {
  599|      0|            "Recalculando embedding da memória editada..."
  600|      0|        }
  601|       |    }
  602|       |}
  603|       |
  604|       |#[cfg(test)]
  605|       |mod tests {
  606|       |    use super::*;
  607|       |    use serial_test::serial;
  608|       |
  609|       |    #[test]
  610|       |    #[serial]
  611|      1|    fn fallback_english_when_env_absent() {
  612|      1|        std::env::remove_var("SQLITE_GRAPHRAG_LANG");
  613|      1|        std::env::set_var("LC_ALL", "C");
  614|      1|        std::env::set_var("LANG", "C");
  615|      1|        assert_eq!(Language::from_env_or_locale(), Language::English);
  616|      1|        std::env::remove_var("LC_ALL");
  617|      1|        std::env::remove_var("LANG");
  618|       |    }
  619|       |
  620|       |    #[test]
  621|       |    #[serial]
  622|      1|    fn env_pt_selects_portuguese() {
  623|      1|        std::env::remove_var("LC_ALL");
  624|      1|        std::env::remove_var("LANG");
  625|      1|        std::env::set_var("SQLITE_GRAPHRAG_LANG", "pt");
  626|      1|        assert_eq!(Language::from_env_or_locale(), Language::Portuguese);
  627|      1|        std::env::remove_var("SQLITE_GRAPHRAG_LANG");
  628|       |    }
  629|       |
  630|       |    #[test]
  631|       |    #[serial]
  632|      1|    fn env_pt_br_selects_portuguese() {
  633|      1|        std::env::remove_var("LC_ALL");
  634|      1|        std::env::remove_var("LANG");
  635|      1|        std::env::set_var("SQLITE_GRAPHRAG_LANG", "pt-BR");
  636|      1|        assert_eq!(Language::from_env_or_locale(), Language::Portuguese);
  637|      1|        std::env::remove_var("SQLITE_GRAPHRAG_LANG");
  638|       |    }
  639|       |
  640|       |    #[test]
  641|       |    #[serial]
  642|      1|    fn locale_ptbr_utf8_selects_portuguese() {
  643|      1|        std::env::remove_var("SQLITE_GRAPHRAG_LANG");
  644|      1|        std::env::set_var("LC_ALL", "pt_BR.UTF-8");
  645|      1|        assert_eq!(Language::from_env_or_locale(), Language::Portuguese);
  646|      1|        std::env::remove_var("LC_ALL");
  647|       |    }
  648|       |
  649|       |    #[test]
  650|       |    #[serial]
  651|      1|    fn posix_precedence_lc_all_overrides_lang() {
  652|      1|        std::env::remove_var("SQLITE_GRAPHRAG_LANG");
  653|      1|        std::env::remove_var("LC_MESSAGES");
  654|      1|        std::env::set_var("LC_ALL", "en_US.UTF-8");
  655|      1|        std::env::set_var("LANG", "pt_BR.UTF-8");
  656|      1|        assert_eq!(
  657|      1|            Language::from_env_or_locale(),
  658|       |            Language::English,
  659|      0|            "LC_ALL=en_US must override LANG=pt_BR per POSIX"
  660|       |        );
  661|      1|        std::env::remove_var("LC_ALL");
  662|      1|        std::env::remove_var("LANG");
  663|       |    }
  664|       |
  665|       |    #[test]
  666|       |    #[serial]
  667|      1|    fn posix_precedence_lc_all_unrecognized_stops_iteration() {
  668|      1|        std::env::remove_var("SQLITE_GRAPHRAG_LANG");
  669|      1|        std::env::remove_var("LC_MESSAGES");
  670|      1|        std::env::set_var("LC_ALL", "ja_JP.UTF-8");
  671|      1|        std::env::set_var("LANG", "pt_BR.UTF-8");
  672|      1|        assert_eq!(
  673|      1|            Language::from_env_or_locale(),
  674|       |            Language::English,
  675|      0|            "LC_ALL=ja_JP set must stop iteration; falls back to English default"
  676|       |        );
  677|      1|        std::env::remove_var("LC_ALL");
  678|      1|        std::env::remove_var("LANG");
  679|       |    }
  680|       |
  681|       |    #[test]
  682|       |    #[serial]
  683|      1|    fn lang_pt_selects_portuguese_when_lc_all_unset() {
  684|      1|        std::env::remove_var("SQLITE_GRAPHRAG_LANG");
  685|      1|        std::env::remove_var("LC_ALL");
  686|      1|        std::env::remove_var("LC_MESSAGES");
  687|      1|        std::env::set_var("LANG", "pt_BR.UTF-8");
  688|      1|        assert_eq!(Language::from_env_or_locale(), Language::Portuguese);
  689|      1|        std::env::remove_var("LANG");
  690|       |    }
  691|       |
  692|       |    mod validation_tests {
  693|       |        use super::*;
  694|       |
  695|       |        #[test]
  696|      1|        fn name_length_en() {
  697|      1|            let msg = match Language::English {
  698|      1|                Language::English => format!("name must be 1-{} chars", 80),
  699|      0|                Language::Portuguese => format!("nome deve ter entre 1 e {} caracteres", 80),
  700|       |            };
  701|      1|            assert!(msg.contains("name must be 1-80 chars"), "obtido: {msg}");
                                                                           ^0
  702|      1|        }
  703|       |
  704|       |        #[test]
  705|      1|        fn name_length_pt() {
  706|      1|            let msg = match Language::Portuguese {
  707|      0|                Language::English => format!("name must be 1-{} chars", 80),
  708|      1|                Language::Portuguese => format!("nome deve ter entre 1 e {} caracteres", 80),
  709|       |            };
  710|      1|            assert!(
  711|      1|                msg.contains("nome deve ter entre 1 e 80 caracteres"),
  712|      0|                "obtido: {msg}"
  713|       |            );
  714|      1|        }
  715|       |
  716|       |        #[test]
  717|      1|        fn name_kebab_en() {
  718|      1|            let nome = "Invalid_Name";
  719|      1|            let msg = match Language::English {
  720|      1|                Language::English => format!(
  721|      1|                    "name must be kebab-case slug (lowercase letters, digits, hyphens): '{nome}'"
  722|       |                ),
  723|       |                Language::Portuguese => {
  724|      0|                    format!("nome deve estar em kebab-case (minúsculas, dígitos, hífens): '{nome}'")
  725|       |                }
  726|       |            };
  727|      1|            assert!(msg.contains("kebab-case slug"), "obtido: {msg}");
                                                                   ^0
  728|      1|            assert!(msg.contains("Invalid_Name"), "obtido: {msg}");
                                                                ^0
  729|      1|        }
  730|       |
  731|       |        #[test]
  732|      1|        fn name_kebab_pt() {
  733|      1|            let nome = "Invalid_Name";
  734|      1|            let msg = match Language::Portuguese {
  735|      0|                Language::English => format!(
  736|      0|                    "name must be kebab-case slug (lowercase letters, digits, hyphens): '{nome}'"
  737|       |                ),
  738|       |                Language::Portuguese => {
  739|      1|                    format!("nome deve estar em kebab-case (minúsculas, dígitos, hífens): '{nome}'")
  740|       |                }
  741|       |            };
  742|      1|            assert!(msg.contains("kebab-case"), "obtido: {msg}");
                                                              ^0
  743|      1|            assert!(msg.contains("minúsculas"), "obtido: {msg}");
                                                               ^0
  744|      1|            assert!(msg.contains("Invalid_Name"), "obtido: {msg}");
                                                                ^0
  745|      1|        }
  746|       |
  747|       |        #[test]
  748|      1|        fn description_exceeds_en() {
  749|      1|            let msg = match Language::English {
  750|      1|                Language::English => format!("description must be <= {} chars", 500),
  751|      0|                Language::Portuguese => format!("descrição deve ter no máximo {} caracteres", 500),
  752|       |            };
  753|      1|            assert!(msg.contains("description must be <= 500"), "obtido: {msg}");
                                                                              ^0
  754|      1|        }
  755|       |
  756|       |        #[test]
  757|      1|        fn description_exceeds_pt() {
  758|      1|            let msg = match Language::Portuguese {
  759|      0|                Language::English => format!("description must be <= {} chars", 500),
  760|      1|                Language::Portuguese => format!("descrição deve ter no máximo {} caracteres", 500),
  761|       |            };
  762|      1|            assert!(
  763|      1|                msg.contains("descrição deve ter no máximo 500"),
  764|      0|                "obtido: {msg}"
  765|       |            );
  766|      1|        }
  767|       |
  768|       |        #[test]
  769|      1|        fn body_exceeds_en() {
  770|      1|            let limite = crate::constants::MAX_MEMORY_BODY_LEN;
  771|      1|            let msg = match Language::English {
  772|      1|                Language::English => format!("body exceeds {limite} bytes"),
  773|      0|                Language::Portuguese => format!("corpo excede {limite} bytes"),
  774|       |            };
  775|      1|            assert!(msg.contains("body exceeds 512000"), "obtido: {msg}");
                                                                       ^0
  776|      1|        }
  777|       |
  778|       |        #[test]
  779|      1|        fn body_exceeds_pt() {
  780|      1|            let limite = crate::constants::MAX_MEMORY_BODY_LEN;
  781|      1|            let msg = match Language::Portuguese {
  782|      0|                Language::English => format!("body exceeds {limite} bytes"),
  783|      1|                Language::Portuguese => format!("corpo excede {limite} bytes"),
  784|       |            };
  785|      1|            assert!(msg.contains("corpo excede 512000"), "obtido: {msg}");
                                                                       ^0
  786|      1|        }
  787|       |
  788|       |        #[test]
  789|      1|        fn new_name_length_en() {
  790|      1|            let msg = match Language::English {
  791|      1|                Language::English => format!("new-name must be 1-{} chars", 80),
  792|      0|                Language::Portuguese => format!("novo nome deve ter entre 1 e {} caracteres", 80),
  793|       |            };
  794|      1|            assert!(msg.contains("new-name must be 1-80"), "obtido: {msg}");
                                                                         ^0
  795|      1|        }
  796|       |
  797|       |        #[test]
  798|      1|        fn new_name_length_pt() {
  799|      1|            let msg = match Language::Portuguese {
  800|      0|                Language::English => format!("new-name must be 1-{} chars", 80),
  801|      1|                Language::Portuguese => format!("novo nome deve ter entre 1 e {} caracteres", 80),
  802|       |            };
  803|      1|            assert!(
  804|      1|                msg.contains("novo nome deve ter entre 1 e 80"),
  805|      0|                "obtido: {msg}"
  806|       |            );
  807|      1|        }
  808|       |
  809|       |        #[test]
  810|      1|        fn new_name_kebab_en() {
  811|      1|            let nome = "Bad Name";
  812|      1|            let msg = match Language::English {
  813|      1|                Language::English => format!(
  814|      1|                    "new-name must be kebab-case slug (lowercase letters, digits, hyphens): '{nome}'"
  815|       |                ),
  816|      0|                Language::Portuguese => format!(
  817|      0|                    "novo nome deve estar em kebab-case (minúsculas, dígitos, hífens): '{nome}'"
  818|       |                ),
  819|       |            };
  820|      1|            assert!(msg.contains("new-name must be kebab-case"), "obtido: {msg}");
                                                                               ^0
  821|      1|        }
  822|       |
  823|       |        #[test]
  824|      1|        fn new_name_kebab_pt() {
  825|      1|            let nome = "Bad Name";
  826|      1|            let msg = match Language::Portuguese {
  827|      0|                Language::English => format!(
  828|      0|                    "new-name must be kebab-case slug (lowercase letters, digits, hyphens): '{nome}'"
  829|       |                ),
  830|      1|                Language::Portuguese => format!(
  831|      1|                    "novo nome deve estar em kebab-case (minúsculas, dígitos, hífens): '{nome}'"
  832|       |                ),
  833|       |            };
  834|      1|            assert!(
  835|      1|                msg.contains("novo nome deve estar em kebab-case"),
  836|      0|                "obtido: {msg}"
  837|       |            );
  838|      1|        }
  839|       |
  840|       |        #[test]
  841|      1|        fn reserved_name_en() {
  842|      1|            let msg = match Language::English {
  843|       |                Language::English => {
  844|      1|                    "names and namespaces starting with __ are reserved for internal use"
  845|      1|                        .to_string()
  846|       |                }
  847|       |                Language::Portuguese => {
  848|      0|                    "nomes e namespaces iniciados com __ são reservados para uso interno"
  849|      0|                        .to_string()
  850|       |                }
  851|       |            };
  852|      1|            assert!(msg.contains("reserved for internal use"), "obtido: {msg}");
                                                                             ^0
  853|      1|        }
  854|       |
  855|       |        #[test]
  856|      1|        fn reserved_name_pt() {
  857|      1|            let msg = match Language::Portuguese {
  858|       |                Language::English => {
  859|      0|                    "names and namespaces starting with __ are reserved for internal use"
  860|      0|                        .to_string()
  861|       |                }
  862|       |                Language::Portuguese => {
  863|      1|                    "nomes e namespaces iniciados com __ são reservados para uso interno"
  864|      1|                        .to_string()
  865|       |                }
  866|       |            };
  867|      1|            assert!(msg.contains("reservados para uso interno"), "obtido: {msg}");
                                                                               ^0
  868|      1|        }
  869|       |    }
  870|       |
  871|       |    mod app_error_pt_translation_tests {
  872|       |        use crate::errors::AppError;
  873|       |
  874|       |        #[test]
  875|      1|        fn localized_message_pt_not_found_fully_translated() {
  876|      1|            let err =
  877|      1|                AppError::NotFound("memory 'test-mem' not found in namespace 'global'".into());
  878|      1|            let pt = err.localized_message_for(crate::i18n::Language::Portuguese);
  879|      1|            assert!(
  880|      1|                pt.contains("memória"),
  881|      0|                "PT must translate 'memory' to 'memória': {pt}"
  882|       |            );
  883|      1|            assert!(
  884|      1|                pt.contains("não encontrada no namespace"),
  885|      0|                "PT must translate full phrase: {pt}"
  886|       |            );
  887|      1|            assert!(
  888|      1|                !pt.contains("not found in namespace"),
  889|      0|                "PT must not contain English phrase: {pt}"
  890|       |            );
  891|      1|        }
  892|       |
  893|       |        #[test]
  894|      1|        fn localized_message_pt_duplicate_fully_translated() {
  895|      1|            let err = AppError::Duplicate(
  896|      1|                "memory 'x' already exists in namespace 'global'. Use --force-merge to update."
  897|      1|                    .into(),
  898|      1|            );
  899|      1|            let pt = err.localized_message_for(crate::i18n::Language::Portuguese);
  900|      1|            assert!(pt.contains("memória"), "PT must translate 'memory': {pt}");
                                                           ^0
  901|      1|            assert!(
  902|      1|                pt.contains("já existe no namespace"),
  903|      0|                "PT must translate 'already exists': {pt}"
  904|       |            );
  905|      1|        }
  906|       |    }
  907|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/lib.rs:
    1|       |//! # sqlite-graphrag
    2|       |//!
    3|       |//! Local GraphRAG memory for LLMs in a single SQLite file — zero external
    4|       |//! services required.
    5|       |//!
    6|       |//! `sqlite-graphrag` is a CLI-first library that persists memories, entities and
    7|       |//! typed relationships inside a single SQLite database. It combines FTS5
    8|       |//! full-text search with `sqlite-vec` KNN over locally-generated embeddings to
    9|       |//! expose a hybrid retrieval ranker tailored for LLM agents.
   10|       |//!
   11|       |//! ## CLI usage
   12|       |//!
   13|       |//! Install and initialize once, then save and recall memories:
   14|       |//!
   15|       |//! ```bash
   16|       |//! cargo install sqlite-graphrag
   17|       |//! sqlite-graphrag init
   18|       |//! sqlite-graphrag remember \
   19|       |//!     --name onboarding-note \
   20|       |//!     --type user \
   21|       |//!     --description "first memory" \
   22|       |//!     --body "hello graphrag"
   23|       |//! sqlite-graphrag recall "graphrag" --k 5
   24|       |//! ```
   25|       |//!
   26|       |//! ## Crate layout
   27|       |//!
   28|       |//! The public modules group the CLI, the SQLite storage layer and the
   29|       |//! supporting primitives (embedder, chunking, graph, namespace detection,
   30|       |//! output, paths and pragmas). The CLI binary wires them together through the
   31|       |//! commands in [`commands`].
   32|       |//!
   33|       |//! ## Exit codes
   34|       |//!
   35|       |//! Errors returned from [`errors::AppError`] map to deterministic exit codes
   36|       |//! suitable for orchestration by shell scripts and LLM agents. Consult the
   37|       |//! README for the full contract.
   38|       |
   39|       |use std::sync::atomic::{AtomicBool, AtomicU8, Ordering};
   40|       |use std::sync::OnceLock;
   41|       |use tokio_util::sync::CancellationToken;
   42|       |
   43|       |/// Signals that a shutdown signal (SIGINT / SIGTERM / SIGHUP) has been received.
   44|       |///
   45|       |/// Set in `main` via `ctrlc::set_handler`. Long-running subcommands can
   46|       |/// poll [`shutdown_requested`] to shut down gracefully before timeout.
   47|       |/// Async code should prefer [`cancel_token`] with `tokio::select!`.
   48|       |pub static SHUTDOWN: AtomicBool = AtomicBool::new(false);
   49|       |
   50|       |/// Counter of shutdown signals received. 0=none, 1=graceful, 2+=forced exit.
   51|       |pub static SIGNAL_COUNT: AtomicU8 = AtomicU8::new(0);
   52|       |
   53|       |/// Signal number that triggered shutdown (2=SIGINT, 15=SIGTERM). 0=none.
   54|       |static SIGNAL_NUMBER: AtomicU8 = AtomicU8::new(0);
   55|       |
   56|       |static CANCEL: OnceLock<CancellationToken> = OnceLock::new();
   57|       |
   58|       |/// Returns the process-wide cancellation token for async graceful shutdown.
   59|       |///
   60|       |/// The token is cancelled by the signal handler alongside [`SHUTDOWN`].
   61|       |/// Async loops should use `token.cancelled().await` inside `tokio::select!`
   62|       |/// for instant wake-up instead of polling [`shutdown_requested`].
   63|      0|pub fn cancel_token() -> &'static CancellationToken {
   64|      0|    CANCEL.get_or_init(CancellationToken::new)
   65|      0|}
   66|       |
   67|       |/// Returns `true` if a shutdown signal has been received since the process started.
   68|       |///
   69|       |/// The value reflects the state of [`SHUTDOWN`]. Without a `ctrlc::set_handler` call,
   70|       |/// the initial state is always `false`.
   71|       |///
   72|       |/// # Examples
   73|       |///
   74|       |/// ```
   75|       |/// use sqlite_graphrag::shutdown_requested;
   76|       |///
   77|       |/// // Under normal startup conditions the signal has not been received.
   78|       |/// assert!(!shutdown_requested());
   79|       |/// ```
   80|       |///
   81|       |/// ```
   82|       |/// use std::sync::atomic::Ordering;
   83|       |/// use sqlite_graphrag::{SHUTDOWN, shutdown_requested};
   84|       |///
   85|       |/// // Simulate receiving a signal and verify that the function reflects the state.
   86|       |/// SHUTDOWN.store(true, Ordering::Release);
   87|       |/// assert!(shutdown_requested());
   88|       |/// // Restore to avoid contaminating other tests.
   89|       |/// SHUTDOWN.store(false, Ordering::Release);
   90|       |/// ```
   91|      0|pub fn shutdown_requested() -> bool {
   92|       |    // ORDERING: Acquire pairs with the Release store in the signal handler (main.rs).
   93|      0|    SHUTDOWN.load(Ordering::Acquire)
   94|      0|}
   95|       |
   96|       |/// Returns the signal number that triggered shutdown (0 if none received).
   97|       |///
   98|       |/// Typically 2 (SIGINT) for Ctrl+C. Used to compute Unix-conventional exit
   99|       |/// code 128+N in the main function.
  100|      0|pub fn shutdown_signal() -> u8 {
  101|      0|    SIGNAL_NUMBER.load(Ordering::Acquire)
  102|      0|}
  103|       |
  104|       |/// Token-aware chunking utilities for bodies that exceed the embedding window.
  105|       |pub mod chunking;
  106|       |
  107|       |/// Hybrid entity extraction: regex pre-filter + GLiNER zero-shot NER (graceful degradation).
  108|       |pub mod extraction;
  109|       |
  110|       |/// `clap` definitions for the top-level `sqlite-graphrag` binary.
  111|       |pub mod cli;
  112|       |
  113|       |/// Subcommand handlers wired into the `clap` tree from [`cli`].
  114|       |pub mod commands;
  115|       |
  116|       |/// Compile-time constants: embedding dimensions, limits and thresholds.
  117|       |pub mod constants;
  118|       |
  119|       |/// Daemon IPC for persistent embedding model reuse across CLI invocations.
  120|       |pub mod daemon;
  121|       |
  122|       |/// Local embedding generation backed by `fastembed`.
  123|       |pub mod embedder;
  124|       |
  125|       |/// Canonical entity type taxonomy: 13 variants, ValueEnum + serde + rusqlite impls.
  126|       |pub mod entity_type;
  127|       |
  128|       |/// Library-wide error type and the mapping to process exit codes (see [`errors::AppError`]).
  129|       |pub mod errors;
  130|       |
  131|       |/// Graph traversal helpers over the entities and relationships tables.
  132|       |pub mod graph;
  133|       |
  134|       |/// Type aliases for AHash-backed collections in hot paths.
  135|       |pub mod hash;
  136|       |
  137|       |/// Bilingual message layer for human-facing stderr progress (`--lang en|pt`, `SQLITE_GRAPHRAG_LANG`).
  138|       |pub mod i18n;
  139|       |
  140|       |/// Counting semaphore via lock files to limit parallel invocations.
  141|       |/// Provides `acquire_cli_slot` (counting semaphore) and the G28-B
  142|       |/// per-namespace heavy-job singleton `acquire_job_singleton` for
  143|       |/// `enrich`, `ingest --mode claude-code`, `ingest --mode codex`.
  144|       |pub mod lock;
  145|       |
  146|       |/// Memory guard: checks RAM availability before loading the ONNX model.
  147|       |pub mod memory_guard;
  148|       |
  149|       |/// Type-safe enumeration of the five `memories.source` CHECK constraint values.
  150|       |/// Replaces the footgun `pub source: String` to prevent G29-style regressions.
  151|       |#[allow(rustdoc::broken_intra_doc_links)]
  152|       |pub mod memory_source;
  153|       |
  154|       |/// Namespace resolution with precedence between flag, environment and markers.
  155|       |pub mod namespace;
  156|       |
  157|       |/// Centralized stdout/stderr emitters for CLI output formatting.
  158|       |pub mod output;
  159|       |
  160|       |/// Dual-format argument parser: accepts Unix epoch and RFC 3339.
  161|       |pub mod parsers;
  162|       |
  163|       |/// G29 Passo 4: preservation checks (Jaccard trigram) for LLM-enriched bodies.
  164|       |pub mod preservation;
  165|       |
  166|       |/// Filesystem paths for the project-local database and app support directories.
  167|       |pub mod paths;
  168|       |
  169|       |/// SQLite pragma helpers applied on every connection.
  170|       |pub mod pragmas;
  171|       |
  172|       |/// Cross-platform signal handling: SIGINT, SIGTERM, SIGHUP.
  173|       |pub mod signals;
  174|       |
  175|       |/// Centralized retry infrastructure with exponential backoff and half-jitter.
  176|       |pub mod retry;
  177|       |
  178|       |/// G28: orphan-process reaper that runs at CLI startup.
  179|       |#[allow(rustdoc::broken_intra_doc_links)]
  180|       |pub mod reaper;
  181|       |
  182|       |/// G28-D: system load average observation (pre-spawn saturation check).
  183|       |pub mod system_load;
  184|       |
  185|       |/// Persistence layer: memories, entities, chunks and version history.
  186|       |pub mod storage;
  187|       |
  188|       |/// Centralized tracing subscriber initialization with panic hook and log bridge.
  189|       |pub mod telemetry;
  190|       |
  191|       |/// Cross-platform terminal initialization: UTF-8 console, ANSI colors, NO_COLOR.
  192|       |pub mod terminal;
  193|       |
  194|       |/// Display time zone for `*_iso` fields (flag `--tz`, env `SQLITE_GRAPHRAG_DISPLAY_TZ`, fallback UTC).
  195|       |pub mod tz;
  196|       |
  197|       |/// Stdin reader with configurable timeout to prevent indefinite blocking.
  198|       |pub mod stdin_helper;
  199|       |
  200|       |/// Real tokenizer of the embedding model for accurate token counting and chunking.
  201|       |pub mod tokenizer;
  202|       |
  203|       |mod embedded_migrations {
  204|       |    use refinery::embed_migrations;
  205|       |    embed_migrations!("migrations");
  206|       |}
  207|       |
  208|       |pub use embedded_migrations::migrations;

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/lock.rs:
    1|       |//! Counting semaphore via lock files to limit parallel CLI invocations.
    2|       |//!
    3|       |//! `acquire_cli_slot` tries to acquire one of `N` available slots by opening the file
    4|       |//! `cli-slot-{N}.lock` in the OS cache directory and obtaining an exclusive `flock`.
    5|       |//! The returned [`std::fs::File`] MUST be kept alive for the entire duration of `main`;
    6|       |//! dropping it releases the slot automatically for the next invocation.
    7|       |//!
    8|       |//! When `wait_seconds` is `Some(n) > 0`, the function polls every
    9|       |//! [`crate::constants::CLI_LOCK_POLL_INTERVAL_MS`] milliseconds until the deadline. When it
   10|       |//! is `None` or `Some(0)`, a single attempt is made and `Err(AppError::AllSlotsFull)` is
   11|       |//! returned immediately if all slots are occupied.
   12|       |//!
   13|       |//! ## Job-type singleton (G28-B, v1.0.68)
   14|       |//!
   15|       |//! Heavy long-running jobs (`enrich`, `ingest --mode claude-code`,
   16|       |//! `ingest --mode codex`) also acquire a *singleton* lock per `(job_type,
   17|       |//! namespace)` via `acquire_job_singleton`.  This guarantees at most one
   18|       |//! heavy job per namespace runs at any time, which was the root cause
   19|       |//! of the 2026-06-03 process-proliferation incident (4 parallel `enrich`
   20|       |//! instances × N workers × 10 MCP servers = ~192 spawned processes).
   21|       |// Workload: I/O-bound (flock polling with exponential backoff sleep)
   22|       |
   23|       |use std::fs::{File, OpenOptions};
   24|       |use std::path::{Path, PathBuf};
   25|       |use std::thread;
   26|       |use std::time::{Duration, Instant};
   27|       |
   28|       |use directories::ProjectDirs;
   29|       |use fs4::fs_std::FileExt;
   30|       |
   31|       |use crate::constants::{
   32|       |    CLI_LOCK_POLL_INTERVAL_MS, JOB_SINGLETON_POLL_INTERVAL_MS, MAX_CONCURRENT_CLI_INSTANCES,
   33|       |};
   34|       |use crate::errors::AppError;
   35|       |
   36|       |/// Job-type classification for `acquire_job_singleton`.
   37|       |///
   38|       |/// `Light` is intentionally NOT a variant here because lightweight
   39|       |/// commands (`recall`, `stats`, `read`, `list`) share the existing
   40|       |/// counting-semaphore in [`acquire_cli_slot`] and do not need a singleton.
   41|       |#[derive(Debug, Clone, Copy, PartialEq, Eq)]
   42|       |pub enum JobType {
   43|       |    /// `enrich` command (LLM-driven entity/relation/body enrichment).
   44|       |    Enrich,
   45|       |    /// `ingest --mode claude-code` (LLM-curated ingestion).
   46|       |    IngestClaudeCode,
   47|       |    /// `ingest --mode codex` (OpenAI Codex CLI ingestion).
   48|       |    IngestCodex,
   49|       |}
   50|       |
   51|       |impl JobType {
   52|       |    /// Returns the kebab-case tag used inside the lock file name.
   53|      8|    fn tag(self) -> &'static str {
   54|      8|        match self {
   55|      6|            JobType::Enrich => "enrich",
   56|      2|            JobType::IngestClaudeCode => "ingest-claude-code",
   57|      0|            JobType::IngestCodex => "ingest-codex",
   58|       |        }
   59|      8|    }
   60|       |}
   61|       |
   62|       |/// Returns the lock file path for the given slot.
   63|       |///
   64|       |/// Honours `SQLITE_GRAPHRAG_CACHE_DIR` when set (useful for tests, containers,
   65|       |/// and NFS caches), falling back to the OS default cache directory via
   66|       |/// `directories::ProjectDirs`. The slot must be 1-based.
   67|      0|fn slot_path(slot: usize) -> Result<PathBuf, AppError> {
   68|      0|    let cache = cache_dir()?;
   69|      0|    std::fs::create_dir_all(&cache)?;
   70|      0|    Ok(cache.join(format!("cli-slot-{slot}.lock")))
   71|      0|}
   72|       |
   73|       |/// Resolves the lock-file directory honouring `SQLITE_GRAPHRAG_CACHE_DIR`.
   74|      7|fn cache_dir() -> Result<PathBuf, AppError> {
   75|      7|    if let Some(override_dir) = std::env::var_os("SQLITE_GRAPHRAG_CACHE_DIR") {
                              ^0
   76|      0|        Ok(PathBuf::from(override_dir))
   77|       |    } else {
   78|      7|        let dirs = ProjectDirs::from("", "", "sqlite-graphrag").ok_or_else(|| {
                                                                                            ^0
   79|      0|            AppError::Io(std::io::Error::new(
   80|      0|                std::io::ErrorKind::NotFound,
   81|      0|                "could not determine cache directory for sqlite-graphrag lock files",
   82|      0|            ))
   83|      0|        })?;
   84|      7|        Ok(dirs.cache_dir().to_path_buf())
   85|       |    }
   86|      7|}
   87|       |
   88|       |/// Computes a short, filesystem-safe hash of the database path so two distinct
   89|       |/// databases (e.g. `/tmp/a.sqlite` and `/tmp/b.sqlite`) get distinct lock
   90|       |/// files in the shared cache directory. First 12 hex chars of BLAKE3 are
   91|       |/// sufficient for collision avoidance across the local filesystem.
   92|     10|pub fn db_path_hash(db_path: &Path) -> String {
   93|     10|    let canonical = db_path
   94|     10|        .canonicalize()
   95|     10|        .unwrap_or_else(|_| db_path.to_path_buf());
   96|     10|    let hash = blake3::hash(canonical.to_string_lossy().as_bytes());
   97|     10|    hash.to_hex().to_string()[..12].to_string()
   98|     10|}
   99|       |
  100|       |/// Returns the singleton lock file path for a given (job_type, namespace, db_hash).
  101|       |///
  102|       |/// Layout: `job-singleton-{tag}-{namespace_slug}-{db_hash}.lock` in the same
  103|       |/// cache dir as the CLI slots. The namespace is sanitised to a filesystem-safe
  104|       |/// slug (lowercase, hyphens, alphanumeric) and defaults to `default` when
  105|       |/// empty. The `db_hash` is the BLAKE3 prefix returned by [`db_path_hash`].
  106|       |///
  107|       |/// G30 (v1.0.69): the previous implementation ignored the database path
  108|       |/// entirely, so two concurrent `enrich` invocations against different
  109|       |/// `graphrag.sqlite` files (production vs. test) collided on the same
  110|       |/// cache-dir lock. The db_hash scope makes the singleton per-database while
  111|       |/// still sharing the same cache dir.
  112|      7|pub fn job_singleton_path(
  113|      7|    job_type: JobType,
  114|      7|    namespace: &str,
  115|      7|    db_hash: &str,
  116|      7|) -> Result<PathBuf, AppError> {
  117|      7|    let cache = cache_dir()?;
                                         ^0
  118|      7|    std::fs::create_dir_all(&cache)?;
                                                 ^0
  119|      7|    let slug = if namespace.is_empty() {
  120|      0|        "default".to_string()
  121|       |    } else {
  122|      7|        namespace
  123|      7|            .chars()
  124|     83|            .map(|c| {
                           ^7
  125|     83|                if c.is_ascii_alphanumeric() || c == '-' || c == '_' {
                                                              ^14         ^2
  126|     81|                    c.to_ascii_lowercase()
  127|       |                } else {
  128|      2|                    '-'
  129|       |                }
  130|     83|            })
  131|      7|            .collect::<String>()
  132|       |    };
  133|      7|    let safe_hash: String = db_hash
  134|      7|        .chars()
  135|     84|        .filter(|c| c.is_ascii_alphanumeric())
                       ^7
  136|      7|        .take(16)
  137|      7|        .collect();
  138|      7|    Ok(cache.join(format!(
  139|      7|        "job-singleton-{}-{slug}-{safe_hash}.lock",
  140|      7|        job_type.tag()
  141|      7|    )))
  142|      7|}
  143|       |
  144|       |/// Tries to open and exclusively lock the lock file for the given slot.
  145|       |///
  146|       |/// Returns `Ok(file)` if the slot is free, or `Err(io::Error)` if it is
  147|       |/// held by another instance (non-blocking).
  148|      0|fn try_acquire_slot(slot: usize) -> Result<File, AppError> {
  149|      0|    let path = slot_path(slot)?;
  150|      0|    let file = OpenOptions::new()
  151|      0|        .read(true)
  152|      0|        .write(true)
  153|      0|        .create(true)
  154|      0|        .truncate(false)
  155|      0|        .open(&path)?;
  156|      0|    file.try_lock_exclusive().map_err(AppError::Io)?;
  157|      0|    Ok(file)
  158|      0|}
  159|       |
  160|       |/// Acquires a concurrency slot from the `max_concurrency`-position semaphore.
  161|       |///
  162|       |/// Iterates slots `1..=max_concurrency` attempting `try_lock_exclusive` on each
  163|       |/// `cli-slot-N.lock` file. When a free slot is found, returns `(File, slot_number)`.
  164|       |/// If all slots are occupied:
  165|       |///
  166|       |/// - If `wait_seconds` is `None` or `Some(0)`, returns immediately with
  167|       |///   `AppError::AllSlotsFull { max, waited_secs: 0 }`.
  168|       |/// - If `wait_seconds` is `Some(n) > 0`, enters a polling loop every
  169|       |///   [`crate::constants::CLI_LOCK_POLL_INTERVAL_MS`] ms until the deadline expires, returning
  170|       |///   `AppError::AllSlotsFull { max, waited_secs: n }` if no slot opens.
  171|       |///
  172|       |/// The returned `File` MUST be kept alive until the process exits; dropping it
  173|       |/// releases the slot automatically via the implicit `flock` on close.
  174|      0|pub fn acquire_cli_slot(
  175|      0|    max_concurrency: usize,
  176|      0|    wait_seconds: Option<u64>,
  177|      0|) -> Result<(File, usize), AppError> {
  178|       |    // G18: use env override or 2*cpus as ceiling instead of hardcoded 4
  179|      0|    let ncpus = std::thread::available_parallelism()
  180|      0|        .map(|n| n.get())
  181|      0|        .unwrap_or(4);
  182|      0|    let ceiling = std::env::var("SQLITE_GRAPHRAG_MAX_CLI_INSTANCES")
  183|      0|        .ok()
  184|      0|        .and_then(|v| v.parse::<usize>().ok())
  185|      0|        .unwrap_or_else(|| (2 * ncpus).max(MAX_CONCURRENT_CLI_INSTANCES));
  186|      0|    let max = max_concurrency.clamp(1, ceiling);
  187|      0|    let wait_secs = wait_seconds.unwrap_or(0);
  188|       |
  189|       |    // Tentativa inicial sem espera.
  190|      0|    if let Some((file, slot)) = try_any_slot(max)? {
  191|      0|        return Ok((file, slot));
  192|      0|    }
  193|       |
  194|      0|    if wait_secs == 0 {
  195|      0|        return Err(AppError::AllSlotsFull {
  196|      0|            max,
  197|      0|            waited_secs: 0,
  198|      0|        });
  199|      0|    }
  200|       |
  201|       |    // Polling loop with progressive backoff until the deadline.
  202|      0|    let deadline = Instant::now() + Duration::from_secs(wait_secs);
  203|      0|    let mut polls: u64 = 0;
  204|       |    loop {
  205|      0|        let poll_delay = CLI_LOCK_POLL_INTERVAL_MS
  206|      0|            .saturating_mul(1 + polls / 4)
  207|      0|            .min(CLI_LOCK_POLL_INTERVAL_MS * 4);
  208|      0|        thread::sleep(Duration::from_millis(poll_delay));
  209|      0|        polls += 1;
  210|      0|        if let Some((file, slot)) = try_any_slot(max)? {
  211|      0|            return Ok((file, slot));
  212|      0|        }
  213|      0|        if Instant::now() >= deadline {
  214|      0|            return Err(AppError::AllSlotsFull {
  215|      0|                max,
  216|      0|                waited_secs: wait_secs,
  217|      0|            });
  218|      0|        }
  219|       |    }
  220|      0|}
  221|       |
  222|       |/// Acquires a process-wide singleton lock for a heavy job type and namespace.
  223|       |///
  224|       |/// G28-B (v1.0.68): ensures at most one `enrich`, `ingest --mode
  225|       |/// claude-code`, or `ingest --mode codex` runs at a time per namespace.
  226|       |/// A second invocation in the same namespace either:
  227|       |///
  228|       |/// - Returns immediately with `AppError::JobSingletonLocked { job_type,
  229|       |///   namespace }` when `wait_seconds` is `None` or `Some(0)`.
  230|       |/// - Polls every [`JOB_SINGLETON_POLL_INTERVAL_MS`] ms until the lock
  231|       |///   drops or the deadline expires, returning the same error on timeout.
  232|       |///
  233|       |/// The returned `File` MUST be kept alive until the process exits;
  234|       |/// dropping it releases the singleton for the next invocation.
  235|      6|pub fn acquire_job_singleton(
  236|      6|    job_type: JobType,
  237|      6|    namespace: &str,
  238|      6|    db_path: &Path,
  239|      6|    wait_seconds: Option<u64>,
  240|      6|    force: bool,
  241|      6|) -> Result<File, AppError> {
  242|      6|    let db_hash = db_path_hash(db_path);
  243|      6|    let path = job_singleton_path(job_type, namespace, &db_hash)?;
                                                                              ^0
  244|       |
  245|       |    // G30+G09: when --force is set, attempt to break a stale lock by
  246|       |    // detecting and removing a pre-existing lock file. This is a last
  247|       |    // resort: only enabled by an explicit operator flag. A real orphan
  248|       |    // lock from a previous crash leaves a 0-byte file behind, which the
  249|       |    // next non-forced caller would still try to lock.
  250|      6|    if force && path.exists() {
                              ^0
  251|      0|        tracing::warn!(target: "lock",
  252|      0|            path = %path.display(),
  253|      0|            "force=true; removing pre-existing singleton lock file"
  254|       |        );
  255|      0|        let _ = std::fs::remove_file(&path);
  256|      6|    }
  257|       |
  258|      6|    let file = OpenOptions::new()
  259|      6|        .read(true)
  260|      6|        .write(true)
  261|      6|        .create(true)
  262|      6|        .truncate(false)
  263|      6|        .open(&path)?;
                                  ^0
  264|      6|    if let Err(e) = file.try_lock_exclusive() {
                             ^1
  265|      1|        if !is_lock_contended(&e) {
  266|      0|            return Err(AppError::Io(e));
  267|      1|        }
  268|       |        // Already held by another instance.
  269|      1|        let wait_secs = wait_seconds.unwrap_or(0);
  270|      1|        if wait_secs == 0 {
  271|      1|            return Err(AppError::JobSingletonLocked {
  272|      1|                job_type: job_type.tag().to_string(),
  273|      1|                namespace: namespace.to_string(),
  274|      1|            });
  275|      0|        }
  276|      0|        let deadline = Instant::now() + Duration::from_secs(wait_secs);
  277|       |        // Drop the failed handle before polling; flock is per-process so we
  278|       |        // re-open each attempt to refresh contention state.
  279|      0|        drop(file);
  280|       |        loop {
  281|      0|            thread::sleep(Duration::from_millis(JOB_SINGLETON_POLL_INTERVAL_MS));
  282|      0|            let file = OpenOptions::new()
  283|      0|                .read(true)
  284|      0|                .write(true)
  285|      0|                .create(true)
  286|      0|                .truncate(false)
  287|      0|                .open(&path)?;
  288|      0|            if file.try_lock_exclusive().is_ok() {
  289|      0|                return Ok(file);
  290|      0|            }
  291|      0|            if Instant::now() >= deadline {
  292|      0|                return Err(AppError::JobSingletonLocked {
  293|      0|                    job_type: job_type.tag().to_string(),
  294|      0|                    namespace: namespace.to_string(),
  295|      0|                });
  296|      0|            }
  297|       |        }
  298|      5|    }
  299|      5|    Ok(file)
  300|      6|}
  301|       |
  302|       |/// Tries to acquire any free slot in `1..=max`, returning the first available one.
  303|       |///
  304|       |/// Returns `Ok(Some((file, slot)))` if a slot was obtained, `Ok(None)` if all are
  305|       |/// occupied (`EWOULDBLOCK`). Propagates I/O errors other than "lock contended".
  306|      0|fn try_any_slot(max: usize) -> Result<Option<(File, usize)>, AppError> {
  307|      0|    for slot in 1..=max {
  308|      0|        match try_acquire_slot(slot) {
  309|      0|            Ok(file) => return Ok(Some((file, slot))),
  310|      0|            Err(AppError::Io(e)) if is_lock_contended(&e) => continue,
  311|      0|            Err(e) => return Err(e),
  312|       |        }
  313|       |    }
  314|      0|    Ok(None)
  315|      0|}
  316|       |
  317|      1|fn is_lock_contended(error: &std::io::Error) -> bool {
  318|      1|    if error.kind() == std::io::ErrorKind::WouldBlock {
  319|      1|        return true;
  320|      0|    }
  321|       |
  322|       |    #[cfg(windows)]
  323|       |    {
  324|       |        matches!(error.raw_os_error(), Some(32 | 33))
  325|       |    }
  326|       |
  327|       |    #[cfg(not(windows))]
  328|       |    {
  329|      0|        false
  330|       |    }
  331|      1|}
  332|       |
  333|       |#[cfg(test)]
  334|       |mod tests {
  335|       |    use super::*;
  336|       |    use std::sync::atomic::{AtomicUsize, Ordering};
  337|       |    static SEQ: AtomicUsize = AtomicUsize::new(0);
  338|       |
  339|      9|    fn unique_ns() -> String {
  340|      9|        let n = SEQ.fetch_add(1, Ordering::SeqCst);
  341|      9|        let pid = std::process::id();
  342|      9|        format!("test-{pid}-{n}")
  343|      9|    }
  344|       |
  345|       |    #[test]
  346|      1|    fn job_singleton_path_sanitises_namespace() {
  347|      1|        let p = job_singleton_path(JobType::Enrich, "Foo Bar/Baz", "abc123def456")
  348|      1|            .expect("path should resolve");
  349|      1|        let name = p.file_name().unwrap().to_string_lossy().to_string();
  350|      1|        assert!(name.contains("enrich"), "got {name}");
                                                       ^0
  351|      1|        assert!(name.contains("foo-bar-baz"), "got {name}");
                                                            ^0
  352|      1|        assert!(
  353|      1|            name.contains("abc123def456"),
  354|      0|            "must embed db_hash: got {name}"
  355|       |        );
  356|      1|    }
  357|       |
  358|       |    #[test]
  359|      1|    fn job_singleton_blocks_second_invocation_same_namespace() {
  360|      1|        let ns = unique_ns();
  361|      1|        let db = std::env::temp_dir().join(format!("test-{}.sqlite", unique_ns()));
  362|      1|        let first = acquire_job_singleton(JobType::Enrich, &ns, &db, Some(0), false)
  363|      1|            .expect("first acquire should succeed");
  364|      1|        let second = acquire_job_singleton(JobType::Enrich, &ns, &db, Some(0), false);
  365|      1|        assert!(
  366|      1|            matches!(second, Err(AppError::JobSingletonLocked { .. })),
                          ^0
  367|      0|            "expected JobSingletonLocked, got {second:?}"
  368|       |        );
  369|      1|        drop(first);
  370|      1|    }
  371|       |
  372|       |    #[test]
  373|      1|    fn job_singleton_allows_different_namespaces() {
  374|      1|        let ns_a = unique_ns();
  375|      1|        let ns_b = unique_ns();
  376|      1|        let db_a = std::env::temp_dir().join(format!("test-a-{}.sqlite", unique_ns()));
  377|      1|        let db_b = std::env::temp_dir().join(format!("test-b-{}.sqlite", unique_ns()));
  378|      1|        let first = acquire_job_singleton(JobType::IngestClaudeCode, &ns_a, &db_a, Some(0), false)
  379|      1|            .expect("ns_a should acquire");
  380|      1|        let second = acquire_job_singleton(JobType::IngestClaudeCode, &ns_b, &db_b, Some(0), false)
  381|      1|            .expect("ns_b should acquire in parallel");
  382|      1|        drop(first);
  383|      1|        drop(second);
  384|      1|    }
  385|       |
  386|       |    #[test]
  387|      1|    fn job_singleton_scoped_by_db_hash() {
  388|       |        // G30: two databases at different paths, same namespace. Both locks
  389|       |        // should succeed because the path-derived db_hash differs.
  390|      1|        let ns = unique_ns();
  391|      1|        let db_a = std::env::temp_dir().join(format!("test-x-{}.sqlite", unique_ns()));
  392|      1|        let db_b = std::env::temp_dir().join(format!("test-y-{}.sqlite", unique_ns()));
  393|      1|        let first = acquire_job_singleton(JobType::Enrich, &ns, &db_a, Some(0), false)
  394|      1|            .expect("db_a should acquire");
  395|      1|        let second = acquire_job_singleton(JobType::Enrich, &ns, &db_b, Some(0), false)
  396|      1|            .expect("db_b should acquire independently (G30 fix)");
  397|      1|        drop(first);
  398|      1|        drop(second);
  399|      1|    }
  400|       |
  401|       |    #[test]
  402|      1|    fn db_path_hash_is_stable_for_same_path() {
  403|      1|        let p = std::env::temp_dir().join("hashing-test.sqlite");
  404|      1|        let h1 = db_path_hash(&p);
  405|      1|        let h2 = db_path_hash(&p);
  406|      1|        assert_eq!(h1, h2, "same path must produce same hash");
                                         ^0
  407|      1|        assert_eq!(h1.len(), 12, "BLAKE3 prefix must be 12 hex chars");
                                               ^0
  408|      1|    }
  409|       |
  410|       |    #[test]
  411|      1|    fn db_path_hash_differs_for_different_paths() {
  412|      1|        let a = std::env::temp_dir().join("hash-a.sqlite");
  413|      1|        let b = std::env::temp_dir().join("hash-b.sqlite");
  414|      1|        assert_ne!(db_path_hash(&a), db_path_hash(&b));
  415|      1|    }
  416|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/memory_guard.rs:
    1|       |//! Memory guard: checks RAM availability before loading the ONNX model.
    2|       |//!
    3|       |//! Loading the model via `fastembed` consumes approximately
    4|       |//! [`crate::constants::EMBEDDING_LOAD_EXPECTED_RSS_MB`] MiB of resident memory.
    5|       |//! Without this guard, multiple parallel invocations can exhaust RAM and trigger
    6|       |//! OOM (Out-Of-Memory), stalling the system.
    7|       |//!
    8|       |//! This guard queries the OS via `sysinfo` before any heavy initialisation,
    9|       |//! aborting with [`crate::errors::AppError::LowMemory`] (exit 77) when the
   10|       |//! configured floor is not met.
   11|       |
   12|       |use sysinfo::{
   13|       |    get_current_pid, MemoryRefreshKind, ProcessRefreshKind, ProcessesToUpdate, RefreshKind, System,
   14|       |    UpdateKind,
   15|       |};
   16|       |
   17|       |use crate::errors::AppError;
   18|       |
   19|       |/// Returns the current available memory in MiB.
   20|      3|pub fn available_memory_mb() -> u64 {
   21|      3|    let sys =
   22|      3|        System::new_with_specifics(RefreshKind::new().with_memory(MemoryRefreshKind::everything()));
   23|      3|    let available_bytes = sys.available_memory();
   24|      3|    available_bytes / (1024 * 1024)
   25|      3|}
   26|       |
   27|       |/// Returns the current process RSS in MiB when available.
   28|      1|pub fn current_process_memory_mb() -> Option<u64> {
   29|      1|    let pid = get_current_pid().ok()?;
                                                  ^0
   30|      1|    let mut sys =
   31|      1|        System::new_with_specifics(RefreshKind::new().with_memory(MemoryRefreshKind::everything()));
   32|      1|    sys.refresh_processes_specifics(
   33|      1|        ProcessesToUpdate::Some(&[pid]),
   34|       |        true,
   35|      1|        ProcessRefreshKind::new()
   36|      1|            .with_memory()
   37|      1|            .with_exe(UpdateKind::OnlyIfNotSet),
   38|       |    );
   39|      1|    sys.process(pid).map(|p| p.memory() / (1024 * 1024))
   40|      1|}
   41|       |
   42|       |/// Calculates the safe concurrency ceiling for heavy embedding workloads.
   43|       |///
   44|       |/// Canonical formula:
   45|       |/// `permits = min(cpus, available_memory_mb / ram_per_task_mb)` (the former `* 0.5` margin was removed)
   46|       |///
   47|       |/// The result is clamped between `1` and `max_concurrency`.
   48|      3|pub fn calculate_safe_concurrency(
   49|      3|    available_mb: u64,
   50|      3|    cpu_count: usize,
   51|      3|    ram_per_task_mb: u64,
   52|      3|    max_concurrency: usize,
   53|      3|) -> usize {
   54|      3|    let cpu_count = cpu_count.max(1);
   55|      3|    let max_concurrency = max_concurrency.max(1);
   56|      3|    let ram_per_task_mb = ram_per_task_mb.max(1);
   57|       |
   58|      3|    let memory_bound = (available_mb / ram_per_task_mb) as usize;
   59|      3|    let resource_bound = cpu_count.min(memory_bound).max(1);
   60|       |    // G18: removed unconditional /2 margin — callers should pass lower ram_per_task_mb
   61|       |    // when daemon is active (model shared) instead of halving the result
   62|      3|    resource_bound.min(max_concurrency)
   63|      3|}
   64|       |
   65|       |/// Checks whether sufficient memory is available to start loading the model.
   66|       |///
   67|       |/// # Parameters
   68|       |/// - `min_mb`: minimum floor in MiB of available memory (typically
   69|       |///   [`crate::constants::MIN_AVAILABLE_MEMORY_MB`]).
   70|       |///
   71|       |/// # Errors
   72|       |/// Returns [`AppError::LowMemory`] when `available_mb < min_mb`.
   73|       |///
   74|       |/// # Returns
   75|       |/// Returns `Ok(available_mb)` with the actual available memory in MiB.
   76|      3|pub fn check_available_memory(min_mb: u64) -> Result<u64, AppError> {
   77|      3|    let available_mb = available_memory_mb();
   78|       |
   79|      3|    if available_mb < min_mb {
   80|      2|        return Err(AppError::LowMemory {
   81|      2|            available_mb,
   82|      2|            required_mb: min_mb,
   83|      2|        });
   84|      1|    }
   85|       |
   86|      1|    Ok(available_mb)
   87|      3|}
   88|       |
   89|       |#[cfg(test)]
   90|       |mod tests {
   91|       |    use super::*;
   92|       |
   93|       |    #[test]
   94|      1|    fn check_available_memory_with_zero_always_passes() {
   95|      1|        let result = check_available_memory(0);
   96|      1|        assert!(result.is_ok(), "min_mb=0 must always pass, got: {result:?}");
                                              ^0
   97|      1|        let mb = result.unwrap();
   98|      1|        assert!(mb > 0, "system must report positive memory");
                                      ^0
   99|      1|    }
  100|       |
  101|       |    #[test]
  102|      1|    fn check_available_memory_with_huge_value_fails() {
  103|      1|        let result = check_available_memory(u64::MAX);
  104|      1|        assert!(
  105|      1|            matches!(result, Err(AppError::LowMemory { .. })),
                          ^0
  106|      0|            "u64::MAX MiB must fail with LowMemory, got: {result:?}"
  107|       |        );
  108|      1|    }
  109|       |
  110|       |    #[test]
  111|      1|    fn low_memory_error_contains_correct_values() {
  112|      1|        match check_available_memory(u64::MAX) {
  113|       |            Err(AppError::LowMemory {
  114|      1|                available_mb,
  115|      1|                required_mb,
  116|       |            }) => {
  117|      1|                assert_eq!(required_mb, u64::MAX);
  118|      1|                assert!(available_mb < u64::MAX);
  119|       |            }
  120|      0|            other => unreachable!("expected LowMemory, got: {other:?}"),
  121|       |        }
  122|      1|    }
  123|       |
  124|       |    #[test]
  125|      1|    fn calculate_safe_concurrency_respects_half_margin() {
  126|      1|        let permits = calculate_safe_concurrency(8_000, 8, 1_000, 4);
  127|      1|        assert_eq!(permits, 4);
  128|      1|    }
  129|       |
  130|       |    #[test]
  131|      1|    fn calculate_safe_concurrency_never_returns_zero() {
  132|      1|        let permits = calculate_safe_concurrency(100, 1, 10_000, 4);
  133|      1|        assert_eq!(permits, 1);
  134|      1|    }
  135|       |
  136|       |    #[test]
  137|      1|    fn calculate_safe_concurrency_respects_max_ceiling() {
  138|      1|        let permits = calculate_safe_concurrency(128_000, 64, 500, 4);
  139|      1|        assert_eq!(permits, 4);
  140|      1|    }
  141|       |
  142|       |    #[test]
  143|      1|    fn current_process_memory_mb_returns_some_value() {
  144|      1|        let rss = current_process_memory_mb();
  145|      1|        assert!(rss.is_some());
  146|      1|    }
  147|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/memory_source.rs:
    1|       |//! Type-safe enumeration of the `memories.source` column domain.
    2|       |//!
    3|       |//! The CHECK constraint on the `memories` table accepts exactly five values:
    4|       |//! `agent`, `user`, `system`, `import`, `sync`. Any other literal is rejected
    5|       |//! at runtime by SQLite with `SQLITE_CONSTRAINT_CHECK`.
    6|       |//!
    7|       |//! This enum eliminates the silent footgun of `pub source: String` by forcing
    8|       |//! every call-site to pick a typed variant that maps deterministically to one
    9|       |//! of the five allowed CHECK values via [`MemorySource::as_str`].
   10|       |//!
   11|       |//! # Examples
   12|       |//!
   13|       |//! ```
   14|       |//! use sqlite_graphrag::memory_source::MemorySource;
   15|       |//!
   16|       |//! let src = MemorySource::Agent;
   17|       |//! assert_eq!(src.as_str(), "agent");
   18|       |//!
   19|       |//! let parsed = MemorySource::try_from("user").expect("user is valid");
   20|       |//! assert_eq!(parsed, MemorySource::User);
   21|       |//!
   22|       |//! let err = MemorySource::try_from("enrich").unwrap_err();
   23|       |//! assert!(format!("{err}").contains("invalid memory source"));
   24|       |//! ```
   25|       |
   26|       |use crate::errors::AppError;
   27|       |use serde::{Deserialize, Serialize};
   28|       |
   29|       |/// Enumerates the five values accepted by the `memories.source` CHECK constraint.
   30|       |///
   31|       |/// Adding a new variant requires:
   32|       |///
   33|       |/// 1. Updating the DDL CHECK constraint in `migrations/V001__init.sql`.
   34|       |/// 2. Running a migration that backfills any pre-existing values
   35|       |///    (`UPDATE memories SET source='agent' WHERE source NOT IN (...)`).
   36|       |/// 3. Bumping [`crate::constants::CURRENT_SCHEMA_VERSION`].
   37|       |#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
   38|       |#[serde(rename_all = "snake_case")]
   39|       |pub enum MemorySource {
   40|       |    /// Mutated by an LLM agent (remember, edit, rename, body-enrich).
   41|       |    Agent,
   42|       |    /// Mutated by a human operator.
   43|       |    User,
   44|       |    /// Mutated by an internal migration or system job.
   45|       |    System,
   46|       |    /// Inserted by bulk import (ingest, ingest --mode claude-code, ingest --mode codex).
   47|       |    Import,
   48|       |    /// Inserted by an external sync job.
   49|       |    Sync,
   50|       |}
   51|       |
   52|       |impl MemorySource {
   53|       |    /// Returns the canonical snake_case string stored in the SQLite column.
   54|       |    ///
   55|       |    /// The returned slice has `'static` lifetime because all five values are
   56|       |    /// ASCII literals known at compile time.
   57|     25|    pub const fn as_str(self) -> &'static str {
   58|     25|        match self {
   59|      5|            Self::Agent => "agent",
   60|      5|            Self::User => "user",
   61|      5|            Self::System => "system",
   62|      5|            Self::Import => "import",
   63|      5|            Self::Sync => "sync",
   64|       |        }
   65|     25|    }
   66|       |
   67|       |    /// Returns every variant as a static slice, useful for error messages and docs.
   68|       |    pub const ALL: &'static [MemorySource] = &[
   69|       |        Self::Agent,
   70|       |        Self::User,
   71|       |        Self::System,
   72|       |        Self::Import,
   73|       |        Self::Sync,
   74|       |    ];
   75|       |}
   76|       |
   77|       |impl std::fmt::Display for MemorySource {
   78|      5|    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
   79|      5|        f.write_str(self.as_str())
   80|      5|    }
   81|       |}
   82|       |
   83|       |/// Parses a stored `memories.source` string back into a typed variant.
   84|       |///
   85|       |/// # Errors
   86|       |///
   87|       |/// Returns [`AppError::Validation`] when the input is not one of the five
   88|       |/// canonical values. The error message lists every accepted value so the
   89|       |/// caller can self-correct without consulting the schema.
   90|       |impl TryFrom<&str> for MemorySource {
   91|       |    type Error = AppError;
   92|       |
   93|      8|    fn try_from(value: &str) -> Result<Self, Self::Error> {
   94|      8|        match value {
   95|      8|            "agent" => Ok(Self::Agent),
                                     ^2
   96|      6|            "user" => Ok(Self::User),
                                    ^1
   97|      5|            "system" => Ok(Self::System),
                                      ^1
   98|      4|            "import" => Ok(Self::Import),
                                      ^1
   99|      3|            "sync" => Ok(Self::Sync),
                                    ^1
  100|      2|            other => Err(AppError::Validation(format!(
  101|      2|                "invalid memory source: {other:?}; expected one of {}",
  102|      2|                Self::ALL
  103|      2|                    .iter()
  104|     10|                    .map(|v| v.as_str())
                                   ^2
  105|      2|                    .collect::<Vec<_>>()
  106|      2|                    .join(", ")
  107|       |            ))),
  108|       |        }
  109|      8|    }
  110|       |}
  111|       |
  112|       |impl TryFrom<String> for MemorySource {
  113|       |    type Error = AppError;
  114|       |
  115|      1|    fn try_from(value: String) -> Result<Self, Self::Error> {
  116|      1|        Self::try_from(value.as_str())
  117|      1|    }
  118|       |}
  119|       |
  120|       |/// Validates a raw `memories.source` string against the CHECK constraint domain.
  121|       |///
  122|       |/// This is the runtime guard for callers that still take `&str` (legacy
  123|       |/// call-sites, FTS rows already in the database, deserialised JSON). The
  124|       |/// function returns the canonical slice on success and an [`AppError::Validation`]
  125|       |/// on failure, with an actionable message listing every accepted value.
  126|       |///
  127|       |/// Use this at every boundary that touches the `source` column:
  128|       |/// `memories::insert`, `memories::update`, and any new code path that
  129|       |/// builds a `NewMemory` from operator-supplied input. It is the safety
  130|       |/// net that prevented the original G29 bug from regressing in v1.0.69
  131|       |/// when the typed [`MemorySource`] enum was still being rolled out.
  132|     43|pub fn validate_source(raw: &str) -> Result<&'static str, AppError> {
  133|     43|    match raw {
  134|     43|        "agent" => Ok("agent"),
  135|      0|        "user" => Ok("user"),
  136|      0|        "system" => Ok("system"),
  137|      0|        "import" => Ok("import"),
  138|      0|        "sync" => Ok("sync"),
  139|      0|        other => Err(AppError::Validation(format!(
  140|      0|            "invalid memory source: {other:?}; expected one of {}",
  141|      0|            MemorySource::ALL
  142|      0|                .iter()
  143|      0|                .map(|v| v.as_str())
  144|      0|                .collect::<Vec<_>>()
  145|      0|                .join(", ")
  146|       |        ))),
  147|       |    }
  148|     43|}
  149|       |
  150|       |#[cfg(test)]
  151|       |mod tests {
  152|       |    use super::*;
  153|       |
  154|       |    #[test]
  155|      1|    fn as_str_returns_canonical_lowercase() {
  156|      1|        assert_eq!(MemorySource::Agent.as_str(), "agent");
  157|      1|        assert_eq!(MemorySource::User.as_str(), "user");
  158|      1|        assert_eq!(MemorySource::System.as_str(), "system");
  159|      1|        assert_eq!(MemorySource::Import.as_str(), "import");
  160|      1|        assert_eq!(MemorySource::Sync.as_str(), "sync");
  161|      1|    }
  162|       |
  163|       |    #[test]
  164|      1|    fn try_from_valid_strings_succeeds() {
  165|      1|        assert_eq!(
  166|      1|            MemorySource::try_from("agent").unwrap(),
  167|       |            MemorySource::Agent
  168|       |        );
  169|      1|        assert_eq!(MemorySource::try_from("user").unwrap(), MemorySource::User);
  170|      1|        assert_eq!(
  171|      1|            MemorySource::try_from("system").unwrap(),
  172|       |            MemorySource::System
  173|       |        );
  174|      1|        assert_eq!(
  175|      1|            MemorySource::try_from("import").unwrap(),
  176|       |            MemorySource::Import
  177|       |        );
  178|      1|        assert_eq!(MemorySource::try_from("sync").unwrap(), MemorySource::Sync);
  179|      1|    }
  180|       |
  181|       |    #[test]
  182|      1|    fn try_from_invalid_string_returns_err() {
  183|       |        // G29 reproducer: "enrich" is the historical bug.
  184|      1|        let err = MemorySource::try_from("enrich").unwrap_err();
  185|      1|        let msg = format!("{err}");
  186|      1|        assert!(msg.contains("invalid memory source"), "got: {msg}");
                                                                     ^0
  187|      1|        assert!(msg.contains("\"enrich\""), "got: {msg}");
                                                          ^0
  188|      1|        assert!(msg.contains("agent"), "must list agent as valid: {msg}");
                                                     ^0
  189|      1|    }
  190|       |
  191|       |    #[test]
  192|      1|    fn try_from_empty_string_returns_err() {
  193|      1|        assert!(MemorySource::try_from("").is_err());
  194|      1|    }
  195|       |
  196|       |    #[test]
  197|      1|    fn try_from_string_owned_works() {
  198|      1|        let src: MemorySource = String::from("agent").try_into().unwrap();
  199|      1|        assert_eq!(src, MemorySource::Agent);
  200|      1|    }
  201|       |
  202|       |    #[test]
  203|      1|    fn display_matches_as_str() {
  204|      6|        for v in MemorySource::ALL {
                          ^5
  205|      5|            assert_eq!(format!("{v}"), v.as_str());
  206|       |        }
  207|      1|    }
  208|       |
  209|       |    #[test]
  210|      1|    fn serialize_round_trip_preserves_variant() {
  211|      1|        let v = MemorySource::Import;
  212|      1|        let json = serde_json::to_string(&v).unwrap();
  213|      1|        assert_eq!(json, "\"import\"");
  214|      1|        let back: MemorySource = serde_json::from_str(&json).unwrap();
  215|      1|        assert_eq!(back, v);
  216|      1|    }
  217|       |
  218|       |    #[test]
  219|      1|    fn all_slice_has_exactly_five_variants() {
  220|      1|        assert_eq!(MemorySource::ALL.len(), 5);
  221|      1|    }
  222|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/namespace.rs:
    1|       |//! Namespace resolution layer (flag > env > "global" fallback).
    2|       |//!
    3|       |//! Validates and resolves the active namespace used to scope all SQLite
    4|       |//! operations, enforcing safe characters and traversal-free names.
    5|       |
    6|       |use crate::errors::AppError;
    7|       |use crate::i18n::validation;
    8|       |use serde::Serialize;
    9|       |use std::path::Path;
   10|       |
   11|       |#[derive(Debug, Clone, Copy, Serialize, PartialEq, Eq)]
   12|       |#[serde(rename_all = "snake_case")]
   13|       |pub enum NamespaceSource {
   14|       |    ExplicitFlag,
   15|       |    Environment,
   16|       |    Default,
   17|       |}
   18|       |
   19|       |#[derive(Debug, Clone, Serialize)]
   20|       |pub struct NamespaceResolution {
   21|       |    pub namespace: String,
   22|       |    pub source: NamespaceSource,
   23|       |    pub cwd: String,
   24|       |}
   25|       |
   26|       |/// Resolves the active namespace, returning only the final name.
   27|       |///
   28|       |/// Shortcut over [`detect_namespace`] when the source does not matter.
   29|       |/// With a valid explicit flag, the returned namespace is exactly the passed value.
   30|       |/// Without a flag, the final fallback is `"global"`.
   31|       |///
   32|       |/// # Errors
   33|       |///
   34|       |/// Returns [`AppError::Validation`] if `explicit` contains invalid characters
   35|       |/// or exceeds 80 characters.
   36|       |///
   37|       |/// # Examples
   38|       |///
   39|       |/// ```
   40|       |/// use sqlite_graphrag::namespace::resolve_namespace;
   41|       |///
   42|       |/// // A valid explicit flag is accepted and reflected in the result.
   43|       |/// let ns = resolve_namespace(Some("meu-projeto")).unwrap();
   44|       |/// assert_eq!(ns, "meu-projeto");
   45|       |/// ```
   46|       |///
   47|       |/// ```
   48|       |/// use sqlite_graphrag::namespace::resolve_namespace;
   49|       |/// use sqlite_graphrag::errors::AppError;
   50|       |///
   51|       |/// // Namespace with invalid characters causes a validation error (exit 1).
   52|       |/// let err = resolve_namespace(Some("ns with space")).unwrap_err();
   53|       |/// assert_eq!(err.exit_code(), 1);
   54|       |/// ```
   55|      1|pub fn resolve_namespace(explicit: Option<&str>) -> Result<String, AppError> {
   56|      1|    Ok(detect_namespace(explicit)?.namespace)
                                               ^0
   57|      1|}
   58|       |
   59|       |/// Resolves the active namespace, returning a struct with the source and current directory.
   60|       |///
   61|       |/// Precedence: explicit flag > `SQLITE_GRAPHRAG_NAMESPACE` > fallback `"global"`.
   62|       |///
   63|       |/// # Errors
   64|       |///
   65|       |/// Returns [`AppError::Validation`] if the flag or env value exceeds 80 bytes or contains invalid characters (or the flag is empty), and [`AppError::Io`] if `std::env::current_dir` fails.
   66|       |///
   67|       |/// # Examples
   68|       |///
   69|       |/// ```
   70|       |/// use sqlite_graphrag::namespace::{detect_namespace, NamespaceSource};
   71|       |///
   72|       |/// // With an explicit flag, the source is `ExplicitFlag`.
   73|       |/// let res = detect_namespace(Some("producao")).unwrap();
   74|       |/// assert_eq!(res.namespace, "producao");
   75|       |/// assert_eq!(res.source, NamespaceSource::ExplicitFlag);
   76|       |/// ```
   77|       |///
   78|       |/// ```
   79|       |/// use sqlite_graphrag::namespace::{detect_namespace, NamespaceSource};
   80|       |///
   81|       |/// // Without any explicit configuration, fallback is "global".
   82|       |/// // Removes env var to guarantee deterministic behaviour.
   83|       |/// std::env::remove_var("SQLITE_GRAPHRAG_NAMESPACE");
   84|       |/// let res = detect_namespace(None).unwrap();
   85|       |/// assert_eq!(res.namespace, "global");
   86|       |/// assert_eq!(res.source, NamespaceSource::Default);
   87|       |/// ```
   88|      4|pub fn detect_namespace(explicit: Option<&str>) -> Result<NamespaceResolution, AppError> {
   89|      4|    let cwd = std::env::current_dir().map_err(AppError::Io)?;
                                                                         ^0
   90|      4|    let cwd_display = normalize_path(&cwd);
   91|       |
   92|      4|    if let Some(ns) = explicit {
                              ^1
   93|      1|        validate_namespace(ns)?;
                                            ^0
   94|      1|        return Ok(NamespaceResolution {
   95|      1|            namespace: ns.to_owned(),
   96|      1|            source: NamespaceSource::ExplicitFlag,
   97|      1|            cwd: cwd_display,
   98|      1|        });
   99|      3|    }
  100|       |
  101|      3|    if let Ok(ns) = std::env::var("SQLITE_GRAPHRAG_NAMESPACE") {
                            ^1
  102|      1|        if !ns.is_empty() {
  103|      1|            validate_namespace(&ns)?;
                                                 ^0
  104|      1|            return Ok(NamespaceResolution {
  105|      1|                namespace: ns,
  106|      1|                source: NamespaceSource::Environment,
  107|      1|                cwd: cwd_display,
  108|      1|            });
  109|      0|        }
  110|      2|    }
  111|       |
  112|      2|    Ok(NamespaceResolution {
  113|      2|        namespace: "global".to_owned(),
  114|      2|        source: NamespaceSource::Default,
  115|      2|        cwd: cwd_display,
  116|      2|    })
  117|      4|}
  118|       |
  119|      2|fn validate_namespace(ns: &str) -> Result<(), AppError> {
  120|      2|    if ns.is_empty() || ns.len() > 80 {
  121|      0|        return Err(AppError::Validation(validation::namespace_length()));
  122|      2|    }
  123|      2|    if !ns
  124|      2|        .chars()
  125|     30|        .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
                       ^2                             ^3          ^0
  126|       |    {
  127|      0|        return Err(AppError::Validation(validation::namespace_format()));
  128|      2|    }
  129|      2|    Ok(())
  130|      2|}
  131|       |
  132|      4|fn normalize_path(path: &Path) -> String {
  133|      4|    path.canonicalize()
  134|      4|        .unwrap_or_else(|_| path.to_path_buf())
                                          ^0   ^0
  135|      4|        .display()
  136|      4|        .to_string()
  137|      4|}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/output.rs:
    1|       |//! Single point of terminal I/O for the CLI (stdout JSON, stderr human).
    2|       |//!
    3|       |//! All user-visible output must go through this module; direct `println!` in
    4|       |//! other modules is forbidden.
    5|       |
    6|       |use crate::errors::AppError;
    7|       |use serde::Serialize;
    8|       |
    9|       |/// Output format variants accepted by `--format` CLI flags.
   10|       |#[derive(Debug, Clone, Copy, clap::ValueEnum, Default)]
   11|       |pub enum OutputFormat {
   12|       |    #[default]
   13|       |    Json,
   14|       |    Text,
   15|       |    Markdown,
   16|       |}
   17|       |
   18|       |/// Restricted JSON-only format for commands that always emit JSON.
   19|       |#[derive(Debug, Clone, Copy, clap::ValueEnum, Default)]
   20|       |pub enum JsonOutputFormat {
   21|       |    #[default]
   22|       |    Json,
   23|       |}
   24|       |
   25|       |/// Serializes `value` as pretty-printed JSON and writes it to stdout with a trailing newline.
   26|       |///
   27|       |/// Flushes stdout after writing. A `BrokenPipe` error is silenced so that
   28|       |/// piping to consumers that close early (e.g. `head`) does not surface an error.
   29|       |///
   30|       |/// # Errors
   31|       |/// Returns `Err` when serialization fails or when a non-`BrokenPipe` I/O error occurs.
   32|       |#[inline]
   33|      3|pub fn emit_json<T: Serialize>(value: &T) -> Result<(), AppError> {
   34|      3|    let json = serde_json::to_string_pretty(value)?;
                      ^2                                        ^1
   35|      2|    let mut out = std::io::stdout().lock();
   36|      2|    if let Err(e) = std::io::Write::write_all(&mut out, json.as_bytes())
                             ^0
   37|      2|        .and_then(|()| std::io::Write::write_all(&mut out, b"\n"))
   38|      2|        .and_then(|()| std::io::Write::flush(&mut out))
   39|       |    {
   40|      0|        if e.kind() == std::io::ErrorKind::BrokenPipe {
   41|      0|            return Ok(());
   42|      0|        }
   43|      0|        return Err(AppError::Io(e));
   44|      2|    }
   45|      2|    Ok(())
   46|      3|}
   47|       |
   48|       |/// Serializes `value` as compact (single-line) JSON and writes it to stdout with a trailing newline.
   49|       |///
   50|       |/// Flushes stdout after writing. A `BrokenPipe` error is silenced.
   51|       |///
   52|       |/// # Errors
   53|       |/// Returns `Err` when serialization fails or when a non-`BrokenPipe` I/O error occurs.
   54|       |#[inline]
   55|      2|pub fn emit_json_compact<T: Serialize>(value: &T) -> Result<(), AppError> {
   56|      2|    let json = serde_json::to_string(value)?;
                      ^1                                 ^1
   57|      1|    let mut out = std::io::stdout().lock();
   58|      1|    if let Err(e) = std::io::Write::write_all(&mut out, json.as_bytes())
                             ^0
   59|      1|        .and_then(|()| std::io::Write::write_all(&mut out, b"\n"))
   60|      1|        .and_then(|()| std::io::Write::flush(&mut out))
   61|       |    {
   62|      0|        if e.kind() == std::io::ErrorKind::BrokenPipe {
   63|      0|            return Ok(());
   64|      0|        }
   65|      0|        return Err(AppError::Io(e));
   66|      1|    }
   67|      1|    Ok(())
   68|      2|}
   69|       |
   70|       |/// Writes compact JSON to stdout, silently ignoring serialization and I/O errors.
   71|       |/// Designed for NDJSON streaming where partial output is acceptable.
   72|       |#[inline]
   73|      0|pub fn emit_json_line<T: Serialize>(value: &T) {
   74|      0|    if let Ok(json) = serde_json::to_string(value) {
   75|      0|        let mut out = std::io::stdout().lock();
   76|      0|        let _ = std::io::Write::write_all(&mut out, json.as_bytes());
   77|      0|        let _ = std::io::Write::write_all(&mut out, b"\n");
   78|      0|        let _ = std::io::Write::flush(&mut out);
   79|      0|    }
   80|      0|}
   81|       |
   82|       |/// Writes `msg` followed by a newline to stdout and flushes.
   83|       |///
   84|       |/// Any I/O error (including `BrokenPipe`) is silently ignored.
   85|       |#[inline]
   86|      1|pub fn emit_text(msg: &str) {
   87|      1|    let mut out = std::io::stdout().lock();
   88|      1|    let _ = std::io::Write::write_all(&mut out, msg.as_bytes())
   89|      1|        .and_then(|()| std::io::Write::write_all(&mut out, b"\n"))
   90|      1|        .and_then(|()| std::io::Write::flush(&mut out));
   91|      1|}
   92|       |
   93|       |/// Logs `msg` as a structured `tracing::info!` event (does not write to stdout).
   94|       |#[inline]
   95|      1|pub fn emit_progress(msg: &str) {
   96|      1|    tracing::info!(target: "output", message = msg);
   97|      1|}
   98|       |
   99|       |/// Emits a bilingual progress message honouring `--lang` or `SQLITE_GRAPHRAG_LANG`.
  100|       |/// Usage: `output::emit_progress_i18n("Computing embedding...", "Calculando embedding...")`.
  101|      0|pub fn emit_progress_i18n(en: &str, pt: &str) {
  102|       |    use crate::i18n::{current, Language};
  103|      0|    match current() {
  104|      0|        Language::English => tracing::info!(target: "output", message = en),
  105|      0|        Language::Portuguese => tracing::info!(target: "output", message = pt),
  106|       |    }
  107|      0|}
  108|       |
  109|       |/// Emits a JSON error envelope to stdout for machine consumers.
  110|       |///
  111|       |/// Ensures the stdout JSON contract is honoured even on error paths:
  112|       |/// `{"error": true, "code": <exit_code>, "message": "<localized_msg>"}`.
  113|       |/// A `BrokenPipe` error is silenced so piping to early-closing consumers
  114|       |/// does not surface a secondary error.
  115|       |#[cold]
  116|       |#[inline(never)]
  117|      0|pub fn emit_error_json(code: i32, message: &str) {
  118|       |    #[derive(serde::Serialize)]
  119|       |    struct ErrorEnvelope<'a> {
  120|       |        error: bool,
  121|       |        code: i32,
  122|       |        message: &'a str,
  123|       |    }
  124|      0|    let envelope = ErrorEnvelope {
  125|      0|        error: true,
  126|      0|        code,
  127|      0|        message,
  128|      0|    };
  129|      0|    if emit_json(&envelope).is_err() {
  130|       |        use std::io::Write;
  131|      0|        let escaped = message.replace('\\', "\\\\").replace('"', "\\\"");
  132|      0|        let _ = writeln!(
  133|      0|            std::io::stdout().lock(),
  134|      0|            r#"{{"error":true,"code":{code},"message":"{escaped}"}}"#
  135|       |        );
  136|      0|    }
  137|      0|}
  138|       |
  139|       |/// Emits a localised error message to stderr with the `Error:`/`Erro:` prefix.
  140|       |///
  141|       |/// Centralises human-readable error output following Pattern 5 (`output.rs` is the
  142|       |/// SOLE I/O point of the CLI). Also records the message as a `tracing::error!`
  143|       |/// event (target `output`) before writing the prefixed line to stderr.
  144|       |#[cold]
  145|       |#[inline(never)]
  146|      0|pub fn emit_error(localized_msg: &str) {
  147|      0|    tracing::error!(target: "output", message = localized_msg);
  148|      0|    eprintln!("{}: {}", crate::i18n::error_prefix(), localized_msg);
  149|      0|}
  150|       |
  151|       |/// Emits a bilingual error to stderr honouring `--lang` or `SQLITE_GRAPHRAG_LANG`.
  152|       |/// Usage: `output::emit_error_i18n("invariant violated", "invariante violado")`.
  153|       |#[cold]
  154|       |#[inline(never)]
  155|      0|pub fn emit_error_i18n(en: &str, pt: &str) {
  156|       |    use crate::i18n::{current, Language};
  157|      0|    let msg = match current() {
  158|      0|        Language::English => en,
  159|      0|        Language::Portuguese => pt,
  160|       |    };
  161|      0|    emit_error(msg);
  162|      0|}
  163|       |
  164|       |/// JSON payload emitted by the `remember` subcommand.
  165|       |///
  166|       |/// All fields are required by the JSON contract (see `docs/schemas/remember.schema.json`).
  167|       |/// `operation` is an alias of `action` for compatibility with clients using the old field name.
  168|       |///
  169|       |/// # Examples
  170|       |///
  171|       |/// ```
  172|       |/// use sqlite_graphrag::output::RememberResponse;
  173|       |///
  174|       |/// let resp = RememberResponse {
  175|       |///     memory_id: 1,
  176|       |///     name: "nota-inicial".into(),
  177|       |///     namespace: "global".into(),
  178|       |///     action: "created".into(),
  179|       |///     operation: "created".into(),
  180|       |///     version: 1,
  181|       |///     entities_persisted: 0,
  182|       |///     relationships_persisted: 0,
  183|       |///     relationships_truncated: false,
  184|       |///     chunks_created: 1,
  185|       |///     chunks_persisted: 0,
  186|       |///     urls_persisted: 0,
  187|       |///     extraction_method: None,
  188|       |///     merged_into_memory_id: None,
  189|       |///     warnings: vec![],
  190|       |///     created_at: 1_700_000_000,
  191|       |///     created_at_iso: "2023-11-14T22:13:20Z".into(),
  192|       |///     elapsed_ms: 42,
  193|       |///     name_was_normalized: false,
  194|       |///     original_name: None,
  195|       |/// };
  196|       |///
  197|       |/// let json = serde_json::to_string(&resp).unwrap();
  198|       |/// assert!(json.contains("\"memory_id\":1"));
  199|       |/// assert!(json.contains("\"elapsed_ms\":42"));
  200|       |/// assert!(json.contains("\"merged_into_memory_id\":null"));
  201|       |/// assert!(json.contains("\"urls_persisted\":0"));
  202|       |/// assert!(json.contains("\"relationships_truncated\":false"));
  203|       |/// ```
  204|       |#[derive(Serialize)]
  205|       |pub struct RememberResponse {
  206|       |    pub memory_id: i64,
  207|       |    pub name: String,
  208|       |    pub namespace: String,
  209|       |    pub action: String,
  210|       |    /// Semantic alias of `action` for compatibility with the contract documented in SKILL.md.
  211|       |    pub operation: String,
  212|       |    pub version: i64,
  213|       |    pub entities_persisted: usize,
  214|       |    pub relationships_persisted: usize,
  215|       |    /// True when the relationship builder hit the cap before covering all entity pairs.
  216|       |    /// Callers can use this to decide whether to increase GRAPHRAG_MAX_RELATIONSHIPS_PER_MEMORY.
  217|       |    pub relationships_truncated: bool,
  218|       |    /// Total number of chunks the body was split into BEFORE dedup.
  219|       |    ///
  220|       |    /// For single-chunk bodies this equals 1 even though no row is added to
  221|       |    /// the `memory_chunks` table — the memory row itself acts as the chunk.
  222|       |    /// Use `chunks_persisted` to know how many rows were actually written.
  223|       |    pub chunks_created: usize,
  224|       |    /// Number of chunks actually written to chunks/embeddings tables. Always <= chunks_created.
  225|       |    ///
  226|       |    /// Equal when no chunk had identical normalized text already in DB; less when dedup skipped
  227|       |    /// some. Equals zero for single-chunk bodies (the memory row is the chunk) and equals
  228|       |    /// `chunks_created` for multi-chunk bodies. Added in v1.0.23 to disambiguate from
  229|       |    /// `chunks_created` and reflect database state precisely.
  230|       |    pub chunks_persisted: usize,
  231|       |    /// Number of unique URLs inserted into `memory_urls` for this memory.
  232|       |    /// Added in v1.0.24 — split URLs out of the entity graph (P0-2 fix).
  233|       |    #[serde(default)]
  234|       |    pub urls_persisted: usize,
  235|       |    /// Extraction method used: "gliner-{variant}+regex" or "regex-only". None when NER is not enabled.
  236|       |    #[serde(skip_serializing_if = "Option::is_none")]
  237|       |    pub extraction_method: Option<String>,
  238|       |    pub merged_into_memory_id: Option<i64>,
  239|       |    pub warnings: Vec<String>,
  240|       |    /// Timestamp Unix epoch seconds.
  241|       |    pub created_at: i64,
  242|       |    /// RFC 3339 UTC timestamp string parallel to `created_at` for ISO 8601 parsers.
  243|       |    pub created_at_iso: String,
  244|       |    /// Total execution time in milliseconds from handler start to serialisation.
  245|       |    pub elapsed_ms: u64,
  246|       |    /// True when the user-supplied `--name` differed from the persisted slug
  247|       |    /// (i.e. kebab-case normalization changed the value). Added in v1.0.32 so
  248|       |    /// callers can detect normalization without parsing stderr WARN logs.
  249|       |    #[serde(default)]
  250|       |    pub name_was_normalized: bool,
  251|       |    /// Original user-supplied `--name` value before normalization.
  252|       |    /// Present only when `name_was_normalized == true`; omitted otherwise to
  253|       |    /// keep the common (already-kebab) payload small.
  254|       |    #[serde(skip_serializing_if = "Option::is_none")]
  255|       |    pub original_name: Option<String>,
  256|       |}
  257|       |
  258|       |/// Individual item returned by the `recall` query.
  259|       |///
  260|       |/// The `memory_type` field is serialised as `"type"` in JSON to maintain
  261|       |/// compatibility with external clients — the Rust name uses `memory_type`
  262|       |/// to avoid conflict with the reserved keyword.
  263|       |///
  264|       |/// # Examples
  265|       |///
  266|       |/// ```
  267|       |/// use sqlite_graphrag::output::RecallItem;
  268|       |///
  269|       |/// let item = RecallItem {
  270|       |///     memory_id: 7,
  271|       |///     name: "nota-rust".into(),
  272|       |///     namespace: "global".into(),
  273|       |///     memory_type: "user".into(),
  274|       |///     description: "aprendizado de Rust".into(),
  275|       |///     snippet: "ownership e borrowing".into(),
  276|       |///     distance: 0.12,
  277|       |///     score: 0.88,
  278|       |///     source: "direct".into(),
  279|       |///     graph_depth: None,
  280|       |/// };
  281|       |///
  282|       |/// let json = serde_json::to_string(&item).unwrap();
  283|       |/// // Rust field `memory_type` appears as `"type"` in JSON.
  284|       |/// assert!(json.contains("\"type\":\"user\""));
  285|       |/// assert!(!json.contains("memory_type"));
  286|       |/// assert!(json.contains("\"distance\":0.12"));
  287|       |/// ```
  288|       |#[derive(Serialize, Clone)]
  289|       |pub struct RecallItem {
  290|       |    pub memory_id: i64,
  291|       |    pub name: String,
  292|       |    pub namespace: String,
  293|       |    #[serde(rename = "type")]
  294|       |    pub memory_type: String,
  295|       |    pub description: String,
  296|       |    pub snippet: String,
  297|       |    pub distance: f32,
  298|       |    /// Cosine similarity in `[0.0, 1.0]` derived as `1.0 - distance` and clamped
  299|       |    /// to that interval. Always populated to satisfy the documented contract
  300|       |    /// (M-A5 in v1.0.40); higher means more similar. For graph hits the value
  301|       |    /// reflects the hop-derived distance proxy and should be interpreted
  302|       |    /// alongside `graph_depth` rather than as a true cosine score.
  303|       |    pub score: f32,
  304|       |    pub source: String,
  305|       |    /// Number of graph hops between this match and the seed memories.
  306|       |    ///
  307|       |    /// Set to `None` for direct vector matches (where `distance` is meaningful)
  308|       |    /// and to `Some(N)` for traversal results, with `N=0` when the depth could
  309|       |    /// not be tracked precisely. Added in v1.0.23 to disambiguate graph results
  310|       |    /// from the `distance: 0.0` placeholder previously used for graph entries.
  311|       |    /// Field is omitted from JSON output when `None`.
  312|       |    #[serde(skip_serializing_if = "Option::is_none")]
  313|       |    pub graph_depth: Option<u32>,
  314|       |}
  315|       |
  316|       |impl RecallItem {
  317|       |    /// Computes the similarity score from a vector distance, clamped to
  318|       |    /// `[0.0, 1.0]`. Cosine distance returned by sqlite-vec lives in `[0, 2]`
  319|       |    /// in theory but the embedder produces unit-norm vectors so the practical
  320|       |    /// range is `[0, 1]`. Centralized so every constructor keeps the contract.
  321|       |    #[inline]
  322|     11|    pub fn score_from_distance(distance: f32) -> f32 {
  323|     11|        let raw = 1.0 - distance;
  324|     11|        if raw.is_nan() {
  325|      1|            0.0
  326|       |        } else {
  327|     10|            raw.clamp(0.0, 1.0)
  328|       |        }
  329|     11|    }
  330|       |}
  331|       |
  332|       |/// Full response envelope returned by the `recall` subcommand.
  333|       |///
  334|       |/// Contains both direct vector matches and graph-traversal matches, plus the
  335|       |/// aggregated `results` list that merges both for callers that do not need
  336|       |/// to distinguish the source.
  337|       |#[derive(Serialize)]
  338|       |pub struct RecallResponse {
  339|       |    pub query: String,
  340|       |    pub k: usize,
  341|       |    pub direct_matches: Vec<RecallItem>,
  342|       |    pub graph_matches: Vec<RecallItem>,
  343|       |    /// Aggregated alias of `direct_matches` + `graph_matches` for the contract documented in SKILL.md.
  344|       |    pub results: Vec<RecallItem>,
  345|       |    /// Total execution time in milliseconds from handler start to serialisation.
  346|       |    pub elapsed_ms: u64,
  347|       |}
  348|       |
  349|       |#[cfg(test)]
  350|       |mod tests {
  351|       |    use super::*;
  352|       |    use serde::Serialize;
  353|       |
  354|       |    #[derive(Serialize)]
  355|       |    struct Dummy {
  356|       |        val: u32,
  357|       |    }
  358|       |
  359|       |    // Non-serializable type to force a JSON serialization error
  360|       |    struct NotSerializable;
  361|       |    impl Serialize for NotSerializable {
  362|      2|        fn serialize<S: serde::Serializer>(&self, _: S) -> Result<S::Ok, S::Error> {
  363|      2|            Err(serde::ser::Error::custom(
  364|      2|                "intentional serialization failure",
  365|      2|            ))
  366|      2|        }
  367|       |    }
  368|       |
  369|       |    #[test]
  370|      1|    fn emit_json_returns_ok_for_valid_value() {
  371|      1|        let v = Dummy { val: 42 };
  372|      1|        assert!(emit_json(&v).is_ok());
  373|      1|    }
  374|       |
  375|       |    #[test]
  376|      1|    fn emit_json_returns_err_for_non_serializable_value() {
  377|      1|        let v = NotSerializable;
  378|      1|        assert!(emit_json(&v).is_err());
  379|      1|    }
  380|       |
  381|       |    #[test]
  382|      1|    fn emit_json_compact_returns_ok_for_valid_value() {
  383|      1|        let v = Dummy { val: 7 };
  384|      1|        assert!(emit_json_compact(&v).is_ok());
  385|      1|    }
  386|       |
  387|       |    #[test]
  388|      1|    fn emit_json_compact_returns_err_for_non_serializable_value() {
  389|      1|        let v = NotSerializable;
  390|      1|        assert!(emit_json_compact(&v).is_err());
  391|      1|    }
  392|       |
  393|       |    #[test]
  394|      1|    fn emit_text_does_not_panic() {
  395|      1|        emit_text("mensagem de teste");
  396|      1|    }
  397|       |
  398|       |    #[test]
  399|      1|    fn emit_progress_does_not_panic() {
  400|      1|        emit_progress("progresso de teste");
  401|      1|    }
  402|       |
  403|       |    #[test]
  404|      1|    fn remember_response_serializes_correctly() {
  405|      1|        let r = RememberResponse {
  406|      1|            memory_id: 1,
  407|      1|            name: "teste".to_string(),
  408|      1|            namespace: "ns".to_string(),
  409|      1|            action: "created".to_string(),
  410|      1|            operation: "created".to_string(),
  411|      1|            version: 1,
  412|      1|            entities_persisted: 2,
  413|      1|            relationships_persisted: 3,
  414|      1|            relationships_truncated: false,
  415|      1|            chunks_created: 4,
  416|      1|            chunks_persisted: 4,
  417|      1|            urls_persisted: 2,
  418|      1|            extraction_method: None,
  419|      1|            merged_into_memory_id: None,
  420|      1|            warnings: vec!["aviso".to_string()],
  421|      1|            created_at: 1776569715,
  422|      1|            created_at_iso: "2026-04-19T03:34:15Z".to_string(),
  423|      1|            elapsed_ms: 123,
  424|      1|            name_was_normalized: false,
  425|      1|            original_name: None,
  426|      1|        };
  427|      1|        let json = serde_json::to_string(&r).unwrap();
  428|      1|        assert!(json.contains("memory_id"));
  429|      1|        assert!(json.contains("aviso"));
  430|      1|        assert!(json.contains("\"namespace\""));
  431|      1|        assert!(json.contains("\"merged_into_memory_id\""));
  432|      1|        assert!(json.contains("\"operation\""));
  433|      1|        assert!(json.contains("\"created_at\""));
  434|      1|        assert!(json.contains("\"created_at_iso\""));
  435|      1|        assert!(json.contains("\"elapsed_ms\""));
  436|      1|        assert!(json.contains("\"urls_persisted\""));
  437|      1|        assert!(json.contains("\"relationships_truncated\":false"));
  438|      1|    }
  439|       |
  440|       |    #[test]
  441|      1|    fn recall_item_serializes_renamed_type_field() {
  442|      1|        let item = RecallItem {
  443|      1|            memory_id: 10,
  444|      1|            name: "entidade".to_string(),
  445|      1|            namespace: "ns".to_string(),
  446|      1|            memory_type: "entity".to_string(),
  447|      1|            description: "desc".to_string(),
  448|      1|            snippet: "trecho".to_string(),
  449|      1|            distance: 0.5,
  450|      1|            score: RecallItem::score_from_distance(0.5),
  451|      1|            source: "db".to_string(),
  452|      1|            graph_depth: None,
  453|      1|        };
  454|      1|        let json = serde_json::to_string(&item).unwrap();
  455|      1|        assert!(json.contains("\"type\""));
  456|      1|        assert!(!json.contains("memory_type"));
  457|       |        // Field is omitted from JSON when None.
  458|      1|        assert!(!json.contains("graph_depth"));
  459|      1|        assert!(json.contains("\"score\":0.5"));
  460|      1|    }
  461|       |
  462|       |    #[test]
  463|      1|    fn recall_response_serializes_with_lists() {
  464|      1|        let resp = RecallResponse {
  465|      1|            query: "busca".to_string(),
  466|      1|            k: 10,
  467|      1|            direct_matches: vec![],
  468|      1|            graph_matches: vec![],
  469|      1|            results: vec![],
  470|      1|            elapsed_ms: 42,
  471|      1|        };
  472|      1|        let json = serde_json::to_string(&resp).unwrap();
  473|      1|        assert!(json.contains("direct_matches"));
  474|      1|        assert!(json.contains("graph_matches"));
  475|      1|        assert!(json.contains("\"k\":"));
  476|      1|        assert!(json.contains("\"results\""));
  477|      1|        assert!(json.contains("\"elapsed_ms\""));
  478|      1|    }
  479|       |
  480|       |    #[test]
  481|      1|    fn error_envelope_serializes_correctly() {
  482|       |        #[derive(serde::Serialize)]
  483|       |        struct ErrorEnvelope<'a> {
  484|       |            error: bool,
  485|       |            code: i32,
  486|       |            message: &'a str,
  487|       |        }
  488|      1|        let envelope = ErrorEnvelope {
  489|      1|            error: true,
  490|      1|            code: 10,
  491|      1|            message: "database disk image is malformed",
  492|      1|        };
  493|      1|        let json = serde_json::to_value(&envelope).unwrap();
  494|      1|        assert_eq!(json["error"], true);
  495|      1|        assert_eq!(json["code"], 10);
  496|      1|        assert_eq!(json["message"], "database disk image is malformed");
  497|      1|    }
  498|       |
  499|       |    #[test]
  500|      1|    fn output_format_default_is_json() {
  501|      1|        let fmt = OutputFormat::default();
  502|      1|        assert!(matches!(fmt, OutputFormat::Json));
                              ^0
  503|      1|    }
  504|       |
  505|       |    #[test]
  506|      1|    fn output_format_variants_exist() {
  507|      1|        let _text = OutputFormat::Text;
  508|      1|        let _md = OutputFormat::Markdown;
  509|      1|        let _json = OutputFormat::Json;
  510|      1|    }
  511|       |
  512|       |    #[test]
  513|      1|    fn recall_item_clone_produces_equal_value() {
  514|      1|        let item = RecallItem {
  515|      1|            memory_id: 99,
  516|      1|            name: "clone".to_string(),
  517|      1|            namespace: "ns".to_string(),
  518|      1|            memory_type: "relation".to_string(),
  519|      1|            description: "d".to_string(),
  520|      1|            snippet: "s".to_string(),
  521|      1|            distance: 0.1,
  522|      1|            score: RecallItem::score_from_distance(0.1),
  523|      1|            source: "src".to_string(),
  524|      1|            graph_depth: Some(2),
  525|      1|        };
  526|      1|        let cloned = item.clone();
  527|      1|        assert_eq!(cloned.memory_id, item.memory_id);
  528|      1|        assert_eq!(cloned.name, item.name);
  529|      1|        assert_eq!(cloned.graph_depth, Some(2));
  530|      1|    }
  531|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/parsers/mod.rs:
    1|       |//! Input format parsers (timestamp, range validators).
    2|       |
    3|       |use chrono::DateTime;
    4|       |use unicode_normalization::UnicodeNormalization;
    5|       |
    6|       |/// Accepts a Unix epoch (integer >= 0) or RFC 3339 timestamp and returns the Unix epoch.
    7|      7|pub fn parse_expected_updated_at(s: &str) -> Result<i64, String> {
    8|      7|    if let Ok(secs) = s.parse::<i64>() {
                            ^3
    9|      3|        if secs >= 0 {
   10|      2|            return Ok(secs);
   11|      1|        }
   12|      4|    }
   13|      5|    DateTime::parse_from_rfc3339(s)
   14|      5|        .map(|dt| dt.timestamp())
                                ^2 ^2
   15|      5|        .map_err(|e| {
                                   ^3
   16|      3|            format!(
   17|      3|                "value must be a Unix epoch (integer >= 0) or RFC 3339 (e.g. 2026-04-19T12:00:00Z): {e}"
   18|       |            )
   19|      3|        })
   20|      7|}
   21|       |
   22|       |/// Validates `-k`/`--k` for `recall` and `hybrid-search` to the inclusive range `1..=4096`.
   23|       |///
   24|       |/// The upper bound matches the `sqlite-vec` knn limit; values above it would surface a leaky
   25|       |/// engine error such as `k value in knn query too large, provided 10000 and the limit is 4096`.
   26|       |/// Validating at parse time turns the failure into a clean Clap error before any database work.
   27|     11|pub fn parse_k_range(s: &str) -> Result<usize, String> {
   28|     11|    let value: usize = s
                      ^9     ^9
   29|     11|        .parse()
   30|     11|        .map_err(|_| format!("'{s}' is not a valid non-negative integer"))?;
                                           ^2                                           ^2
   31|      9|    if !(1..=4096).contains(&value) {
   32|      2|        return Err(format!(
   33|      2|            "k must be between 1 and 4096 (inclusive); got {value}"
   34|      2|        ));
   35|      7|    }
   36|      7|    Ok(value)
   37|     11|}
   38|       |
   39|       |/// Flexible boolean parser for Clap env var integration.
   40|       |///
   41|       |/// Accepts common truthy/falsy conventions used in shell environments:
   42|       |/// truthy: `1`, `true`, `yes`, `on` (case-insensitive)
   43|       |/// falsy: `0`, `false`, `no`, `off`, empty string (case-insensitive)
   44|     24|pub fn parse_bool_flexible(s: &str) -> Result<bool, String> {
   45|     24|    match s.to_lowercase().as_str() {
   46|     24|        "1" | "true" | "yes" | "on" => Ok(true),
                            ^23      ^20     ^18     ^8
   47|     16|        "0" | "false" | "no" | "off" | "" => Ok(false),
                            ^15       ^8     ^6      ^4    ^13
   48|      3|        _ => Err(format!(
   49|      3|            "invalid boolean value '{s}': expected true/false/1/0/yes/no/on/off"
   50|      3|        )),
   51|       |    }
   52|     24|}
   53|       |
   54|       |#[cfg(test)]
   55|       |mod tests {
   56|       |    use super::*;
   57|       |
   58|       |    #[test]
   59|      1|    fn accepts_unix_epoch() {
   60|      1|        assert_eq!(parse_expected_updated_at("1700000000").unwrap(), 1700000000);
   61|      1|    }
   62|       |
   63|       |    #[test]
   64|      1|    fn accepts_zero() {
   65|      1|        assert_eq!(parse_expected_updated_at("0").unwrap(), 0);
   66|      1|    }
   67|       |
   68|       |    #[test]
   69|      1|    fn accepts_rfc_3339_utc() {
   70|      1|        let result = parse_expected_updated_at("2020-01-01T00:00:00Z");
   71|      1|        assert!(result.is_ok());
   72|      1|        assert_eq!(result.unwrap(), 1577836800);
   73|      1|    }
   74|       |
   75|       |    #[test]
   76|      1|    fn accepts_rfc_3339_with_offset() {
   77|      1|        let result = parse_expected_updated_at("2026-04-19T12:00:00+00:00");
   78|      1|        assert!(result.is_ok());
   79|      1|    }
   80|       |
   81|       |    #[test]
   82|      1|    fn rejects_invalid_string() {
   83|      1|        assert!(parse_expected_updated_at("bananas").is_err());
   84|      1|    }
   85|       |
   86|       |    #[test]
   87|      1|    fn rejects_negative() {
   88|      1|        let err = parse_expected_updated_at("-1");
   89|      1|        assert!(err.is_err());
   90|      1|    }
   91|       |
   92|       |    #[test]
   93|      1|    fn error_message_mentions_format() {
   94|      1|        let msg = parse_expected_updated_at("invalid").unwrap_err();
   95|      1|        assert!(msg.contains("RFC 3339") || msg.contains("Unix epoch"));
                                                          ^0
   96|      1|    }
   97|       |
   98|       |    #[test]
   99|      1|    fn k_accepts_valid_range_endpoints() {
  100|      1|        assert_eq!(parse_k_range("1").unwrap(), 1);
  101|      1|        assert_eq!(parse_k_range("4096").unwrap(), 4096);
  102|      1|        assert_eq!(parse_k_range("10").unwrap(), 10);
  103|      1|    }
  104|       |
  105|       |    #[test]
  106|      1|    fn k_rejects_zero() {
  107|      1|        let msg = parse_k_range("0").unwrap_err();
  108|      1|        assert!(msg.contains("between 1 and 4096"));
  109|      1|    }
  110|       |
  111|       |    #[test]
  112|      1|    fn k_rejects_above_limit() {
  113|      1|        let msg = parse_k_range("10000").unwrap_err();
  114|      1|        assert!(msg.contains("between 1 and 4096"));
  115|      1|    }
  116|       |
  117|       |    #[test]
  118|      1|    fn k_rejects_non_integer() {
  119|      1|        let msg = parse_k_range("abc").unwrap_err();
  120|      1|        assert!(msg.contains("not a valid"));
  121|      1|    }
  122|       |
  123|       |    #[test]
  124|      1|    fn k_rejects_negative() {
  125|       |        // usize parser fails on negatives before range check
  126|      1|        assert!(parse_k_range("-5").is_err());
  127|      1|    }
  128|       |
  129|       |    #[test]
  130|      1|    fn bool_flexible_truthy() {
  131|      9|        for v in &["1", "true", "True", "TRUE", "yes", "Yes", "on", "ON"] {
                          ^8
  132|      8|            assert!(parse_bool_flexible(v).unwrap(), "should be true: {v}");
                                                                   ^0
  133|       |        }
  134|      1|    }
  135|       |
  136|       |    #[test]
  137|      1|    fn bool_flexible_falsy() {
  138|     10|        for v in &["0", "false", "False", "FALSE", "no", "No", "off", "OFF", ""] {
                          ^9
  139|      9|            assert!(!parse_bool_flexible(v).unwrap(), "should be false: {v}");
                                                                    ^0
  140|       |        }
  141|      1|    }
  142|       |
  143|       |    #[test]
  144|      1|    fn bool_flexible_rejects_invalid() {
  145|      1|        assert!(parse_bool_flexible("banana").is_err());
  146|      1|        assert!(parse_bool_flexible("2").is_err());
  147|      1|        assert!(parse_bool_flexible("nope").is_err());
  148|      1|    }
  149|       |}
  150|       |
  151|       |/// The 12 well-known relation types from v1.0.0.
  152|       |///
  153|       |/// Non-canonical relations are accepted but emit a `tracing::warn!`.
  154|       |pub const CANONICAL_RELATIONS: &[&str] = &[
  155|       |    "applies_to",
  156|       |    "uses",
  157|       |    "depends_on",
  158|       |    "causes",
  159|       |    "fixes",
  160|       |    "contradicts",
  161|       |    "supports",
  162|       |    "follows",
  163|       |    "related",
  164|       |    "mentions",
  165|       |    "replaces",
  166|       |    "tracked_in",
  167|       |];
  168|       |
  169|       |/// Returns `true` when the relation is one of the 12 canonical types.
  170|      6|pub fn is_canonical_relation(s: &str) -> bool {
  171|      6|    CANONICAL_RELATIONS.contains(&s)
  172|      6|}
  173|       |
  174|       |/// Normalizes a relation string: lowercase + hyphens to underscores.
  175|      6|pub fn normalize_relation(s: &str) -> String {
  176|      6|    s.to_lowercase().replace('-', "_")
  177|      6|}
  178|       |
  179|       |/// Normalizes an entity name to kebab-case ASCII.
  180|       |///
  181|       |/// Applies NFKD decomposition, filters to ASCII (transliterating by dropping
  182|       |/// diacritical combining marks), lowercases, replaces every non-alphanumeric
  183|       |/// character with a hyphen, collapses consecutive hyphens, and trims leading/trailing hyphens.
  184|       |///
  185|       |/// # Examples
  186|       |///
  187|       |/// ```
  188|       |/// use sqlite_graphrag::parsers::normalize_entity_name;
  189|       |///
  190|       |/// assert_eq!(normalize_entity_name("Danilo Aguiar"), "danilo-aguiar");
  191|       |/// assert_eq!(normalize_entity_name("CANONICAL_RELATIONS"), "canonical-relations");
  192|       |/// assert_eq!(normalize_entity_name("  hello  world  "), "hello-world");
  193|       |/// assert_eq!(normalize_entity_name("danilo-aguiar"), "danilo-aguiar"); // idempotent
  194|       |/// ```
  195|     77|pub fn normalize_entity_name(s: &str) -> String {
  196|       |    // NFKD: decompose precomposed characters into base + combining marks.
  197|       |    // Then keep only ASCII characters, effectively stripping diacritics.
  198|    648|    let ascii: String = s.nfkd().filter(|c| c.is_ascii()).collect();
                      ^77    ^77      ^77^77    ^77                      ^77
  199|       |    // Lowercase, then replace every non-alphanumeric character with a hyphen.
  200|     77|    let hyphenated: String = ascii
  201|     77|        .to_lowercase()
  202|     77|        .chars()
  203|    645|        .map(|c| if c.is_ascii_alphanumeric() { c } else { '-' })
                       ^77                                    ^545       ^100
  204|     77|        .collect();
  205|       |    // Collapse consecutive hyphens and trim from both ends.
  206|     77|    let mut result = String::with_capacity(hyphenated.len());
  207|     77|    let mut prev_was_hyphen = false;
  208|    645|    for ch in hyphenated.chars() {
                            ^77        ^77
  209|    645|        if ch == '-' {
  210|    100|            if !prev_was_hyphen {
  211|     90|                result.push('-');
  212|     90|            }
                          ^10
  213|    100|            prev_was_hyphen = true;
  214|    545|        } else {
  215|    545|            result.push(ch);
  216|    545|            prev_was_hyphen = false;
  217|    545|        }
  218|       |    }
  219|     77|    result.trim_matches('-').to_string()
  220|     77|}
  221|       |
  222|       |/// Validates that a normalized relation matches `^[a-z][a-z0-9_]*$`.
  223|     26|pub fn validate_relation_format(s: &str) -> Result<(), String> {
  224|     26|    if s.is_empty() {
  225|      3|        return Err("relation must not be empty".to_string());
  226|     23|    }
  227|     23|    if !s.as_bytes()[0].is_ascii_lowercase() {
  228|      1|        return Err(format!(
  229|      1|            "relation must start with a lowercase letter, got '{s}'"
  230|      1|        ));
  231|     22|    }
  232|     22|    if !s
  233|     22|        .bytes()
  234|    173|        .all(|b| b.is_ascii_lowercase() || b.is_ascii_digit() || b == b'_')
                       ^22                               ^9^9                  ^9
  235|       |    {
  236|      1|        return Err(format!(
  237|      1|            "relation must contain only lowercase letters, digits and underscores, got '{s}'"
  238|      1|        ));
  239|     21|    }
  240|     21|    Ok(())
  241|     26|}
  242|       |
  243|       |/// Emits a `tracing::warn!` when the relation is not in [`CANONICAL_RELATIONS`].
  244|      0|pub fn warn_if_non_canonical(relation: &str) {
  245|      0|    if !is_canonical_relation(relation) {
  246|      0|        tracing::warn!(target: "parsers",
  247|       |            relation,
  248|      0|            "non-canonical relation accepted; consider using a well-known value"
  249|       |        );
  250|      0|    }
  251|      0|}
  252|       |
  253|       |/// Clap `value_parser` for `--relation`: normalizes and validates format.
  254|       |///
  255|       |/// Accepts any kebab-case or snake_case string. Non-canonical values are
  256|       |/// accepted at parse time; the warning is emitted at command execution.
  257|      3|pub fn parse_relation(s: &str) -> Result<String, String> {
  258|      3|    let normalized = normalize_relation(s);
  259|      3|    validate_relation_format(&normalized)?;
                                                       ^1
  260|      2|    Ok(normalized)
  261|      3|}
  262|       |
  263|       |#[cfg(test)]
  264|       |mod relation_tests {
  265|       |    use super::*;
  266|       |
  267|       |    #[test]
  268|      1|    fn canonical_relations_all_valid() {
  269|     13|        for r in CANONICAL_RELATIONS {
                          ^12
  270|     12|            assert!(
  271|     12|                validate_relation_format(r).is_ok(),
  272|      0|                "canonical relation '{r}' should be valid"
  273|       |            );
  274|       |        }
  275|      1|    }
  276|       |
  277|       |    #[test]
  278|      1|    fn normalize_converts_hyphens_and_uppercase() {
  279|      1|        assert_eq!(normalize_relation("Depends-On"), "depends_on");
  280|      1|        assert_eq!(normalize_relation("TESTED-BY"), "tested_by");
  281|      1|        assert_eq!(normalize_relation("uses"), "uses");
  282|      1|    }
  283|       |
  284|       |    #[test]
  285|      1|    fn validate_rejects_empty() {
  286|      1|        assert!(validate_relation_format("").is_err());
  287|      1|    }
  288|       |
  289|       |    #[test]
  290|      1|    fn validate_rejects_digit_start() {
  291|      1|        assert!(validate_relation_format("123abc").is_err());
  292|      1|    }
  293|       |
  294|       |    #[test]
  295|      1|    fn validate_rejects_spaces() {
  296|      1|        assert!(validate_relation_format("has spaces").is_err());
  297|      1|    }
  298|       |
  299|       |    #[test]
  300|      1|    fn validate_accepts_custom_relations() {
  301|      1|        assert!(validate_relation_format("implements").is_ok());
  302|      1|        assert!(validate_relation_format("tested_by").is_ok());
  303|      1|        assert!(validate_relation_format("part_of").is_ok());
  304|      1|        assert!(validate_relation_format("blocks").is_ok());
  305|      1|    }
  306|       |
  307|       |    #[test]
  308|      1|    fn parse_relation_normalizes_and_validates() {
  309|      1|        assert_eq!(parse_relation("Tested-By").unwrap(), "tested_by");
  310|      1|        assert_eq!(parse_relation("uses").unwrap(), "uses");
  311|      1|        assert!(parse_relation("").is_err());
  312|      1|    }
  313|       |
  314|       |    #[test]
  315|      1|    fn is_canonical_detects_known() {
  316|      1|        assert!(is_canonical_relation("uses"));
  317|      1|        assert!(is_canonical_relation("applies_to"));
  318|      1|        assert!(!is_canonical_relation("implements"));
  319|      1|        assert!(!is_canonical_relation("blocks"));
  320|      1|    }
  321|       |}
  322|       |
  323|       |#[cfg(test)]
  324|       |mod entity_name_tests {
  325|       |    use super::*;
  326|       |
  327|       |    #[test]
  328|      1|    fn strips_diacritics_from_accented_name() {
  329|      1|        assert_eq!(normalize_entity_name("Danilo Aguiar"), "danilo-aguiar");
  330|      1|    }
  331|       |
  332|       |    #[test]
  333|      1|    fn strips_diacritics_unicode_accents() {
  334|       |        // ã → a, Ü → u, ë → e
  335|      1|        assert_eq!(normalize_entity_name("São Paulo"), "sao-paulo");
  336|      1|        assert_eq!(normalize_entity_name("Ünit Tëst"), "unit-test");
  337|      1|    }
  338|       |
  339|       |    #[test]
  340|      1|    fn converts_spaces_to_hyphens() {
  341|      1|        assert_eq!(normalize_entity_name("hello world"), "hello-world");
  342|      1|        assert_eq!(normalize_entity_name("  hello  world  "), "hello-world");
  343|      1|    }
  344|       |
  345|       |    #[test]
  346|      1|    fn converts_underscores_to_hyphens() {
  347|      1|        assert_eq!(normalize_entity_name("hello_world"), "hello-world");
  348|      1|        assert_eq!(
  349|      1|            normalize_entity_name("CANONICAL_RELATIONS"),
  350|       |            "canonical-relations"
  351|       |        );
  352|      1|    }
  353|       |
  354|       |    #[test]
  355|      1|    fn all_caps_becomes_lowercase_kebab() {
  356|      1|        assert_eq!(
  357|      1|            normalize_entity_name("CANONICAL_RELATIONS"),
  358|       |            "canonical-relations"
  359|       |        );
  360|      1|        assert_eq!(normalize_entity_name("MY_ENTITY_NAME"), "my-entity-name");
  361|      1|    }
  362|       |
  363|       |    #[test]
  364|      1|    fn idempotent_on_already_normalized() {
  365|      1|        let name = "danilo-aguiar";
  366|      1|        assert_eq!(normalize_entity_name(name), name);
  367|      1|        let name2 = "canonical-relations";
  368|      1|        assert_eq!(normalize_entity_name(name2), name2);
  369|      1|    }
  370|       |
  371|       |    #[test]
  372|      1|    fn collapses_consecutive_hyphens() {
  373|      1|        assert_eq!(normalize_entity_name("foo--bar"), "foo-bar");
  374|      1|        assert_eq!(normalize_entity_name("foo - bar"), "foo-bar");
  375|      1|    }
  376|       |
  377|       |    #[test]
  378|      1|    fn trims_leading_trailing_hyphens() {
  379|      1|        assert_eq!(normalize_entity_name("-foo-"), "foo");
  380|      1|        assert_eq!(normalize_entity_name("--hello--"), "hello");
  381|      1|    }
  382|       |
  383|       |    #[test]
  384|      1|    fn empty_or_only_separators_returns_empty() {
  385|      1|        assert_eq!(normalize_entity_name(""), "");
  386|      1|        assert_eq!(normalize_entity_name("---"), "");
  387|      1|    }
  388|       |
  389|       |    #[test]
  390|      1|    fn normalizes_dots_slashes_and_punctuation() {
  391|      1|        assert_eq!(normalize_entity_name("lei-14.478/2022"), "lei-14-478-2022");
  392|      1|        assert_eq!(normalize_entity_name("src/main.rs"), "src-main-rs");
  393|      1|        assert_eq!(normalize_entity_name("user@domain.com"), "user-domain-com");
  394|      1|        assert_eq!(normalize_entity_name("v1.0.66"), "v1-0-66");
  395|      1|        assert_eq!(normalize_entity_name("key:value"), "key-value");
  396|      1|    }
  397|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/paths.rs:
    1|       |//! XDG/cwd path resolution and traversal-safe overrides.
    2|       |//!
    3|       |//! Resolves data directories via [`directories::ProjectDirs`] and rejects
    4|       |//! user-supplied paths that contain parent-directory (`..`) components.
    5|       |
    6|       |use crate::errors::AppError;
    7|       |use crate::i18n::validation;
    8|       |use directories::ProjectDirs;
    9|       |use std::path::{Component, Path, PathBuf};
   10|       |
   11|       |/// Resolved filesystem paths used by the CLI at runtime.
   12|       |///
   13|       |/// Constructed via [`AppPaths::resolve`], which applies the four-step precedence:
   14|       |/// CLI flag → `SQLITE_GRAPHRAG_DB_PATH` env var → `SQLITE_GRAPHRAG_HOME` env var → cwd.
   15|       |#[derive(Debug, Clone)]
   16|       |pub struct AppPaths {
   17|       |    /// Absolute path to the SQLite database file.
   18|       |    pub db: PathBuf,
   19|       |    /// Directory where embedding model files are cached.
   20|       |    pub models: PathBuf,
   21|       |}
   22|       |
   23|       |impl AppPaths {
   24|      7|    pub fn resolve(db_override: Option<&str>) -> Result<Self, AppError> {
   25|      7|        let proj = ProjectDirs::from("", "", "sqlite-graphrag").ok_or_else(|| {
                                                                                            ^0
   26|      0|            AppError::Io(std::io::Error::other("could not determine home directory"))
   27|      0|        })?;
   28|       |
   29|      7|        let cache_root = if let Some(override_dir) = std::env::var_os("SQLITE_GRAPHRAG_CACHE_DIR") {
                                                   ^0
   30|      0|            PathBuf::from(override_dir)
   31|       |        } else {
   32|      7|            proj.cache_dir().to_path_buf()
   33|       |        };
   34|       |
   35|      7|        let db = if let Some(p) = db_override {
                          ^6               ^2
   36|      2|            validate_path(p)?;
                                          ^0
   37|      2|            PathBuf::from(p)
   38|      5|        } else if let Ok(env_path) = std::env::var("SQLITE_GRAPHRAG_DB_PATH") {
                                       ^2
   39|      2|            validate_path(&env_path)?;
                                                  ^0
   40|      2|            PathBuf::from(env_path)
   41|      3|        } else if let Some(home_dir) = home_env_dir()? {
                                         ^1                        ^1
   42|      1|            home_dir.join("graphrag.sqlite")
   43|       |        } else {
   44|      1|            std::env::current_dir()
   45|      1|                .map_err(AppError::Io)?
                                                    ^0
   46|      1|                .join("graphrag.sqlite")
   47|       |        };
   48|       |
   49|      6|        Ok(Self {
   50|      6|            db,
   51|      6|            models: cache_root.join("models"),
   52|      6|        })
   53|      7|    }
   54|       |
   55|      1|    pub fn ensure_dirs(&self) -> Result<(), AppError> {
   56|      2|        for dir in [parent_or_err(&self.db)?, self.models.as_path()] {
                                  ^1            ^1       ^0 ^1
   57|      2|            std::fs::create_dir_all(dir)?;
                                                      ^0
   58|       |        }
   59|      1|        Ok(())
   60|      1|    }
   61|       |}
   62|       |
   63|      6|fn validate_path(p: &str) -> Result<(), AppError> {
   64|     44|    if Path::new(p).components().any(|c| c == Component::ParentDir) {
                     ^6                        ^6
   65|      1|        return Err(AppError::Validation(validation::path_traversal(p)));
   66|      5|    }
   67|      5|    Ok(())
   68|      6|}
   69|       |
   70|       |/// Resolves `SQLITE_GRAPHRAG_HOME` as the root directory for the default database.
   71|       |///
   72|       |/// Returns `Ok(Some(dir))` when the env var is set and valid,
   73|       |/// `Ok(None)` when absent or empty (falls back to `current_dir`),
   74|       |/// and `Err(...)` when the value contains traversal components.
   75|      3|fn home_env_dir() -> Result<Option<PathBuf>, AppError> {
   76|      3|    let raw = match std::env::var("SQLITE_GRAPHRAG_HOME") {
   77|      3|        Ok(v) => v,
   78|      0|        Err(_) => return Ok(None),
   79|       |    };
   80|      3|    if raw.is_empty() {
   81|      1|        return Ok(None);
   82|      2|    }
   83|      2|    validate_path(&raw)?;
                                     ^1
   84|      1|    Ok(Some(PathBuf::from(raw)))
   85|      3|}
   86|       |
   87|     17|pub(crate) fn parent_or_err(path: &Path) -> Result<&Path, AppError> {
   88|     17|    path.parent().ok_or_else(|| {
                                              ^2
   89|      2|        AppError::Validation(format!(
   90|      2|            "path '{}' has no valid parent component",
   91|      2|            path.display()
   92|      2|        ))
   93|      2|    })
   94|     17|}
   95|       |
   96|       |#[cfg(test)]
   97|       |mod tests {
   98|       |    use super::*;
   99|       |    use serial_test::serial;
  100|       |    use tempfile::TempDir;
  101|       |
  102|       |    /// Clears all variables that affect `AppPaths::resolve` to isolate the
  103|       |    /// test from the developer/CI environment.
  104|     10|    fn clean_env_paths() {
  105|       |        // SAFETY: tests are annotated with #[serial], guaranteeing single-threaded execution.
  106|     10|        unsafe {
  107|     10|            std::env::remove_var("SQLITE_GRAPHRAG_HOME");
  108|     10|            std::env::remove_var("SQLITE_GRAPHRAG_DB_PATH");
  109|     10|            std::env::remove_var("SQLITE_GRAPHRAG_CACHE_DIR");
  110|     10|        }
  111|     10|    }
  112|       |
  113|       |    #[test]
  114|       |    #[serial]
  115|      1|    fn home_env_resolves_db_in_subdir() {
  116|      1|        clean_env_paths();
  117|      1|        let tmp = TempDir::new().expect("tempdir");
  118|       |        // SAFETY: tests are annotated with #[serial], guaranteeing single-threaded execution.
  119|      1|        unsafe {
  120|      1|            std::env::set_var("SQLITE_GRAPHRAG_HOME", tmp.path());
  121|      1|        }
  122|       |
  123|      1|        let paths = AppPaths::resolve(None).expect("resolve with valid HOME");
  124|      1|        assert_eq!(paths.db, tmp.path().join("graphrag.sqlite"));
  125|       |
  126|      1|        clean_env_paths();
  127|       |    }
  128|       |
  129|       |    #[test]
  130|       |    #[serial]
  131|      1|    fn home_env_traversal_rejected() {
  132|      1|        clean_env_paths();
  133|       |        // SAFETY: tests are annotated with #[serial], guaranteeing single-threaded execution.
  134|      1|        unsafe {
  135|      1|            std::env::set_var("SQLITE_GRAPHRAG_HOME", "/tmp/../etc");
  136|      1|        }
  137|       |
  138|      1|        let result = AppPaths::resolve(None);
  139|      1|        assert!(
  140|      1|            matches!(result, Err(AppError::Validation(_))),
                          ^0
  141|      0|            "traversal in SQLITE_GRAPHRAG_HOME must fail as Validation, got {result:?}"
  142|       |        );
  143|       |
  144|      1|        clean_env_paths();
  145|       |    }
  146|       |
  147|       |    #[test]
  148|       |    #[serial]
  149|      1|    fn db_path_overrides_home() {
  150|      1|        clean_env_paths();
  151|      1|        let tmp_home = TempDir::new().expect("tempdir home");
  152|      1|        let tmp_db = TempDir::new().expect("tempdir db");
  153|      1|        let explicit_db = tmp_db.path().join("explicit.sqlite");
  154|       |        // SAFETY: tests are annotated with #[serial], guaranteeing single-threaded execution.
  155|      1|        unsafe {
  156|      1|            std::env::set_var("SQLITE_GRAPHRAG_HOME", tmp_home.path());
  157|      1|            std::env::set_var("SQLITE_GRAPHRAG_DB_PATH", &explicit_db);
  158|      1|        }
  159|       |
  160|      1|        let paths = AppPaths::resolve(None).expect("resolve with DB_PATH and HOME");
  161|      1|        assert_eq!(paths.db, explicit_db);
  162|       |
  163|      1|        clean_env_paths();
  164|       |    }
  165|       |
  166|       |    #[test]
  167|       |    #[serial]
  168|      1|    fn flag_overrides_home() {
  169|      1|        clean_env_paths();
  170|      1|        let tmp_home = TempDir::new().expect("tempdir home");
  171|      1|        let tmp_flag = TempDir::new().expect("tempdir flag");
  172|      1|        let db_flag = tmp_flag.path().join("via-flag.sqlite");
  173|       |        // SAFETY: tests are annotated with #[serial], guaranteeing single-threaded execution.
  174|      1|        unsafe {
  175|      1|            std::env::set_var("SQLITE_GRAPHRAG_HOME", tmp_home.path());
  176|      1|        }
  177|       |
  178|      1|        let paths = AppPaths::resolve(Some(db_flag.to_str().expect("utf8")))
  179|      1|            .expect("resolve with flag and HOME");
  180|      1|        assert_eq!(paths.db, db_flag);
  181|       |
  182|      1|        clean_env_paths();
  183|       |    }
  184|       |
  185|       |    #[test]
  186|       |    #[serial]
  187|      1|    fn home_env_empty_falls_back_to_cwd() {
  188|      1|        clean_env_paths();
  189|       |        // SAFETY: tests are annotated with #[serial], guaranteeing single-threaded execution.
  190|      1|        unsafe {
  191|      1|            std::env::set_var("SQLITE_GRAPHRAG_HOME", "");
  192|      1|        }
  193|       |
  194|      1|        let paths = AppPaths::resolve(None).expect("resolve with empty HOME");
  195|      1|        let expected = std::env::current_dir()
  196|      1|            .expect("cwd")
  197|      1|            .join("graphrag.sqlite");
  198|      1|        assert_eq!(paths.db, expected);
  199|       |
  200|      1|        clean_env_paths();
  201|       |    }
  202|       |
  203|       |    #[test]
  204|      1|    fn parent_or_err_accepts_normal_path() {
  205|      1|        let p = PathBuf::from("/home/user/db.sqlite");
  206|      1|        let parent = parent_or_err(&p).expect("valid parent");
  207|      1|        assert_eq!(parent, Path::new("/home/user"));
  208|      1|    }
  209|       |
  210|       |    #[test]
  211|      1|    fn parent_or_err_accepts_relative_path() {
  212|      1|        let p = PathBuf::from("subdir/file.sqlite");
  213|      1|        let parent = parent_or_err(&p).expect("relative parent");
  214|      1|        assert_eq!(parent, Path::new("subdir"));
  215|      1|    }
  216|       |
  217|       |    #[test]
  218|      1|    fn parent_or_err_rejects_unix_root() {
  219|      1|        let p = PathBuf::from("/");
  220|      1|        let result = parent_or_err(&p);
  221|      1|        assert!(matches!(result, Err(AppError::Validation(_))));
                              ^0
  222|      1|    }
  223|       |
  224|       |    #[test]
  225|      1|    fn parent_or_err_rejects_empty_path() {
  226|      1|        let p = PathBuf::from("");
  227|      1|        let result = parent_or_err(&p);
  228|      1|        assert!(matches!(result, Err(AppError::Validation(_))));
                              ^0
  229|      1|    }
  230|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/pragmas.rs:
    1|       |//! SQLite PRAGMA helpers applied at connection open and on each transaction.
    2|       |
    3|       |use crate::errors::AppError;
    4|       |use rusqlite::Connection;
    5|       |
    6|       |/// Applies one-time PRAGMAs on a freshly opened connection (e.g. `auto_vacuum`).
    7|       |///
    8|       |/// Calls [`apply_connection_pragmas`] internally and then sets `wal_autocheckpoint`.
    9|       |/// Must be called once per database file, not once per connection.
   10|       |///
   11|       |/// # Errors
   12|       |/// Returns `Err` when any PRAGMA execution fails.
   13|      1|pub fn apply_init_pragmas(conn: &Connection) -> Result<(), AppError> {
   14|      1|    conn.execute_batch("PRAGMA auto_vacuum = INCREMENTAL;")?;
                                                                         ^0
   15|      1|    apply_connection_pragmas(conn)?;
                                                ^0
   16|      1|    conn.execute_batch(&format!(
   17|      1|        "PRAGMA wal_autocheckpoint = {};",
   18|      1|        crate::constants::WAL_AUTOCHECKPOINT_PAGES
   19|      1|    ))?;
                    ^0
   20|      1|    Ok(())
   21|      1|}
   22|       |
   23|       |/// Re-asserts `PRAGMA journal_mode = WAL` after operations that may revert it
   24|       |/// (notably refinery-driven migrations, which can open internal handles that
   25|       |/// reset the journal mode in some scenarios). Idempotent and cheap; emits
   26|       |/// `tracing::warn!` if WAL fails to engage so degraded behaviour is observable.
   27|      1|pub fn ensure_wal_mode(conn: &Connection) -> Result<(), AppError> {
   28|      1|    let mode: String = conn.query_row("PRAGMA journal_mode = WAL;", [], |r| r.get(0))?;
                                                                                                   ^0
   29|      1|    if mode != "wal" {
   30|      0|        tracing::warn!(target: "pragmas", mode = %mode, "journal_mode did not switch to WAL after re-assertion");
   31|      1|    }
   32|      1|    Ok(())
   33|      1|}
   34|       |
   35|       |/// Applies per-connection PRAGMAs: synchronous, foreign keys, busy timeout, cache, temp store, mmap, WAL.
   36|       |///
   37|       |/// Safe to call on every new connection; all settings are idempotent.
   38|       |///
   39|       |/// # Errors
   40|       |/// Returns `Err` when any PRAGMA execution fails.
   41|      3|pub fn apply_connection_pragmas(conn: &Connection) -> Result<(), AppError> {
   42|      3|    conn.execute_batch(&format!(
   43|      3|        "PRAGMA synchronous   = NORMAL;
   44|      3|         PRAGMA foreign_keys  = ON;
   45|      3|         PRAGMA busy_timeout  = {busy};
   46|      3|         PRAGMA cache_size    = {cache};
   47|      3|         PRAGMA temp_store    = MEMORY;
   48|      3|         PRAGMA mmap_size     = {mmap};",
   49|      3|        busy = crate::constants::BUSY_TIMEOUT_MILLIS,
   50|      3|        cache = crate::constants::CACHE_SIZE_KB,
   51|      3|        mmap = crate::constants::MMAP_SIZE_BYTES,
   52|      3|    ))?;
                    ^0
   53|      3|    let mode: String = conn.query_row("PRAGMA journal_mode = WAL;", [], |r| r.get(0))?;
                                                                                                   ^0
   54|      3|    if mode != "wal" {
   55|      0|        tracing::warn!(target: "pragmas", mode = %mode, "journal_mode did not switch to WAL");
   56|      3|    }
   57|      3|    Ok(())
   58|      3|}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/preservation.rs:
    1|       |//! Preservation checks for LLM-enriched memory bodies (G29 Passo 4).
    2|       |//!
    3|       |//! When a language model rewrites a memory body, the operator must be
    4|       |//! protected against silent hallucination: the LLM may invent facts, drop
    5|       |//! key terms, or drift semantically far from the source. This module
    6|       |//! provides a lightweight, deterministic similarity metric that runs
    7|       |//! locally without any model call, so the gate can be enforced before the
    8|       |//! enriched body touches persistent storage.
    9|       |//!
   10|       |//! The default metric is a normalised trigram-Jaccard similarity computed
   11|       |//! on the union of `set_a` and `set_b`. The score is in `[0.0, 1.0]`,
   12|       |//! where `1.0` means the two inputs share every trigram and `0.0` means
   13|       |//! they share none. The threshold default of `0.7` follows the gap G29
   14|       |//! specification, with `--preserve-threshold <F>` letting operators tune
   15|       |//! it per workload.
   16|       |//!
   17|       |//! # Examples
   18|       |//!
   19|       |//! ```
   20|       |//! use sqlite_graphrag::preservation::{jaccard_similarity, PreservationVerdict};
   21|       |//!
   22|       |//! let score = jaccard_similarity("the quick brown fox", "the quick red fox");
   23|       |//! assert!(score > 0.5);
   24|       |//!
   25|       |//! let verdict = PreservationVerdict::evaluate("orig body", "rewritten body", 0.7);
   26|       |//! assert!(matches!(verdict, PreservationVerdict::Preserved { .. }));
   27|       |//! ```
   28|       |
   29|       |use serde::{Deserialize, Serialize};
   30|       |use std::collections::HashSet;
   31|       |
   32|       |/// Computes the trigram-Jaccard similarity between two strings.
   33|       |///
   34|       |/// The score is `|A ∩ B| / |A ∪ B|` where `A` and `B` are the sets of
   35|       |/// character-trigrams extracted from each input. The trigrams are taken
   36|       |/// over Unicode scalar values via `chars()`, so the function is
   37|       |/// safe to call on multi-byte UTF-8 inputs without byte-boundary errors.
   38|       |///
   39|       |/// # Edge cases
   40|       |///
   41|       |/// - Both inputs empty: returns `1.0` (the empty trigram set is trivially
   42|       |///   contained in itself).
   43|       |/// - One input empty, the other non-empty: returns `0.0` (no overlap).
   44|       |/// - Identical inputs: returns `1.0`.
   45|       |///
   46|       |/// The function is pure: no I/O, no allocation beyond the two trigram
   47|       |/// sets and their temporary `Vec<char>` buffers, and deterministic for
   48|       |/// a given pair of inputs. It is safe to call in hot paths.
   49|     11|pub fn jaccard_similarity(a: &str, b: &str) -> f64 {
   50|     11|    let set_a = trigrams(a);
   51|     11|    let set_b = trigrams(b);
   52|     11|    if set_a.is_empty() && set_b.is_empty() {
                                         ^2    ^2
   53|      1|        return 1.0;
   54|     10|    }
   55|     10|    let intersection = set_a.intersection(&set_b).count() as f64;
   56|     10|    let union = set_a.union(&set_b).count() as f64;
   57|     10|    if union == 0.0 {
   58|      0|        0.0
   59|       |    } else {
   60|     10|        intersection / union
   61|       |    }
   62|     11|}
   63|       |
   64|       |/// Extracts the set of character-trigrams from a string.
   65|       |///
   66|       |/// Padding handles short strings: every character `c` yields one window
   67|       |/// `[prev, c, next]`, with `'\0'` standing in for a missing neighbour
   68|       |/// (a lone `c` becomes `['\0', c, '\0']`), which guarantees that two identical
   69|       |/// short strings still produce the same trigram set and score `1.0`.
   70|     22|fn trigrams(input: &str) -> HashSet<[char; 3]> {
   71|     22|    let chars: Vec<char> = input.chars().collect();
   72|     22|    if chars.is_empty() {
   73|      4|        return HashSet::new();
   74|     18|    }
   75|     18|    let mut out: HashSet<[char; 3]> = HashSet::with_capacity(chars.len().saturating_add(2));
   76|     18|    let mut window: [char; 3] = ['\0', '\0', '\0'];
   77|    339|    for (i, ch) in chars.iter().enumerate() {
                                 ^18          ^18
   78|    339|        window[0] = if i >= 1 { chars[i - 1] } else { '\0' };
                                              ^321                  ^18
   79|    339|        window[1] = *ch;
   80|    339|        window[2] = if i + 1 < chars.len() {
   81|    321|            chars[i + 1]
   82|       |        } else {
   83|     18|            '\0'
   84|       |        };
   85|    339|        out.insert(window);
   86|       |    }
   87|     18|    out
   88|     22|}
   89|       |
   90|       |/// Outcome of a preservation evaluation against a configurable threshold.
   91|       |///
   92|       |/// `PreservationVerdict` is the wire type the enrich pipeline emits in its
   93|       |/// NDJSON stream: every body-enrich attempt ends in one of the three
   94|       |/// variants so callers can route the result without re-running the
   95|       |/// similarity computation.
   96|       |#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
   97|       |#[serde(tag = "verdict", rename_all = "snake_case")]
   98|       |pub enum PreservationVerdict {
   99|       |    /// The rewritten body is at least `threshold`-similar to the original.
  100|       |    Preserved { score: f64, threshold: f64 },
  101|       |    /// The rewritten body diverges too much from the original and was
  102|       |    /// rejected by the gate.
  103|       |    Rejected { score: f64, threshold: f64 },
  104|       |    /// The original and rewritten bodies are byte-equal (no rewrite was
  105|       |    /// needed); preserved by definition.
  106|       |    Unchanged { byte_len: usize },
  107|       |}
  108|       |
  109|       |impl PreservationVerdict {
  110|       |    /// Evaluates the gate against `threshold` and returns the matching
  111|       |    /// variant. The threshold is clamped to `[0.0, 1.0]` defensively; an
  112|       |    /// out-of-range value does not panic the caller.
  113|      6|    pub fn evaluate(original: &str, rewritten: &str, threshold: f64) -> Self {
  114|      6|        let threshold = threshold.clamp(0.0, 1.0);
  115|      6|        if original == rewritten {
  116|      2|            return Self::Unchanged {
  117|      2|                byte_len: original.len(),
  118|      2|            };
  119|      4|        }
  120|      4|        let score = jaccard_similarity(original, rewritten);
  121|      4|        if score >= threshold {
  122|      3|            Self::Preserved { score, threshold }
  123|       |        } else {
  124|      1|            Self::Rejected { score, threshold }
  125|       |        }
  126|      6|    }
  127|       |
  128|       |    /// Returns `true` when the gate accepted the rewrite.
  129|      6|    pub fn is_accepted(&self) -> bool {
  130|      6|        matches!(self, Self::Preserved { .. } | Self::Unchanged { .. })
                      ^1
  131|      6|    }
  132|       |}
  133|       |
  134|       |#[cfg(test)]
  135|       |mod tests {
  136|       |    use super::*;
  137|       |
  138|       |    #[test]
  139|      1|    fn identical_strings_score_one() {
  140|      1|        let s = "the quick brown fox jumps over the lazy dog";
  141|      1|        assert!((jaccard_similarity(s, s) - 1.0).abs() < f64::EPSILON);
  142|      1|    }
  143|       |
  144|       |    #[test]
  145|      1|    fn completely_different_strings_score_zero_or_near_zero() {
  146|      1|        let a = "aaaaaaaaaa";
  147|      1|        let b = "zzzzzzzzzz";
  148|      1|        assert!(jaccard_similarity(a, b) < 0.05);
  149|      1|    }
  150|       |
  151|       |    #[test]
  152|      1|    fn partial_overlap_scores_between_zero_and_one() {
  153|      1|        let a = "the quick brown fox jumps";
  154|      1|        let b = "the slow brown cat sleeps";
  155|      1|        let score = jaccard_similarity(a, b);
  156|      1|        assert!(score > 0.0 && score < 1.0, "got {score}");
                                                          ^0
  157|      1|    }
  158|       |
  159|       |    #[test]
  160|      1|    fn both_empty_score_one() {
  161|      1|        assert!((jaccard_similarity("", "") - 1.0).abs() < f64::EPSILON);
  162|      1|    }
  163|       |
  164|       |    #[test]
  165|      1|    fn one_empty_scores_zero() {
  166|      1|        assert!(jaccard_similarity("hello", "").abs() < f64::EPSILON);
  167|      1|        assert!(jaccard_similarity("", "hello").abs() < f64::EPSILON);
  168|      1|    }
  169|       |
  170|       |    #[test]
  171|      1|    fn unicode_strings_do_not_panic() {
  172|       |        // Multi-byte UTF-8: 1 char each, very short.
  173|      1|        let a = "ç日本語";
  174|      1|        let b = "ç中文";
  175|      1|        let _ = jaccard_similarity(a, b);
  176|      1|    }
  177|       |
  178|       |    #[test]
  179|      1|    fn verdict_preserved_when_above_threshold() {
  180|      1|        let v = PreservationVerdict::evaluate("hello world", "hello world!", 0.5);
  181|      1|        assert!(v.is_accepted());
  182|      1|        assert!(matches!(v, PreservationVerdict::Preserved { .. }));
                              ^0
  183|      1|    }
  184|       |
  185|       |    #[test]
  186|      1|    fn verdict_unchanged_for_identical() {
  187|      1|        let v = PreservationVerdict::evaluate("same", "same", 0.9);
  188|      1|        assert!(v.is_accepted());
  189|      1|        assert!(matches!(v, PreservationVerdict::Unchanged { byte_len: 4 }));
                              ^0
  190|      1|    }
  191|       |
  192|       |    #[test]
  193|      1|    fn threshold_clamped_out_of_range() {
  194|       |        // Threshold above 1.0 is clamped to 1.0: identical bodies match
  195|       |        // by the `Unchanged` short-circuit, accepted.
  196|      1|        let v = PreservationVerdict::evaluate("abc", "abc", 99.0);
  197|      1|        assert!(v.is_accepted());
  198|       |        // Threshold below 0.0 is clamped to 0.0: every non-empty rewrite
  199|       |        // meets a 0.0 floor and is accepted. This is the documented
  200|       |        // behaviour of `clamp(0.0, 1.0)` and is the only sane reading
  201|       |        // once a negative threshold is no longer in scope.
  202|      1|        let v = PreservationVerdict::evaluate("abc", "xyz", -5.0);
  203|      1|        assert!(v.is_accepted());
  204|       |        // Threshold of exactly 0.0 accepts every rewrite, because any
  205|       |        // score is >= 0.0; a single-character append passes trivially.
  206|      1|        let v = PreservationVerdict::evaluate("abc", "abcd", 0.0);
  207|      1|        assert!(
  208|      1|            v.is_accepted(),
  209|      0|            "single-char append is mostly the same body"
  210|       |        );
  211|      1|    }
  212|       |
  213|       |    #[test]
  214|      1|    fn g29_repro_evaluates_rejected_when_diverges() {
  215|       |        // G29 reproducer: LLM rewrites a body and drifts far from source.
  216|      1|        let original = "JWT token rotation strategy with 15-min expiry and refresh flow";
  217|      1|        let drifted = "The weather in Tokyo is sunny today with mild temperatures expected";
  218|      1|        let v = PreservationVerdict::evaluate(original, drifted, 0.7);
  219|      1|        assert!(!v.is_accepted(), "should reject hallucinated rewrite");
                                                ^0
  220|      1|    }
  221|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/reaper.rs:
    1|       |//! G28: Reaper for orphan external processes.
    2|       |//!
    3|       |//! When the CLI crashes or is killed (SIGKILL, OOM, machine reset), child
    4|       |//! processes spawned by `claude -p` or `codex exec` may be left running.
    5|       |//! Without cleanup they accumulate as orphans that consume CPU and RAM
    6|       |//! and keep MCP-spawned subprocess trees alive (the 2026-06-03 incident:
    7|       |//! 1,877 processes total, load average 276 on a 10-CPU host).
    8|       |//!
    9|       |//! [`scan_and_kill_orphans`] walks the process table at startup and
   10|       |//! terminates any `claude` or `codex` invocation whose `PPID` is `1`
   11|       |//! (reparented to `init`/`launchd` after the parent died) and that is
   12|       |//! older than the `ORPHAN_MIN_AGE_SECS` constant. The scan is conservative: it only
   13|       |//! kills processes that (a) match a known LLM CLI name, AND (b) are
   14|       |//! orphaned, AND (c) are older than the threshold. A short-lived CLI
   15|       |//! that is just starting up is left alone.
   16|       |
   17|       |use std::time::Duration;
   18|       |
   19|       |const ORPHAN_MIN_AGE_SECS: u64 = 60;
   20|       |const ORPHAN_SCAN_TARGETS: &[&str] = &["claude", "codex"];
   21|       |
   22|       |#[derive(Debug, Clone, Copy, PartialEq, Eq)]
   23|       |pub struct ReaperReport {
   24|       |    /// Number of orphan processes detected.
   25|       |    pub found: usize,
   26|       |    /// Number of orphan processes successfully terminated.
   27|       |    pub killed: usize,
   28|       |    /// Number that we could not terminate (permission, ESRCH, etc).
   29|       |    pub failed: usize,
   30|       |    /// Elapsed wall time of the scan.
   31|       |    pub elapsed_ms: u64,
   32|       |}
   33|       |
   34|       |/// Walks the process table and kills orphan LLM invocations.
   35|       |///
   36|       |/// The scan is best-effort and never panics: on any unexpected error it
   37|       |/// logs the failure and returns a report with `killed = 0`.
   38|      1|pub fn scan_and_kill_orphans() -> ReaperReport {
   39|      1|    let start = std::time::Instant::now();
   40|      1|    let mut report = ReaperReport {
   41|      1|        found: 0,
   42|      1|        killed: 0,
   43|      1|        failed: 0,
   44|      1|        elapsed_ms: 0,
   45|      1|    };
   46|       |
   47|       |    #[cfg(unix)]
   48|       |    {
   49|      1|        if let Err(e) = scan_unix(&mut report) {
   50|      1|            tracing::warn!(target: "reaper", error = %e, "orphan scan failed");
                                                                       ^0
   51|      0|        }
   52|       |    }
   53|       |
   54|       |    #[cfg(not(unix))]
   55|       |    {
   56|       |        tracing::debug!(target: "reaper", "orphan scan is a no-op on non-Unix platforms");
   57|       |    }
   58|       |
   59|      1|    report.elapsed_ms = start.elapsed().as_millis() as u64;
   60|      1|    if report.killed > 0 {
   61|      0|        tracing::warn!(
   62|       |            target: "reaper",
   63|       |            found = report.found,
   64|       |            killed = report.killed,
   65|       |            failed = report.failed,
   66|      0|            "reaped orphan LLM subprocesses"
   67|       |        );
   68|       |    } else {
   69|      1|        tracing::info!(target: "reaper", found = report.found, "no orphan LLM subprocesses detected");
                                                                             ^0
   70|       |    }
   71|      1|    report
   72|      1|}
   73|       |
   74|       |#[cfg(unix)]
   75|      1|fn scan_unix(report: &mut ReaperReport) -> std::io::Result<()> {
   76|       |    use std::fs;
   77|       |    use std::path::Path;
   78|       |
   79|      1|    let proc = Path::new("/proc");
   80|      1|    let entries = fs::read_dir(proc)?;
                      ^0
   81|      0|    for entry in entries.flatten() {
   82|      0|        let name = entry.file_name();
   83|      0|        let Some(name_str) = name.to_str() else {
   84|      0|            continue;
   85|       |        };
   86|      0|        if !name_str.chars().all(|c| c.is_ascii_digit()) {
   87|      0|            continue;
   88|      0|        }
   89|      0|        let pid: i32 = match name_str.parse() {
   90|      0|            Ok(p) => p,
   91|      0|            Err(_) => continue,
   92|       |        };
   93|      0|        if pid == std::process::id() as i32 {
   94|      0|            continue;
   95|      0|        }
   96|       |
   97|      0|        let stat_path = entry.path().join("stat");
   98|      0|        let stat = match fs::read_to_string(&stat_path) {
   99|      0|            Ok(s) => s,
  100|      0|            Err(_) => continue,
  101|       |        };
  102|       |
  103|       |        // /proc/[pid]/stat has the form: `pid (comm) state ppid ...`
  104|       |        // The comm field can contain spaces and parens; the last `)`
  105|       |        // separates the comm from the rest.
  106|      0|        let Some(close_paren) = stat.rfind(')') else {
  107|      0|            continue;
  108|       |        };
  109|      0|        let after = &stat[close_paren + 1..];
  110|      0|        let mut parts = after.split_whitespace();
  111|       |        // parts[0] = state (e.g. "R"), parts[1] = ppid, parts[2] = pgrp, ...
  112|      0|        let state = parts.next().unwrap_or("");
  113|      0|        let ppid: i32 = parts.next().and_then(|p| p.parse().ok()).unwrap_or(-1);
  114|       |
  115|       |        // Only target processes orphaned to init (PPID 1 on Linux/Unix
  116|       |        // when the parent is gone); any other parent PID is skipped.
  117|      0|        if ppid != 1 {
  118|      0|            continue;
  119|      0|        }
  120|       |
  121|       |        // Skip zombies (state Z) — they need no kill.
  122|      0|        if state.starts_with('Z') {
  123|      0|            continue;
  124|      0|        }
  125|       |
  126|       |        // Resolve the comm field. proc/[pid]/comm is the short program
  127|       |        // name (no path); we use it instead of parsing the bracketed
  128|       |        // comm from stat to avoid encoding edge cases.
  129|      0|        let comm_path = entry.path().join("comm");
  130|      0|        let comm = match fs::read_to_string(&comm_path) {
  131|      0|            Ok(s) => s.trim().to_string(),
  132|      0|            Err(_) => continue,
  133|       |        };
  134|       |
  135|      0|        if !ORPHAN_SCAN_TARGETS.iter().any(|t| comm == *t) {
  136|      0|            continue;
  137|      0|        }
  138|       |
  139|       |        // Age check: skip processes that just spawned (under 60s old) so
  140|       |        // we never race with a concurrent CLI invocation.
  141|      0|        let age_ok = check_process_age(pid, ORPHAN_MIN_AGE_SECS);
  142|      0|        if !age_ok {
  143|      0|            continue;
  144|      0|        }
  145|       |
  146|      0|        report.found += 1;
  147|      0|        match terminate_pid(pid) {
  148|       |            Ok(()) => {
  149|      0|                report.killed += 1;
  150|      0|                tracing::info!(target: "reaper", pid, comm = %comm, "killed orphan LLM subprocess");
  151|       |            }
  152|      0|            Err(e) => {
  153|      0|                report.failed += 1;
  154|      0|                tracing::warn!(target: "reaper", pid, comm = %comm, error = %e, "failed to kill orphan");
  155|       |            }
  156|       |        }
  157|       |    }
  158|      0|    Ok(())
  159|      1|}
  160|       |
  161|       |#[cfg(unix)]
  162|      0|fn check_process_age(pid: i32, min_age_secs: u64) -> bool {
  163|       |    use std::fs;
  164|       |    // /proc/[pid]/stat field 22 is start_time in clock ticks since boot.
  165|       |    // We instead use the simpler heuristic: stat file mtime.
  166|      0|    let stat_path = std::path::Path::new("/proc")
  167|      0|        .join(pid.to_string())
  168|      0|        .join("stat");
  169|      0|    let Ok(meta) = fs::metadata(&stat_path) else {
  170|      0|        return false;
  171|       |    };
  172|      0|    let Ok(modified) = meta.modified() else {
  173|      0|        return false;
  174|       |    };
  175|      0|    let Ok(elapsed) = std::time::SystemTime::now().duration_since(modified) else {
  176|      0|        return false;
  177|       |    };
  178|      0|    elapsed >= Duration::from_secs(min_age_secs)
  179|      0|}
  180|       |
  181|       |#[cfg(unix)]
  182|      0|fn terminate_pid(pid: i32) -> std::io::Result<()> {
  183|       |    // SIGTERM first; if the process ignores it for >2s, the caller can
  184|       |    // escalate to SIGKILL. For the reaper we send TERM and return; a
  185|       |    // follow-up sweep can send KILL if needed.
  186|      0|    let rc = unsafe { libc::kill(pid, libc::SIGTERM) };
  187|      0|    if rc == 0 {
  188|      0|        Ok(())
  189|       |    } else {
  190|      0|        Err(std::io::Error::last_os_error())
  191|       |    }
  192|      0|}
  193|       |
  194|       |#[cfg(test)]
  195|       |mod tests {
  196|       |    use super::*;
  197|       |
  198|       |    #[test]
  199|      1|    fn reaper_report_starts_zeroed() {
  200|      1|        let r = ReaperReport {
  201|      1|            found: 0,
  202|      1|            killed: 0,
  203|      1|            failed: 0,
  204|      1|            elapsed_ms: 0,
  205|      1|        };
  206|      1|        assert_eq!(r.found, 0);
  207|      1|        assert_eq!(r.killed, 0);
  208|      1|        assert_eq!(r.failed, 0);
  209|      1|    }
  210|       |
  211|       |    #[test]
  212|      1|    fn orphan_min_age_is_one_minute() {
  213|       |        // G28: the threshold of 60s is the safety margin that prevents
  214|       |        // a CLI invocation from killing a concurrent peer that just
  215|       |        // started 5s ago.
  216|      1|        assert_eq!(ORPHAN_MIN_AGE_SECS, 60);
  217|      1|    }
  218|       |
  219|       |    #[test]
  220|      1|    fn orphan_targets_include_claude_and_codex() {
  221|      1|        assert!(ORPHAN_SCAN_TARGETS.contains(&"claude"));
  222|      1|        assert!(ORPHAN_SCAN_TARGETS.contains(&"codex"));
  223|      1|    }
  224|       |
  225|       |    #[test]
  226|      1|    fn scan_completes_without_panic_on_linux() {
  227|       |        // Just ensure the function returns a ReaperReport on the test
  228|       |        // host. On Linux CI we may be PID 1 in containers; the report
  229|       |        // will simply have found=0.
  230|      1|        let r = scan_and_kill_orphans();
  231|      1|        assert!(r.elapsed_ms < 30_000, "scan must finish in <30s");
                                                     ^0
  232|      1|    }
  233|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/retry.rs:
    1|       |//! Centralized retry infrastructure with exponential backoff and half-jitter.
    2|       |//!
    3|       |//! Provides [`RetryConfig`](crate::retry::RetryConfig) with named constructors for each failure domain
    4|       |//! (SQLite BUSY, LLM rate-limit, cold-start) and a [`compute_delay`](crate::retry::compute_delay) function
    5|       |//! that applies the configured jitter strategy.
    6|       |
    7|       |use std::time::{Duration, Instant};
    8|       |
    9|       |/// Configures retry behavior for a specific failure domain.
   10|       |///
   11|       |/// Use the named constructors ([`Self::sqlite_busy`], [`Self::llm_rate_limit`],
   12|       |/// [`Self::cold_start`]) for pre-tuned policies. All timing values are in
   13|       |/// milliseconds except `max_elapsed_secs` which is in seconds.
   14|       |#[derive(Debug, Clone)]
   15|       |pub struct RetryConfig {
   16|       |    /// Base delay for the first retry attempt (ms).
   17|       |    pub initial_delay_ms: u64,
   18|       |    /// Upper bound on any single delay (ms).
   19|       |    pub max_delay_ms: u64,
   20|       |    /// Multiplicative factor applied per attempt.
   21|       |    pub multiplier: u64,
   22|       |    /// Hard cap on total attempts (0 = unlimited, use deadline).
   23|       |    pub max_attempts: u32,
   24|       |    /// Total elapsed wall-clock time before giving up (seconds).
   25|       |    pub max_elapsed_secs: u64,
   26|       |    /// Jitter strategy applied to computed delays.
   27|       |    pub jitter: JitterKind,
   28|       |}
   29|       |
   30|       |/// Jitter strategy for randomizing retry delays.
   31|       |#[derive(Debug, Clone, Copy, PartialEq, Eq)]
   32|       |pub enum JitterKind {
   33|       |    /// No randomization — deterministic delay.
   34|       |    None,
   35|       |    /// Half-jitter: delay in [base/2, base). Guarantees minimum wait.
   36|       |    Half,
   37|       |    /// Full-jitter: delay in [0, base). Maximum spread.
   38|       |    Full,
   39|       |}
   40|       |
   41|       |impl RetryConfig {
   42|       |    /// SQLite BUSY retry: 5 attempts, 300ms base, half-jitter, 30s deadline.
   43|      1|    pub fn sqlite_busy() -> Self {
   44|      1|        Self {
   45|      1|            initial_delay_ms: 300,
   46|      1|            max_delay_ms: 4800,
   47|      1|            multiplier: 2,
   48|      1|            max_attempts: 5,
   49|      1|            max_elapsed_secs: 30,
   50|      1|            jitter: JitterKind::Half,
   51|      1|        }
   52|      1|    }
   53|       |
   54|       |    /// LLM rate-limit retry: 60s base, 900s cap, half-jitter, 1h deadline.
   55|      2|    pub fn llm_rate_limit() -> Self {
   56|      2|        Self {
   57|      2|            initial_delay_ms: 60_000,
   58|      2|            max_delay_ms: 900_000,
   59|      2|            multiplier: 2,
   60|      2|            max_attempts: 20,
   61|      2|            max_elapsed_secs: 3600,
   62|      2|            jitter: JitterKind::Half,
   63|      2|        }
   64|      2|    }
   65|       |
   66|       |    /// Cold-start retry: 2s base, 2 attempts, no jitter, 30s deadline.
   67|      1|    pub fn cold_start() -> Self {
   68|      1|        Self {
   69|      1|            initial_delay_ms: 2000,
   70|      1|            max_delay_ms: 4000,
   71|      1|            multiplier: 2,
   72|      1|            max_attempts: 2,
   73|      1|            max_elapsed_secs: 30,
   74|      1|            jitter: JitterKind::None,
   75|      1|        }
   76|      1|    }
   77|       |}
   78|       |
   79|       |/// Computes the delay for a given attempt using the config's jitter strategy.
   80|       |///
   81|       |/// # Formula
   82|       |///
   83|       |/// ```text
   84|       |/// base = min(initial_delay_ms * multiplier^attempt, max_delay_ms)
   85|       |/// delay = apply_jitter(base, jitter_kind)
   86|       |/// ```
   87|    902|pub fn compute_delay(config: &RetryConfig, attempt: u32) -> Duration {
   88|    902|    let base = config
   89|    902|        .initial_delay_ms
   90|    902|        .saturating_mul(config.multiplier.saturating_pow(attempt))
   91|    902|        .min(config.max_delay_ms);
   92|       |
   93|    902|    let delay_ms = match config.jitter {
   94|      2|        JitterKind::None => base,
   95|       |        JitterKind::Half => {
   96|    500|            let half = base / 2;
   97|    500|            if half == 0 {
   98|      0|                base
   99|       |            } else {
  100|    500|                half + fastrand::u64(0..half)
  101|       |            }
  102|       |        }
  103|       |        JitterKind::Full => {
  104|    400|            if base == 0 {
  105|      0|                0
  106|       |            } else {
  107|    400|                fastrand::u64(0..base)
  108|       |            }
  109|       |        }
  110|       |    };
  111|       |
  112|    902|    Duration::from_millis(delay_ms)
  113|    902|}
  114|       |
  115|       |/// Returns `true` if the env var `SQLITE_GRAPHRAG_DISABLE_RETRY` is set to `1`.
  116|       |///
  117|       |/// When active, all retry loops should propagate the error immediately without
  118|       |/// sleeping. Use during incidents to prevent retry storms.
  119|      8|pub fn is_kill_switch_active() -> bool {
  120|      8|    std::env::var("SQLITE_GRAPHRAG_DISABLE_RETRY").is_ok_and(|v| v == "1")
                                                                               ^0   ^0
  121|      8|}
  122|       |
  123|       |#[cfg(test)]
  124|       |mod tests {
  125|       |    use super::*;
  126|       |
  127|       |    #[test]
  128|      1|    fn compute_delay_half_jitter_in_bounds() {
  129|      1|        let cfg = RetryConfig::llm_rate_limit();
  130|      6|        for attempt in 0..5 {
                          ^5
  131|    505|            for _ in 0..100 {
  132|    500|                let d = compute_delay(&cfg, attempt);
  133|    500|                let base = cfg
  134|    500|                    .initial_delay_ms
  135|    500|                    .saturating_mul(cfg.multiplier.saturating_pow(attempt))
  136|    500|                    .min(cfg.max_delay_ms);
  137|    500|                let half = base / 2;
  138|    500|                assert!(d.as_millis() >= half as u128);
  139|    500|                assert!(d.as_millis() < base as u128);
  140|       |            }
  141|       |        }
  142|      1|    }
  143|       |
  144|       |    #[test]
  145|      1|    fn compute_delay_no_jitter_is_deterministic() {
  146|      1|        let cfg = RetryConfig::cold_start();
  147|      1|        let d1 = compute_delay(&cfg, 0);
  148|      1|        let d2 = compute_delay(&cfg, 0);
  149|      1|        assert_eq!(d1, d2);
  150|      1|        assert_eq!(d1, Duration::from_millis(2000));
  151|      1|    }
  152|       |
  153|       |    #[test]
  154|      1|    fn kill_switch_inactive_by_default() {
  155|      1|        std::env::remove_var("SQLITE_GRAPHRAG_DISABLE_RETRY");
  156|      1|        assert!(!is_kill_switch_active());
  157|      1|    }
  158|       |
  159|       |    #[test]
  160|      1|    fn sqlite_busy_config_matches_constants() {
  161|      1|        let cfg = RetryConfig::sqlite_busy();
  162|      1|        assert_eq!(cfg.initial_delay_ms, 300);
  163|      1|        assert_eq!(cfg.max_attempts, 5);
  164|      1|        assert_eq!(cfg.max_elapsed_secs, 30);
  165|      1|    }
  166|       |
  167|       |    #[test]
  168|      1|    fn llm_rate_limit_has_deadline() {
  169|      1|        let cfg = RetryConfig::llm_rate_limit();
  170|      1|        assert_eq!(cfg.max_elapsed_secs, 3600);
  171|      1|        assert_eq!(cfg.max_delay_ms, 900_000);
  172|      1|    }
  173|       |
  174|       |    #[test]
  175|      1|    fn full_jitter_stays_below_base() {
  176|      1|        let cfg = RetryConfig {
  177|      1|            initial_delay_ms: 1000,
  178|      1|            max_delay_ms: 10_000,
  179|      1|            multiplier: 2,
  180|      1|            max_attempts: 5,
  181|      1|            max_elapsed_secs: 60,
  182|      1|            jitter: JitterKind::Full,
  183|      1|        };
  184|      5|        for attempt in 0..4 {
                          ^4
  185|    404|            for _ in 0..100 {
  186|    400|                let d = compute_delay(&cfg, attempt);
  187|    400|                let base = cfg
  188|    400|                    .initial_delay_ms
  189|    400|                    .saturating_mul(cfg.multiplier.saturating_pow(attempt))
  190|    400|                    .min(cfg.max_delay_ms);
  191|    400|                assert!(d.as_millis() < base as u128);
  192|       |            }
  193|       |        }
  194|      1|    }
  195|       |}
  196|       |
  197|       |// ---------------------------------------------------------------------------
  198|       |// Circuit Breaker (G28-D, v1.0.68)
  199|       |// ---------------------------------------------------------------------------
  200|       |
  201|       |/// Outcome of a single retry attempt, used to feed a [`CircuitBreaker`].
  202|       |///
  203|       |/// We keep this intentionally narrow: rate-limit / timeout errors are
  204|       |/// TRANSIENT and should NOT count toward the breaker; everything else
  205|       |/// counts as a HARD failure that contributes to opening the breaker.
  206|       |#[derive(Debug, Clone, Copy, PartialEq, Eq)]
  207|       |pub enum AttemptOutcome {
  208|       |    /// Transient error: counts as a successful iteration, does NOT trip the breaker.
  209|       |    /// Examples: `AppError::RateLimited`, `AppError::Timeout`, `AppError::DbBusy`.
  210|       |    Transient,
  211|       |    /// Hard failure: counts toward the breaker's failure threshold.
  212|       |    /// Examples: `AppError::Validation`, `AppError::Conflict`,
  213|       |    /// `AppError::Embedding`, `AppError::Internal`.
  214|       |    HardFailure,
  215|       |    /// Successful iteration: resets the consecutive-failure counter.
  216|       |    Success,
  217|       |}
  218|       |
  219|       |/// Counts consecutive hard failures and trips open after a threshold.
  220|       |///
  221|       |/// G28-D (v1.0.68): caps `enrich --retry-failed` and `ingest --retry-failed`
  222|       |/// loops so persistent failures (e.g., LLM provider returning the same
  223|       |/// 4xx for hours) cannot run unbounded.  After `threshold` consecutive
  224|       |/// [`AttemptOutcome::HardFailure`] outcomes, `record` returns `true` and
  225|       |/// the caller is expected to abort with `AppError::CircuitBreakerOpen`.
  226|       |///
  227|       |/// Rate-limited / transient errors are explicitly NOT counted, so a
  228|       |/// provider that throttles but eventually recovers will not trip the
  229|       |/// breaker.
  230|       |#[derive(Debug, Clone)]
  231|       |pub struct CircuitBreaker {
  232|       |    threshold: u32,
  233|       |    cooldown: Duration,
  234|       |    consecutive_failures: u32,
  235|       |    open_until: Option<Instant>,
  236|       |}
  237|       |
  238|       |impl CircuitBreaker {
  239|       |    /// Creates a breaker that opens after `threshold` consecutive hard
  240|       |    /// failures and stays open for `cooldown` after the last failure.
  241|      3|    pub fn new(threshold: u32, cooldown: Duration) -> Self {
  242|      3|        Self {
  243|      3|            threshold,
  244|      3|            cooldown,
  245|      3|            consecutive_failures: 0,
  246|      3|            open_until: None,
  247|      3|        }
  248|      3|    }
  249|       |
  250|       |    /// Records one attempt outcome.
  251|       |    ///
  252|       |    /// Returns `true` when the breaker is now open and the caller must
  253|       |    /// abort the job.  Returns `false` when the attempt should continue.
  254|     17|    pub fn record(&mut self, outcome: AttemptOutcome) -> bool {
  255|     17|        match outcome {
  256|       |            AttemptOutcome::Success | AttemptOutcome::Transient => {
  257|     11|                self.consecutive_failures = 0;
  258|     11|                false
  259|       |            }
  260|       |            AttemptOutcome::HardFailure => {
  261|      6|                self.consecutive_failures = self.consecutive_failures.saturating_add(1);
  262|      6|                if self.consecutive_failures >= self.threshold.max(1) {
  263|      1|                    self.open_until = Some(Instant::now() + self.cooldown);
  264|      1|                    tracing::error!(
  265|       |                        target: "circuit_breaker",
  266|       |                        consecutive_failures = self.consecutive_failures,
  267|       |                        threshold = self.threshold,
  268|      0|                        cooldown_secs = self.cooldown.as_secs(),
  269|      0|                        "circuit breaker opened — aborting job"
  270|       |                    );
  271|      1|                    true
  272|       |                } else {
  273|      5|                    false
  274|       |                }
  275|       |            }
  276|       |        }
  277|     17|    }
  278|       |
  279|       |    /// `true` when the breaker is currently open (and not yet cooled down).
  280|      3|    pub fn is_open(&self) -> bool {
  281|      3|        self.open_until
  282|      3|            .map(|deadline| Instant::now() < deadline)
                                          ^1               ^1
  283|      3|            .unwrap_or(false)
  284|      3|    }
  285|       |
  286|       |    /// Resets the breaker to closed state.
  287|      0|    pub fn reset(&mut self) {
  288|      0|        self.consecutive_failures = 0;
  289|      0|        self.open_until = None;
  290|      0|    }
  291|       |
  292|       |    /// Returns the number of consecutive HardFailure outcomes observed
  293|       |    /// since the last success or reset. Public so callers can include
  294|       |    /// the value in their abort log line.
  295|      0|    pub fn consecutive_failures(&self) -> u32 {
  296|      0|        self.consecutive_failures
  297|      0|    }
  298|       |}
  299|       |
  300|       |#[cfg(test)]
  301|       |mod circuit_breaker_tests {
  302|       |    use super::*;
  303|       |
  304|       |    #[test]
  305|      1|    fn opens_after_threshold_consecutive_hard_failures() {
  306|      1|        let mut cb = CircuitBreaker::new(3, Duration::from_secs(60));
  307|      1|        assert!(!cb.record(AttemptOutcome::HardFailure));
  308|      1|        assert!(!cb.record(AttemptOutcome::HardFailure));
  309|      1|        assert!(cb.record(AttemptOutcome::HardFailure));
  310|      1|        assert!(cb.is_open());
  311|      1|    }
  312|       |
  313|       |    #[test]
  314|      1|    fn ignores_transient_errors() {
  315|      1|        let mut cb = CircuitBreaker::new(2, Duration::from_secs(60));
  316|       |        // 10 transients in a row should never open the breaker.
  317|     11|        for _ in 0..10 {
  318|     10|            assert!(!cb.record(AttemptOutcome::Transient));
  319|       |        }
  320|      1|        assert!(!cb.is_open());
  321|      1|    }
  322|       |
  323|       |    #[test]
  324|      1|    fn success_resets_consecutive_failures() {
  325|      1|        let mut cb = CircuitBreaker::new(3, Duration::from_secs(60));
  326|      1|        cb.record(AttemptOutcome::HardFailure);
  327|      1|        cb.record(AttemptOutcome::HardFailure);
  328|      1|        cb.record(AttemptOutcome::Success);
  329|      1|        assert!(!cb.record(AttemptOutcome::HardFailure));
  330|      1|        assert!(!cb.is_open());
  331|      1|    }
  332|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/signals.rs:
    1|       |//! Cross-platform signal handling: SIGINT, SIGTERM, SIGHUP.
    2|       |
    3|       |use std::sync::atomic::Ordering;
    4|       |
    5|       |/// Registers the global shutdown handler for Ctrl+C / SIGTERM / SIGHUP.
    6|       |///
    7|       |/// First signal: sets [`SHUTDOWN`](crate::SHUTDOWN) flag, cancels the global
    8|       |/// cancellation token, logs graceful shutdown intent.
    9|       |///
   10|       |/// Second signal: calls [`std::process::exit(130)`] for immediate termination
   11|       |/// following Unix convention (128 + SIGINT=2).
   12|      0|pub fn register_shutdown_handler() {
   13|      0|    if let Err(e) = ctrlc::set_handler(move || {
   14|      0|        let prev = crate::SIGNAL_COUNT.fetch_add(1, Ordering::AcqRel);
   15|      0|        if prev == 0 {
   16|      0|            crate::SHUTDOWN.store(true, Ordering::Release);
   17|      0|            crate::SIGNAL_NUMBER.store(2, Ordering::Release);
   18|      0|            crate::cancel_token().cancel();
   19|      0|            tracing::warn!(
   20|       |                target: "signals",
   21|      0|                "shutdown signal received; finishing current operation gracefully"
   22|       |            );
   23|       |        } else {
   24|      0|            eprintln!("\nForced shutdown (second signal received). Exiting immediately.");
   25|      0|            std::process::exit(130);
   26|       |        }
   27|      0|    }) {
   28|      0|        tracing::warn!(target: "signals", error = %e, "signal handler registration failed");
   29|      0|    }
   30|      0|}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/stdin_helper.rs:
    1|       |//! Stdin reader with timeout to prevent indefinite blocking when the
    2|       |//! upstream pipe is held open without sending data.
    3|       |//!
    4|       |//! Used by `remember --body-stdin` and `edit` body input to enforce a
    5|       |//! deadline (default 60s). When the timeout fires, the spawned reader
    6|       |//! thread is leaked because `std::io::stdin()` cannot be cancelled
    7|       |//! from outside; this is acceptable in error scenarios because the
    8|       |//! process is about to exit anyway.
    9|       |//!
   10|       |//! When stdin is attached to a terminal (interactive TTY), the function
   11|       |//! returns an `AppError::Internal` immediately with an actionable message
   12|       |//! instead of blocking for up to `secs` seconds waiting for EOF.
   13|       |
   14|       |use crate::errors::AppError;
   15|       |use std::io::{IsTerminal, Read};
   16|       |use std::sync::mpsc;
   17|       |use std::thread;
   18|       |use std::time::Duration;
   19|       |
   20|       |/// Reads stdin to a `String` with a hard deadline.
   21|       |///
   22|       |/// Returns `AppError::Internal` immediately when stdin is attached to a
   23|       |/// terminal (TTY) — the caller must redirect data via a pipe or file.
   24|       |///
   25|       |/// # Errors
   26|       |/// Returns `AppError::Internal` when stdin is a TTY, when the read does
   27|       |/// not finish within `secs` seconds, or `AppError::Io` when the
   28|       |/// underlying read fails.
   29|      1|pub fn read_stdin_with_timeout(secs: u64) -> Result<String, AppError> {
   30|      1|    if std::io::stdin().is_terminal() {
   31|      0|        return Err(AppError::Internal(anyhow::anyhow!(
   32|      0|            "stdin is attached to a terminal; pipe data via stdin \
   33|      0|             (e.g. `echo ... | sqlite-graphrag ...` or `... < file`) \
   34|      0|             or use --body instead of --body-stdin"
   35|      0|        )));
   36|      1|    }
   37|      1|    let (tx, rx) = mpsc::channel::<std::io::Result<String>>();
   38|      1|    thread::spawn(move || {
   39|      1|        let mut buf = String::new();
   40|      1|        let result = std::io::stdin().read_to_string(&mut buf).map(|_| buf);
   41|      1|        let _ = tx.send(result);
   42|      1|    });
   43|      1|    match rx.recv_timeout(Duration::from_secs(secs)) {
   44|      1|        Ok(Ok(buf)) => Ok(buf),
   45|      0|        Ok(Err(e)) => Err(AppError::Io(e)),
   46|      0|        Err(mpsc::RecvTimeoutError::Timeout) => Err(AppError::Internal(anyhow::anyhow!(
   47|      0|            "stdin read timed out after {secs}s; pipe must close within timeout window"
   48|      0|        ))),
   49|      0|        Err(mpsc::RecvTimeoutError::Disconnected) => Err(AppError::Internal(anyhow::anyhow!(
   50|      0|            "stdin reader thread disconnected unexpectedly"
   51|      0|        ))),
   52|       |    }
   53|      1|}
   54|       |
   55|       |#[cfg(test)]
   56|       |mod tests {
   57|       |    use super::*;
   58|       |    use std::time::Instant;
   59|       |
   60|       |    // Note: we cannot easily test the success path because tests inherit stdin
   61|       |    // from the test runner. We only assert the timeout path here.
   62|       |    #[test]
   63|      1|    fn read_stdin_with_timeout_returns_internal_error_on_timeout() {
   64|       |        // 1s is enough — stdin in test runner is typically a tty or pipe with no input.
   65|      1|        let start = Instant::now();
   66|      1|        let result = read_stdin_with_timeout(1);
   67|      1|        let elapsed = start.elapsed();
   68|       |        // We expect either a timeout (most cases), an immediate TTY error, or a
   69|       |        // successful EOF read (rare in CI environments).
   70|      0|        match result {
   71|      0|            Err(AppError::Internal(e)) => {
   72|      0|                let msg = e.to_string();
   73|       |                // Accept both the TTY-detected error and the timeout error.
   74|      0|                assert!(
   75|      0|                    msg.contains("timed out") || msg.contains("terminal"),
   76|      0|                    "unexpected internal error: {msg}"
   77|       |                );
   78|       |                // TTY path exits immediately; timeout path takes ~1s.
   79|      0|                assert!(elapsed.as_secs_f64() < 2.5);
   80|       |            }
   81|      1|            Ok(_) | Err(AppError::Io(_)) => {
   82|      1|                // EOF reached before timeout — also acceptable in CI environments.
   83|      1|            }
   84|      0|            Err(other) => unreachable!("stdin test: expected Internal/Io, got {other:?}"),
   85|       |        }
   86|      1|    }
   87|       |
   88|       |    // TTY detection cannot be simulated in unit tests because the test runner
   89|       |    // always provides a non-TTY stdin (pipe). Empirical validation:
   90|       |    //   cargo run --release -- remember --body-stdin --name h1-test
   91|       |    // Expected: exits in <2s with "stdin is attached to a terminal" message.
   92|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/storage/backend.rs:
    1|       |//! Storage backend abstraction layer (G14 — phase 1).
    2|       |//!
    3|       |//! Defines a trait that abstracts the database connection, enabling future
    4|       |//! migration from rusqlite to libSQL embedded replicas or other backends.
    5|       |//!
    6|       |//! Phase 1 scope: trait definition + SqliteBackend wrapper only.
    7|       |//! Phase 2 (v1.0.69+): migrate remaining 43 command handlers to use the trait.
    8|       |
    9|       |use rusqlite::Connection;
   10|       |
   11|       |/// Backend-agnostic storage abstraction.
   12|       |///
   13|       |/// Phase 1: wraps `rusqlite::Connection` without functional change.
   14|       |/// Phase 2: will be implemented for `libsql::Connection` with embedded replicas.
   15|       |pub trait StorageBackend {
   16|       |    /// Execute a SQL statement and return the number of affected rows.
   17|       |    fn execute_sql(
   18|       |        &self,
   19|       |        sql: &str,
   20|       |        params: &[&dyn rusqlite::types::ToSql],
   21|       |    ) -> Result<usize, crate::errors::AppError>;
   22|       |
   23|       |    /// Query a single row and map it with the provided closure.
   24|       |    fn query_one<T, F>(
   25|       |        &self,
   26|       |        sql: &str,
   27|       |        params: &[&dyn rusqlite::types::ToSql],
   28|       |        f: F,
   29|       |    ) -> Result<Option<T>, crate::errors::AppError>
   30|       |    where
   31|       |        F: FnOnce(&rusqlite::Row<'_>) -> Result<T, rusqlite::Error>;
   32|       |
   33|       |    /// Returns a reference to the underlying rusqlite Connection.
   34|       |    /// Phase 1 escape hatch — will be removed when full migration is complete.
   35|       |    fn as_connection(&self) -> &Connection;
   36|       |}
   37|       |
   38|       |/// Default implementation wrapping a rusqlite Connection.
   39|       |pub struct SqliteBackend {
   40|       |    conn: Connection,
   41|       |}
   42|       |
   43|       |impl SqliteBackend {
   44|      1|    pub fn new(conn: Connection) -> Self {
   45|      1|        Self { conn }
   46|      1|    }
   47|       |
   48|      0|    pub fn into_inner(self) -> Connection {
   49|      0|        self.conn
   50|      0|    }
   51|       |}
   52|       |
   53|       |impl StorageBackend for SqliteBackend {
   54|      1|    fn execute_sql(
   55|      1|        &self,
   56|      1|        sql: &str,
   57|      1|        params: &[&dyn rusqlite::types::ToSql],
   58|      1|    ) -> Result<usize, crate::errors::AppError> {
   59|      1|        self.conn
   60|      1|            .execute(sql, params)
   61|      1|            .map_err(crate::errors::AppError::Database)
   62|      1|    }
   63|       |
   64|      1|    fn query_one<T, F>(
   65|      1|        &self,
   66|      1|        sql: &str,
   67|      1|        params: &[&dyn rusqlite::types::ToSql],
   68|      1|        f: F,
   69|      1|    ) -> Result<Option<T>, crate::errors::AppError>
   70|      1|    where
   71|      1|        F: FnOnce(&rusqlite::Row<'_>) -> Result<T, rusqlite::Error>,
   72|       |    {
   73|      1|        match self.conn.query_row(sql, params, f) {
   74|      1|            Ok(val) => Ok(Some(val)),
   75|      0|            Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
   76|      0|            Err(e) => Err(crate::errors::AppError::Database(e)),
   77|       |        }
   78|      1|    }
   79|       |
   80|      0|    fn as_connection(&self) -> &Connection {
   81|      0|        &self.conn
   82|      0|    }
   83|       |}
   84|       |
   85|       |#[cfg(test)]
   86|       |mod tests {
   87|       |    use super::*;
   88|       |
   89|       |    #[test]
   90|      1|    fn sqlite_backend_wraps_connection() {
   91|      1|        let conn = Connection::open_in_memory().unwrap();
   92|      1|        conn.execute_batch("CREATE TABLE test (id INTEGER PRIMARY KEY, val TEXT)")
   93|      1|            .unwrap();
   94|      1|        let backend = SqliteBackend::new(conn);
   95|      1|        let affected = backend
   96|      1|            .execute_sql(
   97|      1|                "INSERT INTO test (val) VALUES (?1)",
   98|      1|                &[&"hello" as &dyn rusqlite::types::ToSql],
   99|       |            )
  100|      1|            .unwrap();
  101|      1|        assert_eq!(affected, 1);
  102|       |
  103|      1|        let result: Option<String> = backend
  104|      1|            .query_one("SELECT val FROM test WHERE id = 1", &[], |r| r.get(0))
  105|      1|            .unwrap();
  106|      1|        assert_eq!(result, Some("hello".to_string()));
  107|      1|    }
  108|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/storage/chunks.rs:
    1|       |//! Chunk storage CRUD for multi-chunk memories.
    2|       |//!
    3|       |//! Manages the `memory_chunks` table: insert embeddings for bodies that
    4|       |//! exceed the 512-token E5 limit and query chunks for vector search.
    5|       |
    6|       |// src/storage/chunks.rs
    7|       |// Chunk storage for bodies exceeding 512 tokens E5 limit
    8|       |
    9|       |use crate::embedder::f32_to_bytes;
   10|       |use crate::errors::AppError;
   11|       |use rusqlite::{params, Connection};
   12|       |
   13|       |#[derive(Debug, Clone)]
   14|       |pub struct Chunk {
   15|       |    pub memory_id: i64,
   16|       |    pub chunk_idx: i32,
   17|       |    pub chunk_text: String,
   18|       |    pub start_offset: i32,
   19|       |    pub end_offset: i32,
   20|       |    pub token_count: i32,
   21|       |}
   22|       |
   23|      6|pub fn insert_chunks(conn: &Connection, chunks: &[Chunk]) -> Result<(), AppError> {
   24|     14|    for chunk in chunks {
                      ^9
   25|      9|        conn.execute(
   26|      9|            "INSERT INTO memory_chunks (memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count)
   27|      9|             VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
   28|      9|            params![
   29|       |                chunk.memory_id,
   30|       |                chunk.chunk_idx,
   31|       |                chunk.chunk_text,
   32|       |                chunk.start_offset,
   33|       |                chunk.end_offset,
   34|       |                chunk.token_count,
   35|       |            ],
   36|      1|        )?;
   37|       |    }
   38|      5|    Ok(())
   39|      6|}
   40|       |
   41|      0|pub fn insert_chunk_slices(
   42|      0|    conn: &Connection,
   43|      0|    memory_id: i64,
   44|      0|    body: &str,
   45|      0|    chunks: &[crate::chunking::Chunk],
   46|      0|) -> Result<(), AppError> {
   47|      0|    for (chunk_idx, chunk) in chunks.iter().enumerate() {
   48|      0|        conn.execute(
   49|      0|            "INSERT INTO memory_chunks (memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count)
   50|      0|             VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
   51|      0|            params![
   52|       |                memory_id,
   53|      0|                chunk_idx as i32,
   54|      0|                crate::chunking::chunk_text(body, chunk),
   55|      0|                chunk.start_offset as i32,
   56|      0|                chunk.end_offset as i32,
   57|      0|                chunk.token_count_approx as i32,
   58|       |            ],
   59|      0|        )?;
   60|       |    }
   61|      0|    Ok(())
   62|      0|}
   63|       |
   64|      1|pub fn upsert_chunk_vec(
   65|      1|    conn: &Connection,
   66|      1|    _rowid: i64,
   67|      1|    memory_id: i64,
   68|      1|    chunk_idx: i32,
   69|      1|    embedding: &[f32],
   70|      1|) -> Result<(), AppError> {
   71|      1|    conn.execute(
   72|      1|        "INSERT OR REPLACE INTO vec_chunks(rowid, memory_id, chunk_idx, embedding)
   73|      1|         VALUES (
   74|      1|             (SELECT id FROM memory_chunks WHERE memory_id = ?1 AND chunk_idx = ?2),
   75|      1|             ?1, ?2, ?3
   76|      1|         )",
   77|      1|        params![memory_id, chunk_idx, f32_to_bytes(embedding)],
   78|      0|    )?;
   79|      1|    Ok(())
   80|      1|}
   81|       |
   82|      2|pub fn delete_chunks(conn: &Connection, memory_id: i64) -> Result<(), AppError> {
   83|      2|    conn.execute(
   84|      2|        "DELETE FROM memory_chunks WHERE memory_id = ?1",
   85|      2|        params![memory_id],
   86|      0|    )?;
   87|      2|    Ok(())
   88|      2|}
   89|       |
   90|      2|pub fn knn_search_chunks(
   91|      2|    conn: &Connection,
   92|      2|    embedding: &[f32],
   93|      2|    k: usize,
   94|      2|) -> Result<Vec<(i64, i32, f32)>, AppError> {
   95|      2|    let bytes = f32_to_bytes(embedding);
   96|      2|    let mut stmt = conn.prepare_cached(
   97|      2|        "SELECT memory_id, chunk_idx, distance FROM vec_chunks
   98|      2|         WHERE embedding MATCH ?1
   99|      2|         ORDER BY distance LIMIT ?2",
  100|      0|    )?;
  101|      2|    let rows = stmt
  102|      2|        .query_map(params![bytes, k as i64], |r| {
                                                               ^1
  103|       |            Ok((
  104|      1|                r.get::<_, i64>(0)?,
                                                ^0
  105|      1|                r.get::<_, i32>(1)?,
                                                ^0
  106|      1|                r.get::<_, f32>(2)?,
                                                ^0
  107|       |            ))
  108|      1|        })?
                        ^0
  109|      2|        .collect::<Result<Vec<_>, _>>()?;
                                                     ^0
  110|      2|    Ok(rows)
  111|      2|}
  112|       |
  113|      4|pub fn get_chunks_by_memory(conn: &Connection, memory_id: i64) -> Result<Vec<Chunk>, AppError> {
  114|      4|    let mut stmt = conn.prepare_cached(
  115|      4|        "SELECT memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count
  116|      4|         FROM memory_chunks WHERE memory_id = ?1 ORDER BY chunk_idx",
  117|      0|    )?;
  118|      4|    let rows = stmt
  119|      5|        .query_map(params![memory_id], |r| {
                       ^4        ^4
  120|       |            Ok(Chunk {
  121|      5|                memory_id: r.get(0)?,
                                                 ^0
  122|      5|                chunk_idx: r.get(1)?,
                                                 ^0
  123|      5|                chunk_text: r.get(2)?,
                                                  ^0
  124|      5|                start_offset: r.get(3)?,
                                                    ^0
  125|      5|                end_offset: r.get(4)?,
                                                  ^0
  126|      5|                token_count: r.get(5)?,
                                                   ^0
  127|       |            })
  128|      5|        })?
                        ^0
  129|      4|        .collect::<Result<Vec<_>, _>>()?;
                                                     ^0
  130|      4|    Ok(rows)
  131|      4|}
  132|       |
  133|       |#[cfg(test)]
  134|       |mod tests {
  135|       |    use super::*;
  136|       |    use crate::constants::EMBEDDING_DIM;
  137|       |    use crate::storage::connection::register_vec_extension;
  138|       |    use rusqlite::Connection;
  139|       |    use tempfile::TempDir;
  140|       |
  141|      9|    fn setup_db() -> (TempDir, Connection) {
  142|      9|        register_vec_extension();
  143|      9|        let tmp = TempDir::new().unwrap();
  144|      9|        let db_path = tmp.path().join("test.db");
  145|      9|        let mut conn = Connection::open(&db_path).unwrap();
  146|      9|        crate::migrations::runner().run(&mut conn).unwrap();
  147|      9|        (tmp, conn)
  148|      9|    }
  149|       |
  150|      4|    fn insert_memory(conn: &Connection) -> i64 {
  151|      4|        conn.execute(
  152|      4|            "INSERT INTO memories (namespace, name, type, description, body, body_hash)
  153|      4|             VALUES ('global', 'test-mem', 'user', 'desc', 'body', 'hash1')",
  154|      4|            [],
  155|       |        )
  156|      4|        .unwrap();
  157|      4|        conn.last_insert_rowid()
  158|      4|    }
  159|       |
  160|       |    #[test]
  161|      1|    fn test_insert_chunks_empty_ok() {
  162|      1|        let (_tmp, conn) = setup_db();
  163|      1|        let resultado = insert_chunks(&conn, &[]);
  164|      1|        assert!(resultado.is_ok());
  165|      1|    }
  166|       |
  167|       |    #[test]
  168|      1|    fn test_insert_chunks_and_get_by_memory() {
  169|      1|        let (_tmp, conn) = setup_db();
  170|      1|        let memory_id = insert_memory(&conn);
  171|       |
  172|      1|        let chunks = vec![
  173|      1|            Chunk {
  174|      1|                memory_id,
  175|      1|                chunk_idx: 0,
  176|      1|                chunk_text: "primeiro chunk".to_string(),
  177|      1|                start_offset: 0,
  178|      1|                end_offset: 14,
  179|      1|                token_count: 3,
  180|      1|            },
  181|      1|            Chunk {
  182|      1|                memory_id,
  183|      1|                chunk_idx: 1,
  184|      1|                chunk_text: "segundo chunk".to_string(),
  185|      1|                start_offset: 15,
  186|      1|                end_offset: 28,
  187|      1|                token_count: 3,
  188|      1|            },
  189|       |        ];
  190|       |
  191|      1|        insert_chunks(&conn, &chunks).unwrap();
  192|       |
  193|      1|        let recuperados = get_chunks_by_memory(&conn, memory_id).unwrap();
  194|      1|        assert_eq!(recuperados.len(), 2);
  195|      1|        assert_eq!(recuperados[0].chunk_idx, 0);
  196|      1|        assert_eq!(recuperados[0].chunk_text, "primeiro chunk");
  197|      1|        assert_eq!(recuperados[0].start_offset, 0);
  198|      1|        assert_eq!(recuperados[0].end_offset, 14);
  199|      1|        assert_eq!(recuperados[0].token_count, 3);
  200|      1|        assert_eq!(recuperados[1].chunk_idx, 1);
  201|      1|        assert_eq!(recuperados[1].chunk_text, "segundo chunk");
  202|      1|    }
  203|       |
  204|       |    #[test]
  205|      1|    fn test_get_chunks_missing_memory_returns_empty() {
  206|      1|        let (_tmp, conn) = setup_db();
  207|      1|        let resultado = get_chunks_by_memory(&conn, 9999).unwrap();
  208|      1|        assert!(resultado.is_empty());
  209|      1|    }
  210|       |
  211|       |    #[test]
  212|      1|    fn test_delete_chunks_removes_all() {
  213|      1|        let (_tmp, conn) = setup_db();
  214|      1|        let memory_id = insert_memory(&conn);
  215|       |
  216|      1|        let chunks = vec![
  217|      1|            Chunk {
  218|      1|                memory_id,
  219|      1|                chunk_idx: 0,
  220|      1|                chunk_text: "chunk a".to_string(),
  221|      1|                start_offset: 0,
  222|      1|                end_offset: 7,
  223|      1|                token_count: 2,
  224|      1|            },
  225|      1|            Chunk {
  226|      1|                memory_id,
  227|      1|                chunk_idx: 1,
  228|      1|                chunk_text: "chunk b".to_string(),
  229|      1|                start_offset: 8,
  230|      1|                end_offset: 15,
  231|      1|                token_count: 2,
  232|      1|            },
  233|       |        ];
  234|      1|        insert_chunks(&conn, &chunks).unwrap();
  235|       |
  236|      1|        delete_chunks(&conn, memory_id).unwrap();
  237|       |
  238|      1|        let recuperados = get_chunks_by_memory(&conn, memory_id).unwrap();
  239|      1|        assert!(recuperados.is_empty());
  240|      1|    }
  241|       |
  242|       |    #[test]
  243|      1|    fn test_delete_chunks_memory_without_chunks_ok() {
  244|      1|        let (_tmp, conn) = setup_db();
  245|      1|        let resultado = delete_chunks(&conn, 9999);
  246|      1|        assert!(resultado.is_ok());
  247|      1|    }
  248|       |
  249|       |    #[test]
  250|      1|    fn test_get_chunks_ordered_by_chunk_idx() {
  251|      1|        let (_tmp, conn) = setup_db();
  252|      1|        let memory_id = insert_memory(&conn);
  253|       |
  254|      1|        let chunks = vec![
  255|      1|            Chunk {
  256|      1|                memory_id,
  257|      1|                chunk_idx: 2,
  258|      1|                chunk_text: "terceiro".to_string(),
  259|      1|                start_offset: 20,
  260|      1|                end_offset: 28,
  261|      1|                token_count: 1,
  262|      1|            },
  263|      1|            Chunk {
  264|      1|                memory_id,
  265|      1|                chunk_idx: 0,
  266|      1|                chunk_text: "primeiro".to_string(),
  267|      1|                start_offset: 0,
  268|      1|                end_offset: 8,
  269|      1|                token_count: 1,
  270|      1|            },
  271|      1|            Chunk {
  272|      1|                memory_id,
  273|      1|                chunk_idx: 1,
  274|      1|                chunk_text: "segundo".to_string(),
  275|      1|                start_offset: 9,
  276|      1|                end_offset: 16,
  277|      1|                token_count: 1,
  278|      1|            },
  279|       |        ];
  280|      1|        insert_chunks(&conn, &chunks).unwrap();
  281|       |
  282|      1|        let recuperados = get_chunks_by_memory(&conn, memory_id).unwrap();
  283|      1|        assert_eq!(recuperados.len(), 3);
  284|      1|        assert_eq!(recuperados[0].chunk_idx, 0);
  285|      1|        assert_eq!(recuperados[1].chunk_idx, 1);
  286|      1|        assert_eq!(recuperados[2].chunk_idx, 2);
  287|      1|    }
  288|       |
  289|       |    #[test]
  290|      1|    fn test_upsert_chunk_vec_and_knn_search() {
  291|      1|        let (_tmp, conn) = setup_db();
  292|      1|        let memory_id = insert_memory(&conn);
  293|       |
  294|      1|        let chunk = Chunk {
  295|      1|            memory_id,
  296|      1|            chunk_idx: 0,
  297|      1|            chunk_text: "embedding test".to_string(),
  298|      1|            start_offset: 0,
  299|      1|            end_offset: 14,
  300|      1|            token_count: 2,
  301|      1|        };
  302|      1|        insert_chunks(&conn, &[chunk]).unwrap();
  303|       |
  304|      1|        let mut embedding = vec![0.0f32; EMBEDDING_DIM];
  305|      1|        embedding[0] = 1.0;
  306|       |
  307|      1|        let chunk_id: i64 = conn
  308|      1|            .query_row(
  309|      1|                "SELECT id FROM memory_chunks WHERE memory_id = ?1 AND chunk_idx = 0",
  310|      1|                params![memory_id],
  311|      1|                |r| r.get(0),
  312|       |            )
  313|      1|            .unwrap();
  314|       |
  315|      1|        upsert_chunk_vec(&conn, chunk_id, memory_id, 0, &embedding).unwrap();
  316|       |
  317|      1|        let resultados = knn_search_chunks(&conn, &embedding, 1).unwrap();
  318|      1|        assert_eq!(resultados.len(), 1);
  319|      1|        assert_eq!(resultados[0].0, memory_id);
  320|      1|        assert_eq!(resultados[0].1, 0);
  321|      1|    }
  322|       |
  323|       |    #[test]
  324|      1|    fn test_knn_search_chunks_without_data_returns_empty() {
  325|      1|        let (_tmp, conn) = setup_db();
  326|      1|        let embedding = vec![0.0f32; EMBEDDING_DIM];
  327|      1|        let resultado = knn_search_chunks(&conn, &embedding, 5).unwrap();
  328|      1|        assert!(resultado.is_empty());
  329|      1|    }
  330|       |
  331|       |    #[test]
  332|      1|    fn test_insert_chunks_invalid_fk_fails() {
  333|      1|        let (_tmp, conn) = setup_db();
  334|      1|        let chunk = Chunk {
  335|      1|            memory_id: 99999,
  336|      1|            chunk_idx: 0,
  337|      1|            chunk_text: "sem pai".to_string(),
  338|      1|            start_offset: 0,
  339|      1|            end_offset: 7,
  340|      1|            token_count: 1,
  341|      1|        };
  342|      1|        let resultado = insert_chunks(&conn, &[chunk]);
  343|      1|        assert!(resultado.is_err());
  344|      1|    }
  345|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/storage/connection.rs:
    1|       |//! SQLite connection setup with PRAGMAs and 0600 permissions.
    2|       |//!
    3|       |//! Opens (or creates) the database file, loads the `sqlite-vec` extension,
    4|       |//! applies WAL/journal PRAGMAs, and enforces 0600 file permissions on Unix.
    5|       |
    6|       |use crate::errors::AppError;
    7|       |use crate::paths::AppPaths;
    8|       |use crate::pragmas::{apply_connection_pragmas, apply_init_pragmas, ensure_wal_mode};
    9|       |use rusqlite::Connection;
   10|       |use sqlite_vec::sqlite3_vec_init;
   11|       |use std::path::Path;
   12|       |use std::sync::OnceLock;
   13|       |
   14|       |static VEC_EXTENSION_REGISTERED: OnceLock<()> = OnceLock::new();
   15|       |
   16|       |/// Register sqlite-vec GLOBALLY before any connection is opened.
   17|       |///
   18|       |/// Idempotent: subsequent calls are no-ops thanks to `OnceLock`. Safe to invoke from
   19|       |/// both the binary entry point (`main.rs`) and library helpers like `ensure_db_ready`
   20|       |/// so unit tests that exercise CRUD handlers do not need to pre-register the extension.
   21|     86|pub fn register_vec_extension() {
   22|     86|    VEC_EXTENSION_REGISTERED.get_or_init(|| {
                                                          ^1
   23|       |        // SAFETY: sqlite3_auto_extension is a C FFI function that registers a callback
   24|       |        // invoked when SQLite opens any new connection. Soundness assumptions:
   25|       |        // 1. `sqlite3_vec_init` has the exact ABI signature `extern "C" fn(...) -> i32`
   26|       |        //    expected by SQLite's auto-extension API (verified by sqlite-vec crate).
   27|       |        // 2. The transmute from `*const ()` to the expected fn pointer is valid because
   28|       |        //    both have identical layout on supported platforms (Linux, macOS, Windows).
   29|       |        // 3. `OnceLock::get_or_init` guarantees this closure runs at most once across
   30|       |        //    all threads; the auto-extension list is mutated exactly one time.
   31|       |        #[allow(clippy::missing_transmute_annotations)]
   32|      1|        unsafe {
   33|      1|            rusqlite::ffi::sqlite3_auto_extension(Some(std::mem::transmute(
   34|      1|                sqlite3_vec_init as *const (),
   35|      1|            )));
   36|      1|        }
   37|      1|    });
   38|     86|}
   39|       |
   40|      2|pub fn open_rw(path: &Path) -> Result<Connection, AppError> {
   41|      2|    let conn = Connection::open(path)?;
                                                   ^0
   42|      2|    apply_connection_pragmas(&conn)?;
                                                 ^0
   43|      2|    apply_secure_permissions(path);
   44|      2|    Ok(conn)
   45|      2|}
   46|       |
   47|      0|pub fn ensure_schema(conn: &mut Connection) -> Result<(), AppError> {
   48|      0|    crate::migrations::runner()
   49|      0|        .run(conn)
   50|      0|        .map_err(|e| AppError::Internal(anyhow::anyhow!("migration failed: {e}")))?;
   51|      0|    conn.execute_batch(&format!(
   52|      0|        "PRAGMA user_version = {};",
   53|      0|        crate::constants::SCHEMA_USER_VERSION
   54|      0|    ))?;
   55|      0|    Ok(())
   56|      0|}
   57|       |
   58|       |/// Ensures the database file exists and the schema is at the current version.
   59|       |///
   60|       |/// Behavior:
   61|       |/// - DB does not exist: creates the file, applies init PRAGMAs, runs all migrations,
   62|       |///   sets `PRAGMA user_version`, and populates `schema_meta` with default values.
   63|       |///   Emits `tracing::info!` on creation.
   64|       |/// - DB exists with `user_version` below `SCHEMA_USER_VERSION`: runs the remaining
   65|       |///   migrations and updates `user_version`. Emits `tracing::warn!` on auto-migration.
   66|       |/// - DB exists with `user_version` equal to `SCHEMA_USER_VERSION`: no-op.
   67|       |///
   68|       |/// This helper unifies the auto-init contract across CRUD handlers so users can run
   69|       |/// any subcommand on a fresh directory without invoking `init` first. Idempotent
   70|       |/// and safe to call before every handler that needs a ready database.
   71|      1|pub fn ensure_db_ready(paths: &AppPaths) -> Result<(), AppError> {
   72|      1|    register_vec_extension();
   73|      1|    paths.ensure_dirs()?;
                                     ^0
   74|       |
   75|      1|    let db_existed = paths.db.exists();
   76|       |
   77|      1|    if !db_existed {
   78|      1|        tracing::info!(target: "storage",
   79|      0|            path = %paths.db.display(),
   80|       |            schema_version = crate::constants::CURRENT_SCHEMA_VERSION,
   81|      0|            "creating database (auto-init)"
   82|       |        );
   83|      0|    }
   84|       |
   85|      1|    let mut conn = open_rw(&paths.db)?;
                                                   ^0
   86|       |
   87|      1|    if !db_existed {
   88|      1|        apply_init_pragmas(&conn)?;
                                               ^0
   89|      0|    }
   90|       |
   91|      1|    let current_user_version: i64 = conn
   92|      1|        .query_row("PRAGMA user_version", [], |row| row.get(0))
   93|      1|        .unwrap_or(0);
   94|      1|    let target_user_version = crate::constants::SCHEMA_USER_VERSION;
   95|       |
   96|      1|    if current_user_version < target_user_version {
   97|      1|        if db_existed {
   98|      0|            tracing::warn!(target: "storage",
   99|       |                from = current_user_version,
  100|       |                to = target_user_version,
  101|      0|                path = %paths.db.display(),
  102|      0|                "auto-migrating database schema"
  103|       |            );
  104|      1|        }
  105|      1|        crate::migrations::runner()
  106|      1|            .run(&mut conn)
  107|      1|            .map_err(|e| AppError::Internal(anyhow::anyhow!("auto-migration failed: {e}")))?;
                                                          ^0              ^0                             ^0
  108|      1|        conn.execute_batch(&format!("PRAGMA user_version = {target_user_version};"))?;
                                                                                                  ^0
  109|       |
  110|      1|        if !db_existed {
  111|      1|            insert_default_schema_meta(&conn)?;
                                                           ^0
  112|      0|        }
  113|       |
  114|       |        // Defensive re-assertion: refinery's migration runner may open internal
  115|       |        // handles that revert journal_mode to delete on some platforms. Re-apply
  116|       |        // WAL after migrations to guarantee the documented contract holds for
  117|       |        // every command that goes through the auto-init path.
  118|      1|        ensure_wal_mode(&conn)?;
                                            ^0
  119|      0|    }
  120|       |
  121|      1|    Ok(())
  122|      1|}
  123|       |
  124|      1|fn insert_default_schema_meta(conn: &Connection) -> Result<(), AppError> {
  125|      1|    conn.execute(
  126|      1|        "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('schema_version', ?1)",
  127|      1|        rusqlite::params![crate::constants::CURRENT_SCHEMA_VERSION.to_string()],
  128|      0|    )?;
  129|      1|    conn.execute(
  130|      1|        "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('model', 'multilingual-e5-small')",
  131|      1|        [],
  132|      0|    )?;
  133|      1|    conn.execute(
  134|      1|        "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('dim', '384')",
  135|      1|        [],
  136|      0|    )?;
  137|      1|    conn.execute(
  138|      1|        "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('created_at', CAST(unixepoch() AS TEXT))",
  139|      1|        [],
  140|      0|    )?;
  141|      1|    conn.execute(
  142|      1|        "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('sqlite-graphrag_version', ?1)",
  143|      1|        rusqlite::params![crate::constants::SQLITE_GRAPHRAG_VERSION],
  144|      0|    )?;
  145|      1|    Ok(())
  146|      1|}
  147|       |
  148|       |/// Applies 600 permissions (owner read/write only) to the SQLite file and its WAL/SHM
  149|       |/// companion files on Unix to prevent leaking private memories in shared directories
  150|       |/// (e.g. multi-user /tmp, Dropbox, NFS). On Windows, NTFS DACL default is private-to-user
  151|       |/// so explicit permission setting is unnecessary; a debug log records the skip. Failures
  152|       |/// are silent to avoid blocking the operation when the process does not own the file
  153|       |/// (e.g. read-only mount).
  154|       |#[allow(unused_variables)]
  155|      2|fn apply_secure_permissions(path: &Path) {
  156|       |    #[cfg(unix)]
  157|       |    {
  158|       |        use std::os::unix::fs::PermissionsExt;
  159|      2|        let candidates = [
  160|      2|            path.to_path_buf(),
  161|      2|            path.with_extension(format!(
  162|      2|                "{}-wal",
  163|      2|                path.extension()
  164|      2|                    .and_then(|e| e.to_str())
  165|      2|                    .unwrap_or("sqlite")
  166|       |            )),
  167|      2|            path.with_extension(format!(
  168|      2|                "{}-shm",
  169|      2|                path.extension()
  170|      2|                    .and_then(|e| e.to_str())
  171|      2|                    .unwrap_or("sqlite")
  172|       |            )),
  173|       |        ];
  174|      6|        for file in candidates.iter() {
                                  ^2         ^2
  175|      6|            if file.exists() {
  176|      4|                if let Ok(meta) = std::fs::metadata(file) {
  177|      4|                    let mut perms = meta.permissions();
  178|      4|                    perms.set_mode(0o600);
  179|      4|                    let _ = std::fs::set_permissions(file, perms);
  180|      4|                }
                              ^0
  181|      2|            }
  182|       |        }
  183|       |    }
  184|       |    #[cfg(windows)]
  185|       |    {
  186|       |        tracing::debug!(target: "storage",
  187|       |            path = %path.display(),
  188|       |            "skipping Unix mode 0o600 on Windows; NTFS DACL default is private-to-user"
  189|       |        );
  190|       |    }
  191|      2|}
  192|       |
  193|      0|pub fn open_ro(path: &Path) -> Result<Connection, AppError> {
  194|      0|    let conn = Connection::open_with_flags(
  195|      0|        path,
  196|      0|        rusqlite::OpenFlags::SQLITE_OPEN_READ_ONLY | rusqlite::OpenFlags::SQLITE_OPEN_URI,
  197|      0|    )?;
  198|      0|    conn.execute_batch("PRAGMA foreign_keys = ON;")?;
  199|      0|    Ok(conn)
  200|      0|}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/storage/entities.rs:
    1|       |//! Persistence layer for entities, relationships and their junction tables.
    2|       |//!
    3|       |//! The entity graph mirrors the conceptual content of memories: `entities`
    4|       |//! holds nodes, `relationships` holds typed edges and `memory_entities` and
    5|       |//! `memory_relationships` connect each memory to the graph slice it emitted.
    6|       |
    7|       |use crate::embedder::f32_to_bytes;
    8|       |use crate::entity_type::EntityType;
    9|       |use crate::errors::AppError;
   10|       |use crate::parsers::normalize_entity_name;
   11|       |use crate::storage::utils::with_busy_retry;
   12|       |use rusqlite::{params, Connection};
   13|       |use serde::{Deserialize, Serialize};
   14|       |
   15|       |/// Input payload used to upsert a single entity.
   16|       |///
   17|       |/// `name` is normalized to kebab-case by the caller. `description` is
   18|       |/// optional and preserved across upserts when the new value is `None`.
   19|       |#[derive(Debug, Serialize, Deserialize, Clone)]
   20|       |#[serde(deny_unknown_fields)]
   21|       |pub struct NewEntity {
   22|       |    pub name: String,
   23|       |    #[serde(alias = "type")]
   24|       |    pub entity_type: EntityType,
   25|       |    pub description: Option<String>,
   26|       |}
   27|       |
   28|       |/// Input payload used to upsert a typed relationship between entities.
   29|       |///
   30|       |/// `strength` must lie within `[0.0, 1.0]` and is mapped to the `weight`
   31|       |/// column of the `relationships` table.
   32|       |#[derive(Debug, Serialize, Deserialize, Clone)]
   33|       |#[serde(deny_unknown_fields)]
   34|       |pub struct NewRelationship {
   35|       |    #[serde(alias = "from")]
   36|       |    pub source: String,
   37|       |    #[serde(alias = "to")]
   38|       |    pub target: String,
   39|       |    #[serde(alias = "type")]
   40|       |    pub relation: String,
   41|       |    #[serde(alias = "weight")]
   42|       |    pub strength: f64,
   43|       |    pub description: Option<String>,
   44|       |}
   45|       |
   46|       |/// Validates entity name against quality rules.
   47|       |///
   48|       |/// Rejects names with newlines, names shorter than 2 characters, and
   49|       |/// ALL_CAPS abbreviations of 4 characters or fewer (common NER noise).
   50|       |///
   51|       |/// # Errors
   52|       |///
   53|       |/// Returns `Err(AppError::Validation)` when the name violates any rule.
   54|     62|pub fn validate_entity_name(name: &str) -> Result<(), AppError> {
   55|     62|    if name.len() < 2 {
   56|      2|        return Err(AppError::Validation(format!(
   57|      2|            "entity name '{name}' must be at least 2 characters"
   58|      2|        )));
   59|     60|    }
   60|     60|    if name.contains('\n') || name.contains('\r') {
                                            ^59  ^59
   61|      2|        return Err(AppError::Validation(
   62|      2|            "entity name must not contain newline characters".to_string(),
   63|      2|        ));
   64|     58|    }
   65|     58|    if name.len() <= 4
   66|     19|        && name
   67|     19|            .chars()
   68|     29|            .all(|c| c.is_ascii_uppercase() || c == '_' || c == '-')
                           ^19                               ^15         ^15
   69|       |    {
   70|      4|        return Err(AppError::Validation(format!(
   71|      4|            "entity name '{name}' rejected: short ALL_CAPS names are typically NER noise"
   72|      4|        )));
   73|     54|    }
   74|     54|    Ok(())
   75|     62|}
   76|       |
   77|       |/// Upserts an entity and returns its primary key.
   78|       |///
   79|       |/// Uses `ON CONFLICT(namespace, name)` to keep one row per entity within a
   80|       |/// namespace, refreshing `type` and `description` opportunistically.
   81|       |///
   82|       |/// # Errors
   83|       |///
   84|       |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
   85|     48|pub fn upsert_entity(conn: &Connection, namespace: &str, e: &NewEntity) -> Result<i64, AppError> {
   86|       |    // Step 1: validate the original name — catches ALL_CAPS short noise (NER artefacts),
   87|       |    // newlines, and names shorter than 2 characters before any transformation.
   88|     48|    validate_entity_name(&e.name)?;
                                               ^0
   89|       |    // Step 2: normalize to kebab-case ASCII (NFKD, lowercase, spaces/underscores → hyphens).
   90|     48|    let normalized_name = normalize_entity_name(&e.name);
   91|       |    // Step 3: guard post-normalization length — a valid original could collapse to < 2 chars
   92|       |    // (e.g. a single accented character that strips entirely).
   93|     48|    if normalized_name.chars().count() < 2 {
   94|      0|        return Err(AppError::Validation(format!(
   95|      0|            "entity name '{}' normalizes to '{}' which is too short (minimum 2 characters)",
   96|      0|            e.name, normalized_name
   97|      0|        )));
   98|     48|    }
   99|     48|    conn.execute(
  100|     48|        "INSERT INTO entities (namespace, name, type, description)
  101|     48|         VALUES (?1, ?2, ?3, ?4)
  102|     48|         ON CONFLICT(namespace, name) DO UPDATE SET
  103|     48|           type        = excluded.type,
  104|     48|           description = COALESCE(excluded.description, entities.description),
  105|     48|           updated_at  = unixepoch()",
  106|     48|        params![namespace, normalized_name, e.entity_type, e.description],
  107|      0|    )?;
  108|     48|    let id: i64 = conn.query_row(
  109|     48|        "SELECT id FROM entities WHERE namespace = ?1 AND name = ?2",
  110|     48|        params![namespace, normalized_name],
  111|     48|        |r| r.get(0),
  112|      0|    )?;
  113|     48|    Ok(id)
  114|     48|}
  115|       |
  116|       |/// Replaces the vector row for an entity in `vec_entities`.
  117|       |///
  118|       |/// vec0 virtual tables do not honour `INSERT OR REPLACE` when the primary key
  119|       |/// already exists — they raise a UNIQUE constraint error instead of silently
  120|       |/// replacing the row. The workaround is an explicit DELETE before INSERT so
  121|       |/// that the insert never conflicts. `embedding` must have length
  122|       |/// [`crate::constants::EMBEDDING_DIM`].
  123|       |///
  124|       |/// # Errors
  125|       |///
  126|       |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
  127|      7|pub fn upsert_entity_vec(
  128|      7|    conn: &Connection,
  129|      7|    entity_id: i64,
  130|      7|    namespace: &str,
  131|      7|    entity_type: EntityType,
  132|      7|    embedding: &[f32],
  133|      7|    name: &str,
  134|      7|) -> Result<(), AppError> {
  135|       |    // Both statements wrapped in with_busy_retry: WAL concurrency can cause
  136|       |    // SQLITE_BUSY on vec0 virtual table writes when multiple CLI instances run.
  137|      7|    let embedding_bytes = f32_to_bytes(embedding);
  138|      7|    with_busy_retry(|| {
  139|      7|        conn.execute(
  140|      7|            "DELETE FROM vec_entities WHERE entity_id = ?1",
  141|      7|            params![entity_id],
  142|      0|        )?;
  143|      7|        conn.execute(
  144|      7|            "INSERT INTO vec_entities(entity_id, namespace, type, embedding, name)
  145|      7|             VALUES (?1, ?2, ?3, ?4, ?5)",
  146|      7|            params![entity_id, namespace, entity_type, &embedding_bytes, name],
  147|      0|        )?;
  148|      7|        Ok(())
  149|      7|    })
  150|      7|}
  151|       |
  152|       |/// Upserts a typed relationship between two entity ids.
  153|       |///
  154|       |/// Conflicts on `(source_id, target_id, relation)` refresh `weight` and
  155|       |/// preserve a non-null `description`. Returns the `rowid` of the stored row.
  156|       |///
  157|       |/// # Errors
  158|       |///
  159|       |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
  160|      9|pub fn upsert_relationship(
  161|      9|    conn: &Connection,
  162|      9|    namespace: &str,
  163|      9|    source_id: i64,
  164|      9|    target_id: i64,
  165|      9|    rel: &NewRelationship,
  166|      9|) -> Result<i64, AppError> {
  167|      9|    conn.execute(
  168|      9|        "INSERT INTO relationships (namespace, source_id, target_id, relation, weight, description)
  169|      9|         VALUES (?1, ?2, ?3, ?4, ?5, ?6)
  170|      9|         ON CONFLICT(source_id, target_id, relation) DO UPDATE SET
  171|      9|           weight = excluded.weight,
  172|      9|           description = COALESCE(excluded.description, relationships.description)",
  173|      9|        params![
  174|       |            namespace,
  175|       |            source_id,
  176|       |            target_id,
  177|       |            rel.relation,
  178|       |            rel.strength,
  179|       |            rel.description
  180|       |        ],
  181|      0|    )?;
  182|      9|    let id: i64 = conn.query_row(
  183|      9|        "SELECT id FROM relationships WHERE source_id=?1 AND target_id=?2 AND relation=?3",
  184|      9|        params![source_id, target_id, rel.relation],
  185|      9|        |r| r.get(0),
  186|      0|    )?;
  187|      9|    Ok(id)
  188|      9|}
  189|       |
  190|       |/// Links a memory to an entity in the `memory_entities` join table.
  191|       |///
  192|       |/// # Errors
  193|       |///
  194|       |/// Returns [`AppError::Database`] when the underlying SQLite operation fails.
  195|      3|pub fn link_memory_entity(
  196|      3|    conn: &Connection,
  197|      3|    memory_id: i64,
  198|      3|    entity_id: i64,
  199|      3|) -> Result<(), AppError> {
  200|      3|    conn.execute(
  201|      3|        "INSERT OR IGNORE INTO memory_entities (memory_id, entity_id) VALUES (?1, ?2)",
  202|      3|        params![memory_id, entity_id],
  203|      0|    )?;
  204|      3|    Ok(())
  205|      3|}
  206|       |
  207|       |/// Links a memory to a relationship in the `memory_relationships` join table.
  208|       |///
  209|       |/// # Errors
  210|       |///
  211|       |/// Returns [`AppError::Database`] when the underlying SQLite operation fails.
  212|      2|pub fn link_memory_relationship(
  213|      2|    conn: &Connection,
  214|      2|    memory_id: i64,
  215|      2|    rel_id: i64,
  216|      2|) -> Result<(), AppError> {
  217|      2|    conn.execute(
  218|      2|        "INSERT OR IGNORE INTO memory_relationships (memory_id, relationship_id) VALUES (?1, ?2)",
  219|      2|        params![memory_id, rel_id],
  220|      0|    )?;
  221|      2|    Ok(())
  222|      2|}
  223|       |
  224|       |/// Increments the `degree` counter of an entity by one.
  225|       |///
  226|       |/// # Errors
  227|       |///
  228|       |/// Returns [`AppError::Database`] when the underlying SQLite operation fails.
  229|      2|pub fn increment_degree(conn: &Connection, entity_id: i64) -> Result<(), AppError> {
  230|      2|    conn.execute(
  231|      2|        "UPDATE entities SET degree = degree + 1 WHERE id = ?1",
  232|      2|        params![entity_id],
  233|      0|    )?;
  234|      2|    Ok(())
  235|      2|}
  236|       |
  237|       |/// Looks up the entity by name and namespace. Returns the id when it exists.
  238|       |///
  239|       |/// # Errors
  240|       |///
  241|       |/// Returns [`AppError::Database`] when the underlying SQLite operation fails.
  242|      6|pub fn find_entity_id(
  243|      6|    conn: &Connection,
  244|      6|    namespace: &str,
  245|      6|    name: &str,
  246|      6|) -> Result<Option<i64>, AppError> {
  247|       |    // Normalize the lookup name so it matches the normalized names written by
  248|       |    // `upsert_entity`. Without this, an entity written through normalization
  249|       |    // (e.g. "Foo Bar" -> "foo-bar") would be unreachable by its original
  250|       |    // spelling, breaking delete-entity, reclassify, merge-entities, rename and
  251|       |    // memory-entities lookups.
  252|      6|    let name = normalize_entity_name(name);
  253|      6|    let mut stmt =
  254|      6|        conn.prepare_cached("SELECT id FROM entities WHERE namespace = ?1 AND name = ?2")?;
                                                                                                       ^0
  255|      6|    match stmt.query_row(params![namespace, &name], |r| r.get::<_, i64>(0)) {
                                                                      ^2^2
  256|      2|        Ok(id) => Ok(Some(id)),
  257|      4|        Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
  258|      0|        Err(e) => Err(AppError::Database(e)),
  259|       |    }
  260|      6|}
  261|       |
  262|       |/// Structure representing an existing relation.
  263|       |#[derive(Debug, Serialize)]
  264|       |pub struct RelationshipRow {
  265|       |    pub id: i64,
  266|       |    pub namespace: String,
  267|       |    pub source_id: i64,
  268|       |    pub target_id: i64,
  269|       |    pub relation: String,
  270|       |    pub weight: f64,
  271|       |    pub description: Option<String>,
  272|       |}
  273|       |
  274|       |/// Looks up a specific relation by (source_id, target_id, relation).
  275|       |///
  276|       |/// # Errors
  277|       |///
  278|       |/// Returns [`AppError::Database`] when the underlying SQLite operation fails.
  279|      6|pub fn find_relationship(
  280|      6|    conn: &Connection,
  281|      6|    source_id: i64,
  282|      6|    target_id: i64,
  283|      6|    relation: &str,
  284|      6|) -> Result<Option<RelationshipRow>, AppError> {
  285|      6|    let mut stmt = conn.prepare_cached(
  286|      6|        "SELECT id, namespace, source_id, target_id, relation, weight, description
  287|      6|         FROM relationships
  288|      6|         WHERE source_id = ?1 AND target_id = ?2 AND relation = ?3",
  289|      0|    )?;
  290|      6|    match stmt.query_row(params![source_id, target_id, relation], |r| {
                                                                                    ^2
  291|       |        Ok(RelationshipRow {
  292|      2|            id: r.get(0)?,
                                      ^0
  293|      2|            namespace: r.get(1)?,
                                             ^0
  294|      2|            source_id: r.get(2)?,
                                             ^0
  295|      2|            target_id: r.get(3)?,
                                             ^0
  296|      2|            relation: r.get(4)?,
                                            ^0
  297|      2|            weight: r.get(5)?,
                                          ^0
  298|      2|            description: r.get(6)?,
                                               ^0
  299|       |        })
  300|      2|    }) {
  301|      2|        Ok(row) => Ok(Some(row)),
  302|      4|        Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
  303|      0|        Err(e) => Err(AppError::Database(e)),
  304|       |    }
  305|      6|}
  306|       |
  307|       |/// Creates a relation if it does not exist (returns action="created")
  308|       |/// or returns the existing relation (action="already_exists") with updated weight.
  309|       |///
  310|       |/// # Errors
  311|       |///
  312|       |/// - [`AppError::Database`] — SQLite query or constraint failure.
  313|       |/// - [`AppError::Validation`] — self-link attempt (source equals target).
  314|      3|pub fn create_or_fetch_relationship(
  315|      3|    conn: &Connection,
  316|      3|    namespace: &str,
  317|      3|    source_id: i64,
  318|      3|    target_id: i64,
  319|      3|    relation: &str,
  320|      3|    weight: f64,
  321|      3|    description: Option<&str>,
  322|      3|) -> Result<(i64, bool), AppError> {
  323|       |    // Check if it exists first; update weight if different.
  324|      3|    let existing = find_relationship(conn, source_id, target_id, relation)?;
                                                                                        ^0
  325|      3|    if let Some(row) = existing {
                              ^1
  326|      1|        if (row.weight - weight).abs() > f64::EPSILON {
  327|      0|            conn.execute(
  328|      0|                "UPDATE relationships SET weight = ?1 WHERE id = ?2",
  329|      0|                params![weight, row.id],
  330|      0|            )?;
  331|      1|        }
  332|      1|        return Ok((row.id, false));
  333|      2|    }
  334|      2|    conn.execute(
  335|      2|        "INSERT INTO relationships (namespace, source_id, target_id, relation, weight, description)
  336|      2|         VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
  337|      2|        params![
  338|       |            namespace,
  339|       |            source_id,
  340|       |            target_id,
  341|       |            relation,
  342|       |            weight,
  343|       |            description
  344|       |        ],
  345|      0|    )?;
  346|      2|    let id: i64 = conn.query_row(
  347|      2|        "SELECT id FROM relationships WHERE source_id = ?1 AND target_id = ?2 AND relation = ?3",
  348|      2|        params![source_id, target_id, relation],
  349|      2|        |r| r.get(0),
  350|      0|    )?;
  351|      2|    Ok((id, true))
  352|      3|}
  353|       |
  354|       |/// Removes a relation by id and cleans up memory_relationships.
  355|       |///
  356|       |/// # Errors
  357|       |///
  358|       |/// Returns [`AppError::Database`] when the underlying SQLite operation fails.
  359|      1|pub fn delete_relationship_by_id(conn: &Connection, relationship_id: i64) -> Result<(), AppError> {
  360|      1|    conn.execute(
  361|      1|        "DELETE FROM memory_relationships WHERE relationship_id = ?1",
  362|      1|        params![relationship_id],
  363|      0|    )?;
  364|      1|    conn.execute(
  365|      1|        "DELETE FROM relationships WHERE id = ?1",
  366|      1|        params![relationship_id],
  367|      0|    )?;
  368|      1|    Ok(())
  369|      1|}
  370|       |
  371|       |/// Recalculates the `degree` field of an entity.
  372|       |///
  373|       |/// # Errors
  374|       |///
  375|       |/// Returns [`AppError::Database`] when the underlying SQLite operation fails.
  376|      1|pub fn recalculate_degree(conn: &Connection, entity_id: i64) -> Result<(), AppError> {
  377|      1|    conn.execute(
  378|      1|        "UPDATE entities
  379|      1|         SET degree = (SELECT COUNT(*) FROM relationships
  380|      1|                       WHERE source_id = entities.id OR target_id = entities.id)
  381|      1|         WHERE id = ?1",
  382|      1|        params![entity_id],
  383|      0|    )?;
  384|      1|    Ok(())
  385|      1|}
  386|       |
  387|       |/// Entity row with enough data for graph export/query.
  388|       |#[derive(Debug, Serialize, Clone)]
  389|       |pub struct EntityNode {
  390|       |    pub id: i64,
  391|       |    pub name: String,
  392|       |    pub namespace: String,
  393|       |    pub kind: String,
  394|       |}
  395|       |
  396|       |/// Lists entities, filtering by namespace if provided.
  397|       |///
  398|       |/// # Errors
  399|       |///
  400|       |/// Returns [`AppError::Database`] when the underlying SQLite operation fails.
  401|      2|pub fn list_entities(
  402|      2|    conn: &Connection,
  403|      2|    namespace: Option<&str>,
  404|      2|) -> Result<Vec<EntityNode>, AppError> {
  405|      2|    if let Some(ns) = namespace {
                              ^1
  406|      1|        let mut stmt = conn.prepare_cached(
  407|      1|            "SELECT id, name, namespace, type FROM entities WHERE namespace = ?1 ORDER BY id",
  408|      0|        )?;
  409|      1|        let rows = stmt
  410|      2|            .query_map(params![ns], |r| {
                           ^1        ^1
  411|       |                Ok(EntityNode {
  412|      2|                    id: r.get(0)?,
                                              ^0
  413|      2|                    name: r.get(1)?,
                                                ^0
  414|      2|                    namespace: r.get(2)?,
                                                     ^0
  415|      2|                    kind: r.get(3)?,
                                                ^0
  416|       |                })
  417|      2|            })?
                            ^0
  418|      1|            .collect::<Result<Vec<_>, _>>()?;
                                                         ^0
  419|      1|        Ok(rows)
  420|       |    } else {
  421|      1|        let mut stmt = conn.prepare_cached(
  422|      1|            "SELECT id, name, namespace, type FROM entities ORDER BY namespace, id",
  423|      0|        )?;
  424|      1|        let rows = stmt
  425|      2|            .query_map([], |r| {
                           ^1        ^1
  426|       |                Ok(EntityNode {
  427|      2|                    id: r.get(0)?,
                                              ^0
  428|      2|                    name: r.get(1)?,
                                                ^0
  429|      2|                    namespace: r.get(2)?,
                                                     ^0
  430|      2|                    kind: r.get(3)?,
                                                ^0
  431|       |                })
  432|      2|            })?
                            ^0
  433|      1|            .collect::<Result<Vec<_>, _>>()?;
                                                         ^0
  434|      1|        Ok(rows)
  435|       |    }
  436|      2|}
  437|       |
  438|       |/// Lists relations filtered by namespace (of source/target entities).
  439|       |///
  440|       |/// # Errors
  441|       |///
  442|       |/// Returns [`AppError::Database`] when the underlying SQLite operation fails.
  443|      1|pub fn list_relationships_by_namespace(
  444|      1|    conn: &Connection,
  445|      1|    namespace: Option<&str>,
  446|      1|) -> Result<Vec<RelationshipRow>, AppError> {
  447|      1|    if let Some(ns) = namespace {
  448|      1|        let mut stmt = conn.prepare_cached(
  449|      1|            "SELECT r.id, r.namespace, r.source_id, r.target_id, r.relation, r.weight, r.description
  450|      1|             FROM relationships r
  451|      1|             JOIN entities se ON se.id = r.source_id AND se.namespace = ?1
  452|      1|             JOIN entities te ON te.id = r.target_id AND te.namespace = ?1
  453|      1|             ORDER BY r.id",
  454|      0|        )?;
  455|      1|        let rows = stmt
  456|      1|            .query_map(params![ns], |r| {
  457|       |                Ok(RelationshipRow {
  458|      1|                    id: r.get(0)?,
                                              ^0
  459|      1|                    namespace: r.get(1)?,
                                                     ^0
  460|      1|                    source_id: r.get(2)?,
                                                     ^0
  461|      1|                    target_id: r.get(3)?,
                                                     ^0
  462|      1|                    relation: r.get(4)?,
                                                    ^0
  463|      1|                    weight: r.get(5)?,
                                                  ^0
  464|      1|                    description: r.get(6)?,
                                                       ^0
  465|       |                })
  466|      1|            })?
                            ^0
  467|      1|            .collect::<Result<Vec<_>, _>>()?;
                                                         ^0
  468|      1|        Ok(rows)
  469|       |    } else {
  470|      0|        let mut stmt = conn.prepare_cached(
  471|      0|            "SELECT id, namespace, source_id, target_id, relation, weight, description
  472|      0|             FROM relationships ORDER BY id",
  473|      0|        )?;
  474|      0|        let rows = stmt
  475|      0|            .query_map([], |r| {
  476|       |                Ok(RelationshipRow {
  477|      0|                    id: r.get(0)?,
  478|      0|                    namespace: r.get(1)?,
  479|      0|                    source_id: r.get(2)?,
  480|      0|                    target_id: r.get(3)?,
  481|      0|                    relation: r.get(4)?,
  482|      0|                    weight: r.get(5)?,
  483|      0|                    description: r.get(6)?,
  484|       |                })
  485|      0|            })?
  486|      0|            .collect::<Result<Vec<_>, _>>()?;
  487|      0|        Ok(rows)
  488|       |    }
  489|      1|}
  490|       |
  491|       |/// Locates orphan entities: no link in memory_entities and no relations.
  492|       |///
  493|       |/// # Errors
  494|       |///
  495|       |/// Returns [`AppError::Database`] when the underlying SQLite operation fails.
  496|      3|pub fn find_orphan_entity_ids(
  497|      3|    conn: &Connection,
  498|      3|    namespace: Option<&str>,
  499|      3|) -> Result<Vec<i64>, AppError> {
  500|      3|    if let Some(ns) = namespace {
                              ^2
  501|      2|        let mut stmt = conn.prepare_cached(
  502|      2|            "SELECT e.id FROM entities e
  503|      2|             WHERE e.namespace = ?1
  504|      2|               AND NOT EXISTS (SELECT 1 FROM memory_entities me WHERE me.entity_id = e.id)
  505|      2|               AND NOT EXISTS (
  506|      2|                   SELECT 1 FROM relationships r
  507|      2|                   WHERE r.source_id = e.id OR r.target_id = e.id
  508|      2|               )",
  509|      0|        )?;
  510|      2|        let ids = stmt
  511|      2|            .query_map(params![ns], |r| r.get::<_, i64>(0))?
                                                      ^1^1               ^0
  512|      2|            .collect::<Result<Vec<_>, _>>()?;
                                                         ^0
  513|      2|        Ok(ids)
  514|       |    } else {
  515|      1|        let mut stmt = conn.prepare_cached(
  516|      1|            "SELECT e.id FROM entities e
  517|      1|             WHERE NOT EXISTS (SELECT 1 FROM memory_entities me WHERE me.entity_id = e.id)
  518|      1|               AND NOT EXISTS (
  519|      1|                   SELECT 1 FROM relationships r
  520|      1|                   WHERE r.source_id = e.id OR r.target_id = e.id
  521|      1|               )",
  522|      0|        )?;
  523|      1|        let ids = stmt
  524|      2|            .query_map([], |r| r.get::<_, i64>(0))?
                           ^1        ^1                         ^0
  525|      1|            .collect::<Result<Vec<_>, _>>()?;
                                                         ^0
  526|      1|        Ok(ids)
  527|       |    }
  528|      3|}
  529|       |
  530|       |/// Deletes entities and their associated vectors. Returns the number of entities removed.
  531|       |///
  532|       |/// # Errors
  533|       |///
  534|       |/// Returns [`AppError::Database`] when the underlying SQLite operation fails.
  535|      5|pub fn delete_entities_by_ids(conn: &Connection, entity_ids: &[i64]) -> Result<usize, AppError> {
  536|      5|    if entity_ids.is_empty() {
  537|      1|        return Ok(0);
  538|      4|    }
  539|      4|    let mut removed = 0usize;
  540|      9|    for id in entity_ids {
                      ^5
  541|       |        // vec0 lacks FK CASCADE — clean vec_entities explicitly.
  542|      5|        let _ = conn.execute("DELETE FROM vec_entities WHERE entity_id = ?1", params![id]);
  543|      5|        let affected = conn.execute("DELETE FROM entities WHERE id = ?1", params![id])?;
                                                                                                    ^0
  544|      5|        removed += affected;
  545|       |    }
  546|      4|    Ok(removed)
  547|      5|}
  548|       |
  549|       |/// Counts relationships matching the given relation type within a namespace.
  550|       |///
  551|       |/// Used by `prune-relations --dry-run` to preview the number of relationships
  552|       |/// that would be deleted without actually modifying the database.
  553|       |///
  554|       |/// # Errors
  555|       |///
  556|       |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
  557|      0|pub fn count_relationships_by_relation(
  558|      0|    conn: &Connection,
  559|      0|    namespace: &str,
  560|      0|    relation: &str,
  561|      0|) -> Result<usize, AppError> {
  562|      0|    let count: i64 = conn.query_row(
  563|      0|        "SELECT COUNT(*) FROM relationships WHERE namespace = ?1 AND relation = ?2",
  564|      0|        params![namespace, relation],
  565|      0|        |r| r.get(0),
  566|      0|    )?;
  567|      0|    Ok(count as usize)
  568|      0|}
  569|       |
  570|       |/// Returns unique entity names involved in relationships of the given type.
  571|       |///
  572|       |/// Queries both source and target sides of every matching relationship row,
  573|       |/// deduplicates via `DISTINCT`, and returns the names in alphabetical order.
  574|       |///
  575|       |/// # Errors
  576|       |///
  577|       |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
  578|      0|pub fn list_entity_names_by_relation(
  579|      0|    conn: &Connection,
  580|      0|    namespace: &str,
  581|      0|    relation: &str,
  582|      0|) -> Result<Vec<String>, AppError> {
  583|      0|    let mut stmt = conn.prepare_cached(
  584|      0|        "SELECT DISTINCT e.name FROM entities e
  585|      0|         INNER JOIN relationships r ON (e.id = r.source_id OR e.id = r.target_id)
  586|      0|         WHERE r.namespace = ?1 AND r.relation = ?2
  587|      0|         ORDER BY e.name",
  588|      0|    )?;
  589|      0|    let names: Vec<String> = stmt
  590|      0|        .query_map(params![namespace, relation], |row| row.get(0))?
  591|      0|        .collect::<Result<Vec<_>, _>>()?;
  592|      0|    Ok(names)
  593|      0|}
  594|       |
  595|       |/// Deletes all relationships matching a relation type within a namespace.
  596|       |///
  597|       |/// Operates in chunks of 1000 to avoid holding long write locks and blocking
  598|       |/// WAL readers. After deletion, recalculates degree for every affected entity.
  599|       |///
  600|       |/// Returns `(count_deleted, affected_entity_ids)`.
  601|       |///
  602|       |/// # Errors
  603|       |///
  604|       |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
  605|      0|pub fn delete_relationships_by_relation(
  606|      0|    conn: &Connection,
  607|      0|    namespace: &str,
  608|      0|    relation: &str,
  609|      0|) -> Result<(usize, Vec<i64>), AppError> {
  610|       |    // Step 1: collect all affected entity IDs before deletion.
  611|      0|    let mut stmt = conn.prepare_cached(
  612|      0|        "SELECT DISTINCT source_id FROM relationships WHERE namespace = ?1 AND relation = ?2
  613|      0|         UNION
  614|      0|         SELECT DISTINCT target_id FROM relationships WHERE namespace = ?1 AND relation = ?2",
  615|      0|    )?;
  616|      0|    let entity_ids: Vec<i64> = stmt
  617|      0|        .query_map(params![namespace, relation], |r| r.get::<_, i64>(0))?
  618|      0|        .collect::<Result<Vec<_>, _>>()?;
  619|       |
  620|       |    // Step 2: collect relationship IDs to delete.
  621|      0|    let mut id_stmt =
  622|      0|        conn.prepare_cached("SELECT id FROM relationships WHERE namespace = ?1 AND relation = ?2")?;
  623|      0|    let rel_ids: Vec<i64> = id_stmt
  624|      0|        .query_map(params![namespace, relation], |r| r.get::<_, i64>(0))?
  625|      0|        .collect::<Result<Vec<_>, _>>()?;
  626|       |
  627|       |    // Step 3: delete in chunks of 1000 (memory_relationships + relationships).
  628|      0|    let mut total_deleted: usize = 0;
  629|      0|    for chunk in rel_ids.chunks(1000) {
  630|      0|        for &rel_id in chunk {
  631|      0|            conn.execute(
  632|      0|                "DELETE FROM memory_relationships WHERE relationship_id = ?1",
  633|      0|                params![rel_id],
  634|      0|            )?;
  635|      0|            let affected =
  636|      0|                conn.execute("DELETE FROM relationships WHERE id = ?1", params![rel_id])?;
  637|      0|            total_deleted += affected;
  638|       |        }
  639|       |    }
  640|       |
  641|       |    // Step 4: recalculate degree for all affected entities.
  642|      0|    for &eid in &entity_ids {
  643|      0|        recalculate_degree(conn, eid)?;
  644|       |    }
  645|       |
  646|      0|    Ok((total_deleted, entity_ids))
  647|      0|}
  648|       |
  649|       |/// Searches the `vec_entities` virtual table for the k nearest neighbours.
  650|       |///
  651|       |/// # Errors
  652|       |///
  653|       |/// - [`AppError::Database`] — SQLite or sqlite-vec query failure.
  654|       |/// - [`AppError::Embedding`] — invalid or mismatched embedding dimension.
  655|      0|pub fn knn_search(
  656|      0|    conn: &Connection,
  657|      0|    embedding: &[f32],
  658|      0|    namespace: &str,
  659|      0|    k: usize,
  660|      0|) -> Result<Vec<(i64, f32)>, AppError> {
  661|      0|    let bytes = f32_to_bytes(embedding);
  662|      0|    let mut stmt = conn.prepare_cached(
  663|      0|        "SELECT entity_id, distance FROM vec_entities
  664|      0|         WHERE embedding MATCH ?1 AND namespace = ?2
  665|      0|         ORDER BY distance LIMIT ?3",
  666|      0|    )?;
  667|      0|    let rows = stmt
  668|      0|        .query_map(params![bytes, namespace, k as i64], |r| {
  669|      0|            Ok((r.get::<_, i64>(0)?, r.get::<_, f32>(1)?))
  670|      0|        })?
  671|      0|        .collect::<Result<Vec<_>, _>>()?;
  672|      0|    Ok(rows)
  673|      0|}
  674|       |
  675|       |#[cfg(test)]
  676|       |mod tests {
  677|       |    use super::*;
  678|       |    use crate::constants::EMBEDDING_DIM;
  679|       |    use crate::entity_type::EntityType;
  680|       |    use crate::storage::connection::register_vec_extension;
  681|       |    use rusqlite::Connection;
  682|       |    use tempfile::TempDir;
  683|       |
  684|       |    type TestResult = Result<(), Box<dyn std::error::Error>>;
  685|       |
  686|     31|    fn setup_db() -> Result<(TempDir, Connection), Box<dyn std::error::Error>> {
  687|     31|        register_vec_extension();
  688|     31|        let tmp = TempDir::new()?;
                                              ^0
  689|     31|        let db_path = tmp.path().join("test.db");
  690|     31|        let mut conn = Connection::open(&db_path)?;
                                                               ^0
  691|     31|        crate::migrations::runner().run(&mut conn)?;
                                                                ^0
  692|     31|        Ok((tmp, conn))
  693|     31|    }
  694|       |
  695|      3|    fn insert_memory(conn: &Connection) -> Result<i64, Box<dyn std::error::Error>> {
  696|      3|        conn.execute(
  697|      3|            "INSERT INTO memories (namespace, name, type, description, body, body_hash)
  698|      3|             VALUES ('global', 'test-mem', 'user', 'desc', 'body', 'hash1')",
  699|      3|            [],
  700|      0|        )?;
  701|      3|        Ok(conn.last_insert_rowid())
  702|      3|    }
  703|       |
  704|     45|    fn new_entity_helper(name: &str) -> NewEntity {
  705|     45|        NewEntity {
  706|     45|            name: name.to_string(),
  707|     45|            entity_type: EntityType::Project,
  708|     45|            description: None,
  709|     45|        }
  710|     45|    }
  711|       |
  712|      4|    fn embedding_zero() -> Vec<f32> {
  713|      4|        vec![0.0f32; EMBEDDING_DIM]
  714|      4|    }
  715|       |
  716|       |    // ------------------------------------------------------------------ //
  717|       |    // upsert_entity
  718|       |    // ------------------------------------------------------------------ //
  719|       |
  720|       |    #[test]
  721|      1|    fn test_upsert_entity_creates_new() -> TestResult {
  722|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
  723|      1|        let e = new_entity_helper("projeto-alpha");
  724|      1|        let id = upsert_entity(&conn, "global", &e)?;
                                                                 ^0
  725|      1|        assert!(id > 0);
  726|      1|        Ok(())
  727|      1|    }
  728|       |
  729|       |    #[test]
  730|      1|    fn test_upsert_entity_idempotent_returns_same_id() -> TestResult {
  731|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
  732|      1|        let e = new_entity_helper("projeto-beta");
  733|      1|        let id1 = upsert_entity(&conn, "global", &e)?;
                                                                  ^0
  734|      1|        let id2 = upsert_entity(&conn, "global", &e)?;
                                                                  ^0
  735|      1|        assert_eq!(id1, id2);
  736|      1|        Ok(())
  737|      1|    }
  738|       |
  739|       |    #[test]
  740|      1|    fn test_upsert_entity_updates_description() -> TestResult {
  741|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
  742|      1|        let e1 = new_entity_helper("projeto-gamma");
  743|      1|        let id1 = upsert_entity(&conn, "global", &e1)?;
                                                                   ^0
  744|       |
  745|      1|        let e2 = NewEntity {
  746|      1|            name: "projeto-gamma".to_string(),
  747|      1|            entity_type: EntityType::Tool,
  748|      1|            description: Some("nova desc".to_string()),
  749|      1|        };
  750|      1|        let id2 = upsert_entity(&conn, "global", &e2)?;
                                                                   ^0
  751|      1|        assert_eq!(id1, id2);
  752|       |
  753|      1|        let desc: Option<String> = conn.query_row(
  754|      1|            "SELECT description FROM entities WHERE id = ?1",
  755|      1|            params![id1],
  756|      1|            |r| r.get(0),
  757|      0|        )?;
  758|      1|        assert_eq!(desc.as_deref(), Some("nova desc"));
  759|      1|        Ok(())
  760|      1|    }
  761|       |
  762|       |    #[test]
  763|      1|    fn test_upsert_entity_different_namespaces_create_distinct_records() -> TestResult {
  764|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
  765|      1|        let e = new_entity_helper("compartilhada");
  766|      1|        let id1 = upsert_entity(&conn, "ns1", &e)?;
                                                               ^0
  767|      1|        let id2 = upsert_entity(&conn, "ns2", &e)?;
                                                               ^0
  768|      1|        assert_ne!(id1, id2);
  769|      1|        Ok(())
  770|      1|    }
  771|       |
  772|       |    // ------------------------------------------------------------------ //
  773|       |    // upsert_entity_vec — covers DELETE+INSERT (new branch after the OOM fix)
  774|       |    // ------------------------------------------------------------------ //
  775|       |
  776|       |    #[test]
  777|      1|    fn test_upsert_entity_vec_first_time_without_conflict() -> TestResult {
  778|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
  779|      1|        let e = new_entity_helper("vec-nova");
  780|      1|        let entity_id = upsert_entity(&conn, "global", &e)?;
                                                                        ^0
  781|      1|        let emb = embedding_zero();
  782|       |
  783|      1|        let result = upsert_entity_vec(
  784|      1|            &conn,
  785|      1|            entity_id,
  786|      1|            "global",
  787|      1|            EntityType::Project,
  788|      1|            &emb,
  789|      1|            "vec-nova",
  790|       |        );
  791|      1|        assert!(result.is_ok(), "first insertion must succeed");
                                              ^0
  792|       |
  793|      1|        let count: i64 = conn.query_row(
  794|      1|            "SELECT COUNT(*) FROM vec_entities WHERE entity_id = ?1",
  795|      1|            params![entity_id],
  796|      1|            |r| r.get(0),
  797|      0|        )?;
  798|      1|        assert_eq!(count, 1, "must have exactly one row after insertion");
                                           ^0
  799|      1|        Ok(())
  800|      1|    }
  801|       |
  802|       |    #[test]
  803|      1|    fn test_upsert_entity_vec_second_time_replaces_without_error() -> TestResult {
  804|       |        // Covers the branch where DELETE removes the existing row before INSERT.
  805|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
  806|      1|        let e = new_entity_helper("vec-existente");
  807|      1|        let entity_id = upsert_entity(&conn, "global", &e)?;
                                                                        ^0
  808|      1|        let emb = embedding_zero();
  809|       |
  810|      1|        upsert_entity_vec(
  811|      1|            &conn,
  812|      1|            entity_id,
  813|      1|            "global",
  814|      1|            EntityType::Project,
  815|      1|            &emb,
  816|      1|            "vec-existente",
  817|      0|        )?;
  818|       |
  819|       |        // Second call: DELETE returns 1 removed row, INSERT must succeed.
  820|      1|        let result = upsert_entity_vec(
  821|      1|            &conn,
  822|      1|            entity_id,
  823|      1|            "global",
  824|      1|            EntityType::Tool,
  825|      1|            &emb,
  826|      1|            "vec-existente",
  827|       |        );
  828|      1|        assert!(
  829|      1|            result.is_ok(),
  830|      0|            "second insertion (replace) must succeed: {result:?}"
  831|       |        );
  832|       |
  833|      1|        let count: i64 = conn.query_row(
  834|      1|            "SELECT COUNT(*) FROM vec_entities WHERE entity_id = ?1",
  835|      1|            params![entity_id],
  836|      1|            |r| r.get(0),
  837|      0|        )?;
  838|      1|        assert_eq!(count, 1, "must have exactly one row after replacement");
                                           ^0
  839|      1|        Ok(())
  840|      1|    }
  841|       |
  842|       |    #[test]
  843|      1|    fn test_upsert_entity_vec_multiple_independent_entities() -> TestResult {
  844|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
  845|      1|        let emb = embedding_zero();
  846|       |
  847|      4|        for i in 0..3i64 {
                          ^3
  848|      3|            let nome = format!("ent-{i}");
  849|      3|            let e = new_entity_helper(&nome);
  850|      3|            let entity_id = upsert_entity(&conn, "global", &e)?;
                                                                            ^0
  851|      3|            upsert_entity_vec(&conn, entity_id, "global", EntityType::Project, &emb, &nome)?;
                                                                                                         ^0
  852|       |        }
  853|       |
  854|      1|        let count: i64 = conn.query_row("SELECT COUNT(*) FROM vec_entities", [], |r| r.get(0))?;
                                                                                                            ^0
  855|      1|        assert_eq!(count, 3, "must have three distinct rows in vec_entities");
                                           ^0
  856|      1|        Ok(())
  857|      1|    }
  858|       |
  859|       |    // ------------------------------------------------------------------ //
  860|       |    // find_entity_id
  861|       |    // ------------------------------------------------------------------ //
  862|       |
  863|       |    #[test]
  864|      1|    fn test_find_entity_id_existing_returns_some() -> TestResult {
  865|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
  866|      1|        let e = new_entity_helper("entidade-busca");
  867|      1|        let id_inserido = upsert_entity(&conn, "global", &e)?;
                                                                          ^0
  868|      1|        let id_encontrado = find_entity_id(&conn, "global", "entidade-busca")?;
                                                                                           ^0
  869|      1|        assert_eq!(id_encontrado, Some(id_inserido));
  870|      1|        Ok(())
  871|      1|    }
  872|       |
  873|       |    #[test]
  874|      1|    fn test_find_entity_id_missing_returns_none() -> TestResult {
  875|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
  876|      1|        let id = find_entity_id(&conn, "global", "nao-existe")?;
                                                                            ^0
  877|      1|        assert_eq!(id, None);
  878|      1|        Ok(())
  879|      1|    }
  880|       |
  881|       |    // ------------------------------------------------------------------ //
  882|       |    // delete_entities_by_ids
  883|       |    // ------------------------------------------------------------------ //
  884|       |
  885|       |    #[test]
  886|      1|    fn test_delete_entities_by_ids_empty_list_returns_zero() -> TestResult {
  887|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
  888|      1|        let removed = delete_entities_by_ids(&conn, &[])?;
                                                                      ^0
  889|      1|        assert_eq!(removed, 0);
  890|      1|        Ok(())
  891|      1|    }
  892|       |
  893|       |    #[test]
  894|      1|    fn test_delete_entities_by_ids_removes_valid_entity() -> TestResult {
  895|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
  896|      1|        let e = new_entity_helper("to-delete");
  897|      1|        let entity_id = upsert_entity(&conn, "global", &e)?;
                                                                        ^0
  898|       |
  899|      1|        let removed = delete_entities_by_ids(&conn, &[entity_id])?;
                                                                               ^0
  900|      1|        assert_eq!(removed, 1);
  901|       |
  902|      1|        let id = find_entity_id(&conn, "global", "to-delete")?;
                                                                           ^0
  903|      1|        assert_eq!(id, None, "entity must have been removed");
                                           ^0
  904|      1|        Ok(())
  905|      1|    }
  906|       |
  907|       |    #[test]
  908|      1|    fn test_delete_entities_by_ids_missing_id_returns_zero() -> TestResult {
  909|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
  910|      1|        let removed = delete_entities_by_ids(&conn, &[9999])?;
                                                                          ^0
  911|      1|        assert_eq!(removed, 0);
  912|      1|        Ok(())
  913|      1|    }
  914|       |
  915|       |    #[test]
  916|      1|    fn test_delete_entities_by_ids_removes_multiple() -> TestResult {
  917|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
  918|      1|        let id1 = upsert_entity(&conn, "global", &new_entity_helper("del-a"))?;
                                                                                           ^0
  919|      1|        let id2 = upsert_entity(&conn, "global", &new_entity_helper("del-b"))?;
                                                                                           ^0
  920|      1|        let id3 = upsert_entity(&conn, "global", &new_entity_helper("del-c"))?;
                                                                                           ^0
  921|       |
  922|      1|        let removed = delete_entities_by_ids(&conn, &[id1, id2])?;
                                                                              ^0
  923|      1|        assert_eq!(removed, 2);
  924|       |
  925|      1|        assert!(find_entity_id(&conn, "global", "del-a")?.is_none());
                                                                      ^0
  926|      1|        assert!(find_entity_id(&conn, "global", "del-b")?.is_none());
                                                                      ^0
  927|      1|        assert!(find_entity_id(&conn, "global", "del-c")?.is_some());
                                                                      ^0
  928|      1|        let _ = id3;
  929|      1|        Ok(())
  930|      1|    }
  931|       |
  932|       |    #[test]
  933|      1|    fn test_delete_entities_by_ids_also_removes_vec() -> TestResult {
  934|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
  935|      1|        let e = new_entity_helper("del-com-vec");
  936|      1|        let entity_id = upsert_entity(&conn, "global", &e)?;
                                                                        ^0
  937|      1|        let emb = embedding_zero();
  938|      1|        upsert_entity_vec(
  939|      1|            &conn,
  940|      1|            entity_id,
  941|      1|            "global",
  942|      1|            EntityType::Project,
  943|      1|            &emb,
  944|      1|            "del-com-vec",
  945|      0|        )?;
  946|       |
  947|      1|        let count_antes: i64 = conn.query_row(
  948|      1|            "SELECT COUNT(*) FROM vec_entities WHERE entity_id = ?1",
  949|      1|            params![entity_id],
  950|      1|            |r| r.get(0),
  951|      0|        )?;
  952|      1|        assert_eq!(count_antes, 1);
  953|       |
  954|      1|        delete_entities_by_ids(&conn, &[entity_id])?;
                                                                 ^0
  955|       |
  956|      1|        let count_depois: i64 = conn.query_row(
  957|      1|            "SELECT COUNT(*) FROM vec_entities WHERE entity_id = ?1",
  958|      1|            params![entity_id],
  959|      1|            |r| r.get(0),
  960|      0|        )?;
  961|      1|        assert_eq!(
  962|       |            count_depois, 0,
  963|      0|            "vec_entities deve ser limpo junto com entities"
  964|       |        );
  965|      1|        Ok(())
  966|      1|    }
  967|       |
  968|       |    // ------------------------------------------------------------------ //
  969|       |    // upsert_relationship / find_relationship
  970|       |    // ------------------------------------------------------------------ //
  971|       |
  972|       |    #[test]
  973|      1|    fn test_upsert_relationship_creates_new() -> TestResult {
  974|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
  975|      1|        let id_a = upsert_entity(&conn, "global", &new_entity_helper("rel-a"))?;
                                                                                            ^0
  976|      1|        let id_b = upsert_entity(&conn, "global", &new_entity_helper("rel-b"))?;
                                                                                            ^0
  977|       |
  978|      1|        let rel = NewRelationship {
  979|      1|            source: "rel-a".to_string(),
  980|      1|            target: "rel-b".to_string(),
  981|      1|            relation: "uses".to_string(),
  982|      1|            strength: 0.8,
  983|      1|            description: None,
  984|      1|        };
  985|      1|        let rel_id = upsert_relationship(&conn, "global", id_a, id_b, &rel)?;
                                                                                         ^0
  986|      1|        assert!(rel_id > 0);
  987|      1|        Ok(())
  988|      1|    }
  989|       |
  990|       |    #[test]
  991|      1|    fn test_upsert_relationship_idempotent() -> TestResult {
  992|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
  993|      1|        let id_a = upsert_entity(&conn, "global", &new_entity_helper("idem-a"))?;
                                                                                             ^0
  994|      1|        let id_b = upsert_entity(&conn, "global", &new_entity_helper("idem-b"))?;
                                                                                             ^0
  995|       |
  996|      1|        let rel = NewRelationship {
  997|      1|            source: "idem-a".to_string(),
  998|      1|            target: "idem-b".to_string(),
  999|      1|            relation: "uses".to_string(),
 1000|      1|            strength: 0.5,
 1001|      1|            description: None,
 1002|      1|        };
 1003|      1|        let id1 = upsert_relationship(&conn, "global", id_a, id_b, &rel)?;
                                                                                      ^0
 1004|      1|        let id2 = upsert_relationship(&conn, "global", id_a, id_b, &rel)?;
                                                                                      ^0
 1005|      1|        assert_eq!(id1, id2);
 1006|      1|        Ok(())
 1007|      1|    }
 1008|       |
 1009|       |    #[test]
 1010|      1|    fn test_find_relationship_existing() -> TestResult {
 1011|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
 1012|      1|        let id_a = upsert_entity(&conn, "global", &new_entity_helper("fr-a"))?;
                                                                                           ^0
 1013|      1|        let id_b = upsert_entity(&conn, "global", &new_entity_helper("fr-b"))?;
                                                                                           ^0
 1014|       |
 1015|      1|        let rel = NewRelationship {
 1016|      1|            source: "fr-a".to_string(),
 1017|      1|            target: "fr-b".to_string(),
 1018|      1|            relation: "depends_on".to_string(),
 1019|      1|            strength: 0.7,
 1020|      1|            description: None,
 1021|      1|        };
 1022|      1|        upsert_relationship(&conn, "global", id_a, id_b, &rel)?;
                                                                            ^0
 1023|       |
 1024|      1|        let encontrada = find_relationship(&conn, id_a, id_b, "depends_on")?;
                                                                                         ^0
 1025|      1|        let row = encontrada.ok_or("relationship should exist")?;
                                                                             ^0
 1026|      1|        assert_eq!(row.source_id, id_a);
 1027|      1|        assert_eq!(row.target_id, id_b);
 1028|      1|        assert!((row.weight - 0.7).abs() < 1e-9);
 1029|      1|        Ok(())
 1030|      1|    }
 1031|       |
 1032|       |    #[test]
 1033|      1|    fn test_find_relationship_missing_returns_none() -> TestResult {
 1034|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
 1035|      1|        let resultado = find_relationship(&conn, 9999, 8888, "uses")?;
                                                                                  ^0
 1036|      1|        assert!(resultado.is_none());
 1037|      1|        Ok(())
 1038|      1|    }
 1039|       |
 1040|       |    // ------------------------------------------------------------------ //
 1041|       |    // link_memory_entity / link_memory_relationship
 1042|       |    // ------------------------------------------------------------------ //
 1043|       |
 1044|       |    #[test]
 1045|      1|    fn test_link_memory_entity_idempotent() -> TestResult {
 1046|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
 1047|      1|        let memory_id = insert_memory(&conn)?;
                                                          ^0
 1048|      1|        let entity_id = upsert_entity(&conn, "global", &new_entity_helper("me-ent"))?;
                                                                                                  ^0
 1049|       |
 1050|      1|        link_memory_entity(&conn, memory_id, entity_id)?;
                                                                     ^0
 1051|      1|        let resultado = link_memory_entity(&conn, memory_id, entity_id);
 1052|      1|        assert!(
 1053|      1|            resultado.is_ok(),
 1054|      0|            "INSERT OR IGNORE must not fail on duplicate"
 1055|       |        );
 1056|      1|        Ok(())
 1057|      1|    }
 1058|       |
 1059|       |    #[test]
 1060|      1|    fn test_link_memory_relationship_idempotent() -> TestResult {
 1061|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
 1062|      1|        let memory_id = insert_memory(&conn)?;
                                                          ^0
 1063|      1|        let id_a = upsert_entity(&conn, "global", &new_entity_helper("mr-a"))?;
                                                                                           ^0
 1064|      1|        let id_b = upsert_entity(&conn, "global", &new_entity_helper("mr-b"))?;
                                                                                           ^0
 1065|       |
 1066|      1|        let rel = NewRelationship {
 1067|      1|            source: "mr-a".to_string(),
 1068|      1|            target: "mr-b".to_string(),
 1069|      1|            relation: "uses".to_string(),
 1070|      1|            strength: 0.5,
 1071|      1|            description: None,
 1072|      1|        };
 1073|      1|        let rel_id = upsert_relationship(&conn, "global", id_a, id_b, &rel)?;
                                                                                         ^0
 1074|       |
 1075|      1|        link_memory_relationship(&conn, memory_id, rel_id)?;
                                                                        ^0
 1076|      1|        let resultado = link_memory_relationship(&conn, memory_id, rel_id);
 1077|      1|        assert!(
 1078|      1|            resultado.is_ok(),
 1079|      0|            "INSERT OR IGNORE must not fail on duplicate"
 1080|       |        );
 1081|      1|        Ok(())
 1082|      1|    }
 1083|       |
 1084|       |    // ------------------------------------------------------------------ //
 1085|       |    // increment_degree / recalculate_degree
 1086|       |    // ------------------------------------------------------------------ //
 1087|       |
 1088|       |    #[test]
 1089|      1|    fn test_increment_degree_increases_counter() -> TestResult {
 1090|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
 1091|      1|        let entity_id = upsert_entity(&conn, "global", &new_entity_helper("grau-ent"))?;
                                                                                                    ^0
 1092|       |
 1093|      1|        increment_degree(&conn, entity_id)?;
                                                        ^0
 1094|      1|        increment_degree(&conn, entity_id)?;
                                                        ^0
 1095|       |
 1096|      1|        let degree: i64 = conn.query_row(
 1097|      1|            "SELECT degree FROM entities WHERE id = ?1",
 1098|      1|            params![entity_id],
 1099|      1|            |r| r.get(0),
 1100|      0|        )?;
 1101|      1|        assert_eq!(degree, 2);
 1102|      1|        Ok(())
 1103|      1|    }
 1104|       |
 1105|       |    #[test]
 1106|      1|    fn test_recalculate_degree_reflects_actual_relations() -> TestResult {
 1107|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
 1108|      1|        let id_a = upsert_entity(&conn, "global", &new_entity_helper("rc-a"))?;
                                                                                           ^0
 1109|      1|        let id_b = upsert_entity(&conn, "global", &new_entity_helper("rc-b"))?;
                                                                                           ^0
 1110|      1|        let id_c = upsert_entity(&conn, "global", &new_entity_helper("rc-c"))?;
                                                                                           ^0
 1111|       |
 1112|      1|        let rel1 = NewRelationship {
 1113|      1|            source: "rc-a".to_string(),
 1114|      1|            target: "rc-b".to_string(),
 1115|      1|            relation: "uses".to_string(),
 1116|      1|            strength: 0.5,
 1117|      1|            description: None,
 1118|      1|        };
 1119|      1|        let rel2 = NewRelationship {
 1120|      1|            source: "rc-c".to_string(),
 1121|      1|            target: "rc-a".to_string(),
 1122|      1|            relation: "depends_on".to_string(),
 1123|      1|            strength: 0.5,
 1124|      1|            description: None,
 1125|      1|        };
 1126|      1|        upsert_relationship(&conn, "global", id_a, id_b, &rel1)?;
                                                                             ^0
 1127|      1|        upsert_relationship(&conn, "global", id_c, id_a, &rel2)?;
                                                                             ^0
 1128|       |
 1129|      1|        recalculate_degree(&conn, id_a)?;
                                                     ^0
 1130|       |
 1131|      1|        let degree: i64 = conn.query_row(
 1132|      1|            "SELECT degree FROM entities WHERE id = ?1",
 1133|      1|            params![id_a],
 1134|      1|            |r| r.get(0),
 1135|      0|        )?;
 1136|      1|        assert_eq!(
 1137|       |            degree, 2,
 1138|      0|            "rc-a appears in two relationships (source+target)"
 1139|       |        );
 1140|      1|        Ok(())
 1141|      1|    }
 1142|       |
 1143|       |    // ------------------------------------------------------------------ //
 1144|       |    // find_orphan_entity_ids
 1145|       |    // ------------------------------------------------------------------ //
 1146|       |
 1147|       |    #[test]
 1148|      1|    fn test_find_orphan_entity_ids_without_orphans() -> TestResult {
 1149|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
 1150|      1|        let memory_id = insert_memory(&conn)?;
                                                          ^0
 1151|      1|        let entity_id = upsert_entity(&conn, "global", &new_entity_helper("nao-orfa"))?;
                                                                                                    ^0
 1152|      1|        link_memory_entity(&conn, memory_id, entity_id)?;
                                                                     ^0
 1153|       |
 1154|      1|        let orfas = find_orphan_entity_ids(&conn, Some("global"))?;
                                                                               ^0
 1155|      1|        assert!(!orfas.contains(&entity_id));
 1156|      1|        Ok(())
 1157|      1|    }
 1158|       |
 1159|       |    #[test]
 1160|      1|    fn test_find_orphan_entity_ids_detects_orphans() -> TestResult {
 1161|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
 1162|      1|        let entity_id = upsert_entity(&conn, "global", &new_entity_helper("sim-orfa"))?;
                                                                                                    ^0
 1163|       |
 1164|      1|        let orfas = find_orphan_entity_ids(&conn, Some("global"))?;
                                                                               ^0
 1165|      1|        assert!(orfas.contains(&entity_id));
 1166|      1|        Ok(())
 1167|      1|    }
 1168|       |
 1169|       |    #[test]
 1170|      1|    fn test_find_orphan_entity_ids_without_namespace_returns_all() -> TestResult {
 1171|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
 1172|      1|        let id1 = upsert_entity(&conn, "ns-a", &new_entity_helper("orfa-a"))?;
                                                                                          ^0
 1173|      1|        let id2 = upsert_entity(&conn, "ns-b", &new_entity_helper("orfa-b"))?;
                                                                                          ^0
 1174|       |
 1175|      1|        let orfas = find_orphan_entity_ids(&conn, None)?;
                                                                     ^0
 1176|      1|        assert!(orfas.contains(&id1));
 1177|      1|        assert!(orfas.contains(&id2));
 1178|      1|        Ok(())
 1179|      1|    }
 1180|       |
 1181|       |    // ------------------------------------------------------------------ //
 1182|       |    // list_entities / list_relationships_by_namespace
 1183|       |    // ------------------------------------------------------------------ //
 1184|       |
 1185|       |    #[test]
 1186|      1|    fn test_list_entities_with_namespace() -> TestResult {
 1187|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
 1188|      1|        upsert_entity(&conn, "le-ns", &new_entity_helper("le-ent-1"))?;
                                                                                   ^0
 1189|      1|        upsert_entity(&conn, "le-ns", &new_entity_helper("le-ent-2"))?;
                                                                                   ^0
 1190|      1|        upsert_entity(&conn, "outro-ns", &new_entity_helper("le-ent-3"))?;
                                                                                      ^0
 1191|       |
 1192|      1|        let lista = list_entities(&conn, Some("le-ns"))?;
                                                                     ^0
 1193|      1|        assert_eq!(lista.len(), 2);
 1194|      2|        assert!(lista.iter().all(|e| e.namespace == "le-ns"));
                      ^1      ^1           ^1
 1195|      1|        Ok(())
 1196|      1|    }
 1197|       |
 1198|       |    #[test]
 1199|      1|    fn test_list_entities_without_namespace_returns_all() -> TestResult {
 1200|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
 1201|      1|        upsert_entity(&conn, "ns1", &new_entity_helper("all-ent-1"))?;
                                                                                  ^0
 1202|      1|        upsert_entity(&conn, "ns2", &new_entity_helper("all-ent-2"))?;
                                                                                  ^0
 1203|       |
 1204|      1|        let lista = list_entities(&conn, None)?;
                                                            ^0
 1205|      1|        assert!(lista.len() >= 2);
 1206|      1|        Ok(())
 1207|      1|    }
 1208|       |
 1209|       |    #[test]
 1210|      1|    fn test_list_relationships_by_namespace_filters_correctly() -> TestResult {
 1211|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
 1212|      1|        let id_a = upsert_entity(&conn, "rel-ns", &new_entity_helper("lr-a"))?;
                                                                                           ^0
 1213|      1|        let id_b = upsert_entity(&conn, "rel-ns", &new_entity_helper("lr-b"))?;
                                                                                           ^0
 1214|       |
 1215|      1|        let rel = NewRelationship {
 1216|      1|            source: "lr-a".to_string(),
 1217|      1|            target: "lr-b".to_string(),
 1218|      1|            relation: "uses".to_string(),
 1219|      1|            strength: 0.5,
 1220|      1|            description: None,
 1221|      1|        };
 1222|      1|        upsert_relationship(&conn, "rel-ns", id_a, id_b, &rel)?;
                                                                            ^0
 1223|       |
 1224|      1|        let lista = list_relationships_by_namespace(&conn, Some("rel-ns"))?;
                                                                                        ^0
 1225|      1|        assert!(!lista.is_empty());
 1226|      1|        assert!(lista.iter().all(|r| r.namespace == "rel-ns"));
 1227|      1|        Ok(())
 1228|      1|    }
 1229|       |
 1230|       |    // ------------------------------------------------------------------ //
 1231|       |    // delete_relationship_by_id / create_or_fetch_relationship
 1232|       |    // ------------------------------------------------------------------ //
 1233|       |
 1234|       |    #[test]
 1235|      1|    fn test_delete_relationship_by_id_removes_relation() -> TestResult {
 1236|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
 1237|      1|        let id_a = upsert_entity(&conn, "global", &new_entity_helper("dr-a"))?;
                                                                                           ^0
 1238|      1|        let id_b = upsert_entity(&conn, "global", &new_entity_helper("dr-b"))?;
                                                                                           ^0
 1239|       |
 1240|      1|        let rel = NewRelationship {
 1241|      1|            source: "dr-a".to_string(),
 1242|      1|            target: "dr-b".to_string(),
 1243|      1|            relation: "uses".to_string(),
 1244|      1|            strength: 0.5,
 1245|      1|            description: None,
 1246|      1|        };
 1247|      1|        let rel_id = upsert_relationship(&conn, "global", id_a, id_b, &rel)?;
                                                                                         ^0
 1248|       |
 1249|      1|        delete_relationship_by_id(&conn, rel_id)?;
                                                              ^0
 1250|       |
 1251|      1|        let encontrada = find_relationship(&conn, id_a, id_b, "uses")?;
                                                                                   ^0
 1252|      1|        assert!(encontrada.is_none(), "relationship must have been removed");
                                                    ^0
 1253|      1|        Ok(())
 1254|      1|    }
 1255|       |
 1256|       |    #[test]
 1257|      1|    fn test_create_or_fetch_relationship_creates_new() -> TestResult {
 1258|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
 1259|      1|        let id_a = upsert_entity(&conn, "global", &new_entity_helper("cf-a"))?;
                                                                                           ^0
 1260|      1|        let id_b = upsert_entity(&conn, "global", &new_entity_helper("cf-b"))?;
                                                                                           ^0
 1261|       |
 1262|      1|        let (rel_id, created) =
 1263|      1|            create_or_fetch_relationship(&conn, "global", id_a, id_b, "uses", 0.5, None)?;
                                                                                                      ^0
 1264|      1|        assert!(rel_id > 0);
 1265|      1|        assert!(created);
 1266|      1|        Ok(())
 1267|      1|    }
 1268|       |
 1269|       |    #[test]
 1270|      1|    fn test_create_or_fetch_relationship_returns_existing() -> TestResult {
 1271|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
 1272|      1|        let id_a = upsert_entity(&conn, "global", &new_entity_helper("cf2-a"))?;
                                                                                            ^0
 1273|      1|        let id_b = upsert_entity(&conn, "global", &new_entity_helper("cf2-b"))?;
                                                                                            ^0
 1274|       |
 1275|      1|        create_or_fetch_relationship(&conn, "global", id_a, id_b, "uses", 0.5, None)?;
                                                                                                  ^0
 1276|      1|        let (_, created) =
 1277|      1|            create_or_fetch_relationship(&conn, "global", id_a, id_b, "uses", 0.5, None)?;
                                                                                                      ^0
 1278|      1|        assert!(
 1279|      1|            !created,
 1280|      0|            "second call must return the existing relationship"
 1281|       |        );
 1282|      1|        Ok(())
 1283|      1|    }
 1284|       |
 1285|       |    // ------------------------------------------------------------------ //
 1286|       |    // serde alias: field "type" accepted as a synonym for "entity_type"
 1287|       |    // ------------------------------------------------------------------ //
 1288|       |
 1289|       |    #[test]
 1290|      1|    fn accepts_type_field_as_alias() -> TestResult {
 1291|      1|        let json = r#"{"name": "X", "type": "concept"}"#;
 1292|      1|        let ent: NewEntity = serde_json::from_str(json)?;
                                                                     ^0
 1293|      1|        assert_eq!(ent.entity_type, EntityType::Concept);
 1294|      1|        Ok(())
 1295|      1|    }
 1296|       |
 1297|       |    #[test]
 1298|      1|    fn accepts_canonical_entity_type_field() -> TestResult {
 1299|      1|        let json = r#"{"name": "X", "entity_type": "concept"}"#;
 1300|      1|        let ent: NewEntity = serde_json::from_str(json)?;
                                                                     ^0
 1301|      1|        assert_eq!(ent.entity_type, EntityType::Concept);
 1302|      1|        Ok(())
 1303|      1|    }
 1304|       |
 1305|       |    #[test]
 1306|      1|    fn both_fields_present_yields_duplicate_error() {
 1307|       |        // having both entity_type and type in the same JSON is a duplicate and must fail
 1308|      1|        let json = r#"{"name": "X", "entity_type": "concept", "type": "person"}"#;
 1309|      1|        let resultado: Result<NewEntity, _> = serde_json::from_str(json);
 1310|      1|        assert!(
 1311|      1|            resultado.is_err(),
 1312|      0|            "both fields in the same JSON are a duplicate"
 1313|       |        );
 1314|      1|    }
 1315|       |
 1316|       |    #[test]
 1317|      1|    fn validate_entity_name_accepts_valid() {
 1318|      1|        assert!(validate_entity_name("rust-lang").is_ok());
 1319|      1|        assert!(validate_entity_name("sqlite-graphrag").is_ok());
 1320|      1|        assert!(validate_entity_name("ab").is_ok());
 1321|      1|    }
 1322|       |
 1323|       |    #[test]
 1324|      1|    fn validate_entity_name_rejects_short() {
 1325|      1|        assert!(validate_entity_name("a").is_err());
 1326|      1|        assert!(validate_entity_name("").is_err());
 1327|      1|    }
 1328|       |
 1329|       |    #[test]
 1330|      1|    fn validate_entity_name_rejects_newlines() {
 1331|      1|        assert!(validate_entity_name("foo\nbar").is_err());
 1332|      1|        assert!(validate_entity_name("foo\rbar").is_err());
 1333|      1|    }
 1334|       |
 1335|       |    #[test]
 1336|      1|    fn validate_entity_name_rejects_short_allcaps() {
 1337|      1|        assert!(validate_entity_name("RAM").is_err());
 1338|      1|        assert!(validate_entity_name("NAO").is_err());
 1339|      1|        assert!(validate_entity_name("OK").is_err());
 1340|      1|    }
 1341|       |
 1342|       |    #[test]
 1343|      1|    fn validate_entity_name_accepts_long_allcaps() {
 1344|      1|        assert!(validate_entity_name("SQLITE").is_ok());
 1345|      1|        assert!(validate_entity_name("GRAPHRAG").is_ok());
 1346|      1|    }
 1347|       |
 1348|       |    #[test]
 1349|      1|    fn validate_entity_name_accepts_mixed_case() {
 1350|      1|        assert!(validate_entity_name("FTS5").is_ok()); // 4 chars but has digit
 1351|      1|        assert!(validate_entity_name("WAL").is_err()); // 3 chars ALL_CAPS
 1352|      1|    }
 1353|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/storage/fusion.rs:
    1|       |//! RRF (Reciprocal Rank Fusion) utilities shared between `hybrid-search` and
    2|       |//! `deep-research`.
    3|       |//!
    4|       |//! The formula used is the canonical RRF score:
    5|       |//!
    6|       |//! ```text
    7|       |//! score(d) = sum_over_lists { weight * 1 / (rrf_k + rank(d)) }
    8|       |//! ```
    9|       |//!
   10|       |//! where `rank` is 1-indexed position in each ordered list.  The map returned
   11|       |//! by [`rrf_fuse`] contains un-normalised scores; callers that need a `[0,1]`
   12|       |//! range should divide by the theoretical maximum:
   13|       |//!
   14|       |//! ```text
   15|       |//! max_possible = sum_over_lists { weight * 1 / (rrf_k + 1) }
   16|       |//! ```
   17|       |
   18|       |use std::collections::HashMap;
   19|       |
   20|       |/// Fuse multiple ranked lists of integer IDs via Reciprocal Rank Fusion.
   21|       |///
   22|       |/// Each element of `lists` is `(weight, ranked_ids)` where `ranked_ids` is
   23|       |/// ordered best-first (index 0 = rank 1).
   24|       |///
   25|       |/// Returns a `HashMap<id, combined_score>` using un-normalised RRF scores.
   26|       |/// Higher score means higher relevance.
   27|       |///
   28|       |/// # Examples
   29|       |///
   30|       |/// ```
   31|       |/// use sqlite_graphrag::storage::fusion::rrf_fuse;
   32|       |///
   33|       |/// // Two lists with equal weight — item 1 appears in both at rank 1 and 2
   34|       |/// // so it accumulates more score than item 2 (rank 2) or item 3 (rank 1 only).
   35|       |/// let knn: Vec<i64> = vec![1, 2];
   36|       |/// let fts: Vec<i64> = vec![1, 3];
   37|       |/// let scores = rrf_fuse(&[(1.0, &knn), (1.0, &fts)], 60.0);
   38|       |/// assert!(scores[&1] > scores[&2]);
   39|       |/// assert!(scores[&1] > scores[&3]);
   40|       |/// ```
   41|      9|pub fn rrf_fuse(lists: &[(f64, &Vec<i64>)], rrf_k: f64) -> HashMap<i64, f64> {
   42|     14|    let total_ids: usize = lists.iter().map(|(_, ids)| ids.len()).sum();
                      ^9         ^9      ^9    ^9     ^9                        ^9
   43|      9|    let mut combined: HashMap<i64, f64> = HashMap::with_capacity(total_ids);
   44|     23|    for (weight, ids) in lists {
                       ^14     ^14
   45|     31|        for (rank, &id) in ids.iter().enumerate() {
                                         ^14        ^14
   46|     31|            // rank is 0-indexed here; formula uses 1-indexed, so we add 1.
   47|     31|            let contribution = weight * (1.0 / (rrf_k + rank as f64 + 1.0));
   48|     31|            *combined.entry(id).or_insert(0.0) += contribution;
   49|     31|        }
   50|       |    }
   51|      9|    combined
   52|      9|}
   53|       |
   54|       |/// Compute the theoretical maximum RRF score for a given set of weights and
   55|       |/// `rrf_k`.
   56|       |///
   57|       |/// Useful for normalising `rrf_fuse` scores to `[0, 1]`:
   58|       |///
   59|       |/// ```
   60|       |/// use sqlite_graphrag::storage::fusion::{rrf_fuse, rrf_max_possible};
   61|       |///
   62|       |/// let weights = vec![1.0_f64, 1.0_f64];
   63|       |/// let max = rrf_max_possible(&weights, 60.0);
   64|       |/// assert!(max > 0.0);
   65|       |/// ```
   66|      2|pub fn rrf_max_possible(weights: &[f64], rrf_k: f64) -> f64 {
   67|      3|    weights.iter().map(|w| w * (1.0 / (rrf_k + 1.0))).sum()
                  ^2      ^2     ^2                                 ^2
   68|      2|}
   69|       |
   70|       |#[cfg(test)]
   71|       |mod tests {
   72|       |    use super::*;
   73|       |
   74|       |    #[test]
   75|      1|    fn rrf_fuse_single_list_rank_order_preserved() {
   76|       |        // Items at lower rank index get higher scores.
   77|      1|        let list = vec![10i64, 20, 30];
   78|      1|        let scores = rrf_fuse(&[(1.0, &list)], 60.0);
   79|      1|        assert!(scores[&10] > scores[&20]);
   80|      1|        assert!(scores[&20] > scores[&30]);
   81|      1|    }
   82|       |
   83|       |    #[test]
   84|      1|    fn rrf_fuse_two_lists_overlap_accumulates() {
   85|       |        // Item 1 appears first in both lists — must beat item 2 (rank 1 in one list only).
   86|      1|        let knn = vec![1i64, 2];
   87|      1|        let fts = vec![1i64, 3];
   88|      1|        let scores = rrf_fuse(&[(1.0, &knn), (1.0, &fts)], 60.0);
   89|      1|        assert!(scores[&1] > scores[&2], "overlap item must score higher");
                                                       ^0
   90|      1|        assert!(scores[&1] > scores[&3], "overlap item must score higher");
                                                       ^0
   91|      1|    }
   92|       |
   93|       |    #[test]
   94|      1|    fn rrf_fuse_empty_lists_returns_empty() {
   95|      1|        let empty: Vec<i64> = vec![];
   96|      1|        let scores = rrf_fuse(&[(1.0, &empty)], 60.0);
   97|      1|        assert!(scores.is_empty());
   98|      1|    }
   99|       |
  100|       |    #[test]
  101|      1|    fn rrf_fuse_zero_weight_list_has_no_effect() {
  102|      1|        let list_a = vec![1i64, 2];
  103|      1|        let list_b = vec![3i64, 4];
  104|      1|        let scores_with = rrf_fuse(&[(1.0, &list_a), (0.0, &list_b)], 60.0);
  105|       |        // Items 3 and 4 should have score 0.0 (or not present).
  106|      1|        assert_eq!(scores_with.get(&3).copied().unwrap_or(0.0), 0.0);
  107|      1|        assert_eq!(scores_with.get(&4).copied().unwrap_or(0.0), 0.0);
  108|      1|    }
  109|       |
  110|       |    #[test]
  111|      1|    fn rrf_fuse_weights_scale_contribution() {
  112|       |        // Higher weight means higher score for same rank.
  113|      1|        let list = vec![1i64];
  114|      1|        let low = rrf_fuse(&[(0.5, &list)], 60.0);
  115|      1|        let high = rrf_fuse(&[(2.0, &list)], 60.0);
  116|      1|        assert!(high[&1] > low[&1]);
  117|      1|    }
  118|       |
  119|       |    #[test]
  120|      1|    fn rrf_max_possible_sums_weights() {
  121|       |        // With rrf_k=60, max for one list of weight 1.0 is 1/(60+1) ≈ 0.01639.
  122|      1|        let max = rrf_max_possible(&[1.0], 60.0);
  123|      1|        let expected = 1.0 / 61.0;
  124|      1|        assert!((max - expected).abs() < 1e-9);
  125|       |
  126|       |        // Two equal-weight lists: sum of both.
  127|      1|        let max2 = rrf_max_possible(&[1.0, 1.0], 60.0);
  128|      1|        assert!((max2 - 2.0 / 61.0).abs() < 1e-9);
  129|      1|    }
  130|       |
  131|       |    #[test]
  132|      1|    fn rrf_fuse_deterministic_for_same_input() {
  133|      1|        let list_a = vec![1i64, 2, 3];
  134|      1|        let list_b = vec![2i64, 1, 4];
  135|      1|        let scores_1 = rrf_fuse(&[(1.0, &list_a), (1.0, &list_b)], 60.0);
  136|      1|        let scores_2 = rrf_fuse(&[(1.0, &list_a), (1.0, &list_b)], 60.0);
  137|      5|        for id in [1i64, 2, 3, 4] {
                          ^4
  138|      4|            assert_eq!(
  139|      4|                scores_1.get(&id).copied().unwrap_or(0.0),
  140|      4|                scores_2.get(&id).copied().unwrap_or(0.0)
  141|       |            );
  142|       |        }
  143|      1|    }
  144|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/storage/memories.rs:
    1|       |//! Persistence layer for the `memories` table and its vector companion.
    2|       |//!
    3|       |//! Functions here encapsulate every SQL statement touching `memories`,
    4|       |//! `vec_memories` and the FTS5 `fts_memories` shadow table. Callers receive
    5|       |//! typed [`MemoryRow`] or [`NewMemory`] values and never build SQL strings.
    6|       |
    7|       |use crate::embedder::f32_to_bytes;
    8|       |use crate::errors::AppError;
    9|       |use crate::storage::utils::with_busy_retry;
   10|       |use rusqlite::{params, Connection};
   11|       |use serde::{Deserialize, Serialize};
   12|       |
   13|       |/// Input payload for inserting or updating a memory.
   14|       |///
   15|       |/// `body_hash` must be the BLAKE3 digest of `body`. The `metadata` field is
   16|       |/// stored as a TEXT column containing JSON.
   17|       |#[derive(Debug, Serialize, Deserialize)]
   18|       |pub struct NewMemory {
   19|       |    pub namespace: String,
   20|       |    pub name: String,
   21|       |    pub memory_type: String,
   22|       |    pub description: String,
   23|       |    pub body: String,
   24|       |    pub body_hash: String,
   25|       |    pub session_id: Option<String>,
   26|       |    pub source: String,
   27|       |    pub metadata: serde_json::Value,
   28|       |}
   29|       |
   30|       |/// Fully materialized row from the `memories` table.
   31|       |///
   32|       |/// Returned by [`read_by_name`], [`read_full`], [`list`] and [`fts_search`].
   33|       |/// The `metadata` field is kept as a JSON string to avoid double parsing.
   34|       |#[derive(Debug, Serialize)]
   35|       |pub struct MemoryRow {
   36|       |    pub id: i64,
   37|       |    pub namespace: String,
   38|       |    pub name: String,
   39|       |    pub memory_type: String,
   40|       |    pub description: String,
   41|       |    pub body: String,
   42|       |    pub body_hash: String,
   43|       |    pub session_id: Option<String>,
   44|       |    pub source: String,
   45|       |    pub metadata: String,
   46|       |    pub created_at: i64,
   47|       |    pub updated_at: i64,
   48|       |    /// Unix epoch when the memory was soft-deleted, or `None` for active memories.
   49|       |    /// Surfaced in `list --include-deleted --json` so LLM consumers can distinguish
   50|       |    /// active from soft-deleted rows without a second SQL query (v1.0.37 H7+M9 fix).
   51|       |    #[serde(skip_serializing_if = "Option::is_none")]
   52|       |    pub deleted_at: Option<i64>,
   53|       |}
   54|       |
   55|       |/// Finds a live memory by `(namespace, name)` and returns key metadata.
   56|       |///
   57|       |/// # Arguments
   58|       |///
   59|       |/// - `conn` — open SQLite connection configured with the project pragmas.
   60|       |/// - `namespace` — resolved namespace for the lookup.
   61|       |/// - `name` — kebab-case memory name.
   62|       |///
   63|       |/// # Returns
   64|       |///
   65|       |/// `Ok(Some((id, updated_at, max_version)))` when the memory exists and is
   66|       |/// not soft-deleted, `Ok(None)` otherwise.
   67|       |///
   68|       |/// # Errors
   69|       |///
   70|       |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
   71|      8|pub fn find_by_name(
   72|      8|    conn: &Connection,
   73|      8|    namespace: &str,
   74|      8|    name: &str,
   75|      8|) -> Result<Option<(i64, i64, i64)>, AppError> {
   76|      8|    let mut stmt = conn.prepare_cached(
   77|      8|        "SELECT m.id, m.updated_at, COALESCE(MAX(v.version), 0)
   78|      8|         FROM memories m
   79|      8|         LEFT JOIN memory_versions v ON v.memory_id = m.id
   80|      8|         WHERE m.namespace = ?1 AND m.name = ?2 AND m.deleted_at IS NULL
   81|      8|         GROUP BY m.id",
   82|      0|    )?;
   83|      8|    let result = stmt.query_row(params![namespace, name], |r| {
                                                                            ^5
   84|       |        Ok((
   85|      5|            r.get::<_, i64>(0)?,
                                            ^0
   86|      5|            r.get::<_, i64>(1)?,
                                            ^0
   87|      5|            r.get::<_, i64>(2)?,
                                            ^0
   88|       |        ))
   89|      5|    });
   90|      3|    match result {
   91|      5|        Ok(row) => Ok(Some(row)),
   92|      3|        Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
   93|      0|        Err(e) => Err(AppError::Database(e)),
   94|       |    }
   95|      8|}
   96|       |
   97|       |/// Looks up a memory by `(namespace, name)` regardless of deletion state.
   98|       |///
   99|       |/// Returns `Some((id, is_deleted))` when the row exists.
  100|       |/// `is_deleted` is `true` when `deleted_at IS NOT NULL`.
  101|       |///
  102|       |/// # Errors
  103|       |///
  104|       |/// Propagates [`AppError::Database`] on SQLite failures.
  105|      3|pub fn find_by_name_any_state(
  106|      3|    conn: &Connection,
  107|      3|    namespace: &str,
  108|      3|    name: &str,
  109|      3|) -> Result<Option<(i64, bool)>, AppError> {
  110|      3|    let mut stmt = conn.prepare_cached(
  111|      3|        "SELECT id, (deleted_at IS NOT NULL) AS is_deleted
  112|      3|         FROM memories WHERE namespace = ?1 AND name = ?2",
  113|      0|    )?;
  114|      3|    let result = stmt.query_row(params![namespace, name], |r| {
                                                                            ^2
  115|      2|        Ok((r.get::<_, i64>(0)?, r.get::<_, bool>(1)?))
                                            ^0                    ^0
  116|      2|    });
  117|      1|    match result {
  118|      2|        Ok(row) => Ok(Some(row)),
  119|      1|        Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
  120|      0|        Err(e) => Err(AppError::Database(e)),
  121|       |    }
  122|      3|}
  123|       |
  124|       |/// Clears `deleted_at` to restore a soft-deleted memory.
  125|       |///
  126|       |/// # Errors
  127|       |///
  128|       |/// Propagates [`AppError::Database`] on SQLite failures.
  129|      1|pub fn clear_deleted_at(conn: &Connection, memory_id: i64) -> Result<(), AppError> {
  130|      1|    conn.execute(
  131|      1|        "UPDATE memories SET deleted_at = NULL WHERE id = ?1",
  132|      1|        params![memory_id],
  133|      0|    )?;
  134|      1|    Ok(())
  135|      1|}
  136|       |
  137|       |/// Looks up a live memory by exact `body_hash` within a namespace.
  138|       |///
  139|       |/// Used during `remember` to short-circuit semantic duplicates before
  140|       |/// spending an embedding call.
  141|       |///
  142|       |/// # Returns
  143|       |///
  144|       |/// `Ok(Some(id))` when a live memory with the same hash exists,
  145|       |/// `Ok(None)` otherwise.
  146|       |///
  147|       |/// # Errors
  148|       |///
  149|       |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
  150|      3|pub fn find_by_hash(
  151|      3|    conn: &Connection,
  152|      3|    namespace: &str,
  153|      3|    body_hash: &str,
  154|      3|) -> Result<Option<i64>, AppError> {
  155|      3|    let mut stmt = conn.prepare_cached(
  156|      3|        "SELECT id FROM memories WHERE namespace = ?1 AND body_hash = ?2 AND deleted_at IS NULL",
  157|      0|    )?;
  158|      3|    match stmt.query_row(params![namespace, body_hash], |r| r.get(0)) {
                                                                          ^1^1
  159|      1|        Ok(id) => Ok(Some(id)),
  160|      2|        Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
  161|      0|        Err(e) => Err(AppError::Database(e)),
  162|       |    }
  163|      3|}
  164|       |
  165|       |/// Inserts a new row into the `memories` table.
  166|       |///
  167|       |/// # Arguments
  168|       |///
  169|       |/// - `conn` — active SQLite connection, typically inside a transaction.
  170|       |/// - `m` — validated payload including `body_hash` and serialized metadata.
  171|       |///
  172|       |/// # Returns
  173|       |///
  174|       |/// The `rowid` assigned to the newly inserted memory.
  175|       |///
  176|       |/// # Errors
  177|       |///
  178|       |/// Returns `Err(AppError::Database)` on insertion failure and
  179|       |/// `Err(AppError::Json)` if metadata serialization fails.
  180|     39|pub fn insert(conn: &Connection, m: &NewMemory) -> Result<i64, AppError> {
  181|       |    // G29 Passo 2 (v1.0.69): runtime guard for the CHECK constraint on
  182|       |    // `source`. Even though `MemorySource` is the typed future, every
  183|       |    // legacy `NewMemory { source: "..." }` literal still flows through
  184|       |    // this function; validating here keeps the footgun from regressing
  185|       |    // for callers that have not yet migrated to the enum.
  186|     39|    let validated_source = crate::memory_source::validate_source(&m.source)?;
                                                                                         ^0
  187|     39|    conn.execute(
  188|     39|        "INSERT INTO memories (namespace, name, type, description, body, body_hash, session_id, source, metadata)
  189|     39|         VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)",
  190|     39|        params![
  191|       |            m.namespace, m.name, m.memory_type, m.description, m.body,
  192|       |            m.body_hash, m.session_id, validated_source,
  193|     39|            serde_json::to_string(&m.metadata)?
                                                            ^0
  194|       |        ],
  195|      0|    )?;
  196|     39|    Ok(conn.last_insert_rowid())
  197|     39|}
  198|       |
  199|       |/// Updates an existing memory optionally guarded by optimistic concurrency.
  200|       |///
  201|       |/// When `expected_updated_at` is `Some(ts)` the row is only updated if its
  202|       |/// current `updated_at` equals `ts`. This protects concurrent `edit` calls
  203|       |/// from silently clobbering each other.
  204|       |///
  205|       |/// # Returns
  206|       |///
  207|       |/// `Ok(true)` when exactly one row was updated, `Ok(false)` when the
  208|       |/// optimistic check failed or the memory does not exist.
  209|       |///
  210|       |/// # Errors
  211|       |///
  212|       |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
  213|      4|pub fn update(
  214|      4|    conn: &Connection,
  215|      4|    id: i64,
  216|      4|    m: &NewMemory,
  217|      4|    expected_updated_at: Option<i64>,
  218|      4|) -> Result<bool, AppError> {
  219|       |    // G29 Passo 2 (v1.0.69): runtime guard for the CHECK constraint on
  220|       |    // `source`. Mirrors `insert` so `body-enrich` and other mutations
  221|       |    // cannot reintroduce the historical "enrich" literal that broke
  222|       |    // `body-enrich` in v1.0.55 - v1.0.68.
  223|      4|    let validated_source = crate::memory_source::validate_source(&m.source)?;
                                                                                         ^0
  224|      4|    let affected = if let Some(ts) = expected_updated_at {
                                             ^2
  225|      2|        conn.execute(
  226|      2|            "UPDATE memories SET type=?2, description=?3, body=?4, body_hash=?5,
  227|      2|             session_id=?6, source=?7, metadata=?8
  228|      2|             WHERE id=?1 AND updated_at=?9 AND deleted_at IS NULL",
  229|      2|            params![
  230|       |                id,
  231|       |                m.memory_type,
  232|       |                m.description,
  233|       |                m.body,
  234|       |                m.body_hash,
  235|       |                m.session_id,
  236|       |                validated_source,
  237|      2|                serde_json::to_string(&m.metadata)?,
                                                                ^0
  238|       |                ts
  239|       |            ],
  240|      0|        )?
  241|       |    } else {
  242|      2|        conn.execute(
  243|      2|            "UPDATE memories SET type=?2, description=?3, body=?4, body_hash=?5,
  244|      2|             session_id=?6, source=?7, metadata=?8
  245|      2|             WHERE id=?1 AND deleted_at IS NULL",
  246|      2|            params![
  247|       |                id,
  248|       |                m.memory_type,
  249|       |                m.description,
  250|       |                m.body,
  251|       |                m.body_hash,
  252|       |                m.session_id,
  253|       |                validated_source,
  254|      2|                serde_json::to_string(&m.metadata)?
                                                                ^0
  255|       |            ],
  256|      0|        )?
  257|       |    };
  258|      4|    Ok(affected == 1)
  259|      4|}
  260|       |
  261|       |/// Replaces the vector row for a memory in `vec_memories`.
  262|       |///
  263|       |/// `sqlite-vec` virtual tables do not implement `INSERT OR REPLACE`, so the
  264|       |/// existing row is deleted first and a fresh vector is inserted. Callers
  265|       |/// must pass an `embedding` with length [`crate::constants::EMBEDDING_DIM`].
  266|       |///
  267|       |/// # Errors
  268|       |///
  269|       |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
  270|      7|pub fn upsert_vec(
  271|      7|    conn: &Connection,
  272|      7|    memory_id: i64,
  273|      7|    namespace: &str,
  274|      7|    memory_type: &str,
  275|      7|    embedding: &[f32],
  276|      7|    name: &str,
  277|      7|    snippet: &str,
  278|      7|) -> Result<(), AppError> {
  279|       |    // sqlite-vec virtual tables do not support INSERT OR REPLACE semantics.
  280|       |    // Must delete the existing row first, then insert.  Both statements are
  281|       |    // wrapped in `with_busy_retry` because WAL-mode concurrent writers can
  282|       |    // cause SQLITE_BUSY on vec0 virtual table writes.
  283|      7|    let embedding_bytes = f32_to_bytes(embedding);
  284|      7|    with_busy_retry(|| {
  285|      7|        conn.execute(
  286|      7|            "DELETE FROM vec_memories WHERE memory_id = ?1",
  287|      7|            params![memory_id],
  288|      0|        )?;
  289|      7|        conn.execute(
  290|      7|            "INSERT INTO vec_memories(memory_id, namespace, type, embedding, name, snippet)
  291|      7|             VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
  292|      7|            params![
  293|       |                memory_id,
  294|       |                namespace,
  295|       |                memory_type,
  296|      7|                &embedding_bytes,
  297|       |                name,
  298|       |                snippet
  299|       |            ],
  300|      0|        )?;
  301|      7|        Ok(())
  302|      7|    })
  303|      7|}
  304|       |
  305|       |/// Deletes the vector row for `memory_id` from `vec_memories`.
  306|       |///
  307|       |/// Called during `forget` and `purge` to keep the vector table consistent
  308|       |/// with the logical state of `memories`.
  309|       |///
  310|       |/// # Errors
  311|       |///
  312|       |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
  313|      2|pub fn delete_vec(conn: &Connection, memory_id: i64) -> Result<(), AppError> {
  314|      2|    conn.execute(
  315|      2|        "DELETE FROM vec_memories WHERE memory_id = ?1",
  316|      2|        params![memory_id],
  317|      0|    )?;
  318|      2|    Ok(())
  319|      2|}
  320|       |
  321|       |/// Fetches a live memory by `(namespace, name)` and returns all columns.
  322|       |///
  323|       |/// # Returns
  324|       |///
  325|       |/// `Ok(Some(row))` when found, `Ok(None)` when missing or soft-deleted.
  326|       |///
  327|       |/// # Errors
  328|       |///
  329|       |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
  330|      3|pub fn read_by_name(
  331|      3|    conn: &Connection,
  332|      3|    namespace: &str,
  333|      3|    name: &str,
  334|      3|) -> Result<Option<MemoryRow>, AppError> {
  335|      3|    let mut stmt = conn.prepare_cached(
  336|      3|        "SELECT id, namespace, name, type, description, body, body_hash,
  337|      3|                session_id, source, metadata, created_at, updated_at, deleted_at
  338|      3|         FROM memories WHERE namespace=?1 AND name=?2 AND deleted_at IS NULL",
  339|      0|    )?;
  340|      3|    match stmt.query_row(params![namespace, name], |r| {
                                                                     ^1
  341|       |        Ok(MemoryRow {
  342|      1|            id: r.get(0)?,
                                      ^0
  343|      1|            namespace: r.get(1)?,
                                             ^0
  344|      1|            name: r.get(2)?,
                                        ^0
  345|      1|            memory_type: r.get(3)?,
                                               ^0
  346|      1|            description: r.get(4)?,
                                               ^0
  347|      1|            body: r.get(5)?,
                                        ^0
  348|      1|            body_hash: r.get(6)?,
                                             ^0
  349|      1|            session_id: r.get(7)?,
                                              ^0
  350|      1|            source: r.get(8)?,
                                          ^0
  351|      1|            metadata: r.get(9)?,
                                            ^0
  352|      1|            created_at: r.get(10)?,
                                               ^0
  353|      1|            updated_at: r.get(11)?,
                                               ^0
  354|      1|            deleted_at: r.get(12)?,
                                               ^0
  355|       |        })
  356|      1|    }) {
  357|      1|        Ok(m) => Ok(Some(m)),
  358|      2|        Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
  359|      0|        Err(e) => Err(AppError::Database(e)),
  360|       |    }
  361|      3|}
  362|       |
  363|       |/// Soft-deletes a memory by setting `deleted_at = unixepoch()`.
  364|       |///
  365|       |/// Versions and chunks are preserved so `restore` can undo the operation
  366|       |/// until a subsequent `purge` reclaims the storage permanently.
  367|       |///
  368|       |/// # Returns
  369|       |///
  370|       |/// `Ok(true)` when a live memory was soft-deleted, `Ok(false)` when no
  371|       |/// matching live row existed.
  372|       |///
  373|       |/// # Errors
  374|       |///
  375|       |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
  376|      7|pub fn soft_delete(conn: &Connection, namespace: &str, name: &str) -> Result<bool, AppError> {
  377|      7|    let affected = conn.execute(
  378|      7|        "UPDATE memories SET deleted_at = unixepoch() WHERE namespace=?1 AND name=?2 AND deleted_at IS NULL",
  379|      7|        params![namespace, name],
  380|      0|    )?;
  381|      7|    Ok(affected == 1)
  382|      7|}
  383|       |
  384|       |/// Lists live memories in a namespace ordered by `updated_at` descending.
  385|       |///
  386|       |/// # Arguments
  387|       |///
  388|       |/// - `memory_type` — optional filter on the `type` column.
  389|       |/// - `limit` / `offset` — standard pagination controls in rows.
  390|       |///
  391|       |/// # Errors
  392|       |///
  393|       |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
  394|      6|pub fn list(
  395|      6|    conn: &Connection,
  396|      6|    namespace: &str,
  397|      6|    memory_type: Option<&str>,
  398|      6|    limit: usize,
  399|      6|    offset: usize,
  400|      6|    include_deleted: bool,
  401|      6|) -> Result<Vec<MemoryRow>, AppError> {
  402|      6|    if let Some(mt) = memory_type {
                              ^2
  403|      2|        let sql = if include_deleted {
  404|      0|            "SELECT id, namespace, name, type, description, body, body_hash,
  405|      0|                    session_id, source, metadata, created_at, updated_at, deleted_at
  406|      0|             FROM memories WHERE namespace=?1 AND type=?2
  407|      0|             ORDER BY updated_at DESC LIMIT ?3 OFFSET ?4"
  408|       |        } else {
  409|      2|            "SELECT id, namespace, name, type, description, body, body_hash,
  410|      2|                    session_id, source, metadata, created_at, updated_at, deleted_at
  411|      2|             FROM memories WHERE namespace=?1 AND type=?2 AND deleted_at IS NULL
  412|      2|             ORDER BY updated_at DESC LIMIT ?3 OFFSET ?4"
  413|       |        };
  414|      2|        let mut stmt = conn.prepare_cached(sql)?;
                                                             ^0
  415|      2|        let rows = stmt
  416|      2|            .query_map(params![namespace, mt, limit as i64, offset as i64], |r| {
  417|       |                Ok(MemoryRow {
  418|      2|                    id: r.get(0)?,
                                              ^0
  419|      2|                    namespace: r.get(1)?,
                                                     ^0
  420|      2|                    name: r.get(2)?,
                                                ^0
  421|      2|                    memory_type: r.get(3)?,
                                                       ^0
  422|      2|                    description: r.get(4)?,
                                                       ^0
  423|      2|                    body: r.get(5)?,
                                                ^0
  424|      2|                    body_hash: r.get(6)?,
                                                     ^0
  425|      2|                    session_id: r.get(7)?,
                                                      ^0
  426|      2|                    source: r.get(8)?,
                                                  ^0
  427|      2|                    metadata: r.get(9)?,
                                                    ^0
  428|      2|                    created_at: r.get(10)?,
                                                       ^0
  429|      2|                    updated_at: r.get(11)?,
                                                       ^0
  430|      2|                    deleted_at: r.get(12)?,
                                                       ^0
  431|       |                })
  432|      2|            })?
                            ^0
  433|      2|            .collect::<Result<Vec<_>, _>>()?;
                                                         ^0
  434|      2|        Ok(rows)
  435|       |    } else {
  436|      4|        let sql = if include_deleted {
  437|      0|            "SELECT id, namespace, name, type, description, body, body_hash,
  438|      0|                    session_id, source, metadata, created_at, updated_at, deleted_at
  439|      0|             FROM memories WHERE namespace=?1
  440|      0|             ORDER BY updated_at DESC LIMIT ?2 OFFSET ?3"
  441|       |        } else {
  442|      4|            "SELECT id, namespace, name, type, description, body, body_hash,
  443|      4|                    session_id, source, metadata, created_at, updated_at, deleted_at
  444|      4|             FROM memories WHERE namespace=?1 AND deleted_at IS NULL
  445|      4|             ORDER BY updated_at DESC LIMIT ?2 OFFSET ?3"
  446|       |        };
  447|      4|        let mut stmt = conn.prepare_cached(sql)?;
                                                             ^0
  448|      4|        let rows = stmt
  449|      6|            .query_map(params![namespace, limit as i64, offset as i64], |r| {
                           ^4        ^4                 ^4            ^4
  450|       |                Ok(MemoryRow {
  451|      6|                    id: r.get(0)?,
                                              ^0
  452|      6|                    namespace: r.get(1)?,
                                                     ^0
  453|      6|                    name: r.get(2)?,
                                                ^0
  454|      6|                    memory_type: r.get(3)?,
                                                       ^0
  455|      6|                    description: r.get(4)?,
                                                       ^0
  456|      6|                    body: r.get(5)?,
                                                ^0
  457|      6|                    body_hash: r.get(6)?,
                                                     ^0
  458|      6|                    session_id: r.get(7)?,
                                                      ^0
  459|      6|                    source: r.get(8)?,
                                                  ^0
  460|      6|                    metadata: r.get(9)?,
                                                    ^0
  461|      6|                    created_at: r.get(10)?,
                                                       ^0
  462|      6|                    updated_at: r.get(11)?,
                                                       ^0
  463|      6|                    deleted_at: r.get(12)?,
                                                       ^0
  464|       |                })
  465|      6|            })?
                            ^0
  466|      4|            .collect::<Result<Vec<_>, _>>()?;
                                                         ^0
  467|      4|        Ok(rows)
  468|       |    }
  469|      6|}
  470|       |
  471|       |/// Runs a KNN search over `vec_memories`, optionally restricted to namespaces.
  472|       |///
  473|       |/// # Arguments
  474|       |///
  475|       |/// - `embedding` — query vector of length [`crate::constants::EMBEDDING_DIM`].
  476|       |/// - `namespaces` — namespaces to search. Empty slice means "all namespaces".
  477|       |/// - `memory_type` — optional filter on the `type` column.
  478|       |/// - `k` — maximum number of hits to return.
  479|       |///
  480|       |/// # Returns
  481|       |///
  482|       |/// A vector of `(memory_id, distance)` pairs sorted by ascending distance.
  483|       |///
  484|       |/// # Errors
  485|       |///
  486|       |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
  487|      3|pub fn knn_search(
  488|      3|    conn: &Connection,
  489|      3|    embedding: &[f32],
  490|      3|    namespaces: &[String],
  491|      3|    memory_type: Option<&str>,
  492|      3|    k: usize,
  493|      3|) -> Result<Vec<(i64, f32)>, AppError> {
  494|      3|    let bytes = f32_to_bytes(embedding);
  495|       |
  496|      3|    match namespaces.len() {
  497|       |        0 => {
  498|       |            // No namespace filter — search all namespaces.
  499|      0|            if let Some(mt) = memory_type {
  500|      0|                let mut stmt = conn.prepare_cached(
  501|      0|                    "SELECT memory_id, distance FROM vec_memories \
  502|      0|                     WHERE embedding MATCH ?1 AND type = ?2 \
  503|      0|                     ORDER BY distance LIMIT ?3",
  504|      0|                )?;
  505|      0|                let rows = stmt
  506|      0|                    .query_map(params![bytes, mt, k as i64], |r| {
  507|      0|                        Ok((r.get::<_, i64>(0)?, r.get::<_, f32>(1)?))
  508|      0|                    })?
  509|      0|                    .collect::<Result<Vec<_>, _>>()?;
  510|      0|                Ok(rows)
  511|       |            } else {
  512|      0|                let mut stmt = conn.prepare_cached(
  513|      0|                    "SELECT memory_id, distance FROM vec_memories \
  514|      0|                     WHERE embedding MATCH ?1 \
  515|      0|                     ORDER BY distance LIMIT ?2",
  516|      0|                )?;
  517|      0|                let rows = stmt
  518|      0|                    .query_map(params![bytes, k as i64], |r| {
  519|      0|                        Ok((r.get::<_, i64>(0)?, r.get::<_, f32>(1)?))
  520|      0|                    })?
  521|      0|                    .collect::<Result<Vec<_>, _>>()?;
  522|      0|                Ok(rows)
  523|       |            }
  524|       |        }
  525|       |        1 => {
  526|       |            // Fast single-namespace path (preserved from previous implementation).
  527|      3|            let ns = &namespaces[0];
  528|      3|            if let Some(mt) = memory_type {
                                      ^2
  529|      2|                let mut stmt = conn.prepare_cached(
  530|      2|                    "SELECT memory_id, distance FROM vec_memories \
  531|      2|                     WHERE embedding MATCH ?1 AND namespace = ?2 AND type = ?3 \
  532|      2|                     ORDER BY distance LIMIT ?4",
  533|      0|                )?;
  534|      2|                let rows = stmt
  535|      2|                    .query_map(params![bytes, ns, mt, k as i64], |r| {
  536|      2|                        Ok((r.get::<_, i64>(0)?, r.get::<_, f32>(1)?))
                                                            ^0                   ^0
  537|      2|                    })?
                                    ^0
  538|      2|                    .collect::<Result<Vec<_>, _>>()?;
                                                                 ^0
  539|      2|                Ok(rows)
  540|       |            } else {
  541|      1|                let mut stmt = conn.prepare_cached(
  542|      1|                    "SELECT memory_id, distance FROM vec_memories \
  543|      1|                     WHERE embedding MATCH ?1 AND namespace = ?2 \
  544|      1|                     ORDER BY distance LIMIT ?3",
  545|      0|                )?;
  546|      1|                let rows = stmt
  547|      2|                    .query_map(params![bytes, ns, k as i64], |r| {
                                   ^1        ^1                 ^1
  548|      2|                        Ok((r.get::<_, i64>(0)?, r.get::<_, f32>(1)?))
                                                            ^0                   ^0
  549|      2|                    })?
                                    ^0
  550|      1|                    .collect::<Result<Vec<_>, _>>()?;
                                                                 ^0
  551|      1|                Ok(rows)
  552|       |            }
  553|       |        }
  554|       |        _ => {
  555|       |            // Multiple explicit namespaces: build IN clause with positional placeholders.
  556|       |            // rusqlite does not support array binding, so we generate "?,?,..." manually.
  557|      0|            let placeholders = (0..namespaces.len())
  558|      0|                .map(|_| "?")
  559|      0|                .collect::<Vec<_>>()
  560|      0|                .join(",");
  561|      0|            if let Some(mt) = memory_type {
  562|      0|                let query = format!(
  563|      0|                    "SELECT memory_id, distance FROM vec_memories \
  564|      0|                     WHERE embedding MATCH ? AND type = ? AND namespace IN ({placeholders}) \
  565|      0|                     ORDER BY distance LIMIT ?"
  566|       |                );
  567|      0|                let mut stmt = conn.prepare(&query)?;
  568|       |                // Params: [bytes, mt, ns0, ns1, ..., k]
  569|      0|                let mut raw_params: Vec<Box<dyn rusqlite::ToSql>> =
  570|      0|                    vec![Box::new(bytes), Box::new(mt.to_string())];
  571|      0|                for ns in namespaces {
  572|      0|                    raw_params.push(Box::new(ns.clone()));
  573|      0|                }
  574|      0|                raw_params.push(Box::new(k as i64));
  575|      0|                let param_refs: Vec<&dyn rusqlite::ToSql> =
  576|      0|                    raw_params.iter().map(|b| b.as_ref()).collect();
  577|      0|                let rows = stmt
  578|      0|                    .query_map(param_refs.as_slice(), |r| {
  579|      0|                        Ok((r.get::<_, i64>(0)?, r.get::<_, f32>(1)?))
  580|      0|                    })?
  581|      0|                    .collect::<Result<Vec<_>, _>>()?;
  582|      0|                Ok(rows)
  583|       |            } else {
  584|      0|                let query = format!(
  585|      0|                    "SELECT memory_id, distance FROM vec_memories \
  586|      0|                     WHERE embedding MATCH ? AND namespace IN ({placeholders}) \
  587|      0|                     ORDER BY distance LIMIT ?"
  588|       |                );
  589|      0|                let mut stmt = conn.prepare(&query)?;
  590|       |                // Params: [bytes, ns0, ns1, ..., k]
  591|      0|                let mut raw_params: Vec<Box<dyn rusqlite::ToSql>> = vec![Box::new(bytes)];
  592|      0|                for ns in namespaces {
  593|      0|                    raw_params.push(Box::new(ns.clone()));
  594|      0|                }
  595|      0|                raw_params.push(Box::new(k as i64));
  596|      0|                let param_refs: Vec<&dyn rusqlite::ToSql> =
  597|      0|                    raw_params.iter().map(|b| b.as_ref()).collect();
  598|      0|                let rows = stmt
  599|      0|                    .query_map(param_refs.as_slice(), |r| {
  600|      0|                        Ok((r.get::<_, i64>(0)?, r.get::<_, f32>(1)?))
  601|      0|                    })?
  602|      0|                    .collect::<Result<Vec<_>, _>>()?;
  603|      0|                Ok(rows)
  604|       |            }
  605|       |        }
  606|       |    }
  607|      3|}
  608|       |
  609|       |/// Fetches a live memory by primary key and returns all columns.
  610|       |///
  611|       |/// Mirrors [`read_by_name`] but keyed on `rowid` for use after a KNN search.
  612|       |///
  613|       |/// # Errors
  614|       |///
  615|       |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
  616|      7|pub fn read_full(conn: &Connection, memory_id: i64) -> Result<Option<MemoryRow>, AppError> {
  617|      7|    let mut stmt = conn.prepare_cached(
  618|      7|        "SELECT id, namespace, name, type, description, body, body_hash,
  619|      7|                session_id, source, metadata, created_at, updated_at, deleted_at
  620|      7|         FROM memories WHERE id=?1 AND deleted_at IS NULL",
  621|      0|    )?;
  622|      7|    match stmt.query_row(params![memory_id], |r| {
                                                               ^6
  623|       |        Ok(MemoryRow {
  624|      6|            id: r.get(0)?,
                                      ^0
  625|      6|            namespace: r.get(1)?,
                                             ^0
  626|      6|            name: r.get(2)?,
                                        ^0
  627|      6|            memory_type: r.get(3)?,
                                               ^0
  628|      6|            description: r.get(4)?,
                                               ^0
  629|      6|            body: r.get(5)?,
                                        ^0
  630|      6|            body_hash: r.get(6)?,
                                             ^0
  631|      6|            session_id: r.get(7)?,
                                              ^0
  632|      6|            source: r.get(8)?,
                                          ^0
  633|      6|            metadata: r.get(9)?,
                                            ^0
  634|      6|            created_at: r.get(10)?,
                                               ^0
  635|      6|            updated_at: r.get(11)?,
                                               ^0
  636|      6|            deleted_at: r.get(12)?,
                                               ^0
  637|       |        })
  638|      6|    }) {
  639|      6|        Ok(m) => Ok(Some(m)),
  640|      1|        Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
  641|      0|        Err(e) => Err(AppError::Database(e)),
  642|       |    }
  643|      7|}
  644|       |
  645|       |/// Fetches all memory_ids in a namespace that are soft-deleted and whose
  646|       |/// `deleted_at` is older than `before_ts` (unix epoch seconds).
  647|       |///
  648|       |/// Used by `purge` to collect stale rows for permanent deletion.
  649|       |///
  650|       |/// # Errors
  651|       |///
  652|       |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
  653|      2|pub fn list_deleted_before(
  654|      2|    conn: &Connection,
  655|      2|    namespace: &str,
  656|      2|    before_ts: i64,
  657|      2|) -> Result<Vec<i64>, AppError> {
  658|      2|    let mut stmt = conn.prepare_cached(
  659|      2|        "SELECT id FROM memories WHERE namespace = ?1 AND deleted_at IS NOT NULL AND deleted_at < ?2",
  660|      0|    )?;
  661|      2|    let ids = stmt
  662|      2|        .query_map(params![namespace, before_ts], |r| r.get::<_, i64>(0))?
                                                                    ^1^1               ^0
  663|      2|        .collect::<Result<Vec<_>, _>>()?;
                                                     ^0
  664|      2|    Ok(ids)
  665|      2|}
  666|       |
  667|       |/// Preprocesses a raw user query for FTS5 `MATCH`.
  668|       |///
  669|       |/// Technical separators (`-`, `.`, `_`, `/`) are treated as word boundaries by
  670|       |/// the `unicode61` tokenizer.  When the query contains any of these characters
  671|       |/// the function builds a compound FTS5 expression:
  672|       |///   1. A phrase query with the separated tokens (exact compound matching).
  673|       |///   2. Individual prefix terms joined with OR (broader recall).
  674|       |///
  675|       |/// Queries without separators keep the original `term*` prefix behaviour.
  676|     18|fn preprocess_fts_query(raw: &str) -> String {
  677|       |    const SEPARATORS: &[char] = &['-', '.', '_', '/'];
  678|       |    const FTS5_SYNTAX: &[char] = &['"', '*', '(', ')', '^', ':'];
  679|       |    const FTS5_KEYWORDS: &[&str] = &["OR", "AND", "NOT", "NEAR"];
  680|       |
  681|    177|    let sanitized: String = raw.chars().filter(|c| !FTS5_SYNTAX.contains(c)).collect();
                      ^18        ^18      ^18 ^18     ^18                                  ^18
  682|     18|    let trimmed = sanitized.trim();
  683|     18|    if trimmed.is_empty() {
  684|      2|        return String::new();
  685|     16|    }
  686|       |
  687|    108|    let is_fts_keyword = |t: &str| FTS5_KEYWORDS.iter().any(|kw| kw.eq_ignore_ascii_case(t));
                      ^16                        ^30                  ^30
  688|       |
  689|    126|    if !trimmed.chars().any(|c| SEPARATORS.contains(&c)) {
                      ^16             ^16
  690|     11|        return trimmed
  691|     11|            .split_whitespace()
  692|     17|            .filter(|t| !is_fts_keyword(t))
                           ^11
  693|     13|            .map(|t| format!("{t}*"))
                           ^11
  694|     11|            .collect::<Vec<_>>()
  695|     11|            .join(" ");
  696|      5|    }
  697|      5|    let tokens: Vec<&str> = trimmed
  698|     81|        .split(|c: char| SEPARATORS.contains(&c) || c.is_whitespace())
                       ^5                                         ^73^73
  699|     13|        .filter(|t| !t.is_empty() && !is_fts_keyword(t))
                       ^5
  700|      5|        .collect();
  701|      5|    if tokens.is_empty() {
  702|      0|        return String::new();
  703|      5|    }
  704|      5|    let phrase = format!("\"{}\"", tokens.join(" "));
  705|     12|    let prefix_terms: Vec<String> = tokens.iter().map(|t| format!("{t}*")).collect();
                      ^5            ^5            ^5            ^5                       ^5
  706|      5|    format!("{phrase} OR {}", prefix_terms.join(" OR "))
  707|     18|}
  708|       |
  709|       |/// Executes an FTS5 search against `fts_memories` with query preprocessing.
  710|       |///
  711|       |/// Technical separators in the query are converted to phrase + prefix OR
  712|       |/// expressions so compound terms like `graphrag-precompact.sh` match correctly.
  713|       |///
  714|       |/// # Errors
  715|       |///
  716|       |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
  717|      5|pub fn fts_search(
  718|      5|    conn: &Connection,
  719|      5|    query: &str,
  720|      5|    namespace: &str,
  721|      5|    memory_type: Option<&str>,
  722|      5|    limit: usize,
  723|      5|) -> Result<Vec<MemoryRow>, AppError> {
  724|      5|    let fts_query = preprocess_fts_query(query);
  725|      5|    if let Some(mt) = memory_type {
                              ^2
  726|      2|        let mut stmt = conn.prepare_cached(
  727|      2|            "SELECT m.id, m.namespace, m.name, m.type, m.description, m.body, m.body_hash,
  728|      2|                    m.session_id, m.source, m.metadata, m.created_at, m.updated_at, m.deleted_at
  729|      2|             FROM fts_memories fts
  730|      2|             JOIN memories m ON m.id = fts.rowid
  731|      2|             WHERE fts_memories MATCH ?1 AND m.namespace = ?2 AND m.type = ?3 AND m.deleted_at IS NULL
  732|      2|             ORDER BY rank LIMIT ?4",
  733|      0|        )?;
  734|      2|        let rows = stmt
  735|      2|            .query_map(params![fts_query, namespace, mt, limit as i64], |r| {
  736|       |                Ok(MemoryRow {
  737|      2|                    id: r.get(0)?,
                                              ^0
  738|      2|                    namespace: r.get(1)?,
                                                     ^0
  739|      2|                    name: r.get(2)?,
                                                ^0
  740|      2|                    memory_type: r.get(3)?,
                                                       ^0
  741|      2|                    description: r.get(4)?,
                                                       ^0
  742|      2|                    body: r.get(5)?,
                                                ^0
  743|      2|                    body_hash: r.get(6)?,
                                                     ^0
  744|      2|                    session_id: r.get(7)?,
                                                      ^0
  745|      2|                    source: r.get(8)?,
                                                  ^0
  746|      2|                    metadata: r.get(9)?,
                                                    ^0
  747|      2|                    created_at: r.get(10)?,
                                                       ^0
  748|      2|                    updated_at: r.get(11)?,
                                                       ^0
  749|      2|                    deleted_at: r.get(12)?,
                                                       ^0
  750|       |                })
  751|      2|            })?
                            ^0
  752|      2|            .collect::<Result<Vec<_>, _>>()?;
                                                         ^0
  753|      2|        Ok(rows)
  754|       |    } else {
  755|      3|        let mut stmt = conn.prepare_cached(
  756|      3|            "SELECT m.id, m.namespace, m.name, m.type, m.description, m.body, m.body_hash,
  757|      3|                    m.session_id, m.source, m.metadata, m.created_at, m.updated_at, m.deleted_at
  758|      3|             FROM fts_memories fts
  759|      3|             JOIN memories m ON m.id = fts.rowid
  760|      3|             WHERE fts_memories MATCH ?1 AND m.namespace = ?2 AND m.deleted_at IS NULL
  761|      3|             ORDER BY rank LIMIT ?3",
  762|      0|        )?;
  763|      3|        let rows = stmt
  764|      3|            .query_map(params![fts_query, namespace, limit as i64], |r| {
                                                                                      ^2
  765|       |                Ok(MemoryRow {
  766|      2|                    id: r.get(0)?,
                                              ^0
  767|      2|                    namespace: r.get(1)?,
                                                     ^0
  768|      2|                    name: r.get(2)?,
                                                ^0
  769|      2|                    memory_type: r.get(3)?,
                                                       ^0
  770|      2|                    description: r.get(4)?,
                                                       ^0
  771|      2|                    body: r.get(5)?,
                                                ^0
  772|      2|                    body_hash: r.get(6)?,
                                                     ^0
  773|      2|                    session_id: r.get(7)?,
                                                      ^0
  774|      2|                    source: r.get(8)?,
                                                  ^0
  775|      2|                    metadata: r.get(9)?,
                                                    ^0
  776|      2|                    created_at: r.get(10)?,
                                                       ^0
  777|      2|                    updated_at: r.get(11)?,
                                                       ^0
  778|      2|                    deleted_at: r.get(12)?,
                                                       ^0
  779|       |                })
  780|      2|            })?
                            ^0
  781|      3|            .collect::<Result<Vec<_>, _>>()?;
                                                         ^0
  782|      3|        Ok(rows)
  783|       |    }
  784|      5|}
  785|       |
  786|       |/// Syncs FTS5 external-content index after an UPDATE on the memories table.
  787|       |///
  788|       |/// The AFTER UPDATE trigger (`trg_fts_au`) is intentionally absent because
  789|       |/// sqlite-vec loaded via `sqlite3_auto_extension` conflicts with FTS5 inside
  790|       |/// UPDATE triggers. This function performs the equivalent sync in Rust:
  791|       |/// DELETE the old entry, then INSERT the new one (external-content FTS5
  792|       |/// tables do not support in-place UPDATE).
  793|       |#[allow(clippy::too_many_arguments)]
  794|      0|pub fn sync_fts_after_update(
  795|      0|    conn: &Connection,
  796|      0|    memory_id: i64,
  797|      0|    old_name: &str,
  798|      0|    old_desc: &str,
  799|      0|    old_body: &str,
  800|      0|    new_name: &str,
  801|      0|    new_desc: &str,
  802|      0|    new_body: &str,
  803|      0|) -> Result<(), AppError> {
  804|      0|    conn.execute(
  805|      0|        "INSERT INTO fts_memories(fts_memories, rowid, name, description, body)
  806|      0|         VALUES('delete', ?1, ?2, ?3, ?4)",
  807|      0|        params![memory_id, old_name, old_desc, old_body],
  808|      0|    )?;
  809|      0|    conn.execute(
  810|      0|        "INSERT INTO fts_memories(rowid, name, description, body)
  811|      0|         VALUES(?1, ?2, ?3, ?4)",
  812|      0|        params![memory_id, new_name, new_desc, new_body],
  813|      0|    )?;
  814|      0|    Ok(())
  815|      0|}
  816|       |
  817|       |#[cfg(test)]
  818|       |mod tests {
  819|       |    use super::*;
  820|       |    use rusqlite::Connection;
  821|       |
  822|       |    type TestResult = Result<(), Box<dyn std::error::Error>>;
  823|       |
  824|     37|    fn setup_conn() -> Result<Connection, Box<dyn std::error::Error>> {
  825|     37|        crate::storage::connection::register_vec_extension();
  826|     37|        let mut conn = Connection::open_in_memory()?;
                                                                 ^0
  827|     37|        conn.execute_batch(
  828|     37|            "PRAGMA foreign_keys = ON;
  829|     37|             PRAGMA temp_store = MEMORY;",
  830|      0|        )?;
  831|     37|        crate::migrations::runner().run(&mut conn)?;
                                                                ^0
  832|     37|        Ok(conn)
  833|     37|    }
  834|       |
  835|     42|    fn new_memory(name: &str) -> NewMemory {
  836|     42|        NewMemory {
  837|     42|            namespace: "global".to_string(),
  838|     42|            name: name.to_string(),
  839|     42|            memory_type: "user".to_string(),
  840|     42|            description: "descricao de teste".to_string(),
  841|     42|            body: "test memory body".to_string(),
  842|     42|            body_hash: format!("hash-{name}"),
  843|     42|            session_id: None,
  844|     42|            source: "agent".to_string(),
  845|     42|            metadata: serde_json::json!({}),
  846|     42|        }
  847|     42|    }
  848|       |
  849|       |    #[test]
  850|      1|    fn insert_and_find_by_name_return_id() -> TestResult {
  851|      1|        let conn = setup_conn()?;
                                             ^0
  852|      1|        let m = new_memory("mem-alpha");
  853|      1|        let id = insert(&conn, &m)?;
                                                ^0
  854|      1|        assert!(id > 0);
  855|       |
  856|      1|        let found = find_by_name(&conn, "global", "mem-alpha")?;
                                                                            ^0
  857|      1|        assert!(found.is_some());
  858|      1|        let (found_id, _, _) = found.ok_or("mem-alpha should exist")?;
                                                                                  ^0
  859|      1|        assert_eq!(found_id, id);
  860|      1|        Ok(())
  861|      1|    }
  862|       |
  863|       |    #[test]
  864|      1|    fn find_by_name_returns_none_when_not_found() -> TestResult {
  865|      1|        let conn = setup_conn()?;
                                             ^0
  866|      1|        let result = find_by_name(&conn, "global", "inexistente")?;
                                                                               ^0
  867|      1|        assert!(result.is_none());
  868|      1|        Ok(())
  869|      1|    }
  870|       |
  871|       |    #[test]
  872|      1|    fn find_by_hash_returns_correct_id() -> TestResult {
  873|      1|        let conn = setup_conn()?;
                                             ^0
  874|      1|        let m = new_memory("mem-hash");
  875|      1|        let id = insert(&conn, &m)?;
                                                ^0
  876|       |
  877|      1|        let found = find_by_hash(&conn, "global", "hash-mem-hash")?;
                                                                                ^0
  878|      1|        assert_eq!(found, Some(id));
  879|      1|        Ok(())
  880|      1|    }
  881|       |
  882|       |    #[test]
  883|      1|    fn find_by_hash_returns_none_when_hash_not_found() -> TestResult {
  884|      1|        let conn = setup_conn()?;
                                             ^0
  885|      1|        let result = find_by_hash(&conn, "global", "hash-inexistente")?;
                                                                                    ^0
  886|      1|        assert!(result.is_none());
  887|      1|        Ok(())
  888|      1|    }
  889|       |
  890|       |    #[test]
  891|      1|    fn find_by_hash_ignores_different_namespace() -> TestResult {
  892|      1|        let conn = setup_conn()?;
                                             ^0
  893|      1|        let m = new_memory("mem-ns");
  894|      1|        insert(&conn, &m)?;
                                       ^0
  895|       |
  896|      1|        let result = find_by_hash(&conn, "outro-namespace", "hash-mem-ns")?;
                                                                                        ^0
  897|      1|        assert!(result.is_none());
  898|      1|        Ok(())
  899|      1|    }
  900|       |
  901|       |    #[test]
  902|      1|    fn read_by_name_returns_full_memory() -> TestResult {
  903|      1|        let conn = setup_conn()?;
                                             ^0
  904|      1|        let m = new_memory("mem-read");
  905|      1|        let id = insert(&conn, &m)?;
                                                ^0
  906|       |
  907|      1|        let row = read_by_name(&conn, "global", "mem-read")?.ok_or("mem-read should exist")?;
                                                                         ^0                              ^0
  908|      1|        assert_eq!(row.id, id);
  909|      1|        assert_eq!(row.name, "mem-read");
  910|      1|        assert_eq!(row.memory_type, "user");
  911|      1|        assert_eq!(row.body, "test memory body");
  912|      1|        assert_eq!(row.namespace, "global");
  913|      1|        Ok(())
  914|      1|    }
  915|       |
  916|       |    #[test]
  917|      1|    fn read_by_name_returns_none_for_missing() -> TestResult {
  918|      1|        let conn = setup_conn()?;
                                             ^0
  919|      1|        let result = read_by_name(&conn, "global", "nao-existe")?;
                                                                              ^0
  920|      1|        assert!(result.is_none());
  921|      1|        Ok(())
  922|      1|    }
  923|       |
  924|       |    #[test]
  925|      1|    fn read_full_by_id_returns_memory() -> TestResult {
  926|      1|        let conn = setup_conn()?;
                                             ^0
  927|      1|        let m = new_memory("mem-full");
  928|      1|        let id = insert(&conn, &m)?;
                                                ^0
  929|       |
  930|      1|        let row = read_full(&conn, id)?.ok_or("mem-full should exist")?;
                                                    ^0                              ^0
  931|      1|        assert_eq!(row.id, id);
  932|      1|        assert_eq!(row.name, "mem-full");
  933|      1|        Ok(())
  934|      1|    }
  935|       |
  936|       |    #[test]
  937|      1|    fn read_full_returns_none_for_missing_id() -> TestResult {
  938|      1|        let conn = setup_conn()?;
                                             ^0
  939|      1|        let result = read_full(&conn, 9999)?;
                                                         ^0
  940|      1|        assert!(result.is_none());
  941|      1|        Ok(())
  942|      1|    }
  943|       |
  944|       |    #[test]
  945|      1|    fn update_without_optimism_modifies_fields() -> TestResult {
  946|      1|        let conn = setup_conn()?;
                                             ^0
  947|      1|        let m = new_memory("mem-upd");
  948|      1|        let id = insert(&conn, &m)?;
                                                ^0
  949|       |
  950|      1|        let mut m2 = new_memory("mem-upd");
  951|      1|        m2.body = "updated body".to_string();
  952|      1|        m2.body_hash = "hash-novo".to_string();
  953|      1|        let ok = update(&conn, id, &m2, None)?;
                                                           ^0
  954|      1|        assert!(ok);
  955|       |
  956|      1|        let row = read_full(&conn, id)?.ok_or("mem-upd should exist")?;
                                                    ^0                             ^0
  957|      1|        assert_eq!(row.body, "updated body");
  958|      1|        assert_eq!(row.body_hash, "hash-novo");
  959|      1|        Ok(())
  960|      1|    }
  961|       |
  962|       |    #[test]
  963|      1|    fn update_with_correct_expected_updated_at_succeeds() -> TestResult {
  964|      1|        let conn = setup_conn()?;
                                             ^0
  965|      1|        let m = new_memory("mem-opt");
  966|      1|        let id = insert(&conn, &m)?;
                                                ^0
  967|       |
  968|      1|        let (_, updated_at, _) =
  969|      1|            find_by_name(&conn, "global", "mem-opt")?.ok_or("mem-opt should exist")?;
                                                                  ^0                             ^0
  970|       |
  971|      1|        let mut m2 = new_memory("mem-opt");
  972|      1|        m2.body = "optimistic body".to_string();
  973|      1|        m2.body_hash = "hash-optimistic".to_string();
  974|      1|        let ok = update(&conn, id, &m2, Some(updated_at))?;
                                                                       ^0
  975|      1|        assert!(ok);
  976|       |
  977|      1|        let row = read_full(&conn, id)?.ok_or("mem-opt should exist after update")?;
                                                    ^0                                          ^0
  978|      1|        assert_eq!(row.body, "optimistic body");
  979|      1|        Ok(())
  980|      1|    }
  981|       |
  982|       |    #[test]
  983|      1|    fn update_with_wrong_expected_updated_at_returns_false() -> TestResult {
  984|      1|        let conn = setup_conn()?;
                                             ^0
  985|      1|        let m = new_memory("mem-conflict");
  986|      1|        let id = insert(&conn, &m)?;
                                                ^0
  987|       |
  988|      1|        let mut m2 = new_memory("mem-conflict");
  989|      1|        m2.body = "must not appear".to_string();
  990|      1|        m2.body_hash = "hash-x".to_string();
  991|      1|        let ok = update(&conn, id, &m2, Some(0))?;
                                                              ^0
  992|      1|        assert!(!ok);
  993|       |
  994|      1|        let row = read_full(&conn, id)?.ok_or("mem-conflict should exist")?;
                                                    ^0                                  ^0
  995|      1|        assert_eq!(row.body, "test memory body");
  996|      1|        Ok(())
  997|      1|    }
  998|       |
  999|       |    #[test]
 1000|      1|    fn update_missing_id_returns_false() -> TestResult {
 1001|      1|        let conn = setup_conn()?;
                                             ^0
 1002|      1|        let m = new_memory("fantasma");
 1003|      1|        let ok = update(&conn, 9999, &m, None)?;
                                                            ^0
 1004|      1|        assert!(!ok);
 1005|      1|        Ok(())
 1006|      1|    }
 1007|       |
 1008|       |    #[test]
 1009|      1|    fn soft_delete_marks_deleted_at() -> TestResult {
 1010|      1|        let conn = setup_conn()?;
                                             ^0
 1011|      1|        let m = new_memory("mem-del");
 1012|      1|        insert(&conn, &m)?;
                                       ^0
 1013|       |
 1014|      1|        let ok = soft_delete(&conn, "global", "mem-del")?;
                                                                      ^0
 1015|      1|        assert!(ok);
 1016|       |
 1017|      1|        let result = find_by_name(&conn, "global", "mem-del")?;
                                                                           ^0
 1018|      1|        assert!(result.is_none());
 1019|       |
 1020|      1|        let result_read = read_by_name(&conn, "global", "mem-del")?;
                                                                                ^0
 1021|      1|        assert!(result_read.is_none());
 1022|      1|        Ok(())
 1023|      1|    }
 1024|       |
 1025|       |    #[test]
 1026|      1|    fn soft_delete_returns_false_when_not_found() -> TestResult {
 1027|      1|        let conn = setup_conn()?;
                                             ^0
 1028|      1|        let ok = soft_delete(&conn, "global", "nao-existe")?;
                                                                         ^0
 1029|      1|        assert!(!ok);
 1030|      1|        Ok(())
 1031|      1|    }
 1032|       |
 1033|       |    #[test]
 1034|      1|    fn double_soft_delete_returns_false_on_second_call() -> TestResult {
 1035|      1|        let conn = setup_conn()?;
                                             ^0
 1036|      1|        let m = new_memory("mem-del2");
 1037|      1|        insert(&conn, &m)?;
                                       ^0
 1038|       |
 1039|      1|        soft_delete(&conn, "global", "mem-del2")?;
                                                              ^0
 1040|      1|        let ok = soft_delete(&conn, "global", "mem-del2")?;
                                                                       ^0
 1041|      1|        assert!(!ok);
 1042|      1|        Ok(())
 1043|      1|    }
 1044|       |
 1045|       |    #[test]
 1046|      1|    fn list_returns_memories_from_namespace() -> TestResult {
 1047|      1|        let conn = setup_conn()?;
                                             ^0
 1048|      1|        insert(&conn, &new_memory("mem-list-a"))?;
                                                              ^0
 1049|      1|        insert(&conn, &new_memory("mem-list-b"))?;
                                                              ^0
 1050|       |
 1051|      1|        let rows = list(&conn, "global", None, 10, 0, false)?;
                                                                          ^0
 1052|      1|        assert!(rows.len() >= 2);
 1053|      2|        let nomes: Vec<_> = rows.iter().map(|r| r.name.as_str()).collect();
                          ^1     ^1       ^1          ^1                       ^1
 1054|      1|        assert!(nomes.contains(&"mem-list-a"));
 1055|      1|        assert!(nomes.contains(&"mem-list-b"));
 1056|      1|        Ok(())
 1057|      1|    }
 1058|       |
 1059|       |    #[test]
 1060|      1|    fn list_with_type_filter_returns_only_correct_type() -> TestResult {
 1061|      1|        let conn = setup_conn()?;
                                             ^0
 1062|      1|        insert(&conn, &new_memory("mem-user"))?;
                                                            ^0
 1063|       |
 1064|      1|        let mut m2 = new_memory("mem-feedback");
 1065|      1|        m2.memory_type = "feedback".to_string();
 1066|      1|        insert(&conn, &m2)?;
                                        ^0
 1067|       |
 1068|      1|        let rows_user = list(&conn, "global", Some("user"), 10, 0, false)?;
                                                                                       ^0
 1069|      1|        assert!(rows_user.iter().all(|r| r.memory_type == "user"));
 1070|       |
 1071|      1|        let rows_fb = list(&conn, "global", Some("feedback"), 10, 0, false)?;
                                                                                         ^0
 1072|      1|        assert!(rows_fb.iter().all(|r| r.memory_type == "feedback"));
 1073|      1|        Ok(())
 1074|      1|    }
 1075|       |
 1076|       |    #[test]
 1077|      1|    fn list_exclui_soft_deleted() -> TestResult {
 1078|      1|        let conn = setup_conn()?;
                                             ^0
 1079|      1|        let m = new_memory("mem-excluida");
 1080|      1|        insert(&conn, &m)?;
                                       ^0
 1081|      1|        soft_delete(&conn, "global", "mem-excluida")?;
                                                                  ^0
 1082|       |
 1083|      1|        let rows = list(&conn, "global", None, 10, 0, false)?;
                                                                          ^0
 1084|      1|        assert!(rows.iter().all(|r| r.name != "mem-excluida"));
                                                  ^0        ^0
 1085|      1|        Ok(())
 1086|      1|    }
 1087|       |
 1088|       |    #[test]
 1089|      1|    fn list_pagination_works() -> TestResult {
 1090|      1|        let conn = setup_conn()?;
                                             ^0
 1091|      6|        for i in 0..5 {
                          ^5
 1092|      5|            insert(&conn, &new_memory(&format!("mem-pag-{i}")))?;
                                                                             ^0
 1093|       |        }
 1094|       |
 1095|      1|        let pagina1 = list(&conn, "global", None, 2, 0, false)?;
                                                                            ^0
 1096|      1|        let pagina2 = list(&conn, "global", None, 2, 2, false)?;
                                                                            ^0
 1097|      1|        assert!(pagina1.len() <= 2);
 1098|      1|        assert!(pagina2.len() <= 2);
 1099|      1|        if !pagina1.is_empty() && !pagina2.is_empty() {
 1100|      1|            assert_ne!(pagina1[0].id, pagina2[0].id);
 1101|      0|        }
 1102|      1|        Ok(())
 1103|      1|    }
 1104|       |
 1105|       |    #[test]
 1106|      1|    fn upsert_vec_and_delete_vec_work() -> TestResult {
 1107|      1|        let conn = setup_conn()?;
                                             ^0
 1108|      1|        let m = new_memory("mem-vec");
 1109|      1|        let id = insert(&conn, &m)?;
                                                ^0
 1110|       |
 1111|      1|        let embedding: Vec<f32> = vec![0.1; 384];
 1112|      1|        upsert_vec(
 1113|      1|            &conn, id, "global", "user", &embedding, "mem-vec", "snippet",
 1114|      0|        )?;
 1115|       |
 1116|      1|        let count: i64 = conn.query_row(
 1117|      1|            "SELECT COUNT(*) FROM vec_memories WHERE memory_id = ?1",
 1118|      1|            params![id],
 1119|      1|            |r| r.get(0),
 1120|      0|        )?;
 1121|      1|        assert_eq!(count, 1);
 1122|       |
 1123|      1|        delete_vec(&conn, id)?;
                                           ^0
 1124|       |
 1125|      1|        let count_after: i64 = conn.query_row(
 1126|      1|            "SELECT COUNT(*) FROM vec_memories WHERE memory_id = ?1",
 1127|      1|            params![id],
 1128|      1|            |r| r.get(0),
 1129|      0|        )?;
 1130|      1|        assert_eq!(count_after, 0);
 1131|      1|        Ok(())
 1132|      1|    }
 1133|       |
 1134|       |    #[test]
 1135|      1|    fn upsert_vec_replaces_existing_vector() -> TestResult {
 1136|      1|        let conn = setup_conn()?;
                                             ^0
 1137|      1|        let m = new_memory("mem-vec-upsert");
 1138|      1|        let id = insert(&conn, &m)?;
                                                ^0
 1139|       |
 1140|      1|        let emb1: Vec<f32> = vec![0.1; 384];
 1141|      1|        upsert_vec(&conn, id, "global", "user", &emb1, "mem-vec-upsert", "s1")?;
                                                                                            ^0
 1142|       |
 1143|      1|        let emb2: Vec<f32> = vec![0.9; 384];
 1144|      1|        upsert_vec(&conn, id, "global", "user", &emb2, "mem-vec-upsert", "s2")?;
                                                                                            ^0
 1145|       |
 1146|      1|        let count: i64 = conn.query_row(
 1147|      1|            "SELECT COUNT(*) FROM vec_memories WHERE memory_id = ?1",
 1148|      1|            params![id],
 1149|      1|            |r| r.get(0),
 1150|      0|        )?;
 1151|      1|        assert_eq!(count, 1);
 1152|      1|        Ok(())
 1153|      1|    }
 1154|       |
 1155|       |    #[test]
 1156|      1|    fn knn_search_returns_results_by_distance() -> TestResult {
 1157|      1|        let conn = setup_conn()?;
                                             ^0
 1158|       |
 1159|       |        // emb_a: predominantemente positivo — cosseno alto com query [1.0; 384]
 1160|      1|        let ma = new_memory("mem-knn-a");
 1161|      1|        let id_a = insert(&conn, &ma)?;
                                                   ^0
 1162|      1|        let emb_a: Vec<f32> = vec![1.0; 384];
 1163|      1|        upsert_vec(&conn, id_a, "global", "user", &emb_a, "mem-knn-a", "s")?;
                                                                                         ^0
 1164|       |
 1165|       |        // emb_b: predominantemente negativo — cosseno baixo com query [1.0; 384]
 1166|      1|        let mb = new_memory("mem-knn-b");
 1167|      1|        let id_b = insert(&conn, &mb)?;
                                                   ^0
 1168|      1|        let emb_b: Vec<f32> = vec![-1.0; 384];
 1169|      1|        upsert_vec(&conn, id_b, "global", "user", &emb_b, "mem-knn-b", "s")?;
                                                                                         ^0
 1170|       |
 1171|      1|        let query: Vec<f32> = vec![1.0; 384];
 1172|      1|        let results = knn_search(&conn, &query, &["global".to_string()], None, 2)?;
                                                                                               ^0
 1173|      1|        assert!(!results.is_empty());
 1174|      1|        assert_eq!(results[0].0, id_a);
 1175|      1|        Ok(())
 1176|      1|    }
 1177|       |
 1178|       |    #[test]
 1179|      1|    fn knn_search_with_type_filter_restricts_result() -> TestResult {
 1180|      1|        let conn = setup_conn()?;
                                             ^0
 1181|       |
 1182|      1|        let ma = new_memory("mem-knn-tipo-user");
 1183|      1|        let id_a = insert(&conn, &ma)?;
                                                   ^0
 1184|      1|        let emb: Vec<f32> = vec![1.0; 384];
 1185|      1|        upsert_vec(
 1186|      1|            &conn,
 1187|      1|            id_a,
 1188|      1|            "global",
 1189|      1|            "user",
 1190|      1|            &emb,
 1191|      1|            "mem-knn-tipo-user",
 1192|      1|            "s",
 1193|      0|        )?;
 1194|       |
 1195|      1|        let mut mb = new_memory("mem-knn-tipo-fb");
 1196|      1|        mb.memory_type = "feedback".to_string();
 1197|      1|        let id_b = insert(&conn, &mb)?;
                                                   ^0
 1198|      1|        upsert_vec(
 1199|      1|            &conn,
 1200|      1|            id_b,
 1201|      1|            "global",
 1202|      1|            "feedback",
 1203|      1|            &emb,
 1204|      1|            "mem-knn-tipo-fb",
 1205|      1|            "s",
 1206|      0|        )?;
 1207|       |
 1208|      1|        let query: Vec<f32> = vec![1.0; 384];
 1209|      1|        let results_user = knn_search(&conn, &query, &["global".to_string()], Some("user"), 5)?;
                                                                                                            ^0
 1210|      1|        assert!(results_user.iter().all(|(id, _)| *id == id_a));
 1211|       |
 1212|      1|        let results_fb = knn_search(&conn, &query, &["global".to_string()], Some("feedback"), 5)?;
                                                                                                              ^0
 1213|      1|        assert!(results_fb.iter().all(|(id, _)| *id == id_b));
 1214|      1|        Ok(())
 1215|      1|    }
 1216|       |
 1217|       |    #[test]
 1218|      1|    fn fts_search_finds_by_prefix_in_body() -> TestResult {
 1219|      1|        let conn = setup_conn()?;
                                             ^0
 1220|      1|        let mut m = new_memory("mem-fts");
 1221|      1|        m.body = "linguagem de programacao rust".to_string();
 1222|      1|        insert(&conn, &m)?;
                                       ^0
 1223|       |
 1224|      1|        conn.execute_batch(
 1225|      1|            "INSERT INTO fts_memories(rowid, name, description, body)
 1226|      1|             SELECT id, name, description, body FROM memories WHERE deleted_at IS NULL",
 1227|      0|        )?;
 1228|       |
 1229|      1|        let rows = fts_search(&conn, "programacao", "global", None, 10)?;
                                                                                     ^0
 1230|      1|        assert!(!rows.is_empty());
 1231|      1|        assert!(rows.iter().any(|r| r.name == "mem-fts"));
 1232|      1|        Ok(())
 1233|      1|    }
 1234|       |
 1235|       |    #[test]
 1236|      1|    fn fts_search_with_type_filter() -> TestResult {
 1237|      1|        let conn = setup_conn()?;
                                             ^0
 1238|      1|        let mut m = new_memory("mem-fts-tipo");
 1239|      1|        m.body = "linguagem especial para filtro".to_string();
 1240|      1|        insert(&conn, &m)?;
                                       ^0
 1241|       |
 1242|      1|        let mut m2 = new_memory("mem-fts-feedback");
 1243|      1|        m2.memory_type = "feedback".to_string();
 1244|      1|        m2.body = "linguagem especial para filtro".to_string();
 1245|      1|        insert(&conn, &m2)?;
                                        ^0
 1246|       |
 1247|      1|        conn.execute_batch(
 1248|      1|            "INSERT INTO fts_memories(rowid, name, description, body)
 1249|      1|             SELECT id, name, description, body FROM memories WHERE deleted_at IS NULL",
 1250|      0|        )?;
 1251|       |
 1252|      1|        let rows_user = fts_search(&conn, "especial", "global", Some("user"), 10)?;
                                                                                               ^0
 1253|      1|        assert!(rows_user.iter().all(|r| r.memory_type == "user"));
 1254|       |
 1255|      1|        let rows_fb = fts_search(&conn, "especial", "global", Some("feedback"), 10)?;
                                                                                                 ^0
 1256|      1|        assert!(rows_fb.iter().all(|r| r.memory_type == "feedback"));
 1257|      1|        Ok(())
 1258|      1|    }
 1259|       |
 1260|       |    #[test]
 1261|      1|    fn fts_search_excludes_deleted() -> TestResult {
 1262|      1|        let conn = setup_conn()?;
                                             ^0
 1263|      1|        let mut m = new_memory("mem-fts-del");
 1264|      1|        m.body = "deleted fts content".to_string();
 1265|      1|        insert(&conn, &m)?;
                                       ^0
 1266|       |
 1267|      1|        conn.execute_batch(
 1268|      1|            "INSERT INTO fts_memories(rowid, name, description, body)
 1269|      1|             SELECT id, name, description, body FROM memories WHERE deleted_at IS NULL",
 1270|      0|        )?;
 1271|       |
 1272|      1|        soft_delete(&conn, "global", "mem-fts-del")?;
                                                                 ^0
 1273|       |
 1274|      1|        let rows = fts_search(&conn, "deleted", "global", None, 10)?;
                                                                                 ^0
 1275|      1|        assert!(rows.iter().all(|r| r.name != "mem-fts-del"));
                                                  ^0        ^0
 1276|      1|        Ok(())
 1277|      1|    }
 1278|       |
 1279|       |    #[test]
 1280|      1|    fn list_deleted_before_returns_correct_ids() -> TestResult {
 1281|      1|        let conn = setup_conn()?;
                                             ^0
 1282|      1|        let m = new_memory("mem-purge");
 1283|      1|        insert(&conn, &m)?;
                                       ^0
 1284|      1|        soft_delete(&conn, "global", "mem-purge")?;
                                                               ^0
 1285|       |
 1286|      1|        let ids = list_deleted_before(&conn, "global", i64::MAX)?;
                                                                              ^0
 1287|      1|        assert!(!ids.is_empty());
 1288|       |
 1289|      1|        let ids_antes = list_deleted_before(&conn, "global", 0)?;
                                                                             ^0
 1290|      1|        assert!(ids_antes.is_empty());
 1291|      1|        Ok(())
 1292|      1|    }
 1293|       |
 1294|       |    #[test]
 1295|      1|    fn find_by_name_returns_correct_max_version() -> TestResult {
 1296|      1|        let conn = setup_conn()?;
                                             ^0
 1297|      1|        let m = new_memory("mem-ver");
 1298|      1|        let id = insert(&conn, &m)?;
                                                ^0
 1299|       |
 1300|      1|        let (_, _, v0) = find_by_name(&conn, "global", "mem-ver")?.ok_or("mem-ver should exist")?;
                                                                               ^0                             ^0
 1301|      1|        assert_eq!(v0, 0);
 1302|       |
 1303|      1|        conn.execute(
 1304|      1|            "INSERT INTO memory_versions (memory_id, version, name, type, description, body, metadata, change_reason)
 1305|      1|             VALUES (?1, 1, 'mem-ver', 'user', 'desc', 'body', '{}', 'create')",
 1306|      1|            params![id],
 1307|      0|        )?;
 1308|       |
 1309|      1|        let (_, _, v1) =
 1310|      1|            find_by_name(&conn, "global", "mem-ver")?.ok_or("mem-ver should exist after insert")?;
                                                                  ^0                                          ^0
 1311|      1|        assert_eq!(v1, 1);
 1312|      1|        Ok(())
 1313|      1|    }
 1314|       |
 1315|       |    #[test]
 1316|      1|    fn insert_com_metadata_json() -> TestResult {
 1317|      1|        let conn = setup_conn()?;
                                             ^0
 1318|      1|        let mut m = new_memory("mem-meta");
 1319|      1|        m.metadata = serde_json::json!({"chave": "valor", "numero": 42});
 1320|      1|        let id = insert(&conn, &m)?;
                                                ^0
 1321|       |
 1322|      1|        let row = read_full(&conn, id)?.ok_or("mem-meta should exist")?;
                                                    ^0                              ^0
 1323|      1|        let meta: serde_json::Value = serde_json::from_str(&row.metadata)?;
                                                                                       ^0
 1324|      1|        assert_eq!(meta["chave"], "valor");
 1325|      1|        assert_eq!(meta["numero"], 42);
 1326|      1|        Ok(())
 1327|      1|    }
 1328|       |
 1329|       |    #[test]
 1330|      1|    fn insert_com_session_id() -> TestResult {
 1331|      1|        let conn = setup_conn()?;
                                             ^0
 1332|      1|        let mut m = new_memory("mem-session");
 1333|      1|        m.session_id = Some("sessao-xyz".to_string());
 1334|      1|        let id = insert(&conn, &m)?;
                                                ^0
 1335|       |
 1336|      1|        let row = read_full(&conn, id)?.ok_or("mem-session should exist")?;
                                                    ^0                                 ^0
 1337|      1|        assert_eq!(row.session_id, Some("sessao-xyz".to_string()));
 1338|      1|        Ok(())
 1339|      1|    }
 1340|       |
 1341|       |    #[test]
 1342|      1|    fn delete_vec_for_nonexistent_id_does_not_fail() -> TestResult {
 1343|      1|        let conn = setup_conn()?;
                                             ^0
 1344|      1|        let result = delete_vec(&conn, 99999);
 1345|      1|        assert!(result.is_ok());
 1346|      1|        Ok(())
 1347|      1|    }
 1348|       |
 1349|       |    #[test]
 1350|      1|    fn preprocess_fts_query_no_separators() {
 1351|      1|        assert_eq!(preprocess_fts_query("hello"), "hello*");
 1352|      1|        assert_eq!(preprocess_fts_query("hello world"), "hello* world*");
 1353|      1|    }
 1354|       |
 1355|       |    #[test]
 1356|      1|    fn preprocess_fts_query_with_hyphens() {
 1357|      1|        let result = preprocess_fts_query("graphrag-precompact");
 1358|      1|        assert!(result.contains("\"graphrag precompact\""));
 1359|      1|        assert!(result.contains("graphrag*"));
 1360|      1|        assert!(result.contains("precompact*"));
 1361|      1|    }
 1362|       |
 1363|       |    #[test]
 1364|      1|    fn preprocess_fts_query_with_dots() {
 1365|      1|        let result = preprocess_fts_query("v1.0.44");
 1366|      1|        assert!(result.contains("\"v1 0 44\""));
 1367|      1|        assert!(result.contains("v1*"));
 1368|      1|        assert!(result.contains("44*"));
 1369|      1|    }
 1370|       |
 1371|       |    #[test]
 1372|      1|    fn preprocess_fts_query_with_mixed_separators() {
 1373|      1|        let result = preprocess_fts_query("graphrag-precompact.sh");
 1374|      1|        assert!(result.contains("\"graphrag precompact sh\""));
 1375|      1|        assert!(result.contains("graphrag*"));
 1376|      1|    }
 1377|       |
 1378|       |    #[test]
 1379|      1|    fn preprocess_fts_query_empty_and_whitespace() {
 1380|      1|        assert_eq!(preprocess_fts_query(""), "");
 1381|      1|        assert_eq!(preprocess_fts_query("  "), "");
 1382|      1|    }
 1383|       |
 1384|       |    #[test]
 1385|      1|    fn preprocess_fts_query_strips_quotes() {
 1386|      1|        let result = preprocess_fts_query(r#"hello "world"#);
 1387|      1|        assert!(result.contains("hello*"));
 1388|      1|        assert!(result.contains("world*"));
 1389|      1|    }
 1390|       |
 1391|       |    #[test]
 1392|      1|    fn preprocess_fts_query_strips_asterisks() {
 1393|      1|        assert_eq!(preprocess_fts_query("test*"), "test*");
 1394|      1|    }
 1395|       |
 1396|       |    #[test]
 1397|      1|    fn preprocess_fts_query_strips_parens() {
 1398|      1|        let result = preprocess_fts_query("(hello)");
 1399|      1|        assert!(result.contains("hello*"));
 1400|      1|        assert!(!result.contains('('));
 1401|      1|    }
 1402|       |
 1403|       |    #[test]
 1404|      1|    fn preprocess_fts_query_filters_fts_keywords() {
 1405|      1|        let result = preprocess_fts_query("foo OR bar");
 1406|      1|        assert!(result.contains("foo*"));
 1407|      1|        assert!(result.contains("bar*"));
 1408|      1|        assert!(!result.contains("OR*"));
 1409|      1|    }
 1410|       |
 1411|       |    #[test]
 1412|      1|    fn preprocess_fts_query_only_fts_keywords() {
 1413|      1|        assert_eq!(preprocess_fts_query("OR AND NOT"), "");
 1414|      1|    }
 1415|       |
 1416|       |    #[test]
 1417|      1|    fn preprocess_fts_query_keywords_with_separators() {
 1418|      1|        let result = preprocess_fts_query("hello-OR-world");
 1419|      1|        assert!(result.contains("hello*"));
 1420|      1|        assert!(result.contains("world*"));
 1421|      1|        assert!(!result.contains("OR*"));
 1422|      1|    }
 1423|       |
 1424|       |    #[test]
 1425|      1|    fn fts_search_finds_compound_term_with_hyphen() -> TestResult {
 1426|      1|        let conn = setup_conn()?;
                                             ^0
 1427|      1|        let mut m = new_memory("mem-compound");
 1428|      1|        m.body = "the graphrag-precompact script runs daily".to_string();
 1429|      1|        insert(&conn, &m)?;
                                       ^0
 1430|      1|        conn.execute_batch(
 1431|      1|            "INSERT INTO fts_memories(rowid, name, description, body)
 1432|      1|             SELECT id, name, description, body FROM memories WHERE deleted_at IS NULL",
 1433|      0|        )?;
 1434|      1|        let rows = fts_search(&conn, "graphrag-precompact", "global", None, 10)?;
                                                                                             ^0
 1435|      1|        assert!(!rows.is_empty(), "should find compound hyphenated term");
                                                ^0
 1436|      1|        Ok(())
 1437|      1|    }
 1438|       |
 1439|       |    #[test]
 1440|      1|    fn find_by_name_any_state_returns_deleted_flag() -> TestResult {
 1441|      1|        let conn = setup_conn()?;
                                             ^0
 1442|      1|        let m = new_memory("mem-soft-del");
 1443|      1|        let id = insert(&conn, &m)?;
                                                ^0
 1444|      1|        conn.execute(
 1445|      1|            "UPDATE memories SET deleted_at = unixepoch() WHERE id = ?1",
 1446|      1|            rusqlite::params![id],
 1447|      0|        )?;
 1448|      1|        let result = find_by_name_any_state(&conn, "global", "mem-soft-del")?;
                                                                                          ^0
 1449|      1|        assert_eq!(result, Some((id, true)));
 1450|      1|        Ok(())
 1451|      1|    }
 1452|       |
 1453|       |    #[test]
 1454|      1|    fn find_by_name_any_state_returns_not_deleted() -> TestResult {
 1455|      1|        let conn = setup_conn()?;
                                             ^0
 1456|      1|        let m = new_memory("mem-active");
 1457|      1|        let id = insert(&conn, &m)?;
                                                ^0
 1458|      1|        let result = find_by_name_any_state(&conn, "global", "mem-active")?;
                                                                                        ^0
 1459|      1|        assert_eq!(result, Some((id, false)));
 1460|      1|        Ok(())
 1461|      1|    }
 1462|       |
 1463|       |    #[test]
 1464|      1|    fn find_by_name_any_state_returns_none_when_absent() -> TestResult {
 1465|      1|        let conn = setup_conn()?;
                                             ^0
 1466|      1|        let result = find_by_name_any_state(&conn, "global", "does-not-exist")?;
                                                                                            ^0
 1467|      1|        assert!(result.is_none());
 1468|      1|        Ok(())
 1469|      1|    }
 1470|       |
 1471|       |    #[test]
 1472|      1|    fn clear_deleted_at_restores_memory() -> TestResult {
 1473|      1|        let conn = setup_conn()?;
                                             ^0
 1474|      1|        let m = new_memory("mem-restore");
 1475|      1|        let id = insert(&conn, &m)?;
                                                ^0
 1476|      1|        conn.execute(
 1477|      1|            "UPDATE memories SET deleted_at = unixepoch() WHERE id = ?1",
 1478|      1|            rusqlite::params![id],
 1479|      0|        )?;
 1480|       |        // Soft-deleted: find_by_name should return None.
 1481|      1|        assert!(find_by_name(&conn, "global", "mem-restore")?.is_none());
                                                                          ^0
 1482|      1|        clear_deleted_at(&conn, id)?;
                                                 ^0
 1483|       |        // Restored: find_by_name should return Some again.
 1484|      1|        let found = find_by_name(&conn, "global", "mem-restore")?;
                                                                              ^0
 1485|      1|        assert!(found.is_some());
 1486|      1|        assert_eq!(found.unwrap().0, id);
 1487|      1|        Ok(())
 1488|      1|    }
 1489|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/storage/urls.rs:
    1|       |//! Persistence for URLs extracted from memory bodies.
    2|       |//!
    3|       |//! Manages the `memory_urls` table: insert, deduplicate, and query URLs
    4|       |//! linked to a specific memory record.
    5|       |
    6|       |use crate::errors::AppError;
    7|       |use rusqlite::Connection;
    8|       |
    9|       |/// URL extracted from a memory body.
   10|       |pub struct MemoryUrl {
   11|       |    pub url: String,
   12|       |    pub offset: Option<i64>,
   13|       |}
   14|       |
   15|       |/// Insere uma URL na tabela `memory_urls`. Ignora duplicatas silenciosamente.
   16|      7|pub fn insert_url(conn: &Connection, memory_id: i64, entry: &MemoryUrl) -> Result<(), AppError> {
   17|      7|    conn.execute(
   18|      7|        "INSERT OR IGNORE INTO memory_urls (memory_id, url, url_offset) VALUES (?1, ?2, ?3)",
   19|      7|        rusqlite::params![memory_id, entry.url, entry.offset],
   20|      0|    )?;
   21|      7|    Ok(())
   22|      7|}
   23|       |
   24|       |/// Inserts multiple URLs for a memory. Returns the count inserted (duplicates ignored).
   25|       |/// Individual errors are logged as warn and not propagated — non-critical path.
   26|      1|pub fn insert_urls(conn: &Connection, memory_id: i64, urls: &[MemoryUrl]) -> usize {
   27|      1|    let mut inserted = 0usize;
   28|      4|    for entry in urls {
                      ^3
   29|      3|        match insert_url(conn, memory_id, entry) {
   30|       |            Ok(()) => {
   31|      3|                let changed = conn.changes();
   32|      3|                if changed > 0 {
   33|      2|                    inserted += 1;
   34|      2|                }
                              ^1
   35|       |            }
   36|      0|            Err(e) => {
   37|      0|                tracing::warn!(target: "storage", url = %entry.url, error = %e, "url persistence failed");
   38|       |            }
   39|       |        }
   40|       |    }
   41|      1|    inserted
   42|      1|}
   43|       |
   44|       |/// Lists all URLs associated with a memory.
   45|      4|pub fn list_by_memory(conn: &Connection, memory_id: i64) -> Result<Vec<MemoryUrl>, AppError> {
   46|      4|    let mut stmt = conn.prepare_cached(
   47|      4|        "SELECT url, url_offset FROM memory_urls WHERE memory_id = ?1 ORDER BY id",
   48|      0|    )?;
   49|      4|    let rows = stmt.query_map(rusqlite::params![memory_id], |row| {
                                                                                ^3
   50|       |        Ok(MemoryUrl {
   51|      3|            url: row.get(0)?,
                                         ^0
   52|      3|            offset: row.get(1)?,
                                            ^0
   53|       |        })
   54|      3|    })?;
                    ^0
   55|      4|    let mut result = Vec::with_capacity(8);
   56|      7|    for row in rows {
                      ^3
   57|      3|        result.push(row?);
                                     ^0
   58|       |    }
   59|      4|    Ok(result)
   60|      4|}
   61|       |
   62|       |/// Removes all URLs for a memory.
   63|      1|pub fn delete_by_memory(conn: &Connection, memory_id: i64) -> Result<(), AppError> {
   64|      1|    conn.execute(
   65|      1|        "DELETE FROM memory_urls WHERE memory_id = ?1",
   66|      1|        rusqlite::params![memory_id],
   67|      0|    )?;
   68|      1|    Ok(())
   69|      1|}
   70|       |
   71|       |#[cfg(test)]
   72|       |mod tests {
   73|       |    use super::*;
   74|       |    use rusqlite::Connection;
   75|       |    use tempfile::TempDir;
   76|       |
   77|       |    type TestResult = Result<(), Box<dyn std::error::Error>>;
   78|       |
   79|      4|    fn setup_db() -> Result<(TempDir, Connection), Box<dyn std::error::Error>> {
   80|      4|        crate::storage::connection::register_vec_extension();
   81|      4|        let tmp = TempDir::new()?;
                                              ^0
   82|      4|        let db_path = tmp.path().join("test.db");
   83|      4|        let mut conn = Connection::open(&db_path)?;
                                                               ^0
   84|      4|        crate::migrations::runner().run(&mut conn)?;
                                                                ^0
   85|      4|        Ok((tmp, conn))
   86|      4|    }
   87|       |
   88|      4|    fn insert_test_memory(conn: &Connection) -> Result<i64, Box<dyn std::error::Error>> {
   89|      4|        conn.execute(
   90|      4|            "INSERT INTO memories (name, type, description, body, body_hash) VALUES ('mem', 'user', 'desc', 'body', 'hash')",
   91|      4|            [],
   92|      0|        )?;
   93|      4|        Ok(conn.last_insert_rowid())
   94|      4|    }
   95|       |
   96|       |    #[test]
   97|      1|    fn insert_url_persists_and_list_returns() -> TestResult {
   98|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
   99|      1|        let mem_id = insert_test_memory(&conn)?;
                                                            ^0
  100|       |
  101|      1|        insert_url(
  102|      1|            &conn,
  103|      1|            mem_id,
  104|      1|            &MemoryUrl {
  105|      1|                url: "https://example.com/page".to_string(),
  106|      1|                offset: Some(5),
  107|      1|            },
  108|      0|        )?;
  109|       |
  110|      1|        let urls = list_by_memory(&conn, mem_id)?;
                                                              ^0
  111|      1|        assert_eq!(urls.len(), 1);
  112|      1|        assert_eq!(urls[0].url, "https://example.com/page");
  113|      1|        assert_eq!(urls[0].offset, Some(5));
  114|      1|        Ok(())
  115|      1|    }
  116|       |
  117|       |    #[test]
  118|      1|    fn insert_url_duplicate_ignored() -> TestResult {
  119|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
  120|      1|        let mem_id = insert_test_memory(&conn)?;
                                                            ^0
  121|       |
  122|      1|        let entry = MemoryUrl {
  123|      1|            url: "https://example.com/dup".to_string(),
  124|      1|            offset: None,
  125|      1|        };
  126|      1|        insert_url(&conn, mem_id, &entry)?;
                                                       ^0
  127|      1|        insert_url(&conn, mem_id, &entry)?;
                                                       ^0
  128|       |
  129|      1|        let urls = list_by_memory(&conn, mem_id)?;
                                                              ^0
  130|      1|        assert_eq!(urls.len(), 1, "duplicata deve ser ignorada");
                                                ^0
  131|      1|        Ok(())
  132|      1|    }
  133|       |
  134|       |    #[test]
  135|      1|    fn insert_urls_returns_inserted_count() -> TestResult {
  136|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
  137|      1|        let mem_id = insert_test_memory(&conn)?;
                                                            ^0
  138|       |
  139|      1|        let batch = vec![
  140|      1|            MemoryUrl {
  141|      1|                url: "https://alpha.example.com".to_string(),
  142|      1|                offset: Some(0),
  143|      1|            },
  144|      1|            MemoryUrl {
  145|      1|                url: "https://beta.example.com".to_string(),
  146|      1|                offset: Some(10),
  147|      1|            },
  148|      1|            MemoryUrl {
  149|      1|                url: "https://alpha.example.com".to_string(),
  150|      1|                offset: Some(0),
  151|      1|            },
  152|       |        ];
  153|      1|        let count = insert_urls(&conn, mem_id, &batch);
  154|      1|        assert_eq!(count, 2, "only 2 unique entries must be inserted");
                                           ^0
  155|      1|        Ok(())
  156|      1|    }
  157|       |
  158|       |    #[test]
  159|      1|    fn delete_by_memory_removes_all_urls() -> TestResult {
  160|      1|        let (_tmp, conn) = setup_db()?;
                                                   ^0
  161|      1|        let mem_id = insert_test_memory(&conn)?;
                                                            ^0
  162|       |
  163|      1|        insert_url(
  164|      1|            &conn,
  165|      1|            mem_id,
  166|      1|            &MemoryUrl {
  167|      1|                url: "https://to-delete.example.com".to_string(),
  168|      1|                offset: None,
  169|      1|            },
  170|      0|        )?;
  171|      1|        assert_eq!(list_by_memory(&conn, mem_id)?.len(), 1);
                                                              ^0
  172|       |
  173|      1|        delete_by_memory(&conn, mem_id)?;
                                                     ^0
  174|      1|        assert_eq!(list_by_memory(&conn, mem_id)?.len(), 0);
                                                              ^0
  175|      1|        Ok(())
  176|      1|    }
  177|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/storage/utils.rs:
    1|       |//! Storage utility helpers shared across the storage sub-modules.
    2|       |
    3|       |use crate::constants::{MAX_SQLITE_BUSY_RETRIES, SQLITE_BUSY_BASE_DELAY_MS};
    4|       |use crate::errors::AppError;
    5|       |use rusqlite::ErrorCode;
    6|       |use std::thread;
    7|       |use std::time::Duration;
    8|       |
    9|       |/// Returns `true` when `err` wraps an `SQLITE_BUSY` (or `SQLITE_LOCKED`)
   10|       |/// condition reported by rusqlite.
   11|       |///
   12|       |/// Both `SQLITE_BUSY` (`ErrorCode::DatabaseBusy`) and `SQLITE_LOCKED`
   13|       |/// (`ErrorCode::DatabaseLocked`) indicate that the write cannot proceed
   14|       |/// immediately due to WAL concurrency.  We treat both as transient and
   15|       |/// eligible for retry.
   16|     11|pub fn is_sqlite_busy(err: &AppError) -> bool {
   17|      9|    match err {
   18|      9|        AppError::Database(rusqlite::Error::SqliteFailure(e, _)) => {
   19|      9|            e.code == ErrorCode::DatabaseBusy || e.code == ErrorCode::DatabaseLocked
                                                               ^1
   20|       |        }
   21|      2|        _ => false,
   22|       |    }
   23|     11|}
   24|       |
   25|       |/// Executes `op` up to `MAX_SQLITE_BUSY_RETRIES` times with exponential
   26|       |/// backoff whenever the operation fails with `SQLITE_BUSY` / `SQLITE_LOCKED`.
   27|       |///
   28|       |/// Delay schedule (base = `SQLITE_BUSY_BASE_DELAY_MS`):
   29|       |/// - attempt 1 → `base` ms
   30|       |/// - attempt 2 → `base * 2` ms
   31|       |/// - attempt 3 → `base * 4` ms
   32|       |/// - attempt 4 → `base * 8` ms
   33|       |/// - attempt 5 → `base * 16` ms
   34|       |///
   35|       |/// After all retries are exhausted the last `SQLITE_BUSY` error is converted
   36|       |/// to [`AppError::DbBusy`] so callers can route on exit-code `15`.
   37|     17|pub fn with_busy_retry<F>(op: F) -> Result<(), AppError>
   38|     17|where
   39|     17|    F: Fn() -> Result<(), AppError>,
   40|       |{
   41|     24|    for attempt in 0..MAX_SQLITE_BUSY_RETRIES {
                      ^23
   42|     23|        match op() {
   43|     15|            Ok(()) => return Ok(()),
   44|      8|            Err(e) if is_sqlite_busy(&e) => {
                              ^7                     ^7
   45|      7|                if crate::retry::is_kill_switch_active() {
   46|      0|                    tracing::warn!(target: "storage", "SQLITE_GRAPHRAG_DISABLE_RETRY=1, propagating SQLITE_BUSY immediately");
   47|      0|                    return Err(e);
   48|      7|                }
   49|      7|                let base_ms = SQLITE_BUSY_BASE_DELAY_MS * (1u64 << attempt);
   50|      7|                let half = base_ms / 2;
   51|      7|                let jitter = if half == 0 { 0 } else { fastrand::u64(0..half) };
                                                          ^0
   52|      7|                let delay_ms = half + jitter;
   53|      7|                tracing::debug!(
   54|       |                    target: "storage",
   55|      0|                    attempt = attempt + 1,
   56|       |                    attempt_max = MAX_SQLITE_BUSY_RETRIES,
   57|       |                    delay_ms,
   58|      0|                    "SQLITE_BUSY retry with half-jitter"
   59|       |                );
   60|      7|                thread::sleep(Duration::from_millis(delay_ms));
   61|       |            }
   62|      1|            Err(other) => return Err(other),
   63|       |        }
   64|       |    }
   65|       |
   66|      1|    tracing::error!(
   67|       |        target: "storage",
   68|       |        retries = MAX_SQLITE_BUSY_RETRIES,
   69|      0|        "SQLITE_BUSY exhausted all retries"
   70|       |    );
   71|      1|    Err(AppError::DbBusy(format!(
   72|      1|        "SQLITE_BUSY after {MAX_SQLITE_BUSY_RETRIES} retries"
   73|      1|    )))
   74|     17|}
   75|       |
   76|       |#[cfg(test)]
   77|       |mod tests {
   78|       |    use super::*;
   79|       |    use std::sync::atomic::{AtomicU32, Ordering};
   80|       |    use std::sync::Arc;
   81|       |
   82|       |    /// Helper that builds a fake `AppError::Database` wrapping
   83|       |    /// `SQLITE_BUSY` (error code 5) so that `is_sqlite_busy` can be tested
   84|       |    /// without needing a live SQLite connection.
   85|      8|    fn make_busy_error() -> AppError {
   86|       |        // rusqlite::Error::SqliteFailure requires a `ffi::Error` + optional msg.
   87|       |        // We construct it via the public `rusqlite::ffi` interface.
   88|      8|        let ffi_err = rusqlite::ffi::Error {
   89|      8|            code: ErrorCode::DatabaseBusy,
   90|      8|            extended_code: 5,
   91|      8|        };
   92|      8|        AppError::Database(rusqlite::Error::SqliteFailure(ffi_err, None))
   93|      8|    }
   94|       |
   95|      1|    fn make_locked_error() -> AppError {
   96|      1|        let ffi_err = rusqlite::ffi::Error {
   97|      1|            code: ErrorCode::DatabaseLocked,
   98|      1|            extended_code: 6,
   99|      1|        };
  100|      1|        AppError::Database(rusqlite::Error::SqliteFailure(ffi_err, None))
  101|      1|    }
  102|       |
  103|       |    #[test]
  104|      1|    fn is_sqlite_busy_detects_database_busy() {
  105|      1|        assert!(is_sqlite_busy(&make_busy_error()));
  106|      1|    }
  107|       |
  108|       |    #[test]
  109|      1|    fn is_sqlite_busy_detects_database_locked() {
  110|      1|        assert!(is_sqlite_busy(&make_locked_error()));
  111|      1|    }
  112|       |
  113|       |    #[test]
  114|      1|    fn is_sqlite_busy_rejects_other_errors() {
  115|      1|        let err = AppError::Validation("invalid field".into());
  116|      1|        assert!(!is_sqlite_busy(&err));
  117|      1|    }
  118|       |
  119|       |    #[test]
  120|      1|    fn with_busy_retry_propagates_non_busy_error() {
  121|      1|        let calls = Arc::new(AtomicU32::new(0));
  122|      1|        let calls_clone = Arc::clone(&calls);
  123|       |
  124|      1|        let result = with_busy_retry(|| {
  125|      1|            calls_clone.fetch_add(1, Ordering::SeqCst);
  126|      1|            Err(AppError::Validation("campo x".into()))
  127|      1|        });
  128|       |
  129|       |        // Non-busy errors must propagate immediately without retrying.
  130|      1|        assert_eq!(calls.load(Ordering::SeqCst), 1);
  131|      1|        assert!(matches!(result, Err(AppError::Validation(_))));
                              ^0
  132|      1|    }
  133|       |
  134|       |    #[test]
  135|      1|    fn with_busy_retry_succeeds_on_third_attempt() {
  136|      1|        let calls = Arc::new(AtomicU32::new(0));
  137|      1|        let calls_clone = Arc::clone(&calls);
  138|       |
  139|       |        // Fail twice with SQLITE_BUSY, succeed on the third call.
  140|      3|        let result = with_busy_retry(|| {
                          ^1       ^1
  141|      3|            let n = calls_clone.fetch_add(1, Ordering::SeqCst);
  142|      3|            if n < 2 {
  143|      2|                Err(make_busy_error())
  144|       |            } else {
  145|      1|                Ok(())
  146|       |            }
  147|      3|        });
  148|       |
  149|      1|        assert_eq!(calls.load(Ordering::SeqCst), 3);
  150|      1|        assert!(result.is_ok(), "expected Ok after 3rd attempt");
                                              ^0
  151|      1|    }
  152|       |
  153|       |    #[test]
  154|      1|    fn busy_retry_jitter_in_range() {
  155|       |        // Verify that the half-jitter formula stays within [base/2, base) for attempt=2.
  156|       |        // attempt=2 → base_ms = SQLITE_BUSY_BASE_DELAY_MS * 4; half = base_ms/2.
  157|       |        // We call fastrand::u64 indirectly through with_busy_retry by observing that the
  158|       |        // function completes; direct delay bounds are tested via the formula invariant.
  159|      1|        let base_ms = SQLITE_BUSY_BASE_DELAY_MS * (1u64 << 2); // attempt=2
  160|      1|        let half = base_ms / 2;
  161|    101|        for _ in 0..100 {
  162|    100|            let jitter = fastrand::u64(0..half);
  163|    100|            let delay_ms = half + jitter;
  164|    100|            assert!(
  165|    100|                delay_ms >= half && delay_ms < base_ms,
  166|      0|                "delay_ms {delay_ms} out of [{half}, {base_ms})"
  167|       |            );
  168|       |        }
  169|      1|    }
  170|       |
  171|       |    #[test]
  172|      1|    fn with_busy_retry_returns_db_busy_after_all_retries() {
  173|      1|        let calls = Arc::new(AtomicU32::new(0));
  174|      1|        let calls_clone = Arc::clone(&calls);
  175|       |
  176|      5|        let result = with_busy_retry(|| {
                          ^1       ^1
  177|      5|            calls_clone.fetch_add(1, Ordering::SeqCst);
  178|      5|            Err(make_busy_error())
  179|      5|        });
  180|       |
  181|      1|        assert_eq!(
  182|      1|            calls.load(Ordering::SeqCst),
  183|       |            MAX_SQLITE_BUSY_RETRIES,
  184|      0|            "must attempt exactly MAX_SQLITE_BUSY_RETRIES times"
  185|       |        );
  186|      1|        assert!(
  187|      1|            matches!(result, Err(AppError::DbBusy(_))),
                          ^0
  188|      0|            "must convert to DbBusy after exhausting retries"
  189|       |        );
  190|      1|    }
  191|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/storage/versions.rs:
    1|       |//! Version history storage for memory records.
    2|       |//!
    3|       |//! Manages the `memory_versions` table: inserts a new version snapshot on
    4|       |//! every update so the `restore` command can roll back to any prior body.
    5|       |
    6|       |use crate::errors::AppError;
    7|       |use rusqlite::{params, Connection};
    8|       |
    9|       |#[allow(clippy::too_many_arguments)]
   10|      0|pub fn insert_version(
   11|      0|    conn: &Connection,
   12|      0|    memory_id: i64,
   13|      0|    version: i64,
   14|      0|    name: &str,
   15|      0|    memory_type: &str,
   16|      0|    description: &str,
   17|      0|    body: &str,
   18|      0|    metadata: &str,
   19|      0|    changed_by: Option<&str>,
   20|      0|    change_reason: &str,
   21|      0|) -> Result<(), AppError> {
   22|      0|    conn.execute(
   23|      0|        "INSERT INTO memory_versions
   24|      0|         (memory_id, version, name, type, description, body, metadata, changed_by, change_reason)
   25|      0|         VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)",
   26|      0|        params![
   27|       |            memory_id,
   28|       |            version,
   29|       |            name,
   30|       |            memory_type,
   31|       |            description,
   32|       |            body,
   33|       |            metadata,
   34|       |            changed_by,
   35|       |            change_reason
   36|       |        ],
   37|      0|    )?;
   38|      0|    Ok(())
   39|      0|}
   40|       |
   41|      0|pub fn next_version(conn: &Connection, memory_id: i64) -> Result<i64, AppError> {
   42|      0|    let v: i64 = conn.query_row(
   43|      0|        "SELECT COALESCE(MAX(version), 0) + 1 FROM memory_versions WHERE memory_id = ?1",
   44|      0|        params![memory_id],
   45|      0|        |r| r.get(0),
   46|      0|    )?;
   47|      0|    Ok(v)
   48|      0|}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/system_load.rs:
    1|       |//! G28-D: system load average observation before spawning LLM subprocesses.
    2|       |//!
    3|       |//! The 2026-06-03 incident saturated a 10-CPU host with load 276 because
    4|       |//! parallel `enrich` workers kept spawning `claude -p` / `codex exec`
    5|       |//! children even when the system was already at saturation. This module
    6|       |//! exposes a single helper that returns `true` when the 1-minute load
    7|       |//! average is above `2 × ncpus` (the conservative threshold the G28-D
    8|       |//! original discussion recommended).
    9|       |//!
   10|       |//! Uses `sysinfo::System::load_average()` which is already a transitive
   11|       |//! dependency of the project. The read is cheap (single syscall on
   12|       |//! Linux) and throttled to once per second via a Mutex-cached timestamp.
   13|       |
   14|       |use std::sync::Mutex;
   15|       |use std::time::{Duration, Instant};
   16|       |
   17|       |static LAST_REFRESH: Mutex<Option<Instant>> = Mutex::new(None);
   18|       |
   19|       |/// Returns the 1-minute load average as reported by the OS.
   20|       |///
   21|       |/// On platforms where `sysinfo` cannot read load average (very old Linux
   22|       |/// without /proc/loadavg), returns `0.0` so callers default to "no
   23|       |/// saturation detected".
   24|      2|pub fn load_average_one() -> f64 {
   25|      2|    let _ = ensure_fresh();
   26|      2|    sysinfo::System::load_average().one
   27|      2|}
   28|       |
   29|       |/// Returns the number of logical CPUs the runtime can detect.
   30|       |///
   31|       |/// Used together with [`load_average_one`] to apply a saturation check.
   32|      2|pub fn ncpus() -> usize {
   33|      2|    std::thread::available_parallelism()
   34|      2|        .map(|n| n.get())
   35|      2|        .unwrap_or(4)
   36|      2|}
   37|       |
   38|       |/// G28-D: returns `true` when the 1-minute load average exceeds
   39|       |/// `2 × ncpus` (the conservative threshold originally proposed in the
   40|       |/// G28 audit). The default threshold can be overridden by the
   41|       |/// `SQLITE_GRAPHRAG_MAX_LOAD_PER_NCPU` env var.
   42|      1|pub fn is_system_saturated() -> bool {
   43|      1|    let load = load_average_one();
   44|      1|    let n = ncpus() as f64;
   45|      1|    let multiplier: f64 = std::env::var("SQLITE_GRAPHRAG_MAX_LOAD_PER_NCPU")
   46|      1|        .ok()
   47|      1|        .and_then(|v| v.parse().ok())
                                    ^0        ^0
   48|      1|        .unwrap_or(2.0);
   49|      1|    load > n * multiplier
   50|      1|}
   51|       |
   52|       |/// Throttles the cached refresh timestamp so we read /proc/loadavg at
   53|       |/// most once per second across all callers. The function returns the
   54|       |/// previous timestamp (or None on first call) so the caller can decide
   55|       |/// whether to actually invoke the syscall.
   56|      4|fn ensure_fresh() -> Option<Instant> {
   57|      4|    let mut guard = LAST_REFRESH.lock().expect("loadavg mutex poisoned");
   58|      4|    let now = Instant::now();
   59|      4|    let should_refresh = guard
   60|      4|        .as_ref()
   61|      4|        .is_none_or(|last| now.duration_since(*last) > Duration::from_secs(1));
                                         ^3                          ^3
   62|      4|    let prev = guard.as_ref().copied();
   63|      4|    if should_refresh {
   64|      1|        *guard = Some(now);
   65|      3|    }
   66|      4|    prev
   67|      4|}
   68|       |
   69|       |#[cfg(test)]
   70|       |mod tests {
   71|       |    use super::*;
   72|       |
   73|       |    #[test]
   74|      1|    fn ncpus_is_at_least_one() {
   75|      1|        assert!(ncpus() >= 1);
   76|      1|    }
   77|       |
   78|       |    #[test]
   79|      1|    fn load_average_is_non_negative() {
   80|      1|        assert!(load_average_one() >= 0.0);
   81|      1|    }
   82|       |
   83|       |    #[test]
   84|      1|    fn saturation_default_threshold_is_two() {
   85|       |        // G28-D default: 2 × ncpus. Operators can lower it via env var
   86|       |        // when running on contended CI runners.
   87|      1|        let env_default = std::env::var("SQLITE_GRAPHRAG_MAX_LOAD_PER_NCPU")
   88|      1|            .ok()
   89|      1|            .and_then(|v| v.parse().ok())
                                        ^0        ^0
   90|      1|            .unwrap_or(2.0);
   91|      1|        assert!(env_default >= 1.0);
   92|      1|    }
   93|       |
   94|       |    #[test]
   95|      1|    fn saturation_check_does_not_panic() {
   96|       |        // The function must always return a definitive answer.
   97|      1|        let _ = is_system_saturated();
   98|      1|    }
   99|       |
  100|       |    #[test]
  101|      1|    fn ensure_fresh_returns_previous_then_sets_new() {
  102|      1|        let prev = ensure_fresh();
  103|       |        // On the first call prev is None; subsequent calls return Some.
  104|      1|        if prev.is_none() {
  105|      1|            let second = ensure_fresh();
  106|       |            // Within the same second the cache is fresh so prev is Some.
  107|      1|            assert!(second.is_some());
  108|      0|        }
  109|      1|    }
  110|       |}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/telemetry.rs:
    1|       |//! Centralized tracing subscriber initialization.
    2|       |//!
    3|       |//! Configures the global subscriber with JSON or pretty format,
    4|       |//! installs the panic hook and the log-to-tracing bridge.
    5|       |
    6|       |use tracing_subscriber::EnvFilter;
    7|       |
    8|       |/// Initializes the global tracing subscriber, panic hook, and log bridge.
    9|       |///
   10|       |/// Must be called exactly once, before any tracing events are emitted.
   11|       |/// After this call, panics on any thread produce `tracing::error!` events,
   12|       |/// and `log` crate events from dependencies (refinery, ureq, ort) are
   13|       |/// forwarded to the tracing subscriber.
   14|      0|pub fn init_tracing(log_level: &str, log_format: &str) {
   15|       |    // TR02: the log→tracing bridge is activated automatically by
   16|       |    // tracing-subscriber's built-in `tracing-log` feature (default).
   17|       |    // Calling LogTracer::init() separately would conflict with the
   18|       |    // global logger that tracing-subscriber installs via .init().
   19|      0|    let use_ansi = crate::terminal::should_use_ansi();
   20|       |
   21|      0|    if log_format == "json" {
   22|      0|        tracing_subscriber::fmt()
   23|      0|            .json()
   24|      0|            .with_ansi(false)
   25|      0|            .with_thread_ids(true)
   26|      0|            .with_thread_names(true)
   27|      0|            .with_env_filter(EnvFilter::new(log_level))
   28|      0|            .with_writer(std::io::stderr)
   29|      0|            .init();
   30|      0|    } else {
   31|      0|        tracing_subscriber::fmt()
   32|      0|            .with_ansi(use_ansi)
   33|      0|            .with_env_filter(EnvFilter::new(log_level))
   34|      0|            .with_writer(std::io::stderr)
   35|      0|            .init();
   36|      0|    }
   37|       |
   38|       |    // TR05: confirm effective filter after init
   39|      0|    tracing::debug!(
   40|       |        target: "telemetry",
   41|       |        filter = %log_level,
   42|       |        format = %log_format,
   43|       |        ansi = use_ansi,
   44|      0|        "tracing subscriber initialized"
   45|       |    );
   46|       |
   47|       |    // TR01: panic hook emitting structured tracing::error!
   48|      0|    let prev_hook = std::panic::take_hook();
   49|      0|    std::panic::set_hook(Box::new(move |info| {
   50|      0|        let payload = info
   51|      0|            .payload()
   52|      0|            .downcast_ref::<&str>()
   53|      0|            .copied()
   54|      0|            .or_else(|| info.payload().downcast_ref::<String>().map(|s| s.as_str()))
   55|      0|            .unwrap_or("<non-string panic>");
   56|      0|        let location = info
   57|      0|            .location()
   58|      0|            .map(|l| format!("{}:{}:{}", l.file(), l.line(), l.column()));
   59|      0|        tracing::error!(
   60|       |            target: "panic",
   61|       |            message = %payload,
   62|      0|            location = location.as_deref().unwrap_or("unknown"),
   63|      0|            "thread panicked"
   64|       |        );
   65|      0|        prev_hook(info);
   66|      0|    }));
   67|      0|}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/terminal.rs:
    1|       |//! Cross-platform terminal initialization: UTF-8, ANSI colors, NO_COLOR.
    2|       |
    3|       |/// Initializes the console for correct UTF-8 output and ANSI escape
    4|       |/// support.  On non-Windows platforms this is a no-op because modern
    5|       |/// Unix terminals handle both natively.
    6|      0|pub fn init_console() {
    7|       |    #[cfg(windows)]
    8|       |    init_windows_console();
    9|      0|}
   10|       |
   11|       |#[cfg(windows)]
   12|       |fn init_windows_console() {
   13|       |    use windows_sys::Win32::Foundation::{HANDLE, INVALID_HANDLE_VALUE};
   14|       |    use windows_sys::Win32::System::Console::{
   15|       |        GetConsoleMode, GetStdHandle, SetConsoleCP, SetConsoleMode, SetConsoleOutputCP,
   16|       |        ENABLE_VIRTUAL_TERMINAL_PROCESSING, STD_ERROR_HANDLE, STD_OUTPUT_HANDLE,
   17|       |    };
   18|       |    const CP_UTF8: u32 = 65001;
   19|       |
   20|       |    // SAFETY: Win32 console functions are safe to call from a single-threaded
   21|       |    // context before any output occurs.  GetStdHandle returns
   22|       |    // INVALID_HANDLE_VALUE on failure (checked below); SetConsoleMode failure
   23|       |    // is silently tolerated so the CLI degrades to plain text.
   24|       |    // G29 (v1.0.68): HANDLE was `isize` in windows-sys <= 0.52 and became
   25|       |    // `*mut c_void` in >= 0.59; the previous comparison `handle != 0 &&
   26|       |    // handle as isize != -1` only worked for the old type and now fails
   27|       |    // compilation.  Replaced with the type-safe idiom `!handle.is_null() &&
   28|       |    // handle != INVALID_HANDLE_VALUE`, which works for both type eras and
   29|       |    // also catches the distinct INVALID_HANDLE_VALUE sentinel ((HANDLE)-1).
   30|       |    unsafe {
   31|       |        SetConsoleOutputCP(CP_UTF8);
   32|       |        SetConsoleCP(CP_UTF8);
   33|       |
   34|       |        for handle_id in [STD_OUTPUT_HANDLE, STD_ERROR_HANDLE] {
   35|       |            let handle: HANDLE = GetStdHandle(handle_id);
   36|       |            if !handle.is_null() && handle != INVALID_HANDLE_VALUE {
   37|       |                let mut mode: u32 = 0;
   38|       |                if GetConsoleMode(handle, &mut mode) != 0 {
   39|       |                    let _ = SetConsoleMode(handle, mode | ENABLE_VIRTUAL_TERMINAL_PROCESSING);
   40|       |                }
   41|       |            }
   42|       |        }
   43|       |    }
   44|       |}
   45|       |
   46|       |/// Returns whether ANSI escape codes should be emitted to stderr.
   47|       |///
   48|       |/// Precedence:
   49|       |/// 1. `NO_COLOR` set (any value) → false (<https://no-color.org> standard)
   50|       |/// 2. `CLICOLOR_FORCE=1` → true (force colors even without TTY)
   51|       |/// 3. stderr is a terminal → true
   52|       |/// 4. fallback → false
   53|      0|pub fn should_use_ansi() -> bool {
   54|      0|    if std::env::var_os("NO_COLOR").is_some() {
   55|      0|        return false;
   56|      0|    }
   57|      0|    if std::env::var("CLICOLOR_FORCE").ok().as_deref() == Some("1") {
   58|      0|        return true;
   59|      0|    }
   60|      0|    std::io::IsTerminal::is_terminal(&std::io::stderr())
   61|      0|}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/tokenizer.rs:
    1|       |//! Token-count utilities for embedding input sizing.
    2|       |//!
    3|       |//! Provides fast approximate token counting used to decide whether a body
    4|       |//! fits in a single chunk or requires the multi-chunk splitter.
    5|       |
    6|       |use crate::constants::PASSAGE_PREFIX;
    7|       |use crate::errors::AppError;
    8|       |use fastembed::{EmbeddingModel, TextEmbedding};
    9|       |use huggingface_hub::api::sync::ApiBuilder;
   10|       |use std::path::{Path, PathBuf};
   11|       |use std::sync::OnceLock;
   12|       |use tokenizers::Tokenizer;
   13|       |
   14|       |struct TokenizerRuntime {
   15|       |    tokenizer: Tokenizer,
   16|       |    model_max_length: usize,
   17|       |}
   18|       |
   19|       |static TOKENIZER_RUNTIME: OnceLock<TokenizerRuntime> = OnceLock::new();
   20|       |
   21|       |/// Returns the process-wide [`Tokenizer`] singleton, initializing it on first call.
   22|       |///
   23|       |/// # Errors
   24|       |/// Returns `Err` when the tokenizer files cannot be loaded from `models_dir`.
   25|      0|pub fn get_tokenizer(models_dir: &Path) -> Result<&'static Tokenizer, AppError> {
   26|      0|    Ok(&get_runtime(models_dir)?.tokenizer)
   27|      0|}
   28|       |
   29|       |/// Returns the model's `model_max_length` from `tokenizer_config.json`.
   30|       |///
   31|       |/// # Errors
   32|       |/// Returns `Err` when the tokenizer files cannot be loaded or the field is missing.
   33|      0|pub fn get_model_max_length(models_dir: &Path) -> Result<usize, AppError> {
   34|      0|    Ok(get_runtime(models_dir)?.model_max_length)
   35|      0|}
   36|       |
   37|       |/// Counts the tokens produced by encoding `text` with the passage prefix.
   38|       |///
   39|       |/// Prepends `PASSAGE_PREFIX` before tokenizing so the count reflects the actual
   40|       |/// number of tokens consumed by the embedding model.
   41|       |///
   42|       |/// # Errors
   43|       |/// Returns `Err` when the tokenizer fails to encode the input.
   44|      0|pub fn count_passage_tokens(tokenizer: &Tokenizer, text: &str) -> Result<usize, AppError> {
   45|      0|    let prefixed = format!("{PASSAGE_PREFIX}{text}");
   46|      0|    count_tokens(tokenizer, &prefixed)
   47|      0|}
   48|       |
   49|       |/// Returns the byte-offset pairs `(start, end)` for each token in `text`.
   50|       |///
   51|       |/// The passage prefix is prepended before tokenizing; offsets in the returned
   52|       |/// vector are adjusted back to be relative to the original `text` slice.
   53|       |///
   54|       |/// # Errors
   55|       |/// Returns `Err` when the tokenizer fails to encode the input.
   56|      0|pub fn passage_token_offsets(
   57|      0|    tokenizer: &Tokenizer,
   58|      0|    text: &str,
   59|      0|) -> Result<Vec<(usize, usize)>, AppError> {
   60|      0|    let prefixed = format!("{PASSAGE_PREFIX}{text}");
   61|      0|    let prefix_len = PASSAGE_PREFIX.len();
   62|      0|    let encoding = tokenizer
   63|      0|        .encode(prefixed, true)
   64|      0|        .map_err(|e| AppError::Embedding(e.to_string()))?;
   65|       |
   66|      0|    let mut offsets = Vec::with_capacity(encoding.get_offsets().len());
   67|      0|    for &(start, end) in encoding.get_offsets() {
   68|      0|        if end <= start || end <= prefix_len {
   69|      0|            continue;
   70|      0|        }
   71|       |
   72|      0|        let adjusted_start = start.saturating_sub(prefix_len).min(text.len());
   73|      0|        let adjusted_end = end.saturating_sub(prefix_len).min(text.len());
   74|       |
   75|      0|        if adjusted_end > adjusted_start
   76|      0|            && text.is_char_boundary(adjusted_start)
   77|      0|            && text.is_char_boundary(adjusted_end)
   78|      0|        {
   79|      0|            offsets.push((adjusted_start, adjusted_end));
   80|      0|        }
   81|       |    }
   82|       |
   83|      0|    if offsets.is_empty() && !text.is_empty() {
   84|      0|        offsets.push((0, text.len()));
   85|      0|    }
   86|       |
   87|      0|    Ok(offsets)
   88|      0|}
   89|       |
   90|      0|fn count_tokens(tokenizer: &Tokenizer, text: &str) -> Result<usize, AppError> {
   91|      0|    let encoding = tokenizer
   92|      0|        .encode(text, true)
   93|      0|        .map_err(|e| AppError::Embedding(e.to_string()))?;
   94|      0|    Ok(encoding.len())
   95|      0|}
   96|       |
   97|      0|fn get_runtime(models_dir: &Path) -> Result<&'static TokenizerRuntime, AppError> {
   98|      0|    if let Some(runtime) = TOKENIZER_RUNTIME.get() {
   99|      0|        return Ok(runtime);
  100|      0|    }
  101|       |
  102|      0|    let runtime = load_runtime(models_dir)?;
  103|      0|    let _ = TOKENIZER_RUNTIME.set(runtime);
  104|      0|    Ok(TOKENIZER_RUNTIME
  105|      0|        .get()
  106|      0|        .expect("OnceLock::set succeeded above; get cannot fail in this single-init path"))
  107|      0|}
  108|       |
  109|      0|fn load_runtime(models_dir: &Path) -> Result<TokenizerRuntime, AppError> {
  110|      0|    let model_info = TextEmbedding::get_model_info(&EmbeddingModel::MultilingualE5Small)
  111|      0|        .map_err(|e| AppError::Embedding(e.to_string()))?;
  112|       |
  113|      0|    let cache_dir = std::env::var("HF_HOME")
  114|      0|        .map(PathBuf::from)
  115|      0|        .unwrap_or_else(|_| models_dir.to_path_buf());
  116|      0|    let endpoint =
  117|      0|        std::env::var("HF_ENDPOINT").unwrap_or_else(|_| "https://huggingface.co".to_string());
  118|       |
  119|      0|    let api = ApiBuilder::new()
  120|      0|        .with_cache_dir(cache_dir)
  121|      0|        .with_endpoint(endpoint)
  122|      0|        .with_progress(false)
  123|      0|        .build()
  124|      0|        .map_err(|e| AppError::Embedding(e.to_string()))?;
  125|      0|    let repo = api.model(model_info.model_code.clone());
  126|       |
  127|      0|    let tokenizer_bytes =
  128|      0|        std::fs::read(repo.get("tokenizer.json").map_err(map_hf_err)?).map_err(AppError::Io)?;
  129|      0|    let tokenizer_config_bytes =
  130|      0|        std::fs::read(repo.get("tokenizer_config.json").map_err(map_hf_err)?)
  131|      0|            .map_err(AppError::Io)?;
  132|       |
  133|      0|    let tokenizer =
  134|      0|        Tokenizer::from_bytes(tokenizer_bytes).map_err(|e| AppError::Embedding(e.to_string()))?;
  135|      0|    let tokenizer_config: serde_json::Value =
  136|      0|        serde_json::from_slice(&tokenizer_config_bytes).map_err(AppError::Json)?;
  137|      0|    let model_max_length = tokenizer_config["model_max_length"]
  138|      0|        .as_u64()
  139|      0|        .map(|n| n as usize)
  140|      0|        .or_else(|| {
  141|      0|            tokenizer_config["model_max_length"]
  142|      0|                .as_f64()
  143|      0|                .map(|n| n as usize)
  144|      0|        })
  145|      0|        .ok_or_else(|| {
  146|      0|            AppError::Embedding("tokenizer_config.json missing model_max_length field".into())
  147|      0|        })?;
  148|       |
  149|      0|    Ok(TokenizerRuntime {
  150|      0|        tokenizer,
  151|      0|        model_max_length,
  152|      0|    })
  153|      0|}
  154|       |
  155|      0|fn map_hf_err(err: huggingface_hub::api::sync::ApiError) -> AppError {
  156|      0|    AppError::Embedding(err.to_string())
  157|      0|}

/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/tz.rs:
    1|       |//! Display timezone for `*_iso` fields in JSON output.
    2|       |//!
    3|       |//! Precedence (highest to lowest priority):
    4|       |//! 1. `--tz <IANA>` flag passed on the CLI
    5|       |//! 2. Env var `SQLITE_GRAPHRAG_DISPLAY_TZ`
    6|       |//! 3. Fallback UTC
    7|       |//!
    8|       |//! The timezone is initialized once via [`init`][crate::tz::init] and stored in
    9|       |//! `GLOBAL_TZ` (OnceLock). After initialization, [`format_iso`][crate::tz::format_iso] and
   10|       |//! [`epoch_to_iso`][crate::tz::epoch_to_iso] convert timestamps applying the chosen timezone.
   11|       |
   12|       |use crate::errors::AppError;
   13|       |use crate::i18n::validation;
   14|       |use chrono::{DateTime, TimeZone, Utc};
   15|       |use chrono_tz::Tz;
   16|       |use std::sync::OnceLock;
   17|       |
   18|       |static GLOBAL_TZ: OnceLock<Tz> = OnceLock::new();
   19|       |
   20|       |/// Resolves the timezone from the `SQLITE_GRAPHRAG_DISPLAY_TZ` env var.
   21|       |///
   22|       |/// Returns `Tz::UTC` if the variable is absent or empty.
   23|       |/// Returns a validation error if the value is an invalid IANA name.
   24|      4|fn resolve_tz_from_env() -> Result<Tz, AppError> {
   25|      4|    match std::env::var("SQLITE_GRAPHRAG_DISPLAY_TZ") {
   26|      3|        Ok(v) if !v.trim().is_empty() => v
   27|      3|            .trim()
   28|      3|            .parse::<Tz>()
   29|      3|            .map_err(|_| AppError::Validation(validation::invalid_tz(v.trim()))),
                                                            ^1                     ^1
   30|      1|        _ => Ok(Tz::UTC),
   31|       |    }
   32|      4|}
   33|       |
   34|       |/// Initializes the global timezone.
   35|       |///
   36|       |/// `explicit` — value from the `--tz` CLI flag (already parsed).
   37|       |/// If `explicit` is `None`, tries `SQLITE_GRAPHRAG_DISPLAY_TZ`, then UTC.
   38|       |///
   39|       |/// Subsequent calls are silently ignored (OnceLock semantics).
   40|       |/// Returns an error only if `explicit` is `None` and the env var is invalid.
   41|      0|pub fn init(explicit: Option<Tz>) -> Result<(), AppError> {
   42|      0|    let fuso = match explicit {
   43|      0|        Some(tz) => tz,
   44|      0|        None => resolve_tz_from_env()?,
   45|       |    };
   46|      0|    let _ = GLOBAL_TZ.set(fuso);
   47|      0|    Ok(())
   48|      0|}
   49|       |
   50|       |/// Returns the active timezone.
   51|       |///
   52|       |/// If [`init`] was never called, tries to read the env var; fallback UTC.
   53|      7|pub fn current_tz() -> Tz {
   54|      7|    *GLOBAL_TZ.get_or_init(|| resolve_tz_from_env().unwrap_or(Tz::UTC))
                                            ^1                    ^1        ^1
   55|      7|}
   56|       |
   57|       |/// Formats a `DateTime<Utc>` using the global timezone.
   58|       |///
   59|       |/// Format: `%Y-%m-%dT%H:%M:%S%:z` (e.g. `2026-04-19T10:00:00+00:00` for UTC,
   60|       |/// `2026-04-19T07:00:00-03:00` for `America/Sao_Paulo`).
   61|      7|pub fn format_iso(ts: DateTime<Utc>) -> String {
   62|      7|    let fuso = current_tz();
   63|      7|    ts.with_timezone(&fuso)
   64|      7|        .format("%Y-%m-%dT%H:%M:%S%:z")
   65|      7|        .to_string()
   66|      7|}
   67|       |
   68|       |/// Converts a Unix epoch (seconds) to an ISO 8601 string with the global timezone.
   69|       |///
   70|       |/// Values outside the representable range return the fallback
   71|       |/// `"1970-01-01T00:00:00+00:00"`.
   72|      9|pub fn epoch_to_iso(epoch: i64) -> String {
   73|      9|    Utc.timestamp_opt(epoch, 0)
   74|      9|        .single()
   75|      9|        .map(format_iso)
   76|      9|        .unwrap_or_else(|| "1970-01-01T00:00:00+00:00".to_string())
                                         ^2                          ^2
   77|      9|}
   78|       |
   79|       |#[cfg(test)]
   80|       |mod tests {
   81|       |    use super::*;
   82|       |    use serial_test::serial;
   83|       |
   84|       |    #[test]
   85|       |    #[serial]
   86|      1|    fn utc_default_when_env_missing() {
   87|       |        // Remove variable to ensure UTC fallback
   88|      1|        std::env::remove_var("SQLITE_GRAPHRAG_DISPLAY_TZ");
   89|      1|        let result = resolve_tz_from_env().expect("must not fail with env absent");
   90|      1|        assert_eq!(result, Tz::UTC);
   91|       |    }
   92|       |
   93|       |    #[test]
   94|       |    #[serial]
   95|      1|    fn env_valid_applies_timezone() {
   96|      1|        std::env::set_var("SQLITE_GRAPHRAG_DISPLAY_TZ", "America/Sao_Paulo");
   97|      1|        let result = resolve_tz_from_env().expect("America/Sao_Paulo is valid");
   98|      1|        assert_eq!(result.name(), "America/Sao_Paulo");
   99|      1|        std::env::remove_var("SQLITE_GRAPHRAG_DISPLAY_TZ");
  100|       |    }
  101|       |
  102|       |    #[test]
  103|       |    #[serial]
  104|      1|    fn env_invalid_returns_validation_error() {
  105|      1|        std::env::set_var("SQLITE_GRAPHRAG_DISPLAY_TZ", "Invalid/Nonexistent");
  106|      1|        let result = resolve_tz_from_env();
  107|      1|        assert!(result.is_err(), "invalid timezone must return Err");
                                               ^0
  108|      1|        match result {
  109|      1|            Err(AppError::Validation(msg)) => {
  110|      1|                assert!(
  111|      1|                    msg.contains("SQLITE_GRAPHRAG_DISPLAY_TZ"),
  112|      0|                    "message must cite the env var"
  113|       |                );
  114|      1|                assert!(
  115|      1|                    msg.contains("Invalid/Nonexistent"),
  116|      0|                    "message must cite the invalid value"
  117|       |                );
  118|       |            }
  119|      0|            other => unreachable!("expected AppError::Validation, got: {other:?}"),
  120|       |        }
  121|      1|        std::env::remove_var("SQLITE_GRAPHRAG_DISPLAY_TZ");
  122|       |    }
  123|       |
  124|       |    #[test]
  125|      1|    fn epoch_zero_yields_utc_iso() {
  126|       |        // Tests epoch_to_iso directly without global state
  127|      1|        std::env::remove_var("SQLITE_GRAPHRAG_DISPLAY_TZ");
  128|      1|        let result = {
  129|       |            // Applies UTC directly without using GLOBAL_TZ
  130|      1|            let tz = Tz::UTC;
  131|      1|            Utc.timestamp_opt(0, 0)
  132|      1|                .single()
  133|      1|                .map(|dt| {
  134|      1|                    dt.with_timezone(&tz)
  135|      1|                        .format("%Y-%m-%dT%H:%M:%S%:z")
  136|      1|                        .to_string()
  137|      1|                })
  138|      1|                .unwrap_or_else(|| "1970-01-01T00:00:00+00:00".to_string())
                                                 ^0                          ^0
  139|       |        };
  140|      1|        assert_eq!(result, "1970-01-01T00:00:00+00:00");
  141|      1|    }
  142|       |
  143|       |    #[test]
  144|      1|    fn format_iso_utc_preserves_zero_offset() {
  145|      1|        let ts = Utc.timestamp_opt(1_705_320_000, 0).single().unwrap();
  146|       |        // Applies UTC directly
  147|      1|        let result = ts
  148|      1|            .with_timezone(&Tz::UTC)
  149|      1|            .format("%Y-%m-%dT%H:%M:%S%:z")
  150|      1|            .to_string();
  151|      1|        assert_eq!(result, "2024-01-15T12:00:00+00:00");
  152|      1|    }
  153|       |
  154|       |    #[test]
  155|      1|    fn format_iso_sao_paulo_applies_offset() {
  156|      1|        let ts = Utc.timestamp_opt(1_705_320_000, 0).single().unwrap();
  157|      1|        let sao_paulo: Tz = "America/Sao_Paulo".parse().unwrap();
  158|      1|        let result = ts
  159|      1|            .with_timezone(&sao_paulo)
  160|      1|            .format("%Y-%m-%dT%H:%M:%S%:z")
  161|      1|            .to_string();
  162|       |        // America/Sao_Paulo in January is UTC-3
  163|      1|        assert!(
  164|      1|            result.contains("-03:00"),
  165|      0|            "expected offset -03:00, got: {result}"
  166|       |        );
  167|      1|    }
  168|       |}