/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/chunking.rs:
1| |//! Semantic chunking for embedding inputs (Markdown-aware, 512-token limit).
2| |//!
3| |//! Splits bodies using [`text_splitter::MarkdownSplitter`] with overlap so
4| |//! multi-chunk memories preserve context across chunk boundaries.
5| |
6| |// src/chunking.rs
7| |// Token-based chunking for E5 model (512 token limit)
8| |
9| |use crate::constants::{CHUNK_OVERLAP_TOKENS, CHUNK_SIZE_TOKENS, EMBEDDING_DIM};
10| |use text_splitter::{ChunkConfig, MarkdownSplitter};
11| |use tokenizers::Tokenizer;
12| |
// Deliberately conservative: the previous value (4 chars/token) allowed chunks that
// were too large for some real documents.
/// Characters per token heuristic: 2 chars/token reduces the risk of underestimating
/// real token counts in Markdown, code, and multilingual text.
const CHARS_PER_TOKEN: usize = 2;

/// Maximum character length of a single chunk (derived from token limit × chars-per-token).
pub const CHUNK_SIZE_CHARS: usize = CHUNK_SIZE_TOKENS * CHARS_PER_TOKEN;

/// Character overlap between consecutive chunks to preserve cross-boundary context.
pub const CHUNK_OVERLAP_CHARS: usize = CHUNK_OVERLAP_TOKENS * CHARS_PER_TOKEN;
25| |
/// A contiguous slice of a body string identified by byte offsets.
///
/// The offsets index into the body the chunk was produced from; the chunk does
/// not own or borrow any text itself.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk {
    /// Byte offset of the first character (inclusive).
    pub start_offset: usize,
    /// Byte offset past the last character (exclusive).
    pub end_offset: usize,
    /// Token count for this chunk.
    ///
    /// On the character-heuristic paths this is `chars / CHARS_PER_TOKEN`; when the
    /// chunk is built from token offsets it is the number of token ranges covered.
    pub token_count_approx: usize,
}
36| |
37| |/// Returns `true` when `body` exceeds `CHUNK_SIZE_CHARS` and must be split.
38| 5|pub fn needs_chunking(body: &str) -> bool {
39| 5| body.len() > CHUNK_SIZE_CHARS
40| 5|}
41| |
42| |/// Splits `body` into overlapping [`Chunk`]s using a character-based heuristic.
43| |///
44| |/// Short bodies (≤ `CHUNK_SIZE_CHARS`) are returned as a single chunk.
45| |/// Splits prefer paragraph breaks, then sentence-end punctuation, then word boundaries.
46| |///
47| |/// # Errors
48| |/// This function is infallible; it returns a `Vec` directly.
49| 3|pub fn split_into_chunks(body: &str) -> Vec<Chunk> {
50| 3| if !needs_chunking(body) {
51| 1| return vec![Chunk {
52| 1| token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
53| 1| start_offset: 0,
54| 1| end_offset: body.len(),
55| 1| }];
56| 2| }
57| |
58| 2| let mut chunks = Vec::with_capacity(body.len() / CHUNK_SIZE_CHARS + 1);
59| 2| let mut start = 0usize;
60| |
61| 26| while start < body.len() {
62| 26| start = next_char_boundary(body, start);
63| 26| let desired_end = previous_char_boundary(body, (start + CHUNK_SIZE_CHARS).min(body.len()));
64| 26| let end = if desired_end < body.len() {
65| 24| find_split_boundary(body, start, desired_end)
66| | } else {
67| 2| desired_end
68| | };
69| |
70| 26| let end = if end <= start {
71| 0| let fallback = previous_char_boundary(body, (start + CHUNK_SIZE_CHARS).min(body.len()));
72| 0| if fallback > start {
73| 0| fallback
74| | } else {
75| 0| body.len()
76| | }
77| | } else {
78| 26| end
79| | };
80| |
81| 26| let token_count_approx = body[start..end].chars().count() / CHARS_PER_TOKEN;
82| 26| chunks.push(Chunk {
83| 26| start_offset: start,
84| 26| end_offset: end,
85| 26| token_count_approx,
86| 26| });
87| |
88| 26| if end >= body.len() {
89| 2| break;
90| 24| }
91| |
92| 24| let next_start = next_char_boundary(body, end.saturating_sub(CHUNK_OVERLAP_CHARS));
93| 24| start = if next_start >= end { end } else { next_start };
^0
94| | }
95| |
96| 2| chunks
97| 3|}
98| |
99| |/// Splits `body` into [`Chunk`]s using pre-computed token byte-offsets.
100| |///
101| |/// Each element of `token_offsets` is a `(start, end)` byte range for one token.
102| |/// Respects `CHUNK_SIZE_TOKENS` and `CHUNK_OVERLAP_TOKENS` constants.
103| |/// Short bodies (≤ `CHUNK_SIZE_TOKENS` tokens) are returned as a single chunk.
104| 2|pub fn split_into_chunks_by_token_offsets(
105| 2| body: &str,
106| 2| token_offsets: &[(usize, usize)],
107| 2|) -> Vec<Chunk> {
108| 2| if token_offsets.len() <= CHUNK_SIZE_TOKENS {
109| 1| return vec![Chunk {
110| 1| token_count_approx: token_offsets.len(),
111| 1| start_offset: 0,
112| 1| end_offset: body.len(),
113| 1| }];
114| 1| }
115| |
116| 1| let mut chunks = Vec::with_capacity(token_offsets.len() / CHUNK_SIZE_TOKENS + 1);
117| 1| let mut start_token = 0usize;
118| |
119| 2| while start_token < token_offsets.len() {
120| 2| let end_token = (start_token + CHUNK_SIZE_TOKENS).min(token_offsets.len());
121| |
122| 2| chunks.push(Chunk {
123| 2| start_offset: if start_token == 0 {
124| 1| 0
125| | } else {
126| 1| token_offsets[start_token].0
127| | },
128| 2| end_offset: if end_token == token_offsets.len() {
129| 1| body.len()
130| | } else {
131| 1| token_offsets[end_token - 1].1
132| | },
133| 2| token_count_approx: end_token - start_token,
134| | });
135| |
136| 2| if end_token == token_offsets.len() {
137| 1| break;
138| 1| }
139| |
140| 1| let next_start = end_token.saturating_sub(CHUNK_OVERLAP_TOKENS);
141| 1| start_token = if next_start <= start_token {
142| 0| end_token
143| | } else {
144| 1| next_start
145| | };
146| | }
147| |
148| 1| chunks
149| 2|}
150| |
151| |/// Splits body into chunks using MarkdownSplitter with a real tokenizer.
152| |/// Respects Markdown semantic boundaries (H1-H6, paragraphs, blocks).
153| |/// For plain text without Markdown markers, falls back to paragraph and sentence breaks.
154| 0|pub fn split_into_chunks_hierarchical(body: &str, tokenizer: &Tokenizer) -> Vec<Chunk> {
155| 0| if body.is_empty() {
156| 0| return Vec::new();
157| 0| }
158| |
159| 0| let config = ChunkConfig::new(CHUNK_SIZE_TOKENS)
160| 0| .with_sizer(tokenizer)
161| 0| .with_overlap(CHUNK_OVERLAP_TOKENS)
162| 0| .expect(
163| 0| "compile-time invariant: CHUNK_OVERLAP_TOKENS must be smaller than CHUNK_SIZE_TOKENS",
164| | );
165| |
166| 0| let splitter = MarkdownSplitter::new(config);
167| |
168| 0| let items: Vec<(usize, &str)> = splitter.chunk_indices(body).collect();
169| |
170| 0| if items.is_empty() {
171| 0| return vec![Chunk {
172| 0| start_offset: 0,
173| 0| end_offset: body.len(),
174| 0| token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
175| 0| }];
176| 0| }
177| |
178| 0| items
179| 0| .into_iter()
180| 0| .map(|(start, text)| {
181| 0| let end = start + text.len();
182| 0| Chunk {
183| 0| start_offset: start,
184| 0| end_offset: end,
185| 0| token_count_approx: text.chars().count() / CHARS_PER_TOKEN,
186| 0| }
187| 0| })
188| 0| .collect()
189| 0|}
190| |
191| |/// Returns the string slice of `body` described by `chunk`'s byte offsets.
192| 240|pub fn chunk_text<'a>(body: &'a str, chunk: &Chunk) -> &'a str {
193| 240| &body[chunk.start_offset..chunk.end_offset]
194| 240|}
195| |
/// Picks the byte offset at which to end a chunk that starts at `start`.
///
/// Looks inside `body[start..desired_end]` for the last paragraph break
/// (`"\n\n"`), then the last sentence end (`". "`), then the last space, and
/// returns the offset just past the first kind of separator that is present.
/// Falls back to `desired_end` when the window contains none of them.
fn find_split_boundary(body: &str, start: usize, desired_end: usize) -> usize {
    let window = &body[start..desired_end];
    // Ordered from the most to the least preferred separator.
    ["\n\n", ". ", " "]
        .iter()
        .find_map(|sep| window.rfind(sep).map(|at| start + at + sep.len()))
        .unwrap_or(desired_end)
}
209| |
/// Returns the largest UTF-8 character boundary of `body` that is `<= idx`.
///
/// `idx` is clamped to `body.len()` first, so out-of-range values are accepted.
fn previous_char_boundary(body: &str, idx: usize) -> usize {
    let clamped = idx.min(body.len());
    // Offset 0 is always a boundary, so the search cannot come up empty.
    (0..=clamped)
        .rev()
        .find(|&at| body.is_char_boundary(at))
        .unwrap_or(0)
}
217| |
/// Returns the smallest UTF-8 character boundary of `body` that is `>= idx`.
///
/// `idx` is clamped to `body.len()` first, so out-of-range values are accepted.
fn next_char_boundary(body: &str, idx: usize) -> usize {
    let end = body.len();
    let clamped = idx.min(end);
    // `body.len()` is always a boundary, so the search cannot come up empty.
    (clamped..=end)
        .find(|&at| body.is_char_boundary(at))
        .unwrap_or(end)
}
225| |
226| |/// Computes the mean of `chunk_embeddings` and L2-normalizes the result.
227| |///
228| |/// Returns a zero-vector of length `EMBEDDING_DIM` when the input is empty.
229| |/// When a single embedding is provided it is returned as-is (no copy).
230| 1|pub fn aggregate_embeddings(chunk_embeddings: &[Vec<f32>]) -> Vec<f32> {
231| 1| if chunk_embeddings.is_empty() {
232| 0| return vec![0.0f32; EMBEDDING_DIM];
233| 1| }
234| 1| if chunk_embeddings.len() == 1 {
235| 0| return chunk_embeddings[0].clone();
236| 1| }
237| |
238| 1| let dim = chunk_embeddings[0].len();
239| 1| let mut mean = vec![0.0f32; dim];
240| 3| for emb in chunk_embeddings {
^2
241| 4| for (i, v) in emb.iter().enumerate() {
^2 ^2
242| 4| mean[i] += v;
243| 4| }
244| | }
245| 1| let n = chunk_embeddings.len() as f32;
246| 3| for v in &mut mean {
^2
247| 2| *v /= n;
248| 2| }
249| |
250| 2| let norm: f32 = mean.iter().map(|x| x * x).sum::<f32>().sqrt();
^1 ^1 ^1 ^1 ^1 ^1
251| 1| if norm > 1e-9 {
252| 3| for v in &mut mean {
^2
253| 2| *v /= norm;
254| 2| }
255| 0| }
256| 1| mean
257| 1|}
258| |
// Unit tests for the chunking helpers. The Markdown-splitter tests size chunks by
// character count (see `split_hier_chars`) so they do not need a tokenizer.
#[cfg(test)]
mod tests {
    use super::*;

    // A body under the size limit comes back as one chunk covering all of it.
    #[test]
    fn test_short_body_no_chunking() {
        let body = "short text";
        assert!(!needs_chunking(body));
        let chunks = split_into_chunks(body);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunk_text(body, &chunks[0]), body);
    }

    // A 5000-byte body of space-separated words must be split into non-empty chunks.
    #[test]
    fn test_long_body_produces_multiple_chunks() {
        let body = "word ".repeat(1000);
        assert!(needs_chunking(&body));
        let chunks = split_into_chunks(&body);
        assert!(chunks.len() > 1);
        assert!(chunks.iter().all(|c| !chunk_text(&body, c).is_empty()));
    }

    // 460 synthetic two-byte tokens: expects two windows, the second one starting
    // `CHUNK_OVERLAP_TOKENS` tokens before the end of the first.
    #[test]
    fn split_by_token_offsets_respeita_limite_e_overlap() {
        let body = "ab".repeat(460);
        let offsets: Vec<(usize, usize)> = (0..460)
            .map(|i| {
                let start = i * 2;
                (start, start + 2)
            })
            .collect();

        let chunks = split_into_chunks_by_token_offsets(&body, &offsets);
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].token_count_approx, CHUNK_SIZE_TOKENS);
        assert_eq!(chunks[1].token_count_approx, 110);
        assert_eq!(chunks[0].start_offset, 0);
        assert_eq!(
            chunks[1].start_offset,
            offsets[CHUNK_SIZE_TOKENS - CHUNK_OVERLAP_TOKENS].0
        );
    }

    // Two tokens fit in one window, so the single chunk spans the whole body.
    #[test]
    fn split_by_token_offsets_returns_one_chunk_when_fits() {
        let body = "texto curto";
        let offsets = vec![(0, 5), (6, 11)];
        let chunks = split_into_chunks_by_token_offsets(body, &offsets);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].start_offset, 0);
        assert_eq!(chunks[0].end_offset, body.len());
        assert_eq!(chunks[0].token_count_approx, 2);
    }

    #[test]
    fn test_multibyte_body_preserves_progress_and_boundaries() {
        // Multibyte body intentionally includes 2-byte UTF-8 sequences (Latin-1 supplement)
        // expressed as Unicode escapes so this source file remains ASCII-only per the
        // language policy. The original PT-BR phrase "a\u{e7}\u{e3}o \u{fa}til " is preserved
        // since the test exercises UTF-8 char-boundary handling.
        let body = "a\u{e7}\u{e3}o \u{fa}til ".repeat(1000);
        let chunks = split_into_chunks(&body);
        assert!(chunks.len() > 1);
        for chunk in &chunks {
            assert!(!chunk_text(&body, chunk).is_empty());
            assert!(body.is_char_boundary(chunk.start_offset));
            assert!(body.is_char_boundary(chunk.end_offset));
            assert!(chunk.end_offset > chunk.start_offset);
        }
        // Consecutive chunks must never move backwards.
        for pair in chunks.windows(2) {
            assert!(pair[1].start_offset >= pair[0].start_offset);
            assert!(pair[1].end_offset > pair[0].start_offset);
        }
    }

    // The mean of two orthogonal unit vectors must come back with unit norm.
    #[test]
    fn test_aggregate_embeddings_normalizes() {
        let embs = vec![vec![1.0f32, 0.0], vec![0.0f32, 1.0]];
        let agg = aggregate_embeddings(&embs);
        let norm: f32 = agg.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!((norm - 1.0).abs() < 1e-5);
    }

    // Test-only stand-in for `split_into_chunks_hierarchical`: same mapping from
    // splitter output to `Chunk`, but sized by characters with zero overlap.
    fn split_hier_chars(body: &str, size: usize) -> Vec<Chunk> {
        use text_splitter::{Characters, ChunkConfig, MarkdownSplitter};
        if body.is_empty() {
            return Vec::new();
        }
        let config = ChunkConfig::new(size)
            .with_sizer(Characters)
            .with_overlap(0)
            .expect("overlap must be smaller than size");
        let splitter = MarkdownSplitter::new(config);
        let items: Vec<(usize, &str)> = splitter.chunk_indices(body).collect();
        if items.is_empty() {
            return vec![Chunk {
                start_offset: 0,
                end_offset: body.len(),
                token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
            }];
        }
        items
            .into_iter()
            .map(|(start, text)| {
                let end = start + text.len();
                Chunk {
                    start_offset: start,
                    end_offset: end,
                    token_count_approx: text.chars().count() / CHARS_PER_TOKEN,
                }
            })
            .collect()
    }

    // Checks the splitter itself: an empty input yields no chunk indices.
    #[test]
    fn test_hierarchical_empty_body_returns_empty() {
        use text_splitter::{Characters, ChunkConfig, MarkdownSplitter};
        let config = ChunkConfig::new(100)
            .with_sizer(Characters)
            .with_overlap(0)
            .expect("overlap < size");
        let splitter = MarkdownSplitter::new(config);
        let result: Vec<_> = splitter.chunk_indices("").collect();
        assert!(result.is_empty());
    }

    #[test]
    fn test_markdown_h1_boundary_yields_two_chunks() {
        let body = "# Title 1\n\nbody1 body1 body1 body1 body1 body1\n\n# Title 2\n\nbody2 body2 body2 body2 body2 body2";
        let chunks = split_hier_chars(body, 30);
        assert!(
            chunks.len() >= 2,
            "expected >=2 chunks, got {}",
            chunks.len()
        );
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }

    #[test]
    fn test_markdown_h2_nested_respects_boundaries() {
        let body = "# H1\n\n## H2a\n\nParagraph A with enough text to force a split.\n\n## H2b\n\nParagraph B with enough text to force a split as well.";
        let chunks = split_hier_chars(body, 40);
        assert!(!chunks.is_empty());
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
            assert!(c.end_offset > c.start_offset);
            assert!(c.end_offset <= body.len());
        }
    }

    // Three paragraphs separated by blank lines, each longer than the 80-char limit.
    #[test]
    fn test_markdown_paragraph_soft_boundary() {
        let para = "Plain text sentence used to fill the paragraph. ";
        let body = format!(
            "{}\n\n{}\n\n{}",
            para.repeat(3),
            para.repeat(3),
            para.repeat(3)
        );
        let chunks = split_hier_chars(&body, 80);
        assert!(
            chunks.len() >= 2,
            "expected >=2 chunks with a body of {} chars",
            body.len()
        );
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }

    // Large input (>50 KB): every produced offset pair must be valid and non-empty.
    #[test]
    fn test_markdown_60kb_valid_offsets() {
        let block = "# Section\n\nBlock content text. ".repeat(1700);
        assert!(
            block.len() > 50_000,
            "body must be >50KB, has {} bytes",
            block.len()
        );
        let chunks = split_hier_chars(&block, 256);
        assert!(chunks.len() > 1);
        for c in &chunks {
            assert!(block.is_char_boundary(c.start_offset));
            assert!(block.is_char_boundary(c.end_offset));
            assert!(c.end_offset > c.start_offset);
            assert!(!chunk_text(&block, c).is_empty());
        }
    }

    // Plain text with no Markdown markers still produces valid chunks.
    #[test]
    fn test_fallback_plain_text_without_markers() {
        let body = "a ".repeat(1000);
        let chunks = split_hier_chars(&body, 100);
        assert!(!chunks.is_empty());
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/cli.rs:
1| |//! CLI argument structs and command surface (clap-based).
2| |//!
3| |//! Defines `Cli` and all subcommand enums; contains no business logic.
4| |
5| |use crate::commands::*;
6| |use crate::i18n::{current, Language};
7| |use clap::{Parser, Subcommand};
8| |
// NOTE: `///` comments on clap-derived items are rendered as help text, so they
// are part of the CLI's output; maintainer notes here use `//` instead.
// `ArgAction::Set` makes the flag take an explicit value (`--autostart-daemon=false`).
/// Common daemon-control options shared across embedding-heavy subcommands.
#[derive(clap::Args, Debug, Clone)]
pub struct DaemonOpts {
    /// Allow the CLI to spawn a background daemon if none is running.
    ///
    /// Default `true`. Pass `--autostart-daemon=false` to disable.
    /// Env var `SQLITE_GRAPHRAG_DAEMON_DISABLE_AUTOSTART=1` is honoured only when this flag is unset.
    #[arg(long, default_value_t = true, action = clap::ArgAction::Set)]
    pub autostart_daemon: bool,
}
19| |
/// Upper bound for `--max-concurrency`, derived from the CPU count.
///
/// The ceiling is twice the parallelism reported by the standard library, or 8
/// when that value cannot be determined.
fn max_concurrency_ceiling() -> usize {
    match std::thread::available_parallelism() {
        Ok(cpus) => cpus.get() * 2,
        Err(_) => 8,
    }
}
26| |
// Output formats selectable for graph export; parsed from the command line via
// `clap::ValueEnum`.
#[derive(Copy, Clone, Debug, PartialEq, Eq, clap::ValueEnum)]
pub enum GraphExportFormat {
    Json,
    Dot,
    Mermaid,
    /// Stream one JSON object per entity, then one per edge, then a summary line.
    Ndjson,
}
35| |
// Top-level argument parser. Every option below is declared `global = true`, and
// `command` selects the subcommand. The `///` comments are rendered by clap as
// help text, so maintainer notes here use `//` instead.
#[derive(Parser)]
#[command(name = "sqlite-graphrag")]
#[command(version)]
#[command(about = "Local GraphRAG memory for LLMs in a single SQLite file")]
#[command(arg_required_else_help = true)]
pub struct Cli {
    // Range-checked after parsing by `Cli::validate_flags`.
    /// Maximum number of simultaneous CLI invocations allowed (default: 4).
    ///
    /// Caps the counting semaphore used for CLI concurrency slots. The value must
    /// stay within [1, 2×nCPUs]. Values above the ceiling are rejected with exit 2.
    #[arg(long, global = true, value_name = "N")]
    pub max_concurrency: Option<usize>,

    /// Wait up to SECONDS for a free concurrency slot before giving up (exit 75).
    ///
    /// Useful in retrying agent pipelines: the process polls every 500 ms until a
    /// slot opens or the timeout expires. Default: 300s (5 minutes).
    #[arg(long, global = true, value_name = "SECONDS")]
    pub wait_lock: Option<u64>,

    // Hidden from `--help` (`hide = true`).
    /// Skip the available-memory check before loading the model.
    ///
    /// Exclusive use in automated tests where real allocation does not occur.
    #[arg(long, global = true, hide = true, default_value_t = false)]
    pub skip_memory_guard: bool,

    /// Language for human-facing stderr messages. Accepts `en` or `pt`.
    ///
    /// Without the flag, detection falls back to `SQLITE_GRAPHRAG_LANG` and then
    /// `LC_ALL`/`LANG`. JSON stdout stays deterministic and identical across
    /// languages; only human-facing strings are affected.
    #[arg(long, global = true, value_enum, value_name = "LANG")]
    pub lang: Option<crate::i18n::Language>,

    /// Time zone for `*_iso` fields in JSON output (for example `America/Sao_Paulo`).
    ///
    /// Accepts any IANA time zone name. Without the flag, it falls back to
    /// `SQLITE_GRAPHRAG_DISPLAY_TZ`; if unset, UTC is used. Integer epoch fields
    /// are not affected.
    #[arg(long, global = true, value_name = "IANA")]
    pub tz: Option<chrono_tz::Tz>,

    // `ArgAction::Count` stores how many times `-v` was passed.
    /// Increase logging verbosity (-v=info, -vv=debug, -vvv=trace).
    ///
    /// Overrides `SQLITE_GRAPHRAG_LOG_LEVEL` env var when present. Logs are emitted
    /// to stderr; JSON stdout is unaffected.
    #[arg(short = 'v', long, global = true, action = clap::ArgAction::Count)]
    pub verbose: u8,

    #[command(subcommand)]
    pub command: Commands,
}
88| |
// Parser-level tests: for each subcommand exercised here, `--format json` must
// parse and any other format value must be rejected.
#[cfg(test)]
mod json_only_format_tests {
    use super::Cli;
    use clap::Parser;

    #[test]
    fn restore_accepts_only_format_json() {
        assert!(Cli::try_parse_from([
            "sqlite-graphrag",
            "restore",
            "--name",
            "mem",
            "--version",
            "1",
            "--format",
            "json",
        ])
        .is_ok());

        assert!(Cli::try_parse_from([
            "sqlite-graphrag",
            "restore",
            "--name",
            "mem",
            "--version",
            "1",
            "--format",
            "text",
        ])
        .is_err());
    }

    #[test]
    fn hybrid_search_accepts_only_format_json() {
        assert!(Cli::try_parse_from([
            "sqlite-graphrag",
            "hybrid-search",
            "query",
            "--format",
            "json",
        ])
        .is_ok());

        assert!(Cli::try_parse_from([
            "sqlite-graphrag",
            "hybrid-search",
            "query",
            "--format",
            "markdown",
        ])
        .is_err());
    }

    #[test]
    fn remember_recall_rename_vacuum_json_only() {
        // remember
        assert!(Cli::try_parse_from([
            "sqlite-graphrag",
            "remember",
            "--name",
            "mem",
            "--type",
            "project",
            "--description",
            "desc",
            "--format",
            "json",
        ])
        .is_ok());
        assert!(Cli::try_parse_from([
            "sqlite-graphrag",
            "remember",
            "--name",
            "mem",
            "--type",
            "project",
            "--description",
            "desc",
            "--format",
            "text",
        ])
        .is_err());

        // recall
        assert!(
            Cli::try_parse_from(["sqlite-graphrag", "recall", "query", "--format", "json",])
                .is_ok()
        );
        assert!(
            Cli::try_parse_from(["sqlite-graphrag", "recall", "query", "--format", "text",])
                .is_err()
        );

        // rename
        assert!(Cli::try_parse_from([
            "sqlite-graphrag",
            "rename",
            "--name",
            "old",
            "--new-name",
            "new",
            "--format",
            "json",
        ])
        .is_ok());
        assert!(Cli::try_parse_from([
            "sqlite-graphrag",
            "rename",
            "--name",
            "old",
            "--new-name",
            "new",
            "--format",
            "markdown",
        ])
        .is_err());

        // vacuum
        assert!(Cli::try_parse_from(["sqlite-graphrag", "vacuum", "--format", "json",]).is_ok());
        assert!(Cli::try_parse_from(["sqlite-graphrag", "vacuum", "--format", "text",]).is_err());
    }
}
207| |
208| |impl Cli {
209| | /// Validates concurrency flags and returns a localised descriptive error if invalid.
210| | ///
211| | /// Requires that `crate::i18n::init()` has already been called (happens before this
212| | /// function in the `main` flow). In English it emits EN messages; in Portuguese it emits PT.
213| 0| pub fn validate_flags(&self) -> Result<(), String> {
214| 0| if let Some(n) = self.max_concurrency {
215| 0| if n == 0 {
216| 0| return Err(match current() {
217| 0| Language::English => "--max-concurrency must be >= 1".to_string(),
218| 0| Language::Portuguese => "--max-concurrency deve ser >= 1".to_string(),
219| | });
220| 0| }
221| 0| let teto = max_concurrency_ceiling();
222| 0| if n > teto {
223| 0| return Err(match current() {
224| 0| Language::English => format!(
225| 0| "--max-concurrency {n} exceeds the ceiling of {teto} (2×nCPUs) on this system"
226| | ),
227| 0| Language::Portuguese => format!(
228| 0| "--max-concurrency {n} excede o teto de {teto} (2×nCPUs) neste sistema"
229| | ),
230| | });
231| 0| }
232| 0| }
233| 0| Ok(())
234| 0| }
235| |}
236| |
impl Commands {
    /// Returns true for subcommands that load the ONNX model locally.
    ///
    /// The set is: `Init`, `Remember`, `RememberBatch`, `Recall`, `HybridSearch`
    /// and `DeepResearch`. Every other subcommand returns `false`.
    pub fn is_embedding_heavy(&self) -> bool {
        matches!(
            self,
            Self::Init(_)
                | Self::Remember(_)
                | Self::RememberBatch(_)
                | Self::Recall(_)
                | Self::HybridSearch(_)
                | Self::DeepResearch(_)
        )
    }

    /// Returns `true` for every subcommand except `Daemon`.
    ///
    /// NOTE(review): presumably decides whether the invocation takes one of the
    /// CLI concurrency slots capped by `--max-concurrency`; confirm against the caller.
    pub fn uses_cli_slot(&self) -> bool {
        !matches!(self, Self::Daemon(_))
    }
}
255| |
// Subcommand surface of the CLI; each variant wraps the argument struct defined
// in its `crate::commands::*` module. The `///` comments and `after_long_help`
// strings are rendered by clap as help text, so maintainer notes here use `//`.
#[derive(Subcommand)]
pub enum Commands {
    /// Initialize database and download embedding model
    #[command(after_long_help = "EXAMPLES:\n \
        # Initialize in current directory (default behavior)\n \
        sqlite-graphrag init\n\n \
        # Initialize at a specific path\n \
        sqlite-graphrag init --db /path/to/graphrag.sqlite\n\n \
        # Initialize using SQLITE_GRAPHRAG_HOME env var\n \
        SQLITE_GRAPHRAG_HOME=/data sqlite-graphrag init\n\n\
        NOTES:\n \
        - `init` is OPTIONAL: any subsequent CRUD command auto-initializes graphrag.sqlite if missing.\n \
        - As a side effect, `init` warms a smoke-test embedding which auto-spawns the persistent daemon (~600s idle timeout).")]
    Init(init::InitArgs),
    /// Run or control the persistent embedding daemon
    Daemon(daemon::DaemonArgs),
    /// Save a memory with optional entity graph
    #[command(after_long_help = "EXAMPLES:\n \
        # Inline body\n \
        sqlite-graphrag remember --name onboarding --type user --description \"intro\" --body \"hello\"\n\n \
        # Body from file\n \
        sqlite-graphrag remember --name doc1 --type document --description \"...\" --body-file ./README.md\n\n \
        # Body from stdin (pipe)\n \
        cat README.md | sqlite-graphrag remember --name doc1 --type document --description \"...\" --body-stdin\n\n \
        # Enable GLiNER entity extraction (disabled by default)\n \
        sqlite-graphrag remember --name rich --type note --description \"...\" --body \"...\" --enable-ner")]
    Remember(remember::RememberArgs),
    /// Batch-create memories from NDJSON stdin (one invocation, one slot)
    #[command(after_long_help = "EXAMPLES:\n \
        # Batch create from NDJSON\n \
        cat memories.ndjson | sqlite-graphrag remember-batch --force-merge --json\n\n \
        # Atomic batch\n \
        cat memories.ndjson | sqlite-graphrag remember-batch --transaction --json")]
    RememberBatch(remember_batch::RememberBatchArgs),
    /// Bulk-ingest every file under a directory as separate memories (NDJSON output)
    Ingest(ingest::IngestArgs),
    /// Search memories semantically
    #[command(after_long_help = "EXAMPLES:\n \
        # Top 10 semantic matches (default)\n \
        sqlite-graphrag recall \"agent memory\"\n\n \
        # Top 3 only\n \
        sqlite-graphrag recall \"agent memory\" -k 3\n\n \
        # Search across all namespaces\n \
        sqlite-graphrag recall \"agent memory\" --all-namespaces\n\n \
        # Disable graph traversal (vector-only)\n \
        sqlite-graphrag recall \"agent memory\" --no-graph")]
    Recall(recall::RecallArgs),
    /// Read a memory by exact name
    Read(read::ReadArgs),
    /// List memories with filters
    List(list::ListArgs),
    /// Soft-delete a memory
    Forget(forget::ForgetArgs),
    /// Permanently delete soft-deleted memories
    Purge(purge::PurgeArgs),
    /// Rename a memory preserving history
    Rename(rename::RenameArgs),
    /// Edit a memory's body or description
    Edit(edit::EditArgs),
    /// List all versions of a memory
    History(history::HistoryArgs),
    /// Restore a memory to a previous version
    Restore(restore::RestoreArgs),
    /// Search using hybrid vector + full-text search
    #[command(after_long_help = "EXAMPLES:\n \
        # Hybrid search combining KNN + FTS5 BM25 with RRF\n \
        sqlite-graphrag hybrid-search \"agent memory architecture\"\n\n \
        # Custom weights for vector vs full-text components\n \
        sqlite-graphrag hybrid-search \"agent\" --weight-vec 0.7 --weight-fts 0.3")]
    HybridSearch(hybrid_search::HybridSearchArgs),
    /// Show database health
    Health(health::HealthArgs),
    /// Apply pending schema migrations
    Migrate(migrate::MigrateArgs),
    /// Resolve namespace precedence for the current invocation
    NamespaceDetect(namespace_detect::NamespaceDetectArgs),
    /// Run PRAGMA optimize on the database
    Optimize(optimize::OptimizeArgs),
    /// Show database statistics
    Stats(stats::StatsArgs),
    /// Create a checkpointed copy safe for file sync
    SyncSafeCopy(sync_safe_copy::SyncSafeCopyArgs),
    /// Back up the database using the SQLite Online Backup API
    Backup(backup::BackupArgs),
    /// Run VACUUM after checkpointing the WAL
    Vacuum(vacuum::VacuumArgs),
    /// Create an explicit relationship between two entities
    Link(link::LinkArgs),
    /// Remove a specific relationship between two entities
    Unlink(unlink::UnlinkArgs),
    /// Deep parallel multi-hop GraphRAG research
    #[command(name = "deep-research")]
    DeepResearch(deep_research::DeepResearchArgs),
    /// List memories connected via the entity graph
    Related(related::RelatedArgs),
    /// Export a graph snapshot in json, dot or mermaid
    Graph(graph_export::GraphArgs),
    /// Export memories as NDJSON (one JSON line per memory, plus a summary line)
    Export(export::ExportArgs),
    /// FTS5 full-text search index management (rebuild or check)
    Fts(fts::FtsArgs),
    /// Vector index maintenance (orphan detection, purge, stats) — G39
    Vec(vec::VecArgs),
    /// List codex OAuth models accepted by ChatGPT Pro (G33).
    #[command(name = "codex-models")]
    CodexModels,
    /// Bulk-delete all relationships of a given type (e.g. mentions)
    PruneRelations(prune_relations::PruneRelationsArgs),
    /// Remove NER bindings (memory_entities rows) for an entity or all entities
    #[command(name = "prune-ner")]
    PruneNer(prune_ner::PruneNerArgs),
    /// Remove entities that have no memories and no relationships
    CleanupOrphans(cleanup_orphans::CleanupOrphansArgs),
    /// List entities linked to a specific memory
    MemoryEntities(memory_entities::MemoryEntitiesArgs),
    /// Manage cached resources (embedding models, etc.)
    Cache(cache::CacheArgs),
    /// Delete an entity and all its relationships from the graph
    #[command(name = "delete-entity")]
    DeleteEntity(delete_entity::DeleteEntityArgs),
    /// Reclassify one entity or a batch of entities to a new type
    Reclassify(reclassify::ReclassifyArgs),
    /// Rename an entity preserving all relationships and memory bindings
    #[command(name = "rename-entity")]
    RenameEntity(rename_entity::RenameEntityArgs),
    /// Merge multiple source entities into a single target entity
    #[command(name = "merge-entities")]
    MergeEntities(merge_entities::MergeEntitiesArgs),
    /// Enrich graph memories and entities using an LLM provider
    Enrich(enrich::EnrichArgs),
    /// Reclassify relationship types across the graph using rules or LLM judgment
    #[command(name = "reclassify-relation")]
    ReclassifyRelation(reclassify_relation::ReclassifyRelationArgs),
    /// Normalize entity names (deduplicate, kebab-case, merge near-duplicates)
    #[command(name = "normalize-entities")]
    NormalizeEntities(normalize_entities::NormalizeEntitiesArgs),
    /// Generate shell completions for Bash, Zsh, Fish, PowerShell, or Elvish
    Completions(completions::CompletionsArgs),
    // Hidden from `--help` (`hide = true`).
    #[command(name = "debug-schema", hide = true)]
    DebugSchema(debug_schema::DebugSchemaArgs),
}
397| |
// Memory categories accepted on the command line (`clap::ValueEnum`).
// `Document` is the `Default` variant.
#[derive(Copy, Clone, Debug, Default, clap::ValueEnum)]
pub enum MemoryType {
    User,
    Feedback,
    Project,
    Reference,
    Decision,
    Incident,
    Skill,
    #[default]
    Document,
    Note,
}
411| |
// Checks how `is_embedding_heavy()` classifies a sample of parsed subcommands.
#[cfg(test)]
mod heavy_concurrency_tests {
    use super::*;

    // `init`, `remember`, `recall` and `hybrid-search` must all be reported as
    // embedding-heavy.
    #[test]
    fn command_heavy_detects_init_and_embeddings() {
        let init = Cli::try_parse_from(["sqlite-graphrag", "init"]).expect("parse init");
        assert!(init.command.is_embedding_heavy());

        let remember = Cli::try_parse_from([
            "sqlite-graphrag",
            "remember",
            "--name",
            "test-memory",
            "--type",
            "project",
            "--description",
            "desc",
        ])
        .expect("parse remember");
        assert!(remember.command.is_embedding_heavy());

        let recall =
            Cli::try_parse_from(["sqlite-graphrag", "recall", "query"]).expect("parse recall");
        assert!(recall.command.is_embedding_heavy());

        let hybrid = Cli::try_parse_from(["sqlite-graphrag", "hybrid-search", "query"])
            .expect("parse hybrid");
        assert!(hybrid.command.is_embedding_heavy());
    }

    // `stats` must NOT be reported as embedding-heavy.
    #[test]
    fn command_light_does_not_mark_stats() {
        let stats = Cli::try_parse_from(["sqlite-graphrag", "stats"]).expect("parse stats");
        assert!(!stats.command.is_embedding_heavy());
    }
}
449| |
450| |impl MemoryType {
451| 0| pub fn as_str(&self) -> &'static str {
452| 0| match self {
453| 0| Self::User => "user",
454| 0| Self::Feedback => "feedback",
455| 0| Self::Project => "project",
456| 0| Self::Reference => "reference",
457| 0| Self::Decision => "decision",
458| 0| Self::Incident => "incident",
459| 0| Self::Skill => "skill",
460| 0| Self::Document => "document",
461| 0| Self::Note => "note",
462| | }
463| 0| }
464| |}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/backup.rs:
1| |//! Handler for the `backup` CLI subcommand.
2| |//!
3| |//! Uses the SQLite Online Backup API (via rusqlite) to produce a consistent
4| |//! point-in-time copy of the database file even while the database is in use.
5| |
6| |use crate::errors::AppError;
7| |use crate::output;
8| |use crate::paths::AppPaths;
9| |use crate::storage::connection::open_ro;
10| |use serde::Serialize;
11| |use std::path::PathBuf;
12| |use tempfile::NamedTempFile;
13| |
/// Default number of pages copied per backup step.
///
/// G38: the previous default of 100 pages with 50 ms sleep between steps
/// was the dominant cost on large databases (4.3 GB took ~9 minutes purely
/// on sleep). 1000 pages × 5 ms is ~25× faster on a 4.3 GB database while
/// remaining gentle on SSD I/O. Override with `--backup-step-size`.
const DEFAULT_BACKUP_STEP_PAGES: usize = 1000;
/// Default pause, in milliseconds, between two backup steps.
///
/// Override with `--backup-step-sleep-ms`, or remove the pause entirely with
/// `--backup-no-sleep`.
const DEFAULT_BACKUP_STEP_SLEEP_MS: u64 = 5;
22| |
// Command-line arguments of the `backup` subcommand.
// Field-level `///` comments below double as clap help text, so extra maintainer
// notes are written as plain `//` comments to keep the CLI output unchanged.
#[derive(clap::Args)]
#[command(after_long_help = "EXAMPLES:\n \
    # Back up the default database to a specific path\n \
    sqlite-graphrag backup --output /backup/graphrag-$(date +%F).sqlite\n\n \
    # Back up a custom source database\n \
    sqlite-graphrag backup --db /data/graphrag.sqlite --output /backup/snapshot.sqlite\n\n \
    # Tuned for a 4.3 GB database on local SSD\n \
    sqlite-graphrag backup --output /backup/snap.sqlite --backup-step-size 2000 --backup-step-sleep-ms 2\n\n \
    # Maximum throughput (no sleep between steps — risks I/O contention)\n \
    sqlite-graphrag backup --output /backup/snap.sqlite --backup-no-sleep\n\n \
NOTES:\n \
    Uses the SQLite Online Backup API: safe to run while the database is in use.\n \
    The destination is written atomically via tempfile-rename in the same directory.\n \
    If the process is interrupted, the previous file (if any) remains intact.\n \
    On Unix the destination is chmod 0600 after the backup completes.")]
pub struct BackupArgs {
    /// Destination path for the backup file. Required.
    #[arg(long, value_name = "PATH")]
    pub output: PathBuf,
    // Hidden compatibility flag; the handler never reads it.
    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
    pub json: bool,
    // Source database path; may also come from `SQLITE_GRAPHRAG_DB_PATH`.
    // When absent, `AppPaths::resolve` picks the location.
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,
    /// Number of pages copied per backup step. Default: 1000 (was 100 before v1.0.69).
    /// Larger values finish faster on local SSD but may contend on NFS.
    #[arg(long, value_name = "PAGES", default_value_t = DEFAULT_BACKUP_STEP_PAGES)]
    pub backup_step_size: usize,
    /// Sleep duration in milliseconds between backup steps. Default: 5 (was 50 before v1.0.69).
    /// Ignored when --backup-no-sleep is set.
    #[arg(long, value_name = "MILLIS", default_value_t = DEFAULT_BACKUP_STEP_SLEEP_MS)]
    pub backup_step_sleep_ms: u64,
    /// Disable the inter-step sleep entirely. Maximum throughput, but risks
    /// starving concurrent I/O on shared storage.
    #[arg(long, default_value_t = false)]
    pub backup_no_sleep: bool,
    /// Emit a progress line to stderr every N pages (G38 observability).
    /// Default: 100 (every 100 pages = ~400 KB). Set to 0 to disable.
    // The type is signed, so negative values are accepted by the parser.
    #[arg(long, value_name = "PAGES", default_value_t = 100)]
    pub backup_progress: i32,
}
63| |
/// JSON document emitted on stdout after a successful backup.
#[derive(Serialize)]
struct BackupResponse {
    /// Action tag; the handler always sets it to `"backed_up"`.
    action: String,
    /// Display form of the source database path.
    source: String,
    /// Display form of the destination path given via `--output`.
    destination: String,
    /// Size of the destination file in bytes (0 when its metadata cannot be read).
    size_bytes: u64,
    /// Wall-clock duration of the handler in milliseconds.
    elapsed_ms: u64,
    /// Pages copied, computed as `pagecount - remaining` from the final backup progress.
    pages_copied: Option<i64>,
    /// Pages-per-step value used for the run.
    step_size: usize,
}
74| |
75| 0|pub fn run(args: BackupArgs) -> Result<(), AppError> {
76| 0| let start = std::time::Instant::now();
77| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
78| |
79| 0| crate::storage::connection::ensure_db_ready(&paths)?;
80| |
81| | // Validate: destination must differ from source.
82| 0| if args.output == paths.db {
83| 0| return Err(AppError::Validation(
84| 0| "destination path must differ from the source database path".to_string(),
85| 0| ));
86| 0| }
87| |
88| | // Create parent directories if necessary.
89| 0| let parent = args.output.parent().unwrap_or(std::path::Path::new("."));
90| 0| if !parent.as_os_str().is_empty() {
91| 0| std::fs::create_dir_all(parent)?;
92| 0| }
93| |
94| | // Atomic write: backup to tempfile in the SAME directory, then rename.
95| 0| let temp = NamedTempFile::new_in(parent).map_err(AppError::Io)?;
96| 0| let temp_path = temp.path().to_path_buf();
97| |
98| 0| let src_conn = open_ro(&paths.db)?;
99| 0| let mut dst_conn = rusqlite::Connection::open(&temp_path)?;
100| |
101| 0| let step_size = args.backup_step_size.max(1);
102| 0| let sleep = if args.backup_no_sleep {
103| 0| std::time::Duration::ZERO
104| | } else {
105| 0| std::time::Duration::from_millis(args.backup_step_sleep_ms)
106| | };
107| |
108| 0| let pages_copied: Option<i64> = {
109| 0| let backup = rusqlite::backup::Backup::new(&src_conn, &mut dst_conn)?;
110| | // G38: drive the backup in a manual step() loop so we can emit
111| | // per-step progress events without depending on a Copy closure
112| | // (which the rusqlite Progress callback requires). The loop
113| | // mirrors run_to_completion but exposes progress for observability.
114| 0| let step_size_i32: i32 = step_size.try_into().unwrap_or(1000);
115| 0| let progress_every = args.backup_progress.max(1);
116| 0| let mut last_emit_pages: i32 = -1;
117| | loop {
118| | use rusqlite::backup::StepResult;
119| 0| match backup.step(step_size_i32) {
120| | Ok(StepResult::More) => {
121| | // step returned More: backup still in progress.
122| 0| if progress_every > 0 {
123| 0| let p = backup.progress();
124| 0| let copied = p.pagecount - p.remaining;
125| 0| if copied > 0 && copied - last_emit_pages >= progress_every {
126| 0| last_emit_pages = copied;
127| 0| let percent = if p.pagecount > 0 {
128| 0| (copied as f64 / p.pagecount as f64) * 100.0
129| | } else {
130| 0| 100.0
131| | };
132| 0| eprintln!(
133| 0| "{{\"progress\":{{\"pages_copied\":{copied},\"total_pages\":{pc},\"percent\":{pct:.2}}}}}",
134| | pc = p.pagecount,
135| | pct = percent
136| | );
137| 0| }
138| 0| }
139| 0| if !sleep.is_zero() {
140| 0| std::thread::sleep(sleep);
141| 0| }
142| | }
143| 0| Ok(StepResult::Done) => break, // backup complete
144| 0| Ok(_) => {
145| 0| // Transient (Busy / Locked on newer rusqlite or any
146| 0| // future non-exhaustive variant): retry after backoff.
147| 0| std::thread::sleep(std::time::Duration::from_millis(50));
148| 0| }
149| 0| Err(e) => return Err(AppError::Database(e)),
150| | }
151| | }
152| | // `Progress { remaining, pagecount }` (see rusqlite::backup::Progress):
153| | // pages already copied = pagecount - remaining.
154| 0| let progress = backup.progress();
155| 0| let copied = (progress.pagecount - progress.remaining).max(0);
156| 0| Some(copied as i64)
157| | };
158| 0| drop(dst_conn);
159| |
160| 0| temp.persist(&args.output)
161| 0| .map_err(|e| AppError::Io(e.error))?;
162| |
163| | // Apply 0600 permissions on Unix to prevent leakage in shared directories.
164| | #[cfg(unix)]
165| | {
166| | use std::os::unix::fs::PermissionsExt;
167| 0| if let Ok(meta) = std::fs::metadata(&args.output) {
168| 0| let mut perms = meta.permissions();
169| 0| perms.set_mode(0o600);
170| 0| if let Err(e) = std::fs::set_permissions(&args.output, perms) {
171| 0| tracing::warn!(target: "backup",
172| 0| path = %args.output.display(),
173| | error = %e,
174| 0| "failed to set 0600 permissions on backup file"
175| | );
176| 0| }
177| 0| }
178| | }
179| | #[cfg(windows)]
180| | {
181| | tracing::debug!(target: "backup",
182| | path = %args.output.display(),
183| | "skipping Unix mode 0o600 on Windows; NTFS DACL default is private-to-user"
184| | );
185| | }
186| |
187| 0| let size_bytes = std::fs::metadata(&args.output)
188| 0| .map(|m| m.len())
189| 0| .unwrap_or(0);
190| |
191| 0| output::emit_json(&BackupResponse {
192| 0| action: "backed_up".to_string(),
193| 0| source: paths.db.display().to_string(),
194| 0| destination: args.output.display().to_string(),
195| 0| size_bytes,
196| 0| elapsed_ms: start.elapsed().as_millis() as u64,
197| 0| pages_copied,
198| 0| step_size,
199| 0| })?;
200| |
201| 0| Ok(())
202| 0|}
203| |
// Unit tests for the backup response shape and the default tuning constants.
// None of them opens a database or calls `run`.
#[cfg(test)]
mod tests {
    use super::*;

    // Every field of `BackupResponse` must appear in the serialized JSON.
    #[test]
    fn backup_response_serializes_all_fields() {
        let resp = BackupResponse {
            action: "backed_up".to_string(),
            source: "/data/graphrag.sqlite".to_string(),
            destination: "/backup/snapshot.sqlite".to_string(),
            size_bytes: 32768,
            elapsed_ms: 42,
            pages_copied: Some(512),
            step_size: 1000,
        };
        let json = serde_json::to_value(&resp).expect("serialization failed");
        assert_eq!(json["action"], "backed_up");
        assert_eq!(json["source"], "/data/graphrag.sqlite");
        assert_eq!(json["destination"], "/backup/snapshot.sqlite");
        assert_eq!(json["size_bytes"], 32768u64);
        assert_eq!(json["elapsed_ms"], 42u64);
        assert_eq!(json["step_size"], 1000usize);
        assert_eq!(json["pages_copied"], 512i64);
    }

    // The `action` tag survives serialization unchanged.
    #[test]
    fn backup_response_action_is_backed_up() {
        let resp = BackupResponse {
            action: "backed_up".to_string(),
            source: "/a.sqlite".to_string(),
            destination: "/b.sqlite".to_string(),
            size_bytes: 0,
            elapsed_ms: 0,
            pages_copied: None,
            step_size: 1000,
        };
        let json = serde_json::to_value(&resp).expect("serialization failed");
        assert_eq!(
            json["action"], "backed_up",
            "action must always be 'backed_up'"
        );
    }

    // NOTE(review): this test re-implements the source/destination comparison
    // locally instead of calling `run`, so it does not exercise the real guard.
    #[test]
    fn backup_rejects_destination_equal_to_source() {
        // Simulate the guard without a real DB.
        let src = PathBuf::from("/tmp/graphrag.sqlite");
        let dst = PathBuf::from("/tmp/graphrag.sqlite");
        let result: Result<(), AppError> = if dst == src {
            Err(AppError::Validation(
                "destination path must differ from the source database path".to_string(),
            ))
        } else {
            Ok(())
        };
        assert!(
            result.is_err(),
            "must reject identical source and destination"
        );
        if let Err(AppError::Validation(msg)) = result {
            assert!(msg.contains("destination path must differ"));
        }
    }

    // A zero-byte size still serializes as an unsigned integer.
    #[test]
    fn backup_response_size_bytes_zero_is_valid() {
        let resp = BackupResponse {
            action: "backed_up".to_string(),
            source: "/a.sqlite".to_string(),
            destination: "/b.sqlite".to_string(),
            size_bytes: 0,
            elapsed_ms: 1,
            pages_copied: Some(0),
            step_size: 1000,
        };
        let json = serde_json::to_value(&resp).expect("serialization failed");
        assert!(json["size_bytes"].as_u64().is_some());
    }

    // Pins the tuned defaults so an accidental change is caught.
    #[test]
    fn backup_default_step_size_is_one_thousand() {
        // G38: the historical default of 100 pages caused backups of 4.3 GB
        // databases to take 9 minutes solely on sleep. The new default of
        // 1000 pages with 5 ms sleep gives ~25x speedup.
        assert_eq!(DEFAULT_BACKUP_STEP_PAGES, 1000);
        assert_eq!(DEFAULT_BACKUP_STEP_SLEEP_MS, 5);
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/cache.rs:
1| |//! Handler for the `cache` CLI subcommand and its nested operations.
2| |//!
3| |//! Manages cached resources such as the multilingual-e5-small ONNX model and
4| |//! the GLiNER NER classifier downloaded into the XDG cache directory on first
5| |//! `init`. Used to reclaim disk space or recover from corrupted cache state.
6| |
7| |use crate::errors::AppError;
8| |use crate::output;
9| |use crate::paths::AppPaths;
10| |use serde::Serialize;
11| |
// Top-level arguments of the `cache` subcommand; the actual operation is
// selected by the nested `CacheCommands` subcommand.
#[derive(clap::Args)]
#[command(after_long_help = "EXAMPLES:\n \
    # Remove cached embedding/NER model files (forces re-download on next init)\n \
    sqlite-graphrag cache clear-models\n\n \
    # Skip the confirmation prompt\n \
    sqlite-graphrag cache clear-models --yes\n\n \
    # List cached model files\n \
    sqlite-graphrag cache list\n\n \
    # List cached model files as JSON\n \
    sqlite-graphrag cache list --json")]
pub struct CacheArgs {
    #[command(subcommand)]
    pub command: CacheCommands,
}
26| |
// Operations available under `cache`. The `///` comments on the variants are
// used by clap as the subcommand descriptions.
#[derive(clap::Subcommand)]
pub enum CacheCommands {
    /// Remove cached embedding/NER model files (forces re-download on next `init`).
    ClearModels(ClearModelsArgs),
    /// List cached embedding/NER model files with sizes and total disk usage.
    List(CacheListArgs),
}
34| |
// Arguments of `cache list`. Without `--json` the handler prints a plain-text table.
#[derive(clap::Args)]
pub struct CacheListArgs {
    /// Output as JSON.
    #[arg(long)]
    pub json: bool,
}
41| |
// Arguments of `cache clear-models`.
#[derive(clap::Args)]
pub struct ClearModelsArgs {
    /// Skip confirmation prompt and proceed with deletion immediately.
    #[arg(long, default_value_t = false, help = "Skip confirmation prompt")]
    pub yes: bool,
    /// Accepted for compatibility; has no effect because JSON is always emitted on stdout.
    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
    pub json: bool,
}
51| |
/// JSON document emitted on stdout by `cache clear-models`.
#[derive(Serialize)]
struct ClearModelsResponse {
    /// Display form of the models cache directory that was targeted.
    cache_path: String,
    /// Whether the directory existed when the command ran.
    existed: bool,
    /// Total size in bytes of the files found before removal (0 if it could not be measured).
    bytes_freed: u64,
    /// Number of files found before removal (0 if they could not be counted).
    files_removed: usize,
    /// Total execution time in milliseconds from handler start to serialisation.
    elapsed_ms: u64,
}
61| |
62| 0|pub fn run(args: CacheArgs) -> Result<(), AppError> {
63| 0| match args.command {
64| 0| CacheCommands::ClearModels(a) => clear_models(a),
65| 0| CacheCommands::List(a) => run_list(a),
66| | }
67| 0|}
68| |
69| 1|fn clear_models(args: ClearModelsArgs) -> Result<(), AppError> {
70| 1| let inicio = std::time::Instant::now();
71| | // Resolve the canonical models directory through AppPaths to honour
72| | // SQLITE_GRAPHRAG_CACHE_DIR overrides used by tests and CI.
73| 1| let paths = AppPaths::resolve(None)?;
^0
74| 1| let models_dir = paths.models.clone();
75| |
76| 1| if !args.yes {
77| | // For machine consumption stay deterministic: refuse without --yes.
78| 1| return Err(AppError::Validation(
79| 1| "destructive operation: pass --yes to confirm cache deletion".to_string(),
80| 1| ));
81| 0| }
82| |
83| 0| let existed = models_dir.exists();
84| 0| let mut bytes_freed: u64 = 0;
85| 0| let mut files_removed: usize = 0;
86| |
87| 0| if existed {
88| 0| bytes_freed = dir_size(&models_dir).unwrap_or(0);
89| 0| files_removed = count_files(&models_dir).unwrap_or(0);
90| 0| std::fs::remove_dir_all(&models_dir)?;
91| 0| }
92| |
93| 0| output::emit_json(&ClearModelsResponse {
94| 0| cache_path: models_dir.display().to_string(),
95| 0| existed,
96| 0| bytes_freed,
97| 0| files_removed,
98| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
99| 0| })?;
100| |
101| 0| Ok(())
102| 1|}
103| |
/// One cached file as reported by `cache list`.
#[derive(Serialize)]
struct CacheFileEntry {
    /// Path relative to the cache root (falls back to the full path if the prefix cannot be stripped).
    name: String,
    /// Display form of the file's full path.
    path: String,
    /// File size in bytes, taken from the directory entry's metadata.
    size_bytes: u64,
    /// Modification time formatted as `YYYY-MM-DDTHH:MM:SSZ`, or `"unknown"` when unavailable.
    modified_at: String,
}
111| |
/// JSON document emitted on stdout by `cache list --json`.
#[derive(Serialize)]
struct CacheListResponse {
    /// Version of this response layout; the handler sets it to 1.
    schema_version: u32,
    /// Display form of the models cache directory.
    cache_path: String,
    /// Cached files, sorted by `name`.
    files: Vec<CacheFileEntry>,
    /// Sum of `size_bytes` over all listed files.
    total_bytes: u64,
    /// `total_bytes` rendered in a human-readable unit (B, KB, MB or GB).
    total_human: String,
}
120| |
/// Renders a byte count using binary units (1 KB = 1024 B).
///
/// Values of at least one KB, MB or GB are shown with one decimal place in the
/// largest unit that fits; smaller values are shown as whole bytes.
fn format_bytes_human(bytes: u64) -> String {
    const KB: u64 = 1024;
    const MB: u64 = KB * 1024;
    const GB: u64 = MB * 1024;
    // Ordered from largest to smallest so the first match is the best fit.
    const UNITS: [(u64, &str); 3] = [(GB, "GB"), (MB, "MB"), (KB, "KB")];

    UNITS
        .iter()
        .find(|(threshold, _)| bytes >= *threshold)
        .map(|(threshold, label)| format!("{:.1} {label}", bytes as f64 / *threshold as f64))
        .unwrap_or_else(|| format!("{bytes} B"))
}
135| |
136| 0|fn collect_cache_files(
137| 0| dir: &std::path::Path,
138| 0| base: &std::path::Path,
139| 0| entries: &mut Vec<CacheFileEntry>,
140| 0|) -> std::io::Result<()> {
141| 0| for entry in std::fs::read_dir(dir)? {
142| 0| let entry = entry?;
143| 0| let meta = entry.metadata()?;
144| 0| let path = entry.path();
145| 0| if meta.is_dir() {
146| 0| collect_cache_files(&path, base, entries)?;
147| | } else {
148| 0| let size_bytes = meta.len();
149| 0| let relative = path.strip_prefix(base).unwrap_or(&path);
150| 0| let name = relative.to_string_lossy().into_owned();
151| 0| let modified_at = meta
152| 0| .modified()
153| 0| .ok()
154| 0| .map(|t| {
155| 0| let secs = t
156| 0| .duration_since(std::time::UNIX_EPOCH)
157| 0| .unwrap_or_default()
158| 0| .as_secs();
159| | // Format as RFC 3339 (UTC) without chrono dependency.
160| 0| let secs_i64 = secs as i64;
161| 0| let (y, mo, d, h, mi, s) = epoch_to_ymd_hms(secs_i64);
162| 0| format!("{y:04}-{mo:02}-{d:02}T{h:02}:{mi:02}:{s:02}Z")
163| 0| })
164| 0| .unwrap_or_else(|| "unknown".to_string());
165| 0| entries.push(CacheFileEntry {
166| 0| name,
167| 0| path: path.display().to_string(),
168| 0| size_bytes,
169| 0| modified_at,
170| 0| });
171| | }
172| | }
173| 0| Ok(())
174| 0|}
175| |
/// Converts Unix epoch seconds to (year, month, day, hour, minute, second) UTC.
///
/// Uses the proleptic Gregorian calendar. Negative inputs (instants before
/// 1970-01-01T00:00:00Z) are supported: the time of day is derived with
/// Euclidean division so it is always in range, and the year is walked
/// backwards from 1970.
///
/// The year is found by stepping one year at a time, so the cost grows with
/// the distance from 1970; this is intended for file timestamps, not for
/// arbitrary extreme values.
fn epoch_to_ymd_hms(secs: i64) -> (i32, u8, u8, u8, u8, u8) {
    const SECS_PER_DAY: i64 = 86_400;

    // Euclidean split keeps the second-of-day in 0..86_400 even for negative input.
    let secs_of_day = secs.rem_euclid(SECS_PER_DAY);
    let mut days = secs.div_euclid(SECS_PER_DAY);

    // All three values are bounded (< 60, < 60, < 24), so the casts cannot truncate.
    let s = (secs_of_day % 60) as u8;
    let mi = (secs_of_day / 60 % 60) as u8;
    let h = (secs_of_day / 3_600) as u8;

    let days_in_year = |year: i32| -> i64 {
        if is_leap(year) {
            366
        } else {
            365
        }
    };

    // Compute year/month/day from days since epoch (1970-01-01).
    let mut y = 1970i32;
    // Before the epoch: borrow whole years until the day offset is non-negative.
    while days < 0 {
        y -= 1;
        days += days_in_year(y);
    }
    // After the epoch: consume whole years.
    while days >= days_in_year(y) {
        days -= days_in_year(y);
        y += 1;
    }

    let february = if is_leap(y) { 29 } else { 28 };
    let months: [i64; 12] = [31, february, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31];
    let mut mo = 1u8;
    for &days_in_m in &months {
        if days < days_in_m {
            break;
        }
        days -= days_in_m;
        mo += 1;
    }
    // `days` is now the zero-based day within the month (0..=30).
    let d = (days + 1) as u8;
    (y, mo, d, h, mi, s)
}

/// Returns `true` when `y` is a leap year in the Gregorian calendar
/// (divisible by 4 and not by 100, or divisible by 400).
fn is_leap(y: i32) -> bool {
    (y % 4 == 0 && y % 100 != 0) || y % 400 == 0
}
224| |
225| 0|fn run_list(args: CacheListArgs) -> Result<(), AppError> {
226| 0| let paths = AppPaths::resolve(None)?;
227| 0| let models_dir = &paths.models;
228| |
229| 0| let mut entries: Vec<CacheFileEntry> = Vec::with_capacity(4);
230| 0| if models_dir.exists() {
231| 0| collect_cache_files(models_dir, models_dir, &mut entries).map_err(AppError::Io)?;
232| 0| }
233| |
234| 0| entries.sort_unstable_by(|a, b| a.name.cmp(&b.name));
235| 0| let total_bytes: u64 = entries.iter().map(|e| e.size_bytes).sum();
236| 0| let total_human = format_bytes_human(total_bytes);
237| 0| let n_files = entries.len();
238| |
239| 0| if args.json {
240| 0| output::emit_json(&CacheListResponse {
241| 0| schema_version: 1,
242| 0| cache_path: models_dir.display().to_string(),
243| 0| files: entries,
244| 0| total_bytes,
245| 0| total_human,
246| 0| })?;
247| 0| } else if entries.is_empty() {
248| 0| output::emit_text("(empty)");
249| 0| } else {
250| 0| for e in &entries {
251| 0| output::emit_text(&format!(
252| 0| "{:<40} {:>10} {}",
253| 0| e.name,
254| 0| format_bytes_human(e.size_bytes),
255| 0| e.modified_at
256| 0| ));
257| 0| }
258| 0| output::emit_text(&format!("\nTOTAL: {n_files} files, {total_human}"));
259| | }
260| |
261| 0| Ok(())
262| 0|}
263| |
/// Returns the combined size in bytes of all files below `path`.
///
/// Errors while reading `path` itself or one of its direct entries are
/// returned; a subdirectory that cannot be measured contributes 0 instead of
/// failing the whole computation. The sum saturates rather than overflowing.
fn dir_size(path: &std::path::Path) -> std::io::Result<u64> {
    std::fs::read_dir(path)?.try_fold(0u64, |running, item| {
        let item = item?;
        let metadata = item.metadata()?;
        let bytes = if metadata.is_dir() {
            dir_size(&item.path()).unwrap_or(0)
        } else {
            metadata.len()
        };
        Ok(running.saturating_add(bytes))
    })
}
277| |
/// Returns the number of non-directory entries below `path`, recursing into
/// subdirectories.
///
/// Errors while reading `path` itself or one of its direct entries are
/// returned; a subdirectory that cannot be read contributes 0 instead of
/// failing the whole count.
fn count_files(path: &std::path::Path) -> std::io::Result<usize> {
    std::fs::read_dir(path)?.try_fold(0usize, |running, item| {
        let item = item?;
        let metadata = item.metadata()?;
        Ok(if metadata.is_dir() {
            running.saturating_add(count_files(&item.path()).unwrap_or(0))
        } else {
            running + 1
        })
    })
}
291| |
// Unit tests for the `cache clear-models` response shape and confirmation guard.
#[cfg(test)]
mod tests {
    use super::*;

    // Serialized output must carry the values the handler put into the response.
    #[test]
    fn clear_models_response_serializes_all_fields() {
        let resp = ClearModelsResponse {
            cache_path: "/tmp/sqlite-graphrag/models".to_string(),
            existed: true,
            bytes_freed: 465_000_000,
            files_removed: 14,
            elapsed_ms: 12,
        };
        let json = serde_json::to_value(&resp).expect("serialization");
        assert_eq!(json["existed"], true);
        assert_eq!(json["bytes_freed"], 465_000_000u64);
        assert_eq!(json["files_removed"], 14);
        assert_eq!(json["elapsed_ms"], 12);
    }

    // Without `--yes` the handler must refuse with a validation error.
    #[test]
    fn clear_models_without_yes_returns_validation_error() {
        let args = ClearModelsArgs {
            yes: false,
            json: false,
        };
        let result = clear_models(args);
        assert!(matches!(result, Err(AppError::Validation(_))));
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/claude_runner.rs:
1| |//! Shared module for spawning Claude Code (`claude -p`) subprocesses.
2| |//!
3| |//! Eliminates duplication between `enrich.rs` and `ingest_claude.rs` (G02).
4| |//! Detects `terminal_reason: "max_turns"` in the JSON output (G03).
5| |
6| |use crate::errors::AppError;
7| |use std::path::Path;
8| |use std::process::{Command, Stdio};
9| |
10| |/// Minimum Claude Code version required for structured JSON output.
11| |const MIN_CLAUDE_VERSION: &str = "2.1.0";
12| |
13| |/// Environment variables whitelisted for the subprocess.
14| |const ENV_WHITELIST: &[&str] = &[
15| | "PATH",
16| | "HOME",
17| | "USER",
18| | "SHELL",
19| | "TERM",
20| | "LANG",
21| | "XDG_CONFIG_HOME",
22| | "XDG_DATA_HOME",
23| | "XDG_RUNTIME_DIR",
24| | // NOTE: `ANTHROPIC_API_KEY` is INTENTIONALLY ABSENT from this whitelist
25| | // (gaps.md:47). The OAuth-only flow uses the session token from
26| | // `~/.claude/.credentials.json` (or the OS keychain), not an env var.
27| | // The OAuth-only guard in `build_claude_command` aborts the spawn if
28| | // `ANTHROPIC_API_KEY` is set in the environment, but defence-in-depth
29| | // also requires the variable to never reach the child process.
30| | "CLAUDE_CONFIG_DIR",
31| | "TMPDIR",
32| | "TMP",
33| | "TEMP",
34| | "DYLD_FALLBACK_LIBRARY_PATH",
35| |];
36| |
37| |/// Windows-only environment variables.
38| |#[cfg(windows)]
39| |const ENV_WHITELIST_WINDOWS: &[&str] = &[
40| | "LOCALAPPDATA",
41| | "APPDATA",
42| | "USERPROFILE",
43| | "SystemRoot",
44| | "COMSPEC",
45| | "PATHEXT",
46| | "HOMEPATH",
47| | "HOMEDRIVE",
48| |];
49| |
50| |/// Default virtual memory limit for LLM subprocesses (4 GiB).
51| |#[cfg(target_os = "linux")]
52| |const DEFAULT_SUBPROCESS_MEMORY_LIMIT_MB: u64 = 4096;
53| |
54| |// G28-C (v1.0.69): process lifecycle. The G28 gap asks for
55| |// `tokio::process::Command::kill_on_drop(true)`. This codebase uses
56| |// `std::process::Command` (synchronous) so the tokio helper is not
57| |// available. Equivalent defence-in-depth is provided by:
58| |//
59| |// 1. `SIGTERM` via `libc::kill` in the timeout branch of `run_claude`
60| |// and `run_codex` (graceful — gives the child a chance to clean up
61| |// MCP children and write logs).
62| |// 2. `child.kill()` (SIGKILL) if SIGTERM was ignored.
63| |// 3. `reaper::scan_and_kill_orphans()` at startup, which walks `/proc`
64| |// and reaps any `claude`/`codex` processes that were orphaned by a
65| |// previous crash.
66| |//
67| |// SIGKILL on drop is intentionally NOT used because (a) the gaps.md
68| |// Passo C warning flags it as risky per tokio-rs/tokio#7082, and (b)
69| |// the SIGTERM-then-SIGKILL pair covers the same threat model with
70| |// better cleanup behaviour.
71| |
72| |/// Spawns a command with a virtual memory limit via `setrlimit(RLIMIT_AS)`.
73| |///
74| |/// On Linux, applies the limit in a `pre_exec` hook before the child process
75| |/// starts. On non-Linux platforms, falls back to an unlimited spawn.
76| |/// The limit is read from `SQLITE_GRAPHRAG_SUBPROCESS_MEMORY_LIMIT_MB`
77| |/// (default: 4096 MiB).
78| |#[cfg(target_os = "linux")]
79| |pub fn spawn_with_memory_limit(cmd: &mut Command) -> std::io::Result<std::process::Child> {
80| | use std::os::unix::process::CommandExt;
81| | let max_mb: u64 = std::env::var("SQLITE_GRAPHRAG_SUBPROCESS_MEMORY_LIMIT_MB")
82| | .ok()
83| | .and_then(|v| v.parse().ok())
84| | .unwrap_or(DEFAULT_SUBPROCESS_MEMORY_LIMIT_MB);
85| | let max_bytes = max_mb * 1024 * 1024;
86| | // SAFETY: pre_exec closure runs between fork() and exec() in the
87| | // single-threaded child process — no other threads exist.
88| | // libc::setsid and libc::setrlimit are async-signal-safe per POSIX.1-2008 §2.4.3.
89| | // RLIMIT_AS limits virtual address space, not physical RSS.
90| | // setsid failure with EPERM is tolerated (process already a session leader).
91| | // On setrlimit failure, Err(last_os_error()) prevents exec.
92| | unsafe {
93| | cmd.pre_exec(move || {
94| | let sid = libc::setsid();
95| | if sid == -1 {
96| | let err = std::io::Error::last_os_error();
97| | if err.raw_os_error() != Some(libc::EPERM) {
98| | return Err(err);
99| | }
100| | }
101| | let limit = libc::rlimit {
102| | rlim_cur: max_bytes,
103| | rlim_max: max_bytes,
104| | };
105| | if libc::setrlimit(libc::RLIMIT_AS, &limit) != 0 {
106| | return Err(std::io::Error::last_os_error());
107| | }
108| | Ok(())
109| | });
110| | }
111| | tracing::debug!(
112| | target: "process",
113| | program = ?cmd.get_program(),
114| | args = ?cmd.get_args().collect::<Vec<_>>(),
115| | "spawning external process"
116| | );
117| | cmd.spawn()
118| |}
119| |
/// Non-Linux variant: spawns the command with no memory limit applied.
///
/// On Unix targets (macOS, FreeBSD) the child still calls `setsid` before
/// exec so it runs in its own session; an `EPERM` failure from `setsid` is
/// tolerated, any other failure aborts the spawn. On non-Unix targets the
/// command is spawned as is.
#[cfg(not(target_os = "linux"))]
pub fn spawn_with_memory_limit(cmd: &mut Command) -> std::io::Result<std::process::Child> {
    #[cfg(unix)]
    {
        use std::os::unix::process::CommandExt;
        // SAFETY: setsid() is async-signal-safe per POSIX.1-2008 §2.4.3.
        // Creates independent session for cascade termination.
        unsafe {
            cmd.pre_exec(|| {
                if libc::setsid() != -1 {
                    return Ok(());
                }
                let failure = std::io::Error::last_os_error();
                match failure.raw_os_error() {
                    Some(libc::EPERM) => Ok(()),
                    _ => Err(failure),
                }
            });
        }
    }
    tracing::debug!(
        target: "process",
        program = ?cmd.get_program(),
        args = ?cmd.get_args().collect::<Vec<_>>(),
        "spawning external process"
    );
    cmd.spawn()
}
150| |
/// Parsed output element from `claude -p --output-format json`.
///
/// Every field except `is_error` is optional; `is_error` becomes `false` when
/// the key is absent (`#[serde(default)]`).
#[derive(Debug, serde::Deserialize)]
pub struct ClaudeOutputElement {
    /// Value of the JSON `type` key (raw identifier because `type` is a Rust keyword).
    pub r#type: Option<String>,
    /// Value of the JSON `subtype` key.
    pub subtype: Option<String>,
    /// Error flag reported in the element; defaults to `false` when missing.
    #[serde(default)]
    pub is_error: bool,
    /// Arbitrary JSON under `structured_output`.
    /// NOTE(review): presumably the schema-constrained answer — confirm against the caller.
    pub structured_output: Option<serde_json::Value>,
    /// Value of the JSON `result` key.
    pub result: Option<String>,
    /// Value of the JSON `total_cost_usd` key.
    pub total_cost_usd: Option<f64>,
    /// Value of the JSON `error` key.
    pub error: Option<String>,
    /// Value of the JSON `terminal_reason` key; the module docs call out
    /// `"max_turns"` as a value that is detected.
    pub terminal_reason: Option<String>,
    /// Value of the camel-case JSON key `apiKeySource`.
    #[serde(rename = "apiKeySource")]
    pub api_key_source: Option<String>,
}
166| |
/// Result of a successful Claude invocation.
#[derive(Debug)]
pub struct ClaudeResult {
    /// JSON value produced by the invocation.
    /// NOTE(review): presumably taken from `structured_output` — confirm in the code that builds this struct.
    pub value: serde_json::Value,
    /// Cost of the invocation in US dollars.
    /// NOTE(review): presumably copied from `total_cost_usd` — confirm.
    pub cost_usd: f64,
    /// Whether the invocation ran under OAuth credentials.
    /// NOTE(review): presumably derived from `apiKeySource` — confirm.
    pub is_oauth: bool,
}
174| |
175| |/// Validates that the Claude binary meets the minimum version requirement.
176| 0|pub fn validate_claude_version(binary: &Path) -> Result<String, AppError> {
177| 0| let resolved = which::which(binary).map_err(|_| {
178| 0| AppError::Validation(format!(
179| 0| "executable '{}' not found in PATH; ensure it is installed and accessible",
180| 0| binary.display()
181| 0| ))
182| 0| })?;
183| 0| let output = Command::new(&resolved)
184| 0| .arg("--version")
185| 0| .stdin(Stdio::null())
186| 0| .stdout(Stdio::piped())
187| 0| .stderr(Stdio::piped())
188| 0| .output()
189| 0| .map_err(AppError::Io)?;
190| |
191| 0| if !output.status.success() {
192| 0| return Err(AppError::Validation(
193| 0| "failed to run 'claude --version'".to_string(),
194| 0| ));
195| 0| }
196| |
197| 0| let version_str = String::from_utf8(output.stdout)
198| 0| .map_err(|_| AppError::Validation("claude --version output is not UTF-8".to_string()))?;
199| 0| let version = version_str.trim().to_string();
200| 0| let numeric = version.split([' ', '(']).next().unwrap_or("").trim();
201| |
202| 0| fn parse_semver(s: &str) -> Option<(u64, u64, u64)> {
203| 0| let parts: Vec<&str> = s.splitn(3, '.').collect();
204| 0| if parts.len() < 2 {
205| 0| return None;
206| 0| }
207| 0| let major = parts[0].parse::<u64>().ok()?;
208| 0| let minor = parts[1].parse::<u64>().ok()?;
209| 0| let patch = parts
210| 0| .get(2)
211| 0| .and_then(|p| p.parse::<u64>().ok())
212| 0| .unwrap_or(0);
213| 0| Some((major, minor, patch))
214| 0| }
215| |
216| 0| if let (Some(actual), Some(min)) = (parse_semver(numeric), parse_semver(MIN_CLAUDE_VERSION)) {
217| 0| if actual < min {
218| 0| return Err(AppError::Validation(format!(
219| 0| "Claude Code version {numeric} is below minimum required {MIN_CLAUDE_VERSION}"
220| 0| )));
221| 0| }
222| 0| }
223| |
224| 0| Ok(version)
225| 0|}
226| |
227| |/// Builds a `Command` for `claude -p` with least-privilege environment.
228| |///
229| |/// G28-A (v1.0.68) + OAuth-only hardening (v1.0.69, mandated by gaps.md
230| |/// lines 41-49): the command ALWAYS uses the OAuth flow. The flag set
231| |/// is the canonical one documented in gaps.md Correção A:
232| |///
233| |/// ```text
234| |/// claude -p "TAREFA" \
235| |/// --strict-mcp-config \
236| |/// --mcp-config '{}' \
237| |/// --dangerously-skip-permissions \
238| |/// --settings '{"hooks":{}}' \
239| |/// --model <X> \
240| |/// --max-turns <N> \
241| |/// --output-format json \
242| |/// --no-session-persistence
243| |/// ```
244| |///
245| |/// The combination cuts the typical 8-10 MCP process tree to zero and
246| |/// disables user hooks. The reaper sweep at startup (see `reaper::scan_and_kill_orphans`)
247| |/// is the last line of defence for any process that ignored the flags.
248| |///
249| |/// **`--bare` is FORBIDDEN** (gaps.md:49 and operator policy):
250| |/// `--bare` cuts MCPs but disables OAuth and demands `ANTHROPIC_API_KEY`,
251| |/// which is PROHIBITED in this project. We also ABORT the spawn if
252| |/// `ANTHROPIC_API_KEY` is set in the environment, because that is the
253| |/// gateway to the prohibited API-key path.
254| |///
255| |/// GitHub issue [anthropics/claude-code#10787] documents that earlier
256| |/// Claude Code CLI builds sometimes ignored `--strict-mcp-config` and
257| |/// fell back to `~/.mcp.json`. We still pass the flags as defence-in-depth
258| |/// and ALSO honour `SQLITE_GRAPHRAG_CLAUDE_EMPTY_CONFIG_DIR` so users
259| |/// who need belt-and-suspenders isolation can point Claude at an empty
260| |/// config directory (no MCP, no hooks, no settings).
261| |///
262| |/// [anthropics/claude-code#10787]: https://github.com/anthropics/claude-code/issues/10787
263| 2|pub fn build_claude_command(
264| 2| binary: &Path,
265| 2| prompt: &str,
266| 2| json_schema: &str,
267| 2| model: Option<&str>,
268| 2| max_turns: u32,
269| 2|) -> Command {
270| | // OAuth-only guard (gaps.md:47). If `ANTHROPIC_API_KEY` is set in the
271| | // environment we MUST abort — that is the API-key path which is
272| | // explicitly PROHIBITED. Use the OAuth flow exclusively.
273| 2| if let Ok(_key) = std::env::var("ANTHROPIC_API_KEY") {
^1
274| | // Return a command that will fail loudly at spawn time. We
275| | // intentionally do NOT pass `--bare` (PROHIBITED) and we do NOT
276| | // allow the API-key path at all.
277| 1| let mut cmd = Command::new("false");
278| 1| cmd.env_clear();
279| 1| cmd.env("PATH", "/nonexistent");
280| 1| cmd.arg("--oauth-only-violation-anthropic-api-key-set");
281| 1| return cmd;
282| 1| }
283| |
284| 1| let mut cmd = Command::new(binary);
285| |
286| 1| cmd.env_clear();
287| 15| for var in ENV_WHITELIST {
^14
288| 14| if let Ok(val) = std::env::var(var) {
^10
289| 10| cmd.env(var, val);
290| 10| }
^4
291| | }
292| |
293| | #[cfg(windows)]
294| | for var in ENV_WHITELIST_WINDOWS {
295| | if let Ok(val) = std::env::var(var) {
296| | cmd.env(var, val);
297| | }
298| | }
299| |
300| | // G28-A: if the user has pointed us at an empty config dir, force Claude
301| | // Code to use it (which suppresses user-scoped MCP servers and hooks).
302| 1| if let Ok(empty_dir) = std::env::var("SQLITE_GRAPHRAG_CLAUDE_EMPTY_CONFIG_DIR") {
^0
303| 0| if std::path::Path::new(&empty_dir).is_dir() {
304| 0| cmd.env("CLAUDE_CONFIG_DIR", &empty_dir);
305| 0| tracing::debug!(
306| | target: "claude_runner",
307| 0| "isolating claude subprocess to CLAUDE_CONFIG_DIR={}",
308| | empty_dir
309| | );
310| | } else {
311| 0| tracing::warn!(
312| | target: "claude_runner",
313| | path = %empty_dir,
314| 0| "SQLITE_GRAPHRAG_CLAUDE_EMPTY_CONFIG_DIR is set but path is not a directory; \
315| 0| ignoring. MCP isolation will NOT be applied."
316| | );
317| | }
318| 1| }
319| |
320| | // Canonical OAuth-only command line (gaps.md:201-208). Every flag is
321| | // mandatory; do NOT pass `--bare` (PROHIBITED, gaps.md:49).
322| 1| cmd.arg("-p")
323| 1| .arg(prompt)
324| 1| .arg("--strict-mcp-config")
325| 1| .arg("--mcp-config")
326| 1| .arg("{}")
327| 1| .arg("--dangerously-skip-permissions")
328| 1| .arg("--settings")
329| 1| .arg(r#"{"hooks":{}}"#)
330| 1| .arg("--output-format")
331| 1| .arg("json")
332| 1| .arg("--json-schema")
333| 1| .arg(json_schema)
334| 1| .arg("--max-turns")
335| 1| .arg(max_turns.to_string())
336| 1| .arg("--no-session-persistence");
337| |
338| 1| if let Some(m) = model {
339| 1| cmd.arg("--model").arg(m);
340| 1| }
^0
341| |
342| 1| cmd.stdin(Stdio::null())
343| 1| .stdout(Stdio::piped())
344| 1| .stderr(Stdio::piped());
345| |
346| 1| cmd
347| 2|}
348| |
349| |/// Parses `claude -p --output-format json` output array.
350| |///
351| |/// G03: detects `terminal_reason: "max_turns"` and returns a specific error
352| |/// instead of a generic failure message.
353| 7|pub fn parse_claude_output(stdout: &str) -> Result<ClaudeResult, AppError> {
354| 7| let elements: Vec<ClaudeOutputElement> = serde_json::from_str(stdout).map_err(|e| {
^0
355| 0| AppError::Validation(format!("failed to parse claude output as JSON array: {e}"))
356| 0| })?;
357| |
358| 7| let is_oauth = elements
359| 7| .iter()
360| 7| .find(|e| e.r#type.as_deref() == Some("system") && e.subtype.as_deref() == Some("init"))
^6
361| 7| .and_then(|e| e.api_key_source.as_deref())
^6 ^6
362| 7| .map(|s| s == "none")
^3 ^3
363| 7| .unwrap_or(false);
364| |
365| 7| let result_elem = elements
366| 7| .iter()
367| 13| .find(|e| e.r#type.as_deref() == Some("result"))
^7
368| 7| .ok_or_else(|| {
^0
369| 0| AppError::Validation("claude output missing 'result' element".to_string())
370| 0| })?;
371| |
372| | // G03: detect max_turns exhaustion before checking is_error
373| 7| if result_elem.terminal_reason.as_deref() == Some("max_turns") {
374| 1| tracing::warn!(
375| | target: "claude_runner",
376| 0| "claude -p hit max_turns limit — hooks may have consumed turns"
377| | );
378| 1| return Err(AppError::Validation(
379| 1| "claude -p hit max_turns: hooks may be consuming turns; increase --max-turns or disable hooks".to_string(),
380| 1| ));
381| 6| }
382| |
383| 6| if result_elem.is_error {
384| 3| let err_msg = result_elem
385| 3| .error
386| 3| .as_deref()
387| 3| .or(result_elem.result.as_deref())
388| 3| .unwrap_or("unknown error");
389| 3| if err_msg.contains("rate_limit") || err_msg.contains("overloaded") {
^1 ^1
390| 2| return Err(AppError::RateLimited {
391| 2| detail: err_msg.to_string(),
392| 2| });
393| 1| }
394| 1| if err_msg.contains("Not logged in") || err_msg.contains("authentication") {
395| 1| tracing::warn!(
396| | target: "claude_runner",
397| 0| "Claude Code authentication failed. Re-authenticate interactively with: claude"
398| | );
399| 0| }
400| 1| return Err(AppError::Validation(format!(
401| 1| "claude extraction failed: {err_msg}"
402| 1| )));
403| 3| }
404| |
405| 3| let value = if let Some(v) = result_elem.structured_output.clone() {
406| 3| v
407| 0| } else if let Some(text) = &result_elem.result {
408| 0| serde_json::from_str(text).map_err(|e| {
409| 0| AppError::Validation(format!("failed to parse claude result field as JSON: {e}"))
410| 0| })?
411| | } else {
412| 0| return Err(AppError::Validation(
413| 0| "claude result missing structured_output and result field".into(),
414| 0| ));
415| | };
416| |
417| 3| let cost = result_elem.total_cost_usd.unwrap_or(0.0);
418| 3| Ok(ClaudeResult {
419| 3| value,
420| 3| cost_usd: cost,
421| 3| is_oauth,
422| 3| })
423| 7|}
424| |
425| |/// Calls `claude -p` with prompt and schema, waits with timeout, and parses output.
426| |///
427| |/// G03: parses stdout even on non-zero exit to detect `terminal_reason: "max_turns"`.
428| |/// G28-C (v1.0.69): the child is killed explicitly on timeout to avoid
429| |/// leaving a `claude -p` zombie with its MCP children behind.
430| 0|pub fn run_claude(
431| 0| binary: &Path,
432| 0| prompt: &str,
433| 0| json_schema: &str,
434| 0| input_text: &str,
435| 0| model: Option<&str>,
436| 0| timeout_secs: u64,
437| 0| max_turns: u32,
438| 0|) -> Result<ClaudeResult, AppError> {
439| | use wait_timeout::ChildExt;
440| |
441| 0| let full_prompt = format!("{prompt}\n\n{input_text}");
442| 0| let mut cmd = build_claude_command(binary, &full_prompt, json_schema, model, max_turns);
443| |
444| 0| let mut child = spawn_with_memory_limit(&mut cmd).map_err(|e| {
445| 0| AppError::Io(std::io::Error::new(
446| 0| e.kind(),
447| 0| format!("failed to spawn claude: {e}"),
448| 0| ))
449| 0| })?;
450| |
451| 0| let start = std::time::Instant::now();
452| 0| let timeout = std::time::Duration::from_secs(timeout_secs);
453| 0| let status = child.wait_timeout(timeout).map_err(AppError::Io)?;
454| |
455| 0| if status.is_none() {
456| 0| // G28-C: timeout hit — send SIGTERM to the child so the MCP
457| 0| // children it spawned (and their npm/node tree) are also
458| 0| // reaped. SIGTERM gives the child a chance to clean up; the
459| 0| // reaper sweep in main.rs is the last line of defence for
460| 0| // anything that ignored it.
461| 0| #[cfg(unix)]
462| 0| unsafe {
463| 0| libc::kill(child.id() as i32, libc::SIGTERM);
464| 0| }
465| 0| let _ = child.kill();
466| 0| let _ = child.wait();
467| 0| }
468| |
469| 0| match status {
470| 0| Some(exit_status) => {
471| 0| tracing::debug!(
472| | target: "process",
473| 0| exit_code = ?exit_status.code(),
474| 0| elapsed_ms = start.elapsed().as_millis() as u64,
475| 0| "external process completed"
476| | );
477| |
478| 0| let mut stdout_buf = Vec::new();
479| 0| let mut stderr_buf = Vec::new();
480| 0| if let Some(mut out) = child.stdout.take() {
481| 0| std::io::Read::read_to_end(&mut out, &mut stdout_buf).map_err(AppError::Io)?;
482| 0| }
483| 0| if let Some(mut err) = child.stderr.take() {
484| 0| std::io::Read::read_to_end(&mut err, &mut stderr_buf).map_err(AppError::Io)?;
485| 0| }
486| |
487| 0| let stdout_str = String::from_utf8(stdout_buf)
488| 0| .map_err(|_| AppError::Validation("claude -p stdout is not valid UTF-8".into()))?;
489| |
490| | // G03: parse stdout even on failure to detect terminal_reason
491| 0| if !exit_status.success() {
492| 0| if let Ok(result) = parse_claude_output(&stdout_str) {
493| 0| return Ok(result);
494| 0| }
495| 0| let stderr_str = String::from_utf8_lossy(&stderr_buf);
496| 0| if stderr_str.contains("auth") || stderr_str.contains("login") {
497| 0| tracing::warn!(
498| | target: "claude_runner",
499| 0| "Claude Code authentication may have failed. Re-authenticate with: claude"
500| | );
501| 0| }
502| 0| return Err(AppError::Validation(format!(
503| 0| "claude -p exited with code {:?}: {}",
504| 0| exit_status.code(),
505| 0| stderr_str.trim()
506| 0| )));
507| 0| }
508| |
509| 0| parse_claude_output(&stdout_str)
510| | }
511| | None => {
512| 0| tracing::warn!(target: "claude_runner", timeout_secs, "claude -p timed out, terminating");
513| 0| terminate_gracefully(&mut child, 3);
514| 0| Err(AppError::Validation(format!(
515| 0| "claude -p timed out after {timeout_secs} seconds"
516| 0| )))
517| | }
518| | }
519| 0|}
520| |
521| |/// Terminates a child process gracefully: SIGTERM first, SIGKILL after grace period.
522| |#[cfg(unix)]
523| 0|pub fn terminate_gracefully(child: &mut std::process::Child, grace_secs: u64) {
524| | use wait_timeout::ChildExt;
525| 0| unsafe {
526| 0| libc::kill(child.id() as i32, libc::SIGTERM);
527| 0| }
528| 0| match child.wait_timeout(std::time::Duration::from_secs(grace_secs)) {
529| 0| Ok(Some(_)) => {}
530| | _ => {
531| 0| tracing::warn!(target: "process", pid = child.id(), "child ignored SIGTERM, sending SIGKILL");
532| 0| let _ = child.kill();
533| 0| let _ = child.wait();
534| | }
535| | }
536| 0|}
537| |
/// Non-Unix fallback: there is no graceful phase, so the child is killed
/// right away and then reaped. `_grace_secs` is accepted only to keep the
/// signature identical to the Unix variant; both results are ignored on
/// purpose (best effort).
#[cfg(not(unix))]
pub fn terminate_gracefully(child: &mut std::process::Child, _grace_secs: u64) {
    child.kill().ok();
    child.wait().ok();
}
544| |
#[cfg(test)]
mod tests {
    use super::*;

    /// Name of the variable the OAuth-only guard looks at.
    const API_KEY_VAR: &str = "ANTHROPIC_API_KEY";

    /// Puts `ANTHROPIC_API_KEY` into a known state for one test and restores
    /// the previous value on drop — including when an assertion panics — so
    /// a failing test cannot leak the variable into later tests or clobber
    /// the developer's own environment.
    struct ApiKeyGuard {
        saved: Option<std::ffi::OsString>,
    }

    impl ApiKeyGuard {
        /// `Some(v)` sets the variable to `v`; `None` removes it.
        fn apply(value: Option<&str>) -> Self {
            let saved = std::env::var_os(API_KEY_VAR);
            // SAFETY: every test using this guard is serialised with the
            // other `serial(env)` tests, so no such test mutates the
            // environment concurrently.
            unsafe {
                match value {
                    Some(v) => std::env::set_var(API_KEY_VAR, v),
                    None => std::env::remove_var(API_KEY_VAR),
                }
            }
            Self { saved }
        }
    }

    impl Drop for ApiKeyGuard {
        fn drop(&mut self) {
            // SAFETY: see `ApiKeyGuard::apply`.
            unsafe {
                match self.saved.take() {
                    Some(v) => std::env::set_var(API_KEY_VAR, v),
                    None => std::env::remove_var(API_KEY_VAR),
                }
            }
        }
    }

    #[test]
    fn parse_output_detects_max_turns() {
        let stdout = r#"[{"type":"system","subtype":"init","apiKeySource":"none"},{"type":"result","is_error":false,"terminal_reason":"max_turns","structured_output":{"name":"t"}}]"#;
        let err = parse_claude_output(stdout).unwrap_err();
        assert!(
            format!("{err}").contains("max_turns"),
            "must detect max_turns in output"
        );
    }

    #[test]
    fn parse_output_extracts_structured_value() {
        let stdout = r#"[{"type":"system","subtype":"init","apiKeySource":"none"},{"type":"result","is_error":false,"structured_output":{"key":"val"},"total_cost_usd":0.01}]"#;
        let result = parse_claude_output(stdout).unwrap();
        assert_eq!(result.value["key"], "val");
        assert!((result.cost_usd - 0.01).abs() < f64::EPSILON);
        assert!(result.is_oauth);
    }

    #[test]
    fn parse_output_detects_rate_limit() {
        let stdout = r#"[{"type":"result","is_error":true,"error":"rate_limit exceeded"}]"#;
        let err = parse_claude_output(stdout).unwrap_err();
        assert!(
            matches!(err, AppError::RateLimited { .. }),
            "expected AppError::RateLimited, got: {err}"
        );
    }

    /// OAuth-only conformance test (gaps.md:41-49, v1.0.69 mandate).
    /// Verifies that `build_claude_command` always emits the canonical
    /// flag set and NEVER emits `--bare` or any API-key path.
    #[test]
    #[serial_test::serial(env)]
    fn build_command_oauth_only_mandatory_flags() {
        // The guard must not trigger: run with the variable removed.
        let _env = ApiKeyGuard::apply(None);
        let cmd = build_claude_command(
            std::path::Path::new("/usr/bin/false"),
            "test prompt",
            "{}",
            Some("sonnet"),
            4,
        );
        let args: Vec<&str> = cmd.get_args().filter_map(|a| a.to_str()).collect();
        // Mandatory OAuth-only flags from gaps.md lines 201-208
        assert!(args.contains(&"-p"), "must have -p");
        assert!(
            args.contains(&"--strict-mcp-config"),
            "must have --strict-mcp-config (gaps.md:206)"
        );
        assert!(
            args.contains(&"--mcp-config"),
            "must have --mcp-config (gaps.md:207)"
        );
        assert!(
            args.contains(&"--dangerously-skip-permissions"),
            "must have --dangerously-skip-permissions (gaps.md:208)"
        );
        assert!(
            args.contains(&"--settings"),
            "must have --settings (gaps.md:209)"
        );
        assert!(
            args.contains(&"--output-format"),
            "must have --output-format json (gaps.md:213)"
        );
        assert!(args.contains(&"--json-schema"), "must have --json-schema");
        assert!(
            args.contains(&"--max-turns"),
            "must have --max-turns (gaps.md:212)"
        );
        assert!(
            args.contains(&"--no-session-persistence"),
            "must have --no-session-persistence"
        );
        assert!(
            args.contains(&"--model"),
            "must have --model when model is Some"
        );
        // PROHIBITED flags (gaps.md:49)
        assert!(
            !args.contains(&"--bare"),
            "--bare is PROHIBITED (gaps.md:49)"
        );
    }

    /// OAuth-only guard: when `ANTHROPIC_API_KEY` is in the environment,
    /// `build_claude_command` MUST abort the spawn (return a `false`
    /// command), NOT silently fall back to the API-key path.
    #[test]
    #[serial_test::serial(env)]
    fn build_command_aborts_when_anthropic_api_key_set() {
        // Restored automatically, even if one of the assertions below fails.
        let _env = ApiKeyGuard::apply(Some("sk-test-violation"));
        let cmd = build_claude_command(
            std::path::Path::new("/usr/bin/claude"),
            "test prompt",
            "{}",
            Some("sonnet"),
            4,
        );
        let program = cmd.get_program().to_string_lossy().to_string();
        let args: Vec<&str> = cmd.get_args().filter_map(|a| a.to_str()).collect();
        assert_eq!(
            program, "false",
            "when ANTHROPIC_API_KEY is set, build_claude_command must abort"
        );
        assert!(
            args.contains(&"--oauth-only-violation-anthropic-api-key-set"),
            "aborted command must carry violation marker"
        );
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/cleanup_orphans.rs:
1| |//! Handler for the `cleanup-orphans` CLI subcommand.
2| |
3| |use crate::errors::AppError;
4| |use crate::output::{self, OutputFormat};
5| |use crate::paths::AppPaths;
6| |use crate::storage::connection::open_rw;
7| |use crate::storage::entities;
8| |use serde::Serialize;
9| |
// Command-line arguments of the `cleanup-orphans` subcommand.
//
// NOTE(review): plain `//` comments are used here on purpose — clap's derive
// turns `///` doc comments into help text, so adding those would change the
// CLI output.
#[derive(clap::Args)]
#[command(after_long_help = "EXAMPLES:\n \
    # Remove orphan entities (no memories, no relationships) from the global namespace\n \
    sqlite-graphrag cleanup-orphans\n\n \
    # Preview which entities would be removed without deleting\n \
    sqlite-graphrag cleanup-orphans --dry-run\n\n \
    # Cleanup within a specific namespace\n \
    sqlite-graphrag cleanup-orphans --namespace my-project --yes")]
pub struct CleanupOrphansArgs {
    // Optional namespace filter handed to the orphan lookup; when absent the
    // text output labels the scope as "<all>".
    #[arg(long)]
    pub namespace: Option<String>,
    // When set, orphans are only counted: nothing is deleted and `deleted`
    // is reported as 0.
    #[arg(long)]
    pub dry_run: bool,
    // Only suppresses the progress notice printed before deleting; deletion
    // itself happens with or without this flag.
    #[arg(long)]
    pub yes: bool,
    // Output format of the final report (JSON by default; text and markdown
    // share the same one-line summary).
    #[arg(long, value_enum, default_value = "json")]
    pub format: OutputFormat,
    // Hidden compatibility flag; it is not read by the handler.
    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
    pub json: bool,
    // Database path override; may also come from SQLITE_GRAPHRAG_DB_PATH.
    // Passed to `AppPaths::resolve`.
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,
}
32| |
/// Report emitted by the `cleanup-orphans` handler once it has finished.
#[derive(Serialize)]
struct CleanupResponse {
    /// Number of orphan entity ids found by the lookup.
    orphan_count: usize,
    /// Count returned by the delete step; always 0 on a dry run.
    deleted: usize,
    /// Echo of the `--dry-run` flag.
    dry_run: bool,
    /// Echo of the `--namespace` argument (`None` when it was not given).
    namespace: Option<String>,
    /// Total execution time in milliseconds from handler start to serialisation.
    elapsed_ms: u64,
}
42| |
43| 0|pub fn run(args: CleanupOrphansArgs) -> Result<(), AppError> {
44| 0| let inicio = std::time::Instant::now();
45| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
46| |
47| 0| crate::storage::connection::ensure_db_ready(&paths)?;
48| |
49| 0| let mut conn = open_rw(&paths.db)?;
50| |
51| 0| let orphan_ids = entities::find_orphan_entity_ids(&conn, args.namespace.as_deref())?;
52| 0| let orphan_count = orphan_ids.len();
53| |
54| 0| let deleted = if args.dry_run {
55| 0| 0
56| | } else {
57| 0| if orphan_count > 0 && !args.yes {
58| 0| output::emit_progress(&format!(
59| 0| "removing {orphan_count} orphan entities (use --yes to skip this notice)"
60| 0| ));
61| 0| }
62| 0| let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
63| 0| let removed = entities::delete_entities_by_ids(&tx, &orphan_ids)?;
64| 0| tx.commit()?;
65| 0| conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
66| 0| removed
67| | };
68| |
69| 0| let response = CleanupResponse {
70| 0| orphan_count,
71| 0| deleted,
72| 0| dry_run: args.dry_run,
73| 0| namespace: args.namespace.clone(),
74| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
75| 0| };
76| |
77| 0| match args.format {
78| 0| OutputFormat::Json => output::emit_json(&response)?,
79| 0| OutputFormat::Text | OutputFormat::Markdown => {
80| 0| let ns = response.namespace.as_deref().unwrap_or("<all>");
81| 0| output::emit_text(&format!(
82| 0| "orphans: {} found, {} deleted (dry_run={}) [{}]",
83| 0| response.orphan_count, response.deleted, response.dry_run, ns
84| 0| ));
85| 0| }
86| | }
87| |
88| 0| Ok(())
89| 0|}
90| |
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a [`CleanupResponse`] fixture from positional values.
    fn fixture(
        orphan_count: usize,
        deleted: usize,
        dry_run: bool,
        namespace: Option<&str>,
        elapsed_ms: u64,
    ) -> CleanupResponse {
        CleanupResponse {
            orphan_count,
            deleted,
            dry_run,
            namespace: namespace.map(str::to_string),
            elapsed_ms,
        }
    }

    #[test]
    fn cleanup_response_serializes_dry_run_true() {
        let resp = fixture(5, 0, true, Some("global"), 12);
        let json = serde_json::to_value(&resp).expect("serialization failed");
        assert_eq!(json["orphan_count"], 5);
        assert_eq!(json["deleted"], 0);
        assert_eq!(json["dry_run"], true);
        assert_eq!(json["namespace"], "global");
        assert!(json["elapsed_ms"].is_number());
    }

    #[test]
    fn cleanup_response_deleted_zero_when_dry_run() {
        let resp = fixture(10, 0, true, None, 5);
        assert_eq!(resp.deleted, 0, "dry_run must keep deleted at 0");
        assert_eq!(resp.orphan_count, 10);
    }

    #[test]
    fn cleanup_response_namespace_none_serializes_null() {
        let resp = fixture(0, 0, false, None, 1);
        let json = serde_json::to_value(&resp).expect("serialization failed");
        assert!(
            json["namespace"].is_null(),
            "namespace None must serialize as null"
        );
    }

    #[test]
    fn cleanup_response_deleted_equals_orphan_count_when_executed() {
        let resp = fixture(3, 3, false, Some("projeto"), 20);
        assert_eq!(
            resp.deleted, resp.orphan_count,
            "when running without dry_run, deleted must equal orphan_count"
        );
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/codex_spawn.rs:
1| |//! Codex CLI spawn + JSONL parsing helper shared by `enrich` and `ingest --mode codex`.
2| |//!
3| |//! G31 (v1.0.69): `enrich --mode codex` was missing five critical hardening
4| |//! flags compared to `ingest --mode codex`. This module extracts the
5| |//! spawn pipeline into a single helper that BOTH call-sites consume,
6| |//! guaranteeing the same defaults everywhere.
7| |//!
8| |//! G32 (v1.0.69): `enrich --mode codex` used `serde_json::from_str` on the
9| |//! raw stdout, but `codex exec --json` emits JSONL (one event per line).
10| |//! [`parse_codex_jsonl`] iterates line-by-line, picking the last
11| |//! `item.completed` of type `agent_message` as the assistant text.
12| |//!
13| |//! G33 (v1.0.69): validate the model against the ChatGPT Pro OAuth whitelist
14| |//! stored in `~/.codex/models_cache.json` BEFORE spawning the subprocess.
15| |
16| |use crate::errors::AppError;
17| |use crate::extraction::{ExtractedUrl, ExtractionResult};
18| |use crate::storage::entities::{NewEntity, NewRelationship};
19| |use serde::{Deserialize, Serialize};
20| |use std::path::{Path, PathBuf};
21| |use std::process::{Command, Stdio};
22| |
/// Token usage reported by Codex on `turn.completed` events.
///
/// Every counter falls back to `0` when the field is absent from the
/// deserialized payload (`#[serde(default)]`).
#[derive(Debug, Clone, Default, Deserialize, Serialize)]
pub struct CodexUsage {
    /// Input token count.
    #[serde(default)]
    pub input_tokens: u64,
    /// Cached input token count.
    /// NOTE(review): presumably a subset of `input_tokens` — confirm against the Codex event schema.
    #[serde(default)]
    pub cached_input_tokens: u64,
    /// Output token count.
    #[serde(default)]
    pub output_tokens: u64,
    /// Reasoning output token count.
    /// NOTE(review): relation to `output_tokens` is not visible here — confirm.
    #[serde(default)]
    pub reasoning_output_tokens: u64,
}
35| |
/// Combined result of one `codex exec` invocation.
///
/// NOTE(review): the fields are filled in by the JSONL parser; the exact
/// conditions for each flag should be confirmed against `parse_codex_jsonl`.
#[derive(Debug)]
pub struct CodexResult {
    /// Extraction payload obtained from the run.
    pub extraction: ExtractionResult,
    /// Token usage, when the run reported any.
    pub usage: Option<CodexUsage>,
    /// Presumably set when a failure carried a rate-limit indicator — TODO confirm.
    pub rate_limited: bool,
    /// Presumably set when an `error` event reported a schema/validation problem — TODO confirm.
    pub schema_error: bool,
    /// Presumably set when a `turn.failed` event was seen — TODO confirm.
    pub turn_failed: bool,
    /// Message accompanying a failure; presumably empty on success — TODO confirm.
    pub failed_message: String,
}
46| |
/// Configuration for the codex spawner.
///
/// All string and path inputs are borrowed for `'a`; only `schema_path` is owned.
#[allow(rustdoc::broken_intra_doc_links)]
pub struct CodexSpawnArgs<'a> {
    /// Executable to run (used as the program of the built command).
    pub binary: &'a Path,
    /// Instruction part of the prompt.
    pub prompt: &'a str,
    /// JSON schema text; the command builder writes it to `schema_path`.
    pub json_schema: &'a str,
    /// Text to be processed; follows `prompt` after a blank line in the full prompt.
    pub input_text: &'a str,
    /// Optional model override passed with `-m`; `None` leaves the choice to codex.
    pub model: Option<&'a str>,
    /// Time budget in seconds.
    /// NOTE(review): not read by the command builder — presumably enforced by the spawner; confirm.
    pub timeout_secs: u64,
    /// Caller-provided schema path (must be inside a trusted directory
    /// that codex recognises as sandbox-safe). Use [`trusted_schema_path`]
    /// to compute one under the cache dir.
    pub schema_path: PathBuf,
}
61| |
62| |/// Computes a schema path under the cache dir so `codex exec` accepts it
63| |/// as part of a trusted directory (rejects `/tmp` on hardened installs).
64| 0|pub fn trusted_schema_path() -> Result<PathBuf, AppError> {
65| 0| let cache = crate::paths::AppPaths::resolve(None)
66| 0| .map(|p| p.models.parent().map(|m| m.to_path_buf()))
67| 0| .ok()
68| 0| .flatten()
69| 0| .unwrap_or_else(std::env::temp_dir);
70| 0| std::fs::create_dir_all(&cache).map_err(AppError::Io)?;
71| 0| Ok(cache.join(format!("enrich-schema-{}.json", std::process::id())))
72| 0|}
73| |
/// Models accepted by Codex CLI when using ChatGPT Pro OAuth.
///
/// Mirrored from `~/.codex/models_cache.json` (which the official CLI
/// refreshes on every login). This list is intentionally narrow; passing
/// a model not in this set with `--mode codex` returns
/// `AppError::Validation` BEFORE any OAuth turn is spent.
///
/// Used both as the validation whitelist and as the static seed of the
/// model listing. Entries are kept in lexicographic order.
pub const CODEX_PRO_OAUTH_MODELS: &[&str] = &[
    "codex-auto-review",
    "gpt-5.3-codex-spark",
    "gpt-5.4",
    "gpt-5.4-mini",
    "gpt-5.5",
];
87| |
88| |/// Validates the requested model against [`CODEX_PRO_OAUTH_MODELS`].
89| |///
90| |/// # Errors
91| |/// Returns [`AppError::Validation`] listing the accepted models when the
92| |/// caller supplied a model outside the whitelist.
93| 4|pub fn validate_codex_model(model: Option<&str>) -> Result<(), AppError> {
94| 4| let Some(m) = model else {
^3
95| 1| return Ok(()); // no override; codex picks its default
96| | };
97| 3| if CODEX_PRO_OAUTH_MODELS.contains(&m) {
98| 2| Ok(())
99| | } else {
100| 1| Err(AppError::Validation(format!(
101| 1| "--codex-model {m:?} is not supported with ChatGPT Pro OAuth. \
102| 1| Accepted: {}",
103| 1| CODEX_PRO_OAUTH_MODELS.join(", ")
104| 1| )))
105| | }
106| 4|}
107| |
108| |/// Returns the list of models accepted by Codex with ChatGPT Pro OAuth.
109| |///
110| |/// Tries to read `~/.codex/models_cache.json` (which the official CLI
111| |/// refreshes on every login) and falls back to the static
112| |/// [`CODEX_PRO_OAUTH_MODELS`] constant when the file is missing or
113| |/// malformed. The returned `Vec<String>` is the union of both sources,
114| |/// de-duplicated.
115| 5|pub fn list_codex_models() -> Vec<String> {
116| | use std::collections::BTreeSet;
117| 5| let mut out: BTreeSet<String> = CODEX_PRO_OAUTH_MODELS
118| 5| .iter()
119| 25| .map(|s| s.to_string())
^5
120| 5| .collect();
121| |
122| 5| if let Some(home) = std::env::var_os("HOME") {
123| 5| let path = std::path::Path::new(&home)
124| 5| .join(".codex")
125| 5| .join("models_cache.json");
126| 5| if let Ok(content) = std::fs::read_to_string(&path) {
127| | // The file is a JSON object whose keys are model ids.
128| | // Use serde_json::Value to traverse safely without depending
129| | // on a precise schema.
130| 5| if let Ok(value) = serde_json::from_str::<serde_json::Value>(&content) {
131| 5| if let Some(obj) = value.as_object() {
132| 20| for key in obj.keys() {
^5 ^5
133| 20| out.insert(key.clone());
134| 20| }
135| 0| } else if let Some(arr) = value.as_array() {
136| 0| for v in arr {
137| 0| if let Some(s) = v.as_str() {
138| 0| out.insert(s.to_string());
139| 0| }
140| | }
141| 0| }
142| 0| }
143| 0| }
144| 0| }
145| 5| out.into_iter().collect()
146| 5|}
147| |
148| |/// Suggests the closest codex OAuth model to a user-supplied substring
149| |/// (G33). Returns `None` when no candidate is close enough.
150| |///
151| |/// Match strategy: exact substring containment wins; otherwise Levenshtein
152| |/// distance below `max_distance = max(2, query.len() / 3)`.
153| 3|pub fn suggest_codex_model(query: &str) -> Option<String> {
154| 3| let query_lc = query.to_ascii_lowercase();
155| 3| let models = list_codex_model_lc();
156| |
157| | // Exact substring match wins.
158| 25| for m in &models {
^23
159| 23| if m.contains(&query_lc) {
160| 1| return Some(m.clone());
161| 22| }
162| | }
163| |
164| | // Levenshtein fallback.
165| 2| let max_distance = (query.len() / 3).max(2);
166| 2| let mut best: Option<(usize, String)> = None;
167| 20| for m in &models {
^18
168| 18| let d = levenshtein(query_lc.as_str(), m.as_str());
169| 18| if d <= max_distance && best.as_ref().is_none_or(|(bd, _)| d < *bd) {
^2 ^2 ^2 ^1 ^1
170| 2| best = Some((d, m.clone()));
171| 16| }
172| | }
173| 2| best.map(|(_, m)| m)
174| 3|}
175| |
176| 3|fn list_codex_model_lc() -> Vec<String> {
177| 3| list_codex_models()
178| 3| .into_iter()
179| 27| .map(|s| s.to_ascii_lowercase())
^3
180| 3| .collect()
181| 3|}
182| |
/// Levenshtein edit distance between `a` and `b`, counted in `char`s.
///
/// Insertions, deletions and substitutions each cost 1. Only two rows of
/// the dynamic-programming table are kept at any time.
fn levenshtein(a: &str, b: &str) -> usize {
    let source: Vec<char> = a.chars().collect();
    let target: Vec<char> = b.chars().collect();

    // Against an empty string the distance is the other string's length.
    match (source.is_empty(), target.is_empty()) {
        (true, _) => return target.len(),
        (_, true) => return source.len(),
        _ => {}
    }

    let width = target.len() + 1;
    // `above` is the previous table row, `row` the one being filled.
    let mut above: Vec<usize> = (0..width).collect();
    let mut row = vec![0usize; width];

    for (i, from) in source.iter().enumerate() {
        row[0] = i + 1;
        for (j, to) in target.iter().enumerate() {
            let substitute = above[j] + usize::from(from != to);
            let delete = above[j + 1] + 1;
            let insert = row[j] + 1;
            row[j + 1] = insert.min(delete).min(substitute);
        }
        std::mem::swap(&mut above, &mut row);
    }

    above[width - 1]
}
204| |
205| |/// Builds the `codex exec` command with the canonical hardening flags.
206| |///
207| |/// G31 + OAuth-only hardening (v1.0.69, mandated by gaps.md lines 41-49):
208| |/// the command ALWAYS uses the OAuth `auth.json` flow. The flag set is
209| |/// the canonical one documented in gaps.md Correção A:
210| |///
211| |/// ```text
212| |/// codex exec \
213| |/// -c mcp_servers='{}' \
214| |/// --json --output-schema <SCHEMA> \
215| |/// --ephemeral \
216| |/// --skip-git-repo-check \
217| |/// --sandbox read-only \
218| |/// --ignore-user-config \
219| |/// --ignore-rules \
220| |/// --ask-for-approval never \
221| |/// -m <MODEL> \
222| |/// -
223| |/// ```
224| |///
225| |/// The combination zeroes MCP servers (via two complementary mechanisms:
226| |/// the inline `-c mcp_servers='{}'` override AND `--ignore-user-config`),
227| |/// disables user-defined rules, and never asks for interactive approval.
228| |///
229| |/// **`OPENAI_API_KEY` is FORBIDDEN** in the spawned environment (gaps.md:48).
230| |/// OAuth flows via `~/.codex/auth.json` and `CODEX_ACCESS_TOKEN` only.
231| 3|pub fn build_codex_command(args: &CodexSpawnArgs<'_>) -> Command {
232| 3| let full_prompt = format!("{}\n\n{}", args.prompt, args.input_text);
233| |
234| | // OAuth-only guard (gaps.md:48). If `OPENAI_API_KEY` is set in the
235| | // environment we MUST abort — that is the API-key path which is
236| | // explicitly PROHIBITED. Use the OAuth `auth.json` flow exclusively.
237| 3| if let Ok(_key) = std::env::var("OPENAI_API_KEY") {
^1
238| 1| let mut cmd = Command::new("false");
239| 1| cmd.env_clear();
240| 1| cmd.env("PATH", "/nonexistent");
241| 1| cmd.arg("--oauth-only-violation-openai-api-key-set");
242| 1| return cmd;
243| 2| }
244| |
245| | // Write the JSON schema to a path the caller controls. Callers should
246| | // pass a path under the cache dir (see [`trusted_schema_path`]).
247| 2| std::fs::write(&args.schema_path, args.json_schema).ok();
248| |
249| 2| let mut cmd = Command::new(args.binary);
250| 2| cmd.env_clear();
251| | // OAuth flow: `CODEX_ACCESS_TOKEN` (Bearer) and `CODEX_HOME` (auth.json
252| | // location) are whitelisted. `OPENAI_API_KEY` is INTENTIONALLY ABSENT.
253| 34| for var in &[
^32
254| 34| "PATH",
255| 34| "HOME",
256| 34| "USER",
257| 34| "SHELL",
258| 34| "TERM",
259| 34| "LANG",
260| 34| "XDG_CONFIG_HOME",
261| 34| "XDG_DATA_HOME",
262| 34| "XDG_RUNTIME_DIR",
263| 34| "XDG_CACHE_HOME",
264| 34| "CODEX_ACCESS_TOKEN",
265| 34| "CODEX_HOME",
266| 34| "TMPDIR",
267| 34| "TMP",
268| 34| "TEMP",
269| 34| "DYLD_FALLBACK_LIBRARY_PATH",
270| 34| ] {
271| 32| if let Ok(val) = std::env::var(var) {
^22
272| 22| cmd.env(var, val);
273| 22| }
^10
274| | }
275| |
276| | #[cfg(windows)]
277| | for var in &[
278| | "LOCALAPPDATA",
279| | "APPDATA",
280| | "USERPROFILE",
281| | "SystemRoot",
282| | "COMSPEC",
283| | "PATHEXT",
284| | ] {
285| | if let Ok(val) = std::env::var(var) {
286| | cmd.env(var, val);
287| | }
288| | }
289| |
290| 2| cmd.arg("exec")
291| 2| .arg("-c")
292| 2| .arg("mcp_servers='{}'")
293| 2| .arg("--json")
294| 2| .arg("--output-schema")
295| 2| .arg(&args.schema_path)
296| 2| .arg("--ephemeral")
297| 2| .arg("--skip-git-repo-check")
298| 2| .arg("--sandbox")
299| 2| .arg("read-only")
300| 2| .arg("--ignore-user-config")
301| 2| .arg("--ignore-rules")
302| 2| .arg("--ask-for-approval")
303| 2| .arg("never");
304| |
305| 2| if let Some(m) = args.model {
306| 2| cmd.arg("-m").arg(m);
307| 2| }
^0
308| |
309| | // `-` means: read the prompt from stdin (Codex Paperclip pattern)
310| 2| cmd.arg("-");
311| |
312| 2| cmd.stdin(Stdio::piped())
313| 2| .stdout(Stdio::piped())
314| 2| .stderr(Stdio::piped());
315| | // Keep the prompt alive for the stdin thread spawned in `spawn_codex`.
316| 2| let _ = full_prompt; // captured by closure below
317| |
318| 2| cmd
319| 3|}
320| |
321| |/// Parses JSONL output from `codex exec --json`.
322| |///
323| |/// Event format (DOTS notation):
324| |/// - `thread.started` — session init
325| |/// - `turn.started` — model turn begins
326| |/// - `item.completed` — message or tool call; last `agent_message` wins
327| |/// - `turn.completed` — includes usage stats
328| |/// - `turn.failed` — error with optional rate-limit indicator
329| |/// - `error` — schema or validation error
330| |///
331| |/// G32 (v1.0.69): this function is the single source of truth for JSONL
332| |/// parsing. Both `enrich` and `ingest --mode codex` consume it.
333| 5|pub fn parse_codex_jsonl(stdout: &str) -> Result<CodexResult, AppError> {
334| 5| let mut last_agent_text: Option<String> = None;
335| 5| let mut usage: Option<CodexUsage> = None;
336| 5| let mut rate_limited = false;
337| 5| let mut schema_error = false;
338| 5| let mut turn_failed = false;
339| 5| let mut failed_message = String::new();
340| |
341| 17| for line in stdout.lines() {
^5 ^5
342| 17| let line = line.trim();
343| 17| if line.is_empty() {
344| 0| continue;
345| 17| }
346| |
347| 17| let event: serde_json::Value = match serde_json::from_str(line) {
^16 ^16
348| 16| Ok(v) => v,
349| | Err(_) => {
350| 1| tracing::warn!(target: "codex_spawn", line, "skipping malformed JSONL line");
^0
351| 1| continue;
352| | }
353| | };
354| |
355| 16| let event_type = match event.get("type").and_then(|t| t.as_str()) {
356| 16| Some(t) => t,
357| 0| None => continue,
358| | };
359| |
360| 16| match event_type {
361| 16| "item.completed" => {
362| 6| if let Some(item) = event.get("item") {
363| 6| if item.get("type").and_then(|t| t.as_str()) == Some("agent_message") {
364| 4| if let Some(text) = item.get("text").and_then(|t| t.as_str()) {
365| 4| last_agent_text = Some(text.to_string());
366| 4| }
^0
367| 2| }
368| 0| }
369| | }
370| 10| "turn.completed" => {
371| 4| if let Some(u) = event.get("usage") {
372| | // Skip events that lack the recognised token fields
373| | // (e.g. partial broadcasts with `{}`) so the last
374| | // populated usage wins instead of being overwritten
375| | // by an empty one.
376| 4| let is_populated = u
377| 4| .get("input_tokens")
378| 4| .and_then(|v| v.as_u64())
^2^2
379| 4| .map(|n| n > 0)
^2
380| 4| .unwrap_or(false)
381| 2| || u.get("output_tokens")
382| 2| .and_then(|v| v.as_u64())
^0^0
383| 2| .map(|n| n > 0)
^0
384| 2| .unwrap_or(false);
385| 4| if is_populated {
386| 2| if let Ok(parsed) = serde_json::from_value::<CodexUsage>(u.clone()) {
387| 2| usage = Some(parsed);
388| 2| }
^0
389| 2| }
390| 0| }
391| | }
392| 6| "turn.failed" => {
393| 1| turn_failed = true;
394| 1| if let Some(err) = event.get("error") {
395| 1| let msg = err
396| 1| .get("message")
397| 1| .and_then(|m| m.as_str())
398| 1| .unwrap_or("unknown error");
399| 1| failed_message = msg.to_string();
400| 1| if msg.contains("rate_limit")
401| 0| || msg.contains("429")
402| 0| || msg.contains("Too Many Requests")
403| 1| {
404| 1| rate_limited = true;
405| 1| }
^0
406| 0| }
407| | }
408| 5| "error" => {
409| 0| if let Some(msg) = event.get("message").and_then(|m| m.as_str()) {
410| 0| if msg.contains("invalid_json_schema") || msg.contains("schema") {
411| 0| schema_error = true;
412| 0| }
413| 0| }
414| | }
415| 5| _ => {}
416| | }
417| | }
418| |
419| 5| let text = last_agent_text.ok_or_else(|| {
^4 ^1
420| 1| AppError::Validation(format!(
421| 1| "no agent_message in codex JSONL output (rate_limited={rate_limited}, schema_error={schema_error}, turn_failed={turn_failed})"
422| 1| ))
423| 1| })?;
424| |
425| 4| if turn_failed {
426| 1| return Err(AppError::Validation(format!(
427| 1| "codex turn failed: {failed_message}"
428| 1| )));
429| 3| }
430| 3| if schema_error {
431| 0| return Err(AppError::Validation(
432| 0| "codex reported invalid_json_schema; check the --output-schema file".to_string(),
433| 0| ));
434| 3| }
435| 3| if rate_limited {
436| 0| return Err(AppError::Validation(format!(
437| 0| "codex rate-limited: {failed_message}"
438| 0| )));
439| 3| }
440| |
441| 3| let extraction = parse_extraction_text(&text)?;
^0
442| 3| Ok(CodexResult {
443| 3| extraction,
444| 3| usage,
445| 3| rate_limited,
446| 3| schema_error,
447| 3| turn_failed,
448| 3| failed_message,
449| 3| })
450| 5|}
451| |
452| |/// Parses the agent_message text as an `ExtractionResult` JSON payload.
453| |///
454| |/// The schema is shared by both `enrich` and `ingest --mode codex`; the
455| |/// `text` is the JSON value the assistant returned, not a wrapper object.
456| 3|pub fn parse_extraction_text(text: &str) -> Result<ExtractionResult, AppError> {
457| 3| let value: serde_json::Value = serde_json::from_str(text).map_err(|e| {
^0
458| 0| AppError::Validation(format!("failed to parse codex agent_message as JSON: {e}"))
459| 0| })?;
460| 3| let obj = value.as_object().ok_or_else(|| {
^0
461| 0| AppError::Validation("codex agent_message is not a JSON object".to_string())
462| 0| })?;
463| |
464| 3| let mut entities: Vec<NewEntity> = Vec::new();
465| 3| if let Some(arr) = obj.get("entities").and_then(|v| v.as_array()) {
466| 5| for e in arr {
^2
467| 2| if let Some(name) = e.get("name").and_then(|v| v.as_str()) {
468| | // Accept either "type" or "entity_type" from the LLM payload
469| | // and fall back to "concept" when the LLM omits it.
470| 2| let entity_type_str = e
471| 2| .get("type")
472| 2| .or_else(|| e.get("entity_type"))
^0^0
473| 2| .and_then(|v| v.as_str())
474| 2| .unwrap_or("concept");
475| 2| let entity_type = serde_json::from_value::<crate::entity_type::EntityType>(
476| 2| serde_json::Value::String(entity_type_str.to_string()),
477| | )
478| 2| .unwrap_or(crate::entity_type::EntityType::Concept);
479| 2| entities.push(NewEntity {
480| 2| name: name.to_string(),
481| 2| entity_type,
482| 2| description: None,
483| 2| });
484| 0| }
485| | }
486| 0| }
487| |
488| 3| let mut relationships: Vec<NewRelationship> = Vec::new();
489| 3| if let Some(arr) = obj.get("relationships").and_then(|v| v.as_array()) {
490| 5| for r in arr {
^2
491| 2| let from = r.get("source").or_else(|| r.get("from"));
^0^0
492| 2| let to = r.get("target").or_else(|| r.get("to"));
^0^0
493| 2| let rel = r.get("relation").and_then(|v| v.as_str());
494| 2| if let (Some(from_v), Some(to_v), Some(rel_v)) = (
495| 2| from.and_then(|v| v.as_str()),
496| 2| to.and_then(|v| v.as_str()),
497| 2| rel,
498| | ) {
499| 2| relationships.push(NewRelationship {
500| 2| source: from_v.to_string(),
501| 2| target: to_v.to_string(),
502| 2| relation: rel_v.to_string(),
503| 2| strength: r.get("strength").and_then(|v| v.as_f64()).unwrap_or(0.5),
504| 2| description: None,
505| | });
506| 0| }
507| | }
508| 0| }
509| |
510| 3| let urls: Vec<ExtractedUrl> = obj
511| 3| .get("urls")
512| 3| .and_then(|v| v.as_array())
^2^2
513| 3| .map(|arr| {
^2
514| 2| arr.iter()
515| 2| .filter_map(|u| {
^0
516| | Some(ExtractedUrl {
517| 0| url: u.get("url")?.as_str()?.to_string(),
518| 0| offset: u.get("offset").and_then(|v| v.as_u64()).unwrap_or(0) as usize,
519| | })
520| 0| })
521| 2| .collect()
522| 2| })
523| 3| .unwrap_or_default();
524| |
525| | Ok(ExtractionResult {
526| 3| entities,
527| 3| relationships,
528| 3| relationships_truncated: obj
529| 3| .get("relationships_truncated")
530| 3| .and_then(|v| v.as_bool())
^0^0
531| 3| .unwrap_or(false),
532| 3| extraction_method: obj
533| 3| .get("extraction_method")
534| 3| .and_then(|v| v.as_str())
535| 3| .unwrap_or("codex")
536| 3| .to_string(),
537| 3| urls,
538| | })
539| 3|}
540| |
541| |#[cfg(test)]
542| |mod tests {
543| | use super::*;
544| |
545| | const SAMPLE_JSONL: &str = r#"{"type":"thread.started","thread_id":"abc"}
546| |{"type":"turn.started"}
547| |{"type":"item.completed","item":{"type":"reasoning","text":"thinking"}}
548| |{"type":"item.completed","item":{"type":"agent_message","text":"{\"entities\":[{\"name\":\"alpha\",\"type\":\"concept\"}],\"relationships\":[{\"source\":\"alpha\",\"target\":\"beta\",\"relation\":\"uses\",\"strength\":0.7}],\"extraction_method\":\"codex\",\"urls\":[]}"}}
549| |{"type":"turn.completed","usage":{"input_tokens":120,"output_tokens":45}}
550| |{"type":"turn.completed","usage":{}}
551| |"#;
552| |
553| | #[test]
554| 1| fn parse_codex_jsonl_extracts_last_agent_message() {
555| 1| let result = parse_codex_jsonl(SAMPLE_JSONL).expect("parse must succeed");
556| 1| assert_eq!(result.extraction.entities.len(), 1);
557| 1| assert_eq!(result.extraction.entities[0].name, "alpha");
558| 1| assert_eq!(result.extraction.relationships.len(), 1);
559| 1| assert_eq!(result.extraction.relationships[0].relation, "uses");
560| 1| assert!((result.extraction.relationships[0].strength - 0.7).abs() < 1e-6);
561| 1| }
562| |
563| | #[test]
564| 1| fn parse_codex_jsonl_collects_usage() {
565| 1| let result = parse_codex_jsonl(SAMPLE_JSONL).expect("parse must succeed");
566| 1| let usage = result.usage.expect("usage must be populated");
567| 1| assert_eq!(usage.input_tokens, 120);
568| 1| assert_eq!(usage.output_tokens, 45);
569| 1| }
570| |
571| | #[test]
572| 1| fn parse_codex_jsonl_detects_rate_limit() {
573| 1| let r = parse_codex_jsonl(
574| 1| "{\"type\":\"turn.failed\",\"error\":{\"message\":\"rate_limit: 429 too many\"}}\n{\"type\":\"item.completed\",\"item\":{\"type\":\"agent_message\",\"text\":\"{}\"}}",
575| | );
576| 1| assert!(matches!(r, Err(AppError::Validation(_))));
^0
577| 1| }
578| |
579| | #[test]
580| 1| fn parse_codex_jsonl_handles_no_agent_message() {
581| 1| let r = parse_codex_jsonl("{\"type\":\"thread.started\"}");
582| 1| assert!(matches!(r, Err(AppError::Validation(_))));
^0
583| 1| }
584| |
585| | #[test]
586| 1| fn parse_codex_jsonl_skips_malformed_lines() {
587| 1| let r = parse_codex_jsonl(
588| 1| "{not valid json\n{\"type\":\"item.completed\",\"item\":{\"type\":\"agent_message\",\"text\":\"{\\\"entities\\\":[],\\\"relationships\\\":[],\\\"extraction_method\\\":\\\"codex\\\"}\"}}",
589| | );
590| 1| assert!(r.is_ok(), "malformed lines must be skipped, got {r:?}");
^0
591| 1| }
592| |
593| | #[test]
594| 1| fn validate_codex_model_accepts_known() {
595| 1| assert!(validate_codex_model(Some("gpt-5.5")).is_ok());
596| 1| assert!(validate_codex_model(Some("gpt-5.4")).is_ok());
597| 1| assert!(validate_codex_model(None).is_ok()); // no override
598| 1| }
599| |
600| | #[test]
601| 1| fn validate_codex_model_rejects_unknown() {
602| 1| let err = validate_codex_model(Some("gpt-4")).unwrap_err();
603| 1| let msg = format!("{err}");
604| 1| assert!(msg.contains("not supported"));
605| 1| assert!(msg.contains("gpt-5.5"));
606| 1| }
607| |
608| | #[test]
609| 1| fn list_codex_models_includes_all_static_whitelist() {
610| 1| let models = list_codex_models();
611| 6| for m in CODEX_PRO_OAUTH_MODELS {
^5
612| 5| assert!(models.contains(&m.to_string()), "missing {m} in {models:?}");
^0
613| | }
614| 1| }
615| |
616| | #[test]
617| 1| fn suggest_codex_model_substring_match() {
618| 1| let s = suggest_codex_model("gpt-5");
619| 1| assert!(s.is_some(), "must suggest a gpt-5.x model");
^0
620| 1| }
621| |
622| | #[test]
623| 1| fn suggest_codex_model_fuzzy_match() {
624| | // 'gpt5.5' has no hyphen; should still suggest 'gpt-5.5'.
625| 1| let s = suggest_codex_model("gpt5.5");
626| 1| assert!(s.is_some(), "fuzzy must suggest gpt-5.5 for 'gpt5.5'");
^0
627| 1| assert_eq!(s.unwrap(), "gpt-5.5");
628| 1| }
629| |
630| | #[test]
631| 1| fn suggest_codex_model_unrelated_returns_none() {
632| 1| let s = suggest_codex_model("totally-unrelated-zzz");
633| 1| assert!(s.is_none());
634| 1| }
635| |
636| | #[test]
637| 1| fn build_codex_command_includes_hardening_flags() {
638| 1| let args = CodexSpawnArgs {
639| 1| binary: Path::new("/bin/true"),
640| 1| prompt: "p",
641| 1| json_schema: "{}",
642| 1| input_text: "i",
643| 1| model: Some("gpt-5.5"),
644| 1| timeout_secs: 60,
645| 1| schema_path: std::env::temp_dir().join("test-schema.json"),
646| 1| };
647| 1| let cmd = build_codex_command(&args);
648| 1| let collected: Vec<String> = cmd
649| 1| .get_args()
650| 17| .filter_map(|a| a.to_str().map(|s| s.to_string()))
^1
651| 1| .collect();
652| 13| for required in &[
^12
653| 13| "exec",
654| 13| "--json",
655| 13| "--output-schema",
656| 13| "--ephemeral",
657| 13| "--skip-git-repo-check",
658| 13| "--sandbox",
659| 13| "read-only",
660| 13| "--ignore-user-config",
661| 13| "--ignore-rules",
662| 13| "-m",
663| 13| "gpt-5.5",
664| 13| "-",
665| 13| ] {
666| 12| assert!(
667| 115| collected.iter().any(|a| a == required),
^12 ^12
668| 0| "missing flag {required} in {collected:?}"
669| | );
670| | }
671| 1| }
672| |
673| | #[test]
674| 1| fn list_codex_models_dedupes_with_cache_file() {
675| | // Ensure the union with the cache file (when present) does not
676| | // produce duplicates. We can't actually write a cache file in
677| | // a test, so we just verify the static path is dedup'd.
678| 1| let models = list_codex_models();
679| 1| let unique: std::collections::HashSet<_> = models.iter().collect();
680| 1| assert_eq!(unique.len(), models.len(), "list_codex_models must dedupe");
^0
681| 1| }
682| |
683| | /// OAuth-only conformance test (gaps.md:41-49, v1.0.69 mandate).
684| | /// Verifies that `build_codex_command` always emits `-c mcp_servers='{}'`,
685| | /// `--ignore-user-config`, `--ask-for-approval never` and does NOT
686| | /// whitelist `OPENAI_API_KEY` in the env_clear whitelist.
687| | #[test]
688| | #[serial_test::serial(env)]
689| 1| fn build_command_oauth_only_mandatory_flags() {
690| | // SAFETY: unit test
691| 1| unsafe {
692| 1| std::env::remove_var("OPENAI_API_KEY");
693| 1| }
694| 1| let schema = std::env::temp_dir().join("codex-test-schema.json");
695| 1| let _ = std::fs::remove_file(&schema);
696| 1| let args = CodexSpawnArgs {
697| 1| binary: std::path::Path::new("/usr/bin/false"),
698| 1| prompt: "p",
699| 1| json_schema: "{}",
700| 1| input_text: "i",
701| 1| model: Some("gpt-5.4-mini"),
702| 1| timeout_secs: 60,
703| 1| schema_path: schema.clone(),
704| 1| };
705| 1| let cmd = build_codex_command(&args);
706| 17| let argv: Vec<&str> = cmd.get_args().filter_map(|a| a.to_str()).collect();
^1 ^1 ^1 ^1 ^1 ^1
707| | // Mandatory flags from gaps.md lines 233-238
708| 1| assert!(argv.contains(&"-c"), "must have -c (gaps.md:234)");
^0
709| 1| assert!(
710| 1| argv.contains(&"mcp_servers='{}'"),
711| 0| "must have mcp_servers override (gaps.md:234)"
712| | );
713| 1| assert!(
714| 1| argv.contains(&"--ignore-user-config"),
715| 0| "must have --ignore-user-config (gaps.md:266)"
716| | );
717| 1| assert!(
718| 1| argv.contains(&"--ask-for-approval"),
719| 0| "must have --ask-for-approval never (gaps.md:237)"
720| | );
721| 1| assert!(
722| 1| argv.contains(&"--sandbox"),
723| 0| "must have --sandbox read-only (G31)"
724| | );
725| 1| assert!(argv.contains(&"--ephemeral"), "must have --ephemeral (G31)");
^0
726| 1| assert!(
727| 1| argv.contains(&"--skip-git-repo-check"),
728| 0| "must have --skip-git-repo-check (G31)"
729| | );
730| 1| assert!(
731| 1| argv.contains(&"--ignore-rules"),
732| 0| "must have --ignore-rules (G31)"
733| | );
734| | }
735| |
736| | /// OAuth-only guard: when `OPENAI_API_KEY` is in the environment,
737| | /// `build_codex_command` MUST abort the spawn (return a `false`
738| | /// command), NOT pass the key through to the child.
739| | #[test]
740| | #[serial_test::serial(env)]
741| 1| fn build_command_aborts_when_openai_api_key_set() {
742| | // SAFETY: unit test
743| 1| unsafe {
744| 1| std::env::set_var("OPENAI_API_KEY", "sk-violation-test");
745| 1| }
746| 1| let schema = std::env::temp_dir().join("codex-test-schema-abort.json");
747| 1| let _ = std::fs::remove_file(&schema);
748| 1| let args = CodexSpawnArgs {
749| 1| binary: std::path::Path::new("/usr/bin/codex"),
750| 1| prompt: "p",
751| 1| json_schema: "{}",
752| 1| input_text: "i",
753| 1| model: Some("gpt-5.4-mini"),
754| 1| timeout_secs: 60,
755| 1| schema_path: schema.clone(),
756| 1| };
757| 1| let cmd = build_codex_command(&args);
758| 1| let program = cmd.get_program().to_string_lossy().to_string();
759| 1| let argv: Vec<&str> = cmd.get_args().filter_map(|a| a.to_str()).collect();
760| 1| assert_eq!(
761| | program, "false",
762| 0| "when OPENAI_API_KEY is set, build_codex_command must abort"
763| | );
764| 1| assert!(
765| 1| argv.contains(&"--oauth-only-violation-openai-api-key-set"),
766| 0| "aborted command must carry violation marker"
767| | );
768| 1| unsafe {
769| 1| std::env::remove_var("OPENAI_API_KEY");
770| 1| }
771| | }
772| |}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/completions.rs:
1| |//! Shell completion script generation.
2| |
3| |use clap::CommandFactory;
4| |use clap_complete::{generate, Shell};
5| |
6| |#[derive(clap::Args, Debug)]
7| |pub struct CompletionsArgs {
8| | /// Shell to generate completions for
9| | #[arg(value_enum)]
10| | pub shell: Shell,
11| |}
12| |
13| 0|pub fn run(args: CompletionsArgs) -> Result<(), crate::errors::AppError> {
14| 0| let mut cmd = crate::cli::Cli::command();
15| 0| let bin_name = cmd.get_name().to_string();
16| 0| generate(args.shell, &mut cmd, bin_name, &mut std::io::stdout());
17| 0| Ok(())
18| 0|}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/daemon.rs:
1| |use crate::constants::DAEMON_IDLE_SHUTDOWN_SECS;
2| |use crate::errors::AppError;
3| |use crate::output;
4| |use crate::paths::AppPaths;
5| |
6| |#[derive(clap::Args)]
7| |#[command(after_long_help = "EXAMPLES:\n \
8| | # Start the embedding daemon in the foreground (default 600s idle timeout)\n \
9| | sqlite-graphrag daemon\n\n \
10| | # Start with a longer idle timeout for batch ingestion\n \
11| | sqlite-graphrag daemon --idle-shutdown-secs 3600\n\n \
12| | # Health-check a running daemon (exit 4 if not running)\n \
13| | sqlite-graphrag daemon --ping\n\n \
14| | # Request graceful shutdown of a running daemon\n \
15| | sqlite-graphrag daemon --stop\n\n\
16| |AUTO-SPAWN BEHAVIOR:\n \
17| | recall and hybrid-search spawn a daemon automatically when none is running,\n \
18| | amortising model warm-up across multiple invocations (idle timeout 600s).\n\n \
19| | Disable per-invocation: sqlite-graphrag recall \"query\" --autostart-daemon=false\n \
20| | Disable globally: export SQLITE_GRAPHRAG_DAEMON_DISABLE_AUTOSTART=1\n\n \
21| | The --autostart-daemon flag takes precedence over the env var.")]
22| |pub struct DaemonArgs {
23| | /// Idle timeout in seconds before the daemon auto-shuts down to release the embedding model.
24| | /// Default 600s; raise for long-running batch ingestion to avoid cold-start overhead.
25| | #[arg(long, default_value_t = DAEMON_IDLE_SHUTDOWN_SECS)]
26| | pub idle_shutdown_secs: u64,
27| | /// Send a health-check ping to a running daemon and exit. Returns NotFound (exit 4) if no daemon.
28| | #[arg(long)]
29| | pub ping: bool,
30| | /// Request graceful shutdown of a running daemon. Returns NotFound (exit 4) if no daemon.
31| | #[arg(long)]
32| | pub stop: bool,
33| | /// Timeout in seconds for graceful shutdown drain of active requests.
34| | #[arg(
35| | long,
36| | env = "SQLITE_GRAPHRAG_SHUTDOWN_TIMEOUT_SECS",
37| | default_value_t = 10
38| | )]
39| | pub shutdown_timeout_secs: u64,
40| | #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
41| | pub json: bool,
42| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
43| | pub db: Option<String>,
44| |}
45| |
46| 0|pub fn run(args: DaemonArgs) -> Result<(), AppError> {
47| 0| let _ = args.json;
48| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
49| 0| paths.ensure_dirs()?;
50| |
51| 0| if args.ping {
52| 0| let response = crate::daemon::try_ping(&paths.models)?
53| 0| .ok_or_else(|| AppError::NotFound("daemon not running".to_string()))?;
54| 0| if let crate::daemon::DaemonResponse::Ok { ref version, .. } = response {
55| 0| if version != crate::constants::SQLITE_GRAPHRAG_VERSION {
56| 0| tracing::warn!(target: "daemon_cmd",
57| | daemon_version = %version,
58| | cli_version = crate::constants::SQLITE_GRAPHRAG_VERSION,
59| 0| "daemon version mismatch; auto-restart will occur on the next embedding request"
60| | );
61| 0| }
62| 0| }
63| 0| output::emit_json(&response)?;
64| 0| return Ok(());
65| 0| }
66| |
67| 0| if args.stop {
68| 0| let response = crate::daemon::try_shutdown(&paths.models)?
69| 0| .ok_or_else(|| AppError::NotFound("daemon not running".to_string()))?;
70| 0| output::emit_json(&response)?;
71| 0| return Ok(());
72| 0| }
73| |
74| 0| crate::daemon::run(
75| 0| &paths.models,
76| 0| args.idle_shutdown_secs,
77| 0| args.shutdown_timeout_secs,
78| | )
79| 0|}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/debug_schema.rs:
1| |//! Handler for the `debug-schema` CLI subcommand.
2| |
3| |use crate::errors::AppError;
4| |use crate::output;
5| |use crate::paths::AppPaths;
6| |use crate::storage::connection::open_ro;
7| |use serde::Serialize;
8| |use std::time::Instant;
9| |
10| |#[derive(clap::Args)]
11| |#[command(after_long_help = "EXAMPLES:\n \
12| | # Dump the SQLite schema (tables, indices, triggers) as JSON\n \
13| | sqlite-graphrag __debug_schema\n\n \
14| | # Dump schema of a database at a custom path\n \
15| | sqlite-graphrag __debug_schema --db /path/to/graphrag.sqlite\n\n \
16| | # Use SQLITE_GRAPHRAG_DB_PATH env var\n \
17| | SQLITE_GRAPHRAG_DB_PATH=/data/graphrag.sqlite sqlite-graphrag __debug_schema")]
18| |pub struct DebugSchemaArgs {
19| | #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
20| | pub json: bool,
21| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
22| | pub db: Option<String>,
23| |}
24| |
25| |#[derive(Serialize)]
26| |struct SchemaObject {
27| | name: String,
28| | #[serde(rename = "type")]
29| | object_type: String,
30| |}
31| |
32| |#[derive(Serialize)]
33| |struct MigrationRecord {
34| | version: i64,
35| | name: String,
36| | applied_on: String,
37| |}
38| |
39| |#[derive(Serialize)]
40| |struct DebugSchemaResponse {
41| | /// Internal SQLite counter incremented on each DDL (PRAGMA schema_version).
42| | /// Distinct from `user_version`: this one is managed automatically by SQLite.
43| | schema_version: i64,
44| | /// Canonical SCHEMA_USER_VERSION value set explicitly by migrations
45| | /// (PRAGMA user_version). Distinct from `schema_version` (SQLite DDL counter)
46| | /// and from `health.schema_version` (MAX version in refinery_schema_history).
47| | user_version: i64,
48| | objects: Vec<SchemaObject>,
49| | migrations: Vec<MigrationRecord>,
50| | elapsed_ms: u64,
51| |}
52| |
53| 0|pub fn run(args: DebugSchemaArgs) -> Result<(), AppError> {
54| 0| let inicio = Instant::now();
55| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
56| |
57| 0| crate::storage::connection::ensure_db_ready(&paths)?;
58| |
59| 0| let conn = open_ro(&paths.db)?;
60| |
61| 0| let schema_version: i64 = conn
62| 0| .query_row("PRAGMA schema_version", [], |r| r.get(0))
63| 0| .unwrap_or(0);
64| |
65| | // PRAGMA user_version is set explicitly after migrations (canonical value SCHEMA_USER_VERSION).
66| 0| let user_version: i64 = conn
67| 0| .query_row("PRAGMA user_version", [], |r| r.get(0))
68| 0| .unwrap_or(0);
69| |
70| 0| let mut stmt = conn.prepare_cached(
71| 0| "SELECT name, type FROM sqlite_master \
72| 0| WHERE type IN ('table','view','trigger','index') \
73| 0| ORDER BY type, name",
74| 0| )?;
75| 0| let objects: Vec<SchemaObject> = stmt
76| 0| .query_map([], |r| {
77| | Ok(SchemaObject {
78| 0| name: r.get(0)?,
79| 0| object_type: r.get(1)?,
80| | })
81| 0| })?
82| 0| .collect::<Result<Vec<_>, _>>()?;
83| |
84| 0| let existe_hist: i64 = conn
85| 0| .query_row(
86| 0| "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='refinery_schema_history'",
87| 0| [],
88| 0| |r| r.get(0),
89| | )
90| 0| .unwrap_or(0);
91| |
92| 0| let migrations: Vec<MigrationRecord> = if existe_hist > 0 {
93| 0| let mut stmt_mig = conn.prepare_cached(
94| 0| "SELECT version, name, applied_on \
95| 0| FROM refinery_schema_history \
96| 0| ORDER BY version",
97| 0| )?;
98| 0| let rows: Vec<MigrationRecord> = stmt_mig
99| 0| .query_map([], |r| {
100| | Ok(MigrationRecord {
101| 0| version: r.get(0)?,
102| 0| name: r.get(1)?,
103| 0| applied_on: r.get(2)?,
104| | })
105| 0| })?
106| 0| .collect::<Result<Vec<_>, _>>()?;
107| 0| rows
108| | } else {
109| 0| Vec::new()
110| | };
111| |
112| 0| let elapsed_ms = inicio.elapsed().as_millis() as u64;
113| |
114| 0| output::emit_json(&DebugSchemaResponse {
115| 0| schema_version,
116| 0| user_version,
117| 0| objects,
118| 0| migrations,
119| 0| elapsed_ms,
120| 0| })?;
121| |
122| 0| Ok(())
123| 0|}
124| |
125| |#[cfg(test)]
126| |mod tests {
127| | use super::*;
128| | use serde_json::Value;
129| |
130| | #[test]
131| 1| fn debug_schema_response_serializes_required_fields() {
132| 1| let resp = DebugSchemaResponse {
133| 1| schema_version: 42,
134| 1| user_version: 49,
135| 1| objects: vec![SchemaObject {
136| 1| name: "memories".to_string(),
137| 1| object_type: "table".to_string(),
138| 1| }],
139| 1| migrations: vec![MigrationRecord {
140| 1| version: 1,
141| 1| name: "V001__init".to_string(),
142| 1| applied_on: "2026-01-01T00:00:00Z".to_string(),
143| 1| }],
144| 1| elapsed_ms: 7,
145| 1| };
146| 1| let json: Value = serde_json::to_value(&resp).unwrap();
147| 1| assert_eq!(json["schema_version"], 42);
148| 1| assert_eq!(json["user_version"], 49);
149| 1| assert!(json["objects"].is_array());
150| 1| assert_eq!(json["objects"][0]["name"], "memories");
151| 1| assert_eq!(json["objects"][0]["type"], "table");
152| 1| assert!(json["migrations"].is_array());
153| 1| assert_eq!(json["migrations"][0]["version"], 1);
154| 1| assert_eq!(json["elapsed_ms"], 7);
155| 1| }
156| |
157| | #[test]
158| 1| fn schema_object_renomeia_campo_type() {
159| 1| let obj = SchemaObject {
160| 1| name: "entities".to_string(),
161| 1| object_type: "table".to_string(),
162| 1| };
163| 1| let json: Value = serde_json::to_value(&obj).unwrap();
164| 1| assert!(json.get("object_type").is_none());
165| 1| assert_eq!(json["type"], "table");
166| 1| }
167| |
168| | #[test]
169| 1| fn migration_record_serializes_all_fields() {
170| 1| let rec = MigrationRecord {
171| 1| version: 3,
172| 1| name: "V003__indexes".to_string(),
173| 1| applied_on: "2026-04-19T12:00:00Z".to_string(),
174| 1| };
175| 1| let json: Value = serde_json::to_value(&rec).unwrap();
176| 1| assert_eq!(json["version"], 3);
177| 1| assert_eq!(json["name"], "V003__indexes");
178| 1| assert_eq!(json["applied_on"], "2026-04-19T12:00:00Z");
179| 1| }
180| |}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/deep_research.rs:
1| |//! Handler for the `deep-research` CLI subcommand.
2| |//!
3| |//! Orchestrates parallel multi-hop GraphRAG search via query decomposition.
4| |//! The workload is I/O-bound (SQLite WAL reads), so tokio is used instead of
5| |//! rayon. Each sub-query opens its own read-only connection.
6| |
7| |use crate::errors::AppError;
8| |use crate::graph::{
9| | bfs_with_predecessors, traverse_from_memories_with_hops_capped, PredecessorMap,
10| |};
11| |use crate::output;
12| |use crate::paths::AppPaths;
13| |use crate::storage::connection::open_ro;
14| |use crate::storage::fusion::{rrf_fuse, rrf_max_possible};
15| |use crate::storage::{entities, memories};
16| |
17| |use serde::Serialize;
18| |use std::collections::HashSet;
19| |use std::sync::Arc;
20| |use tokio::sync::Semaphore;
21| |use tokio::task::JoinSet;
22| |
23| |/// Arguments for the `deep-research` subcommand.
24| |#[derive(clap::Args)]
25| |#[command(
26| | about = "Deep parallel multi-hop GraphRAG research via query decomposition",
27| | after_long_help = "EXAMPLES:\n \
28| | # Basic deep research\n \
29| | sqlite-graphrag deep-research \"auth architecture decisions\"\n\n \
30| | # With custom parameters\n \
31| | sqlite-graphrag deep-research \"auth\" --k 20 --max-hops 3 --max-sub-queries 7\n\n \
32| | # Include full memory bodies in output\n \
33| | sqlite-graphrag deep-research \"auth\" --with-bodies\n\n \
34| | # Tune RRF and graph scoring\n \
35| | sqlite-graphrag deep-research \"auth and deployment\" --rrf-k 60 --graph-decay 0.7"
36| |)]
37| |pub struct DeepResearchArgs {
38| | /// Research query to decompose and search.
39| | #[arg(
40| | value_name = "QUERY",
41| | allow_hyphen_values = true,
42| | help = "Research query to decompose and search"
43| | )]
44| | pub query: String,
45| | /// Results per sub-query (Recall@20 captures 95%+ relevant hits).
46| | #[arg(
47| | long,
48| | short,
49| | aliases = ["limit", "top-k"],
50| | default_value_t = 20,
51| | help = "Results per sub-query (Recall@20 captures 95%+ relevant hits)"
52| | )]
53| | pub k: usize,
54| | /// Maximum sub-queries from decomposition (covers complex multi-hop queries).
55| | #[arg(
56| | long,
57| | default_value_t = 7,
58| | help = "Maximum sub-queries (covers complex multi-hop queries)"
59| | )]
60| | pub max_sub_queries: usize,
61| | /// Multi-hop graph traversal depth (sweet spot: 2-3 hops).
62| | #[arg(
63| | long,
64| | default_value_t = 3,
65| | help = "Multi-hop graph traversal depth (sweet spot: 2-3 hops)"
66| | )]
67| | pub max_hops: usize,
68| | /// Minimum edge weight for graph traversal.
69| | #[arg(
70| | long,
71| | default_value_t = 0.3,
72| | help = "Minimum edge weight for graph traversal"
73| | )]
74| | pub min_weight: f64,
75| | /// Maximum concurrent sub-queries (default: min(cpus, 8)).
76| | #[arg(long, help = "Maximum concurrent sub-queries (default: min(cpus, 8))")]
77| | pub max_concurrency: Option<usize>,
78| | /// Timeout per sub-query in seconds.
79| | #[arg(long, default_value_t = 30, help = "Timeout per sub-query in seconds")]
80| | pub timeout: u64,
81| | /// Include full memory bodies in results.
82| | #[arg(
83| | long,
84| | default_value_t = false,
85| | help = "Include full memory bodies in results"
86| | )]
87| | pub with_bodies: bool,
88| | /// Maximum results after deduplication.
89| | #[arg(
90| | long,
91| | default_value_t = 50,
92| | help = "Maximum results after deduplication"
93| | )]
94| | pub max_results: usize,
95| | /// RRF k parameter controlling score smoothing (higher = less weight on top ranks).
96| | #[arg(
97| | long,
98| | default_value_t = 60.0,
99| | help = "RRF k parameter (higher = less weight on top ranks)"
100| | )]
101| | pub rrf_k: f64,
102| | /// Decay factor applied to graph scores per hop (score = seed_score * decay^hop).
103| | #[arg(
104| | long,
105| | default_value_t = 0.7,
106| | help = "Graph score decay factor per hop (0.0-1.0)"
107| | )]
108| | pub graph_decay: f64,
109| | /// Minimum score threshold for graph-expanded results (filters noise).
110| | #[arg(
111| | long,
112| | default_value_t = 0.05,
113| | help = "Minimum score threshold for graph-expanded results"
114| | )]
115| | pub graph_min_score: f64,
116| | /// Limit top-k neighbours followed per entity per hop (None = unlimited).
117| | #[arg(
118| | long,
119| | help = "Limit neighbours per entity per hop for graph traversal (default: unlimited)"
120| | )]
121| | pub max_neighbors_per_hop: Option<usize>,
122| | /// Namespace (env: SQLITE_GRAPHRAG_NAMESPACE, default: global).
123| | #[arg(
124| | long,
125| | help = "Namespace (env: SQLITE_GRAPHRAG_NAMESPACE, default: global)"
126| | )]
127| | pub namespace: Option<String>,
128| | /// Research mode: `none` (local heuristic, default), `claude-code`, `codex` (v1.1.0).
129| | #[arg(long, default_value = "none", value_parser = ["none"], hide = true)]
130| | pub mode: String,
131| | /// Maximum LLM cost in USD (effective with --mode claude-code/codex, reserved for v1.1.0).
132| | #[arg(
133| | long,
134| | value_name = "USD",
135| | help = "Max LLM cost in USD (effective with --mode claude-code/codex)"
136| | )]
137| | pub max_cost_usd: Option<f64>,
138| | /// JSON output (always on, kept for consistency).
139| | #[arg(long, hide = true)]
140| | pub json: bool,
141| | /// Database path.
142| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
143| | pub db: Option<String>,
144| | #[command(flatten)]
145| | pub daemon: crate::cli::DaemonOpts,
146| |}
147| |
/// One sub-query produced by query decomposition, serialized in the response.
#[derive(Serialize)]
struct SubQuery {
    /// Zero-based position of this sub-query in the decomposition output.
    id: usize,
    /// Text that is embedded and searched for this sub-query.
    text: String,
    /// `"original"` when decomposition yielded a single sub-query,
    /// `"decomposed"` otherwise.
    source: &'static str,
}
154| |
/// A single ranked memory in the final response, after merging all sub-queries.
#[derive(Serialize)]
struct DeepResult {
    /// Memory name resolved from the memory id.
    name: String,
    /// Best score observed for this memory across the sub-queries that hit it.
    score: f64,
    /// Label of the retrieval path that produced the best score
    /// (`"hybrid"`, `"knn"`, `"fts"` or `"graph"`).
    source: String,
    /// Ids of the sub-queries that returned this memory.
    sub_query_ids: Vec<usize>,
    /// Leading characters of the memory body (up to 300 chars).
    snippet: String,
    /// Full memory body; only populated when `--with-bodies` is set.
    #[serde(skip_serializing_if = "Option::is_none")]
    body: Option<String>,
    /// Hop count for graph-expanded hits; `None` for direct search hits.
    hop_distance: Option<usize>,
}
166| |
/// A node in a reconstructed evidence path.
#[derive(Serialize, Clone)]
struct EvidenceNode {
    /// Entity name, or `entity-<id>` when the name could not be resolved.
    entity: String,
    /// Relation recorded in the BFS predecessor entry for this node;
    /// `None` for the seed node that starts the path.
    #[serde(skip_serializing_if = "Option::is_none")]
    relation: Option<String>,
    /// Edge weight recorded in the BFS predecessor entry for this node;
    /// `None` for the seed node that starts the path.
    #[serde(skip_serializing_if = "Option::is_none")]
    weight: Option<f64>,
}
176| |
/// A directed evidence chain reconstructed from BFS predecessors.
///
/// Fields:
/// - `from`: name of the seed (source) entity, i.e. the first node of `path`.
/// - `to`: name of the terminal (target) entity, i.e. the last node of `path`.
/// - `path`: ordered list of nodes from `from` to `to`, both endpoints included.
/// - `total_weight`: product of edge weights along the path.
/// - `depth`: number of nodes in `path` (not the number of edges).
/// - `sub_query_ids`: which sub-queries produced this chain.
#[derive(Serialize)]
struct EvidenceChain {
    from: String,
    to: String,
    path: Vec<EvidenceNode>,
    total_weight: f64,
    depth: usize,
    sub_query_ids: Vec<usize>,
}
194| |
/// Execution counters reported alongside the results.
#[derive(Serialize)]
struct ResearchStats {
    /// Number of sub-queries produced by decomposition.
    sub_queries_total: usize,
    /// Sub-queries that returned a result.
    sub_queries_completed: usize,
    /// Sub-queries that failed for any reason other than a timeout
    /// (including panicked or cancelled tasks).
    sub_queries_failed: usize,
    /// Sub-queries reported as timed out.
    sub_queries_timed_out: usize,
    /// Number of memories in the final (truncated) result list.
    unique_memories_found: usize,
    /// Number of evidence chains in the response.
    evidence_chains_found: usize,
    /// Wall-clock time of the whole command, in milliseconds.
    elapsed_ms: u64,
}
205| |
/// An entity whose name matches one of the result memories.
#[derive(Serialize)]
struct GraphContextEntity {
    /// Entity name (same as the matching result name).
    name: String,
    /// Entity type from the `entities` table; `"concept"` when missing.
    entity_type: String,
    /// Number of relationships in which the entity is source or target.
    degree: u32,
}
212| |
/// A relationship whose endpoints are both graph-context entities.
#[derive(Serialize)]
struct GraphContextRel {
    /// Name of the source entity.
    from: String,
    /// Name of the target entity.
    to: String,
    /// Relation label stored on the relationship row.
    relation: String,
    /// Weight stored on the relationship row.
    weight: f64,
}
220| |
/// Entities and relationships surrounding the result memories.
#[derive(Serialize)]
struct GraphContext {
    /// Entities matched by result name, each listed once.
    entities: Vec<GraphContextEntity>,
    /// Relationships between those entities (the query is capped at 50 rows).
    relationships: Vec<GraphContextRel>,
}
226| |
/// Top-level JSON document emitted by the deep-research command.
#[derive(Serialize)]
struct DeepResearchResponse {
    /// The query exactly as supplied by the caller.
    query: String,
    /// Sub-queries derived from `query`.
    sub_queries: Vec<SubQuery>,
    /// Merged, ranked and truncated memory hits.
    results: Vec<DeepResult>,
    /// De-duplicated evidence chains, sorted by descending total weight.
    evidence_chains: Vec<EvidenceChain>,
    /// Graph context for the results; omitted when no entity matched.
    #[serde(skip_serializing_if = "Option::is_none")]
    graph_context: Option<GraphContext>,
    /// Execution counters.
    stats: ResearchStats,
}
237| |
/// Aggregated hit data: (score, source_label, snippet, body, hop_distance, sub_query_ids).
///
/// The first five fields come from the best-scoring occurrence of the memory;
/// `sub_query_ids` accumulates every sub-query that returned it.
type MergedHit = (f64, String, String, String, Option<usize>, Vec<usize>);
240| |
/// Intermediate result from a single sub-query execution.
struct SubQueryResult {
    /// Id of the sub-query that produced this result.
    sub_query_id: usize,
    /// (memory_id, score, source_label, snippet, body, hop_distance)
    hits: Vec<(i64, f64, String, String, String, Option<usize>)>,
    /// Evidence chains reconstructed from BFS.
    chains: Vec<EvidenceChain>,
}
249| |
250| |/// Sync entry point — builds a tokio runtime for the async fan-out.
251| |#[tracing::instrument(skip_all, level = "debug", name = "deep_research")]
252| 0|pub fn run(args: DeepResearchArgs) -> Result<(), AppError> {
253| 0| tracing::debug!(target: "deep_research", query = %args.query, k = args.k, "starting deep research");
254| 0| let rt = tokio::runtime::Builder::new_multi_thread()
255| 0| .worker_threads(2)
256| 0| .enable_all()
257| 0| .build()
258| 0| .map_err(|e| AppError::Internal(anyhow::anyhow!("failed to build tokio runtime: {e}")))?;
259| 0| rt.block_on(run_async(args))
260| 0|}
261| |
262| |/// Main async logic: decompose, fan-out, assemble, emit JSON.
263| 0|async fn run_async(args: DeepResearchArgs) -> Result<(), AppError> {
264| 0| let start = std::time::Instant::now();
265| |
266| 0| if args.query.trim().is_empty() {
267| 0| return Err(AppError::Validation(crate::i18n::validation::empty_query()));
268| 0| }
269| |
270| 0| if args.max_cost_usd.is_some() && args.mode == "none" {
271| 0| tracing::warn!(target: "deep_research", "--max-cost-usd has no effect without --mode claude-code/codex");
272| 0| }
273| |
274| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
275| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
276| 0| crate::storage::connection::ensure_db_ready(&paths)?;
277| |
278| | // Phase 1: Query decomposition (sync, pure logic).
279| 0| let sub_query_texts = decompose_query(&args.query, args.max_sub_queries);
280| 0| let sub_queries: Vec<SubQuery> = sub_query_texts
281| 0| .iter()
282| 0| .enumerate()
283| 0| .map(|(i, text)| SubQuery {
284| 0| id: i,
285| 0| text: text.clone(),
286| 0| source: if sub_query_texts.len() == 1 {
287| 0| "original"
288| | } else {
289| 0| "decomposed"
290| | },
291| 0| })
292| 0| .collect();
293| |
294| | // GAP-07 FIX: compute ONE embedding PER sub-query text (sequential — daemon serialises).
295| | // The previous code used a single embedding for args.query shared across all sub-queries,
296| | // making decomposition cosmetic. We now build a Vec<Arc<Vec<f32>>> indexed by sub-query.
297| 0| output::emit_progress_i18n(
298| 0| "Computing per-sub-query embeddings...",
299| 0| "Calculando embeddings por sub-consulta...",
300| | );
301| 0| let mut sub_embeddings: Vec<Arc<Vec<f32>>> = Vec::with_capacity(sub_query_texts.len());
302| 0| for sq_text in &sub_query_texts {
303| 0| let emb = crate::daemon::embed_query_or_local(
304| 0| &paths.models,
305| 0| sq_text,
306| 0| args.daemon.autostart_daemon,
307| 0| )?;
308| 0| sub_embeddings.push(Arc::new(emb));
309| | }
310| |
311| | // Phase 2: Fan-out — parallel sub-query execution.
312| 0| let cpu_count = std::thread::available_parallelism()
313| 0| .map(|n| n.get())
314| 0| .unwrap_or(4);
315| 0| let permits = args
316| 0| .max_concurrency
317| 0| .unwrap_or_else(|| cpu_count.min(8))
318| 0| .min(sub_queries.len())
319| 0| .max(1);
320| 0| let semaphore = Arc::new(Semaphore::new(permits));
321| 0| let timeout_dur = std::time::Duration::from_secs(args.timeout);
322| |
323| 0| let mut join_set: JoinSet<Result<SubQueryResult, (usize, String)>> = JoinSet::new();
324| |
325| 0| for (idx, sq_text) in sub_query_texts.iter().enumerate() {
326| 0| let sem = Arc::clone(&semaphore);
327| | // GAP-07 FIX: pass embedding for THIS specific sub-query.
328| 0| let emb = Arc::clone(&sub_embeddings[idx]);
329| 0| let ns = namespace.clone();
330| 0| let db_path = paths.db.clone();
331| 0| let query_text = sq_text.clone();
332| 0| let k = args.k;
333| 0| let max_hops = args.max_hops;
334| 0| let min_weight = args.min_weight;
335| 0| let rrf_k = args.rrf_k;
336| 0| let graph_decay = args.graph_decay;
337| 0| let graph_min_score = args.graph_min_score;
338| 0| let max_neighbors_per_hop = args.max_neighbors_per_hop;
339| |
340| 0| join_set.spawn(async move {
341| 0| let _permit = sem
342| 0| .acquire_owned()
343| 0| .await
344| 0| .map_err(|e| (idx, format!("semaphore closed: {e}")))?;
345| |
346| | // Dereference the Arc to obtain a &[f32] slice for the sync function.
347| 0| let result = tokio::time::timeout(timeout_dur, async move {
348| 0| execute_sub_query(
349| 0| idx,
350| 0| &query_text,
351| 0| emb.as_slice(),
352| 0| &ns,
353| 0| &db_path,
354| 0| k,
355| 0| max_hops,
356| 0| min_weight,
357| 0| rrf_k,
358| 0| graph_decay,
359| 0| graph_min_score,
360| 0| max_neighbors_per_hop,
361| | )
362| 0| })
363| 0| .await;
364| |
365| 0| match result {
366| 0| Ok(inner) => inner.map_err(|e| (idx, e)),
367| 0| Err(_) => Err((idx, "timeout".to_string())),
368| | }
369| 0| });
370| | }
371| |
372| | // Collect results incrementally.
373| 0| let mut sub_query_results: Vec<SubQueryResult> = Vec::with_capacity(sub_queries.len());
374| 0| let mut failed_count = 0usize;
375| 0| let mut timed_out_count = 0usize;
376| |
377| 0| while let Some(join_result) = join_set.join_next().await {
378| 0| match join_result {
379| 0| Ok(Ok(sqr)) => sub_query_results.push(sqr),
380| 0| Ok(Err((_idx, reason))) => {
381| 0| if reason == "timeout" {
382| 0| timed_out_count += 1;
383| 0| } else {
384| 0| failed_count += 1;
385| 0| }
386| 0| tracing::warn!(target: "deep_research", sub_query_id = _idx, reason = %reason, "sub-query failed");
387| | }
388| 0| Err(join_err) => {
389| 0| failed_count += 1;
390| 0| if join_err.is_panic() {
391| 0| tracing::error!(target: "deep_research", error = %join_err, "sub-query task panicked");
392| | } else {
393| 0| tracing::warn!(target: "deep_research", error = %join_err, "sub-query task cancelled");
394| | }
395| | }
396| | }
397| | }
398| |
399| | // Phase 3: Evidence assembly — merge, dedup, rank.
400| | // Aggregate hits: memory_id -> (best_score, source, snippet, body, hop_distance, sub_query_ids)
401| 0| let mut merged: crate::hash::AHashMap<i64, MergedHit> =
402| 0| crate::hash::AHashMap::with_capacity_and_hasher(
403| 0| sub_query_results.len() * args.k,
404| 0| Default::default(),
405| | );
406| |
407| 0| for sqr in &sub_query_results {
408| 0| for (mem_id, score, source, snippet, body, hop) in &sqr.hits {
409| 0| let entry = merged.entry(*mem_id).or_insert_with(|| {
410| 0| (
411| 0| *score,
412| 0| source.clone(),
413| 0| snippet.clone(),
414| 0| body.clone(),
415| 0| *hop,
416| 0| Vec::new(),
417| 0| )
418| 0| });
419| | // Keep best score.
420| 0| if *score > entry.0 {
421| 0| entry.0 = *score;
422| 0| entry.1 = source.clone();
423| 0| entry.2 = snippet.clone();
424| 0| entry.3 = body.clone();
425| 0| entry.4 = *hop;
426| 0| }
427| 0| if !entry.5.contains(&sqr.sub_query_id) {
428| 0| entry.5.push(sqr.sub_query_id);
429| 0| }
430| | }
431| | }
432| |
433| | // Resolve memory names for merged results.
434| 0| let conn = open_ro(&paths.db)?;
435| 0| let mut results: Vec<DeepResult> = Vec::with_capacity(merged.len().min(args.max_results));
436| |
437| | // Sort by score descending.
438| 0| let mut ranked: Vec<(i64, MergedHit)> = merged.into_iter().collect();
439| 0| ranked.sort_by(|a, b| {
440| 0| b.1 .0
441| 0| .partial_cmp(&a.1 .0)
442| 0| .unwrap_or(std::cmp::Ordering::Equal)
443| 0| });
444| 0| ranked.truncate(args.max_results);
445| |
446| 0| for (mem_id, (score, source, snippet, body, hop, sq_ids)) in ranked {
447| 0| let name = match memories::read_full(&conn, mem_id)? {
448| 0| Some(row) => row.name,
449| 0| None => continue,
450| | };
451| 0| results.push(DeepResult {
452| 0| name,
453| 0| score,
454| 0| source,
455| 0| sub_query_ids: sq_ids,
456| 0| snippet,
457| 0| body: if args.with_bodies { Some(body) } else { None },
458| 0| hop_distance: hop,
459| | });
460| | }
461| |
462| | // GAP-09/10 FIX: Collect evidence chains from reconstructed BFS paths.
463| | // The old code appended flat node pairs from a global SELECT; now each
464| | // sub-query returns directed EvidenceChain structs (from, to, path).
465| 0| let completed_count = sub_query_results.len();
466| 0| let mut evidence_chains: Vec<EvidenceChain> = Vec::with_capacity(completed_count * 2);
467| 0| let mut seen_chain_keys: HashSet<String> = HashSet::with_capacity(completed_count * 2);
468| |
469| 0| for sqr in sub_query_results {
470| 0| for chain in sqr.chains {
471| | // Deduplicate chains by (from, to) pair.
472| 0| let key = format!("{}->{}", chain.from, chain.to);
473| 0| if seen_chain_keys.insert(key) {
474| 0| evidence_chains.push(chain);
475| 0| }
476| | }
477| | }
478| |
479| | // Sort evidence chains by total_weight descending, discard single-hop trivial chains.
480| 0| evidence_chains.retain(|c| c.depth >= 2);
481| 0| evidence_chains.sort_by(|a, b| {
482| 0| b.total_weight
483| 0| .partial_cmp(&a.total_weight)
484| 0| .unwrap_or(std::cmp::Ordering::Equal)
485| 0| });
486| |
487| 0| let unique_memories = results.len();
488| 0| let evidence_count = evidence_chains.len();
489| |
490| | // MEDIUM-01b: Build graph_context with entities and relationships from result memories.
491| 0| let graph_context = if !results.is_empty() {
492| 0| let result_names: Vec<&str> = results.iter().map(|r| r.name.as_str()).collect();
493| 0| let mut ctx_entities: Vec<GraphContextEntity> = Vec::with_capacity(results.len());
494| 0| let mut ctx_rels: Vec<GraphContextRel> = Vec::with_capacity(results.len() * 2);
495| 0| let mut seen_entity_ids: crate::hash::AHashSet<i64> =
496| 0| crate::hash::AHashSet::with_capacity_and_hasher(results.len(), Default::default());
497| |
498| 0| for name in &result_names {
499| 0| if let Ok(Some(eid)) = entities::find_entity_id(&conn, &namespace, name) {
500| 0| if seen_entity_ids.insert(eid) {
501| 0| let etype: String = conn
502| 0| .query_row(
503| 0| "SELECT COALESCE(type,'concept') FROM entities WHERE id = ?1",
504| 0| rusqlite::params![eid],
505| 0| |r| r.get(0),
506| | )
507| 0| .unwrap_or_else(|_| "concept".to_string());
508| 0| let degree: u32 = conn
509| 0| .query_row(
510| 0| "SELECT COUNT(*) FROM relationships WHERE source_id = ?1 OR target_id = ?1",
511| 0| rusqlite::params![eid],
512| 0| |r| r.get(0),
513| | )
514| 0| .unwrap_or(0);
515| 0| ctx_entities.push(GraphContextEntity {
516| 0| name: name.to_string(),
517| 0| entity_type: etype,
518| 0| degree,
519| 0| });
520| 0| }
521| 0| }
522| | }
523| |
524| 0| let entity_ids: Vec<i64> = seen_entity_ids.iter().copied().collect();
525| 0| if entity_ids.len() >= 2 {
526| 0| let placeholders: String = entity_ids.iter().map(|_| "?").collect::<Vec<_>>().join(",");
527| 0| let sql = format!(
528| 0| "SELECT s.name, t.name, r.relation, r.weight \
529| 0| FROM relationships r \
530| 0| JOIN entities s ON s.id = r.source_id \
531| 0| JOIN entities t ON t.id = r.target_id \
532| 0| WHERE r.source_id IN ({placeholders}) AND r.target_id IN ({placeholders}) \
533| 0| LIMIT 50"
534| | );
535| 0| if let Ok(mut stmt) = conn.prepare(&sql) {
536| 0| let mut params: Vec<Box<dyn rusqlite::types::ToSql>> =
537| 0| Vec::with_capacity(entity_ids.len() * 2);
538| 0| for id in &entity_ids {
539| 0| params.push(Box::new(*id));
540| 0| }
541| 0| for id in &entity_ids {
542| 0| params.push(Box::new(*id));
543| 0| }
544| 0| let param_refs: Vec<&dyn rusqlite::types::ToSql> =
545| 0| params.iter().map(|p| p.as_ref()).collect();
546| 0| if let Ok(rows) = stmt.query_map(param_refs.as_slice(), |r| {
547| | Ok((
548| 0| r.get::<_, String>(0)?,
549| 0| r.get::<_, String>(1)?,
550| 0| r.get::<_, String>(2)?,
551| 0| r.get::<_, f64>(3)?,
552| | ))
553| 0| }) {
554| 0| for row in rows.flatten() {
555| 0| ctx_rels.push(GraphContextRel {
556| 0| from: row.0,
557| 0| to: row.1,
558| 0| relation: row.2,
559| 0| weight: row.3,
560| 0| });
561| 0| }
562| 0| }
563| 0| }
564| 0| }
565| |
566| 0| if ctx_entities.is_empty() {
567| 0| None
568| | } else {
569| 0| Some(GraphContext {
570| 0| entities: ctx_entities,
571| 0| relationships: ctx_rels,
572| 0| })
573| | }
574| | } else {
575| 0| None
576| | };
577| |
578| 0| tracing::debug!(target: "deep_research",
579| 0| total_results = results.len(),
580| 0| total_chains = evidence_chains.len(),
581| 0| "assembly complete"
582| | );
583| |
584| | // Phase 4: JSON output.
585| 0| output::emit_json(&DeepResearchResponse {
586| 0| query: args.query,
587| 0| sub_queries,
588| 0| results,
589| 0| evidence_chains,
590| 0| graph_context,
591| 0| stats: ResearchStats {
592| 0| sub_queries_total: sub_query_texts.len(),
593| 0| sub_queries_completed: completed_count,
594| 0| sub_queries_failed: failed_count,
595| 0| sub_queries_timed_out: timed_out_count,
596| 0| unique_memories_found: unique_memories,
597| 0| evidence_chains_found: evidence_count,
598| 0| elapsed_ms: start.elapsed().as_millis() as u64,
599| 0| },
600| 0| })?;
601| |
602| 0| Ok(())
603| 0|}
604| |
/// Heuristic query decomposition.
///
/// Strategies are tried in order and the first one that yields parts wins:
/// 1. Split on relational phrases (`" caused by "`, `" related to "`, ...),
///    matched case-insensitively for ASCII letters.
/// 2. Split on semicolons.
/// 3. Split on commas after rewriting `" and "` / `" e "` (Portuguese) and
///    their upper-case forms as commas.
/// 4. For queries with at least three words longer than two bytes: the full
///    query plus its first and last word pairs.
///
/// When nothing splits, the original query is returned as the only element
/// (this includes the empty query). The result is truncated to `max` entries,
/// so `max == 0` yields an empty vector for any query that was split.
fn decompose_query(query: &str, max: usize) -> Vec<String> {
    if query.is_empty() {
        return vec![query.to_string()];
    }

    // Ordered from most to least specific. Every phrase is lowercase ASCII,
    // which the offset arithmetic below relies on.
    const RELATIONAL: [&str; 7] = [
        " that caused ",
        " depending on ",
        " related to ",
        " connected to ",
        " linked to ",
        " caused by ",
        " followed by ",
    ];

    // `max` comes from the CLI; pre-allocating `max` slots would panic with a
    // capacity overflow for very large values, so let the vector grow instead.
    let mut parts: Vec<String> = Vec::new();

    // Split by relational phrases first (most specific).
    let mut text = query.to_string();
    let mut did_relational_split = false;
    for phrase in RELATIONAL {
        // ASCII-only lowercasing keeps every byte offset identical to `text`.
        // Full Unicode lowercasing can change byte lengths (e.g. U+212A KELVIN
        // SIGN), which made the offset split `text` at the wrong place or
        // panic on a non-char boundary.
        let lowered = text.to_ascii_lowercase();
        if let Some(pos) = lowered.find(phrase) {
            let left = text[..pos].trim().to_string();
            let right = text[pos + phrase.len()..].trim().to_string();
            if !left.is_empty() {
                parts.push(left);
            }
            if !right.is_empty() {
                // Later phrases keep splitting the remainder.
                text = right;
            }
            did_relational_split = true;
        }
    }
    if did_relational_split && !text.is_empty() {
        parts.push(text);
    }

    // If no relational split, try conjunctions and delimiters.
    if parts.is_empty() {
        if query.contains(';') {
            // Split by semicolons first.
            parts.extend(
                query
                    .split(';')
                    .map(str::trim)
                    .filter(|p| !p.is_empty())
                    .map(str::to_string),
            );
        } else {
            // Split by commas and conjunctions.
            // Replace " and " and " e " (Portuguese) with comma, then split.
            let normalized = query
                .replace(" and ", ", ")
                .replace(" AND ", ", ")
                .replace(" e ", ", ")
                .replace(" E ", ", ");
            if normalized.contains(',') {
                parts.extend(
                    normalized
                        .split(',')
                        .map(str::trim)
                        .filter(|p| !p.is_empty())
                        .map(str::to_string),
                );
            }
        }
    }

    // If still no split, try word-pair decomposition for multi-word queries.
    if parts.is_empty() {
        let words: Vec<&str> = query.split_whitespace().filter(|w| w.len() > 2).collect();
        if words.len() >= 3 {
            parts.push(query.to_string());
            parts.push(format!("{} {}", words[0], words[1]));
            parts.push(format!(
                "{} {}",
                words[words.len() - 2],
                words[words.len() - 1]
            ));
        }
    }

    if parts.is_empty() {
        return vec![query.to_string()];
    }

    // Cap at max.
    parts.truncate(max);
    parts
}
699| |
700| |/// Reconstruct a directed path from `target_entity_id` back to a seed using the
701| |/// predecessor map built by BFS. Returns the path nodes from root to target
702| |/// plus the accumulated edge weights.
703| 1|fn reconstruct_path(
704| 1| target_id: i64,
705| 1| seed_entity_ids: &HashSet<i64>,
706| 1| predecessor: &PredecessorMap,
707| 1| entity_names: &crate::hash::AHashMap<i64, String>,
708| 1|) -> Option<(Vec<EvidenceNode>, f64)> {
709| 1| let mut path_ids: Vec<(i64, Option<String>, Option<f64>)> = Vec::with_capacity(8);
710| 1| let mut total_weight = 1.0_f64;
711| 1| let mut current = target_id;
712| |
713| | loop {
714| 3| if seed_entity_ids.contains(¤t) {
715| 1| break;
716| 2| }
717| 2| let (parent, relation, weight) = predecessor.get(¤t)?;
^0
718| 2| total_weight *= weight;
719| 2| path_ids.push((current, Some(relation.clone()), Some(*weight)));
720| 2| current = *parent;
721| | }
722| | // Push the seed entity (root).
723| 1| path_ids.push((current, None, None));
724| |
725| | // Reverse so path goes from seed → target.
726| 1| path_ids.reverse();
727| |
728| 1| let nodes: Vec<EvidenceNode> = path_ids
729| 1| .into_iter()
730| 1| .map(|(id, relation, weight)| EvidenceNode {
731| 3| entity: entity_names
732| 3| .get(&id)
733| 3| .cloned()
734| 3| .unwrap_or_else(|| format!("entity-{id}")),
^0
735| 3| relation,
736| 3| weight,
737| 3| })
738| 1| .collect();
739| |
740| 1| Some((nodes, total_weight))
741| 1|}
742| |
743| |/// Execute a single sub-query: hybrid search (KNN + FTS fused via RRF) + graph traversal.
744| |///
745| |/// GAP-07 fix: receives the embedding for THIS sub-query (not the shared original).
746| |/// GAP-08/11 fix: uses rrf_fuse() for proper score fusion instead of hardcoded 0.5.
747| |/// GAP-09/10 fix: builds directed evidence chains filtered to discovered entities.
748| |/// GAP-17: respects max_neighbors_per_hop cap in BFS.
749| |///
750| |/// Runs synchronously on a blocking thread (called from a tokio spawn context).
751| |/// Each call opens its own read-only SQLite connection to leverage WAL concurrency.
752| |#[allow(clippy::too_many_arguments)]
753| 0|fn execute_sub_query(
754| 0| sub_query_id: usize,
755| 0| query_text: &str,
756| 0| embedding: &[f32],
757| 0| namespace: &str,
758| 0| db_path: &std::path::Path,
759| 0| k: usize,
760| 0| max_hops: usize,
761| 0| min_weight: f64,
762| 0| rrf_k: f64,
763| 0| graph_decay: f64,
764| 0| graph_min_score: f64,
765| 0| max_neighbors_per_hop: Option<usize>,
766| 0|) -> Result<SubQueryResult, String> {
767| 0| let conn = open_ro(db_path).map_err(|e| format!("failed to open db: {e}"))?;
768| |
769| 0| let mut hits: Vec<(i64, f64, String, String, String, Option<usize>)> =
770| 0| Vec::with_capacity(k * 2);
771| 0| let mut seen_ids: crate::hash::AHashSet<i64> =
772| 0| crate::hash::AHashSet::with_capacity_and_hasher(k * 2, Default::default());
773| |
774| | // --- GAP-08/11 FIX: Use RRF fusion for KNN + FTS instead of hardcoded 0.5 ---
775| |
776| | // 1. KNN vector search — collect ranked IDs.
777| 0| let knn_results = memories::knn_search(&conn, embedding, &[namespace.to_string()], None, k)
778| 0| .map_err(|e| format!("knn_search failed: {e}"))?;
779| 0| let knn_ids: Vec<i64> = knn_results.iter().map(|(id, _)| *id).collect();
780| 0| tracing::debug!(target: "deep_research", sub_query_id, knn_count = knn_ids.len(), "KNN complete");
781| |
782| | // Build distance map for score computation.
783| 0| let knn_distance_map: crate::hash::AHashMap<i64, f64> = knn_results
784| 0| .iter()
785| 0| .map(|(id, dist)| (*id, *dist as f64))
786| 0| .collect();
787| |
788| | // 2. FTS5 search — collect ranked IDs.
789| 0| let fts_results = match memories::fts_search(&conn, query_text, namespace, None, k) {
790| 0| Ok(rows) => rows,
791| 0| Err(e) => {
792| 0| tracing::warn!(target: "deep_research",
793| | sub_query_id,
794| 0| "FTS5 search failed, continuing with KNN only: {e}"
795| | );
796| 0| vec![]
797| | }
798| | };
799| 0| let fts_ids: Vec<i64> = fts_results.iter().map(|r| r.id).collect();
800| 0| tracing::debug!(target: "deep_research", sub_query_id, fts_count = fts_ids.len(), "FTS complete");
801| |
802| | // 3. Fuse via RRF.
803| 0| let rrf_scores = rrf_fuse(&[(1.0, &knn_ids), (1.0, &fts_ids)], rrf_k);
804| 0| let max_possible = rrf_max_possible(&[1.0, 1.0], rrf_k);
805| |
806| | // 4. Sort fused results and build hits.
807| 0| let mut fused: Vec<(i64, f64)> = rrf_scores.into_iter().collect();
808| 0| fused.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
809| 0| fused.truncate(k * 2);
810| 0| tracing::debug!(target: "deep_research",
811| | sub_query_id,
812| 0| fused_count = fused.len(),
813| 0| "RRF fusion complete"
814| | );
815| |
816| 0| if fused.is_empty() && !knn_ids.is_empty() {
817| 0| tracing::warn!(target: "deep_research", sub_query_id, knn_count = knn_ids.len(), fts_count = fts_ids.len(),
818| 0| "RRF fusion returned 0 results despite KNN/FTS hits; consider lowering --graph-min-score");
819| 0| }
820| |
821| 0| for (memory_id, combined_score) in &fused {
822| 0| if seen_ids.insert(*memory_id) {
823| 0| let normalized = if max_possible > 0.0 {
824| 0| combined_score / max_possible
825| | } else {
826| 0| 0.0
827| | };
828| 0| let score = normalized.clamp(0.0, 1.0);
829| 0| let in_knn = knn_distance_map.contains_key(memory_id);
830| 0| let in_fts = fts_ids.contains(memory_id);
831| 0| let source = match (in_knn, in_fts) {
832| 0| (true, true) => "hybrid",
833| 0| (true, false) => "knn",
834| 0| (false, true) => "fts",
835| 0| (false, false) => "graph",
836| | };
837| 0| if let Ok(Some(row)) = memories::read_full(&conn, *memory_id) {
838| 0| let snippet: String = row.body.chars().take(300).collect();
839| 0| hits.push((
840| 0| *memory_id,
841| 0| score,
842| 0| source.to_string(),
843| 0| snippet,
844| 0| row.body,
845| 0| None,
846| 0| ));
847| 0| }
848| 0| }
849| | }
850| |
851| | // 5. Graph traversal from discovered memories.
852| | // GAP-09/10 FIX: entity KNN also uses this sub-query's embedding.
853| 0| let memory_ids: Vec<i64> = hits.iter().map(|(id, ..)| *id).collect();
854| 0| let mut chains: Vec<EvidenceChain> = Vec::with_capacity(memory_ids.len());
855| |
856| 0| if !memory_ids.is_empty() && max_hops > 0 {
857| | // Seed entities from KNN on entity vectors using THIS sub-query's embedding.
858| 0| let entity_knn = entities::knn_search(&conn, embedding, namespace, 5)
859| 0| .inspect_err(|e| tracing::warn!(target: "deep_research", error = %e, "entity KNN search failed, skipping graph seed"))
860| 0| .unwrap_or_default();
861| 0| let entity_ids: Vec<i64> = entity_knn.iter().map(|(id, _)| *id).collect();
862| |
863| | // HIGH-01 FIX: limit seeds to top-5 memories by score to prevent
864| | // BFS from starting at every node when k >= total memories.
865| 0| let top_seed_count = 5.min(memory_ids.len());
866| 0| let top_memory_ids = &memory_ids[..top_seed_count];
867| 0| let mut seed_entity_ids: Vec<i64> = entity_ids.clone();
868| 0| for &mem_id in top_memory_ids {
869| 0| let mut stmt = conn
870| 0| .prepare_cached("SELECT entity_id FROM memory_entities WHERE memory_id = ?1")
871| 0| .map_err(|e| format!("prepare failed: {e}"))?;
872| 0| let ids: Vec<i64> = stmt
873| 0| .query_map(rusqlite::params![mem_id], |r| r.get(0))
874| 0| .map_err(|e| format!("query failed: {e}"))?
875| 0| .filter_map(|r| r.ok())
876| 0| .collect();
877| 0| seed_entity_ids.extend(ids);
878| | }
879| 0| seed_entity_ids.sort_unstable();
880| 0| seed_entity_ids.dedup();
881| 0| tracing::debug!(target: "deep_research",
882| | sub_query_id,
883| 0| seed_count = seed_entity_ids.len(),
884| 0| "seed entities collected"
885| | );
886| |
887| 0| let all_seed_ids: Vec<i64> = memory_ids
888| 0| .iter()
889| 0| .chain(entity_ids.iter())
890| 0| .copied()
891| 0| .collect();
892| |
893| | // Graph traversal with hop scores.
894| 0| if let Ok(graph_results) = traverse_from_memories_with_hops_capped(
895| 0| &conn,
896| 0| &all_seed_ids,
897| 0| namespace,
898| 0| min_weight,
899| 0| max_hops as u32,
900| 0| max_neighbors_per_hop,
901| 0| ) {
902| | // Build seed score map from RRF-fused scores for graph decay computation.
903| 0| let seed_score_map: crate::hash::AHashMap<i64, f64> = fused
904| 0| .iter()
905| 0| .map(|(id, s)| {
906| 0| let normalized = if max_possible > 0.0 {
907| 0| s / max_possible
908| | } else {
909| 0| 0.0
910| | };
911| 0| (*id, normalized.clamp(0.0, 1.0))
912| 0| })
913| 0| .collect();
914| |
915| 0| for (graph_mem_id, hop) in graph_results {
916| 0| if seen_ids.insert(graph_mem_id) {
917| | // GAP-08/11 FIX: graph score = seed_score * decay^hop * edge_weight.
918| | // For the seed score, use the best score among the seed memories that
919| | // transitively reached this graph memory (approximate with the average
920| | // seed score since we don't track the exact path yet).
921| 0| let avg_seed_score: f64 = if seed_score_map.is_empty() {
922| 0| 0.5
923| | } else {
924| 0| let sum: f64 = seed_score_map.values().sum();
925| 0| sum / seed_score_map.len() as f64
926| | };
927| 0| let graph_score =
928| 0| (avg_seed_score * graph_decay.powi(hop as i32)).clamp(0.0, 1.0);
929| |
930| 0| if graph_score < graph_min_score {
931| 0| continue;
932| 0| }
933| |
934| 0| if let Ok(Some(row)) = memories::read_full(&conn, graph_mem_id) {
935| 0| let snippet: String = row.body.chars().take(300).collect();
936| 0| hits.push((
937| 0| graph_mem_id,
938| 0| graph_score,
939| 0| "graph".to_string(),
940| 0| snippet,
941| 0| row.body,
942| 0| Some(hop as usize),
943| 0| ));
944| 0| }
945| 0| }
946| | }
947| 0| }
948| |
949| | // GAP-09/10 FIX: Build directed evidence chains using BFS with predecessor map,
950| | // filtered to entities discovered in this sub-query.
951| 0| if !seed_entity_ids.is_empty() {
952| 0| let (entity_depth, predecessor) = bfs_with_predecessors(
953| 0| &conn,
954| 0| &seed_entity_ids,
955| 0| namespace,
956| 0| min_weight,
957| 0| max_hops as u32,
958| 0| max_neighbors_per_hop,
959| 0| )
960| 0| .unwrap_or_default();
961| |
962| 0| tracing::debug!(target: "deep_research",
963| | sub_query_id,
964| 0| bfs_nodes = entity_depth.len(),
965| 0| predecessors = predecessor.len(),
966| 0| "BFS complete"
967| | );
968| |
969| 0| let seed_entity_set: HashSet<i64> = seed_entity_ids.iter().copied().collect();
970| |
971| | // Collect entity IDs we need names for.
972| 0| let all_entity_ids: Vec<i64> = entity_depth.keys().copied().collect();
973| 0| let mut entity_names: crate::hash::AHashMap<i64, String> =
974| 0| crate::hash::AHashMap::with_capacity_and_hasher(
975| 0| all_entity_ids.len(),
976| 0| ahash::RandomState::default(),
977| | );
978| 0| for &eid in &all_entity_ids {
979| 0| let name_res: rusqlite::Result<String> = conn.query_row(
980| 0| "SELECT name FROM entities WHERE id = ?1",
981| 0| rusqlite::params![eid],
982| 0| |r| r.get(0),
983| | );
984| 0| if let Ok(name) = name_res {
985| 0| entity_names.insert(eid, name);
986| 0| }
987| | }
988| |
989| | // Reconstruct a path for each non-seed entity that has a predecessor.
990| 0| for (&target_id, &_hop) in &entity_depth {
991| 0| if seed_entity_set.contains(&target_id) {
992| 0| continue;
993| 0| }
994| 0| if !predecessor.contains_key(&target_id) {
995| 0| continue;
996| 0| }
997| 0| if let Some((path_nodes, total_weight)) =
998| 0| reconstruct_path(target_id, &seed_entity_set, &predecessor, &entity_names)
999| | {
1000| 0| if path_nodes.len() < 2 {
1001| 0| continue;
1002| 0| }
1003| 0| let from = path_nodes
1004| 0| .first()
1005| 0| .map(|n| n.entity.clone())
1006| 0| .unwrap_or_default();
1007| 0| let to = path_nodes
1008| 0| .last()
1009| 0| .map(|n| n.entity.clone())
1010| 0| .unwrap_or_default();
1011| 0| let depth = path_nodes.len();
1012| 0| chains.push(EvidenceChain {
1013| 0| from,
1014| 0| to,
1015| 0| path: path_nodes,
1016| 0| total_weight,
1017| 0| depth,
1018| 0| sub_query_ids: vec![sub_query_id],
1019| 0| });
1020| 0| }
1021| | }
1022| |
1023| | // Sort chains by total_weight descending and cap to avoid huge output.
1024| 0| chains.sort_by(|a, b| {
1025| 0| b.total_weight
1026| 0| .partial_cmp(&a.total_weight)
1027| 0| .unwrap_or(std::cmp::Ordering::Equal)
1028| 0| });
1029| 0| chains.truncate(20);
1030| 0| tracing::debug!(target: "deep_research",
1031| | sub_query_id,
1032| 0| chains_count = chains.len(),
1033| 0| "evidence chains built"
1034| | );
1035| 0| }
1036| 0| }
1037| |
1038| 0| Ok(SubQueryResult {
1039| 0| sub_query_id,
1040| 0| hits,
1041| 0| chains,
1042| 0| })
1043| 0|}
1044| |
1045| |// ────────────────────────────────────────────────────────────────────────────
1046| |// Re-export sub_query_results field initialisation for the stats counter.
1047| |// The field is moved out of run_async after the join loop; we need to shadow it.
1048| |// ────────────────────────────────────────────────────────────────────────────
1049| |
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a path node that carries neither an incoming relation nor a weight.
    fn bare_node(entity: &str) -> EvidenceNode {
        EvidenceNode {
            entity: entity.to_string(),
            relation: None,
            weight: None,
        }
    }

    // ---- decompose_query: splitting rules ----

    #[test]
    fn test_decompose_and_conjunction() {
        assert_eq!(decompose_query("A and B", 7), vec!["A", "B"]);
    }

    #[test]
    fn test_decompose_no_split() {
        assert_eq!(decompose_query("simple query", 7), vec!["simple query"]);
    }

    #[test]
    fn test_decompose_three_parts() {
        assert_eq!(decompose_query("A, B and C", 7), vec!["A", "B", "C"]);
    }

    #[test]
    fn test_decompose_portuguese_conjunctions() {
        assert_eq!(decompose_query("A e B", 7), vec!["A", "B"]);
    }

    #[test]
    fn test_decompose_max_cap() {
        // Ten comma-separated parts must be capped at the requested maximum of 7.
        let query = (0..10)
            .map(|i| format!("part{i}"))
            .collect::<Vec<String>>()
            .join(", ");
        let sub_queries = decompose_query(&query, 7);
        assert!(
            sub_queries.len() <= 7,
            "expected at most 7 sub-queries, got {}",
            sub_queries.len()
        );
    }

    #[test]
    fn test_decompose_empty_preserves_original() {
        assert_eq!(decompose_query("", 7), vec![""]);
    }

    #[test]
    fn test_decompose_semicolons() {
        let expected = vec!["auth design", "deployment config", "logging"];
        assert_eq!(
            decompose_query("auth design; deployment config; logging", 7),
            expected
        );
    }

    #[test]
    fn test_decompose_relational_phrase() {
        let expected = vec!["auth", "deployment failure"];
        assert_eq!(
            decompose_query("auth that caused deployment failure", 7),
            expected
        );
    }

    // ---- serialisation shape of the response types ----

    #[test]
    fn test_sub_query_serialization() {
        let sub_query = SubQuery {
            id: 0,
            text: "test query".to_string(),
            source: "original",
        };
        let value = serde_json::to_value(&sub_query).expect("serialization failed");
        assert_eq!(value["id"], 0);
        assert_eq!(value["text"], "test query");
        assert_eq!(value["source"], "original");
    }

    #[test]
    fn test_deep_result_omits_body_when_none() {
        let hit = DeepResult {
            name: "test".to_string(),
            score: 0.9,
            source: "knn".to_string(),
            sub_query_ids: vec![0],
            snippet: "snippet".to_string(),
            body: None,
            hop_distance: None,
        };
        let text = serde_json::to_string(&hit).expect("serialization failed");
        assert!(!text.contains("\"body\""), "body must be omitted when None");
    }

    #[test]
    fn test_deep_result_includes_body_when_some() {
        let hit = DeepResult {
            name: "test".to_string(),
            score: 0.9,
            source: "knn".to_string(),
            sub_query_ids: vec![0, 1],
            snippet: "snippet".to_string(),
            body: Some("full body content".to_string()),
            hop_distance: Some(2),
        };
        let text = serde_json::to_string(&hit).expect("serialization failed");
        assert!(text.contains("\"body\""), "body must be present when Some");
        assert!(text.contains("full body content"));
    }

    #[test]
    fn test_evidence_node_omits_none_fields() {
        let text = serde_json::to_string(&bare_node("auth-module")).expect("serialization failed");
        assert!(
            !text.contains("\"relation\""),
            "relation must be omitted when None"
        );
        assert!(
            !text.contains("\"weight\""),
            "weight must be omitted when None"
        );
    }

    #[test]
    fn test_research_stats_serialization() {
        let value = serde_json::to_value(&ResearchStats {
            sub_queries_total: 3,
            sub_queries_completed: 2,
            sub_queries_failed: 1,
            sub_queries_timed_out: 0,
            unique_memories_found: 10,
            evidence_chains_found: 2,
            elapsed_ms: 1234,
        })
        .expect("serialization failed");
        assert_eq!(value["sub_queries_total"], 3);
        assert_eq!(value["sub_queries_completed"], 2);
        assert_eq!(value["sub_queries_failed"], 1);
        assert_eq!(value["elapsed_ms"], 1234);
    }

    #[test]
    fn test_deep_research_response_serialization() {
        let only_sub_query = SubQuery {
            id: 0,
            text: "test query".to_string(),
            source: "original",
        };
        let stats = ResearchStats {
            sub_queries_total: 1,
            sub_queries_completed: 1,
            sub_queries_failed: 0,
            sub_queries_timed_out: 0,
            unique_memories_found: 0,
            evidence_chains_found: 0,
            elapsed_ms: 42,
        };
        let response = DeepResearchResponse {
            query: "test query".to_string(),
            sub_queries: vec![only_sub_query],
            results: Vec::new(),
            evidence_chains: Vec::new(),
            graph_context: None,
            stats,
        };
        let value = serde_json::to_value(&response).expect("serialization failed");
        assert_eq!(value["query"], "test query");
        assert!(value["sub_queries"].is_array());
        assert!(value["results"].is_array());
        assert!(value["evidence_chains"].is_array());
        assert_eq!(value["stats"]["elapsed_ms"], 42);
    }

    // ---- GAP-07 regression: different sub-queries produce distinct embeddings ----
    // Only the text side is checked here: distinct texts are the prerequisite for
    // distinct embedding inputs and therefore distinct search results.
    // NOTE(review): the first assertion compares two literals and cannot fail; only
    // the decompose_query assertions exercise production code.
    #[test]
    fn test_distinct_sub_queries_produce_distinct_texts() {
        let [first, second] = [
            "authentication design decisions",
            "deployment configuration and infrastructure",
        ];
        assert_ne!(first, second);

        // Joining with a semicolon must yield exactly the two original texts apart.
        let decomposed = decompose_query(&format!("{first}; {second}"), 7);
        assert_eq!(decomposed.len(), 2);
        assert_ne!(decomposed[0], decomposed[1]);
    }

    // ---- GAP-08/11 regression: rrf_fuse integration via fusion module ----
    #[test]
    fn test_rrf_fuse_via_fusion_module() {
        use crate::storage::fusion::rrf_fuse;

        let knn_ranking: Vec<i64> = vec![1, 2, 3];
        let fts_ranking: Vec<i64> = vec![2, 1, 4];
        let fused = rrf_fuse(&[(1.0, &knn_ranking), (1.0, &fts_ranking)], 60.0);

        // Ids 1 and 2 appear in both rankings; 3 is knn-only and 4 is fts-only (rank 3 each).
        let (in_both_a, in_both_b) = (fused[&1], fused[&2]);
        let (knn_only, fts_only) = (fused[&3], fused[&4]);

        assert!(
            in_both_a > knn_only,
            "id 1 (both lists) must beat id 3 (knn-only rank 3)"
        );
        assert!(
            in_both_b > fts_only,
            "id 2 (both lists) must beat id 4 (fts-only rank 3)"
        );
    }

    // ---- GAP-09/10 regression: evidence chains must be directed paths ----
    #[test]
    fn test_evidence_chain_has_from_to_and_path() {
        let hops = vec![
            bare_node("auth-module"),
            EvidenceNode {
                entity: "token-validator".to_string(),
                relation: Some("depends-on".to_string()),
                weight: Some(0.9),
            },
            EvidenceNode {
                entity: "jwt-service".to_string(),
                relation: Some("uses".to_string()),
                weight: Some(0.8),
            },
        ];
        let chain = EvidenceChain {
            from: "auth-module".to_string(),
            to: "jwt-service".to_string(),
            path: hops,
            total_weight: 0.72,
            depth: 3,
            sub_query_ids: vec![0],
        };

        let value = serde_json::to_value(&chain).expect("serialization failed");
        assert!(
            value["from"].is_string(),
            "evidence chain must have 'from' field"
        );
        assert!(
            value["to"].is_string(),
            "evidence chain must have 'to' field"
        );
        assert!(
            value["path"].is_array(),
            "evidence chain must have 'path' array"
        );
        assert_eq!(value["path"].as_array().unwrap().len(), 3);
        assert!(value["total_weight"].is_number(), "must have total_weight");
        assert_eq!(value["depth"], 3);
    }

    // ---- GAP-10 regression: reconstruct_path returns correct node order ----
    #[test]
    fn test_reconstruct_path_root_to_target_order() {
        // Simple chain: entity 10 (seed) -> entity 20 -> entity 30 (target).
        let seed_set: HashSet<i64> = std::iter::once(10i64).collect();
        let predecessor: PredecessorMap = [
            (20, (10, "depends-on".to_string(), 0.9)),
            (30, (20, "uses".to_string(), 0.8)),
        ]
        .into_iter()
        .collect();
        let mut entity_names: crate::hash::AHashMap<i64, String> = crate::hash::AHashMap::default();
        for (id, label) in [
            (10, "seed-entity"),
            (20, "middle-entity"),
            (30, "target-entity"),
        ] {
            entity_names.insert(id, label.to_string());
        }

        let (nodes, weight) = reconstruct_path(30, &seed_set, &predecessor, &entity_names)
            .expect("path must be reconstructed");

        // The path must read [seed, middle, target].
        let order: Vec<&str> = nodes.iter().map(|node| node.entity.as_str()).collect();
        assert_eq!(order, ["seed-entity", "middle-entity", "target-entity"]);
        // total_weight = 0.9 * 0.8
        assert!((weight - 0.72).abs() < 1e-6);
    }

    // ---- GAP-09 regression: evidence chains must NOT be present for 1-hop trivial pairs ----
    // NOTE(review): the depth filter is re-implemented inside this test rather than
    // called from production code, so it documents the rule more than it guards it.
    #[test]
    fn test_evidence_chains_single_hop_filtered_out() {
        // A chain of depth 1 (root node only) should be discarded.
        let trivial = EvidenceChain {
            from: "a".to_string(),
            to: "a".to_string(),
            path: vec![bare_node("a")],
            total_weight: 1.0,
            depth: 1,
            sub_query_ids: vec![0],
        };
        // Simulated filter: only chains with depth >= 2 are retained.
        let any_retained = [trivial].iter().any(|chain| chain.depth >= 2);
        assert!(!any_retained, "depth-1 chains must be filtered out");
    }

    // ---- GAP-17 regression: bfs_with_predecessors honours max_neighbors_per_hop ----
    #[test]
    fn test_bfs_with_predecessors_respects_neighbor_cap() {
        use crate::graph::bfs_with_predecessors;
        use rusqlite::Connection;

        let conn = Connection::open_in_memory().unwrap();
        conn.execute_batch(
            "CREATE TABLE relationships (
                source_id INTEGER NOT NULL,
                target_id INTEGER NOT NULL,
                weight REAL NOT NULL,
                namespace TEXT NOT NULL,
                relation TEXT NOT NULL DEFAULT 'related'
            );",
        )
        .unwrap();

        // Seed entity 1 gets five neighbours (ids 2 through 6), all with weight 1.0.
        let insert_sql = "INSERT INTO relationships (source_id, target_id, weight, namespace) VALUES (?1, ?2, ?3, 'ns')";
        for neighbour in 2i64..=6 {
            conn.execute(insert_sql, rusqlite::params![1i64, neighbour, 1.0f64])
                .unwrap();
        }

        // Without a cap every neighbour is reached.
        let (depth_uncapped, _) = bfs_with_predecessors(&conn, &[1], "ns", 0.0, 1, None).unwrap();
        assert_eq!(
            depth_uncapped.len() - 1,
            5,
            "uncapped must discover all 5 neighbours (plus seed)"
        );

        // With cap=2 only the top two neighbours by weight survive (all equal here),
        // so the depth map holds the seed plus two neighbours.
        let (depth_capped, _) = bfs_with_predecessors(&conn, &[1], "ns", 0.0, 1, Some(2)).unwrap();
        assert_eq!(
            depth_capped.len(),
            3,
            "capped to 2 must yield seed + 2 neighbours"
        );
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/delete_entity.rs:
1| |//! Handler for the `delete-entity` CLI subcommand (GAP-17).
2| |//!
3| |//! Deletes an entity and, with `--cascade`, all of its relationships and
4| |//! memory bindings. Without `--cascade` the command refuses to proceed, which
5| |//! prevents accidental data loss.
6| |
7| |use crate::errors::AppError;
8| |use crate::i18n::errors_msg;
9| |use crate::output::{self, OutputFormat};
10| |use crate::paths::AppPaths;
11| |use crate::storage::connection::open_rw;
12| |use crate::storage::entities;
13| |use rusqlite::params;
14| |use serde::Serialize;
15| |
16| |#[derive(clap::Args)]
17| |#[command(after_long_help = "EXAMPLES:\n \
18| | # Delete an entity and all its relationships (cascade required)\n \
19| | sqlite-graphrag delete-entity --name auth-module --cascade\n\n \
20| | # Delete an entity in a specific namespace\n \
21| | sqlite-graphrag delete-entity --name legacy-service --cascade --namespace my-project\n\n \
22| | # Without --cascade the command exits with an error:\n \
23| | sqlite-graphrag delete-entity --name auth-module\n \
24| | # => Error: use --cascade to confirm deletion of entity and all its relationships\n\n\
25| |NOTE:\n \
26| | --cascade is required and acts as an explicit confirmation gate.\n \
27| | All relationships where this entity is source or target are removed.\n \
28| | All memory-entity bindings (memory_entities rows) are also removed.\n \
29| | Run `sqlite-graphrag cleanup-orphans` afterwards to remove any newly orphaned entities.")]
30| |pub struct DeleteEntityArgs {
31| | /// Entity name to delete (graph node, not memory name).
32| | #[arg(long)]
33| | pub name: String,
34| | /// Required confirmation flag. Without it the command exits with an error.
35| | ///
36| | /// Deletes all relationships and memory bindings attached to this entity.
37| | #[arg(long, default_value_t = false)]
38| | pub cascade: bool,
39| | #[arg(long)]
40| | pub namespace: Option<String>,
41| | #[arg(long, value_enum, default_value = "json")]
42| | pub format: OutputFormat,
43| | #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
44| | pub json: bool,
45| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
46| | pub db: Option<String>,
47| |}
48| |
/// Payload reported to the user after an entity has been deleted.
#[derive(Serialize)]
struct DeleteEntityResponse {
    /// Outcome label; the handler sets this to `"deleted"`.
    action: String,
    /// Name of the entity that was removed, as passed via `--name`.
    entity_name: String,
    /// Namespace the entity was looked up in.
    namespace: String,
    /// Number of rows deleted from `relationships`.
    relationships_removed: usize,
    /// Number of rows deleted from `memory_entities`.
    bindings_removed: usize,
    /// Total execution time in milliseconds from handler start to serialisation.
    elapsed_ms: u64,
}
59| |
60| 0|pub fn run(args: DeleteEntityArgs) -> Result<(), AppError> {
61| 0| let inicio = std::time::Instant::now();
62| |
63| 0| if !args.cascade {
64| 0| return Err(AppError::Validation(
65| 0| "use --cascade to confirm deletion of entity and all its relationships".to_string(),
66| 0| ));
67| 0| }
68| |
69| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
70| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
71| |
72| 0| crate::storage::connection::ensure_db_ready(&paths)?;
73| |
74| 0| let mut conn = open_rw(&paths.db)?;
75| |
76| 0| let entity_id = entities::find_entity_id(&conn, &namespace, &args.name)?
77| 0| .ok_or_else(|| AppError::NotFound(errors_msg::entity_not_found(&args.name, &namespace)))?;
78| |
79| 0| let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
80| |
81| | // Step 0: collect adjacent entity IDs BEFORE deleting relationships.
82| 0| let adjacent_ids: Vec<i64> = {
83| 0| let mut stmt = tx.prepare(
84| 0| "SELECT DISTINCT CASE WHEN source_id = ?1 THEN target_id ELSE source_id END
85| 0| FROM relationships WHERE source_id = ?1 OR target_id = ?1",
86| 0| )?;
87| 0| let ids: Vec<i64> = stmt
88| 0| .query_map(params![entity_id], |r| r.get(0))?
89| 0| .collect::<Result<Vec<_>, _>>()?;
90| 0| ids
91| | };
92| |
93| | // Step 1: collect relationship IDs for this entity (source or target).
94| 0| let rel_ids: Vec<i64> = {
95| 0| let mut stmt =
96| 0| tx.prepare("SELECT id FROM relationships WHERE source_id = ?1 OR target_id = ?1")?;
97| 0| let ids: Vec<i64> = stmt
98| 0| .query_map(params![entity_id], |r| r.get::<_, i64>(0))?
99| 0| .collect::<Result<Vec<_>, _>>()?;
100| 0| ids
101| | };
102| |
103| | // Step 2: delete memory_relationships for each collected relationship id.
104| 0| for &rel_id in &rel_ids {
105| 0| tx.execute(
106| 0| "DELETE FROM memory_relationships WHERE relationship_id = ?1",
107| 0| params![rel_id],
108| 0| )?;
109| | }
110| |
111| | // Step 3: delete the relationships themselves.
112| 0| let relationships_removed = tx.execute(
113| 0| "DELETE FROM relationships WHERE source_id = ?1 OR target_id = ?1",
114| 0| params![entity_id],
115| 0| )?;
116| |
117| | // Step 4: delete memory_entities bindings.
118| 0| let bindings_removed = tx.execute(
119| 0| "DELETE FROM memory_entities WHERE entity_id = ?1",
120| 0| params![entity_id],
121| 0| )?;
122| |
123| | // Step 5: delete vec_entities row (ignore error — row may not exist).
124| 0| let _ = tx.execute(
125| 0| "DELETE FROM vec_entities WHERE entity_id = ?1",
126| 0| params![entity_id],
127| 0| );
128| |
129| | // Step 6: delete the entity itself.
130| 0| tx.execute("DELETE FROM entities WHERE id = ?1", params![entity_id])?;
131| |
132| | // Step 7: recalculate degree for adjacent entities that lost relationships.
133| 0| for &adj_id in &adjacent_ids {
134| 0| if adj_id != entity_id {
135| 0| entities::recalculate_degree(&tx, adj_id)?;
136| 0| }
137| | }
138| |
139| 0| tx.commit()?;
140| |
141| 0| conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
142| |
143| 0| let response = DeleteEntityResponse {
144| 0| action: "deleted".to_string(),
145| 0| entity_name: args.name.clone(),
146| 0| namespace: namespace.clone(),
147| 0| relationships_removed,
148| 0| bindings_removed,
149| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
150| 0| };
151| |
152| 0| match args.format {
153| 0| OutputFormat::Json => output::emit_json(&response)?,
154| 0| OutputFormat::Text | OutputFormat::Markdown => {
155| 0| output::emit_text(&format!(
156| 0| "deleted: {} (relationships_removed={}, bindings_removed={}) [{}]",
157| 0| response.entity_name,
158| 0| response.relationships_removed,
159| 0| response.bindings_removed,
160| 0| response.namespace
161| 0| ));
162| 0| }
163| | }
164| |
165| 0| Ok(())
166| 0|}
167| |
#[cfg(test)]
mod tests {
    use super::*;

    /// Serialises a response built from the given values into a JSON tree.
    fn serialized(
        entity_name: &str,
        namespace: &str,
        relationships_removed: usize,
        bindings_removed: usize,
        elapsed_ms: u64,
    ) -> serde_json::Value {
        let payload = DeleteEntityResponse {
            action: "deleted".to_string(),
            entity_name: entity_name.to_string(),
            namespace: namespace.to_string(),
            relationships_removed,
            bindings_removed,
            elapsed_ms,
        };
        serde_json::to_value(&payload).expect("serialization failed")
    }

    #[test]
    fn delete_entity_response_serializes_all_fields() {
        let value = serialized("auth-module", "global", 3, 2, 7);
        assert_eq!(value["action"], "deleted");
        assert_eq!(value["entity_name"], "auth-module");
        assert_eq!(value["namespace"], "global");
        assert_eq!(value["relationships_removed"], 3);
        assert_eq!(value["bindings_removed"], 2);
        assert!(value["elapsed_ms"].is_number());
    }

    #[test]
    fn delete_entity_response_action_is_deleted() {
        let value = serialized("x", "ns", 0, 0, 0);
        assert_eq!(value["action"], "deleted");
    }

    #[test]
    fn delete_entity_response_zero_counts_allowed() {
        // An entity with no relationships or bindings still serialises both counters.
        let value = serialized("orphan-entity", "global", 0, 0, 1);
        assert_eq!(value["relationships_removed"], 0);
        assert_eq!(value["bindings_removed"], 0);
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/edit.rs:
1| |//! Handler for the `edit` CLI subcommand.
2| |
3| |use crate::errors::AppError;
4| |use crate::i18n::errors_msg;
5| |use crate::output;
6| |use crate::paths::AppPaths;
7| |use crate::storage::connection::open_rw;
8| |use crate::storage::{memories, versions};
9| |use serde::Serialize;
10| |
// Command-line arguments for the `edit` subcommand.
//
// The memory name can be given positionally or via `--name`; the handler
// rejects the call when neither is present. At most one of `--body`,
// `--body-file` and `--body-stdin` may be used (enforced by clap conflicts).
// Plain `//` comments are used where no help text existed so the generated
// `--help` output is unchanged.
#[derive(clap::Args)]
#[command(after_long_help = "EXAMPLES:\n \
    # Edit body inline\n \
    sqlite-graphrag edit onboarding --body \"updated content\"\n\n \
    # Edit body from a file\n \
    sqlite-graphrag edit onboarding --body-file ./updated.md\n\n \
    # Edit body from stdin (pipe)\n \
    cat updated.md | sqlite-graphrag edit onboarding --body-stdin\n\n \
    # Update only the description\n \
    sqlite-graphrag edit onboarding --description \"new short description\"")]
pub struct EditArgs {
    /// Memory name as a positional argument. Alternative to `--name`.
    #[arg(
        value_name = "NAME",
        conflicts_with = "name",
        help = "Memory name to edit; alternative to --name"
    )]
    pub name_positional: Option<String>,
    /// Memory name to edit. Soft-deleted memories are not editable; use `restore` first.
    #[arg(long)]
    pub name: Option<String>,
    /// New inline body content. Mutually exclusive with --body-file and --body-stdin.
    #[arg(long, conflicts_with_all = ["body_file", "body_stdin"])]
    pub body: Option<String>,
    /// Read new body from a file. Mutually exclusive with --body and --body-stdin.
    #[arg(long, conflicts_with_all = ["body", "body_stdin"])]
    pub body_file: Option<std::path::PathBuf>,
    /// Read new body from stdin until EOF. Mutually exclusive with --body and --body-file.
    #[arg(long, conflicts_with_all = ["body", "body_file"])]
    pub body_stdin: bool,
    /// New description (≤500 chars) replacing the existing one.
    #[arg(long)]
    pub description: Option<String>,
    /// Change the memory type (e.g. note, skill, decision).
    #[arg(long, value_enum, help = "Change memory type")]
    pub memory_type: Option<crate::cli::MemoryType>,
    // Optimistic-lock token. It is parsed by `parse_expected_updated_at` into an
    // i64 and compared with the stored `updated_at` before and during the UPDATE.
    #[arg(
        long,
        value_name = "EPOCH_OR_RFC3339",
        value_parser = crate::parsers::parse_expected_updated_at,
        long_help = "Optimistic lock: reject if updated_at does not match. \
Accepts Unix epoch (e.g. 1700000000) or RFC 3339 (e.g. 2026-04-19T12:00:00Z)."
    )]
    pub expected_updated_at: Option<i64>,
    #[arg(
        long,
        help = "Namespace (env: SQLITE_GRAPHRAG_NAMESPACE, default: global)"
    )]
    pub namespace: Option<String>,
    // Hidden compatibility flag; the handler always prints JSON regardless.
    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
    pub json: bool,
    // Database location; clap falls back to the SQLITE_GRAPHRAG_DB_PATH variable.
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,
}
65| |
/// JSON payload printed on stdout after a successful edit.
#[derive(Serialize)]
struct EditResponse {
    /// Row id of the edited memory.
    memory_id: i64,
    /// Name of the edited memory.
    name: String,
    /// Outcome label; the handler sets this to `"updated"`.
    action: String,
    /// Version number assigned to the snapshot recorded for this edit.
    version: i64,
    /// Total execution time in milliseconds from handler start to serialisation.
    elapsed_ms: u64,
}
75| |
76| 0|pub fn run(args: EditArgs) -> Result<(), AppError> {
77| | use crate::constants::*;
78| |
79| 0| let inicio = std::time::Instant::now();
80| 0| tracing::debug!(target: "edit", name = ?args.name_positional.as_deref().or(args.name.as_deref()), "updating memory");
81| | // Resolve name from positional or --name flag; both are optional, at least one is required.
82| 0| let name = args.name_positional.or(args.name).ok_or_else(|| {
83| 0| AppError::Validation("name required: pass as positional argument or via --name".to_string())
84| 0| })?;
85| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
86| |
87| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
88| 0| crate::storage::connection::ensure_db_ready(&paths)?;
89| 0| let mut conn = open_rw(&paths.db)?;
90| |
91| 0| let (memory_id, current_updated_at, _current_version) =
92| 0| memories::find_by_name(&conn, &namespace, &name)?
93| 0| .ok_or_else(|| AppError::NotFound(errors_msg::memory_not_found(&name, &namespace)))?;
94| |
95| 0| if let Some(expected) = args.expected_updated_at {
96| 0| if expected != current_updated_at {
97| 0| return Err(AppError::Conflict(errors_msg::optimistic_lock_conflict(
98| 0| expected,
99| 0| current_updated_at,
100| 0| )));
101| 0| }
102| 0| }
103| |
104| 0| let mut raw_body: Option<String> = None;
105| 0| if args.body.is_some() || args.body_file.is_some() || args.body_stdin {
106| 0| let b = if let Some(b) = args.body {
107| 0| b
108| 0| } else if let Some(path) = &args.body_file {
109| 0| let file_size = std::fs::metadata(path).map_err(AppError::Io)?.len();
110| 0| if file_size > MAX_MEMORY_BODY_LEN as u64 {
111| 0| return Err(AppError::LimitExceeded(
112| 0| crate::i18n::validation::body_exceeds(MAX_MEMORY_BODY_LEN),
113| 0| ));
114| 0| }
115| 0| std::fs::read_to_string(path).map_err(AppError::Io)?
116| | } else {
117| 0| crate::stdin_helper::read_stdin_with_timeout(60)?
118| | };
119| 0| if b.len() > MAX_MEMORY_BODY_LEN {
120| 0| return Err(AppError::LimitExceeded(
121| 0| crate::i18n::validation::body_exceeds(MAX_MEMORY_BODY_LEN),
122| 0| ));
123| 0| }
124| 0| raw_body = Some(b);
125| 0| }
126| |
127| 0| if let Some(ref desc) = args.description {
128| 0| if desc.len() > MAX_MEMORY_DESCRIPTION_LEN {
129| 0| return Err(AppError::Validation(
130| 0| crate::i18n::validation::description_exceeds(MAX_MEMORY_DESCRIPTION_LEN),
131| 0| ));
132| 0| }
133| 0| }
134| |
135| 0| let row = memories::read_by_name(&conn, &namespace, &name)?
136| 0| .ok_or_else(|| AppError::Internal(anyhow::anyhow!("memory row not found after check")))?;
137| |
138| 0| let body_changed = raw_body.is_some();
139| 0| let new_body = raw_body.unwrap_or(row.body.clone());
140| 0| let new_description = args.description.unwrap_or(row.description.clone());
141| 0| let new_hash = blake3::hash(new_body.as_bytes()).to_hex().to_string();
142| | // Skip re-embedding when body content is identical to the stored version.
143| 0| let body_changed = body_changed && new_hash != row.body_hash;
144| 0| let memory_type = args
145| 0| .memory_type
146| 0| .map(|t| t.as_str().to_string())
147| 0| .unwrap_or_else(|| row.memory_type.clone());
148| 0| let type_changed = memory_type != row.memory_type;
149| 0| let metadata = row.metadata.clone();
150| |
151| 0| let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
152| |
153| 0| let affected = if let Some(ts) = args.expected_updated_at {
154| 0| tx.execute(
155| 0| "UPDATE memories SET description=?2, body=?3, body_hash=?4, type=?5
156| 0| WHERE id=?1 AND updated_at=?6 AND deleted_at IS NULL",
157| 0| rusqlite::params![
158| 0| memory_id,
159| 0| new_description,
160| 0| new_body,
161| 0| new_hash,
162| 0| memory_type,
163| 0| ts
164| 0| ],
165| 0| )?
166| | } else {
167| 0| tx.execute(
168| 0| "UPDATE memories SET description=?2, body=?3, body_hash=?4, type=?5
169| 0| WHERE id=?1 AND deleted_at IS NULL",
170| 0| rusqlite::params![memory_id, new_description, new_body, new_hash, memory_type],
171| 0| )?
172| | };
173| |
174| 0| if affected == 0 {
175| 0| return Err(AppError::Conflict(
176| 0| "optimistic lock conflict: memory was modified by another process".to_string(),
177| 0| ));
178| 0| }
179| |
180| 0| if body_changed || type_changed {
181| 0| output::emit_progress_i18n(
182| 0| "Re-computing embedding for edited body...",
183| 0| crate::i18n::validation::runtime_pt::edit_recomputing_embedding(),
184| | );
185| 0| let embedding = crate::daemon::embed_passage_or_local(&paths.models, &new_body)?;
186| 0| let snippet: String = new_body.chars().take(300).collect();
187| 0| memories::upsert_vec(
188| 0| &tx,
189| 0| memory_id,
190| 0| &namespace,
191| 0| &memory_type,
192| 0| &embedding,
193| 0| &name,
194| 0| &snippet,
195| 0| )?;
196| 0| }
197| |
198| 0| let next_v = versions::next_version(&tx, memory_id)?;
199| |
200| 0| versions::insert_version(
201| 0| &tx,
202| 0| memory_id,
203| 0| next_v,
204| 0| &name,
205| 0| &memory_type,
206| 0| &new_description,
207| 0| &new_body,
208| 0| &metadata,
209| 0| None,
210| 0| "edit",
211| 0| )?;
212| |
213| 0| memories::sync_fts_after_update(
214| 0| &tx,
215| 0| memory_id,
216| 0| &row.name,
217| 0| &row.description,
218| 0| &row.body,
219| 0| &row.name,
220| 0| &new_description,
221| 0| &new_body,
222| 0| )?;
223| |
224| 0| tx.commit()?;
225| |
226| 0| conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
227| |
228| 0| output::emit_json(&EditResponse {
229| 0| memory_id,
230| 0| name,
231| 0| action: "updated".to_string(),
232| 0| version: next_v,
233| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
234| 0| })?;
235| |
236| 0| Ok(())
237| 0|}
238| |
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a response whose `action` is always `"updated"`.
    fn updated(memory_id: i64, name: &str, version: i64, elapsed_ms: u64) -> EditResponse {
        EditResponse {
            memory_id,
            name: name.to_string(),
            action: "updated".to_string(),
            version,
            elapsed_ms,
        }
    }

    #[test]
    fn edit_response_serializes_all_fields() {
        let value =
            serde_json::to_value(&updated(42, "my-memory", 3, 7)).expect("serialization failed");
        assert_eq!(value["memory_id"], 42i64);
        assert_eq!(value["name"], "my-memory");
        assert_eq!(value["action"], "updated");
        assert_eq!(value["version"], 3i64);
        assert!(value["elapsed_ms"].is_number());
    }

    #[test]
    fn edit_response_action_contains_updated() {
        let response = updated(1, "n", 1, 0);
        assert_eq!(
            response.action, "updated",
            "action must be 'updated' for successful edits"
        );
    }

    // NOTE(review): the two tests below never call the edit handler; they only
    // confirm that a string built to be one byte over the limit is over the
    // limit. The rejection path in `run` is not exercised here.

    #[test]
    fn edit_body_exceeds_limit_returns_error() {
        let oversized = "a".repeat(crate::constants::MAX_MEMORY_BODY_LEN + 1);
        assert!(
            oversized.len() > crate::constants::MAX_MEMORY_BODY_LEN,
            "body above limit must have length > MAX_MEMORY_BODY_LEN"
        );
    }

    #[test]
    fn edit_description_exceeds_limit_returns_error() {
        let oversized = "d".repeat(crate::constants::MAX_MEMORY_DESCRIPTION_LEN + 1);
        assert!(
            oversized.len() > crate::constants::MAX_MEMORY_DESCRIPTION_LEN,
            "description above limit must have length > MAX_MEMORY_DESCRIPTION_LEN"
        );
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/enrich.rs:
1| |//! Handler for the `enrich` CLI subcommand (GAP-14 + GAP-18).
2| |//!
3| |//! Enriches the knowledge graph by running LLM-powered analysis over memories
4| |//! and entities that are missing key structural data. Operations are:
5| |//!
6| |//! - `memory-bindings`: memories without `memory_entities` rows get entity extraction
7| |//! - `entity-descriptions`: entities with NULL/empty descriptions get LLM descriptions
8| |//! - `body-enrich`: memories with short bodies get expanded by the LLM (GAP-18)
9| |//! - all others: scan + structured NDJSON output (not-yet-implemented dispatch)
10| |//!
11| |//! Architecture mirrors `ingest_claude.rs`: SCAN → JUDGE (LLM) → PERSIST, with a
12| |//! SQLite queue DB (`.enrich-queue.sqlite`) for resume/retry support.
13| |// Workload: Subprocess I/O-bound (claude/codex API calls with network wait)
14| |//!
15| |//! # DRY opportunity
16| |//!
17| |//! `extract_with_claude`, `parse_claude_output`, `emit_json`, and the `open_queue_db`
18| |//! queue schema in `ingest_claude.rs` are private functions that duplicate patterns used
19| |//! here verbatim. A future refactoring could extract them into a shared
20| |//! `src/commands/llm_runner.rs` module (or `src/llm_runner.rs`) without changing any
21| |//! public APIs. That extraction requires editing `ingest_claude.rs`, which is outside
22| |//! this stream's boundary — flagged here for the Integration stream to evaluate.
23| |
24| |use crate::commands::ingest_claude::find_claude_binary;
25| |use crate::constants::MAX_MEMORY_BODY_LEN;
26| |use crate::entity_type::EntityType;
27| |use crate::errors::AppError;
28| |use crate::paths::AppPaths;
29| |use crate::storage::connection::{ensure_db_ready, open_rw};
30| |use crate::storage::entities::{self, NewEntity, NewRelationship};
31| |use crate::storage::memories;
32| |
33| |use rusqlite::Connection;
34| |use serde::{Deserialize, Serialize};
35| |use std::io::Write;
36| |use std::path::{Path, PathBuf};
37| |use std::time::Instant;
38| |
39| |// ---------------------------------------------------------------------------
40| |// Constants
41| |// ---------------------------------------------------------------------------
42| |
43| |const DEFAULT_QUEUE_DB: &str = ".enrich-queue.sqlite";
44| |const DEFAULT_RATE_LIMIT_WAIT: u64 = 60;
45| |const DEFAULT_BODY_ENRICH_MIN_CHARS: usize = 500;
46| |const DEFAULT_BODY_ENRICH_MAX_CHARS: usize = 2000;
47| |
48| |// ---------------------------------------------------------------------------
49| |// JSON schema used for memory-bindings and body-enrich extraction
50| |// ---------------------------------------------------------------------------
51| |
52| |const BINDINGS_SCHEMA: &str = r#"{
53| | "type": "object",
54| | "properties": {
55| | "entities": {
56| | "type": "array",
57| | "items": {
58| | "type": "object",
59| | "properties": {
60| | "name": { "type": "string" },
61| | "entity_type": {
62| | "type": "string",
63| | "enum": ["project","tool","person","file","concept","incident","decision","organization","location","date"]
64| | }
65| | },
66| | "required": ["name", "entity_type"],
67| | "additionalProperties": false
68| | }
69| | },
70| | "relationships": {
71| | "type": "array",
72| | "items": {
73| | "type": "object",
74| | "properties": {
75| | "source": { "type": "string" },
76| | "target": { "type": "string" },
77| | "relation": {
78| | "type": "string",
79| | "enum": ["applies-to","uses","depends-on","causes","fixes","contradicts","supports","follows","related","replaces","tracked-in"]
80| | },
81| | "strength": { "type": "number", "minimum": 0, "maximum": 1 }
82| | },
83| | "required": ["source","target","relation","strength"],
84| | "additionalProperties": false
85| | }
86| | }
87| | },
88| | "required": ["entities","relationships"],
89| | "additionalProperties": false
90| |}"#;
91| |
92| |const ENTITY_DESCRIPTION_SCHEMA: &str = r#"{
93| | "type": "object",
94| | "properties": {
95| | "description": { "type": "string" }
96| | },
97| | "required": ["description"],
98| | "additionalProperties": false
99| |}"#;
100| |
101| |const BODY_ENRICH_SCHEMA: &str = r#"{
102| | "type": "object",
103| | "properties": {
104| | "enriched_body": { "type": "string" }
105| | },
106| | "required": ["enriched_body"],
107| | "additionalProperties": false
108| |}"#;
109| |
110| |// G27 P1: weight-calibrate
111| |const WEIGHT_CALIBRATE_PROMPT: &str = "You are a knowledge graph quality auditor. Evaluate whether this relationship weight is correctly calibrated.\n\n\
112| |Scale:\n\
113| |- 0.9 = vital hard dependency (A cannot function without B)\n\
114| |- 0.7 = important design relationship (A strongly supports/enables B)\n\
115| |- 0.5 = useful contextual link (A and B share relevant context)\n\
116| |- 0.3 = weak reference (A mentions B without strong coupling)\n\n\
117| |Respond with the calibrated weight and brief reasoning.";
118| |
119| |const WEIGHT_CALIBRATE_SCHEMA: &str = r#"{
120| | "type": "object",
121| | "properties": {
122| | "calibrated_weight": { "type": "number", "minimum": 0.0, "maximum": 1.0 },
123| | "reasoning": { "type": "string" }
124| | },
125| | "required": ["calibrated_weight", "reasoning"],
126| | "additionalProperties": false
127| |}"#;
128| |
129| |// G27 P1: relation-reclassify
130| |const RELATION_RECLASSIFY_PROMPT: &str = "You are a knowledge graph quality auditor. The relationship between these entities uses a generic type. Determine the REAL semantic relationship.\n\n\
131| |Valid canonical relations (pick exactly one):\n\
132| |- depends-on: A cannot function without B\n\
133| |- uses: A utilizes B but could substitute it\n\
134| |- supports: A reinforces or enables B\n\
135| |- causes: A triggers or produces B\n\
136| |- fixes: A resolves a problem in B\n\
137| |- contradicts: A conflicts with or invalidates B\n\
138| |- applies-to: A is relevant to or scoped within B\n\
139| |- follows: A comes after B in sequence\n\
140| |- replaces: A substitutes B\n\
141| |- tracked-in: A is monitored in B\n\
142| |- related: A and B share context (use sparingly)\n\n\
143| |Respond with the correct relation, strength, and reasoning.";
144| |
145| |const RELATION_RECLASSIFY_SCHEMA: &str = r#"{
146| | "type": "object",
147| | "properties": {
148| | "relation": { "type": "string" },
149| | "strength": { "type": "number", "minimum": 0.0, "maximum": 1.0 },
150| | "reasoning": { "type": "string" }
151| | },
152| | "required": ["relation", "strength", "reasoning"],
153| | "additionalProperties": false
154| |}"#;
155| |
156| |// G27 P2: entity-connect — suggest relationships between isolated entities
157| |const ENTITY_CONNECT_PROMPT: &str = "You are a knowledge graph quality auditor. Two entities exist in the same graph but have no relationship between them. Determine if a meaningful relationship exists.\n\n\
158| |Valid canonical relations: depends-on, uses, supports, causes, fixes, contradicts, applies-to, follows, replaces, tracked-in, related.\n\n\
159| |If NO meaningful relationship exists, set relation to \"none\".\n\
160| |Respond with the relation (or \"none\"), strength, and reasoning.";
161| |
162| |const ENTITY_CONNECT_SCHEMA: &str = r#"{
163| | "type": "object",
164| | "properties": {
165| | "relation": { "type": "string" },
166| | "strength": { "type": "number", "minimum": 0.0, "maximum": 1.0 },
167| | "reasoning": { "type": "string" }
168| | },
169| | "required": ["relation", "strength", "reasoning"],
170| | "additionalProperties": false
171| |}"#;
172| |
173| |// G27 P2: entity-type-validate — verify entity type assignments
174| |const ENTITY_TYPE_VALIDATE_PROMPT: &str = "You are a knowledge graph quality auditor. Verify whether this entity's type is correct.\n\n\
175| |Valid entity types: project, tool, person, file, concept, incident, decision, organization, location, date.\n\n\
176| |If the current type is correct, keep it. If wrong, suggest the correct type.\n\
177| |Respond with the validated type and reasoning.";
178| |
179| |const ENTITY_TYPE_VALIDATE_SCHEMA: &str = r#"{
180| | "type": "object",
181| | "properties": {
182| | "validated_type": { "type": "string" },
183| | "was_correct": { "type": "boolean" },
184| | "reasoning": { "type": "string" }
185| | },
186| | "required": ["validated_type", "was_correct", "reasoning"],
187| | "additionalProperties": false
188| |}"#;
189| |
190| |// G27 P2: description-enrich — improve generic memory descriptions
191| |const DESCRIPTION_ENRICH_PROMPT: &str = "You are a knowledge graph quality auditor. This memory has a generic or auto-generated description. Write a concise, semantic description (10-20 words) that captures WHAT this memory is about and WHY it matters.\n\n\
192| |BAD: 'ingested from docs/auth.md'\n\
193| |GOOD: 'JWT token rotation strategy with 15-min expiry and refresh flow'\n\n\
194| |Respond with the improved description and reasoning.";
195| |
196| |const DESCRIPTION_ENRICH_SCHEMA: &str = r#"{
197| | "type": "object",
198| | "properties": {
199| | "description": { "type": "string" },
200| | "reasoning": { "type": "string" }
201| | },
202| | "required": ["description", "reasoning"],
203| | "additionalProperties": false
204| |}"#;
205| |
206| |// G27 P2: domain-classify — classify memory into domain category
207| |const DOMAIN_CLASSIFY_PROMPT: &str = "You are a knowledge graph quality auditor. Classify this memory into its primary domain category.\n\n\
208| |Respond with the domain name (kebab-case, 2-4 words) and reasoning.";
209| |
210| |const DOMAIN_CLASSIFY_SCHEMA: &str = r#"{
211| | "type": "object",
212| | "properties": {
213| | "domain": { "type": "string" },
214| | "confidence": { "type": "number", "minimum": 0.0, "maximum": 1.0 },
215| | "reasoning": { "type": "string" }
216| | },
217| | "required": ["domain", "confidence", "reasoning"],
218| | "additionalProperties": false
219| |}"#;
220| |
221| |// G27 P2: graph-audit — audit graph for quality issues
222| |const GRAPH_AUDIT_PROMPT: &str = "You are a knowledge graph quality auditor. Analyze this memory and its entity bindings for quality issues.\n\n\
223| |Check for: missing entities, wrong entity types, redundant relationships, orphaned entities, generic descriptions, low-signal relationships.\n\n\
224| |Respond with a list of issues found (or empty if none) and an overall quality score.";
225| |
226| |const GRAPH_AUDIT_SCHEMA: &str = r#"{
227| | "type": "object",
228| | "properties": {
229| | "quality_score": { "type": "number", "minimum": 0.0, "maximum": 1.0 },
230| | "issues": { "type": "array", "items": { "type": "object", "properties": { "kind": { "type": "string" }, "detail": { "type": "string" } }, "required": ["kind", "detail"] } },
231| | "reasoning": { "type": "string" }
232| | },
233| | "required": ["quality_score", "issues", "reasoning"],
234| | "additionalProperties": false
235| |}"#;
236| |
237| |// G27 P2: deep-research-synth — synthesize research findings into graph
238| |const DEEP_RESEARCH_SYNTH_PROMPT: &str = "You are a knowledge graph synthesizer. Given this memory body, extract key findings and synthesize them into structured entities and relationships.\n\n\
239| |Entity names: lowercase kebab-case, domain-specific.\n\
240| |Relations: depends-on, uses, supports, causes, fixes, contradicts, applies-to, follows, related, replaces, tracked-in.\n\n\
241| |Respond with extracted entities, relationships, and a synthesis summary.";
242| |
243| |const DEEP_RESEARCH_SYNTH_SCHEMA: &str = r#"{
244| | "type": "object",
245| | "properties": {
246| | "entities": { "type": "array", "items": { "type": "object", "properties": { "name": { "type": "string" }, "entity_type": { "type": "string" } }, "required": ["name", "entity_type"] } },
247| | "relationships": { "type": "array", "items": { "type": "object", "properties": { "source": { "type": "string" }, "target": { "type": "string" }, "relation": { "type": "string" }, "strength": { "type": "number" } }, "required": ["source", "target", "relation", "strength"] } },
248| | "summary": { "type": "string" }
249| | },
250| | "required": ["entities", "relationships", "summary"],
251| | "additionalProperties": false
252| |}"#;
253| |
254| |// G27 P2: body-extract — extract structured content from unstructured text
255| |const BODY_EXTRACT_PROMPT: &str = "You are a structured data extractor. Given this memory body (which may be unstructured text, raw notes, or a transcript), extract and restructure the content into a clean, well-organized markdown body.\n\n\
256| |Preserve all factual content. Remove noise, fix formatting, add section headers where appropriate.\n\
257| |Respond with the restructured body and a brief summary of changes.";
258| |
259| |const BODY_EXTRACT_SCHEMA: &str = r#"{
260| | "type": "object",
261| | "properties": {
262| | "restructured_body": { "type": "string" },
263| | "changes_summary": { "type": "string" }
264| | },
265| | "required": ["restructured_body", "changes_summary"],
266| | "additionalProperties": false
267| |}"#;
268| |
269| |// ---------------------------------------------------------------------------
270| |// Prompts
271| |// ---------------------------------------------------------------------------
272| |
273| |const BINDINGS_PROMPT: &str = "You are a knowledge graph entity extractor. Given a memory body, extract:\n\
274| |1. Domain-specific entities (concepts, tools, people, decisions, projects, files)\n\
275| |2. Typed relationships between entities with strength scores\n\n\
276| |Rules:\n\
277| |- Entity names: lowercase kebab-case, 2+ chars, domain-specific only\n\
278| |- NEVER extract generic terms, stop words, numbers, UUIDs, or single characters\n\
279| |- Relationship types MUST be one of: applies-to, uses, depends-on, causes, fixes, contradicts, supports, follows, related, replaces, tracked-in\n\
280| |- NEVER use 'mentions' as relationship type\n\
281| |- Strength: 0.9 for hard dependencies, 0.7 for design relationships, 0.5 for contextual links, 0.3 for weak references\n\
282| |- Prefer fewer high-quality entities over many low-quality ones";
283| |
284| |const ENTITY_DESCRIPTION_PROMPT_PREFIX: &str = "You are a knowledge graph annotator. Given an entity name and type, write a concise one-sentence description (10-20 words) that explains what this entity IS and WHY it matters in the context of software/system design.\n\nEntity name: ";
285| |
286| |const BODY_ENRICH_PROMPT_PREFIX: &str = "You are a knowledge assistant. Given a short or sparse memory body, expand it into a richer, more complete and useful description. Preserve all existing facts. Add context, implications, and relationships that would be valuable for knowledge retrieval.\n\nConstraints:\n- Output only the enriched body text (no metadata, no headers)\n- Preserve the original meaning exactly\n- Target length is provided in the system context\n\nMemory body to enrich:\n\n";
287| |
288| |// ---------------------------------------------------------------------------
289| |// CLI args
290| |// ---------------------------------------------------------------------------
291| |
292| |/// Operation to perform in the `enrich` command.
293| |#[derive(Debug, Clone, PartialEq, Eq, clap::ValueEnum, Serialize, Deserialize)]
294| |#[serde(rename_all = "kebab-case")]
295| |pub enum EnrichOperation {
296| | /// Add missing entity/relationship bindings to memories (fully implemented).
297| | MemoryBindings,
298| | /// Fill NULL/empty entity descriptions with LLM-generated summaries (fully implemented).
299| | EntityDescriptions,
300| | /// Expand short memory bodies into richer content (fully implemented, GAP-18).
301| | BodyEnrich,
302| | /// Calibrate relationship weights using LLM analysis (scan only).
303| | WeightCalibrate,
304| | /// Reclassify relationship types using LLM judgment (scan only).
305| | RelationReclassify,
306| | /// Connect isolated entities by suggesting new relationships (scan only).
307| | EntityConnect,
308| | /// Validate entity type assignments using LLM judgment (scan only).
309| | EntityTypeValidate,
310| | /// Enrich memory descriptions that are generic/auto-generated (scan only).
311| | DescriptionEnrich,
312| | /// Identify cross-domain bridges between disconnected subgraphs (scan only).
313| | CrossDomainBridges,
314| | /// Classify memories into domain categories (scan only).
315| | DomainClassify,
316| | /// Audit the graph for quality issues (scan only).
317| | GraphAudit,
318| | /// Synthesize deep-research findings into graph memories (scan only).
319| | DeepResearchSynth,
320| | /// Extract structured body from unstructured text (scan only).
321| | BodyExtract,
322| |}
323| |
324| |/// LLM provider for enrichment.
325| |#[derive(Debug, Clone, PartialEq, Eq, clap::ValueEnum)]
326| |pub enum EnrichMode {
327| | /// Use locally installed Claude Code CLI (OAuth-first).
328| | ClaudeCode,
329| | /// Use locally installed OpenAI Codex CLI.
330| | Codex,
331| |}
332| |
333| |impl std::fmt::Display for EnrichMode {
334| 0| fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
335| 0| match self {
336| 0| EnrichMode::ClaudeCode => write!(f, "claude-code"),
337| 0| EnrichMode::Codex => write!(f, "codex"),
338| | }
339| 0| }
340| |}
341| |
342| |/// Arguments for the `enrich` subcommand.
343| |#[derive(clap::Args)]
344| |#[command(
345| | about = "Enrich graph memories and entities using an LLM provider",
346| | after_long_help = "EXAMPLES:\n \
347| | # Add missing entity bindings to all unbound memories\n \
348| | sqlite-graphrag enrich --operation memory-bindings --mode claude-code\n\n \
349| | # Fill entity descriptions (dry-run preview, no tokens spent)\n \
350| | sqlite-graphrag enrich --operation entity-descriptions --dry-run --json\n\n \
351| | # Expand short memory bodies (GAP-18)\n \
352| | sqlite-graphrag enrich --operation body-enrich --min-output-chars 600\n\n \
353| | # Resume an interrupted body-enrich run\n \
354| | sqlite-graphrag enrich --operation body-enrich --resume --json\n\n \
355| | # Retry only failed items from a previous run\n \
356| | sqlite-graphrag enrich --operation memory-bindings --retry-failed --json\n\n\
357| | EXIT CODES:\n \
358| | 0 success\n \
359| | 1 validation error (bad args, binary not found)\n \
360| | 14 I/O error"
361| |)]
362| |pub struct EnrichArgs {
363| | /// Enrichment operation to run.
364| | #[arg(long, short = 'o', value_enum, value_name = "OPERATION")]
365| | pub operation: EnrichOperation,
366| |
367| | /// LLM provider to use. Default: claude-code (OAuth-first).
368| | #[arg(long, value_enum, default_value = "claude-code")]
369| | pub mode: EnrichMode,
370| |
371| | /// Maximum number of items to process in this run. Omit for all.
372| | #[arg(long, value_name = "N")]
373| | pub limit: Option<usize>,
374| |
375| | /// Preview items without calling the LLM (zero tokens consumed).
376| | #[arg(long)]
377| | pub dry_run: bool,
378| |
379| | /// Namespace to operate on. Default: global.
380| | #[arg(long, env = "SQLITE_GRAPHRAG_NAMESPACE")]
381| | pub namespace: Option<String>,
382| |
383| | // -- Provider flags (Claude) --
384| | /// Path to the Claude Code binary. Default: auto-detect from PATH.
385| | #[arg(long, value_name = "PATH")]
386| | pub claude_binary: Option<PathBuf>,
387| |
388| | /// Claude model to use (e.g. claude-sonnet-4-6).
389| | #[arg(long, value_name = "MODEL")]
390| | pub claude_model: Option<String>,
391| |
392| | /// Timeout per item in seconds when using Claude Code. Default: 300.
393| | #[arg(long, value_name = "SECONDS", default_value_t = 300)]
394| | pub claude_timeout: u64,
395| |
396| | // -- Provider flags (Codex) --
397| | /// Path to the Codex CLI binary. Default: auto-detect from PATH.
398| | #[arg(long, value_name = "PATH")]
399| | pub codex_binary: Option<PathBuf>,
400| |
401| | /// Codex model to use (e.g. o4-mini).
402| | #[arg(long, value_name = "MODEL")]
403| | pub codex_model: Option<String>,
404| |
405| | /// Timeout per item in seconds when using Codex. Default: 300.
406| | #[arg(long, value_name = "SECONDS", default_value_t = 300)]
407| | pub codex_timeout: u64,
408| |
409| | // -- Cost controls --
410| | /// Abort when cumulative cost exceeds this USD budget (API key only; ignored for OAuth).
411| | #[arg(long, value_name = "USD")]
412| | pub max_cost_usd: Option<f64>,
413| |
414| | // -- Queue controls --
415| | /// Resume a previously interrupted run (skip already-done items).
416| | #[arg(long)]
417| | pub resume: bool,
418| |
419| | /// Retry only items that failed in a previous run.
420| | #[arg(long)]
421| | pub retry_failed: bool,
422| |
423| | // -- body-enrich specific flags (GAP-18) --
424| | /// Minimum output character count for body-enrich. Default: 500.
425| | #[arg(long, value_name = "CHARS", default_value_t = DEFAULT_BODY_ENRICH_MIN_CHARS)]
426| | pub min_output_chars: usize,
427| |
428| | /// Maximum output character count for body-enrich. Default: 2000.
429| | #[arg(long, value_name = "CHARS", default_value_t = DEFAULT_BODY_ENRICH_MAX_CHARS)]
430| | pub max_output_chars: usize,
431| |
432| | /// Check that enriched body preserves all facts from the original (LLM judge). Default: true.
433| | #[arg(long, default_value_t = true)]
434| | pub preserve_check: bool,
435| |
436| | /// Path to a custom prompt template file for body-enrich.
437| | #[arg(long, value_name = "PATH")]
438| | pub prompt_template: Option<PathBuf>,
439| |
440| | /// Number of parallel LLM workers (default 1 = serial).
441| | /// Each worker claims items atomically from the queue DB via UPDATE...RETURNING.
442| | /// Range: 1–32. For 2321 entities, --llm-parallelism 4 reduces wall time ~4×.
443| | #[arg(long, default_value_t = 1, value_name = "N", value_parser = clap::value_parser!(u32).range(1..=32))]
444| | pub llm_parallelism: u32,
445| |
446| | // -- Output / infra --
447| | /// NDJSON output is always emitted; this flag is accepted only for compatibility.
448| | #[arg(long)]
449| | pub json: bool,
450| |
451| | /// Database path override.
452| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
453| | pub db: Option<String>,
454| |
455| | /// G30: poll for the job singleton every second for up to N seconds
456| | /// when another invocation holds the lock. Default: 0 (fail fast).
457| | #[arg(long, value_name = "SECONDS")]
458| | pub wait_job_singleton: Option<u64>,
459| |
460| | /// G30: force acquisition of the singleton lock by removing a stale
461| | /// lock file from a previously crashed invocation. Use only when you
462| | /// are certain no other `enrich`/`ingest` is running.
463| | #[arg(long, default_value_t = false)]
464| | pub force_job_singleton: bool,
465| |
466| | /// G37: select a specific subset of memory names to enrich instead of
467| | /// the full candidate set. Comma-separated, e.g. `--names a,b,c`.
468| | /// Empty when omitted (processes all candidates).
469| | #[arg(long, value_name = "NAMES", value_delimiter = ',')]
470| | pub names: Vec<String>,
471| |
472| | /// G37: read the subset of memory names from a file (one per line).
473| | /// Lines starting with `#` and empty lines are ignored. Combined with
474| | /// `--names` (union) when both are set.
475| | #[arg(long, value_name = "PATH")]
476| | pub names_file: Option<PathBuf>,
477| |
478| | /// G35: probe the LLM provider with a 1-turn ping before processing
479| | /// the batch. Aborts with a clear error if the rate-limit window is
480| | /// closed (avoids burning N turns only to fail on item 1).
481| | #[arg(long, default_value_t = false)]
482| | pub preflight_check: bool,
483| |
484| | /// G35: if a preflight probe or in-flight call hits the Claude rate
485| | /// limit, fall back to `--fallback-mode` (typically `codex`) instead
486| | /// of failing the batch. Ignored when `--mode` is already `codex`.
487| | #[arg(long, value_enum)]
488| | pub fallback_mode: Option<EnrichMode>,
489| |
490| | /// G35: number of seconds before the OAuth rate-limit reset at which
491| | /// the preflight probe should refuse to start. Default 300 (5 min).
492| | #[arg(long, value_name = "SECONDS", default_value_t = 300)]
493| | pub rate_limit_buffer: u64,
494| |
495| | /// G28-D: refuse to start when the 1-minute load average exceeds
496| | /// `2 × ncpus` (or `SQLITE_GRAPHRAG_MAX_LOAD_PER_NCPU` if set).
497| | /// Set to false to skip the check on contended CI runners.
498| | #[arg(long, default_value_t = true)]
499| | pub max_load_check: bool,
500| |
501| | /// G28-D: when the system is saturated, abort the job after this
502| | /// many consecutive HardFailure outcomes. Default 5.
503| | #[arg(long, value_name = "N", default_value_t = 5)]
504| | pub circuit_breaker_threshold: u32,
505| |
506| | /// G29 Step 4: minimum trigram-Jaccard similarity between the
507| | /// original body and the LLM-rewritten body for the rewrite to be
508| | /// accepted. Scores below the threshold are rejected and emitted as
509| | /// `EnrichItemResult::PreservationFailed`. Default 0.7 (per the G29
510| | /// gap specification). Ignored when `--operation` is not
511| | /// `body-enrich`.
512| | #[arg(long, value_name = "FLOAT", default_value_t = 0.7)]
513| | pub preserve_threshold: f64,
514| |
515| | /// G33 Step 3: when set, validate `--codex-model` against the
516| | /// ChatGPT Pro OAuth accepted-model list and abort with a
517| | /// suggestion when the value is unknown. Default true (fail fast
518| | /// to avoid burning OAuth turns). Set to false to opt out.
519| | #[arg(long, default_value_t = true)]
520| | pub codex_model_validate: bool,
521| |
522| | /// G33 Step 3: when set together with an invalid `--codex-model`,
523| | /// automatically substitute the supplied default (e.g. `gpt-5.5`)
524| | /// instead of aborting. The substitution is recorded in the NDJSON
525| | /// stream as `provider_substituted: true` for traceability.
526| | #[arg(long, value_name = "MODEL")]
527| | pub codex_model_fallback: Option<String>,
528| |}
529| |
530| |// ---------------------------------------------------------------------------
531| |// Internal types — raw LLM output structs
532| |// ---------------------------------------------------------------------------
533| |
534| |// ---------------------------------------------------------------------------
535| |// NDJSON event types emitted to stdout
536| |// ---------------------------------------------------------------------------
537| |
538| |#[derive(Debug, Serialize)]
539| |struct PhaseEvent<'a> {
540| | phase: &'a str,
541| | #[serde(skip_serializing_if = "Option::is_none")]
542| | binary_path: Option<&'a str>,
543| | #[serde(skip_serializing_if = "Option::is_none")]
544| | version: Option<&'a str>,
545| | #[serde(skip_serializing_if = "Option::is_none")]
546| | items_total: Option<usize>,
547| | #[serde(skip_serializing_if = "Option::is_none")]
548| | items_pending: Option<usize>,
549| | /// Active parallel LLM worker count (1 = serial). Present only on the "scan" phase event.
550| | #[serde(skip_serializing_if = "Option::is_none")]
551| | llm_parallelism: Option<u32>,
552| |}
553| |
554| |#[derive(Debug, Serialize)]
555| |struct ItemEvent<'a> {
556| | /// Item identifier (memory name or entity name).
557| | item: &'a str,
558| | status: &'a str,
559| | #[serde(skip_serializing_if = "Option::is_none")]
560| | memory_id: Option<i64>,
561| | #[serde(skip_serializing_if = "Option::is_none")]
562| | entity_id: Option<i64>,
563| | #[serde(skip_serializing_if = "Option::is_none")]
564| | entities: Option<usize>,
565| | #[serde(skip_serializing_if = "Option::is_none")]
566| | rels: Option<usize>,
567| | #[serde(skip_serializing_if = "Option::is_none")]
568| | chars_before: Option<usize>,
569| | #[serde(skip_serializing_if = "Option::is_none")]
570| | chars_after: Option<usize>,
571| | #[serde(skip_serializing_if = "Option::is_none")]
572| | cost_usd: Option<f64>,
573| | #[serde(skip_serializing_if = "Option::is_none")]
574| | elapsed_ms: Option<u64>,
575| | #[serde(skip_serializing_if = "Option::is_none")]
576| | error: Option<String>,
577| | index: usize,
578| | total: usize,
579| |}
580| |
581| |#[derive(Debug, Serialize)]
582| |struct EnrichSummary {
583| | summary: bool,
584| | operation: String,
585| | items_total: usize,
586| | completed: usize,
587| | failed: usize,
588| | skipped: usize,
589| | cost_usd: f64,
590| | elapsed_ms: u64,
591| |}
592| |
593| |use crate::output::emit_json_line as emit_json;
594| |
595| |// ---------------------------------------------------------------------------
596| |// Queue DB
597| |// ---------------------------------------------------------------------------
598| |
599| |/// Opens or creates the enrichment queue database.
600| |///
601| |/// The queue schema mirrors `ingest_claude` for resume/retry parity.
602| |/// Uses a different filename (`.enrich-queue.sqlite`) to avoid collision.
603| |///
604| |/// # DRY note
605| |///
606| |/// This is a near-verbatim copy of `open_queue_db` in `ingest_claude.rs`.
607| |/// Both should be unified in a shared `llm_runner.rs` module by the
608| |/// Integration stream.
609| 1|fn open_queue_db(path: &str) -> Result<Connection, AppError> {
610| 1| let conn = Connection::open(path)?;
^0
611| 1| conn.pragma_update(None, "journal_mode", "wal")?;
^0
612| 1| conn.execute_batch(
613| 1| "CREATE TABLE IF NOT EXISTS queue (
614| 1| id INTEGER PRIMARY KEY AUTOINCREMENT,
615| 1| item_key TEXT NOT NULL UNIQUE,
616| 1| item_type TEXT NOT NULL DEFAULT 'memory',
617| 1| status TEXT NOT NULL DEFAULT 'pending',
618| 1| memory_id INTEGER,
619| 1| entity_id INTEGER,
620| 1| entities INTEGER DEFAULT 0,
621| 1| rels INTEGER DEFAULT 0,
622| 1| error TEXT,
623| 1| cost_usd REAL DEFAULT 0.0,
624| 1| attempt INTEGER DEFAULT 0,
625| 1| elapsed_ms INTEGER,
626| 1| created_at TEXT DEFAULT (datetime('now')),
627| 1| done_at TEXT
628| 1| );
629| 1| CREATE INDEX IF NOT EXISTS idx_enrich_queue_status ON queue(status);",
630| 0| )?;
631| 1| Ok(conn)
632| 1|}
633| |
634| |// ---------------------------------------------------------------------------
635| |// LLM invocation — Claude Code
636| |// ---------------------------------------------------------------------------
637| |
638| |/// Calls `claude -p` via the shared `claude_runner` module (G02).
639| |///
640| |/// Returns `(output_value, cost_usd, is_oauth)`.
641| 0|fn call_claude(
642| 0| binary: &Path,
643| 0| prompt: &str,
644| 0| json_schema: &str,
645| 0| input_text: &str,
646| 0| model: Option<&str>,
647| 0| timeout_secs: u64,
648| 0|) -> Result<(serde_json::Value, f64, bool), AppError> {
649| 0| let result = crate::commands::claude_runner::run_claude(
650| 0| binary,
651| 0| prompt,
652| 0| json_schema,
653| 0| input_text,
654| 0| model,
655| 0| timeout_secs,
656| | 7,
657| 0| )?;
658| 0| Ok((result.value, result.cost_usd, result.is_oauth))
659| 0|}
660| |
661| |// ---------------------------------------------------------------------------
662| |// Preflight probe (G35) — single-turn ping to verify the LLM provider
663| |// ---------------------------------------------------------------------------
664| |
665| |/// Result of a single preflight ping (G35).
666| |enum PreflightOutcome {
667| | /// The provider accepted the ping without rate-limit or other errors.
668| | Healthy,
669| | /// The provider rejected the ping due to OAuth rate limit. The
670| | /// `suggestion` field is a human hint that callers can embed in the
671| | /// user-facing error.
672| | RateLimited {
673| | reason: String,
674| | suggestion: &'static str,
675| | },
676| | /// Any other provider error (binary missing, auth failure, etc.).
677| | Error(AppError),
678| |}
679| |
680| |/// Probes the configured LLM provider with a 1-turn ping.
681| |///
682| |/// - Claude: `claude -p "ping" --max-turns 1 --strict-mcp-config --mcp-config '{}' --dangerously-skip-permissions --settings '{"hooks":{}}' --output-format json`
683| |/// - Codex: `codex exec -c mcp_servers='{}' "ping" --json`
684| |///
685| |/// The probe intentionally avoids spawning any MCP server children (G28-A)
686| |/// to keep its own process footprint at the minimum.
687| 0|fn run_preflight_probe(args: &EnrichArgs) -> PreflightOutcome {
688| 0| let timeout = std::time::Duration::from_secs(args.rate_limit_buffer.max(60));
689| |
690| 0| match args.mode {
691| | EnrichMode::ClaudeCode => {
692| 0| let bin = match find_claude_binary(args.claude_binary.as_deref()) {
693| 0| Ok(b) => b,
694| 0| Err(e) => return PreflightOutcome::Error(e),
695| | };
696| 0| let mut cmd = std::process::Command::new(&bin);
697| 0| cmd.env_clear();
698| 0| for var in &["PATH", "HOME", "USER"] {
699| 0| if let Ok(val) = std::env::var(var) {
700| 0| cmd.env(var, val);
701| 0| }
702| | }
703| 0| cmd.arg("-p")
704| 0| .arg("ping")
705| 0| .arg("--max-turns")
706| 0| .arg("1")
707| 0| .arg("--strict-mcp-config")
708| 0| .arg("--mcp-config")
709| 0| .arg("{}")
710| 0| .arg("--dangerously-skip-permissions")
711| 0| .arg("--settings")
712| 0| .arg("{\"hooks\":{}}")
713| 0| .arg("--output-format")
714| 0| .arg("json")
715| 0| .stdin(std::process::Stdio::null())
716| 0| .stdout(std::process::Stdio::piped())
717| 0| .stderr(std::process::Stdio::piped());
718| |
719| 0| let child = match super::claude_runner::spawn_with_memory_limit(&mut cmd) {
720| 0| Ok(c) => c,
721| 0| Err(e) => {
722| 0| return PreflightOutcome::Error(AppError::Io(e));
723| | }
724| | };
725| 0| let output = match wait_with_timeout(child, timeout) {
726| 0| Ok(out) => out,
727| 0| Err(e) => return PreflightOutcome::Error(e),
728| | };
729| 0| if !output.status.success() {
730| 0| let stderr = String::from_utf8_lossy(&output.stderr);
731| 0| if stderr.contains("hit your session limit")
732| 0| || stderr.contains("rate_limit")
733| 0| || stderr.contains("429")
734| | {
735| 0| return PreflightOutcome::RateLimited {
736| 0| reason: stderr.trim().to_string(),
737| 0| suggestion:
738| 0| "wait for the OAuth window to reset or use --fallback-mode codex",
739| 0| };
740| 0| }
741| 0| return PreflightOutcome::Error(AppError::Validation(format!(
742| 0| "preflight probe failed: {stderr}",
743| 0| stderr = stderr.trim()
744| 0| )));
745| 0| }
746| 0| PreflightOutcome::Healthy
747| | }
748| | EnrichMode::Codex => {
749| 0| let bin = match find_codex_binary(args.codex_binary.as_deref()) {
750| 0| Ok(b) => b,
751| 0| Err(e) => return PreflightOutcome::Error(e),
752| | };
753| 0| super::codex_spawn::validate_codex_model(args.codex_model.as_deref())
754| 0| .map_err(PreflightOutcome::Error)
755| 0| .ok();
756| 0| let schema = "{}";
757| 0| let schema_path = match super::codex_spawn::trusted_schema_path() {
758| 0| Ok(p) => p,
759| 0| Err(e) => return PreflightOutcome::Error(e),
760| | };
761| 0| let spawn_args = super::codex_spawn::CodexSpawnArgs {
762| 0| binary: &bin,
763| 0| prompt: "ping",
764| 0| json_schema: schema,
765| 0| input_text: "",
766| 0| model: args.codex_model.as_deref(),
767| 0| timeout_secs: args.rate_limit_buffer.max(60),
768| 0| schema_path: schema_path.clone(),
769| 0| };
770| 0| let mut cmd = super::codex_spawn::build_codex_command(&spawn_args);
771| 0| let child = match super::claude_runner::spawn_with_memory_limit(&mut cmd) {
772| 0| Ok(c) => c,
773| 0| Err(e) => return PreflightOutcome::Error(AppError::Io(e)),
774| | };
775| 0| let output = match wait_with_timeout(child, timeout) {
776| 0| Ok(out) => out,
777| 0| Err(e) => return PreflightOutcome::Error(e),
778| | };
779| 0| let _ = std::fs::remove_file(&schema_path);
780| 0| if !output.status.success() {
781| 0| let stderr = String::from_utf8_lossy(&output.stderr);
782| 0| if stderr.contains("rate_limit")
783| 0| || stderr.contains("429")
784| 0| || stderr.contains("Too Many Requests")
785| | {
786| 0| return PreflightOutcome::RateLimited {
787| 0| reason: stderr.trim().to_string(),
788| 0| suggestion: "wait for the rate-limit window to reset",
789| 0| };
790| 0| }
791| 0| return PreflightOutcome::Error(AppError::Validation(format!(
792| 0| "preflight probe failed: {stderr}",
793| 0| stderr = stderr.trim()
794| 0| )));
795| 0| }
796| 0| PreflightOutcome::Healthy
797| | }
798| | }
799| 0|}
800| |
801| |/// Cross-platform wait with timeout (no extra crate dependency).
802| 0|fn wait_with_timeout(
803| 0| mut child: std::process::Child,
804| 0| timeout: std::time::Duration,
805| 0|) -> Result<std::process::Output, AppError> {
806| | use wait_timeout::ChildExt;
807| 0| let start = std::time::Instant::now();
808| 0| let status = child.wait_timeout(timeout).map_err(AppError::Io)?;
809| 0| if status.is_none() {
810| 0| let _ = child.kill();
811| 0| let _ = child.wait();
812| 0| return Err(AppError::Validation(format!(
813| 0| "preflight probe timed out after {}s",
814| 0| start.elapsed().as_secs()
815| 0| )));
816| 0| }
817| 0| let mut stdout = Vec::new();
818| 0| if let Some(mut out) = child.stdout.take() {
819| 0| std::io::Read::read_to_end(&mut out, &mut stdout).map_err(AppError::Io)?;
820| 0| }
821| 0| let mut stderr = Vec::new();
822| 0| if let Some(mut err) = child.stderr.take() {
823| 0| std::io::Read::read_to_end(&mut err, &mut stderr).map_err(AppError::Io)?;
824| 0| }
825| 0| let exit = status.unwrap();
826| 0| Ok(std::process::Output {
827| 0| status: exit,
828| 0| stdout,
829| 0| stderr,
830| 0| })
831| 0|}
832| |
833| |// ---------------------------------------------------------------------------
834| |// SCAN helpers — SQL queries that find items needing enrichment
835| |// ---------------------------------------------------------------------------
836| |
837| |/// Returns memories without any `memory_entities` binding.
838| |///
839| |/// These are the targets for `memory-bindings` enrichment. When `name_filter`
840| |/// is non-empty, restricts the scan to the given names (G37); unknown names
841| |/// are silently skipped (the caller can detect them by comparing
842| |/// requested vs. returned).
843| 2|fn scan_unbound_memories(
844| 2| conn: &Connection,
845| 2| namespace: &str,
846| 2| limit: Option<usize>,
847| 2| name_filter: &[String],
848| 2|) -> Result<Vec<(i64, String, String)>, AppError> {
849| 2| let limit_clause = limit.map(|n| format!("LIMIT {n}")).unwrap_or_default();
^0
850| |
851| 2| if name_filter.is_empty() {
852| 2| let sql = format!(
853| 2| "SELECT m.id, m.name, m.body
854| 2| FROM memories m
855| 2| WHERE m.namespace = ?1
856| 2| AND m.deleted_at IS NULL
857| 2| AND NOT EXISTS (
858| 2| SELECT 1 FROM memory_entities me WHERE me.memory_id = m.id
859| 2| )
860| 2| ORDER BY m.id
861| 2| {limit_clause}"
862| | );
863| 2| let mut stmt = conn.prepare(&sql)?;
^0
864| 2| let rows = stmt
865| 2| .query_map(rusqlite::params![namespace], |r| {
^1
866| | Ok((
867| 1| r.get::<_, i64>(0)?,
^0
868| 1| r.get::<_, String>(1)?,
^0
869| 1| r.get::<_, String>(2)?,
^0
870| | ))
871| 1| })?
^0
872| 2| .collect::<Result<Vec<_>, _>>()?;
^0
873| 2| Ok(rows)
874| | } else {
875| | // Build a parameterised IN clause: ?2, ?3, ..., ?{1+n}
876| 0| let placeholders: Vec<String> = (2..=name_filter.len() + 1)
877| 0| .map(|i| format!("?{i}"))
878| 0| .collect();
879| 0| let in_clause = placeholders.join(", ");
880| 0| let sql = format!(
881| 0| "SELECT m.id, m.name, m.body
882| 0| FROM memories m
883| 0| WHERE m.namespace = ?1
884| 0| AND m.deleted_at IS NULL
885| 0| AND m.name IN ({in_clause})
886| 0| AND NOT EXISTS (
887| 0| SELECT 1 FROM memory_entities me WHERE me.memory_id = m.id
888| 0| )
889| 0| ORDER BY m.id
890| 0| {limit_clause}"
891| | );
892| 0| let mut params_vec: Vec<&dyn rusqlite::ToSql> = Vec::with_capacity(1 + name_filter.len());
893| 0| params_vec.push(&namespace);
894| 0| for n in name_filter {
895| 0| params_vec.push(n);
896| 0| }
897| 0| let mut stmt = conn.prepare(&sql)?;
898| 0| let rows = stmt
899| 0| .query_map(
900| 0| rusqlite::params_from_iter(params_vec.iter().copied()),
901| 0| |r| {
902| | Ok((
903| 0| r.get::<_, i64>(0)?,
904| 0| r.get::<_, String>(1)?,
905| 0| r.get::<_, String>(2)?,
906| | ))
907| 0| },
908| 0| )?
909| 0| .collect::<Result<Vec<_>, _>>()?;
910| 0| Ok(rows)
911| | }
912| 2|}
913| |
914| |/// Reads a list of memory names from a UTF-8 text file (G37).
915| |///
916| |/// Empty lines and lines beginning with `#` are skipped. Returns a
917| |/// de-duplicated, order-preserving list of trimmed names.
918| 0|fn read_names_file(path: &Path) -> Result<Vec<String>, AppError> {
919| 0| let content = std::fs::read_to_string(path).map_err(|e| {
920| 0| AppError::Validation(format!("failed to read names file {}: {e}", path.display()))
921| 0| })?;
922| 0| let mut seen = std::collections::HashSet::new();
923| 0| let mut out = Vec::new();
924| 0| for line in content.lines() {
925| 0| let trimmed = line.trim();
926| 0| if trimmed.is_empty() || trimmed.starts_with('#') {
927| 0| continue;
928| 0| }
929| 0| if seen.insert(trimmed.to_string()) {
930| 0| out.push(trimmed.to_string());
931| 0| }
932| | }
933| 0| Ok(out)
934| 0|}
935| |
936| |/// Resolves the union of `--names` and `--names-file` (G37).
937| 0|fn resolve_name_filter(args: &EnrichArgs) -> Result<Vec<String>, AppError> {
938| 0| let mut combined: Vec<String> = args.names.clone();
939| 0| if let Some(p) = &args.names_file {
940| 0| let from_file = read_names_file(p)?;
941| 0| for n in from_file {
942| 0| if !combined.contains(&n) {
943| 0| combined.push(n);
944| 0| }
945| | }
946| 0| }
947| 0| Ok(combined)
948| 0|}
949| |
950| |/// Returns entities with NULL or empty description.
951| |///
952| |/// These are the targets for `entity-descriptions` enrichment.
953| 2|fn scan_entities_without_description(
954| 2| conn: &Connection,
955| 2| namespace: &str,
956| 2| limit: Option<usize>,
957| 2|) -> Result<Vec<(i64, String, String)>, AppError> {
958| 2| let limit_clause = limit.map(|n| format!("LIMIT {n}")).unwrap_or_default();
^0
959| 2| let sql = format!(
960| 2| "SELECT id, name, type
961| 2| FROM entities
962| 2| WHERE namespace = ?1
963| 2| AND (description IS NULL OR description = '')
964| 2| ORDER BY id
965| 2| {limit_clause}"
966| | );
967| 2| let mut stmt = conn.prepare(&sql)?;
^0
968| 2| let rows = stmt
969| 2| .query_map(rusqlite::params![namespace], |r| {
^1
970| | Ok((
971| 1| r.get::<_, i64>(0)?,
^0
972| 1| r.get::<_, String>(1)?,
^0
973| 1| r.get::<_, String>(2)?,
^0
974| | ))
975| 1| })?
^0
976| 2| .collect::<Result<Vec<_>, _>>()?;
^0
977| 2| Ok(rows)
978| 2|}
979| |
980| |/// Returns memories whose body length is below the configured minimum.
981| |///
982| |/// These are the targets for `body-enrich` (GAP-18).
983| 4|fn scan_short_body_memories(
984| 4| conn: &Connection,
985| 4| namespace: &str,
986| 4| min_chars: usize,
987| 4| limit: Option<usize>,
988| 4|) -> Result<Vec<(i64, String, String)>, AppError> {
989| 4| let limit_clause = limit.map(|n| format!("LIMIT {n}")).unwrap_or_default();
^1
990| 4| let sql = format!(
991| 4| "SELECT m.id, m.name, m.body
992| 4| FROM memories m
993| 4| WHERE m.namespace = ?1
994| 4| AND m.deleted_at IS NULL
995| 4| AND LENGTH(COALESCE(m.body,'')) < ?2
996| 4| ORDER BY m.id
997| 4| {limit_clause}"
998| | );
999| 4| let mut stmt = conn.prepare(&sql)?;
^0
1000| 4| let rows = stmt
1001| 5| .query_map(rusqlite::params![namespace, min_chars as i64], |r| {
^4 ^4 ^4
1002| | Ok((
1003| 5| r.get::<_, i64>(0)?,
^0
1004| 5| r.get::<_, String>(1)?,
^0
1005| 5| r.get::<_, String>(2)?,
^0
1006| | ))
1007| 5| })?
^0
1008| 4| .collect::<Result<Vec<_>, _>>()?;
^0
1009| 4| Ok(rows)
1010| 4|}
1011| |
1012| |/// G27: Returns relationships with weight >= 0.7 that may need recalibration.
1013| |#[allow(clippy::type_complexity)]
1014| 0|fn scan_weight_candidates(
1015| 0| conn: &Connection,
1016| 0| namespace: &str,
1017| 0| limit: Option<usize>,
1018| 0|) -> Result<Vec<(i64, String, String, String, f64)>, AppError> {
1019| 0| let limit_clause = limit.map(|n| format!("LIMIT {n}")).unwrap_or_default();
1020| 0| let sql = format!(
1021| 0| "SELECT r.id, e1.name, e2.name, r.relation, r.weight \
1022| 0| FROM relationships r \
1023| 0| JOIN entities e1 ON e1.id = r.source_id \
1024| 0| JOIN entities e2 ON e2.id = r.target_id \
1025| 0| WHERE r.weight >= 0.7 AND e1.namespace = ?1 \
1026| 0| ORDER BY r.weight DESC {limit_clause}"
1027| | );
1028| 0| let mut stmt = conn.prepare(&sql)?;
1029| 0| let rows = stmt
1030| 0| .query_map(rusqlite::params![namespace], |r| {
1031| | Ok((
1032| 0| r.get::<_, i64>(0)?,
1033| 0| r.get::<_, String>(1)?,
1034| 0| r.get::<_, String>(2)?,
1035| 0| r.get::<_, String>(3)?,
1036| 0| r.get::<_, f64>(4)?,
1037| | ))
1038| 0| })?
1039| 0| .collect::<Result<Vec<_>, _>>()?;
1040| 0| Ok(rows)
1041| 0|}
1042| |
1043| |/// G27: Returns relationships with generic relation types (applies_to).
1044| 0|fn scan_generic_relations(
1045| 0| conn: &Connection,
1046| 0| namespace: &str,
1047| 0| limit: Option<usize>,
1048| 0|) -> Result<Vec<(i64, String, String, String)>, AppError> {
1049| 0| let limit_clause = limit.map(|n| format!("LIMIT {n}")).unwrap_or_default();
1050| 0| let sql = format!(
1051| 0| "SELECT r.id, e1.name, e2.name, r.relation \
1052| 0| FROM relationships r \
1053| 0| JOIN entities e1 ON e1.id = r.source_id \
1054| 0| JOIN entities e2 ON e2.id = r.target_id \
1055| 0| WHERE r.relation = 'applies_to' AND e1.namespace = ?1 \
1056| 0| ORDER BY r.id {limit_clause}"
1057| | );
1058| 0| let mut stmt = conn.prepare(&sql)?;
1059| 0| let rows = stmt
1060| 0| .query_map(rusqlite::params![namespace], |r| {
1061| | Ok((
1062| 0| r.get::<_, i64>(0)?,
1063| 0| r.get::<_, String>(1)?,
1064| 0| r.get::<_, String>(2)?,
1065| 0| r.get::<_, String>(3)?,
1066| | ))
1067| 0| })?
1068| 0| .collect::<Result<Vec<_>, _>>()?;
1069| 0| Ok(rows)
1070| 0|}
1071| |
1072| |// ---------------------------------------------------------------------------
1073| |// PERSIST helpers for fully-implemented operations
1074| |// ---------------------------------------------------------------------------
1075| |
1076| |/// Persists entity bindings extracted by the LLM for a memory.
1077| |///
1078| |/// Creates entities via `upsert_entity`, links them to the memory via
1079| |/// `link_memory_entity`, and upserts relationships found between entities.
1080| 0|fn persist_memory_bindings(
1081| 0| conn: &Connection,
1082| 0| namespace: &str,
1083| 0| memory_id: i64,
1084| 0| entities_json: &serde_json::Value,
1085| 0| rels_json: &serde_json::Value,
1086| 0|) -> Result<(usize, usize), AppError> {
1087| | #[derive(Deserialize)]
1088| | struct EntityItem {
1089| | name: String,
1090| | entity_type: String,
1091| | }
1092| | #[derive(Deserialize)]
1093| | struct RelItem {
1094| | source: String,
1095| | target: String,
1096| | relation: String,
1097| | strength: f64,
1098| | }
1099| |
1100| 0| let extracted_entities: Vec<EntityItem> = serde_json::from_value(entities_json.clone())
1101| 0| .map_err(|e| AppError::Validation(format!("failed to parse entities array: {e}")))?;
1102| |
1103| 0| let extracted_rels: Vec<RelItem> = serde_json::from_value(rels_json.clone())
1104| 0| .map_err(|e| AppError::Validation(format!("failed to parse relationships array: {e}")))?;
1105| |
1106| 0| let mut ent_count = 0usize;
1107| 0| let mut rel_count = 0usize;
1108| |
1109| 0| for item in &extracted_entities {
1110| 0| let entity_type = match item.entity_type.parse::<EntityType>() {
1111| 0| Ok(et) => et,
1112| | Err(_) => {
1113| 0| tracing::warn!(
1114| | target: "enrich",
1115| | entity = %item.name,
1116| | entity_type = %item.entity_type,
1117| 0| "entity type not recognized, skipping"
1118| | );
1119| 0| continue;
1120| | }
1121| | };
1122| 0| match entities::upsert_entity(
1123| 0| conn,
1124| 0| namespace,
1125| 0| &NewEntity {
1126| 0| name: item.name.clone(),
1127| 0| entity_type,
1128| 0| description: None,
1129| 0| },
1130| 0| ) {
1131| 0| Ok(eid) => {
1132| 0| let _ = entities::link_memory_entity(conn, memory_id, eid);
1133| 0| ent_count += 1;
1134| 0| }
1135| 0| Err(e) => {
1136| 0| tracing::warn!(
1137| | target: "enrich",
1138| | entity = %item.name,
1139| | error = %e,
1140| 0| "entity upsert skipped"
1141| | );
1142| | }
1143| | }
1144| | }
1145| |
1146| 0| for rel in &extracted_rels {
1147| 0| let normalized = crate::parsers::normalize_relation(&rel.relation);
1148| 0| crate::parsers::warn_if_non_canonical(&normalized);
1149| |
1150| | // Normalize entity names before lookup: upsert_entity normalizes on write,
1151| | // so the lookup must use the same normalized form to find the row.
1152| 0| let src_name = crate::parsers::normalize_entity_name(&rel.source);
1153| 0| let tgt_name = crate::parsers::normalize_entity_name(&rel.target);
1154| 0| let src_id = entities::find_entity_id(conn, namespace, &src_name);
1155| 0| let tgt_id = entities::find_entity_id(conn, namespace, &tgt_name);
1156| 0| if let (Ok(Some(sid)), Ok(Some(tid))) = (src_id, tgt_id) {
1157| 0| let new_rel = NewRelationship {
1158| 0| source: rel.source.clone(),
1159| 0| target: rel.target.clone(),
1160| 0| relation: normalized,
1161| 0| strength: rel.strength,
1162| 0| description: None,
1163| 0| };
1164| 0| if entities::upsert_relationship(conn, namespace, sid, tid, &new_rel).is_ok() {
1165| 0| rel_count += 1;
1166| 0| }
1167| 0| }
1168| | }
1169| |
1170| 0| Ok((ent_count, rel_count))
1171| 0|}
1172| |
1173| |/// Updates an entity's description directly in the `entities` table.
1174| 1|fn persist_entity_description(
1175| 1| conn: &Connection,
1176| 1| entity_id: i64,
1177| 1| description: &str,
1178| 1|) -> Result<(), AppError> {
1179| 1| conn.execute(
1180| 1| "UPDATE entities SET description = ?1, updated_at = unixepoch() WHERE id = ?2",
1181| 1| rusqlite::params![description, entity_id],
1182| 0| )?;
1183| 1| Ok(())
1184| 1|}
1185| |
1186| |/// Persists an enriched memory body (body-enrich, GAP-18).
1187| |///
1188| |/// Uses `memories::update` to set the new body and `sync_fts_after_update`
1189| |/// to keep FTS5 in sync. Also re-embeds the memory for recall accuracy.
1190| 0|fn persist_enriched_body(
1191| 0| conn: &Connection,
1192| 0| namespace: &str,
1193| 0| memory_id: i64,
1194| 0| memory_name: &str,
1195| 0| new_body: &str,
1196| 0| paths: &crate::paths::AppPaths,
1197| 0|) -> Result<(), AppError> {
1198| | // Read current values for FTS sync
1199| 0| let (old_name, old_desc, old_body): (String, String, String) = conn.query_row(
1200| 0| "SELECT name, COALESCE(description,''), COALESCE(body,'') FROM memories WHERE id=?1",
1201| 0| rusqlite::params![memory_id],
1202| 0| |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?)),
1203| 0| )?;
1204| |
1205| 0| let memory_type: String = conn.query_row(
1206| 0| "SELECT type FROM memories WHERE id=?1",
1207| 0| rusqlite::params![memory_id],
1208| 0| |r| r.get(0),
1209| 0| )?;
1210| |
1211| 0| let description: String = conn.query_row(
1212| 0| "SELECT COALESCE(description,'') FROM memories WHERE id=?1",
1213| 0| rusqlite::params![memory_id],
1214| 0| |r| r.get(0),
1215| 0| )?;
1216| |
1217| 0| let body_hash = blake3::hash(new_body.as_bytes()).to_hex().to_string();
1218| |
1219| 0| let new_memory = memories::NewMemory {
1220| 0| namespace: namespace.to_string(),
1221| 0| name: memory_name.to_string(),
1222| 0| memory_type: memory_type.clone(),
1223| 0| description: description.clone(),
1224| 0| body: new_body.to_string(),
1225| 0| body_hash,
1226| 0| session_id: None,
1227| 0| source: "agent".to_string(),
1228| 0| metadata: serde_json::json!({
1229| 0| "operation": "body-enrich",
1230| 0| "orig_chars": old_body.chars().count(),
1231| 0| "new_chars": new_body.chars().count(),
1232| 0| }),
1233| 0| };
1234| |
1235| | // G29 audit: insert a new immutable version BEFORE the update so the
1236| | // enriched body is reachable through `history --name <X>` and
1237| | // `restore --version N` can roll back to the pre-enrich state.
1238| 0| let next_version = crate::storage::versions::next_version(conn, memory_id)?;
1239| 0| let version_metadata = serde_json::json!({
1240| 0| "operation": "body-enrich",
1241| 0| "orig_chars": old_body.chars().count(),
1242| 0| "new_chars": new_body.chars().count(),
1243| | })
1244| 0| .to_string();
1245| 0| crate::storage::versions::insert_version(
1246| 0| conn,
1247| 0| memory_id,
1248| 0| next_version,
1249| 0| memory_name,
1250| 0| &memory_type,
1251| 0| &description,
1252| 0| new_body,
1253| 0| &version_metadata,
1254| 0| Some("enrich"),
1255| 0| "edit",
1256| 0| )?;
1257| |
1258| 0| memories::update(conn, memory_id, &new_memory, None)?;
1259| 0| memories::sync_fts_after_update(
1260| 0| conn,
1261| 0| memory_id,
1262| 0| &old_name,
1263| 0| &old_desc,
1264| 0| &old_body,
1265| 0| &new_memory.name,
1266| 0| &new_memory.description,
1267| 0| &new_memory.body,
1268| 0| )?;
1269| |
1270| | // Re-embed for recall accuracy
1271| 0| let snippet: String = new_body.chars().take(200).collect();
1272| 0| let tokenizer = crate::tokenizer::get_tokenizer(&paths.models)?;
1273| 0| let chunks_info = crate::chunking::split_into_chunks_hierarchical(new_body, tokenizer);
1274| 0| let embedding_result = if chunks_info.len() <= 1 {
1275| 0| crate::daemon::embed_passage_or_local(&paths.models, new_body)
1276| | } else {
1277| 0| let mut chunk_embeddings: Vec<Vec<f32>> = Vec::with_capacity(chunks_info.len());
1278| 0| let mut ok = true;
1279| 0| for chunk in &chunks_info {
1280| 0| let text = crate::chunking::chunk_text(new_body, chunk);
1281| 0| match crate::daemon::embed_passage_or_local(&paths.models, text) {
1282| 0| Ok(emb) => chunk_embeddings.push(emb),
1283| 0| Err(e) => {
1284| 0| tracing::warn!(target: "enrich", error = %e, "chunk embedding failed");
1285| 0| ok = false;
1286| 0| break;
1287| | }
1288| | }
1289| | }
1290| 0| if ok {
1291| 0| Ok(crate::chunking::aggregate_embeddings(&chunk_embeddings))
1292| | } else {
1293| 0| crate::daemon::embed_passage_or_local(&paths.models, new_body)
1294| | }
1295| | };
1296| |
1297| 0| if let Ok(embedding) = embedding_result {
1298| 0| if let Err(e) = memories::upsert_vec(
1299| 0| conn,
1300| 0| memory_id,
1301| 0| namespace,
1302| 0| &memory_type,
1303| 0| &embedding,
1304| 0| memory_name,
1305| 0| &snippet,
1306| 0| ) {
1307| 0| tracing::warn!(target: "enrich", memory = %memory_name, error = %e, "vec upsert failed after body-enrich");
1308| 0| }
1309| 0| }
1310| |
1311| 0| Ok(())
1312| 0|}
1313| |
1314| |// ---------------------------------------------------------------------------
1315| |// Main entry point
1316| |// ---------------------------------------------------------------------------
1317| |
1318| |/// Main entry point for the `enrich` command.
1319| 0|pub fn run(args: &EnrichArgs) -> Result<(), AppError> {
1320| | // TODO(G20): add mode-conditional flag validation before DB access.
1321| | // Flags that are silently discarded when the wrong mode is active:
1322| | // --mode claude-code: codex_binary, codex_model, codex_timeout
1323| | // --mode codex: claude_binary, claude_model, claude_timeout,
1324| | // max_cost_usd, rate_limit_wait
1325| | // Approach: check each non-default flag value early and return
1326| | // Err(AppError::Validation(...)) for incompatible mode+flag combinations.
1327| 0| let started = Instant::now();
1328| |
1329| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
1330| 0| ensure_db_ready(&paths)?;
1331| 0| let conn = open_rw(&paths.db)?;
1332| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
1333| |
1334| | // G28-B (v1.0.68) + G30 (v1.0.69): enforce singleton per
1335| | // (job_type, namespace, db_hash) so two parallel `enrich` invocations
1336| | // on the same DB cannot co-exist, but concurrent enrich on different
1337| | // databases works as expected. The force flag (--force) breaks a
1338| | // stale lock from a previously crashed invocation.
1339| 0| let wait_secs = args.wait_job_singleton;
1340| 0| let force_flag = args.force_job_singleton;
1341| 0| let _singleton = crate::lock::acquire_job_singleton(
1342| 0| crate::lock::JobType::Enrich,
1343| 0| &namespace,
1344| 0| &paths.db,
1345| 0| wait_secs,
1346| 0| force_flag,
1347| 0| )?;
1348| |
1349| | // Validate provider binary upfront
1350| 0| let _effective_mode: EnrichMode = args.mode.clone();
1351| 0| let provider_binary = match args.mode {
1352| | EnrichMode::ClaudeCode => {
1353| 0| let bin = find_claude_binary(args.claude_binary.as_deref())?;
1354| 0| let version = super::claude_runner::validate_claude_version(&bin)?;
1355| 0| tracing::info!(target: "enrich", binary = %bin.display(), version = %version, "Claude Code binary validated");
1356| 0| emit_json(&PhaseEvent {
1357| 0| phase: "validate",
1358| 0| binary_path: bin.to_str(),
1359| 0| version: Some(&version),
1360| 0| items_total: None,
1361| 0| items_pending: None,
1362| 0| llm_parallelism: None,
1363| 0| });
1364| 0| bin
1365| | }
1366| | EnrichMode::Codex => {
1367| | // Codex provider: locate binary using env or PATH
1368| 0| let bin = find_codex_binary(args.codex_binary.as_deref())?;
1369| 0| emit_json(&PhaseEvent {
1370| 0| phase: "validate",
1371| 0| binary_path: bin.to_str(),
1372| 0| version: None,
1373| 0| items_total: None,
1374| 0| items_pending: None,
1375| 0| llm_parallelism: None,
1376| 0| });
1377| 0| bin
1378| | }
1379| | };
1380| |
1381| | // G28-D: refuse to start when the system is saturated. This check
1382| | // is BEFORE preflight so we never spend an OAuth turn on a host
1383| | // that is already at the limit.
1384| 0| if args.max_load_check && !args.dry_run && crate::system_load::is_system_saturated() {
1385| 0| let load = crate::system_load::load_average_one();
1386| 0| let n = crate::system_load::ncpus();
1387| 0| return Err(AppError::Validation(format!(
1388| 0| "system load average {load:.2} exceeds 2x ncpus ({n}); \
1389| 0| pass --no-max-load-check to override (not recommended)"
1390| 0| )));
1391| 0| }
1392| |
1393| | // G35: preflight probe — issue a single ping turn to verify the
1394| | // provider is healthy before scanning N candidates. If the probe
1395| | // fails with a rate-limit error, optionally fall back to a
1396| | // different mode (typically codex) instead of failing the entire
1397| | // batch. The probe itself consumes 1 OAuth turn, so it stays
1398| | // opt-in (default off) to keep --dry-run and CI flows zero-cost.
1399| 0| if args.preflight_check && !args.dry_run {
1400| 0| let preflight_result = run_preflight_probe(args);
1401| 0| match preflight_result {
1402| | PreflightOutcome::Healthy => {
1403| 0| tracing::info!(target: "enrich", mode = ?args.mode, "preflight probe healthy");
1404| | }
1405| 0| PreflightOutcome::RateLimited { reason, suggestion } => {
1406| 0| if let Some(fallback) = args.fallback_mode.clone() {
1407| 0| if fallback != args.mode {
1408| | // G35 (v1.0.69): the mid-batch mode switch is
1409| | // intentionally NOT applied because it would
1410| | // desynchronise the per-item rate-limit wait
1411| | // state (rate-limited items in the worker are
1412| | // timed against the original provider). Instead
1413| | // we abort cleanly so the operator can re-invoke
1414| | // with `--mode {fallback:?}`. This guarantees no
1415| | // OAuth window is wasted and no partial state
1416| | // is left in the queue.
1417| 0| return Err(AppError::Validation(format!(
1418| 0| "preflight detected rate limit on {mode:?}: {reason}; \
1419| 0| re-invoke with `--mode {fallback:?}` to use the fallback provider",
1420| 0| mode = args.mode
1421| 0| )));
1422| 0| }
1423| 0| return Err(AppError::Validation(format!(
1424| 0| "preflight detected rate limit on {mode:?}: {reason}; \
1425| 0| --fallback-mode matches --mode, no recovery possible",
1426| 0| mode = args.mode
1427| 0| )));
1428| 0| }
1429| 0| return Err(AppError::Validation(format!(
1430| 0| "preflight detected rate limit on {mode:?}: {reason}; \
1431| 0| {suggestion}; pass --fallback-mode codex to recover",
1432| 0| mode = args.mode
1433| 0| )));
1434| | }
1435| 0| PreflightOutcome::Error(e) => {
1436| 0| return Err(e);
1437| | }
1438| | }
1439| 0| }
1440| |
1441| | // SCAN phase
1442| 0| let scan_result = scan_operation(&conn, &namespace, args)?;
1443| 0| let total = scan_result.len();
1444| |
1445| 0| emit_json(&PhaseEvent {
1446| 0| phase: "scan",
1447| 0| binary_path: None,
1448| 0| version: None,
1449| 0| items_total: Some(total),
1450| 0| items_pending: Some(total),
1451| 0| llm_parallelism: Some(args.llm_parallelism),
1452| 0| });
1453| |
1454| | // Dry-run: emit preview events and summary without calling LLM
1455| 0| if args.dry_run {
1456| 0| for (idx, key) in scan_result.iter().enumerate() {
1457| 0| emit_json(&ItemEvent {
1458| 0| item: key,
1459| 0| status: "preview",
1460| 0| memory_id: None,
1461| 0| entity_id: None,
1462| 0| entities: None,
1463| 0| rels: None,
1464| 0| chars_before: None,
1465| 0| chars_after: None,
1466| 0| cost_usd: None,
1467| 0| elapsed_ms: None,
1468| 0| error: None,
1469| 0| index: idx,
1470| 0| total,
1471| 0| });
1472| 0| }
1473| 0| emit_json(&EnrichSummary {
1474| 0| summary: true,
1475| 0| operation: format!("{:?}", args.operation),
1476| 0| items_total: total,
1477| 0| completed: 0,
1478| 0| failed: 0,
1479| 0| skipped: 0,
1480| 0| cost_usd: 0.0,
1481| 0| elapsed_ms: started.elapsed().as_millis() as u64,
1482| 0| });
1483| 0| return Ok(());
1484| 0| }
1485| |
1486| | // All 13 operations are now implemented (G27 complete).
1487| |
1488| | // Queue setup for resume/retry
1489| 0| let queue_conn = open_queue_db(DEFAULT_QUEUE_DB)?;
1490| |
1491| 0| if args.resume {
1492| 0| let reset = queue_conn
1493| 0| .execute(
1494| 0| "UPDATE queue SET status='pending' WHERE status='processing'",
1495| 0| [],
1496| | )
1497| 0| .map_err(|e| AppError::Validation(format!("queue resume failed: {e}")))?;
1498| 0| if reset > 0 {
1499| 0| tracing::info!(target: "enrich", count = reset, "reset stuck processing items to pending");
1500| 0| }
1501| 0| }
1502| |
1503| 0| if args.retry_failed {
1504| 0| let count = queue_conn
1505| 0| .execute(
1506| 0| "UPDATE queue SET status='pending', attempt=0 WHERE status='failed'",
1507| 0| [],
1508| | )
1509| 0| .map_err(|e| AppError::Validation(format!("queue retry-failed reset failed: {e}")))?;
1510| 0| tracing::info!(target: "enrich", count, "retrying failed items");
1511| 0| }
1512| |
1513| 0| if !args.resume && !args.retry_failed {
1514| 0| queue_conn
1515| 0| .execute("DELETE FROM queue", [])
1516| 0| .map_err(|e| AppError::Validation(format!("queue clear failed: {e}")))?;
1517| 0| }
1518| |
1519| | // Populate queue
1520| 0| for (idx, key) in scan_result.iter().enumerate() {
1521| 0| let item_type = match args.operation {
1522| 0| EnrichOperation::EntityDescriptions => "entity",
1523| 0| _ => "memory",
1524| | };
1525| 0| if let Err(e) = queue_conn.execute(
1526| 0| "INSERT OR IGNORE INTO queue (item_key, item_type, status) VALUES (?1, ?2, 'pending')",
1527| 0| rusqlite::params![key, item_type],
1528| 0| ) {
1529| 0| tracing::warn!(target: "enrich", error = %e, "queue insert failed");
1530| 0| }
1531| 0| let _ = idx; // suppress unused warning
1532| | }
1533| |
1534| | // G19: parallel LLM processing via std::thread::scope when parallelism > 1.
1535| | // Clamp enforces the range even if the caller bypasses clap validation.
1536| 0| let parallelism = args.llm_parallelism.clamp(1, 32) as usize;
1537| 0| if parallelism > 1 {
1538| 0| tracing::info!(
1539| | target: "enrich",
1540| | llm_parallelism = parallelism,
1541| 0| "parallel LLM processing with bounded thread pool"
1542| | );
1543| 0| }
1544| | // G28-D (v1.0.68) + G34 (v1.0.69): warn above the recommended parallelism
1545| | // ceiling. The threshold and message depend on the LLM mode because
1546| | // Claude Code spawns MCP children (G28-A) while Codex does not.
1547| 0| if parallelism > 4 {
1548| 0| match args.mode {
1549| | EnrichMode::ClaudeCode => {
1550| 0| tracing::warn!(
1551| | target: "enrich",
1552| | llm_parallelism = parallelism,
1553| | recommended_max = 4,
1554| | mode = "claude-code",
1555| 0| "llm_parallelism above 4 multiplies Claude Code subprocess fan-out; \
1556| 0| consider combining with SQLITE_GRAPHRAG_CLAUDE_EMPTY_CONFIG_DIR \
1557| 0| to cut MCP children (G28-A)"
1558| | );
1559| | }
1560| 0| EnrichMode::Codex if parallelism > 16 => {
1561| 0| tracing::warn!(
1562| | target: "enrich",
1563| | llm_parallelism = parallelism,
1564| | recommended_max = 16,
1565| | mode = "codex",
1566| 0| "llm_parallelism above 16 risks OAuth rate-limit on Codex; \
1567| 0| consider --llm-parallelism 8 for safer concurrency"
1568| | );
1569| | }
1570| 0| EnrichMode::Codex => {
1571| 0| // No warning: codex does not spawn MCP children and was
1572| 0| // validated at parallelism 8 in production (1161 items,
1573| 0| // 0 failures) per the 2026-06-04 session audit.
1574| 0| }
1575| | }
1576| 0| }
1577| |
1578| 0| let mut completed = 0usize;
1579| 0| let mut failed = 0usize;
1580| 0| let mut skipped = 0usize;
1581| 0| let mut cost_total = 0.0f64;
1582| 0| let mut oauth_detected = false;
1583| 0| let mut backoff_secs = DEFAULT_RATE_LIMIT_WAIT;
1584| 0| let rate_limit_deadline = std::time::Instant::now() + std::time::Duration::from_secs(3600);
1585| 0| let enrich_started = std::time::Instant::now();
1586| |
1587| 0| let provider_timeout = match args.mode {
1588| 0| EnrichMode::ClaudeCode => args.claude_timeout,
1589| 0| EnrichMode::Codex => args.codex_timeout,
1590| | };
1591| |
1592| 0| let provider_model: Option<&str> = match args.mode {
1593| 0| EnrichMode::ClaudeCode => args.claude_model.as_deref(),
1594| 0| EnrichMode::Codex => args.codex_model.as_deref(),
1595| | };
1596| |
1597| | // G19: when parallelism > 1, spawn bounded worker threads.
1598| | // Each worker opens its own DB connections (WAL supports concurrent readers + serialized writers).
1599| | // The queue DB claim is atomic via UPDATE...RETURNING — no external lock needed.
1600| 0| if parallelism > 1 {
1601| 0| let stdout_mu = parking_lot::Mutex::new(());
1602| 0| let budget = args.max_cost_usd;
1603| 0| let operation = args.operation.clone();
1604| 0| let mode = args.mode.clone();
1605| 0| let min_oc = args.min_output_chars;
1606| 0| let max_oc = args.max_output_chars;
1607| 0| let prompt_tpl = args.prompt_template.as_deref().map(|p| p.to_path_buf());
1608| |
1609| | struct WorkerResult {
1610| | completed: usize,
1611| | failed: usize,
1612| | skipped: usize,
1613| | cost: f64,
1614| | oauth: bool,
1615| | }
1616| |
1617| 0| let results: Vec<WorkerResult> = std::thread::scope(|s| {
1618| 0| let handles: Vec<_> = (0..parallelism)
1619| 0| .map(|worker_id| {
1620| 0| let stdout_mu = &stdout_mu;
1621| 0| let paths = &paths;
1622| 0| let namespace = &namespace;
1623| 0| let provider_binary = &provider_binary;
1624| 0| let operation = &operation;
1625| 0| let mode = &mode;
1626| 0| let prompt_tpl = prompt_tpl.as_deref();
1627| 0| s.spawn(move || {
1628| 0| let w_conn = match open_rw(&paths.db) {
1629| 0| Ok(c) => c,
1630| 0| Err(e) => {
1631| 0| tracing::error!(target: "enrich", worker = worker_id, error = %e, "worker failed to open DB");
1632| 0| return WorkerResult { completed: 0, failed: 0, skipped: 0, cost: 0.0, oauth: false };
1633| | }
1634| | };
1635| 0| let w_queue = match open_queue_db(DEFAULT_QUEUE_DB) {
1636| 0| Ok(c) => c,
1637| 0| Err(e) => {
1638| 0| tracing::error!(target: "enrich", worker = worker_id, error = %e, "worker failed to open queue DB");
1639| 0| return WorkerResult { completed: 0, failed: 0, skipped: 0, cost: 0.0, oauth: false };
1640| | }
1641| | };
1642| 0| let mut w_completed = 0usize;
1643| 0| let mut w_failed = 0usize;
1644| 0| let mut w_skipped = 0usize;
1645| 0| let mut w_cost = 0.0f64;
1646| 0| let mut w_oauth = false;
1647| 0| let mut w_backoff = DEFAULT_RATE_LIMIT_WAIT;
1648| 0| let w_deadline = std::time::Instant::now() + std::time::Duration::from_secs(3600);
1649| | // G28-D: per-worker circuit breaker that aborts the
1650| | // loop after `circuit_breaker_threshold` consecutive
1651| | // HardFailure outcomes (transient/rate-limited errors
1652| | // do NOT count, so a recovering provider is not
1653| | // penalised).
1654| 0| let mut w_breaker = crate::retry::CircuitBreaker::new(
1655| 0| args.circuit_breaker_threshold.max(1),
1656| 0| std::time::Duration::from_secs(60),
1657| | );
1658| |
1659| | loop {
1660| 0| if crate::shutdown_requested() {
1661| 0| tracing::info!(target: "enrich", "shutdown requested, worker stopping");
1662| 0| break;
1663| 0| }
1664| 0| if let Some(b) = budget {
1665| 0| if !w_oauth && w_cost >= b {
1666| 0| break;
1667| 0| }
1668| 0| }
1669| 0| let pending: Option<(i64, String, String)> = w_queue
1670| 0| .query_row(
1671| 0| "UPDATE queue SET status='processing', attempt=attempt+1 \
1672| 0| WHERE id = (SELECT id FROM queue WHERE status='pending' ORDER BY id LIMIT 1) \
1673| 0| RETURNING id, item_key, item_type",
1674| 0| [],
1675| 0| |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)),
1676| | )
1677| 0| .ok();
1678| 0| let (queue_id, item_key, _item_type) = match pending {
1679| 0| Some(p) => p,
1680| 0| None => break,
1681| | };
1682| 0| let item_started = Instant::now();
1683| 0| let current_index = w_completed + w_failed + w_skipped;
1684| |
1685| 0| let call_result = match operation {
1686| 0| EnrichOperation::MemoryBindings => call_memory_bindings(&w_conn, namespace, &item_key, provider_binary, provider_model, provider_timeout, mode),
1687| 0| EnrichOperation::EntityDescriptions => call_entity_description(&w_conn, namespace, &item_key, provider_binary, provider_model, provider_timeout, mode),
1688| 0| EnrichOperation::BodyEnrich => call_body_enrich(&w_conn, namespace, &item_key, provider_binary, provider_model, provider_timeout, mode, min_oc, max_oc, prompt_tpl, args.preserve_threshold, paths),
1689| 0| EnrichOperation::WeightCalibrate => call_weight_calibrate(&w_conn, namespace, &item_key, provider_binary, provider_model, provider_timeout, mode),
1690| 0| EnrichOperation::RelationReclassify => call_relation_reclassify(&w_conn, namespace, &item_key, provider_binary, provider_model, provider_timeout, mode),
1691| 0| EnrichOperation::EntityConnect | EnrichOperation::CrossDomainBridges => call_entity_connect(&w_conn, namespace, &item_key, provider_binary, provider_model, provider_timeout, mode),
1692| 0| EnrichOperation::EntityTypeValidate => call_entity_type_validate(&w_conn, namespace, &item_key, provider_binary, provider_model, provider_timeout, mode),
1693| 0| EnrichOperation::DescriptionEnrich => call_description_enrich(&w_conn, namespace, &item_key, provider_binary, provider_model, provider_timeout, mode),
1694| 0| EnrichOperation::DomainClassify => call_domain_classify(&w_conn, namespace, &item_key, provider_binary, provider_model, provider_timeout, mode),
1695| 0| EnrichOperation::GraphAudit => call_graph_audit(&w_conn, namespace, &item_key, provider_binary, provider_model, provider_timeout, mode),
1696| 0| EnrichOperation::DeepResearchSynth => call_deep_research_synth(&w_conn, namespace, &item_key, provider_binary, provider_model, provider_timeout, mode),
1697| 0| EnrichOperation::BodyExtract => call_body_extract(&w_conn, namespace, &item_key, provider_binary, provider_model, provider_timeout, mode),
1698| | };
1699| |
1700| 0| match call_result {
1701| 0| Ok(EnrichItemResult::Done { cost, is_oauth, memory_id, entity_id, entities, rels, chars_before, chars_after }) => {
1702| 0| if is_oauth { w_oauth = true; }
1703| 0| w_backoff = DEFAULT_RATE_LIMIT_WAIT;
1704| 0| let _ = w_queue.execute(
1705| 0| "UPDATE queue SET status='done', memory_id=?1, entity_id=?2, entities=?3, rels=?4, cost_usd=?5, elapsed_ms=?6, done_at=datetime('now') WHERE id=?7",
1706| 0| rusqlite::params![memory_id, entity_id, entities as i64, rels as i64, cost, item_started.elapsed().as_millis() as i64, queue_id],
1707| 0| );
1708| 0| w_completed += 1;
1709| 0| if !is_oauth { w_cost += cost; }
1710| | // G28-D: count success; resets breaker.
1711| 0| let _ = w_breaker
1712| 0| .record(crate::retry::AttemptOutcome::Success);
1713| 0| let _guard = stdout_mu.lock();
1714| 0| emit_json(&ItemEvent { item: &item_key, status: "done", memory_id, entity_id, entities: Some(entities), rels: Some(rels), chars_before, chars_after, cost_usd: if is_oauth { None } else { Some(cost) }, elapsed_ms: Some(item_started.elapsed().as_millis() as u64), error: None, index: current_index, total });
1715| | }
1716| 0| Ok(EnrichItemResult::Skipped { reason }) => {
1717| 0| w_skipped += 1;
1718| 0| let _ = w_queue.execute("UPDATE queue SET status='skipped', error=?1, done_at=datetime('now') WHERE id=?2", rusqlite::params![reason, queue_id]);
1719| 0| let _guard = stdout_mu.lock();
1720| 0| emit_json(&ItemEvent { item: &item_key, status: "skipped", memory_id: None, entity_id: None, entities: None, rels: None, chars_before: None, chars_after: None, cost_usd: None, elapsed_ms: Some(item_started.elapsed().as_millis() as u64), error: None, index: current_index, total });
1721| 0| }
1722| 0| Ok(EnrichItemResult::PreservationFailed { score, threshold, chars_before, chars_after }) => {
1723| 0| // G29 Passo 4: worker mirror of the
1724| 0| // serial path. Counted as a soft
1725| 0| // skip so the queue surface shows
1726| 0| // a quality issue rather than a
1727| 0| // transport failure.
1728| 0| w_skipped += 1;
1729| 0| let reason = format!(
1730| 0| "preservation_failed: jaccard={score:.3} threshold={threshold:.3} (orig={chars_before} chars, new={chars_after} chars)"
1731| 0| );
1732| 0| let _ = w_queue.execute(
1733| 0| "UPDATE queue SET status='skipped', error=?1, done_at=datetime('now') WHERE id=?2",
1734| 0| rusqlite::params![reason, queue_id],
1735| 0| );
1736| 0| let _guard = stdout_mu.lock();
1737| 0| emit_json(&ItemEvent {
1738| 0| item: &item_key,
1739| 0| status: "preservation_failed",
1740| 0| memory_id: None,
1741| 0| entity_id: None,
1742| 0| entities: None,
1743| 0| rels: None,
1744| 0| chars_before: Some(chars_before),
1745| 0| chars_after: Some(chars_after),
1746| 0| cost_usd: None,
1747| 0| elapsed_ms: Some(item_started.elapsed().as_millis() as u64),
1748| 0| error: Some(reason),
1749| 0| index: current_index,
1750| 0| total,
1751| 0| });
1752| 0| }
1753| 0| Err(e) => {
1754| 0| let err_str = format!("{e}");
1755| 0| if matches!(e, AppError::RateLimited { .. }) {
1756| 0| if crate::retry::is_kill_switch_active() {
1757| 0| tracing::warn!(target: "enrich", "SQLITE_GRAPHRAG_DISABLE_RETRY=1, skipping rate-limit retry");
1758| 0| } else if std::time::Instant::now() >= w_deadline {
1759| 0| tracing::error!(target: "enrich", "rate-limit retry deadline (1h) exhausted in worker");
1760| | } else {
1761| 0| let half = w_backoff / 2;
1762| 0| let jitter = if half == 0 { 0 } else { fastrand::u64(0..half) };
1763| 0| let actual_wait = half + jitter;
1764| 0| tracing::warn!(target: "enrich", delay_secs = actual_wait, error_kind = "rate_limited", "rate limited in worker, backing off");
1765| 0| let _ = w_queue.execute("UPDATE queue SET status='pending' WHERE id=?1", rusqlite::params![queue_id]);
1766| 0| std::thread::sleep(std::time::Duration::from_secs(actual_wait));
1767| 0| w_backoff = (w_backoff * 2).min(900);
1768| 0| continue;
1769| | }
1770| 0| }
1771| 0| w_failed += 1;
1772| 0| let _ = w_queue.execute("UPDATE queue SET status='failed', error=?1, done_at=datetime('now') WHERE id=?2", rusqlite::params![err_str, queue_id]);
1773| 0| let _guard = stdout_mu.lock();
1774| 0| emit_json(&ItemEvent { item: &item_key, status: "failed", memory_id: None, entity_id: None, entities: None, rels: None, chars_before: None, chars_after: None, cost_usd: None, elapsed_ms: Some(item_started.elapsed().as_millis() as u64), error: Some(err_str), index: current_index, total });
1775| | // G28-D: count hard failure against breaker.
1776| 0| let breaker_opened = w_breaker
1777| 0| .record(crate::retry::AttemptOutcome::HardFailure);
1778| 0| if breaker_opened {
1779| 0| tracing::error!(target: "enrich",
1780| 0| consecutive_failures = w_breaker.consecutive_failures(),
1781| 0| "circuit breaker opened — aborting worker"
1782| | );
1783| 0| break;
1784| 0| }
1785| | }
1786| | }
1787| | }
1788| 0| WorkerResult { completed: w_completed, failed: w_failed, skipped: w_skipped, cost: w_cost, oauth: w_oauth }
1789| 0| })
1790| 0| })
1791| 0| .collect();
1792| 0| handles
1793| 0| .into_iter()
1794| 0| .map(|h| {
1795| 0| h.join().unwrap_or(WorkerResult {
1796| 0| completed: 0,
1797| 0| failed: 0,
1798| 0| skipped: 0,
1799| 0| cost: 0.0,
1800| 0| oauth: false,
1801| 0| })
1802| 0| })
1803| 0| .collect()
1804| 0| });
1805| |
1806| 0| for r in &results {
1807| 0| completed += r.completed;
1808| 0| failed += r.failed;
1809| 0| skipped += r.skipped;
1810| 0| cost_total += r.cost;
1811| 0| oauth_detected |= r.oauth;
1812| 0| }
1813| | } else {
1814| | // Serial path (parallelism == 1) — original loop
1815| | loop {
1816| 0| if crate::shutdown_requested() {
1817| 0| tracing::info!(target: "enrich", "shutdown requested, stopping enrichment");
1818| 0| break;
1819| 0| }
1820| |
1821| | // Budget check
1822| 0| if let Some(budget) = args.max_cost_usd {
1823| 0| if !oauth_detected && cost_total >= budget {
1824| 0| tracing::warn!(target: "enrich", spent = cost_total, budget, "budget exceeded, stopping");
1825| 0| break;
1826| 0| }
1827| 0| }
1828| |
1829| | // Dequeue next pending item
1830| 0| let pending: Option<(i64, String, String)> = queue_conn
1831| 0| .query_row(
1832| 0| "UPDATE queue SET status='processing', attempt=attempt+1 \
1833| 0| WHERE id = (SELECT id FROM queue WHERE status='pending' ORDER BY id LIMIT 1) \
1834| 0| RETURNING id, item_key, item_type",
1835| 0| [],
1836| 0| |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)),
1837| | )
1838| 0| .ok();
1839| |
1840| 0| let (queue_id, item_key, item_type) = match pending {
1841| 0| Some(p) => p,
1842| 0| None => break,
1843| | };
1844| |
1845| 0| let item_started = Instant::now();
1846| 0| let current_index = completed + failed + skipped;
1847| |
1848| 0| let call_result = match args.operation {
1849| 0| EnrichOperation::MemoryBindings => call_memory_bindings(
1850| 0| &conn,
1851| 0| &namespace,
1852| 0| &item_key,
1853| 0| &provider_binary,
1854| 0| provider_model,
1855| 0| provider_timeout,
1856| 0| &args.mode,
1857| | ),
1858| 0| EnrichOperation::EntityDescriptions => call_entity_description(
1859| 0| &conn,
1860| 0| &namespace,
1861| 0| &item_key,
1862| 0| &provider_binary,
1863| 0| provider_model,
1864| 0| provider_timeout,
1865| 0| &args.mode,
1866| | ),
1867| 0| EnrichOperation::BodyEnrich => call_body_enrich(
1868| 0| &conn,
1869| 0| &namespace,
1870| 0| &item_key,
1871| 0| &provider_binary,
1872| 0| provider_model,
1873| 0| provider_timeout,
1874| 0| &args.mode,
1875| 0| args.min_output_chars,
1876| 0| args.max_output_chars,
1877| 0| args.prompt_template.as_deref(),
1878| 0| args.preserve_threshold,
1879| 0| &paths,
1880| | ),
1881| 0| EnrichOperation::WeightCalibrate => call_weight_calibrate(
1882| 0| &conn,
1883| 0| &namespace,
1884| 0| &item_key,
1885| 0| &provider_binary,
1886| 0| provider_model,
1887| 0| provider_timeout,
1888| 0| &args.mode,
1889| | ),
1890| 0| EnrichOperation::RelationReclassify => call_relation_reclassify(
1891| 0| &conn,
1892| 0| &namespace,
1893| 0| &item_key,
1894| 0| &provider_binary,
1895| 0| provider_model,
1896| 0| provider_timeout,
1897| 0| &args.mode,
1898| | ),
1899| | EnrichOperation::EntityConnect | EnrichOperation::CrossDomainBridges => {
1900| 0| call_entity_connect(
1901| 0| &conn,
1902| 0| &namespace,
1903| 0| &item_key,
1904| 0| &provider_binary,
1905| 0| provider_model,
1906| 0| provider_timeout,
1907| 0| &args.mode,
1908| | )
1909| | }
1910| 0| EnrichOperation::EntityTypeValidate => call_entity_type_validate(
1911| 0| &conn,
1912| 0| &namespace,
1913| 0| &item_key,
1914| 0| &provider_binary,
1915| 0| provider_model,
1916| 0| provider_timeout,
1917| 0| &args.mode,
1918| | ),
1919| 0| EnrichOperation::DescriptionEnrich => call_description_enrich(
1920| 0| &conn,
1921| 0| &namespace,
1922| 0| &item_key,
1923| 0| &provider_binary,
1924| 0| provider_model,
1925| 0| provider_timeout,
1926| 0| &args.mode,
1927| | ),
1928| 0| EnrichOperation::DomainClassify => call_domain_classify(
1929| 0| &conn,
1930| 0| &namespace,
1931| 0| &item_key,
1932| 0| &provider_binary,
1933| 0| provider_model,
1934| 0| provider_timeout,
1935| 0| &args.mode,
1936| | ),
1937| 0| EnrichOperation::GraphAudit => call_graph_audit(
1938| 0| &conn,
1939| 0| &namespace,
1940| 0| &item_key,
1941| 0| &provider_binary,
1942| 0| provider_model,
1943| 0| provider_timeout,
1944| 0| &args.mode,
1945| | ),
1946| 0| EnrichOperation::DeepResearchSynth => call_deep_research_synth(
1947| 0| &conn,
1948| 0| &namespace,
1949| 0| &item_key,
1950| 0| &provider_binary,
1951| 0| provider_model,
1952| 0| provider_timeout,
1953| 0| &args.mode,
1954| | ),
1955| 0| EnrichOperation::BodyExtract => call_body_extract(
1956| 0| &conn,
1957| 0| &namespace,
1958| 0| &item_key,
1959| 0| &provider_binary,
1960| 0| provider_model,
1961| 0| provider_timeout,
1962| 0| &args.mode,
1963| | ),
1964| | };
1965| |
1966| 0| match call_result {
1967| | Ok(EnrichItemResult::Done {
1968| 0| memory_id,
1969| 0| entity_id,
1970| 0| entities,
1971| 0| rels,
1972| 0| chars_before,
1973| 0| chars_after,
1974| 0| cost,
1975| 0| is_oauth,
1976| | }) => {
1977| 0| if is_oauth && !oauth_detected {
1978| 0| oauth_detected = true;
1979| 0| tracing::info!(target: "enrich", "OAuth subscription detected — cost_usd omitted from output");
1980| 0| }
1981| 0| backoff_secs = DEFAULT_RATE_LIMIT_WAIT;
1982| |
1983| | // Persist depends on the operation
1984| 0| let persist_err: Option<String> = match args.operation {
1985| | EnrichOperation::MemoryBindings => {
1986| | // Bindings already persisted inside call_memory_bindings
1987| 0| None
1988| | }
1989| | EnrichOperation::EntityDescriptions => {
1990| | // Description already persisted inside call_entity_description
1991| 0| None
1992| | }
1993| | EnrichOperation::BodyEnrich => {
1994| | // Body already persisted inside call_body_enrich
1995| 0| None
1996| | }
1997| | _ => {
1998| | // All G27 operations persist inside their call_* function
1999| 0| None
2000| | }
2001| | };
2002| |
2003| 0| if let Err(e) = queue_conn.execute(
2004| 0| "UPDATE queue SET status='done', memory_id=?1, entity_id=?2, entities=?3, rels=?4, cost_usd=?5, elapsed_ms=?6, done_at=datetime('now') WHERE id=?7",
2005| 0| rusqlite::params![
2006| 0| memory_id,
2007| 0| entity_id,
2008| 0| entities as i64,
2009| 0| rels as i64,
2010| 0| cost,
2011| 0| item_started.elapsed().as_millis() as i64,
2012| 0| queue_id
2013| 0| ],
2014| 0| ) {
2015| 0| tracing::warn!(target: "enrich", error = %e, "queue done update failed");
2016| 0| }
2017| |
2018| 0| if persist_err.is_none() {
2019| 0| completed += 1;
2020| 0| if !is_oauth {
2021| 0| cost_total += cost;
2022| 0| }
2023| 0| emit_json(&ItemEvent {
2024| 0| item: &item_key,
2025| 0| status: "done",
2026| 0| memory_id,
2027| 0| entity_id,
2028| 0| entities: Some(entities),
2029| 0| rels: Some(rels),
2030| 0| chars_before,
2031| 0| chars_after,
2032| 0| cost_usd: if is_oauth { None } else { Some(cost) },
2033| 0| elapsed_ms: Some(item_started.elapsed().as_millis() as u64),
2034| 0| error: None,
2035| 0| index: current_index,
2036| 0| total,
2037| | });
2038| 0| } else {
2039| 0| failed += 1;
2040| 0| emit_json(&ItemEvent {
2041| 0| item: &item_key,
2042| 0| status: "failed",
2043| 0| memory_id: None,
2044| 0| entity_id: None,
2045| 0| entities: None,
2046| 0| rels: None,
2047| 0| chars_before: None,
2048| 0| chars_after: None,
2049| 0| cost_usd: None,
2050| 0| elapsed_ms: Some(item_started.elapsed().as_millis() as u64),
2051| 0| error: persist_err,
2052| 0| index: current_index,
2053| 0| total,
2054| 0| });
2055| 0| }
2056| | }
2057| 0| Ok(EnrichItemResult::Skipped { reason }) => {
2058| 0| skipped += 1;
2059| 0| if let Err(e) = queue_conn.execute(
2060| 0| "UPDATE queue SET status='skipped', error=?1, done_at=datetime('now') WHERE id=?2",
2061| 0| rusqlite::params![reason, queue_id],
2062| 0| ) {
2063| 0| tracing::warn!(target: "enrich", error = %e, "queue skipped update failed");
2064| 0| }
2065| 0| emit_json(&ItemEvent {
2066| 0| item: &item_key,
2067| 0| status: "skipped",
2068| 0| memory_id: None,
2069| 0| entity_id: None,
2070| 0| entities: None,
2071| 0| rels: None,
2072| 0| chars_before: None,
2073| 0| chars_after: None,
2074| 0| cost_usd: None,
2075| 0| elapsed_ms: Some(item_started.elapsed().as_millis() as u64),
2076| 0| error: None,
2077| 0| index: current_index,
2078| 0| total,
2079| 0| });
2080| | }
2081| | Ok(EnrichItemResult::PreservationFailed {
2082| 0| score,
2083| 0| threshold,
2084| 0| chars_before,
2085| 0| chars_after,
2086| | }) => {
2087| | // G29 Passo 4: the LLM rewrite diverged too far from
2088| | // the original body. Count as a soft failure (not
2089| | // `failed`) so the queue surfaces it as a quality
2090| | // issue, not a transport error. The reason is
2091| | // structured so the operator can audit why a body
2092| | // was rejected.
2093| 0| skipped += 1;
2094| 0| let reason = format!(
2095| 0| "preservation_failed: jaccard={score:.3} threshold={threshold:.3} (orig={chars_before} chars, new={chars_after} chars)"
2096| | );
2097| 0| if let Err(qe) = queue_conn.execute(
2098| 0| "UPDATE queue SET status='skipped', error=?1, done_at=datetime('now') WHERE id=?2",
2099| 0| rusqlite::params![reason, queue_id],
2100| 0| ) {
2101| 0| tracing::warn!(target: "enrich", error = %qe, "queue preservation_failed update failed");
2102| 0| }
2103| 0| emit_json(&ItemEvent {
2104| 0| item: &item_key,
2105| 0| status: "preservation_failed",
2106| 0| memory_id: None,
2107| 0| entity_id: None,
2108| 0| entities: None,
2109| 0| rels: None,
2110| 0| chars_before: Some(chars_before),
2111| 0| chars_after: Some(chars_after),
2112| 0| cost_usd: None,
2113| 0| elapsed_ms: Some(item_started.elapsed().as_millis() as u64),
2114| 0| error: Some(reason),
2115| 0| index: current_index,
2116| 0| total,
2117| 0| });
2118| | }
2119| 0| Err(e) => {
2120| 0| let err_str = format!("{e}");
2121| 0| if matches!(e, AppError::RateLimited { .. }) {
2122| 0| if crate::retry::is_kill_switch_active() {
2123| 0| tracing::warn!(target: "enrich", "SQLITE_GRAPHRAG_DISABLE_RETRY=1, skipping rate-limit retry");
2124| 0| } else if std::time::Instant::now() >= rate_limit_deadline {
2125| 0| tracing::error!(target: "enrich", total_elapsed_secs = enrich_started.elapsed().as_secs(), "rate-limit retry deadline (1h) exhausted");
2126| | } else {
2127| 0| let half = backoff_secs / 2;
2128| 0| let jitter = if half == 0 { 0 } else { fastrand::u64(0..half) };
2129| 0| let actual_wait = half + jitter;
2130| 0| tracing::warn!(target: "enrich", delay_secs = actual_wait, error_kind = "rate_limited", "rate limited, backing off");
2131| 0| if let Err(qe) = queue_conn.execute(
2132| 0| "UPDATE queue SET status='pending' WHERE id=?1",
2133| 0| rusqlite::params![queue_id],
2134| 0| ) {
2135| 0| tracing::warn!(target: "enrich", error = %qe, "queue pending update failed");
2136| 0| }
2137| 0| std::thread::sleep(std::time::Duration::from_secs(actual_wait));
2138| 0| backoff_secs = (backoff_secs * 2).min(900);
2139| 0| continue;
2140| | }
2141| 0| }
2142| |
2143| 0| failed += 1;
2144| 0| if let Err(qe) = queue_conn.execute(
2145| 0| "UPDATE queue SET status='failed', error=?1, done_at=datetime('now') WHERE id=?2",
2146| 0| rusqlite::params![err_str, queue_id],
2147| 0| ) {
2148| 0| tracing::warn!(target: "enrich", error = %qe, "queue failed update failed");
2149| 0| }
2150| 0| emit_json(&ItemEvent {
2151| 0| item: &item_key,
2152| 0| status: "failed",
2153| 0| memory_id: None,
2154| 0| entity_id: None,
2155| 0| entities: None,
2156| 0| rels: None,
2157| 0| chars_before: None,
2158| 0| chars_after: None,
2159| 0| cost_usd: None,
2160| 0| elapsed_ms: Some(item_started.elapsed().as_millis() as u64),
2161| 0| error: Some(err_str),
2162| 0| index: current_index,
2163| 0| total,
2164| 0| });
2165| | }
2166| | }
2167| |
2168| 0| let _ = item_type; // used via queue schema only
2169| | }
2170| | } // end else (serial path)
2171| |
2172| 0| let _ = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);");
2173| 0| let _ = queue_conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);");
2174| |
2175| 0| emit_json(&EnrichSummary {
2176| 0| summary: true,
2177| 0| operation: format!("{:?}", args.operation),
2178| 0| items_total: total,
2179| 0| completed,
2180| 0| failed,
2181| 0| skipped,
2182| 0| cost_usd: cost_total,
2183| 0| elapsed_ms: started.elapsed().as_millis() as u64,
2184| 0| });
2185| |
2186| 0| if failed == 0 {
2187| 0| let _ = std::fs::remove_file(DEFAULT_QUEUE_DB);
2188| 0| }
2189| |
2190| 0| Ok(())
2191| 0|}
2192| |
2193| |// ---------------------------------------------------------------------------
2194| |// Internal result type for a single item call
2195| |// ---------------------------------------------------------------------------
2196| |
/// Outcome of processing a single queue item in one of the `call_*` helpers.
///
/// `Debug`, `Clone` and `PartialEq` are derived so outcomes can be logged with
/// `{:?}`, duplicated, and compared in assertions. `Eq` is intentionally not
/// derived because the enum carries `f64` fields.
#[derive(Debug, Clone, PartialEq)]
enum EnrichItemResult {
    /// The item was processed and its result persisted by the helper.
    Done {
        /// Row id of the memory the item refers to, when the item is a memory.
        memory_id: Option<i64>,
        /// Row id of the entity the item refers to, when the item is an entity.
        entity_id: Option<i64>,
        /// Entity count reported by the persistence step.
        entities: usize,
        /// Relationship count reported by the persistence step.
        rels: usize,
        /// Body length in characters before the operation, when applicable.
        chars_before: Option<usize>,
        /// Body length in characters after the operation, when applicable.
        chars_after: Option<usize>,
        /// Cost reported by the provider call for this item.
        cost: f64,
        /// OAuth flag reported by the provider call; the runner omits
        /// `cost_usd` from the emitted event and from the running total when
        /// this is `true`.
        is_oauth: bool,
    },
    /// The item was intentionally not processed.
    Skipped {
        /// Human-readable explanation, stored in the queue row's `error` column.
        reason: String,
    },
    /// G29 Passo 4 (v1.0.69): the LLM rewrite diverged from the original
    /// body beyond the configured `--preserve-threshold` and was rejected
    /// before persistence. The trigram-Jaccard score and threshold are
    /// emitted in the NDJSON stream for operator audit.
    PreservationFailed {
        /// Similarity score between the original and rewritten body.
        score: f64,
        /// Minimum score that would have been accepted.
        threshold: f64,
        /// Character count of the original body.
        chars_before: usize,
        /// Character count of the rejected rewrite.
        chars_after: usize,
    },
}
2222| |
2223| |// ---------------------------------------------------------------------------
2224| |// Per-operation call helpers (SCAN + JUDGE + PERSIST in one unit)
2225| |// ---------------------------------------------------------------------------
2226| |
2227| 0|fn call_memory_bindings(
2228| 0| conn: &Connection,
2229| 0| namespace: &str,
2230| 0| memory_name: &str,
2231| 0| binary: &Path,
2232| 0| model: Option<&str>,
2233| 0| timeout: u64,
2234| 0| mode: &EnrichMode,
2235| 0|) -> Result<EnrichItemResult, AppError> {
2236| | // Look up the memory
2237| 0| let (memory_id, body): (i64, String) = conn.query_row(
2238| 0| "SELECT id, COALESCE(body,'') FROM memories WHERE namespace=?1 AND name=?2 AND deleted_at IS NULL",
2239| 0| rusqlite::params![namespace, memory_name],
2240| 0| |r| Ok((r.get(0)?, r.get(1)?)),
2241| 0| ).map_err(|e| match e {
2242| 0| rusqlite::Error::QueryReturnedNoRows => AppError::NotFound(format!("memory '{memory_name}' not found")),
2243| 0| other => AppError::Database(other),
2244| 0| })?;
2245| |
2246| 0| if body.trim().is_empty() {
2247| 0| return Ok(EnrichItemResult::Skipped {
2248| 0| reason: "body is empty".to_string(),
2249| 0| });
2250| 0| }
2251| |
2252| 0| let (value, cost, is_oauth) = match mode {
2253| 0| EnrichMode::ClaudeCode => call_claude(
2254| 0| binary,
2255| 0| BINDINGS_PROMPT,
2256| 0| BINDINGS_SCHEMA,
2257| 0| &body,
2258| 0| model,
2259| 0| timeout,
2260| 0| )?,
2261| 0| EnrichMode::Codex => call_codex(
2262| 0| binary,
2263| 0| BINDINGS_PROMPT,
2264| 0| BINDINGS_SCHEMA,
2265| 0| &body,
2266| 0| model,
2267| 0| timeout,
2268| 0| )?,
2269| | };
2270| |
2271| 0| let empty_arr = serde_json::Value::Array(vec![]);
2272| 0| let entities_val = value.get("entities").unwrap_or(&empty_arr);
2273| 0| let rels_val = value.get("relationships").unwrap_or(&empty_arr);
2274| |
2275| 0| let (ent_count, rel_count) =
2276| 0| persist_memory_bindings(conn, namespace, memory_id, entities_val, rels_val)?;
2277| |
2278| 0| Ok(EnrichItemResult::Done {
2279| 0| memory_id: Some(memory_id),
2280| 0| entity_id: None,
2281| 0| entities: ent_count,
2282| 0| rels: rel_count,
2283| 0| chars_before: None,
2284| 0| chars_after: None,
2285| 0| cost,
2286| 0| is_oauth,
2287| 0| })
2288| 0|}
2289| |
2290| 0|fn call_entity_description(
2291| 0| conn: &Connection,
2292| 0| namespace: &str,
2293| 0| entity_name: &str,
2294| 0| binary: &Path,
2295| 0| model: Option<&str>,
2296| 0| timeout: u64,
2297| 0| mode: &EnrichMode,
2298| 0|) -> Result<EnrichItemResult, AppError> {
2299| 0| let (entity_id, entity_type): (i64, String) = conn
2300| 0| .query_row(
2301| 0| "SELECT id, type FROM entities WHERE namespace=?1 AND name=?2",
2302| 0| rusqlite::params![namespace, entity_name],
2303| 0| |r| Ok((r.get(0)?, r.get(1)?)),
2304| | )
2305| 0| .map_err(|e| match e {
2306| | rusqlite::Error::QueryReturnedNoRows => {
2307| 0| AppError::NotFound(format!("entity '{entity_name}' not found"))
2308| | }
2309| 0| other => AppError::Database(other),
2310| 0| })?;
2311| |
2312| 0| let prompt = format!(
2313| 0| "{ENTITY_DESCRIPTION_PROMPT_PREFIX}{entity_name}\nEntity type: {entity_type}\n\nGenerate a description:"
2314| | );
2315| |
2316| 0| let (value, cost, is_oauth) = match mode {
2317| 0| EnrichMode::ClaudeCode => call_claude(
2318| 0| binary,
2319| 0| &prompt,
2320| 0| ENTITY_DESCRIPTION_SCHEMA,
2321| 0| "",
2322| 0| model,
2323| 0| timeout,
2324| 0| )?,
2325| 0| EnrichMode::Codex => call_codex(
2326| 0| binary,
2327| 0| &prompt,
2328| 0| ENTITY_DESCRIPTION_SCHEMA,
2329| 0| "",
2330| 0| model,
2331| 0| timeout,
2332| 0| )?,
2333| | };
2334| |
2335| 0| let description = value
2336| 0| .get("description")
2337| 0| .and_then(|v| v.as_str())
2338| 0| .ok_or_else(|| AppError::Validation("LLM result missing 'description' field".into()))?;
2339| |
2340| 0| persist_entity_description(conn, entity_id, description)?;
2341| |
2342| 0| Ok(EnrichItemResult::Done {
2343| 0| memory_id: None,
2344| 0| entity_id: Some(entity_id),
2345| 0| entities: 0,
2346| 0| rels: 0,
2347| 0| chars_before: None,
2348| 0| chars_after: None,
2349| 0| cost,
2350| 0| is_oauth,
2351| 0| })
2352| 0|}
2353| |
2354| |#[allow(clippy::too_many_arguments)]
2355| 0|fn call_body_enrich(
2356| 0| conn: &Connection,
2357| 0| namespace: &str,
2358| 0| memory_name: &str,
2359| 0| binary: &Path,
2360| 0| model: Option<&str>,
2361| 0| timeout: u64,
2362| 0| mode: &EnrichMode,
2363| 0| min_output_chars: usize,
2364| 0| max_output_chars: usize,
2365| 0| prompt_template: Option<&Path>,
2366| 0| preserve_threshold: f64,
2367| 0| paths: &crate::paths::AppPaths,
2368| 0|) -> Result<EnrichItemResult, AppError> {
2369| 0| let (memory_id, body, description, memory_type): (i64, String, String, String) = conn
2370| 0| .query_row(
2371| 0| "SELECT id, COALESCE(body,''), COALESCE(description,''), COALESCE(type,'note') \
2372| 0| FROM memories WHERE namespace=?1 AND name=?2 AND deleted_at IS NULL",
2373| 0| rusqlite::params![namespace, memory_name],
2374| 0| |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?, r.get(3)?)),
2375| | )
2376| 0| .map_err(|e| match e {
2377| | rusqlite::Error::QueryReturnedNoRows => {
2378| 0| AppError::NotFound(format!("memory '{memory_name}' not found"))
2379| | }
2380| 0| other => AppError::Database(other),
2381| 0| })?;
2382| |
2383| 0| let chars_before = body.chars().count();
2384| |
2385| | // G26: gather graph context for contextualized enrichment
2386| 0| let linked_entities: Vec<String> = {
2387| 0| let mut stmt = conn.prepare_cached(
2388| 0| "SELECT e.name FROM memory_entities me \
2389| 0| JOIN entities e ON e.id = me.entity_id \
2390| 0| WHERE me.memory_id = ?1 LIMIT 10",
2391| 0| )?;
2392| 0| let result: Vec<String> = stmt
2393| 0| .query_map(rusqlite::params![memory_id], |r| r.get::<_, String>(0))?
2394| 0| .filter_map(|r| r.ok())
2395| 0| .collect();
2396| 0| drop(stmt);
2397| 0| result
2398| | };
2399| |
2400| | // Load custom prompt template if provided
2401| 0| let prompt_prefix = if let Some(tmpl_path) = prompt_template {
2402| 0| let file_size = std::fs::metadata(tmpl_path)
2403| 0| .map_err(|e| {
2404| 0| AppError::Io(std::io::Error::new(
2405| 0| e.kind(),
2406| 0| format!("failed to stat prompt template: {e}"),
2407| 0| ))
2408| 0| })?
2409| 0| .len();
2410| 0| if file_size > MAX_MEMORY_BODY_LEN as u64 {
2411| 0| return Err(AppError::LimitExceeded(
2412| 0| crate::i18n::validation::body_exceeds(MAX_MEMORY_BODY_LEN),
2413| 0| ));
2414| 0| }
2415| 0| std::fs::read_to_string(tmpl_path).map_err(|e| {
2416| 0| AppError::Io(std::io::Error::new(
2417| 0| e.kind(),
2418| 0| format!("failed to read prompt template: {e}"),
2419| 0| ))
2420| 0| })?
2421| | } else {
2422| 0| BODY_ENRICH_PROMPT_PREFIX.to_string()
2423| | };
2424| |
2425| | // G26: build contextualized prompt with graph data
2426| 0| let context_section = if !linked_entities.is_empty() || !description.is_empty() {
2427| 0| let mut ctx = String::new();
2428| 0| ctx.push_str(&format!(
2429| 0| "\nContext:\n- Memory name: {memory_name}\n- Type: {memory_type}\n"
2430| 0| ));
2431| 0| if !description.is_empty() {
2432| 0| ctx.push_str(&format!("- Description: {description}\n"));
2433| 0| }
2434| 0| ctx.push_str(&format!("- Domain: {namespace}\n"));
2435| 0| if !linked_entities.is_empty() {
2436| 0| ctx.push_str(&format!(
2437| 0| "- Linked entities: {}\n",
2438| 0| linked_entities.join(", ")
2439| 0| ));
2440| 0| }
2441| 0| ctx
2442| | } else {
2443| 0| String::new()
2444| | };
2445| |
2446| 0| let prompt = format!(
2447| 0| "{prompt_prefix}{context_section}\nTarget minimum length: {min_output_chars} characters. Maximum: {max_output_chars} characters."
2448| | );
2449| |
2450| | // The body schema uses a free-form enriched_body field
2451| 0| let (value, cost, is_oauth) = match mode {
2452| | EnrichMode::ClaudeCode => {
2453| 0| call_claude(binary, &prompt, BODY_ENRICH_SCHEMA, &body, model, timeout)?
2454| | }
2455| | EnrichMode::Codex => {
2456| 0| call_codex(binary, &prompt, BODY_ENRICH_SCHEMA, &body, model, timeout)?
2457| | }
2458| | };
2459| |
2460| 0| let enriched_body = value
2461| 0| .get("enriched_body")
2462| 0| .and_then(|v| v.as_str())
2463| 0| .ok_or_else(|| AppError::Validation("LLM result missing 'enriched_body' field".into()))?;
2464| |
2465| 0| let chars_after = enriched_body.chars().count();
2466| |
2467| | // G29 Passo 4 (v1.0.69): preservation check. Before persisting, run
2468| | // a trigram-Jaccard similarity between the original body and the
2469| | // LLM-rewritten body. When the score falls below
2470| | // `args.preserve_threshold` (default 0.7 per the G29 gap), reject the
2471| | // rewrite as a likely hallucination. The result is recorded in the
2472| | // NDJSON stream so operators can audit what the LLM tried to do.
2473| 0| let threshold = preserve_threshold;
2474| 0| let verdict =
2475| 0| crate::preservation::PreservationVerdict::evaluate(&body, enriched_body, threshold);
2476| 0| if !verdict.is_accepted() {
2477| | return Ok(EnrichItemResult::PreservationFailed {
2478| 0| score: match verdict {
2479| 0| crate::preservation::PreservationVerdict::Preserved { score, .. } => score,
2480| 0| crate::preservation::PreservationVerdict::Rejected { score, .. } => score,
2481| 0| crate::preservation::PreservationVerdict::Unchanged { .. } => 1.0,
2482| | },
2483| 0| threshold,
2484| 0| chars_before,
2485| 0| chars_after,
2486| | });
2487| 0| }
2488| |
2489| | // G29 Passo 5 (v1.0.69): idempotency via blake3 hash. Before persisting,
2490| | // compare the hash of the original body against the hash of the enriched
2491| | // body. Identical hashes mean the LLM produced a byte-for-byte identical
2492| | // body (rare but possible) — treat as `Skipped` so re-running the batch
2493| | // is safe and the queue does not get re-persisted entries.
2494| 0| let old_hash = blake3::hash(body.as_bytes()).to_hex().to_string();
2495| 0| let new_hash = blake3::hash(enriched_body.as_bytes()).to_hex().to_string();
2496| 0| if old_hash == new_hash {
2497| 0| return Ok(EnrichItemResult::Skipped {
2498| 0| reason: format!(
2499| 0| "enriched body hash matches original (blake3:{old_hash}); idempotency skip"
2500| 0| ),
2501| 0| });
2502| 0| }
2503| |
2504| | // Only persist if the enriched body is genuinely longer
2505| 0| if chars_after <= chars_before {
2506| 0| return Ok(EnrichItemResult::Skipped {
2507| 0| reason: format!(
2508| 0| "enriched body ({chars_after} chars) not longer than original ({chars_before} chars)"
2509| 0| ),
2510| 0| });
2511| 0| }
2512| |
2513| 0| persist_enriched_body(
2514| 0| conn,
2515| 0| namespace,
2516| 0| memory_id,
2517| 0| memory_name,
2518| 0| enriched_body,
2519| 0| paths,
2520| 0| )?;
2521| |
2522| 0| Ok(EnrichItemResult::Done {
2523| 0| memory_id: Some(memory_id),
2524| 0| entity_id: None,
2525| 0| entities: 0,
2526| 0| rels: 0,
2527| 0| chars_before: Some(chars_before),
2528| 0| chars_after: Some(chars_after),
2529| 0| cost,
2530| 0| is_oauth,
2531| 0| })
2532| 0|}
2533| |
2534| |// ---------------------------------------------------------------------------
2535| |// Scan dispatcher — maps operation to scan query result (item keys)
2536| |// ---------------------------------------------------------------------------
2537| |
2538| 0|fn scan_operation(
2539| 0| conn: &Connection,
2540| 0| namespace: &str,
2541| 0| args: &EnrichArgs,
2542| 0|) -> Result<Vec<String>, AppError> {
2543| | // G37: resolve --names + --names-file once and apply to every scan path.
2544| 0| let name_filter = resolve_name_filter(args)?;
2545| 0| match args.operation {
2546| | EnrichOperation::MemoryBindings => {
2547| 0| let rows = scan_unbound_memories(conn, namespace, args.limit, &name_filter)?;
2548| 0| Ok(rows.into_iter().map(|(_, name, _)| name).collect())
2549| | }
2550| | EnrichOperation::EntityDescriptions => {
2551| 0| let rows = scan_entities_without_description(conn, namespace, args.limit)?;
2552| 0| Ok(rows.into_iter().map(|(_, name, _)| name).collect())
2553| | }
2554| | EnrichOperation::BodyEnrich => {
2555| 0| let rows =
2556| 0| scan_short_body_memories(conn, namespace, args.min_output_chars, args.limit)?;
2557| 0| Ok(rows.into_iter().map(|(_, name, _)| name).collect())
2558| | }
2559| | EnrichOperation::WeightCalibrate => {
2560| 0| let rows = scan_weight_candidates(conn, namespace, args.limit)?;
2561| 0| Ok(rows
2562| 0| .into_iter()
2563| 0| .map(|(id, _, _, _, _)| id.to_string())
2564| 0| .collect())
2565| | }
2566| | EnrichOperation::RelationReclassify => {
2567| 0| let rows = scan_generic_relations(conn, namespace, args.limit)?;
2568| 0| Ok(rows
2569| 0| .into_iter()
2570| 0| .map(|(id, _, _, _)| id.to_string())
2571| 0| .collect())
2572| | }
2573| | EnrichOperation::EntityConnect | EnrichOperation::CrossDomainBridges => {
2574| 0| let pairs = scan_isolated_entity_pairs(conn, namespace, args.limit)?;
2575| 0| Ok(pairs.into_iter().map(|(_, name, _, _)| name).collect())
2576| | }
2577| | EnrichOperation::EntityTypeValidate => {
2578| 0| let rows = scan_entities_for_type_validation(conn, namespace, args.limit)?;
2579| 0| Ok(rows.into_iter().map(|(_, name, _)| name).collect())
2580| | }
2581| | EnrichOperation::DescriptionEnrich => {
2582| 0| let rows = scan_generic_descriptions(conn, namespace, args.limit)?;
2583| 0| Ok(rows.into_iter().map(|(_, name, _)| name).collect())
2584| | }
2585| | EnrichOperation::DomainClassify
2586| | | EnrichOperation::GraphAudit
2587| | | EnrichOperation::DeepResearchSynth
2588| | | EnrichOperation::BodyExtract => {
2589| 0| let limit_clause = args.limit.map(|n| format!("LIMIT {n}")).unwrap_or_default();
2590| 0| let sql = format!(
2591| 0| "SELECT name FROM memories WHERE namespace=?1 AND deleted_at IS NULL ORDER BY id {limit_clause}"
2592| | );
2593| 0| let mut stmt = conn.prepare(&sql)?;
2594| 0| let names = stmt
2595| 0| .query_map(rusqlite::params![namespace], |r| r.get::<_, String>(0))?
2596| 0| .collect::<Result<Vec<_>, _>>()?;
2597| 0| Ok(names)
2598| | }
2599| | }
2600| 0|}
2601| |
2602| |// ---------------------------------------------------------------------------
2603| |// Codex stub provider
2604| |// ---------------------------------------------------------------------------
2605| |
2606| |/// Locates the Codex CLI binary.
2607| 0|fn find_codex_binary(explicit: Option<&Path>) -> Result<PathBuf, AppError> {
2608| 0| if let Some(p) = explicit {
2609| 0| if p.exists() {
2610| 0| return Ok(p.to_path_buf());
2611| 0| }
2612| 0| return Err(AppError::Validation(format!(
2613| 0| "Codex binary not found at explicit path: {}",
2614| 0| p.display()
2615| 0| )));
2616| 0| }
2617| |
2618| 0| if let Ok(env_path) = std::env::var("SQLITE_GRAPHRAG_CODEX_BINARY") {
2619| 0| let p = PathBuf::from(&env_path);
2620| 0| if p.exists() {
2621| 0| return Ok(p);
2622| 0| }
2623| 0| }
2624| |
2625| 0| let name = if cfg!(windows) { "codex.exe" } else { "codex" };
2626| 0| if let Some(path_var) = std::env::var_os("PATH") {
2627| 0| for dir in std::env::split_paths(&path_var) {
2628| 0| let candidate = dir.join(name);
2629| 0| if candidate.exists() {
2630| 0| return Ok(candidate);
2631| 0| }
2632| | }
2633| 0| }
2634| |
2635| 0| Err(AppError::Validation(
2636| 0| "Codex CLI binary not found in PATH. Install it or specify --codex-binary".to_string(),
2637| 0| ))
2638| 0|}
2639| |
2640| |/// G27: Calibrate weight of a single relationship via LLM.
2641| 0|fn call_weight_calibrate(
2642| 0| conn: &Connection,
2643| 0| _namespace: &str,
2644| 0| item_key: &str,
2645| 0| binary: &Path,
2646| 0| model: Option<&str>,
2647| 0| timeout: u64,
2648| 0| mode: &EnrichMode,
2649| 0|) -> Result<EnrichItemResult, AppError> {
2650| 0| let rel_id: i64 = item_key
2651| 0| .parse()
2652| 0| .map_err(|_| AppError::Validation(format!("invalid relationship id: {item_key}")))?;
2653| 0| let (source_name, target_name, relation, current_weight): (String, String, String, f64) = conn
2654| 0| .query_row(
2655| 0| "SELECT e1.name, e2.name, r.relation, r.weight \
2656| 0| FROM relationships r \
2657| 0| JOIN entities e1 ON e1.id = r.source_id \
2658| 0| JOIN entities e2 ON e2.id = r.target_id \
2659| 0| WHERE r.id = ?1",
2660| 0| rusqlite::params![rel_id],
2661| 0| |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?, r.get(3)?)),
2662| | )
2663| 0| .map_err(|_| AppError::NotFound(format!("relationship {rel_id} not found")))?;
2664| |
2665| 0| let input_text = format!(
2666| 0| "Source: {source_name}\nTarget: {target_name}\nRelation: {relation}\nCurrent weight: {current_weight}"
2667| | );
2668| 0| let (value, cost, is_oauth) = match mode {
2669| 0| EnrichMode::ClaudeCode => call_claude(
2670| 0| binary,
2671| 0| WEIGHT_CALIBRATE_PROMPT,
2672| 0| WEIGHT_CALIBRATE_SCHEMA,
2673| 0| &input_text,
2674| 0| model,
2675| 0| timeout,
2676| 0| )?,
2677| 0| EnrichMode::Codex => call_codex(
2678| 0| binary,
2679| 0| WEIGHT_CALIBRATE_PROMPT,
2680| 0| WEIGHT_CALIBRATE_SCHEMA,
2681| 0| &input_text,
2682| 0| model,
2683| 0| timeout,
2684| 0| )?,
2685| | };
2686| |
2687| 0| let calibrated = value
2688| 0| .get("calibrated_weight")
2689| 0| .and_then(|v| v.as_f64())
2690| 0| .ok_or_else(|| AppError::Validation("LLM result missing 'calibrated_weight'".into()))?;
2691| |
2692| 0| conn.execute(
2693| 0| "UPDATE relationships SET weight = ?1 WHERE id = ?2",
2694| 0| rusqlite::params![calibrated, rel_id],
2695| 0| )?;
2696| |
2697| 0| Ok(EnrichItemResult::Done {
2698| 0| memory_id: None,
2699| 0| entity_id: None,
2700| 0| entities: 0,
2701| 0| rels: 1,
2702| 0| chars_before: None,
2703| 0| chars_after: None,
2704| 0| cost,
2705| 0| is_oauth,
2706| 0| })
2707| 0|}
2708| |
2709| |/// G27: Reclassify a generic relationship type via LLM.
2710| 0|fn call_relation_reclassify(
2711| 0| conn: &Connection,
2712| 0| _namespace: &str,
2713| 0| item_key: &str,
2714| 0| binary: &Path,
2715| 0| model: Option<&str>,
2716| 0| timeout: u64,
2717| 0| mode: &EnrichMode,
2718| 0|) -> Result<EnrichItemResult, AppError> {
2719| 0| let rel_id: i64 = item_key
2720| 0| .parse()
2721| 0| .map_err(|_| AppError::Validation(format!("invalid relationship id: {item_key}")))?;
2722| 0| let (source_name, target_name, current_relation): (String, String, String) = conn
2723| 0| .query_row(
2724| 0| "SELECT e1.name, e2.name, r.relation \
2725| 0| FROM relationships r \
2726| 0| JOIN entities e1 ON e1.id = r.source_id \
2727| 0| JOIN entities e2 ON e2.id = r.target_id \
2728| 0| WHERE r.id = ?1",
2729| 0| rusqlite::params![rel_id],
2730| 0| |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?)),
2731| | )
2732| 0| .map_err(|_| AppError::NotFound(format!("relationship {rel_id} not found")))?;
2733| |
2734| 0| let input_text = format!(
2735| 0| "Source entity: {source_name}\nTarget entity: {target_name}\nCurrent relation: {current_relation}"
2736| | );
2737| 0| let (value, cost, is_oauth) = match mode {
2738| 0| EnrichMode::ClaudeCode => call_claude(
2739| 0| binary,
2740| 0| RELATION_RECLASSIFY_PROMPT,
2741| 0| RELATION_RECLASSIFY_SCHEMA,
2742| 0| &input_text,
2743| 0| model,
2744| 0| timeout,
2745| 0| )?,
2746| 0| EnrichMode::Codex => call_codex(
2747| 0| binary,
2748| 0| RELATION_RECLASSIFY_PROMPT,
2749| 0| RELATION_RECLASSIFY_SCHEMA,
2750| 0| &input_text,
2751| 0| model,
2752| 0| timeout,
2753| 0| )?,
2754| | };
2755| |
2756| 0| let new_relation = value
2757| 0| .get("relation")
2758| 0| .and_then(|v| v.as_str())
2759| 0| .ok_or_else(|| AppError::Validation("LLM result missing 'relation'".into()))?;
2760| 0| let new_strength = value
2761| 0| .get("strength")
2762| 0| .and_then(|v| v.as_f64())
2763| 0| .unwrap_or(0.5);
2764| |
2765| 0| conn.execute(
2766| 0| "UPDATE relationships SET relation = ?1, weight = ?2 WHERE id = ?3",
2767| 0| rusqlite::params![new_relation, new_strength, rel_id],
2768| 0| )?;
2769| |
2770| 0| Ok(EnrichItemResult::Done {
2771| 0| memory_id: None,
2772| 0| entity_id: None,
2773| 0| entities: 0,
2774| 0| rels: 1,
2775| 0| chars_before: None,
2776| 0| chars_after: None,
2777| 0| cost,
2778| 0| is_oauth,
2779| 0| })
2780| 0|}
2781| |
2782| |/// G27 P2: Connect isolated entities via LLM-suggested relationship.
2783| 0|fn call_entity_connect(
2784| 0| conn: &Connection,
2785| 0| namespace: &str,
2786| 0| item_key: &str,
2787| 0| binary: &Path,
2788| 0| model: Option<&str>,
2789| 0| timeout: u64,
2790| 0| mode: &EnrichMode,
2791| 0|) -> Result<EnrichItemResult, AppError> {
2792| 0| let pairs = scan_isolated_entity_pairs(conn, namespace, Some(1))?;
2793| 0| let (e1_id, e1_name, e2_id, e2_name) =
2794| 0| match pairs.into_iter().find(|(_, n, _, _)| n == item_key) {
2795| 0| Some(p) => p,
2796| | None => {
2797| 0| return Ok(EnrichItemResult::Skipped {
2798| 0| reason: "pair no longer isolated".into(),
2799| 0| })
2800| | }
2801| | };
2802| 0| let input_text = format!("Entity A: {e1_name}\nEntity B: {e2_name}");
2803| 0| let (value, cost, is_oauth) = match mode {
2804| 0| EnrichMode::ClaudeCode => call_claude(
2805| 0| binary,
2806| 0| ENTITY_CONNECT_PROMPT,
2807| 0| ENTITY_CONNECT_SCHEMA,
2808| 0| &input_text,
2809| 0| model,
2810| 0| timeout,
2811| 0| )?,
2812| 0| EnrichMode::Codex => call_codex(
2813| 0| binary,
2814| 0| ENTITY_CONNECT_PROMPT,
2815| 0| ENTITY_CONNECT_SCHEMA,
2816| 0| &input_text,
2817| 0| model,
2818| 0| timeout,
2819| 0| )?,
2820| | };
2821| 0| let relation = value
2822| 0| .get("relation")
2823| 0| .and_then(|v| v.as_str())
2824| 0| .unwrap_or("none");
2825| 0| if relation == "none" {
2826| 0| return Ok(EnrichItemResult::Skipped {
2827| 0| reason: "LLM determined no relationship".into(),
2828| 0| });
2829| 0| }
2830| 0| let strength = value
2831| 0| .get("strength")
2832| 0| .and_then(|v| v.as_f64())
2833| 0| .unwrap_or(0.5);
2834| 0| conn.execute(
2835| 0| "INSERT OR IGNORE INTO relationships (namespace, source_id, target_id, relation, weight) VALUES (?1, ?2, ?3, ?4, ?5)",
2836| 0| rusqlite::params![namespace, e1_id, e2_id, relation, strength],
2837| 0| )?;
2838| 0| Ok(EnrichItemResult::Done {
2839| 0| memory_id: None,
2840| 0| entity_id: None,
2841| 0| entities: 0,
2842| 0| rels: 1,
2843| 0| chars_before: None,
2844| 0| chars_after: None,
2845| 0| cost,
2846| 0| is_oauth,
2847| 0| })
2848| 0|}
2849| |
2850| |/// G27 P2: Validate entity type assignment via LLM.
2851| 0|fn call_entity_type_validate(
2852| 0| conn: &Connection,
2853| 0| _namespace: &str,
2854| 0| item_key: &str,
2855| 0| binary: &Path,
2856| 0| model: Option<&str>,
2857| 0| timeout: u64,
2858| 0| mode: &EnrichMode,
2859| 0|) -> Result<EnrichItemResult, AppError> {
2860| 0| let (ent_id, ent_name, ent_type): (i64, String, String) = conn
2861| 0| .query_row(
2862| 0| "SELECT id, name, type FROM entities WHERE name = ?1",
2863| 0| rusqlite::params![item_key],
2864| 0| |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?)),
2865| | )
2866| 0| .map_err(|_| AppError::NotFound(format!("entity '{item_key}' not found")))?;
2867| 0| let input_text = format!("Entity: {ent_name}\nCurrent type: {ent_type}");
2868| 0| let (value, cost, is_oauth) = match mode {
2869| 0| EnrichMode::ClaudeCode => call_claude(
2870| 0| binary,
2871| 0| ENTITY_TYPE_VALIDATE_PROMPT,
2872| 0| ENTITY_TYPE_VALIDATE_SCHEMA,
2873| 0| &input_text,
2874| 0| model,
2875| 0| timeout,
2876| 0| )?,
2877| 0| EnrichMode::Codex => call_codex(
2878| 0| binary,
2879| 0| ENTITY_TYPE_VALIDATE_PROMPT,
2880| 0| ENTITY_TYPE_VALIDATE_SCHEMA,
2881| 0| &input_text,
2882| 0| model,
2883| 0| timeout,
2884| 0| )?,
2885| | };
2886| 0| let validated_type = value
2887| 0| .get("validated_type")
2888| 0| .and_then(|v| v.as_str())
2889| 0| .unwrap_or(&ent_type);
2890| 0| let was_correct = value
2891| 0| .get("was_correct")
2892| 0| .and_then(|v| v.as_bool())
2893| 0| .unwrap_or(true);
2894| 0| if !was_correct {
2895| 0| conn.execute(
2896| 0| "UPDATE entities SET type = ?1 WHERE id = ?2",
2897| 0| rusqlite::params![validated_type, ent_id],
2898| 0| )?;
2899| 0| }
2900| 0| Ok(EnrichItemResult::Done {
2901| 0| memory_id: None,
2902| 0| entity_id: Some(ent_id),
2903| 0| entities: 1,
2904| 0| rels: 0,
2905| 0| chars_before: None,
2906| 0| chars_after: None,
2907| 0| cost,
2908| 0| is_oauth,
2909| 0| })
2910| 0|}
2911| |
2912| |/// G27 P2: Enrich generic memory description via LLM.
2913| 0|fn call_description_enrich(
2914| 0| conn: &Connection,
2915| 0| _namespace: &str,
2916| 0| item_key: &str,
2917| 0| binary: &Path,
2918| 0| model: Option<&str>,
2919| 0| timeout: u64,
2920| 0| mode: &EnrichMode,
2921| 0|) -> Result<EnrichItemResult, AppError> {
2922| 0| let (mem_id, body, old_desc): (i64, String, String) = conn
2923| 0| .query_row(
2924| 0| "SELECT id, body, description FROM memories WHERE name = ?1 AND deleted_at IS NULL",
2925| 0| rusqlite::params![item_key],
2926| 0| |r| Ok((r.get(0)?, r.get::<_, String>(1)?, r.get::<_, String>(2)?)),
2927| | )
2928| 0| .map_err(|_| AppError::NotFound(format!("memory '{item_key}' not found")))?;
2929| 0| let snippet: String = body.chars().take(500).collect();
2930| 0| let input_text = format!(
2931| 0| "Memory name: {item_key}\nCurrent description: {old_desc}\nBody preview: {snippet}"
2932| | );
2933| 0| let (value, cost, is_oauth) = match mode {
2934| 0| EnrichMode::ClaudeCode => call_claude(
2935| 0| binary,
2936| 0| DESCRIPTION_ENRICH_PROMPT,
2937| 0| DESCRIPTION_ENRICH_SCHEMA,
2938| 0| &input_text,
2939| 0| model,
2940| 0| timeout,
2941| 0| )?,
2942| 0| EnrichMode::Codex => call_codex(
2943| 0| binary,
2944| 0| DESCRIPTION_ENRICH_PROMPT,
2945| 0| DESCRIPTION_ENRICH_SCHEMA,
2946| 0| &input_text,
2947| 0| model,
2948| 0| timeout,
2949| 0| )?,
2950| | };
2951| 0| let new_desc = value
2952| 0| .get("description")
2953| 0| .and_then(|v| v.as_str())
2954| 0| .unwrap_or(&old_desc);
2955| 0| conn.execute(
2956| 0| "UPDATE memories SET description = ?1 WHERE id = ?2",
2957| 0| rusqlite::params![new_desc, mem_id],
2958| 0| )?;
2959| 0| Ok(EnrichItemResult::Done {
2960| 0| memory_id: Some(mem_id),
2961| 0| entity_id: None,
2962| 0| entities: 0,
2963| 0| rels: 0,
2964| 0| chars_before: Some(old_desc.len()),
2965| 0| chars_after: Some(new_desc.len()),
2966| 0| cost,
2967| 0| is_oauth,
2968| 0| })
2969| 0|}
2970| |
2971| |/// G27 P2: Classify memory into domain category via LLM.
2972| 0|fn call_domain_classify(
2973| 0| conn: &Connection,
2974| 0| _namespace: &str,
2975| 0| item_key: &str,
2976| 0| binary: &Path,
2977| 0| model: Option<&str>,
2978| 0| timeout: u64,
2979| 0| mode: &EnrichMode,
2980| 0|) -> Result<EnrichItemResult, AppError> {
2981| 0| let (mem_id, body, desc): (i64, String, String) = conn
2982| 0| .query_row(
2983| 0| "SELECT id, body, description FROM memories WHERE name = ?1 AND deleted_at IS NULL",
2984| 0| rusqlite::params![item_key],
2985| 0| |r| Ok((r.get(0)?, r.get::<_, String>(1)?, r.get::<_, String>(2)?)),
2986| | )
2987| 0| .map_err(|_| AppError::NotFound(format!("memory '{item_key}' not found")))?;
2988| 0| let snippet: String = body.chars().take(500).collect();
2989| 0| let input_text = format!("Memory: {item_key}\nDescription: {desc}\nBody preview: {snippet}");
2990| 0| let (value, cost, is_oauth) = match mode {
2991| 0| EnrichMode::ClaudeCode => call_claude(
2992| 0| binary,
2993| 0| DOMAIN_CLASSIFY_PROMPT,
2994| 0| DOMAIN_CLASSIFY_SCHEMA,
2995| 0| &input_text,
2996| 0| model,
2997| 0| timeout,
2998| 0| )?,
2999| 0| EnrichMode::Codex => call_codex(
3000| 0| binary,
3001| 0| DOMAIN_CLASSIFY_PROMPT,
3002| 0| DOMAIN_CLASSIFY_SCHEMA,
3003| 0| &input_text,
3004| 0| model,
3005| 0| timeout,
3006| 0| )?,
3007| | };
3008| 0| let domain = value
3009| 0| .get("domain")
3010| 0| .and_then(|v| v.as_str())
3011| 0| .unwrap_or("uncategorized");
3012| 0| let metadata = format!(r#"{{"domain":"{}"}}"#, domain.replace('"', "\\\""));
3013| 0| conn.execute(
3014| 0| "UPDATE memories SET metadata = ?1 WHERE id = ?2",
3015| 0| rusqlite::params![metadata, mem_id],
3016| 0| )?;
3017| 0| Ok(EnrichItemResult::Done {
3018| 0| memory_id: Some(mem_id),
3019| 0| entity_id: None,
3020| 0| entities: 0,
3021| 0| rels: 0,
3022| 0| chars_before: None,
3023| 0| chars_after: None,
3024| 0| cost,
3025| 0| is_oauth,
3026| 0| })
3027| 0|}
3028| |
3029| |/// G27 P2: Audit memory graph quality via LLM.
3030| 0|fn call_graph_audit(
3031| 0| conn: &Connection,
3032| 0| _namespace: &str,
3033| 0| item_key: &str,
3034| 0| binary: &Path,
3035| 0| model: Option<&str>,
3036| 0| timeout: u64,
3037| 0| mode: &EnrichMode,
3038| 0|) -> Result<EnrichItemResult, AppError> {
3039| 0| let (mem_id, body, desc): (i64, String, String) = conn
3040| 0| .query_row(
3041| 0| "SELECT id, body, description FROM memories WHERE name = ?1 AND deleted_at IS NULL",
3042| 0| rusqlite::params![item_key],
3043| 0| |r| Ok((r.get(0)?, r.get::<_, String>(1)?, r.get::<_, String>(2)?)),
3044| | )
3045| 0| .map_err(|_| AppError::NotFound(format!("memory '{item_key}' not found")))?;
3046| 0| let snippet: String = body.chars().take(500).collect();
3047| 0| let ent_count: i64 = conn
3048| 0| .query_row(
3049| 0| "SELECT COUNT(*) FROM memory_entities WHERE memory_id = ?1",
3050| 0| rusqlite::params![mem_id],
3051| 0| |r| r.get(0),
3052| | )
3053| 0| .unwrap_or(0);
3054| 0| let input_text = format!("Memory: {item_key}\nDescription: {desc}\nEntity bindings: {ent_count}\nBody preview: {snippet}");
3055| 0| let (value, cost, is_oauth) = match mode {
3056| 0| EnrichMode::ClaudeCode => call_claude(
3057| 0| binary,
3058| 0| GRAPH_AUDIT_PROMPT,
3059| 0| GRAPH_AUDIT_SCHEMA,
3060| 0| &input_text,
3061| 0| model,
3062| 0| timeout,
3063| 0| )?,
3064| 0| EnrichMode::Codex => call_codex(
3065| 0| binary,
3066| 0| GRAPH_AUDIT_PROMPT,
3067| 0| GRAPH_AUDIT_SCHEMA,
3068| 0| &input_text,
3069| 0| model,
3070| 0| timeout,
3071| 0| )?,
3072| | };
3073| 0| let issues = value
3074| 0| .get("issues")
3075| 0| .and_then(|v| v.as_array())
3076| 0| .map(|a| a.len())
3077| 0| .unwrap_or(0);
3078| 0| Ok(EnrichItemResult::Done {
3079| 0| memory_id: Some(mem_id),
3080| 0| entity_id: None,
3081| 0| entities: 0,
3082| 0| rels: issues,
3083| 0| chars_before: None,
3084| 0| chars_after: None,
3085| 0| cost,
3086| 0| is_oauth,
3087| 0| })
3088| 0|}
3089| |
3090| |/// G27 P2: Synthesize research findings into graph entities/relationships via LLM.
3091| 0|fn call_deep_research_synth(
3092| 0| conn: &Connection,
3093| 0| namespace: &str,
3094| 0| item_key: &str,
3095| 0| binary: &Path,
3096| 0| model: Option<&str>,
3097| 0| timeout: u64,
3098| 0| mode: &EnrichMode,
3099| 0|) -> Result<EnrichItemResult, AppError> {
3100| 0| let (mem_id, body): (i64, String) = conn
3101| 0| .query_row(
3102| 0| "SELECT id, body FROM memories WHERE name = ?1 AND deleted_at IS NULL",
3103| 0| rusqlite::params![item_key],
3104| 0| |r| Ok((r.get(0)?, r.get::<_, String>(1)?)),
3105| | )
3106| 0| .map_err(|_| AppError::NotFound(format!("memory '{item_key}' not found")))?;
3107| 0| let snippet: String = body.chars().take(2000).collect();
3108| 0| let input_text = format!("Memory: {item_key}\nBody:\n{snippet}");
3109| 0| let (value, cost, is_oauth) = match mode {
3110| 0| EnrichMode::ClaudeCode => call_claude(
3111| 0| binary,
3112| 0| DEEP_RESEARCH_SYNTH_PROMPT,
3113| 0| DEEP_RESEARCH_SYNTH_SCHEMA,
3114| 0| &input_text,
3115| 0| model,
3116| 0| timeout,
3117| 0| )?,
3118| 0| EnrichMode::Codex => call_codex(
3119| 0| binary,
3120| 0| DEEP_RESEARCH_SYNTH_PROMPT,
3121| 0| DEEP_RESEARCH_SYNTH_SCHEMA,
3122| 0| &input_text,
3123| 0| model,
3124| 0| timeout,
3125| 0| )?,
3126| | };
3127| 0| let mut ent_count = 0usize;
3128| 0| let mut rel_count = 0usize;
3129| 0| if let Some(ents) = value.get("entities").and_then(|v| v.as_array()) {
3130| 0| for e in ents {
3131| 0| let name = e.get("name").and_then(|v| v.as_str()).unwrap_or_default();
3132| 0| let etype_str = e
3133| 0| .get("entity_type")
3134| 0| .and_then(|v| v.as_str())
3135| 0| .unwrap_or("concept");
3136| 0| let etype: EntityType = etype_str.parse().unwrap_or(EntityType::Concept);
3137| 0| if name.len() >= 2 {
3138| 0| let ne = NewEntity {
3139| 0| name: name.to_string(),
3140| 0| entity_type: etype,
3141| 0| description: None,
3142| 0| };
3143| 0| let _ = entities::upsert_entity(conn, namespace, &ne);
3144| 0| ent_count += 1;
3145| 0| }
3146| | }
3147| 0| }
3148| 0| if let Some(rels) = value.get("relationships").and_then(|v| v.as_array()) {
3149| 0| for r in rels {
3150| 0| let src = r.get("source").and_then(|v| v.as_str()).unwrap_or_default();
3151| 0| let tgt = r.get("target").and_then(|v| v.as_str()).unwrap_or_default();
3152| 0| if src.is_empty() || tgt.is_empty() {
3153| 0| continue;
3154| 0| }
3155| 0| let rel = r
3156| 0| .get("relation")
3157| 0| .and_then(|v| v.as_str())
3158| 0| .unwrap_or("related");
3159| 0| let str_ = r.get("strength").and_then(|v| v.as_f64()).unwrap_or(0.5);
3160| 0| if let (Some(sid), Some(tid)) = (
3161| 0| entities::find_entity_id(conn, namespace, src)?,
3162| 0| entities::find_entity_id(conn, namespace, tgt)?,
3163| 0| ) {
3164| 0| let _ = entities::create_or_fetch_relationship(
3165| 0| conn, namespace, sid, tid, rel, str_, None,
3166| 0| );
3167| 0| rel_count += 1;
3168| 0| }
3169| | }
3170| 0| }
3171| 0| Ok(EnrichItemResult::Done {
3172| 0| memory_id: Some(mem_id),
3173| 0| entity_id: None,
3174| 0| entities: ent_count,
3175| 0| rels: rel_count,
3176| 0| chars_before: None,
3177| 0| chars_after: None,
3178| 0| cost,
3179| 0| is_oauth,
3180| 0| })
3181| 0|}
3182| |
3183| |/// G27 P2: Extract structured body from unstructured text via LLM.
3184| 0|fn call_body_extract(
3185| 0| conn: &Connection,
3186| 0| _namespace: &str,
3187| 0| item_key: &str,
3188| 0| binary: &Path,
3189| 0| model: Option<&str>,
3190| 0| timeout: u64,
3191| 0| mode: &EnrichMode,
3192| 0|) -> Result<EnrichItemResult, AppError> {
3193| 0| let (mem_id, body): (i64, String) = conn
3194| 0| .query_row(
3195| 0| "SELECT id, body FROM memories WHERE name = ?1 AND deleted_at IS NULL",
3196| 0| rusqlite::params![item_key],
3197| 0| |r| Ok((r.get(0)?, r.get::<_, String>(1)?)),
3198| | )
3199| 0| .map_err(|_| AppError::NotFound(format!("memory '{item_key}' not found")))?;
3200| 0| let input_text = format!("Memory: {item_key}\nBody:\n{body}");
3201| 0| let (value, cost, is_oauth) = match mode {
3202| 0| EnrichMode::ClaudeCode => call_claude(
3203| 0| binary,
3204| 0| BODY_EXTRACT_PROMPT,
3205| 0| BODY_EXTRACT_SCHEMA,
3206| 0| &input_text,
3207| 0| model,
3208| 0| timeout,
3209| 0| )?,
3210| 0| EnrichMode::Codex => call_codex(
3211| 0| binary,
3212| 0| BODY_EXTRACT_PROMPT,
3213| 0| BODY_EXTRACT_SCHEMA,
3214| 0| &input_text,
3215| 0| model,
3216| 0| timeout,
3217| 0| )?,
3218| | };
3219| 0| let restructured = value
3220| 0| .get("restructured_body")
3221| 0| .and_then(|v| v.as_str())
3222| 0| .unwrap_or(&body);
3223| 0| let chars_before = body.len();
3224| 0| let chars_after = restructured.len();
3225| 0| let new_hash = blake3::hash(restructured.as_bytes()).to_hex().to_string();
3226| 0| conn.execute(
3227| 0| "UPDATE memories SET body = ?1, body_hash = ?2, updated_at = unixepoch() WHERE id = ?3",
3228| 0| rusqlite::params![restructured, new_hash, mem_id],
3229| 0| )?;
3230| 0| Ok(EnrichItemResult::Done {
3231| 0| memory_id: Some(mem_id),
3232| 0| entity_id: None,
3233| 0| entities: 0,
3234| 0| rels: 0,
3235| 0| chars_before: Some(chars_before),
3236| 0| chars_after: Some(chars_after),
3237| 0| cost,
3238| 0| is_oauth,
3239| 0| })
3240| 0|}
3241| |
3242| |/// Scan for pairs of entities that share no direct relationship.
3243| |#[allow(clippy::type_complexity)]
3244| 0|fn scan_isolated_entity_pairs(
3245| 0| conn: &Connection,
3246| 0| namespace: &str,
3247| 0| limit: Option<usize>,
3248| 0|) -> Result<Vec<(i64, String, i64, String)>, AppError> {
3249| 0| let limit_val = limit.unwrap_or(50) as i64;
3250| 0| let mut stmt = conn.prepare_cached(
3251| 0| "SELECT e1.id, e1.name, e2.id, e2.name FROM entities e1, entities e2 \
3252| 0| WHERE e1.namespace = ?1 AND e2.namespace = ?1 AND e1.id < e2.id \
3253| 0| AND NOT EXISTS (SELECT 1 FROM relationships r WHERE \
3254| 0| (r.source_id = e1.id AND r.target_id = e2.id) OR \
3255| 0| (r.source_id = e2.id AND r.target_id = e1.id)) \
3256| 0| LIMIT ?2",
3257| 0| )?;
3258| 0| let rows = stmt
3259| 0| .query_map(rusqlite::params![namespace, limit_val], |r| {
3260| 0| Ok((r.get(0)?, r.get(1)?, r.get(2)?, r.get(3)?))
3261| 0| })?
3262| 0| .collect::<Result<Vec<_>, _>>()?;
3263| 0| Ok(rows)
3264| 0|}
3265| |
3266| |/// Scan for entities with non-validated types (all entities for type audit).
3267| 0|fn scan_entities_for_type_validation(
3268| 0| conn: &Connection,
3269| 0| namespace: &str,
3270| 0| limit: Option<usize>,
3271| 0|) -> Result<Vec<(i64, String, String)>, AppError> {
3272| 0| let limit_clause = limit.map(|n| format!("LIMIT {n}")).unwrap_or_default();
3273| 0| let sql = format!(
3274| 0| "SELECT id, name, type FROM entities WHERE namespace = ?1 ORDER BY id {limit_clause}"
3275| | );
3276| 0| let mut stmt = conn.prepare(&sql)?;
3277| 0| let rows = stmt
3278| 0| .query_map(rusqlite::params![namespace], |r| {
3279| 0| Ok((r.get(0)?, r.get(1)?, r.get(2)?))
3280| 0| })?
3281| 0| .collect::<Result<Vec<_>, _>>()?;
3282| 0| Ok(rows)
3283| 0|}
3284| |
3285| |/// Scan for memories with generic descriptions (ingested, imported, etc).
3286| 0|fn scan_generic_descriptions(
3287| 0| conn: &Connection,
3288| 0| namespace: &str,
3289| 0| limit: Option<usize>,
3290| 0|) -> Result<Vec<(i64, String, String)>, AppError> {
3291| 0| let limit_clause = limit.map(|n| format!("LIMIT {n}")).unwrap_or_default();
3292| 0| let sql = format!(
3293| 0| "SELECT id, name, description FROM memories WHERE namespace = ?1 AND deleted_at IS NULL \
3294| 0| AND (description LIKE '%ingested%' OR description LIKE '%imported%' OR description LIKE '%added%' OR length(description) < 30) \
3295| 0| ORDER BY id {limit_clause}"
3296| | );
3297| 0| let mut stmt = conn.prepare(&sql)?;
3298| 0| let rows = stmt
3299| 0| .query_map(rusqlite::params![namespace], |r| {
3300| 0| Ok((r.get(0)?, r.get(1)?, r.get(2)?))
3301| 0| })?
3302| 0| .collect::<Result<Vec<_>, _>>()?;
3303| 0| Ok(rows)
3304| 0|}
3305| |
3306| |/// Calls the Codex CLI for a single enrichment item.
3307| |///
3308| |/// Follows the same contract as `call_claude`: returns `(value, cost_usd, is_oauth=false)`.
3309| 0|fn call_codex(
3310| 0| binary: &Path,
3311| 0| prompt: &str,
3312| 0| json_schema: &str,
3313| 0| input_text: &str,
3314| 0| model: Option<&str>,
3315| 0| timeout_secs: u64,
3316| 0|) -> Result<(serde_json::Value, f64, bool), AppError> {
3317| | use wait_timeout::ChildExt;
3318| |
3319| | // G31+G32+G33 (v1.0.69): validate the model BEFORE spawn, write the
3320| | // schema to a trusted cache path (not /tmp), and reuse the
3321| | // consolidated JSONL parser. See `codex_spawn.rs` for the canonical
3322| | // hardening rationale.
3323| 0| super::codex_spawn::validate_codex_model(model)?;
3324| 0| let schema_file = super::codex_spawn::trusted_schema_path()?;
3325| |
3326| 0| let args = super::codex_spawn::CodexSpawnArgs {
3327| 0| binary,
3328| 0| prompt,
3329| 0| json_schema,
3330| 0| input_text,
3331| 0| model,
3332| 0| timeout_secs,
3333| 0| schema_path: schema_file.clone(),
3334| 0| };
3335| 0| let mut cmd = super::codex_spawn::build_codex_command(&args);
3336| |
3337| 0| let mut child = super::claude_runner::spawn_with_memory_limit(&mut cmd).map_err(|e| {
3338| 0| AppError::Io(std::io::Error::new(
3339| 0| e.kind(),
3340| 0| format!("failed to spawn codex: {e}"),
3341| 0| ))
3342| 0| })?;
3343| |
3344| 0| let full_prompt = format!("{prompt}\n\n{input_text}");
3345| 0| let stdin_bytes = full_prompt.into_bytes();
3346| 0| let mut child_stdin = child
3347| 0| .stdin
3348| 0| .take()
3349| 0| .ok_or_else(|| AppError::Validation("failed to open codex stdin".into()))?;
3350| 0| let stdin_thread = std::thread::spawn(move || -> Result<(), std::io::Error> {
3351| 0| child_stdin.write_all(&stdin_bytes)?;
3352| 0| drop(child_stdin);
3353| 0| Ok(())
3354| 0| });
3355| |
3356| 0| let start = std::time::Instant::now();
3357| 0| let timeout = std::time::Duration::from_secs(timeout_secs);
3358| 0| let status = child.wait_timeout(timeout).map_err(AppError::Io)?;
3359| 0| let _ = std::fs::remove_file(&schema_file);
3360| |
3361| 0| match status {
3362| 0| Some(exit_status) => {
3363| 0| stdin_thread
3364| 0| .join()
3365| 0| .map_err(|_| AppError::Validation("stdin thread panicked".into()))?
3366| 0| .map_err(AppError::Io)?;
3367| |
3368| 0| tracing::debug!(
3369| | target: "process",
3370| 0| exit_code = ?exit_status.code(),
3371| 0| elapsed_ms = start.elapsed().as_millis() as u64,
3372| 0| "external process completed"
3373| | );
3374| |
3375| 0| let mut stdout_buf = Vec::new();
3376| 0| if let Some(mut out) = child.stdout.take() {
3377| 0| std::io::Read::read_to_end(&mut out, &mut stdout_buf).map_err(AppError::Io)?;
3378| 0| }
3379| 0| if !exit_status.success() {
3380| 0| let mut stderr_buf = Vec::new();
3381| 0| if let Some(mut err) = child.stderr.take() {
3382| 0| std::io::Read::read_to_end(&mut err, &mut stderr_buf).map_err(AppError::Io)?;
3383| 0| }
3384| 0| let stderr_str = String::from_utf8_lossy(&stderr_buf);
3385| 0| tracing::warn!(
3386| | target: "enrich",
3387| 0| exit_code = ?exit_status.code(),
3388| 0| stderr = %stderr_str.trim(),
3389| 0| "codex process failed"
3390| | );
3391| 0| return Err(AppError::Validation(format!(
3392| 0| "codex exited with code {:?}: {}",
3393| 0| exit_status.code(),
3394| 0| stderr_str.trim()
3395| 0| )));
3396| 0| }
3397| 0| let stdout_str = String::from_utf8(stdout_buf)
3398| 0| .map_err(|_| AppError::Validation("codex stdout is not valid UTF-8".into()))?;
3399| | // G32: use the JSONL parser, NOT serde_json::from_str on the
3400| | // entire stdout (codex emits one event per line).
3401| 0| let result = super::codex_spawn::parse_codex_jsonl(&stdout_str)?;
3402| | // Wrap the extraction as a JSON object so downstream code
3403| | // (which expects a single `serde_json::Value`) keeps working.
3404| | // `ExtractedUrl` lacks `Serialize` so we project to a
3405| | // serde-friendly vector.
3406| 0| let urls: Vec<serde_json::Value> = result
3407| 0| .extraction
3408| 0| .urls
3409| 0| .iter()
3410| 0| .map(|u| serde_json::json!({"url": u.url, "offset": u.offset}))
3411| 0| .collect();
3412| 0| let value = serde_json::json!({
3413| 0| "entities": result.extraction.entities,
3414| 0| "relationships": result.extraction.relationships,
3415| 0| "urls": urls,
3416| 0| "extraction_method": result.extraction.extraction_method,
3417| | });
3418| 0| Ok((value, 0.0, false))
3419| | }
3420| | None => {
3421| 0| let _ = child.kill();
3422| 0| let _ = child.wait();
3423| 0| let _ = stdin_thread.join();
3424| 0| Err(AppError::Validation(format!(
3425| 0| "codex timed out after {timeout_secs} seconds"
3426| 0| )))
3427| | }
3428| | }
3429| 0|}
3430| |
3431| |// ---------------------------------------------------------------------------
3432| |// Tests
3433| |// ---------------------------------------------------------------------------
3434| |
#[cfg(test)]
mod tests {
    use super::*;
    use rusqlite::Connection;

    /// Opens an in-memory SQLite database with a minimal schema for unit tests.
    ///
    /// Only the four tables touched by the scan/persist helpers are created:
    /// `memories`, `entities`, `memory_entities` and `relationships`.
    fn open_test_db() -> Connection {
        let conn = Connection::open_in_memory().expect("in-memory db");
        conn.execute_batch(
            "CREATE TABLE memories (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                namespace TEXT NOT NULL DEFAULT 'global',
                name TEXT NOT NULL,
                type TEXT NOT NULL DEFAULT 'note',
                description TEXT NOT NULL DEFAULT '',
                body TEXT NOT NULL DEFAULT '',
                body_hash TEXT NOT NULL DEFAULT '',
                session_id TEXT,
                source TEXT NOT NULL DEFAULT 'agent',
                metadata TEXT NOT NULL DEFAULT '{}',
                created_at INTEGER NOT NULL DEFAULT (unixepoch()),
                updated_at INTEGER NOT NULL DEFAULT (unixepoch()),
                deleted_at INTEGER,
                UNIQUE(namespace, name)
            );
            CREATE TABLE entities (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                namespace TEXT NOT NULL DEFAULT 'global',
                name TEXT NOT NULL,
                type TEXT NOT NULL DEFAULT 'concept',
                description TEXT,
                degree INTEGER NOT NULL DEFAULT 0,
                created_at INTEGER NOT NULL DEFAULT (unixepoch()),
                updated_at INTEGER NOT NULL DEFAULT (unixepoch()),
                UNIQUE(namespace, name)
            );
            CREATE TABLE memory_entities (
                memory_id INTEGER NOT NULL,
                entity_id INTEGER NOT NULL,
                PRIMARY KEY (memory_id, entity_id)
            );
            CREATE TABLE relationships (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                namespace TEXT NOT NULL DEFAULT 'global',
                source_id INTEGER NOT NULL,
                target_id INTEGER NOT NULL,
                relation TEXT NOT NULL,
                weight REAL NOT NULL DEFAULT 0.5,
                description TEXT,
                UNIQUE(source_id, target_id, relation)
            );",
        )
        .expect("schema creation must succeed");
        conn
    }

    /// A memory with no `memory_entities` row is reported as unbound.
    #[test]
    fn scan_unbound_memories_finds_memories_without_bindings() {
        let conn = open_test_db();
        conn.execute(
            "INSERT INTO memories (namespace, name, body) VALUES ('global', 'test-mem', 'some body content')",
            [],
        )
        .unwrap();

        let results = scan_unbound_memories(&conn, "global", None, &[]).unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].1, "test-mem");
    }

    /// A memory linked to an entity must not be reported as unbound.
    #[test]
    fn scan_unbound_memories_excludes_bound_memories() {
        let conn = open_test_db();
        conn.execute(
            "INSERT INTO memories (namespace, name, body) VALUES ('global', 'bound-mem', 'body')",
            [],
        )
        .unwrap();
        let mem_id: i64 = conn
            .query_row("SELECT id FROM memories WHERE name='bound-mem'", [], |r| {
                r.get(0)
            })
            .unwrap();
        conn.execute(
            "INSERT INTO entities (namespace, name) VALUES ('global', 'some-entity')",
            [],
        )
        .unwrap();
        let ent_id: i64 = conn
            .query_row(
                "SELECT id FROM entities WHERE name='some-entity'",
                [],
                |r| r.get(0),
            )
            .unwrap();
        conn.execute(
            "INSERT INTO memory_entities (memory_id, entity_id) VALUES (?1, ?2)",
            rusqlite::params![mem_id, ent_id],
        )
        .unwrap();

        let results = scan_unbound_memories(&conn, "global", None, &[]).unwrap();
        assert!(results.is_empty(), "bound memory must not appear in scan");
    }

    /// An entity whose description is NULL is picked up by the scan.
    #[test]
    fn scan_entities_without_description_finds_null_description() {
        let conn = open_test_db();
        conn.execute(
            "INSERT INTO entities (namespace, name, type, description) VALUES ('global', 'my-tool', 'tool', NULL)",
            [],
        )
        .unwrap();

        let results = scan_entities_without_description(&conn, "global", None).unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].1, "my-tool");
    }

    /// An entity that already has a description is skipped by the scan.
    #[test]
    fn scan_entities_without_description_excludes_entities_with_description() {
        let conn = open_test_db();
        conn.execute(
            "INSERT INTO entities (namespace, name, type, description) VALUES ('global', 'described-tool', 'tool', 'Has a description already')",
            [],
        )
        .unwrap();

        let results = scan_entities_without_description(&conn, "global", None).unwrap();
        assert!(
            results.is_empty(),
            "entity with description must not appear"
        );
    }

    /// A 2-character body is below the 100 threshold and must be reported.
    #[test]
    fn scan_short_body_memories_finds_short_bodies() {
        let conn = open_test_db();
        conn.execute(
            "INSERT INTO memories (namespace, name, body) VALUES ('global', 'short-mem', 'hi')",
            [],
        )
        .unwrap();

        let results = scan_short_body_memories(&conn, "global", 100, None).unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].1, "short-mem");
    }

    /// A 1000-character body is above the 100 threshold and must be skipped.
    #[test]
    fn scan_short_body_memories_excludes_long_bodies() {
        let conn = open_test_db();
        let long_body = "a".repeat(1000);
        conn.execute(
            "INSERT INTO memories (namespace, name, body) VALUES ('global', 'long-mem', ?1)",
            rusqlite::params![long_body],
        )
        .unwrap();

        let results = scan_short_body_memories(&conn, "global", 100, None).unwrap();
        assert!(results.is_empty(), "long memory must not appear in scan");
    }

    /// With five matching rows and `Some(3)`, exactly three rows come back.
    #[test]
    fn scan_respects_limit() {
        let conn = open_test_db();
        for i in 0..5 {
            conn.execute(
                &format!("INSERT INTO memories (namespace, name, body) VALUES ('global', 'mem-{i}', 'short')"),
                [],
            )
            .unwrap();
        }

        let results = scan_short_body_memories(&conn, "global", 1000, Some(3)).unwrap();
        assert_eq!(results.len(), 3, "limit must be respected");
    }

    /// A freshly opened queue database exposes an empty `queue` table.
    #[test]
    fn queue_db_schema_creates_correctly() {
        // Use the platform temp directory instead of a hard-coded `/tmp` so
        // the test also runs where `/tmp` does not exist or TMPDIR is redirected.
        let tmp_path = std::env::temp_dir()
            .join(format!("test-enrich-queue-{}.sqlite", std::process::id()))
            .to_string_lossy()
            .into_owned();
        let conn = open_queue_db(&tmp_path).expect("queue db must open");
        let count: i64 = conn
            .query_row("SELECT COUNT(*) FROM queue", [], |r| r.get(0))
            .unwrap();
        // Close the handle and delete the file BEFORE asserting, so a failed
        // assertion does not leave the database behind.
        drop(conn);
        let _ = std::fs::remove_file(&tmp_path);
        assert_eq!(count, 0);
    }

    /// A successful result event yields the structured output and its cost.
    #[test]
    fn parse_claude_output_valid_bindings() {
        let output = r#"[
            {"type":"system","subtype":"init"},
            {"type":"result","is_error":false,"total_cost_usd":0.01,
             "structured_output":{"entities":[{"name":"rust-lang","entity_type":"tool"}],"relationships":[]}}
        ]"#;
        let result = crate::commands::claude_runner::parse_claude_output(output)
            .expect("must parse successfully");
        assert!(result.value.get("entities").is_some());
        assert!((result.cost_usd - 0.01).abs() < f64::EPSILON);
        assert!(!result.is_oauth);
    }

    /// `apiKeySource: "none"` in the init event is reported as OAuth.
    #[test]
    fn parse_claude_output_detects_oauth() {
        let output = r#"[
            {"type":"system","subtype":"init","apiKeySource":"none"},
            {"type":"result","is_error":false,"total_cost_usd":0.0,
             "structured_output":{"entities":[],"relationships":[]}}
        ]"#;
        let result = crate::commands::claude_runner::parse_claude_output(output).unwrap();
        assert!(result.is_oauth);
    }

    /// An error result mentioning `rate_limit` maps to `AppError::RateLimited`.
    #[test]
    fn parse_claude_output_rate_limit_returns_error() {
        let output = r#"[
            {"type":"system","subtype":"init"},
            {"type":"result","is_error":true,"error":"rate_limit exceeded"}
        ]"#;
        let err = crate::commands::claude_runner::parse_claude_output(output).unwrap_err();
        assert!(matches!(err, AppError::RateLimited { .. }));
    }

    /// Other error results surface the upstream error text in the message.
    #[test]
    fn parse_claude_output_auth_error() {
        let output = r#"[
            {"type":"system","subtype":"init"},
            {"type":"result","is_error":true,"error":"authentication failed"}
        ]"#;
        let err = crate::commands::claude_runner::parse_claude_output(output).unwrap_err();
        assert!(format!("{err}").contains("authentication failed"));
    }

    #[test]
    fn dry_run_emits_preview_without_calling_llm() {
        // This test validates the dry-run NDJSON contract without spawning any process.
        // The scan_operation function requires a DB; we build one in-memory but cannot
        // call run() directly because it needs AppPaths (disk). Instead we test the
        // lower-level helpers that the dry-run path relies on.
        let conn = open_test_db();
        conn.execute(
            "INSERT INTO memories (namespace, name, body) VALUES ('global', 'dry-mem', 'tiny')",
            [],
        )
        .unwrap();

        let results = scan_short_body_memories(&conn, "global", 1000, None).unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].1, "dry-mem");
        // If scan finds the item and dry_run is set, no LLM would be called.
        // The NDJSON emission is tested via integration tests with a fake binary.
    }

    /// `persist_entity_description` writes the text into `entities.description`.
    #[test]
    fn persist_entity_description_updates_db() {
        let conn = open_test_db();
        conn.execute(
            "INSERT INTO entities (namespace, name, type) VALUES ('global', 'tokio-runtime', 'tool')",
            [],
        )
        .unwrap();
        let eid: i64 = conn
            .query_row(
                "SELECT id FROM entities WHERE name='tokio-runtime'",
                [],
                |r| r.get(0),
            )
            .unwrap();

        persist_entity_description(&conn, eid, "Async runtime for Rust applications").unwrap();

        let desc: String = conn
            .query_row(
                "SELECT description FROM entities WHERE id=?1",
                rusqlite::params![eid],
                |r| r.get(0),
            )
            .unwrap();
        assert_eq!(desc, "Async runtime for Rust applications");
    }

    /// The embedded bindings schema must parse as JSON.
    #[test]
    fn bindings_schema_is_valid_json() {
        let _: serde_json::Value =
            serde_json::from_str(BINDINGS_SCHEMA).expect("BINDINGS_SCHEMA must be valid JSON");
    }

    /// The embedded entity-description schema must parse as JSON.
    #[test]
    fn entity_description_schema_is_valid_json() {
        let _: serde_json::Value = serde_json::from_str(ENTITY_DESCRIPTION_SCHEMA)
            .expect("ENTITY_DESCRIPTION_SCHEMA must be valid JSON");
    }

    /// The embedded body-enrich schema must parse as JSON.
    #[test]
    fn body_enrich_schema_is_valid_json() {
        let _: serde_json::Value = serde_json::from_str(BODY_ENRICH_SCHEMA)
            .expect("BODY_ENRICH_SCHEMA must be valid JSON");
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/export.rs:
1| |//! Handler for the `export` CLI subcommand.
2| |
3| |use crate::cli::MemoryType;
4| |use crate::errors::AppError;
5| |use crate::output;
6| |use crate::paths::AppPaths;
7| |use crate::storage::connection::open_ro;
8| |use serde::Serialize;
9| |
// Command-line arguments of the `export` subcommand, parsed by clap.
//
// NOTE: plain `//` comments are used for additions in this struct on purpose:
// clap's derive turns `///` doc comments into help text, so adding them would
// change the CLI output.
#[derive(clap::Args)]
#[command(after_long_help = "EXAMPLES:\n \
    # Export all memories as NDJSON\n \
    sqlite-graphrag export\n\n \
    # Export only decision memories from a namespace\n \
    sqlite-graphrag export --type decision --namespace my-project\n\n \
    # Export including soft-deleted memories\n \
    sqlite-graphrag export --include-deleted\n\n \
    # Pipe to file for backup\n \
    sqlite-graphrag export > backup.ndjson")]
pub struct ExportArgs {
    /// Namespace (env: SQLITE_GRAPHRAG_NAMESPACE, default: global).
    #[arg(
        long,
        help = "Namespace (env: SQLITE_GRAPHRAG_NAMESPACE, default: global)"
    )]
    pub namespace: Option<String>,
    /// Filter by memory type.
    #[arg(long, value_enum)]
    pub r#type: Option<MemoryType>,
    /// Include soft-deleted memories in the export.
    #[arg(long, default_value_t = false)]
    pub include_deleted: bool,
    /// Maximum number of memories to export (default: 100000).
    #[arg(long, default_value_t = 100_000)]
    pub limit: usize,
    /// Offset for pagination.
    #[arg(long, default_value_t = 0)]
    pub offset: usize,
    // Hidden flag kept as a no-op; the handler never reads it.
    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
    pub json: bool,
    /// Path to graphrag.sqlite (overrides SQLITE_GRAPHRAG_DB_PATH and default CWD).
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,
}
45| |
/// One NDJSON line of the export: a single memory row.
///
/// Field order is the key order of the emitted JSON object.
#[derive(Serialize)]
struct ExportMemoryLine {
    /// Memory name (`memories.name`).
    name: String,
    /// Memory type, serialized under the key `type`.
    r#type: String,
    /// Same value as `type`, emitted under a second key (`memory_type`).
    memory_type: String,
    /// Memory description (`memories.description`).
    description: String,
    /// Memory body (`memories.body`).
    body: String,
    /// Namespace the memory belongs to.
    namespace: String,
    /// `created_at` epoch rendered by `crate::tz::epoch_to_iso`.
    created_at_iso: String,
    /// `updated_at` epoch rendered by `crate::tz::epoch_to_iso`.
    updated_at_iso: String,
    /// `deleted_at` rendered as a timestamp string; the key is omitted when
    /// the memory is not soft-deleted.
    #[serde(skip_serializing_if = "Option::is_none")]
    deleted_at_iso: Option<String>,
}
59| |
/// Trailing NDJSON line emitted once after all memory lines.
#[derive(Serialize)]
struct ExportSummary {
    /// Marker distinguishing the summary from memory lines; `run` sets it to `true`.
    summary: bool,
    /// Number of memory lines emitted before this summary.
    exported: usize,
    /// Resolved namespace the export ran against.
    namespace: String,
    /// Wall-clock time from handler start to summary construction, in milliseconds.
    elapsed_ms: u64,
}
67| |
68| |/// Exports memories as NDJSON (one JSON line per memory, followed by a summary line).
69| 0|pub fn run(args: ExportArgs) -> Result<(), AppError> {
70| 0| let start = std::time::Instant::now();
71| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
72| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
73| 0| crate::storage::connection::ensure_db_ready(&paths)?;
74| 0| let conn = open_ro(&paths.db)?;
75| |
76| 0| let deleted_filter = if args.include_deleted {
77| 0| ""
78| | } else {
79| 0| "AND m.deleted_at IS NULL"
80| | };
81| |
82| 0| let limit_i64 = args.limit as i64;
83| 0| let offset_i64 = args.offset as i64;
84| 0| let type_str: Option<String> = args.r#type.map(|t| t.as_str().to_string());
85| |
86| 0| let rows = fetch_rows(
87| 0| &conn,
88| 0| &namespace,
89| 0| &type_str,
90| 0| deleted_filter,
91| 0| limit_i64,
92| 0| offset_i64,
93| 0| )?;
94| |
95| 0| let exported = rows.len();
96| 0| for line in &rows {
97| 0| output::emit_json_compact(line)?;
98| | }
99| |
100| 0| output::emit_json_compact(&ExportSummary {
101| 0| summary: true,
102| 0| exported,
103| 0| namespace: namespace.clone(),
104| 0| elapsed_ms: start.elapsed().as_millis() as u64,
105| 0| })?;
106| |
107| 0| Ok(())
108| 0|}
109| |
110| 0|fn fetch_rows(
111| 0| conn: &rusqlite::Connection,
112| 0| namespace: &str,
113| 0| type_str: &Option<String>,
114| 0| deleted_filter: &str,
115| 0| limit: i64,
116| 0| offset: i64,
117| 0|) -> Result<Vec<ExportMemoryLine>, AppError> {
118| 0| let rows = if let Some(t) = type_str {
119| 0| let sql = format!(
120| 0| "SELECT m.name, m.type, m.description, m.body, m.namespace, \
121| 0| m.created_at, m.updated_at, m.deleted_at \
122| 0| FROM memories m \
123| 0| WHERE m.namespace = ?1 {deleted_filter} AND m.type = ?2 \
124| 0| ORDER BY m.name \
125| 0| LIMIT ?3 OFFSET ?4"
126| | );
127| 0| let mut stmt = conn.prepare(&sql)?;
128| 0| let result = stmt
129| 0| .query_map(rusqlite::params![namespace, t, limit, offset], map_row)?
130| 0| .collect::<Result<Vec<_>, _>>()?;
131| 0| result
132| | } else {
133| 0| let sql = format!(
134| 0| "SELECT m.name, m.type, m.description, m.body, m.namespace, \
135| 0| m.created_at, m.updated_at, m.deleted_at \
136| 0| FROM memories m \
137| 0| WHERE m.namespace = ?1 {deleted_filter} \
138| 0| ORDER BY m.name \
139| 0| LIMIT ?2 OFFSET ?3"
140| | );
141| 0| let mut stmt = conn.prepare(&sql)?;
142| 0| let result = stmt
143| 0| .query_map(rusqlite::params![namespace, limit, offset], map_row)?
144| 0| .collect::<Result<Vec<_>, _>>()?;
145| 0| result
146| | };
147| 0| Ok(rows)
148| 0|}
149| |
150| 0|fn map_row(row: &rusqlite::Row<'_>) -> rusqlite::Result<ExportMemoryLine> {
151| 0| let memory_type_val: String = row.get(1)?;
152| | Ok(ExportMemoryLine {
153| 0| name: row.get(0)?,
154| 0| r#type: memory_type_val.clone(),
155| 0| memory_type: memory_type_val,
156| 0| description: row.get(2)?,
157| 0| body: row.get(3)?,
158| 0| namespace: row.get(4)?,
159| 0| created_at_iso: crate::tz::epoch_to_iso(row.get::<_, i64>(5)?),
160| 0| updated_at_iso: crate::tz::epoch_to_iso(row.get::<_, i64>(6)?),
161| 0| deleted_at_iso: row.get::<_, Option<i64>>(7)?.map(crate::tz::epoch_to_iso),
162| | })
163| 0|}
164| |
#[cfg(test)]
mod tests {
    use super::*;

    /// The serialized line must expose the memory type under both the
    /// `type` and the `memory_type` keys.
    #[test]
    fn export_line_emits_both_type_and_memory_type() {
        let stamp = String::from("2025-01-01T00:00:00Z");
        let kind = String::from("document");
        let line = ExportMemoryLine {
            name: String::from("test"),
            r#type: kind.clone(),
            memory_type: kind,
            description: String::from("desc"),
            body: String::from("body"),
            namespace: String::from("global"),
            created_at_iso: stamp.clone(),
            updated_at_iso: stamp,
            deleted_at_iso: None,
        };

        let json = serde_json::to_value(&line).unwrap();
        assert_eq!(json["type"], "document");
        assert_eq!(json["memory_type"], "document");
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/forget.rs:
1| |//! Handler for the `forget` CLI subcommand.
2| |
3| |use crate::errors::AppError;
4| |use crate::i18n::errors_msg;
5| |use crate::output;
6| |use crate::paths::AppPaths;
7| |use crate::storage::connection::open_rw;
8| |use crate::storage::memories;
9| |use rusqlite::{params, OptionalExtension};
10| |use serde::Serialize;
11| |
// Command-line arguments of the `forget` subcommand, parsed by clap.
//
// The memory name may be given positionally or via `--name`; clap rejects
// passing both (`conflicts_with`). Neither is required at parse time, so the
// handler itself reports a validation error when both are absent.
//
// NOTE: additions in this struct use `//` on purpose: clap's derive turns
// `///` doc comments into help text.
#[derive(clap::Args)]
#[command(after_long_help = "EXAMPLES:\n \
    # Soft-delete a memory by name (positional form)\n \
    sqlite-graphrag forget onboarding\n\n \
    # Soft-delete using the named flag form\n \
    sqlite-graphrag forget --name onboarding\n\n \
    # Soft-delete from a specific namespace\n \
    sqlite-graphrag forget onboarding --namespace my-project")]
pub struct ForgetArgs {
    /// Memory name as a positional argument. Alternative to `--name`.
    #[arg(
        value_name = "NAME",
        conflicts_with = "name",
        help = "Memory name to soft-delete; alternative to --name"
    )]
    pub name_positional: Option<String>,
    /// Memory name to soft-delete. The row is preserved with `deleted_at` set, recoverable via `restore`.
    /// Use `purge` to permanently remove soft-deleted memories.
    #[arg(long)]
    pub name: Option<String>,
    // Namespace override; resolved through `crate::namespace::resolve_namespace`.
    #[arg(
        long,
        help = "Namespace (env: SQLITE_GRAPHRAG_NAMESPACE, default: global)"
    )]
    pub namespace: Option<String>,
    // Hidden flag kept as a no-op; the handler never reads it.
    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
    pub json: bool,
    // Database path; falls back to the SQLITE_GRAPHRAG_DB_PATH environment variable.
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,
}
42| |
/// JSON payload emitted by the `forget` handler.
///
/// Field order is the key order of the emitted JSON object.
#[derive(Serialize)]
struct ForgetResponse {
    /// Outcome of the forget operation: `soft_deleted`, `already_deleted`, or `not_found`.
    action: String,
    /// True only when this invocation actively transitioned the memory from live to soft-deleted.
    forgotten: bool,
    /// Name of the memory the request targeted.
    name: String,
    /// Resolved namespace the request ran against.
    namespace: String,
    /// Unix epoch seconds when the memory was soft-deleted; `None` when `action="not_found"`.
    /// The key is omitted from the JSON when the value is `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    deleted_at: Option<i64>,
    /// RFC 3339 UTC timestamp parallel to `deleted_at` for ISO 8601 parsers.
    /// The key is omitted from the JSON when the value is `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    deleted_at_iso: Option<String>,
    /// Total execution time in milliseconds from handler start to serialisation.
    elapsed_ms: u64,
}
60| |
61| 0|pub fn run(args: ForgetArgs) -> Result<(), AppError> {
62| 0| let start = std::time::Instant::now();
63| 0| tracing::debug!(target: "forget", name = ?args.name_positional.as_deref().or(args.name.as_deref()), "soft-deleting memory");
64| | // Resolve name from positional or --name flag; both are optional, at least one is required.
65| 0| let name = args.name_positional.or(args.name).ok_or_else(|| {
66| 0| AppError::Validation("name required: pass as positional argument or via --name".to_string())
67| 0| })?;
68| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
69| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
70| 0| crate::storage::connection::ensure_db_ready(&paths)?;
71| |
72| 0| let conn = open_rw(&paths.db)?;
73| |
74| | // Probe state without filtering on `deleted_at` so we can distinguish
75| | // `not_found` (no row) from `already_deleted` (row with deleted_at set)
76| | // from the live case (deleted_at IS NULL) handled by `soft_delete`.
77| 0| let probe: Option<(i64, Option<i64>)> = conn
78| 0| .query_row(
79| 0| "SELECT id, deleted_at FROM memories WHERE namespace = ?1 AND name = ?2",
80| 0| params![namespace, name],
81| 0| |r| Ok((r.get::<_, i64>(0)?, r.get::<_, Option<i64>>(1)?)),
82| | )
83| 0| .optional()?;
84| |
85| 0| let (action, forgotten, deleted_at, memory_id) = match probe {
86| 0| None => ("not_found", false, None, None),
87| 0| Some((id, Some(existing))) => ("already_deleted", false, Some(existing), Some(id)),
88| 0| Some((id, None)) => {
89| | // G39 Passo 4 (v1.0.69): remove the embedding vector BEFORE the
90| | // soft-delete so we do not leave a `vec_memories` row that will
91| | // show up as `vec_memories_orphaned` in `health --json`. The
92| | // operation is best-effort: a failure is logged but does not
93| | // abort the soft-delete (the user-visible action is the same).
94| 0| if let Err(e) = memories::delete_vec(&conn, id) {
95| 0| tracing::warn!(
96| | target: "forget",
97| | memory_id = id,
98| | error = %e,
99| 0| "vec cleanup before soft-delete failed — orphan vector may be left",
100| | );
101| 0| }
102| 0| let ok = memories::soft_delete(&conn, &namespace, &name)?;
103| 0| if !ok {
104| | // Race: row was concurrently soft-deleted between probe and update.
105| | // Re-read to get the current `deleted_at`.
106| 0| let current: Option<i64> = conn
107| 0| .query_row(
108| 0| "SELECT deleted_at FROM memories WHERE id = ?1",
109| 0| params![id],
110| 0| |r| r.get::<_, Option<i64>>(0),
111| | )
112| 0| .optional()?
113| 0| .flatten();
114| 0| ("already_deleted", false, current, Some(id))
115| | } else {
116| 0| let ts: Option<i64> = conn
117| 0| .query_row(
118| 0| "SELECT deleted_at FROM memories WHERE id = ?1",
119| 0| params![id],
120| 0| |r| r.get::<_, Option<i64>>(0),
121| | )
122| 0| .optional()?
123| 0| .flatten();
124| 0| ("soft_deleted", true, ts, Some(id))
125| | }
126| | }
127| | };
128| |
129| 0| if forgotten {
130| 0| if let Some(id) = memory_id {
131| | // FTS5 external-content: manual `DELETE FROM fts_memories WHERE rowid=?`
132| | // corrupts the index. The correct cleanup happens via the `trg_fts_ad` trigger
133| | // when `purge` physically removes the row from `memories`. Between soft-delete
134| | // and purge, FTS queries filter `m.deleted_at IS NULL` in the JOIN.
135| 0| if let Err(e) = memories::delete_vec(&conn, id) {
136| 0| tracing::warn!(target: "forget", memory_id = id, error = %e, "vec cleanup failed — orphan vector left");
137| 0| }
138| 0| }
139| 0| }
140| |
141| 0| conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
142| |
143| 0| if action == "not_found" {
144| 0| return Err(AppError::NotFound(errors_msg::memory_not_found(
145| 0| &name, &namespace,
146| 0| )));
147| 0| }
148| |
149| 0| let deleted_at_iso = deleted_at.map(crate::tz::epoch_to_iso);
150| 0| let response = ForgetResponse {
151| 0| action: action.to_string(),
152| 0| forgotten,
153| 0| name: name.clone(),
154| 0| namespace: namespace.clone(),
155| 0| deleted_at,
156| 0| deleted_at_iso,
157| 0| elapsed_ms: start.elapsed().as_millis() as u64,
158| 0| };
159| 0| output::emit_json(&response)?;
160| |
161| 0| Ok(())
162| 0|}
163| |
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a response from borrowed strings to keep the tests compact.
    fn response(
        action: &str,
        forgotten: bool,
        name: &str,
        namespace: &str,
        deleted_at: Option<i64>,
        deleted_at_iso: Option<String>,
        elapsed_ms: u64,
    ) -> ForgetResponse {
        ForgetResponse {
            action: action.to_string(),
            forgotten,
            name: name.to_string(),
            namespace: namespace.to_string(),
            deleted_at,
            deleted_at_iso,
            elapsed_ms,
        }
    }

    /// Every field of a `soft_deleted` response appears in the JSON.
    #[test]
    fn forget_response_serializes_basic_fields() {
        let resp = response(
            "soft_deleted",
            true,
            "my-memory",
            "global",
            Some(1_700_000_000),
            Some("2023-11-14T22:13:20+00:00".to_string()),
            5,
        );
        let json = serde_json::to_value(&resp).expect("serialization failed");
        assert_eq!(json["action"], "soft_deleted");
        assert_eq!(json["forgotten"], true);
        assert_eq!(json["name"], "my-memory");
        assert_eq!(json["namespace"], "global");
        assert_eq!(json["deleted_at"], 1_700_000_000i64);
        assert!(json["deleted_at_iso"].is_string());
        assert!(json["elapsed_ms"].is_number());
    }

    /// A `soft_deleted` response carries `forgotten = true` and a timestamp.
    #[test]
    fn forget_response_action_soft_deleted_implies_forgotten_true() {
        let resp = response(
            "soft_deleted",
            true,
            "test",
            "ns",
            Some(42),
            Some(crate::tz::epoch_to_iso(42)),
            1,
        );
        assert_eq!(resp.action, "soft_deleted");
        assert!(resp.forgotten);
        assert_eq!(resp.deleted_at, Some(42));
        assert!(resp.deleted_at_iso.is_some());
    }

    /// An `already_deleted` response keeps the original deletion timestamp.
    #[test]
    fn forget_response_already_deleted_preserves_timestamp() {
        let when = 1_650_000_000;
        let resp = response(
            "already_deleted",
            false,
            "abc",
            "my-project",
            Some(when),
            Some(crate::tz::epoch_to_iso(when)),
            2,
        );
        let json = serde_json::to_value(&resp).expect("serialization failed");
        assert_eq!(json["action"], "already_deleted");
        assert_eq!(json["forgotten"], false);
        assert_eq!(json["deleted_at"], 1_650_000_000i64);
        assert!(json["deleted_at_iso"].is_string());
    }

    /// With no timestamps, both `deleted_at*` keys are absent from the JSON.
    #[test]
    fn forget_response_not_found_omits_deleted_at_fields() {
        let resp = response("not_found", false, "phantom", "global", None, None, 0);
        let json = serde_json::to_value(&resp).expect("serialization failed");
        assert_eq!(json["action"], "not_found");
        assert_eq!(json["forgotten"], false);
        // skip_serializing_if = "Option::is_none" means both fields are absent
        assert!(json.get("deleted_at").is_none());
        assert!(json.get("deleted_at_iso").is_none());
        assert_eq!(json["elapsed_ms"], 0u64);
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/fts.rs:
1| |//! Handler for the `fts` CLI subcommand family.
2| |//!
//! Provides three maintenance operations for the FTS5 full-text search index:
//! - `rebuild`: drops and reconstructs the index from the `memories` table.
//! - `check`: runs the FTS5 integrity-check without modifying the index.
//! - `stats`: shows index statistics (row count, shadow pages, functional status).
6| |
7| |use crate::errors::AppError;
8| |use crate::output;
9| |use crate::paths::AppPaths;
10| |use crate::storage::connection::{open_ro, open_rw};
11| |use serde::Serialize;
12| |
/// Arguments for the `fts` subcommand family.
// The struct only carries the nested subcommand; the `about` and
// `after_long_help` strings below are what clap prints for `fts --help`.
#[derive(clap::Args)]
#[command(
    about = "FTS5 full-text search index management",
    after_long_help = "EXAMPLES:\n \
    # Rebuild the full-text search index from memories table\n \
    sqlite-graphrag fts rebuild\n\n \
    # Check FTS5 index integrity\n \
    sqlite-graphrag fts check --json\n\n \
    # Show FTS5 index statistics\n \
    sqlite-graphrag fts stats --json"
)]
pub struct FtsArgs {
    // Selected operation: one of the `FtsSubcommand` variants.
    #[command(subcommand)]
    pub command: FtsSubcommand,
}
29| |
30| |/// Subcommands nested under `fts`.
31| |#[derive(clap::Subcommand)]
32| |pub enum FtsSubcommand {
33| | /// Rebuild the FTS5 index from the memories table.
34| | #[command(after_long_help = "EXAMPLES:\n \
35| | # Rebuild the full-text search index\n \
36| | sqlite-graphrag fts rebuild\n\n \
37| | # Rebuild with custom database path\n \
38| | sqlite-graphrag fts rebuild --db /path/to/graphrag.sqlite")]
39| | Rebuild(FtsRebuildArgs),
40| | /// Run FTS5 integrity-check without modifying the index.
41| | #[command(after_long_help = "EXAMPLES:\n \
42| | # Check FTS5 index integrity\n \
43| | sqlite-graphrag fts check\n\n \
44| | # Check with custom database path\n \
45| | sqlite-graphrag fts check --db /path/to/graphrag.sqlite")]
46| | Check(FtsCheckArgs),
47| | /// Show FTS5 index statistics (row count, shadow pages, functional status).
48| | #[command(after_long_help = "EXAMPLES:\n \
49| | # Show FTS5 index statistics\n \
50| | sqlite-graphrag fts stats\n\n \
51| | # Stats with custom database path\n \
52| | sqlite-graphrag fts stats --db /path/to/graphrag.sqlite")]
53| | Stats(FtsStatsArgs),
54| |}
55| |
56| |/// Arguments for `fts rebuild`.
57| |#[derive(clap::Args)]
58| |pub struct FtsRebuildArgs {
59| | /// No-op; JSON is always emitted on stdout.
60| | #[arg(long, hide = true)]
61| | pub json: bool,
62| | /// Path to the SQLite database file.
63| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
64| | pub db: Option<String>,
65| |}
66| |
67| |/// Arguments for `fts check`.
68| |#[derive(clap::Args)]
69| |pub struct FtsCheckArgs {
70| | /// No-op; JSON is always emitted on stdout.
71| | #[arg(long, hide = true)]
72| | pub json: bool,
73| | /// Path to the SQLite database file.
74| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
75| | pub db: Option<String>,
76| |}
77| |
78| |/// Arguments for `fts stats`.
79| |#[derive(clap::Args)]
80| |pub struct FtsStatsArgs {
81| | /// No-op; JSON is always emitted on stdout.
82| | #[arg(long, hide = true)]
83| | pub json: bool,
84| | /// Path to the SQLite database file.
85| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
86| | pub db: Option<String>,
87| |}
88| |
/// JSON payload emitted by `fts rebuild`.
#[derive(Serialize)]
struct FtsRebuildResponse {
    /// Outcome label; `run_rebuild` sets it to `"rebuilt"`.
    action: String,
    /// `COUNT(*)` of `fts_memories` measured right after the rebuild.
    rows_indexed: i64,
    /// Wall-clock duration of the command in milliseconds.
    elapsed_ms: u64,
}
95| |
/// JSON payload emitted by `fts check`.
#[derive(Serialize)]
struct FtsCheckResponse {
    /// Outcome label; `run_check` sets it to `"checked"`.
    action: String,
    /// `true` when the FTS5 integrity-check statement executed without error.
    integrity_ok: bool,
    /// Remediation hint, present only when the check failed; omitted from the JSON otherwise.
    #[serde(skip_serializing_if = "Option::is_none")]
    detail: Option<String>,
    /// Wall-clock duration of the command in milliseconds.
    elapsed_ms: u64,
}
104| |
/// JSON payload emitted by `fts stats`.
#[derive(Serialize)]
struct FtsStatsResponse {
    /// `COUNT(*)` of the `fts_memories` table.
    total_rows: i64,
    /// `COUNT(*)` of the `fts_memories_data` shadow table; omitted from the JSON
    /// when that query fails.
    #[serde(skip_serializing_if = "Option::is_none")]
    shadow_pages: Option<i64>,
    /// `true` when the liveness probe query against `fts_memories` succeeded.
    fts_functional: bool,
    /// Wall-clock duration of the command in milliseconds.
    elapsed_ms: u64,
}
113| |
114| |/// Dispatch entry point called from `main`.
115| |///
116| |/// # Errors
117| |/// Propagates any [`AppError`] raised by the underlying subcommand.
118| 0|pub fn run(args: FtsArgs) -> Result<(), AppError> {
119| 0| match args.command {
120| 0| FtsSubcommand::Rebuild(a) => run_rebuild(a),
121| 0| FtsSubcommand::Check(a) => run_check(a),
122| 0| FtsSubcommand::Stats(a) => run_stats(a),
123| | }
124| 0|}
125| |
126| |/// Rebuilds the FTS5 index by issuing the `'rebuild'` special command.
127| |///
128| |/// The FTS5 `INSERT INTO fts_memories(fts_memories) VALUES('rebuild')` statement
129| |/// drops all index data and re-populates it from the content table in a single
130| |/// transaction. Use this after bulk imports or when `fts check` reports a failure.
131| |///
132| |/// # Errors
133| |/// Returns [`AppError::Database`] on any SQLite failure.
134| 0|fn run_rebuild(args: FtsRebuildArgs) -> Result<(), AppError> {
135| 0| let start = std::time::Instant::now();
136| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
137| 0| crate::storage::connection::ensure_db_ready(&paths)?;
138| 0| let conn = open_rw(&paths.db)?;
139| |
140| 0| let table_exists: bool = conn.query_row(
141| 0| "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='fts_memories'",
142| 0| [],
143| 0| |r| r.get::<_, i64>(0).map(|v| v > 0),
144| 0| )?;
145| 0| if !table_exists {
146| 0| return Err(AppError::Validation(
147| 0| "FTS5 table 'fts_memories' does not exist — run 'sqlite-graphrag init' first"
148| 0| .to_string(),
149| 0| ));
150| 0| }
151| |
152| 0| conn.execute_batch("INSERT INTO fts_memories(fts_memories) VALUES('rebuild');")?;
153| |
154| 0| let rows: i64 = conn.query_row("SELECT COUNT(*) FROM fts_memories", [], |r| r.get(0))?;
155| |
156| 0| conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
157| |
158| 0| output::emit_json(&FtsRebuildResponse {
159| 0| action: "rebuilt".to_string(),
160| 0| rows_indexed: rows,
161| 0| elapsed_ms: start.elapsed().as_millis() as u64,
162| 0| })?;
163| |
164| 0| Ok(())
165| 0|}
166| |
167| |/// Runs the FTS5 integrity-check without modifying the index.
168| |///
169| |/// The FTS5 integrity-check is triggered by:
170| |/// ```sql
171| |/// INSERT INTO fts_memories(fts_memories, rank) VALUES('integrity-check', 1);
172| |/// ```
173| |/// SQLite raises an error if the index is corrupt, so a successful `execute_batch`
174| |/// means the index is healthy. On failure, `integrity_ok` is `false` and the
175| |/// `detail` field carries an actionable hint.
176| |///
177| |/// # Errors
178| |/// Returns [`AppError`] only on unexpected I/O or path resolution failures;
179| |/// an FTS5 corruption is reported as `integrity_ok: false`, not as a Rust error.
180| 0|fn run_check(args: FtsCheckArgs) -> Result<(), AppError> {
181| 0| let start = std::time::Instant::now();
182| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
183| 0| crate::storage::connection::ensure_db_ready(&paths)?;
184| 0| let conn = open_rw(&paths.db)?;
185| |
186| 0| let integrity_ok = conn
187| 0| .execute_batch("INSERT INTO fts_memories(fts_memories, rank) VALUES('integrity-check', 1);")
188| 0| .is_ok();
189| |
190| 0| conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);").ok();
191| |
192| 0| output::emit_json(&FtsCheckResponse {
193| 0| action: "checked".to_string(),
194| 0| integrity_ok,
195| 0| detail: if integrity_ok {
196| 0| None
197| | } else {
198| 0| Some("FTS5 integrity-check failed — run 'sqlite-graphrag fts rebuild'".to_string())
199| | },
200| 0| elapsed_ms: start.elapsed().as_millis() as u64,
201| 0| })?;
202| |
203| 0| Ok(())
204| 0|}
205| |
206| |/// Returns FTS5 index statistics: total indexed rows, shadow table page count (best-effort),
207| |/// and a functional liveness check.
208| |///
209| |/// # Errors
210| |/// Returns [`AppError`] only on unexpected I/O or path resolution failures.
211| 0|fn run_stats(args: FtsStatsArgs) -> Result<(), AppError> {
212| 0| let start = std::time::Instant::now();
213| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
214| 0| crate::storage::connection::ensure_db_ready(&paths)?;
215| 0| let conn = open_ro(&paths.db)?;
216| |
217| | // 1. Total indexed rows in the FTS5 content table.
218| 0| let total_rows: i64 = conn.query_row("SELECT COUNT(*) FROM fts_memories", [], |r| r.get(0))?;
219| |
220| | // 2. Shadow pages — queries the internal `_data` shadow table.
221| | // This may not exist on all SQLite builds; treat any failure as None.
222| 0| let shadow_pages: Option<i64> = conn
223| 0| .query_row("SELECT COUNT(*) FROM fts_memories_data", [], |r| r.get(0))
224| 0| .ok();
225| |
226| | // 3. Functional liveness: SELECT with FTS5 match syntax against a wildcard.
227| | // A successful LIMIT 0 query confirms the FTS5 module is operational.
228| 0| let fts_functional = conn
229| 0| .execute_batch("SELECT * FROM fts_memories('*') LIMIT 0;")
230| 0| .is_ok();
231| |
232| 0| output::emit_json(&FtsStatsResponse {
233| 0| total_rows,
234| 0| shadow_pages,
235| 0| fts_functional,
236| 0| elapsed_ms: start.elapsed().as_millis() as u64,
237| 0| })?;
238| |
239| 0| Ok(())
240| 0|}
241| |
242| |/// Public helper: returns `true` when the FTS5 module is loadable AND the
243| |/// `fts_memories` virtual table exists AND a wildcard MATCH query succeeds.
244| |///
245| |/// Used by [`crate::commands::optimize`] to skip the (potentially minute-long)
246| |/// FTS5 rebuild when the index is already healthy. Also used by `health` and
247| |/// by future `vec check` implementations.
248| |///
249| |/// # Errors
250| |/// Returns `Err(AppError::Database)` only when the connection cannot be opened
251| |/// for reasons unrelated to FTS5 itself (permission denied, corrupted file).
252| |/// A missing FTS5 module or table is reported as `Ok(false)`.
253| 1|pub fn check_fts_functional(conn: &rusqlite::Connection) -> Result<bool, AppError> {
254| 1| let table_exists: bool = conn
255| 1| .query_row(
256| 1| "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='fts_memories'",
257| 1| [],
258| 1| |r| r.get::<_, i64>(0).map(|v| v > 0),
259| | )
260| 1| .unwrap_or(false);
261| 1| if !table_exists {
262| 0| return Ok(false);
263| 1| }
264| 1| let liveness = conn
265| 1| .execute_batch("SELECT * FROM fts_memories('*') LIMIT 0;")
266| 1| .is_ok();
267| 1| Ok(liveness)
268| 1|}
269| |
#[cfg(test)]
mod tests {
    use super::*;

    /// Serializes a response struct into a JSON value, panicking on failure.
    fn as_json<T: serde::Serialize>(value: &T) -> serde_json::Value {
        serde_json::to_value(value).expect("serialization failed")
    }

    /// Builds a rebuild response with the given row count and duration.
    fn rebuild_response(rows_indexed: i64, elapsed_ms: u64) -> FtsRebuildResponse {
        FtsRebuildResponse {
            action: String::from("rebuilt"),
            rows_indexed,
            elapsed_ms,
        }
    }

    /// Builds a check response with the given verdict, detail and duration.
    fn check_response(
        integrity_ok: bool,
        detail: Option<String>,
        elapsed_ms: u64,
    ) -> FtsCheckResponse {
        FtsCheckResponse {
            action: String::from("checked"),
            integrity_ok,
            detail,
            elapsed_ms,
        }
    }

    #[test]
    fn fts_rebuild_response_serializes_all_fields() {
        let json = as_json(&rebuild_response(42, 10));
        assert_eq!(json["action"], "rebuilt");
        assert_eq!(json["rows_indexed"], 42i64);
        assert_eq!(json["elapsed_ms"], 10u64);
    }

    #[test]
    fn fts_check_response_integrity_ok_omits_detail() {
        let json = as_json(&check_response(true, None, 5));
        assert_eq!(json["action"], "checked");
        assert_eq!(json["integrity_ok"], true);
        assert!(
            json.get("detail").is_none(),
            "detail must be absent when integrity_ok is true"
        );
        assert_eq!(json["elapsed_ms"], 5u64);
    }

    #[test]
    fn fts_check_response_corruption_includes_detail() {
        let hint = "FTS5 integrity-check failed — run 'sqlite-graphrag fts rebuild'".to_string();
        let json = as_json(&check_response(false, Some(hint), 3));
        assert_eq!(json["integrity_ok"], false);
        assert!(
            json["detail"].as_str().unwrap().contains("fts rebuild"),
            "detail must mention the remediation command"
        );
    }

    #[test]
    fn fts_rebuild_response_elapsed_ms_non_negative() {
        let json = as_json(&rebuild_response(0, 0));
        assert!(json["elapsed_ms"].as_u64().is_some());
    }

    #[test]
    fn fts_check_response_elapsed_ms_non_negative() {
        let json = as_json(&check_response(true, None, 0));
        assert!(json["elapsed_ms"].as_u64().is_some());
    }

    #[test]
    fn fts_stats_response_serializes_all_fields() {
        let json = as_json(&FtsStatsResponse {
            total_rows: 150,
            shadow_pages: Some(12),
            fts_functional: true,
            elapsed_ms: 8,
        });
        assert_eq!(json["total_rows"], 150i64);
        assert_eq!(json["shadow_pages"], 12i64);
        assert_eq!(json["fts_functional"], true);
        assert_eq!(json["elapsed_ms"], 8u64);
    }

    #[test]
    fn fts_stats_response_omits_shadow_pages_when_none() {
        let json = as_json(&FtsStatsResponse {
            total_rows: 0,
            shadow_pages: None,
            fts_functional: false,
            elapsed_ms: 2,
        });
        assert!(
            json.get("shadow_pages").is_none(),
            "shadow_pages must be absent when None"
        );
        assert_eq!(json["fts_functional"], false);
    }

    #[test]
    fn fts_stats_response_fts_not_functional() {
        let json = as_json(&FtsStatsResponse {
            total_rows: 5,
            shadow_pages: None,
            fts_functional: false,
            elapsed_ms: 1,
        });
        assert_eq!(json["fts_functional"], false);
        assert_eq!(json["total_rows"], 5i64);
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/graph_export.rs:
1| |//! Handler for the `graph-export` CLI subcommand.
2| |
3| |use crate::cli::GraphExportFormat;
4| |use crate::entity_type::EntityType;
5| |use crate::errors::AppError;
6| |use crate::output;
7| |use crate::paths::AppPaths;
8| |use crate::storage::connection::open_ro;
9| |use crate::storage::entities;
10| |use serde::Serialize;
11| |use std::collections::HashMap;
12| |use std::fs;
13| |use std::path::PathBuf;
14| |use std::time::Instant;
15| |
16| |/// Optional nested subcommands. When absent, the default behavior exports
17| |/// the full entity snapshot for backward compatibility.
18| |#[derive(clap::Subcommand)]
19| |pub enum GraphSubcommand {
20| | /// Traverse relationships from a starting entity using BFS
21| | Traverse(GraphTraverseArgs),
22| | /// Show graph statistics (node/edge counts, degree distribution)
23| | Stats(GraphStatsArgs),
24| | /// List entities stored in the graph with optional filters
25| | Entities(GraphEntitiesArgs),
26| |}
27| |
28| |#[derive(clap::ValueEnum, Clone, Copy, Debug, PartialEq, Eq)]
29| |pub enum GraphTraverseFormat {
30| | Json,
31| |}
32| |
33| |#[derive(clap::ValueEnum, Clone, Copy, Debug, PartialEq, Eq)]
34| |pub enum GraphStatsFormat {
35| | Json,
36| | Text,
37| |}
38| |
39| |#[derive(clap::Args)]
40| |#[command(after_long_help = "EXAMPLES:\n \
41| | # Export full entity snapshot as JSON (default)\n \
42| | sqlite-graphrag graph\n\n \
43| | # Traverse relationships from a starting entity\n \
44| | sqlite-graphrag graph traverse --from acme-corp --depth 2\n\n \
45| | # Show graph statistics as structured JSON\n \
46| | sqlite-graphrag graph stats --format json\n\n \
47| | # List entities filtered by type\n \
48| | sqlite-graphrag graph entities --entity-type person\n\n \
49| | # Export full snapshot in DOT format for Graphviz\n \
50| | sqlite-graphrag graph --format dot --output graph.dot\n\n \
51| |NOTES:\n \
52| | Without a subcommand, exports the full entity+edge snapshot.\n \
53| | Use `traverse`, `stats`, or `entities` for targeted queries.")]
54| |pub struct GraphArgs {
55| | /// Optional subcommand; without one, export the full entity snapshot.
56| | #[command(subcommand)]
57| | pub subcommand: Option<GraphSubcommand>,
58| | /// Filter by namespace. Defaults to all namespaces.
59| | #[arg(long)]
60| | pub namespace: Option<String>,
61| | /// Snapshot output format.
62| | #[arg(long, value_enum, default_value = "json")]
63| | pub format: GraphExportFormat,
64| | /// File path to write output instead of stdout.
65| | #[arg(long)]
66| | pub output: Option<PathBuf>,
67| | #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
68| | pub json: bool,
69| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
70| | pub db: Option<String>,
71| |}
72| |
73| |#[derive(clap::Args)]
74| |#[command(after_long_help = "EXAMPLES:\n \
75| | # Traverse relationships from an entity with default depth (2)\n \
76| | sqlite-graphrag graph traverse --from acme-corp\n\n \
77| | # Increase traversal depth to 3 hops\n \
78| | sqlite-graphrag graph traverse --from acme-corp --depth 3\n\n \
79| | # Traverse within a specific namespace\n \
80| | sqlite-graphrag graph traverse --from acme-corp --namespace project-x\n\n \
81| |NOTES:\n \
82| | Output is always JSON. The `hops` array contains each reachable entity\n \
83| | with its relation, direction (inbound/outbound), weight, and depth level.")]
84| |pub struct GraphTraverseArgs {
85| | /// Root entity name for the traversal.
86| | #[arg(long)]
87| | pub from: String,
88| | /// Maximum traversal depth.
89| | #[arg(long, default_value_t = 2u32)]
90| | pub depth: u32,
91| | #[arg(long)]
92| | pub namespace: Option<String>,
93| | #[arg(long, value_enum, default_value = "json")]
94| | pub format: GraphTraverseFormat,
95| | #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
96| | pub json: bool,
97| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
98| | pub db: Option<String>,
99| |}
100| |
101| |#[derive(clap::Args)]
102| |#[command(after_long_help = "EXAMPLES:\n \
103| | # Show stats for all namespaces (human-readable text)\n \
104| | sqlite-graphrag graph stats --format text\n\n \
105| | # Show stats as structured JSON\n \
106| | sqlite-graphrag graph stats --format json\n\n \
107| | # Show stats for a specific namespace\n \
108| | sqlite-graphrag graph stats --namespace project-x --format text\n\n \
109| |NOTES:\n \
110| | Reports node_count, edge_count, avg_degree, and max_degree.\n \
111| | Default format is JSON. Use `--format text` for a compact single-line summary.")]
112| |pub struct GraphStatsArgs {
113| | #[arg(long)]
114| | pub namespace: Option<String>,
115| | /// Output format for the stats response.
116| | #[arg(long, value_enum, default_value = "json")]
117| | pub format: GraphStatsFormat,
118| | #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
119| | pub json: bool,
120| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
121| | pub db: Option<String>,
122| |}
123| |
124| |/// Field to sort entities by in `graph entities`.
125| |#[derive(Debug, Clone, Copy, clap::ValueEnum)]
126| |pub enum EntitySortField {
127| | /// Sort alphabetically by entity name.
128| | Name,
129| | /// Sort by degree (total number of relationships, descending by default).
130| | Degree,
131| | /// Sort by entity creation timestamp.
132| | CreatedAt,
133| |}
134| |
135| |/// Sort direction for `graph entities`.
136| |#[derive(Debug, Clone, Copy, Default, clap::ValueEnum)]
137| |pub enum SortOrder {
138| | #[default]
139| | Asc,
140| | Desc,
141| |}
142| |
143| |#[derive(clap::Args)]
144| |#[command(after_long_help = "EXAMPLES:\n \
145| | # List all entities (default limit applies)\n \
146| | sqlite-graphrag graph entities\n\n \
147| | # Filter by entity type\n \
148| | sqlite-graphrag graph entities --entity-type person\n\n \
149| | # Filter by namespace and type\n \
150| | sqlite-graphrag graph entities --namespace project-x --entity-type concept\n\n \
151| | # Paginate results (skip first 20, return next 10)\n \
152| | sqlite-graphrag graph entities --offset 20 --limit 10\n\n \
153| | # Sort by degree descending (most connected first)\n \
154| | sqlite-graphrag graph entities --sort-by degree --order desc\n\n \
155| | # Sort by creation date ascending\n \
156| | sqlite-graphrag graph entities --sort-by created-at --order asc\n\n \
157| |NOTES:\n \
158| | Output is always JSON with `entities`, `total_count`, `limit`, and `offset` fields.\n \
159| | Entity types are strings extracted by GLiNER NER (e.g. `person`, `organization`, `location`).")]
160| |pub struct GraphEntitiesArgs {
161| | #[arg(long)]
162| | pub namespace: Option<String>,
163| | /// Filter by entity type (one of the 13 canonical types).
164| | #[arg(long, value_enum)]
165| | pub entity_type: Option<EntityType>,
166| | /// Maximum number of results to return.
167| | #[arg(long, default_value_t = crate::constants::K_GRAPH_ENTITIES_DEFAULT_LIMIT)]
168| | pub limit: usize,
169| | /// Number of results to skip for pagination.
170| | #[arg(long, default_value_t = 0usize)]
171| | pub offset: usize,
172| | /// Sort entities by this field. When omitted, the default order is by name ascending.
173| | #[arg(long, value_enum, help = "Sort entities by field")]
174| | pub sort_by: Option<EntitySortField>,
175| | /// Sort direction: `asc` (default) or `desc`.
176| | #[arg(long, value_enum, default_value_t = SortOrder::Asc, help = "Sort order")]
177| | pub order: SortOrder,
178| | #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
179| | pub json: bool,
180| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
181| | pub db: Option<String>,
182| |}
183| |
/// One entity as it appears in the full graph snapshot export.
#[derive(Serialize, Clone)]
struct NodeOut {
    /// Database id of the entity.
    id: i64,
    /// Entity name; also used as the edge endpoint label in the snapshot.
    name: String,
    /// Namespace the entity belongs to.
    namespace: String,
    /// Deprecated alias of `type` kept for backward-compat with pre-v1.0.35 clients.
    /// New consumers MUST read `type` instead. Will be removed in a future major release.
    kind: String,
    /// Canonical entity classification (organization, concept, person, etc.).
    /// Mirrors `kind` while the deprecation window is active.
    #[serde(rename = "type")]
    r#type: String,
}
197| |
/// One relationship in the snapshot export, with endpoints given by entity name.
#[derive(Serialize)]
struct EdgeOut {
    /// Name of the source entity.
    from: String,
    /// Name of the target entity.
    to: String,
    /// Relation label copied from the stored relationship.
    relation: String,
    /// Relationship weight copied from the stored relationship.
    weight: f64,
}
205| |
/// JSON document produced by the default `graph` export.
#[derive(Serialize)]
struct GraphSnapshot {
    /// All exported entities.
    nodes: Vec<NodeOut>,
    /// Same list as `nodes` (the exporter fills it with a clone), emitted under
    /// a second key.
    entities: Vec<NodeOut>,
    /// Relationships whose source and target were both found among the exported entities.
    edges: Vec<EdgeOut>,
    /// Wall-clock duration of the export in milliseconds.
    elapsed_ms: u64,
}
213| |
/// One relationship encountered during `graph traverse`.
#[derive(Serialize)]
struct TraverseHop {
    /// Name of the entity on the far side of the relationship.
    entity: String,
    /// Relation label of the traversed relationship.
    relation: String,
    /// `"outbound"` when the expanded entity is the source, `"inbound"` when it is the target.
    direction: String,
    /// Weight of the traversed relationship.
    weight: f64,
    /// Distance of `entity` from the traversal root along this path (root neighbours are 1).
    depth: u32,
}
222| |
/// JSON payload emitted by `graph traverse`.
#[derive(Serialize)]
struct GraphTraverseResponse {
    /// Root entity name as given on the command line.
    from: String,
    /// Namespace the traversal was resolved against.
    namespace: String,
    /// Maximum depth requested for the traversal.
    depth: u32,
    /// Every relationship encountered while expanding entities within the depth limit.
    hops: Vec<TraverseHop>,
    /// Wall-clock duration of the command in milliseconds.
    elapsed_ms: u64,
}
231| |
/// JSON payload emitted by `graph stats`.
#[derive(Serialize)]
struct GraphStatsResponse {
    /// Namespace filter that was applied; `None` when stats cover all namespaces.
    namespace: Option<String>,
    /// Number of rows in `entities` (within the namespace, if filtered).
    node_count: i64,
    /// Number of rows in `relationships` (filtered by the source entity's namespace, if any).
    edge_count: i64,
    /// `2 * edge_count / node_count`, or `0.0` when there are no nodes.
    avg_degree: f64,
    /// Largest value of the `degree` column among the counted entities (0 when there are none).
    max_degree: i64,
    /// Wall-clock duration of the command in milliseconds.
    elapsed_ms: u64,
}
241| |
/// One entity row in the `graph entities` listing.
#[derive(Serialize)]
struct EntityItem {
    /// Database id of the entity.
    id: i64,
    /// Entity name.
    name: String,
    /// Entity type as stored in the database.
    entity_type: String,
    /// Namespace the entity belongs to.
    namespace: String,
    /// Creation time rendered as `%Y-%m-%dT%H:%M:%SZ` from the stored integer timestamp.
    created_at: String,
    /// Total number of relationships (inbound + outbound) for this entity.
    degree: u32,
    /// Optional description; omitted from the JSON when absent.
    #[serde(skip_serializing_if = "Option::is_none")]
    description: Option<String>,
}
254| |
/// JSON payload emitted by `graph entities`.
// NOTE(review): the code that fills this struct is outside the visible chunk;
// the field notes below follow the CLI help text ("`entities`, `total_count`,
// `limit`, and `offset` fields") — confirm against `run_entities`.
#[derive(Serialize)]
struct GraphEntitiesResponse {
    /// The page of entities returned.
    entities: Vec<EntityItem>,
    /// Presumably the number of matching entities before pagination — TODO confirm.
    total_count: i64,
    /// Presumably echoes the `--limit` argument — TODO confirm.
    limit: usize,
    /// Presumably echoes the `--offset` argument — TODO confirm.
    offset: usize,
    /// Presumably the namespace filter, if any — TODO confirm.
    namespace: Option<String>,
    /// Duration in milliseconds, as in the other responses of this module.
    elapsed_ms: u64,
}
264| |
265| 0|pub fn run(args: GraphArgs) -> Result<(), AppError> {
266| 0| match args.subcommand {
267| 0| None => run_entities_snapshot(
268| 0| args.db.as_deref(),
269| 0| args.namespace.as_deref(),
270| 0| args.format,
271| 0| args.json,
272| 0| args.output.as_deref(),
273| | ),
274| 0| Some(GraphSubcommand::Traverse(a)) => run_traverse(a),
275| 0| Some(GraphSubcommand::Stats(a)) => run_stats(a),
276| 0| Some(GraphSubcommand::Entities(a)) => run_entities(a),
277| | }
278| 0|}
279| |
280| 0|fn run_entities_snapshot(
281| 0| db: Option<&str>,
282| 0| namespace: Option<&str>,
283| 0| format: GraphExportFormat,
284| 0| json: bool,
285| 0| output_path: Option<&std::path::Path>,
286| 0|) -> Result<(), AppError> {
287| 0| let inicio = Instant::now();
288| 0| let paths = AppPaths::resolve(db)?;
289| |
290| 0| crate::storage::connection::ensure_db_ready(&paths)?;
291| |
292| 0| let conn = open_ro(&paths.db)?;
293| |
294| 0| let nodes_raw = entities::list_entities(&conn, namespace)?;
295| 0| let edges_raw = entities::list_relationships_by_namespace(&conn, namespace)?;
296| |
297| 0| let id_to_name: HashMap<i64, String> =
298| 0| nodes_raw.iter().map(|n| (n.id, n.name.clone())).collect();
299| |
300| 0| let nodes: Vec<NodeOut> = nodes_raw
301| 0| .into_iter()
302| 0| .map(|n| NodeOut {
303| 0| id: n.id,
304| 0| name: n.name,
305| 0| namespace: n.namespace,
306| 0| r#type: n.kind.clone(),
307| 0| kind: n.kind,
308| 0| })
309| 0| .collect();
310| |
311| 0| let mut edges: Vec<EdgeOut> = Vec::with_capacity(edges_raw.len());
312| 0| let mut orphan_edges: usize = 0;
313| 0| for r in edges_raw {
314| 0| let from = match id_to_name.get(&r.source_id) {
315| 0| Some(n) => n.clone(),
316| | None => {
317| 0| orphan_edges += 1;
318| 0| tracing::warn!(target: "graph_export", source_id = r.source_id, relation = %r.relation, "edge skipped: source entity not found in id_to_name map");
319| 0| continue;
320| | }
321| | };
322| 0| let to = match id_to_name.get(&r.target_id) {
323| 0| Some(n) => n.clone(),
324| | None => {
325| 0| orphan_edges += 1;
326| 0| tracing::warn!(target: "graph_export", target_id = r.target_id, relation = %r.relation, "edge skipped: target entity not found in id_to_name map");
327| 0| continue;
328| | }
329| | };
330| 0| edges.push(EdgeOut {
331| 0| from,
332| 0| to,
333| 0| relation: r.relation,
334| 0| weight: r.weight,
335| 0| });
336| | }
337| 0| if orphan_edges > 0 {
338| 0| tracing::warn!(target: "graph_export",
339| | count = orphan_edges,
340| 0| "edges skipped due to orphaned entity references"
341| | );
342| 0| }
343| |
344| 0| let effective_format = if json {
345| 0| GraphExportFormat::Json
346| | } else {
347| 0| format
348| | };
349| |
350| 0| if effective_format == GraphExportFormat::Ndjson {
351| 0| let elapsed_ms = inicio.elapsed().as_millis() as u64;
352| 0| render_ndjson_streaming(&nodes, &edges, elapsed_ms, output_path)?;
353| 0| return Ok(());
354| 0| }
355| |
356| 0| let rendered = match effective_format {
357| | GraphExportFormat::Json => {
358| 0| let entities = nodes.clone();
359| 0| render_json(&GraphSnapshot {
360| 0| nodes,
361| 0| entities,
362| 0| edges,
363| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
364| 0| })?
365| | }
366| 0| GraphExportFormat::Dot => render_dot(&nodes, &edges),
367| 0| GraphExportFormat::Mermaid => render_mermaid(&nodes, &edges),
368| 0| GraphExportFormat::Ndjson => unreachable!("ndjson handled above"),
369| | };
370| |
371| 0| if let Some(path) = output_path.filter(|_| !json) {
372| 0| fs::write(path, &rendered)?;
373| 0| output::emit_progress(&format!("wrote {}", path.display()));
374| 0| } else {
375| 0| output::emit_text(&rendered);
376| 0| }
377| |
378| 0| Ok(())
379| 0|}
380| |
381| 0|fn run_traverse(args: GraphTraverseArgs) -> Result<(), AppError> {
382| 0| let inicio = Instant::now();
383| 0| let _ = args.format;
384| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
385| |
386| 0| crate::storage::connection::ensure_db_ready(&paths)?;
387| |
388| 0| let conn = open_ro(&paths.db)?;
389| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
390| |
391| 0| let from_id = entities::find_entity_id(&conn, &namespace, &args.from)?
392| 0| .ok_or_else(|| AppError::NotFound(format!("entity '{}' not found", args.from)))?;
393| |
394| 0| let all_rels = entities::list_relationships_by_namespace(&conn, Some(&namespace))?;
395| 0| let all_entities = entities::list_entities(&conn, Some(&namespace))?;
396| 0| let id_to_name: HashMap<i64, String> = all_entities
397| 0| .iter()
398| 0| .map(|e| (e.id, e.name.clone()))
399| 0| .collect();
400| |
401| 0| let mut hops: Vec<TraverseHop> = Vec::with_capacity(16);
402| 0| let mut visited: std::collections::HashSet<i64> =
403| 0| std::collections::HashSet::with_capacity(args.depth as usize * 10);
404| 0| let mut frontier: Vec<(i64, u32)> = vec![(from_id, 0)];
405| |
406| 0| while let Some((current_id, current_depth)) = frontier.pop() {
407| 0| if current_depth >= args.depth || visited.contains(¤t_id) {
408| 0| continue;
409| 0| }
410| 0| visited.insert(current_id);
411| |
412| 0| for rel in &all_rels {
413| 0| if rel.source_id == current_id {
414| 0| if let Some(target_name) = id_to_name.get(&rel.target_id) {
415| 0| hops.push(TraverseHop {
416| 0| entity: target_name.clone(),
417| 0| relation: rel.relation.clone(),
418| 0| direction: "outbound".to_string(),
419| 0| weight: rel.weight,
420| 0| depth: current_depth + 1,
421| 0| });
422| 0| frontier.push((rel.target_id, current_depth + 1));
423| 0| }
424| 0| } else if rel.target_id == current_id {
425| 0| if let Some(source_name) = id_to_name.get(&rel.source_id) {
426| 0| hops.push(TraverseHop {
427| 0| entity: source_name.clone(),
428| 0| relation: rel.relation.clone(),
429| 0| direction: "inbound".to_string(),
430| 0| weight: rel.weight,
431| 0| depth: current_depth + 1,
432| 0| });
433| 0| frontier.push((rel.source_id, current_depth + 1));
434| 0| }
435| 0| }
436| | }
437| | }
438| |
439| 0| output::emit_json(&GraphTraverseResponse {
440| 0| from: args.from,
441| 0| namespace,
442| 0| depth: args.depth,
443| 0| hops,
444| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
445| 0| })?;
446| |
447| 0| Ok(())
448| 0|}
449| |
450| 0|fn run_stats(args: GraphStatsArgs) -> Result<(), AppError> {
451| 0| let inicio = Instant::now();
452| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
453| |
454| 0| crate::storage::connection::ensure_db_ready(&paths)?;
455| |
456| 0| let conn = open_ro(&paths.db)?;
457| 0| let ns = args.namespace.as_deref();
458| |
459| 0| let node_count: i64 = if let Some(n) = ns {
460| 0| conn.query_row(
461| 0| "SELECT COUNT(*) FROM entities WHERE namespace = ?1",
462| 0| rusqlite::params![n],
463| 0| |r| r.get(0),
464| 0| )?
465| | } else {
466| 0| conn.query_row("SELECT COUNT(*) FROM entities", [], |r| r.get(0))?
467| | };
468| |
469| 0| let edge_count: i64 = if let Some(n) = ns {
470| 0| conn.query_row(
471| 0| "SELECT COUNT(*) FROM relationships r
472| 0| JOIN entities s ON s.id = r.source_id
473| 0| WHERE s.namespace = ?1",
474| 0| rusqlite::params![n],
475| 0| |r| r.get(0),
476| 0| )?
477| | } else {
478| 0| conn.query_row("SELECT COUNT(*) FROM relationships", [], |r| r.get(0))?
479| | };
480| |
481| 0| let max_degree: i64 = if let Some(n) = ns {
482| 0| conn.query_row(
483| 0| "SELECT COALESCE(MAX(degree), 0) FROM entities WHERE namespace = ?1",
484| 0| rusqlite::params![n],
485| 0| |r| r.get(0),
486| 0| )?
487| | } else {
488| 0| conn.query_row("SELECT COALESCE(MAX(degree), 0) FROM entities", [], |r| {
489| 0| r.get(0)
490| 0| })?
491| | };
492| |
493| | // avg_degree = 2 * edge_count / node_count (each edge contributes 2 to total degree sum).
494| 0| let avg_degree = if node_count > 0 {
495| 0| 2.0 * (edge_count as f64) / (node_count as f64)
496| | } else {
497| 0| 0.0
498| | };
499| |
500| 0| let resp = GraphStatsResponse {
501| 0| namespace: args.namespace,
502| 0| node_count,
503| 0| edge_count,
504| 0| avg_degree,
505| 0| max_degree,
506| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
507| 0| };
508| |
509| 0| let effective_format = if args.json {
510| 0| GraphStatsFormat::Json
511| | } else {
512| 0| args.format
513| | };
514| |
515| 0| match effective_format {
516| 0| GraphStatsFormat::Json => output::emit_json(&resp)?,
517| 0| GraphStatsFormat::Text => {
518| 0| output::emit_text(&format!(
519| 0| "nodes={} edges={} avg_degree={:.2} max_degree={} namespace={}",
520| 0| resp.node_count,
521| 0| resp.edge_count,
522| 0| resp.avg_degree,
523| 0| resp.max_degree,
524| 0| resp.namespace.as_deref().unwrap_or("all"),
525| 0| ));
526| 0| }
527| | }
528| |
529| 0| Ok(())
530| 0|}
531| |
532| |/// Builds the `ORDER BY` clause fragment from sort options.
533| |///
534| |/// Returns a static SQL fragment such as `ORDER BY e.name ASC`.
535| 6|fn build_order_by(sort_by: Option<EntitySortField>, order: SortOrder) -> &'static str {
536| | // The combinations are enumerated as static strings to avoid
537| | // format!() allocations in the hot path and satisfy the borrow checker
538| | // when the string is used inside conn.prepare().
539| 6| match (sort_by, order) {
540| | (None, SortOrder::Asc) | (Some(EntitySortField::Name), SortOrder::Asc) => {
541| 1| "ORDER BY e.name ASC"
542| | }
543| 1| (Some(EntitySortField::Name), SortOrder::Desc) => "ORDER BY e.name DESC",
544| 1| (Some(EntitySortField::Degree), SortOrder::Asc) => "ORDER BY degree ASC",
545| 1| (Some(EntitySortField::Degree), SortOrder::Desc) => "ORDER BY degree DESC",
546| 1| (Some(EntitySortField::CreatedAt), SortOrder::Asc) => "ORDER BY e.created_at ASC",
547| 1| (Some(EntitySortField::CreatedAt), SortOrder::Desc) => "ORDER BY e.created_at DESC",
548| | // Fallback: None/Desc → sort by name desc (consistent with dir variable).
549| 0| (None, SortOrder::Desc) => "ORDER BY e.name DESC",
550| | }
551| 6|}
552| |
553| 0|fn run_entities(args: GraphEntitiesArgs) -> Result<(), AppError> {
554| 0| let inicio = Instant::now();
555| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
556| |
557| 0| crate::storage::connection::ensure_db_ready(&paths)?;
558| |
559| 0| let conn = open_ro(&paths.db)?;
560| |
561| 0| let row_to_item = |r: &rusqlite::Row<'_>| -> rusqlite::Result<EntityItem> {
562| 0| let ts: i64 = r.get(4)?;
563| 0| let created_at = chrono::DateTime::from_timestamp(ts, 0)
564| 0| .unwrap_or_default()
565| 0| .format("%Y-%m-%dT%H:%M:%SZ")
566| 0| .to_string();
567| | Ok(EntityItem {
568| 0| id: r.get(0)?,
569| 0| name: r.get(1)?,
570| 0| entity_type: r.get(2)?,
571| 0| namespace: r.get(3)?,
572| 0| created_at,
573| 0| degree: r.get(5)?,
574| 0| description: r.get(6)?,
575| | })
576| 0| };
577| |
578| 0| let limit_i = args.limit as i64;
579| 0| let offset_i = args.offset as i64;
580| 0| let order_clause = build_order_by(args.sort_by, args.order);
581| |
582| 0| let base_select = "SELECT e.id, e.name, COALESCE(e.type, ''), e.namespace, e.created_at,
583| 0| (SELECT COUNT(*) FROM relationships r
584| 0| WHERE r.source_id = e.id OR r.target_id = e.id) AS degree,
585| 0| e.description
586| 0| FROM entities e";
587| |
588| 0| let (total_count, items) = match (
589| 0| args.namespace.as_deref(),
590| 0| args.entity_type.map(|et| et.as_str()),
591| | ) {
592| 0| (Some(ns), Some(et)) => {
593| 0| let count: i64 = conn.query_row(
594| 0| "SELECT COUNT(*) FROM entities WHERE namespace = ?1 AND type = ?2",
595| 0| rusqlite::params![ns, et],
596| 0| |r| r.get(0),
597| 0| )?;
598| 0| let sql = format!(
599| 0| "{base_select} WHERE e.namespace = ?1 AND e.type = ?2 {order_clause} LIMIT ?3 OFFSET ?4"
600| | );
601| 0| let mut stmt = conn.prepare(&sql)?;
602| 0| let rows = stmt
603| 0| .query_map(rusqlite::params![ns, et, limit_i, offset_i], row_to_item)?
604| 0| .collect::<rusqlite::Result<Vec<_>>>()?;
605| 0| (count, rows)
606| | }
607| 0| (Some(ns), None) => {
608| 0| let count: i64 = conn.query_row(
609| 0| "SELECT COUNT(*) FROM entities WHERE namespace = ?1",
610| 0| rusqlite::params![ns],
611| 0| |r| r.get(0),
612| 0| )?;
613| 0| let sql =
614| 0| format!("{base_select} WHERE e.namespace = ?1 {order_clause} LIMIT ?2 OFFSET ?3");
615| 0| let mut stmt = conn.prepare(&sql)?;
616| 0| let rows = stmt
617| 0| .query_map(rusqlite::params![ns, limit_i, offset_i], row_to_item)?
618| 0| .collect::<rusqlite::Result<Vec<_>>>()?;
619| 0| (count, rows)
620| | }
621| 0| (None, Some(et)) => {
622| 0| let count: i64 = conn.query_row(
623| 0| "SELECT COUNT(*) FROM entities WHERE type = ?1",
624| 0| rusqlite::params![et],
625| 0| |r| r.get(0),
626| 0| )?;
627| 0| let sql = format!("{base_select} WHERE e.type = ?1 {order_clause} LIMIT ?2 OFFSET ?3");
628| 0| let mut stmt = conn.prepare(&sql)?;
629| 0| let rows = stmt
630| 0| .query_map(rusqlite::params![et, limit_i, offset_i], row_to_item)?
631| 0| .collect::<rusqlite::Result<Vec<_>>>()?;
632| 0| (count, rows)
633| | }
634| | (None, None) => {
635| 0| let count: i64 = conn.query_row("SELECT COUNT(*) FROM entities", [], |r| r.get(0))?;
636| 0| let sql = format!("{base_select} {order_clause} LIMIT ?1 OFFSET ?2");
637| 0| let mut stmt = conn.prepare(&sql)?;
638| 0| let rows = stmt
639| 0| .query_map(rusqlite::params![limit_i, offset_i], row_to_item)?
640| 0| .collect::<rusqlite::Result<Vec<_>>>()?;
641| 0| (count, rows)
642| | }
643| | };
644| |
645| 0| output::emit_json(&GraphEntitiesResponse {
646| 0| entities: items,
647| 0| total_count,
648| 0| limit: args.limit,
649| 0| offset: args.offset,
650| 0| namespace: args.namespace,
651| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
652| 0| })
653| 0|}
654| |
655| 1|fn render_json(snapshot: &GraphSnapshot) -> Result<String, AppError> {
656| 1| Ok(serde_json::to_string_pretty(snapshot)?)
^0
657| 1|}
658| |
659| |/// Streams the graph as NDJSON: one object per node, one per edge, then a summary.
660| |///
661| |/// Each line is flushed immediately so consumers can process incrementally.
662| |/// When `output_path` is `Some`, lines are written to the file; otherwise to stdout.
663| 0|fn render_ndjson_streaming(
664| 0| nodes: &[NodeOut],
665| 0| edges: &[EdgeOut],
666| 0| elapsed_ms: u64,
667| 0| output_path: Option<&std::path::Path>,
668| 0|) -> Result<(), AppError> {
669| | #[derive(serde::Serialize)]
670| | struct NdjsonNode<'a> {
671| | kind: &'static str,
672| | id: i64,
673| | name: &'a str,
674| | namespace: &'a str,
675| | #[serde(rename = "type")]
676| | r#type: &'a str,
677| | }
678| | #[derive(serde::Serialize)]
679| | struct NdjsonEdge<'a> {
680| | kind: &'static str,
681| | from: &'a str,
682| | to: &'a str,
683| | relation: &'a str,
684| | weight: f64,
685| | }
686| | #[derive(serde::Serialize)]
687| | struct NdjsonSummary {
688| | kind: &'static str,
689| | nodes: usize,
690| | edges: usize,
691| | elapsed_ms: u64,
692| | }
693| |
694| | use std::io::Write as IoWrite;
695| |
696| 0| let mut buf: Vec<u8> = Vec::with_capacity(4096);
697| |
698| 0| let emit_line =
699| 0| |buf: &mut Vec<u8>, line: &str, path: Option<&std::path::Path>| -> Result<(), AppError> {
700| 0| buf.clear();
701| 0| buf.extend_from_slice(line.as_bytes());
702| 0| buf.push(b'\n');
703| 0| if let Some(p) = path {
704| 0| let mut f = std::fs::OpenOptions::new()
705| 0| .create(true)
706| 0| .append(true)
707| 0| .open(p)
708| 0| .map_err(AppError::Io)?;
709| 0| f.write_all(buf).map_err(AppError::Io)?;
710| 0| } else {
711| 0| output::emit_text(line);
712| 0| }
713| 0| Ok(())
714| 0| };
715| |
716| | // Truncate the output file once before starting (avoids re-opening with append for every line).
717| 0| if let Some(p) = output_path {
718| 0| fs::write(p, b"")?;
719| 0| }
720| |
721| 0| for node in nodes {
722| 0| let obj = NdjsonNode {
723| 0| kind: "node",
724| 0| id: node.id,
725| 0| name: &node.name,
726| 0| namespace: &node.namespace,
727| 0| r#type: &node.r#type,
728| 0| };
729| 0| let line = serde_json::to_string(&obj)?;
730| 0| emit_line(&mut buf, &line, output_path)?;
731| | }
732| |
733| 0| for edge in edges {
734| 0| let obj = NdjsonEdge {
735| 0| kind: "edge",
736| 0| from: &edge.from,
737| 0| to: &edge.to,
738| 0| relation: &edge.relation,
739| 0| weight: edge.weight,
740| 0| };
741| 0| let line = serde_json::to_string(&obj)?;
742| 0| emit_line(&mut buf, &line, output_path)?;
743| | }
744| |
745| 0| let summary = NdjsonSummary {
746| 0| kind: "summary",
747| 0| nodes: nodes.len(),
748| 0| edges: edges.len(),
749| 0| elapsed_ms,
750| 0| };
751| 0| let line = serde_json::to_string(&summary)?;
752| 0| emit_line(&mut buf, &line, output_path)?;
753| |
754| 0| Ok(())
755| 0|}
756| |
/// Converts an arbitrary entity name into a valid unquoted Graphviz DOT identifier.
///
/// Every character other than ASCII letters, ASCII digits and `_` is replaced
/// by `_`. A leading `_` is then added when the result would otherwise be
/// rejected or misread by DOT as a bare identifier:
///
/// * the result is empty,
/// * it starts with a digit (unquoted IDs must not begin with one), or
/// * it equals a DOT keyword (`node`, `edge`, `graph`, `digraph`, `subgraph`,
///   `strict`), compared case-insensitively.
///
/// The mapping is not injective: distinct names such as `a-b` and `a_b`
/// produce the same identifier.
fn sanitize_dot_id(raw: &str) -> String {
    const DOT_KEYWORDS: [&str; 6] = ["node", "edge", "graph", "digraph", "subgraph", "strict"];

    // Each input char maps to exactly one ASCII byte; +1 leaves room for the prefix.
    let mut id = String::with_capacity(raw.len() + 1);
    id.extend(raw.chars().map(|c| {
        if c.is_ascii_alphanumeric() || c == '_' {
            c
        } else {
            '_'
        }
    }));

    let needs_prefix = id.is_empty()
        || id.starts_with(|c: char| c.is_ascii_digit())
        || DOT_KEYWORDS.iter().any(|kw| id.eq_ignore_ascii_case(kw));
    if needs_prefix {
        id.insert(0, '_');
    }
    id
}
768| |
769| 0|fn render_dot(nodes: &[NodeOut], edges: &[EdgeOut]) -> String {
770| | use std::fmt::Write;
771| 0| let mut out = String::with_capacity(nodes.len() * 80 + edges.len() * 60 + 300);
772| 0| out.push_str("digraph sqlite_graphrag {\n");
773| 0| out.push_str(" graph [bgcolor=\"white\", fontname=\"Helvetica Neue\", fontsize=12, rankdir=LR, nodesep=0.8, ranksep=1.2];\n");
774| 0| out.push_str(" node [shape=box, style=\"filled,rounded\", fillcolor=\"#F2F2F7\", fontname=\"Helvetica Neue\", fontsize=11, color=\"#C7C7CC\"];\n");
775| 0| out.push_str(" edge [fontname=\"Helvetica Neue\", fontsize=9, color=\"#8E8E93\"];\n");
776| 0| for node in nodes {
777| 0| let node_id = sanitize_dot_id(&node.name);
778| 0| let escaped = node.name.replace('"', "\\\"");
779| 0| let _ = writeln!(out, " {node_id} [label=\"{escaped}\"];");
780| 0| }
781| 0| for edge in edges {
782| 0| let from = sanitize_dot_id(&edge.from);
783| 0| let to = sanitize_dot_id(&edge.to);
784| 0| let label = edge.relation.replace('"', "\\\"");
785| 0| let _ = writeln!(out, " {from} -> {to} [label=\"{label}\"];");
786| 0| }
787| 0| out.push_str("}\n");
788| 0| out
789| 0|}
790| |
/// Builds a Mermaid node identifier from an arbitrary entity name.
///
/// ASCII letters, ASCII digits and `_` are kept; every other character
/// (including non-ASCII letters) is replaced by a single `_`.
fn sanitize_mermaid_id(raw: &str) -> String {
    let mut id = String::with_capacity(raw.len());
    for ch in raw.chars() {
        id.push(match ch {
            'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => ch,
            _ => '_',
        });
    }
    id
}
802| |
803| 0|fn render_mermaid(nodes: &[NodeOut], edges: &[EdgeOut]) -> String {
804| | use std::fmt::Write;
805| 0| let mut out = String::with_capacity(nodes.len() * 50 + edges.len() * 40 + 200);
806| 0| out.push_str("%%{init: {'theme': 'neutral', 'themeVariables': {'primaryColor': '#F2F2F7', 'primaryTextColor': '#1C1C1E', 'primaryBorderColor': '#C7C7CC', 'lineColor': '#8E8E93'}}}%%\n");
807| 0| out.push_str("graph LR\n");
808| 0| for node in nodes {
809| 0| let id = sanitize_mermaid_id(&node.name);
810| 0| let escaped = node.name.replace('"', "\\\"");
811| 0| let _ = writeln!(out, " {id}[\"{escaped}\"]");
812| 0| }
813| 0| for edge in edges {
814| 0| let from = sanitize_mermaid_id(&edge.from);
815| 0| let to = sanitize_mermaid_id(&edge.to);
816| 0| let label = edge.relation.replace('|', "\\|");
817| 0| let _ = writeln!(out, " {from} -->|{label}| {to}");
818| 0| }
819| 0| out
820| 0|}
821| |
// Unit tests for the graph command: JSON shapes of the response structs,
// the average-degree formula, the ORDER BY builder, and CLI argument parsing.
// None of these tests open a database.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::cli::{Cli, Commands};
    use clap::Parser;

    // Builds a fixture node whose `kind` and `type` fields carry the same value.
    fn make_node(kind: &str) -> NodeOut {
        NodeOut {
            id: 1,
            name: "test-entity".to_string(),
            namespace: "default".to_string(),
            kind: kind.to_string(),
            r#type: kind.to_string(),
        }
    }

    // `kind` and `type` must serialize to the same value.
    #[test]
    fn node_out_type_duplicates_kind() {
        let node = make_node("agent");
        let json = serde_json::to_value(&node).expect("serialization must work");
        assert_eq!(json["kind"], json["type"]);
        assert_eq!(json["kind"], "agent");
        assert_eq!(json["type"], "agent");
    }

    // Every `NodeOut` field must be present as a JSON key.
    #[test]
    fn node_out_serializes_all_fields() {
        let node = make_node("document");
        let json = serde_json::to_value(&node).expect("serialization must work");
        assert!(json.get("id").is_some());
        assert!(json.get("name").is_some());
        assert!(json.get("namespace").is_some());
        assert!(json.get("kind").is_some());
        assert!(json.get("type").is_some());
    }

    // Round-trips a snapshot through `render_json` and checks the first node.
    #[test]
    fn graph_snapshot_serializes_nodes_with_type() {
        let node = make_node("concept");
        let entities = vec![make_node("concept")];
        let snapshot = GraphSnapshot {
            nodes: vec![node],
            entities,
            edges: vec![],
            elapsed_ms: 0,
        };
        let json_str = render_json(&snapshot).expect("rendering must work");
        let json: serde_json::Value = serde_json::from_str(&json_str).expect("valid json");
        let first_node = &json["nodes"][0];
        assert_eq!(first_node["kind"], first_node["type"]);
        assert_eq!(first_node["type"], "concept");
    }

    // Checks the top-level keys and the nested hop shape of a traverse response.
    #[test]
    fn graph_traverse_response_serializes_correctly() {
        let resp = GraphTraverseResponse {
            from: "entity-a".to_string(),
            namespace: "global".to_string(),
            depth: 2,
            hops: vec![TraverseHop {
                entity: "entity-b".to_string(),
                relation: "uses".to_string(),
                direction: "outbound".to_string(),
                weight: 1.0,
                depth: 1,
            }],
            elapsed_ms: 5,
        };
        let json = serde_json::to_value(&resp).unwrap();
        assert_eq!(json["from"], "entity-a");
        assert_eq!(json["depth"], 2);
        assert!(json["hops"].is_array());
        assert_eq!(json["hops"][0]["direction"], "outbound");
    }

    // Checks that the numeric stats fields serialize under their field names.
    #[test]
    fn graph_stats_response_serializes_correctly() {
        let resp = GraphStatsResponse {
            namespace: Some("global".to_string()),
            node_count: 10,
            edge_count: 15,
            avg_degree: 3.0,
            max_degree: 7,
            elapsed_ms: 2,
        };
        let json = serde_json::to_value(&resp).unwrap();
        assert_eq!(json["node_count"], 10);
        assert_eq!(json["edge_count"], 15);
        assert_eq!(json["avg_degree"], 3.0);
        assert_eq!(json["max_degree"], 7);
    }

    // Test-local copy of the average-degree formula (2 * edges / nodes, or 0.0
    // when there are no nodes). NOTE(review): the tests below exercise this
    // copy, not the stats handler itself, so the two must be kept in sync by hand.
    fn compute_avg_degree(node_count: i64, edge_count: i64) -> f64 {
        if node_count > 0 {
            2.0 * (edge_count as f64) / (node_count as f64)
        } else {
            0.0
        }
    }

    // Guards the division: an empty graph has average degree 0.0.
    #[test]
    fn avg_degree_is_zero_when_no_nodes() {
        assert_eq!(compute_avg_degree(0, 0), 0.0);
    }

    #[test]
    fn avg_degree_is_zero_when_nodes_but_no_edges() {
        // Reproduces L1 bug: previously returned 1.0 instead of 0.0.
        assert_eq!(compute_avg_degree(2, 0), 0.0);
    }

    #[test]
    fn avg_degree_is_two_when_triangle() {
        // 3 nodes, 3 edges: 2 * 3 / 3 = 2.0
        assert_eq!(compute_avg_degree(3, 3), 2.0);
    }

    // Checks the pagination envelope and the nested entity keys.
    #[test]
    fn graph_entities_response_serializes_required_fields() {
        let resp = GraphEntitiesResponse {
            entities: vec![EntityItem {
                id: 1,
                name: "claude-code".to_string(),
                entity_type: "agent".to_string(),
                namespace: "global".to_string(),
                created_at: "2026-01-01T00:00:00Z".to_string(),
                degree: 0,
                description: None,
            }],
            total_count: 1,
            limit: 50,
            offset: 0,
            namespace: Some("global".to_string()),
            elapsed_ms: 3,
        };
        let json = serde_json::to_value(&resp).unwrap();
        assert!(json["entities"].is_array());
        assert_eq!(json["entities"][0]["name"], "claude-code");
        assert_eq!(json["entities"][0]["entity_type"], "agent");
        assert_eq!(json["total_count"], 1);
        assert_eq!(json["limit"], 50);
        assert_eq!(json["offset"], 0);
        assert_eq!(json["namespace"], "global");
    }

    // Checks that a fully populated entity serializes its scalar fields verbatim.
    #[test]
    fn entity_item_serializes_all_fields() {
        let item = EntityItem {
            id: 42,
            name: "test-entity".to_string(),
            entity_type: "concept".to_string(),
            namespace: "project-a".to_string(),
            created_at: "2026-04-19T12:00:00Z".to_string(),
            degree: 3,
            description: Some("test description".to_string()),
        };
        let json = serde_json::to_value(&item).unwrap();
        assert_eq!(json["id"], 42);
        assert_eq!(json["name"], "test-entity");
        assert_eq!(json["entity_type"], "concept");
        assert_eq!(json["namespace"], "project-a");
        assert_eq!(json["created_at"], "2026-04-19T12:00:00Z");
    }

    #[test]
    fn entity_item_entity_type_is_never_null() {
        // P2-C: entity_type must never be null, even when DB column is empty.
        let item = EntityItem {
            id: 1,
            name: "sem-tipo".to_string(),
            entity_type: String::new(),
            namespace: "ns".to_string(),
            created_at: "2026-01-01T00:00:00Z".to_string(),
            degree: 0,
            description: None,
        };
        let json = serde_json::to_value(&item).unwrap();
        assert!(
            !json["entity_type"].is_null(),
            "entity_type must not be null"
        );
        assert!(json["entity_type"].is_string());
    }

    // `graph traverse` must not accept the `dot` format value.
    #[test]
    fn graph_traverse_cli_rejects_format_dot() {
        let parsed = Cli::try_parse_from([
            "sqlite-graphrag",
            "graph",
            "traverse",
            "--from",
            "AuthDecision",
            "--format",
            "dot",
        ]);
        assert!(parsed.is_err(), "graph traverse must reject format=dot");
    }

    // `graph stats --format text` must parse and select the text format.
    #[test]
    fn graph_stats_cli_accepts_format_text() {
        let parsed = Cli::try_parse_from(["sqlite-graphrag", "graph", "stats", "--format", "text"])
            .expect("graph stats --format text must be accepted");

        match parsed.command {
            Commands::Graph(args) => match args.subcommand {
                Some(GraphSubcommand::Stats(stats)) => {
                    assert_eq!(stats.format, GraphStatsFormat::Text);
                }
                _ => unreachable!("unexpected subcommand"),
            },
            _ => unreachable!("unexpected command"),
        }
    }

    // `graph stats` must not accept the `mermaid` format value.
    #[test]
    fn graph_stats_cli_rejects_format_mermaid() {
        let parsed =
            Cli::try_parse_from(["sqlite-graphrag", "graph", "stats", "--format", "mermaid"]);
        assert!(parsed.is_err(), "graph stats must reject format=mermaid");
    }

    // The list is exposed under `entities`; the legacy `items` key must be absent.
    #[test]
    fn graph_entities_response_has_no_items_key() {
        let resp = GraphEntitiesResponse {
            entities: vec![],
            total_count: 0,
            limit: 50,
            offset: 0,
            namespace: None,
            elapsed_ms: 0,
        };
        let json = serde_json::to_value(&resp).unwrap();
        assert!(
            json.get("items").is_none(),
            "legacy 'items' key must not appear"
        );
        assert!(
            json.get("entities").is_some(),
            "'entities' key must be present"
        );
    }

    // The following tests pin the exact SQL fragment for each field/direction pair.
    #[test]
    fn build_order_by_defaults_to_name_asc() {
        let clause = build_order_by(None, SortOrder::Asc);
        assert_eq!(clause, "ORDER BY e.name ASC");
    }

    #[test]
    fn build_order_by_name_desc() {
        let clause = build_order_by(Some(EntitySortField::Name), SortOrder::Desc);
        assert_eq!(clause, "ORDER BY e.name DESC");
    }

    #[test]
    fn build_order_by_degree_desc() {
        let clause = build_order_by(Some(EntitySortField::Degree), SortOrder::Desc);
        assert_eq!(clause, "ORDER BY degree DESC");
    }

    #[test]
    fn build_order_by_degree_asc() {
        let clause = build_order_by(Some(EntitySortField::Degree), SortOrder::Asc);
        assert_eq!(clause, "ORDER BY degree ASC");
    }

    #[test]
    fn build_order_by_created_at_asc() {
        let clause = build_order_by(Some(EntitySortField::CreatedAt), SortOrder::Asc);
        assert_eq!(clause, "ORDER BY e.created_at ASC");
    }

    #[test]
    fn build_order_by_created_at_desc() {
        let clause = build_order_by(Some(EntitySortField::CreatedAt), SortOrder::Desc);
        assert_eq!(clause, "ORDER BY e.created_at DESC");
    }

    // `--sort-by degree --order desc` must map to the matching enum variants.
    #[test]
    fn graph_entities_cli_accepts_sort_by_degree_desc() {
        let parsed = Cli::try_parse_from([
            "sqlite-graphrag",
            "graph",
            "entities",
            "--sort-by",
            "degree",
            "--order",
            "desc",
        ])
        .expect("graph entities --sort-by degree --order desc must parse");
        match parsed.command {
            Commands::Graph(args) => match args.subcommand {
                Some(GraphSubcommand::Entities(e)) => {
                    assert!(matches!(e.sort_by, Some(EntitySortField::Degree)));
                    assert!(matches!(e.order, SortOrder::Desc));
                }
                _ => unreachable!("unexpected subcommand"),
            },
            _ => unreachable!("unexpected command"),
        }
    }

    // The CLI spelling is `created-at`; omitting `--order` must yield ascending.
    #[test]
    fn graph_entities_cli_accepts_sort_by_created_at_asc() {
        let parsed = Cli::try_parse_from([
            "sqlite-graphrag",
            "graph",
            "entities",
            "--sort-by",
            "created-at",
        ])
        .expect("graph entities --sort-by created-at must parse");
        match parsed.command {
            Commands::Graph(args) => match args.subcommand {
                Some(GraphSubcommand::Entities(e)) => {
                    assert!(matches!(e.sort_by, Some(EntitySortField::CreatedAt)));
                    assert!(matches!(e.order, SortOrder::Asc));
                }
                _ => unreachable!("unexpected subcommand"),
            },
            _ => unreachable!("unexpected command"),
        }
    }

    // Without sort flags, `sort_by` is `None` and `order` is ascending.
    #[test]
    fn graph_entities_cli_defaults_to_no_sort_by() {
        let parsed = Cli::try_parse_from(["sqlite-graphrag", "graph", "entities"])
            .expect("graph entities must parse without sort flags");
        match parsed.command {
            Commands::Graph(args) => match args.subcommand {
                Some(GraphSubcommand::Entities(e)) => {
                    assert!(e.sort_by.is_none(), "sort_by must default to None");
                    assert!(
                        matches!(e.order, SortOrder::Asc),
                        "order must default to Asc"
                    );
                }
                _ => unreachable!("unexpected subcommand"),
            },
            _ => unreachable!("unexpected command"),
        }
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/health.rs:
1| |//! Handler for the `health` CLI subcommand.
2| |
3| |use crate::errors::AppError;
4| |use crate::output;
5| |use crate::paths::AppPaths;
6| |use crate::storage::connection::open_ro;
7| |use serde::Serialize;
8| |use std::fs;
9| |use std::time::Instant;
10| |
// Command-line arguments for the `health` subcommand.
//
// NOTE: the comments added here are plain `//` comments on purpose. clap's
// derive turns `///` doc comments into user-visible help text, so adding doc
// comments would change the `--help` output.
#[derive(clap::Args)]
#[command(after_long_help = "EXAMPLES:\n \
    # Check database health (connectivity, integrity, vector index)\n \
    sqlite-graphrag health\n\n \
    # Check health of a database at a custom path\n \
    sqlite-graphrag health --db /path/to/graphrag.sqlite\n\n \
    # Use SQLITE_GRAPHRAG_DB_PATH env var\n \
    SQLITE_GRAPHRAG_DB_PATH=/data/graphrag.sqlite sqlite-graphrag health")]
pub struct HealthArgs {
    // Optional database path; falls back to the SQLITE_GRAPHRAG_DB_PATH
    // environment variable when the flag is not given.
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,
    /// Explicit JSON flag. Accepted as a no-op because output is already JSON by default.
    #[arg(long, default_value_t = false)]
    pub json: bool,
    /// Output format: `json` or `text`. JSON is always emitted on stdout regardless of the value.
    // Hidden from help (`hide = true`); only `json` and `text` are accepted values.
    #[arg(long, value_parser = ["json", "text"], hide = true)]
    pub format: Option<String>,
}
29| |
/// Row counts reported under the `counts` key of the health response.
///
/// All fields are zero in the response emitted when the integrity check fails.
#[derive(Serialize)]
struct HealthCounts {
    /// Number of memories. NOTE(review): presumably the count of rows with
    /// `deleted_at IS NULL` computed by the handler — confirm at the construction site.
    memories: i64,
    /// Alias of `memories` for the documented contract in SKILL.md.
    memories_total: i64,
    /// Number of rows in the `entities` table.
    entities: i64,
    /// Number of rows in the `relationships` table.
    relationships: i64,
    /// Number of rows in the `vec_memories` table.
    vec_memories: i64,
}
39| |
/// Outcome of one named health check, serialized inside the `checks` array.
#[derive(Serialize)]
struct HealthCheck {
    /// Identifier of the check, e.g. `"integrity"`.
    name: String,
    /// Whether the check passed.
    ok: bool,
    /// Optional explanatory text (for a failed integrity check, the PRAGMA output).
    /// The key is left out of the JSON entirely when this is `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    detail: Option<String>,
}
47| |
/// JSON payload emitted by the `health` subcommand.
///
/// Fields are serialized in declaration order. Every `Option` field is left out
/// of the JSON when it is `None`. When the integrity check fails, a response
/// with `status = "degraded"`, all boolean checks `false`, zeroed counts and no
/// optional fields is emitted before the command returns an error.
#[derive(Serialize)]
struct HealthResponse {
    /// Overall status string; `"degraded"` when the integrity check fails.
    status: String,
    /// First row returned by `PRAGMA integrity_check` (`"ok"` for a healthy database).
    integrity: String,
    /// `true` when `integrity` equals `"ok"`.
    integrity_ok: bool,
    /// `true` when `schema_version` is greater than zero.
    schema_ok: bool,
    /// Whether the `vec_memories` table is listed in `sqlite_master`.
    vec_memories_ok: bool,
    /// NOTE(review): presumably memories without a `vec_memories` row — computation not visible here.
    vec_memories_missing: i64,
    /// NOTE(review): presumably `vec_memories` rows without a live memory — computation not visible here.
    vec_memories_orphaned: i64,
    /// Whether the `vec_entities` table is listed in `sqlite_master`.
    vec_entities_ok: bool,
    /// NOTE(review): presumably the same existence check for `vec_chunks` — confirm.
    vec_chunks_ok: bool,
    /// NOTE(review): presumably whether the FTS table exists — confirm.
    fts_ok: bool,
    /// Whether a live FTS5 MATCH query against fts_memories succeeded.
    fts_query_ok: bool,
    /// NOTE(review): presumably whether the embedding model is available — confirm.
    model_ok: bool,
    /// Row counts for the main tables.
    counts: HealthCounts,
    /// Display form of the resolved database path.
    db_path: String,
    /// Size of the database file in bytes (0 when the file metadata cannot be read).
    db_size_bytes: u64,
    /// MAX(version) from refinery_schema_history — number of the last applied migration.
    /// Distinct from PRAGMA schema_version (SQLite DDL counter) and PRAGMA user_version
    /// (canonical SCHEMA_USER_VERSION from __debug_schema).
    schema_version: u32,
    /// List of entities referenced by memories but absent from the entities table.
    /// Empty in a healthy DB. Per the contract documented in SKILL.md.
    missing_entities: Vec<String>,
    /// WAL file size in MB (0.0 if WAL does not exist or journal_mode != wal).
    wal_size_mb: f64,
    /// SQLite journaling mode (wal, delete, truncate, persist, memory, off).
    journal_mode: String,
    /// SQLite version string, e.g. `"3.46.0"`.
    sqlite_version: String,
    /// Fraction of relationships that use the `mentions` relation type (0.0–1.0).
    /// Omitted when there are no relationships in the database.
    #[serde(skip_serializing_if = "Option::is_none")]
    mentions_ratio: Option<f64>,
    /// Human-readable warning when `mentions` relationships dominate the graph (ratio > 0.5).
    /// Omitted when the ratio is within acceptable bounds or there are no relationships.
    #[serde(skip_serializing_if = "Option::is_none")]
    mentions_warning: Option<String>,
    /// The relation type with the highest edge count across all relationships
    /// in the database (the query applies no namespace filter).
    /// Omitted when there are no relationships in the database.
    #[serde(skip_serializing_if = "Option::is_none")]
    top_relation: Option<String>,
    /// Fraction of all edges occupied by `top_relation` (0.0–1.0).
    /// Omitted when there are no relationships in the database.
    #[serde(skip_serializing_if = "Option::is_none")]
    top_relation_ratio: Option<f64>,
    /// Fraction of relationships that use the `applies_to` relation type (0.0–1.0).
    /// Omitted when there are no relationships or when `applies_to` is absent.
    #[serde(skip_serializing_if = "Option::is_none")]
    applies_to_ratio: Option<f64>,
    /// Human-readable warning when a single relation type occupies more than 40 % of edges.
    /// Omitted when concentration is within acceptable bounds or there are no relationships.
    #[serde(skip_serializing_if = "Option::is_none")]
    relation_concentration_warning: Option<String>,
    /// Number of entities whose name differs from its normalized kebab-case form.
    #[serde(skip_serializing_if = "Option::is_none")]
    non_normalized_count: Option<i64>,
    /// Warning when non-normalized entities are detected.
    #[serde(skip_serializing_if = "Option::is_none")]
    normalization_warning: Option<String>,
    /// Number of entities with degree exceeding the super-hub threshold (default 50).
    #[serde(skip_serializing_if = "Option::is_none")]
    super_hub_count: Option<i64>,
    /// Warning listing top super-hub entity names.
    #[serde(skip_serializing_if = "Option::is_none")]
    super_hub_warning: Option<String>,
    /// Name of the entity with the highest connection count in the namespace.
    /// Omitted when there are no entities in the database.
    #[serde(skip_serializing_if = "Option::is_none")]
    top_hub_entity: Option<String>,
    /// Number of connections (degree) of `top_hub_entity`.
    /// Omitted when there are no entities in the database.
    #[serde(skip_serializing_if = "Option::is_none")]
    top_hub_degree: Option<i64>,
    /// Human-readable warning when `top_hub_entity` exceeds 50 connections.
    /// Omitted when degree is within acceptable bounds or there are no entities.
    #[serde(skip_serializing_if = "Option::is_none")]
    hub_warning: Option<String>,
    /// Individual named check results.
    checks: Vec<HealthCheck>,
    /// Wall-clock time spent by the command, in milliseconds.
    elapsed_ms: u64,
}
130| |
131| |/// Checks whether a table (including virtual ones) exists in sqlite_master.
132| 0|fn table_exists(conn: &rusqlite::Connection, table_name: &str) -> bool {
133| 0| conn.query_row(
134| 0| "SELECT COUNT(*) FROM sqlite_master WHERE type IN ('table', 'shadow') AND name = ?1",
135| 0| rusqlite::params![table_name],
136| 0| |r| r.get::<_, i64>(0),
137| | )
138| 0| .unwrap_or(0)
139| | > 0
140| 0|}
141| |
142| 0|pub fn run(args: HealthArgs) -> Result<(), AppError> {
143| 0| let start = Instant::now();
144| 0| let _ = args.json; // --json is a no-op because output is already JSON by default
145| 0| let _ = args.format; // --format is a no-op; JSON is always emitted on stdout
146| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
147| |
148| 0| crate::storage::connection::ensure_db_ready(&paths)?;
149| |
150| 0| let conn = open_ro(&paths.db)?;
151| |
152| 0| let integrity: String = conn.query_row("PRAGMA integrity_check;", [], |r| r.get(0))?;
153| 0| let integrity_ok = integrity == "ok";
154| 0| tracing::info!(target: "health", integrity_ok = %integrity_ok, "PRAGMA integrity_check complete");
155| |
156| 0| if !integrity_ok {
157| 0| let db_size_bytes = fs::metadata(&paths.db).map(|m| m.len()).unwrap_or(0);
158| 0| output::emit_json(&HealthResponse {
159| 0| status: "degraded".to_string(),
160| 0| integrity: integrity.clone(),
161| 0| integrity_ok: false,
162| 0| schema_ok: false,
163| 0| vec_memories_ok: false,
164| 0| vec_memories_missing: 0,
165| 0| vec_memories_orphaned: 0,
166| 0| vec_entities_ok: false,
167| 0| vec_chunks_ok: false,
168| 0| fts_ok: false,
169| 0| fts_query_ok: false,
170| 0| model_ok: false,
171| 0| counts: HealthCounts {
172| 0| memories: 0,
173| 0| memories_total: 0,
174| 0| entities: 0,
175| 0| relationships: 0,
176| 0| vec_memories: 0,
177| 0| },
178| 0| db_path: paths.db.display().to_string(),
179| 0| db_size_bytes,
180| 0| schema_version: 0,
181| 0| sqlite_version: "unknown".to_string(),
182| 0| missing_entities: vec![],
183| 0| wal_size_mb: 0.0,
184| 0| journal_mode: "unknown".to_string(),
185| 0| mentions_ratio: None,
186| 0| mentions_warning: None,
187| 0| top_relation: None,
188| 0| top_relation_ratio: None,
189| 0| applies_to_ratio: None,
190| 0| relation_concentration_warning: None,
191| 0| non_normalized_count: None,
192| 0| normalization_warning: None,
193| 0| super_hub_count: None,
194| 0| super_hub_warning: None,
195| 0| top_hub_entity: None,
196| 0| top_hub_degree: None,
197| 0| hub_warning: None,
198| 0| checks: vec![HealthCheck {
199| 0| name: "integrity".to_string(),
200| 0| ok: false,
201| 0| detail: Some(integrity),
202| 0| }],
203| 0| elapsed_ms: start.elapsed().as_millis() as u64,
204| 0| })?;
205| 0| return Err(AppError::Database(rusqlite::Error::SqliteFailure(
206| 0| rusqlite::ffi::Error::new(rusqlite::ffi::SQLITE_CORRUPT),
207| 0| Some("integrity check failed".to_string()),
208| 0| )));
209| 0| }
210| |
211| 0| let memories_count: i64 = conn.query_row(
212| 0| "SELECT COUNT(*) FROM memories WHERE deleted_at IS NULL",
213| 0| [],
214| 0| |r| r.get(0),
215| 0| )?;
216| 0| let entities_count: i64 = conn.query_row("SELECT COUNT(*) FROM entities", [], |r| r.get(0))?;
217| 0| let relationships_count: i64 =
218| 0| conn.query_row("SELECT COUNT(*) FROM relationships", [], |r| r.get(0))?;
219| 0| let vec_memories_count: i64 =
220| 0| conn.query_row("SELECT COUNT(*) FROM vec_memories", [], |r| r.get(0))?;
221| |
222| 0| let mentions_count: i64 = conn.query_row(
223| 0| "SELECT COUNT(*) FROM relationships WHERE relation = 'mentions'",
224| 0| [],
225| 0| |r| r.get(0),
226| 0| )?;
227| 0| let (mentions_ratio, mentions_warning) = if relationships_count > 0 {
228| 0| let ratio = mentions_count as f64 / relationships_count as f64;
229| 0| let warning = if ratio > 0.5 {
230| 0| Some(format!(
231| 0| "mentions relationships dominate graph at {:.1}% ({}/{} total); consider running prune-relations --relation mentions --dry-run",
232| 0| ratio * 100.0,
233| 0| mentions_count,
234| 0| relationships_count
235| 0| ))
236| | } else {
237| 0| None
238| | };
239| 0| (Some(ratio), warning)
240| | } else {
241| 0| (None, None)
242| | };
243| |
244| | // Relation concentration: find the most frequent relation type and check threshold.
245| 0| let (top_relation, top_relation_ratio, applies_to_ratio, relation_concentration_warning) =
246| 0| if relationships_count > 0 {
247| | // Identify the relation with the highest edge count.
248| 0| let (top_rel, top_count): (String, i64) = conn
249| 0| .query_row(
250| 0| "SELECT relation, COUNT(*) AS cnt
251| 0| FROM relationships
252| 0| GROUP BY relation
253| 0| ORDER BY cnt DESC
254| 0| LIMIT 1",
255| 0| [],
256| 0| |r| Ok((r.get::<_, String>(0)?, r.get::<_, i64>(1)?)),
257| | )
258| 0| .unwrap_or_else(|_| ("unknown".to_string(), 0));
259| |
260| 0| let top_ratio = top_count as f64 / relationships_count as f64;
261| |
262| | // Compute applies_to ratio separately (may be 0 if absent).
263| 0| let applies_count: i64 = conn
264| 0| .query_row(
265| 0| "SELECT COUNT(*) FROM relationships WHERE relation = 'applies_to'",
266| 0| [],
267| 0| |r| r.get(0),
268| | )
269| 0| .unwrap_or(0);
270| 0| let at_ratio = if applies_count > 0 {
271| 0| Some(applies_count as f64 / relationships_count as f64)
272| | } else {
273| 0| None
274| | };
275| |
276| 0| let concentration_warning = if top_ratio > 0.40 {
277| 0| Some(format!(
278| 0| "relation '{}' dominates graph at {:.1}% ({}/{} total); consider running prune-relations --relation {} --dry-run",
279| 0| top_rel,
280| 0| top_ratio * 100.0,
281| 0| top_count,
282| 0| relationships_count,
283| 0| top_rel,
284| 0| ))
285| | } else {
286| 0| None
287| | };
288| |
289| 0| (
290| 0| Some(top_rel),
291| 0| Some(top_ratio),
292| 0| at_ratio,
293| 0| concentration_warning,
294| 0| )
295| | } else {
296| 0| (None, None, None, None)
297| | };
298| |
299| 0| let status = "ok";
300| |
301| 0| let schema_version: u32 = conn
302| 0| .query_row(
303| 0| "SELECT COALESCE(MAX(version), 0) FROM refinery_schema_history",
304| 0| [],
305| 0| |r| r.get::<_, i64>(0),
306| | )
307| 0| .unwrap_or(0) as u32;
308| |
309| 0| let schema_ok = schema_version > 0;
310| |
311| | // Checks vector tables via sqlite_master
312| 0| let vec_memories_ok = table_exists(&conn, "vec_memories");
313| 0| let vec_entities_ok = table_exists(&conn, "vec_entities");
314| 0| let vec_chunks_ok = table_exists(&conn, "vec_chunks");
315| |
316| 0| let vec_memories_missing: i64 = if vec_memories_ok {
317| 0| conn.query_row(
318| 0| "SELECT COUNT(*) FROM memories m LEFT JOIN vec_memories v ON v.memory_id = m.id WHERE v.memory_id IS NULL AND m.deleted_at IS NULL",
319| 0| [], |r| r.get(0),
320| 0| ).unwrap_or(0)
321| | } else {
322| 0| 0
323| | };
324| |
325| 0| let vec_memories_orphaned: i64 = if vec_memories_ok {
326| 0| conn.query_row(
327| 0| "SELECT COUNT(*) FROM vec_memories v LEFT JOIN memories m ON m.id = v.memory_id WHERE m.id IS NULL",
328| 0| [], |r| r.get(0),
329| 0| ).unwrap_or(0)
330| | } else {
331| 0| 0
332| | };
333| |
334| 0| tracing::info!(target: "health", vec_memories_ok = %vec_memories_ok, vec_entities_ok = %vec_entities_ok, vec_missing = vec_memories_missing, vec_orphaned = vec_memories_orphaned, "vector table checks complete");
335| 0| let fts_ok = table_exists(&conn, "fts_memories");
336| |
337| | // Verifies that FTS5 can execute a MATCH query (catches index corruption distinct from table absence).
338| 0| let fts_query_ok = if fts_ok {
339| 0| conn.query_row(
340| 0| "SELECT COUNT(*) FROM fts_memories WHERE fts_memories MATCH 'a' LIMIT 1",
341| 0| [],
342| 0| |r| r.get::<_, i64>(0),
343| | )
344| 0| .is_ok()
345| | } else {
346| 0| false
347| | };
348| |
349| 0| tracing::info!(target: "health", fts_ok = %fts_ok, fts_query_ok = %fts_query_ok, "FTS5 checks complete");
350| |
351| | // Captures the SQLite runtime version for observability.
352| 0| let sqlite_version: String = conn
353| 0| .query_row("SELECT sqlite_version()", [], |r| r.get(0))
354| 0| .unwrap_or_else(|_| "unknown".to_string());
355| |
356| | // Detects orphan entities referenced by memories but absent from the entities table.
357| 0| let mut missing_entities: Vec<String> = Vec::with_capacity(4);
358| 0| let mut stmt = conn.prepare_cached(
359| 0| "SELECT DISTINCT me.entity_id
360| 0| FROM memory_entities me
361| 0| LEFT JOIN entities e ON e.id = me.entity_id
362| 0| WHERE e.id IS NULL",
363| 0| )?;
364| 0| let orphans: Vec<i64> = stmt
365| 0| .query_map([], |r| r.get(0))?
366| 0| .collect::<Result<Vec<_>, _>>()?;
367| 0| for id in orphans {
368| 0| missing_entities.push(format!("entity_id={id}"));
369| 0| }
370| |
371| 0| let journal_mode: String = conn
372| 0| .query_row("PRAGMA journal_mode", [], |row| row.get::<_, String>(0))
373| 0| .unwrap_or_else(|_| "unknown".to_string());
374| |
375| 0| let wal_size_mb = fs::metadata(format!("{}-wal", paths.db.display()))
376| 0| .map(|m| m.len() as f64 / 1024.0 / 1024.0)
377| 0| .unwrap_or(0.0);
378| |
379| | // Database file size in bytes
380| 0| let db_size_bytes = fs::metadata(&paths.db).map(|m| m.len()).unwrap_or(0);
381| |
382| | // Checks whether the ONNX model is present in the cache
383| 0| let model_dir = paths.models.join("models--intfloat--multilingual-e5-small");
384| 0| let model_ok = model_dir.exists();
385| 0| tracing::info!(target: "health", model_ok = %model_ok, "embedding model check complete");
386| |
387| | // Builds the checks array for detailed diagnostics
388| 0| let mut checks: Vec<HealthCheck> = Vec::with_capacity(8);
389| |
390| | // At this point integrity_ok is always true (corrupt DB returned early above).
391| 0| checks.push(HealthCheck {
392| 0| name: "integrity".to_string(),
393| 0| ok: true,
394| 0| detail: None,
395| 0| });
396| |
397| 0| checks.push(HealthCheck {
398| 0| name: "schema_version".to_string(),
399| 0| ok: schema_ok,
400| 0| detail: if schema_ok {
401| 0| None
402| | } else {
403| 0| Some(format!("schema_version={schema_version} (expected >0)"))
404| | },
405| | });
406| |
407| 0| checks.push(HealthCheck {
408| 0| name: "vec_memories".to_string(),
409| 0| ok: vec_memories_ok,
410| 0| detail: if vec_memories_ok {
411| 0| None
412| | } else {
413| 0| Some("vec_memories table missing from sqlite_master".to_string())
414| | },
415| | });
416| |
417| 0| checks.push(HealthCheck {
418| 0| name: "vec_entities".to_string(),
419| 0| ok: vec_entities_ok,
420| 0| detail: if vec_entities_ok {
421| 0| None
422| | } else {
423| 0| Some("vec_entities table missing from sqlite_master".to_string())
424| | },
425| | });
426| |
427| 0| checks.push(HealthCheck {
428| 0| name: "vec_chunks".to_string(),
429| 0| ok: vec_chunks_ok,
430| 0| detail: if vec_chunks_ok {
431| 0| None
432| | } else {
433| 0| Some("vec_chunks table missing from sqlite_master".to_string())
434| | },
435| | });
436| |
437| 0| checks.push(HealthCheck {
438| 0| name: "fts_memories".to_string(),
439| 0| ok: fts_ok,
440| 0| detail: if fts_ok {
441| 0| None
442| | } else {
443| 0| Some("fts_memories table missing from sqlite_master".to_string())
444| | },
445| | });
446| |
447| 0| checks.push(HealthCheck {
448| 0| name: "fts_query".to_string(),
449| 0| ok: fts_query_ok,
450| 0| detail: if fts_query_ok {
451| 0| None
452| | } else {
453| 0| Some("FTS5 MATCH query failed — run 'sqlite-graphrag fts rebuild'".to_string())
454| | },
455| | });
456| |
457| 0| checks.push(HealthCheck {
458| 0| name: "model_onnx".to_string(),
459| 0| ok: model_ok,
460| 0| detail: if model_ok {
461| 0| None
462| | } else {
463| 0| Some(format!(
464| 0| "model missing at {}; run 'sqlite-graphrag models download'",
465| 0| model_dir.display()
466| 0| ))
467| | },
468| | });
469| |
470| | // G24: detect non-normalized entity names
471| 0| let (non_normalized_count, normalization_warning) = {
472| 0| let mut stmt = conn.prepare_cached("SELECT name FROM entities")?;
473| 0| let names: Vec<String> = stmt
474| 0| .query_map([], |r| r.get(0))?
475| 0| .filter_map(|r| r.ok())
476| 0| .collect();
477| 0| let count = names
478| 0| .iter()
479| 0| .filter(|n| crate::parsers::normalize_entity_name(n) != **n)
480| 0| .count() as i64;
481| 0| let warning = if count > 0 {
482| 0| Some(format!(
483| 0| "run 'normalize-entities --yes' to fix {count} non-normalized entities"
484| 0| ))
485| | } else {
486| 0| None
487| | };
488| 0| (Some(count), warning)
489| | };
490| |
491| | // G25: detect super-hub entities (degree > 50)
492| 0| let (super_hub_count, super_hub_warning) = {
493| 0| let mut stmt = conn.prepare_cached(
494| 0| "SELECT e.name, COUNT(r.id) as deg FROM entities e \
495| 0| LEFT JOIN relationships r ON e.id = r.source_id OR e.id = r.target_id \
496| 0| GROUP BY e.id HAVING deg > 50 ORDER BY deg DESC LIMIT 5",
497| 0| )?;
498| 0| let hubs: Vec<(String, i64)> = stmt
499| 0| .query_map([], |r| Ok((r.get(0)?, r.get(1)?)))?
500| 0| .filter_map(|r| r.ok())
501| 0| .collect();
502| 0| let count = hubs.len() as i64;
503| 0| let warning = if count > 0 {
504| 0| let names: Vec<String> = hubs
505| 0| .iter()
506| 0| .map(|(n, d)| format!("{n} (degree {d})"))
507| 0| .collect();
508| 0| Some(format!("super-hubs detected: {}", names.join(", ")))
509| | } else {
510| 0| None
511| | };
512| 0| (Some(count), warning)
513| | };
514| |
515| | // G25 (extended): identify the single highest-degree entity for programmatic use.
516| 0| let (top_hub_entity, top_hub_degree, hub_warning) = {
517| 0| let result: Option<(String, i64)> = conn
518| 0| .query_row(
519| 0| "SELECT e.name, COUNT(r.id) AS degree
520| 0| FROM entities e
521| 0| LEFT JOIN relationships r ON e.id = r.source_id OR e.id = r.target_id
522| 0| GROUP BY e.id
523| 0| ORDER BY degree DESC
524| 0| LIMIT 1",
525| 0| [],
526| 0| |r| Ok((r.get::<_, String>(0)?, r.get::<_, i64>(1)?)),
527| | )
528| 0| .ok();
529| 0| match result {
530| 0| Some((name, degree)) => {
531| 0| let warning = if degree > 50 {
532| 0| Some(format!(
533| 0| "entity '{name}' has {degree} connections; consider splitting or using --max-neighbors-per-hop"
534| 0| ))
535| | } else {
536| 0| None
537| | };
538| 0| (Some(name), Some(degree), warning)
539| | }
540| 0| None => (None, None, None),
541| | }
542| | };
543| |
544| 0| let response = HealthResponse {
545| 0| status: status.to_string(),
546| 0| integrity,
547| 0| integrity_ok,
548| 0| schema_ok,
549| 0| vec_memories_ok,
550| 0| vec_memories_missing,
551| 0| vec_memories_orphaned,
552| 0| vec_entities_ok,
553| 0| vec_chunks_ok,
554| 0| fts_ok,
555| 0| fts_query_ok,
556| 0| model_ok,
557| 0| counts: HealthCounts {
558| 0| memories: memories_count,
559| 0| memories_total: memories_count,
560| 0| entities: entities_count,
561| 0| relationships: relationships_count,
562| 0| vec_memories: vec_memories_count,
563| 0| },
564| 0| db_path: paths.db.display().to_string(),
565| 0| db_size_bytes,
566| 0| schema_version,
567| 0| sqlite_version,
568| 0| missing_entities,
569| 0| wal_size_mb,
570| 0| journal_mode,
571| 0| mentions_ratio,
572| 0| mentions_warning,
573| 0| top_relation,
574| 0| top_relation_ratio,
575| 0| applies_to_ratio,
576| 0| relation_concentration_warning,
577| 0| non_normalized_count,
578| 0| normalization_warning,
579| 0| super_hub_count,
580| 0| super_hub_warning,
581| 0| top_hub_entity,
582| 0| top_hub_degree,
583| 0| hub_warning,
584| 0| checks,
585| 0| elapsed_ms: start.elapsed().as_millis() as u64,
586| 0| };
587| |
588| 0| output::emit_json(&response)?;
589| |
590| 0| Ok(())
591| 0|}
592| |
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a fully populated, healthy [`HealthResponse`] fixture.
    ///
    /// Only the relation-concentration fields are parameterised. Tests that need
    /// different values for other fields override them with struct-update syntax
    /// (`HealthResponse { field: value, ..make_full_response(..) }`), so adding a
    /// field to `HealthResponse` requires touching this helper only.
    fn make_full_response(
        top_relation: Option<String>,
        top_relation_ratio: Option<f64>,
        applies_to_ratio: Option<f64>,
        relation_concentration_warning: Option<String>,
    ) -> HealthResponse {
        HealthResponse {
            status: "ok".to_string(),
            integrity: "ok".to_string(),
            integrity_ok: true,
            schema_ok: true,
            vec_memories_ok: true,
            vec_memories_missing: 0,
            vec_memories_orphaned: 0,
            vec_entities_ok: true,
            vec_chunks_ok: true,
            fts_ok: true,
            fts_query_ok: true,
            model_ok: true,
            counts: HealthCounts {
                memories: 10,
                memories_total: 10,
                entities: 5,
                relationships: 20,
                vec_memories: 10,
            },
            db_path: "/tmp/test.sqlite".to_string(),
            db_size_bytes: 8192,
            schema_version: 3,
            sqlite_version: "3.46.0".to_string(),
            elapsed_ms: 1,
            missing_entities: vec![],
            wal_size_mb: 0.0,
            journal_mode: "wal".to_string(),
            mentions_ratio: None,
            mentions_warning: None,
            top_relation,
            top_relation_ratio,
            applies_to_ratio,
            relation_concentration_warning,
            non_normalized_count: None,
            normalization_warning: None,
            super_hub_count: None,
            super_hub_warning: None,
            top_hub_entity: None,
            top_hub_degree: None,
            hub_warning: None,
            checks: vec![],
        }
    }

    #[test]
    fn health_check_serializes_all_new_fields() {
        // Same fixture values as before; only the fields that differ from the
        // shared helper are spelled out.
        let response = HealthResponse {
            model_ok: false,
            counts: HealthCounts {
                memories: 5,
                memories_total: 5,
                entities: 3,
                relationships: 2,
                vec_memories: 5,
            },
            db_size_bytes: 4096,
            schema_version: 6,
            elapsed_ms: 0,
            checks: vec![
                HealthCheck {
                    name: "integrity".to_string(),
                    ok: true,
                    detail: None,
                },
                HealthCheck {
                    name: "model_onnx".to_string(),
                    ok: false,
                    detail: Some("model missing".to_string()),
                },
            ],
            ..make_full_response(None, None, None, None)
        };

        let json = serde_json::to_value(&response).unwrap();
        assert_eq!(json["status"], "ok");
        assert_eq!(json["integrity_ok"], true);
        assert_eq!(json["schema_ok"], true);
        assert_eq!(json["vec_memories_ok"], true);
        assert_eq!(json["vec_entities_ok"], true);
        assert_eq!(json["vec_chunks_ok"], true);
        assert_eq!(json["fts_ok"], true);
        assert_eq!(json["model_ok"], false);
        assert_eq!(json["db_size_bytes"], 4096u64);
        assert!(json["checks"].is_array());
        assert_eq!(json["checks"].as_array().unwrap().len(), 2);

        // Verifies that detail is absent when ok=true (skip_serializing_if)
        let integrity_check = &json["checks"][0];
        assert_eq!(integrity_check["name"], "integrity");
        assert_eq!(integrity_check["ok"], true);
        assert!(integrity_check.get("detail").is_none());

        // Verifies that detail is present when ok=false
        let model_check = &json["checks"][1];
        assert_eq!(model_check["name"], "model_onnx");
        assert_eq!(model_check["ok"], false);
        assert_eq!(model_check["detail"], "model missing");
    }

    #[test]
    fn health_check_without_detail_omits_field() {
        let check = HealthCheck {
            name: "vec_memories".to_string(),
            ok: true,
            detail: None,
        };
        let json = serde_json::to_value(&check).unwrap();
        assert!(
            json.get("detail").is_none(),
            "detail field must be omitted when None"
        );
    }

    #[test]
    fn health_check_with_detail_serializes_field() {
        let check = HealthCheck {
            name: "fts_memories".to_string(),
            ok: false,
            detail: Some("fts_memories table missing from sqlite_master".to_string()),
        };
        let json = serde_json::to_value(&check).unwrap();
        assert_eq!(
            json["detail"],
            "fts_memories table missing from sqlite_master"
        );
    }

    #[test]
    fn health_response_fts_query_ok_and_sqlite_version_serialize() {
        // Verifies that fts_query_ok and sqlite_version appear in the serialized JSON
        // with the expected keys and values. Represents an empty database.
        let response = HealthResponse {
            counts: HealthCounts {
                memories: 0,
                memories_total: 0,
                entities: 0,
                relationships: 0,
                vec_memories: 0,
            },
            db_size_bytes: 0,
            schema_version: 1,
            sqlite_version: "3.45.1".to_string(),
            elapsed_ms: 0,
            ..make_full_response(None, None, None, None)
        };

        let json = serde_json::to_value(&response).unwrap();

        // fts_query_ok must appear at the top level
        assert_eq!(
            json["fts_query_ok"], true,
            "fts_query_ok must be present and true in serialized JSON"
        );

        // sqlite_version must appear at the top level with the exact string
        assert_eq!(
            json["sqlite_version"], "3.45.1",
            "sqlite_version must be present and match the provided string"
        );

        // Verify fts_query_ok=false path includes the expected detail message
        let check_fail = HealthCheck {
            name: "fts_query".to_string(),
            ok: false,
            detail: Some("FTS5 MATCH query failed — run 'sqlite-graphrag fts rebuild'".to_string()),
        };
        let check_json = serde_json::to_value(&check_fail).unwrap();
        assert_eq!(check_json["name"], "fts_query");
        assert_eq!(check_json["ok"], false);
        assert_eq!(
            check_json["detail"],
            "FTS5 MATCH query failed — run 'sqlite-graphrag fts rebuild'"
        );
    }

    // The concentration tests below hand-build the ratio/warning values, so they
    // pin the JSON serialization contract only, not the threshold computation.

    #[test]
    fn health_concentration_fields_omitted_when_no_relationships() {
        // Represents a DB with zero relationships.
        let resp = make_full_response(None, None, None, None);
        let json = serde_json::to_value(&resp).unwrap();
        assert!(
            json.get("top_relation").is_none(),
            "top_relation must be omitted when None"
        );
        assert!(
            json.get("top_relation_ratio").is_none(),
            "top_relation_ratio must be omitted when None"
        );
        assert!(
            json.get("applies_to_ratio").is_none(),
            "applies_to_ratio must be omitted when None"
        );
        assert!(
            json.get("relation_concentration_warning").is_none(),
            "relation_concentration_warning must be omitted when None"
        );
    }

    #[test]
    fn health_concentration_fields_present_with_data() {
        let resp = make_full_response(
            Some("mentions".to_string()),
            Some(0.60),
            Some(0.10),
            Some("relation 'mentions' dominates graph at 60.0%".to_string()),
        );
        let json = serde_json::to_value(&resp).unwrap();
        assert_eq!(json["top_relation"], "mentions");
        assert!((json["top_relation_ratio"].as_f64().unwrap() - 0.60).abs() < 1e-9);
        assert!((json["applies_to_ratio"].as_f64().unwrap() - 0.10).abs() < 1e-9);
        assert!(json["relation_concentration_warning"]
            .as_str()
            .unwrap()
            .contains("60.0%"));
    }

    #[test]
    fn health_concentration_warning_absent_when_ratio_below_threshold() {
        // top_relation_ratio of 0.39 is below the 0.40 threshold — no warning.
        let resp = make_full_response(Some("uses".to_string()), Some(0.39), None, None);
        let json = serde_json::to_value(&resp).unwrap();
        assert_eq!(json["top_relation"], "uses");
        assert!(
            json.get("relation_concentration_warning").is_none(),
            "warning must be absent when ratio <= 0.40"
        );
    }

    #[test]
    fn health_concentration_warning_present_at_threshold() {
        // 0.41 is just above the strict `> 0.40` threshold — warning must appear.
        let resp = make_full_response(
            Some("depends_on".to_string()),
            Some(0.41),
            None,
            Some("relation 'depends_on' dominates graph at 41.0%".to_string()),
        );
        let json = serde_json::to_value(&resp).unwrap();
        assert!(
            json["relation_concentration_warning"].is_string(),
            "warning must be present when top_relation_ratio > 0.40"
        );
    }

    #[test]
    fn health_applies_to_ratio_omitted_when_none() {
        // applies_to_ratio is None when there are no applies_to edges.
        let resp = make_full_response(Some("related".to_string()), Some(0.30), None, None);
        let json = serde_json::to_value(&resp).unwrap();
        assert!(
            json.get("applies_to_ratio").is_none(),
            "applies_to_ratio must be omitted when None"
        );
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/history.rs:
1| |//! Handler for the `history` CLI subcommand.
2| |
3| |use crate::errors::AppError;
4| |use crate::i18n::errors_msg;
5| |use crate::output;
6| |use crate::paths::AppPaths;
7| |use crate::storage::connection::open_ro;
8| |use rusqlite::params;
9| |use rusqlite::OptionalExtension;
10| |use serde::Serialize;
11| |
// Command-line arguments accepted by the `history` subcommand.
//
// Reviewer notes use plain `//` comments on purpose: with clap's derive API,
// `///` doc comments on the struct and its fields become user-visible help text.
//
// NOTE(review): the DIFF OUTPUT help below says every version "except the first"
// carries a `changes` object, but the handler in this file also populates
// `changes` on the first version (as growth from an empty body). Confirm which
// of the two is the intended contract and align the other.
#[derive(clap::Args)]
#[command(after_long_help = "EXAMPLES:\n \
    # List all versions of a memory (positional form)\n \
    sqlite-graphrag history onboarding\n\n \
    # List versions using the named flag form\n \
    sqlite-graphrag history --name onboarding\n\n \
    # Omit body content to reduce response size\n \
    sqlite-graphrag history onboarding --no-body\n\n \
    # Include character-level change summary between versions\n \
    sqlite-graphrag history onboarding --diff\n\n\
DIFF OUTPUT:\n \
    When --diff is active, each version (except the first) includes a `changes`\n \
    object with `added_chars` and `removed_chars` — the character count difference\n \
    between that version and its predecessor.")]
pub struct HistoryArgs {
    /// Memory name as a positional argument. Alternative to `--name`.
    #[arg(
        value_name = "NAME",
        conflicts_with = "name",
        help = "Memory name whose version history to return; alternative to --name"
    )]
    pub name_positional: Option<String>,
    /// Memory name whose version history will be returned. Includes soft-deleted memories
    /// so that `restore --version <V>` workflow remains discoverable after `forget`.
    #[arg(long)]
    pub name: Option<String>,
    // Optional namespace override; the handler passes it to
    // `crate::namespace::resolve_namespace`, which picks the effective namespace.
    #[arg(
        long,
        help = "Namespace (env: SQLITE_GRAPHRAG_NAMESPACE, default: global)"
    )]
    pub namespace: Option<String>,
    /// Omit body content from each version to reduce response size.
    #[arg(
        long,
        default_value_t = false,
        help = "Omit body content from response"
    )]
    pub no_body: bool,
    /// Include character-level change summary between consecutive versions.
    #[arg(
        long,
        default_value_t = false,
        help = "Include character-level change summary between consecutive versions"
    )]
    pub diff: bool,
    // Hidden flag that the handler never reads; JSON is emitted unconditionally.
    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
    pub json: bool,
    /// Path to graphrag.sqlite (overrides SQLITE_GRAPHRAG_DB_PATH and default CWD).
    #[arg(
        long,
        env = "SQLITE_GRAPHRAG_DB_PATH",
        help = "Path to graphrag.sqlite"
    )]
    pub db: Option<String>,
}
67| |
/// Character-level change summary between two consecutive versions.
///
/// This is a pure length comparison, not a textual diff: at most one of the two
/// fields is non-zero for any given version.
#[derive(Serialize)]
struct VersionChanges {
    /// How much longer the body is than its predecessor's (0 when it shrank).
    added_chars: usize,
    /// How much shorter the body is than its predecessor's (0 when it grew).
    removed_chars: usize,
}
74| |
/// One row of `memory_versions`, shaped for the `history` JSON response.
///
/// Field order is the serialized key order, so it is part of the output contract.
#[derive(Serialize)]
struct HistoryVersion {
    /// Version number; the handler returns versions in ascending order.
    version: i64,
    /// Memory name as recorded for this version.
    name: String,
    /// Memory type; serialized under the JSON key `type`.
    #[serde(rename = "type")]
    memory_type: String,
    description: String,
    /// Version body; omitted from the JSON entirely when `--no-body` is passed.
    #[serde(skip_serializing_if = "Option::is_none")]
    body: Option<String>,
    /// Parsed metadata JSON; the handler falls back to an empty object when the
    /// stored text does not parse.
    metadata: serde_json::Value,
    /// Past-tense action label derived from `change_reason`; always populated
    /// so consumers do not see `null` for the documented `action` contract
    /// (M-A6 fix in v1.0.40). Known mappings: `create→created`, `edit→edited`,
    /// `update→updated`, `rename→renamed`, `restore→restored`, `merge→merged`,
    /// `forget→forgotten`. Unknown verbs are passed through unchanged.
    action: String,
    /// Raw reason string exactly as stored in `memory_versions`.
    change_reason: String,
    /// Author of the change; serialized as `null` when absent (no skip attribute).
    changed_by: Option<String>,
    /// Creation timestamp as stored (epoch value fed to `crate::tz::epoch_to_iso`).
    created_at: i64,
    /// `created_at` rendered by `crate::tz::epoch_to_iso`.
    created_at_iso: String,
    /// Length-change summary; present only when `--diff` is requested.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub changes: Option<VersionChanges>,
}
98| |
/// Translates the raw `change_reason` stored in `memory_versions` into the
/// past-tense `action` label exposed in the JSON contract.
///
/// Known verbs are looked up in a fixed table; any other input (including the
/// empty string) is returned unchanged. Kept as a single function so other call
/// sites (e.g. `read --include-history`) can share the mapping.
fn change_reason_to_action(reason: &str) -> String {
    // (stored verb, past-tense label)
    const PAST_TENSE: [(&str, &str); 7] = [
        ("create", "created"),
        ("edit", "edited"),
        ("update", "updated"),
        ("rename", "renamed"),
        ("restore", "restored"),
        ("merge", "merged"),
        ("forget", "forgotten"),
    ];

    PAST_TENSE
        .iter()
        .find(|(verb, _)| *verb == reason)
        .map_or(reason, |&(_, label)| label)
        .to_string()
}
115| |
/// Top-level JSON document emitted by the `history` subcommand.
#[derive(Serialize)]
struct HistoryResponse {
    /// Memory name the history was requested for.
    name: String,
    /// Namespace the memory was looked up in, after resolution.
    namespace: String,
    /// True when the memory is currently soft-deleted (forgotten).
    /// Allows the user to discover the version for `restore` even after `forget`.
    deleted: bool,
    /// All recorded versions of the memory, in ascending version order.
    versions: Vec<HistoryVersion>,
    /// Total execution time in milliseconds from handler start to serialisation.
    elapsed_ms: u64,
}
127| |
128| 0|pub fn run(args: HistoryArgs) -> Result<(), AppError> {
129| 0| let start = std::time::Instant::now();
130| | // Resolve name from positional or --name flag; both are optional, at least one is required.
131| 0| let name = args.name_positional.or(args.name).ok_or_else(|| {
132| 0| AppError::Validation("name required: pass as positional argument or via --name".to_string())
133| 0| })?;
134| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
135| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
136| 0| crate::storage::connection::ensure_db_ready(&paths)?;
137| 0| let conn = open_ro(&paths.db)?;
138| |
139| | // v1.0.22 P0: direct query WITHOUT deleted_at filter — history MUST return versions
140| | // of forgotten memories so the user can discover the version to use in `restore`.
141| | // The old find_by_name filtered deleted_at IS NULL and was a dead-end in the forget+restore workflow.
142| 0| let row: Option<(i64, Option<i64>)> = conn
143| 0| .query_row(
144| 0| "SELECT id, deleted_at FROM memories WHERE namespace = ?1 AND name = ?2",
145| 0| params![namespace, name],
146| 0| |r| Ok((r.get(0)?, r.get(1)?)),
147| | )
148| 0| .optional()?;
149| 0| let (memory_id, deleted_at) =
150| 0| row.ok_or_else(|| AppError::NotFound(errors_msg::memory_not_found(&name, &namespace)))?;
151| 0| let deleted = deleted_at.is_some();
152| |
153| 0| let mut stmt = conn.prepare_cached(
154| 0| "SELECT version, name, type, description, body, metadata,
155| 0| change_reason, changed_by, created_at
156| 0| FROM memory_versions
157| 0| WHERE memory_id = ?1
158| 0| ORDER BY version ASC",
159| 0| )?;
160| |
161| 0| let no_body = args.no_body;
162| 0| let want_diff = args.diff;
163| 0| let mut versions = stmt
164| 0| .query_map(params![memory_id], |r| {
165| 0| let created_at: i64 = r.get(8)?;
166| 0| let created_at_iso = crate::tz::epoch_to_iso(created_at);
167| 0| let body_str: String = r.get(4)?;
168| 0| let metadata_str: String = r.get(5)?;
169| 0| let metadata_value: serde_json::Value = serde_json::from_str(&metadata_str)
170| 0| .unwrap_or(serde_json::Value::Object(serde_json::Map::new()));
171| 0| let change_reason: String = r.get(6)?;
172| 0| let action = change_reason_to_action(&change_reason);
173| | Ok(HistoryVersion {
174| 0| version: r.get(0)?,
175| 0| name: r.get(1)?,
176| 0| memory_type: r.get(2)?,
177| 0| description: r.get(3)?,
178| 0| body: if no_body { None } else { Some(body_str) },
179| 0| metadata: metadata_value,
180| 0| action,
181| 0| change_reason,
182| 0| changed_by: r.get(7)?,
183| 0| created_at,
184| 0| created_at_iso,
185| 0| changes: None,
186| | })
187| 0| })?
188| 0| .collect::<Result<Vec<_>, _>>()?;
189| |
190| 0| if want_diff && !versions.is_empty() {
191| 0| let body_lens: Vec<usize> = versions
192| 0| .iter()
193| 0| .map(|v| v.body.as_deref().map_or(0, str::len))
194| 0| .collect();
195| |
196| 0| versions[0].changes = Some(VersionChanges {
197| 0| added_chars: body_lens[0],
198| 0| removed_chars: 0,
199| 0| });
200| |
201| 0| for i in 1..versions.len() {
202| 0| let old_len = body_lens[i - 1];
203| 0| let new_len = body_lens[i];
204| 0| versions[i].changes = Some(VersionChanges {
205| 0| added_chars: new_len.saturating_sub(old_len),
206| 0| removed_chars: old_len.saturating_sub(new_len),
207| 0| });
208| 0| }
209| 0| }
210| |
211| 0| output::emit_json(&HistoryResponse {
212| 0| name,
213| 0| namespace,
214| 0| deleted,
215| 0| versions,
216| 0| elapsed_ms: start.elapsed().as_millis() as u64,
217| 0| })?;
218| |
219| 0| Ok(())
220| 0|}
221| |
222| |#[cfg(test)]
223| |mod tests {
224| | use super::{change_reason_to_action, VersionChanges};
225| |
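    // Serialization shape of the `changes` payload emitted by --diff. The Bug M-A6
    // tests (action is always populated and maps known reasons to past tense) are
    // the `change_reason_*` tests further below.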
#[test]
fn version_changes_serializes_correctly() {
    // Both counters must surface under their own keys with the exact values given.
    let payload = VersionChanges {
        added_chars: 10,
        removed_chars: 3,
    };
    let serialized = serde_json::to_value(&payload).expect("serialization failed");
    let added = &serialized["added_chars"];
    let removed = &serialized["removed_chars"];
    assert_eq!(*added, 10u64);
    assert_eq!(*removed, 3u64);
}
237| |
#[test]
fn added_chars_saturating_sub_no_underflow() {
    // Body shrank from 100 to 40: growth clamps to 0 instead of wrapping around,
    // and the whole difference is reported as removed.
    let (previous, current): (usize, usize) = (100, 40);
    assert_eq!(current.saturating_sub(previous), 0);
    assert_eq!(previous.saturating_sub(current), 60);
}
248| |
#[test]
fn removed_chars_saturating_sub_no_underflow() {
    // Body grew from 20 to 80: shrinkage clamps to 0 and the whole difference
    // is reported as added.
    let (previous, current): (usize, usize) = (20, 80);
    assert_eq!(current.saturating_sub(previous), 60);
    assert_eq!(previous.saturating_sub(current), 0);
}
259| |
#[test]
fn change_reason_create_maps_to_created() {
    // `create` is reported in the past tense.
    let action = change_reason_to_action("create");
    assert_eq!(action, "created");
}
264| |
#[test]
fn change_reason_edit_maps_to_edited() {
    // `edit` is reported in the past tense.
    let action = change_reason_to_action("edit");
    assert_eq!(action, "edited");
}
269| |
#[test]
fn change_reason_rename_maps_to_renamed() {
    // `rename` is reported in the past tense.
    let action = change_reason_to_action("rename");
    assert_eq!(action, "renamed");
}
274| |
275| | #[test]
276| 1| fn change_reason_restore_maps_to_restored() {
277| 1| assert_eq!(change_reason_to_action("restore"), "restored");
278| 1| }
279| |
280| | #[test]
281| 1| fn change_reason_merge_maps_to_merged() {
282| 1| assert_eq!(change_reason_to_action("merge"), "merged");
283| 1| }
284| |
285| | #[test]
286| 1| fn change_reason_forget_maps_to_forgotten() {
287| 1| assert_eq!(change_reason_to_action("forget"), "forgotten");
288| 1| }
289| |
290| | #[test]
291| 1| fn change_reason_unknown_passes_through() {
292| 1| assert_eq!(change_reason_to_action("custom-action"), "custom-action");
293| 1| }
294| |
295| | #[test]
296| 1| fn epoch_zero_yields_valid_iso() {
297| | // v1.0.68 (test fix): timezone-agnostic — parse the ISO and compare
298| | // the instant with the Unix epoch. The previous starts_with check
299| | // leaked the SQLITE_GRAPHRAG_DISPLAY_TZ env var from sibling tests
300| | // and failed on hosts whose default display timezone is not UTC.
301| 1| let iso = crate::tz::epoch_to_iso(0);
302| 1| let parsed = chrono::DateTime::parse_from_rfc3339(&iso)
303| 1| .unwrap_or_else(|e| panic!("expected RFC3339, got `{iso}`: {e}"));
^0
304| 1| assert_eq!(
305| 1| parsed.timestamp(),
306| 1| chrono::DateTime::UNIX_EPOCH.timestamp(),
307| 0| "epoch 0 must map to the Unix epoch instant, got: {iso}"
308| | );
309| 1| }
310| |
311| | #[test]
312| 1| fn typical_epoch_yields_iso_rfc3339() {
313| 1| let iso = crate::tz::epoch_to_iso(1_745_000_000);
314| 1| assert!(!iso.is_empty(), "created_at_iso must not be empty");
^0
315| 1| assert!(iso.contains('T'), "created_at_iso must contain T separator");
^0
316| | // With UTC the offset is +00:00; verifies general format without relying on the global tz
317| 1| assert!(
318| 1| iso.contains('+') || iso.contains('-'),
319| 0| "must contain offset sign, got: {iso}"
320| | );
321| 1| }
322| |
323| | #[test]
324| 1| fn invalid_epoch_returns_fallback() {
325| 1| let iso = crate::tz::epoch_to_iso(i64::MIN);
326| 1| assert!(
327| 1| !iso.is_empty(),
328| 0| "invalid epoch must return non-empty fallback"
329| | );
330| 1| }
331| |}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/hybrid_search.rs:
1| |//! Handler for the `hybrid-search` CLI subcommand.
2| |
3| |use crate::cli::MemoryType;
4| |use crate::errors::AppError;
5| |use crate::graph::traverse_from_memories_with_hops;
6| |use crate::output::{self, JsonOutputFormat, RecallItem};
7| |use crate::paths::AppPaths;
8| |use crate::storage::connection::open_ro;
9| |use crate::storage::entities;
10| |use crate::storage::memories;
11| |
12| |use std::collections::HashMap;
13| |
/// Arguments for the `hybrid-search` subcommand.
///
/// When `--namespace` is omitted the search runs against the `global` namespace,
/// which is the default namespace used by `remember` when no `--namespace` flag
/// is provided. Pass an explicit `--namespace` value to search a different
/// isolated namespace.
// Maintainer notes below use `//` rather than `///` on purpose: clap derive
// turns field doc comments into help text, so `///` would change `--help`.
#[derive(clap::Args)]
#[command(after_long_help = "EXAMPLES:\n \
    # Basic hybrid search combining FTS5 + vector via RRF\n \
    sqlite-graphrag hybrid-search \"postgres migration deadlock\" --k 10\n\n \
    # Tune RRF weights to favor keyword matches over semantic similarity\n \
    sqlite-graphrag hybrid-search \"jwt auth\" --weight-fts 1.5 --weight-vec 0.5 --k 5\n\n \
    # Add graph traversal matches (entities connected to top results)\n \
    sqlite-graphrag hybrid-search \"frontend architecture\" --with-graph --k 10\n\n \
    # Graph traversal with custom depth and minimum edge weight\n \
    sqlite-graphrag hybrid-search \"auth design\" --with-graph --max-hops 3 --min-weight 0.5 --k 10\n\n \
NOTES:\n \
    --with-graph enables entity graph traversal seeded by the top RRF results.\n \
    Graph matches appear in the `graph_matches` array (separate from `results`).\n \
    Without --with-graph, `graph_matches` is always empty.")]
pub struct HybridSearchArgs {
    // Positional query text; `allow_hyphen_values` lets it start with `-`.
    #[arg(
        allow_hyphen_values = true,
        help = "Hybrid search query (vector KNN + FTS5 BM25 fused via RRF)"
    )]
    pub query: String,
    /// Maximum number of fused results to return after RRF combines vector + FTS5 candidates.
    ///
    /// Validated to the inclusive range `1..=4096` (the upper bound matches `sqlite-vec`'s knn
    /// limit). Each underlying search fetches `k * 2` candidates before fusion.
    // NOTE(review): with k = 4096 the handler requests k * 2 = 8192 candidates,
    // which is above the knn limit quoted above — confirm the storage layer clamps it.
    #[arg(short = 'k', long, aliases = ["limit", "top-k"], default_value = "10", value_parser = crate::parsers::parse_k_range)]
    pub k: usize,
    // RRF constant: `run` scores each hit as weight / (rrf_k + 1-based rank).
    #[arg(long, default_value = "60")]
    pub rrf_k: u32,
    // Multiplier applied to the vector-ranking contribution.
    #[arg(long, default_value = "1.0")]
    pub weight_vec: f32,
    // Multiplier applied to the FTS5-ranking contribution; exactly 0.0 makes
    // `run` skip the FTS5 query altogether.
    #[arg(long, default_value = "1.0")]
    pub weight_fts: f32,
    /// Filter by memory.type. Note: distinct from graph entity_type
    /// (project/tool/person/file/concept/incident/decision/memory/dashboard/issue_tracker/organization/location/date)
    /// used in --entities-file.
    #[arg(long, value_enum)]
    pub r#type: Option<MemoryType>,
    // Namespace to search; resolved through `crate::namespace::resolve_namespace`.
    #[arg(long)]
    pub namespace: Option<String>,
    // Enables the graph-traversal stage that fills `graph_matches`.
    #[arg(long)]
    pub with_graph: bool,
    // `run` compares this against the default (2) to detect use without
    // --with-graph, so an explicit `--max-hops 2` is indistinguishable from unset.
    #[arg(long, default_value = "2")]
    pub max_hops: u32,
    // Same default-comparison scheme as `max_hops`, against 0.3.
    #[arg(long, default_value = "0.3")]
    pub min_weight: f64,
    // Parsed but discarded by `run` (`let _ = args.format;`).
    #[arg(long, value_enum, default_value_t = JsonOutputFormat::Json)]
    pub format: JsonOutputFormat,
    // Database path override; also read from SQLITE_GRAPHRAG_DB_PATH.
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,
    /// Accept `--json` as a no-op because output is already JSON by default.
    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
    pub json: bool,
    // Shared daemon options; `run` reads `autostart_daemon` when embedding the query.
    #[command(flatten)]
    pub daemon: crate::cli::DaemonOpts,
}
75| |
/// One fused hit in the `results` array of the `hybrid-search` response.
#[derive(serde::Serialize)]
pub struct HybridSearchItem {
    /// Row id of the matched memory.
    pub memory_id: i64,
    /// Memory name.
    pub name: String,
    /// Namespace the memory belongs to.
    pub namespace: String,
    /// Memory type; serialized under the JSON key `type`.
    #[serde(rename = "type")]
    pub memory_type: String,
    /// Memory description.
    pub description: String,
    /// Full memory body.
    pub body: String,
    /// First 300 characters of `body`, as built by `run`.
    pub snippet: String,
    /// Weighted RRF score: sum of `weight / (rrf_k + rank)` over the vector and
    /// FTS5 rankings in which the memory appears (rank is 1-based).
    pub combined_score: f64,
    /// Alias of `combined_score` for the documented contract in SKILL.md.
    pub score: f64,
    /// Source of the match: always "hybrid" (RRF of vec + fts). Added in v2.0.1.
    pub source: String,
    /// 1-based position in the vector (KNN) result list; omitted from JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub vec_rank: Option<usize>,
    /// 1-based position in the FTS5 result list; omitted from JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub fts_rank: Option<usize>,
    /// Combined RRF score — explicit alias of `combined_score` for integration contracts.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub rrf_score: Option<f64>,
    /// RRF score normalized to [0.0, 1.0] for cross-method comparability.
    ///
    /// `run` divides `combined_score` by the score of a hit ranked first in both
    /// lists, and emits 0.0 when that denominator is not positive.
    pub normalized_score: f64,
    /// Raw KNN distance from the vector index (lower = more similar).
    ///
    /// Present when the result came from the vector search path; `None` when the
    /// result appeared only in the FTS5 results and was not ranked by the KNN index.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub vec_distance: Option<f64>,
    /// Raw BM25 score from the FTS5 index. Currently always `None`; reserved for
    /// a future release when the FTS5 BM25 score is exposed by the storage layer.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub fts_bm25: Option<f64>,
}
111| |
/// RRF weights used in hybrid search: vec (vector) and fts (text).
///
/// Echoed back in the response so callers can see which weights produced the ranking.
#[derive(serde::Serialize)]
pub struct Weights {
    /// Multiplier applied to the vector-ranking contribution (`--weight-vec`).
    pub vec: f32,
    /// Multiplier applied to the FTS5-ranking contribution (`--weight-fts`).
    pub fts: f32,
}
118| |
/// JSON envelope emitted on stdout by the `hybrid-search` handler.
#[derive(serde::Serialize)]
pub struct HybridSearchResponse {
    /// The query text exactly as received.
    pub query: String,
    /// Requested maximum number of fused results.
    pub k: usize,
    /// RRF k parameter used in the combined ranking.
    pub rrf_k: u32,
    /// Weights applied to vec and fts sources in the RRF fusion.
    pub weights: Weights,
    /// Fused hits, best score first, at most `k` entries.
    pub results: Vec<HybridSearchItem>,
    /// Memories reached through graph traversal that are not already in
    /// `results`; empty unless `--with-graph` was given.
    pub graph_matches: Vec<RecallItem>,
    /// True when FTS5 failed and the response is vec-only.
    ///
    /// Omitted from JSON when `false` to keep the happy-path envelope clean.
    #[serde(skip_serializing_if = "std::ops::Not::not")]
    pub fts_degraded: bool,
    /// Human-readable description of the FTS5 failure when `fts_degraded` is true.
    ///
    /// Omitted from JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub fts_error: Option<String>,
    /// True when the FTS5 index was corrupted and successfully auto-rebuilt during this request.
    ///
    /// Omitted from JSON when `false` to keep the happy-path envelope clean.
    ///
    /// Note: `run` also sets this to `true` when the rebuild statement succeeded
    /// but the follow-up FTS5 query still failed; in that case `fts_degraded`
    /// is `true` as well.
    #[serde(skip_serializing_if = "std::ops::Not::not")]
    pub fts_auto_rebuilt: bool,
    /// Total execution time in milliseconds from handler start to serialisation.
    pub elapsed_ms: u64,
}
147| |
148| |#[tracing::instrument(skip_all, level = "debug", name = "hybrid_search")]
149| 0|pub fn run(args: HybridSearchArgs) -> Result<(), AppError> {
150| 0| let start = std::time::Instant::now();
151| 0| let _ = args.format;
152| 0| tracing::debug!(target: "hybrid_search", query = %args.query, k = args.k, "fusing results");
153| |
154| | // G20: reject graph-specific flags when --with-graph is not active
155| 0| if !args.with_graph {
156| 0| if args.max_hops != 2 {
157| 0| return Err(AppError::Validation(
158| 0| "--max-hops requires --with-graph to be active".to_string(),
159| 0| ));
160| 0| }
161| 0| if (args.min_weight - 0.3).abs() > f64::EPSILON {
162| 0| return Err(AppError::Validation(
163| 0| "--min-weight requires --with-graph to be active".to_string(),
164| 0| ));
165| 0| }
166| 0| }
167| |
168| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
169| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
170| 0| crate::storage::connection::ensure_db_ready(&paths)?;
171| |
172| 0| output::emit_progress_i18n(
173| 0| "Computing query embedding...",
174| 0| "Calculando embedding da consulta...",
175| | );
176| 0| let embedding = crate::daemon::embed_query_or_local(
177| 0| &paths.models,
178| 0| &args.query,
179| 0| args.daemon.autostart_daemon,
180| 0| )?;
181| |
182| 0| let conn = open_ro(&paths.db)?;
183| |
184| 0| let memory_type_str = args.r#type.map(|t| t.as_str());
185| |
186| 0| let vec_results = memories::knn_search(
187| 0| &conn,
188| 0| &embedding,
189| 0| &[namespace.clone()],
190| 0| memory_type_str,
191| 0| args.k * 2,
192| 0| )?;
193| |
194| | // Map vector ranking position by memory_id (1-indexed per schema)
195| 0| let vec_rank_map: HashMap<i64, usize> = vec_results
196| 0| .iter()
197| 0| .enumerate()
198| 0| .map(|(pos, (id, _))| (*id, pos + 1))
199| 0| .collect();
200| |
201| | // Map raw KNN distance by memory_id for GAP-30: vec_distance field.
202| 0| let vec_distance_map: HashMap<i64, f64> = vec_results
203| 0| .iter()
204| 0| .map(|(id, dist)| (*id, *dist as f64))
205| 0| .collect();
206| |
207| 0| let (fts_results, fts_degraded, fts_error, fts_auto_rebuilt) = if args.weight_fts == 0.0 {
208| 0| (vec![], false, None, false)
209| | } else {
210| 0| match memories::fts_search(&conn, &args.query, &namespace, memory_type_str, args.k * 2) {
211| 0| Ok(r) => (r, false, None, false),
212| 0| Err(e) => {
213| 0| let err_msg = e.to_string();
214| 0| let is_malformed = err_msg.contains("malformed") || err_msg.contains("corrupt");
215| 0| if is_malformed {
216| 0| tracing::warn!(target: "hybrid_search", "FTS5 index corrupted, attempting auto-rebuild");
217| 0| if conn
218| 0| .execute_batch("INSERT INTO fts_memories(fts_memories) VALUES('rebuild');")
219| 0| .is_ok()
220| | {
221| 0| match memories::fts_search(
222| 0| &conn,
223| 0| &args.query,
224| 0| &namespace,
225| 0| memory_type_str,
226| 0| args.k * 2,
227| 0| ) {
228| 0| Ok(r) => (r, false, None, true),
229| 0| Err(e2) => {
230| 0| tracing::error!(target: "hybrid_search", error = %e2, "FTS5 auto-rebuild failed to recover");
231| 0| (vec![], true, Some(e2.to_string()), true)
232| | }
233| | }
234| | } else {
235| 0| (vec![], true, Some(err_msg), false)
236| | }
237| | } else {
238| 0| tracing::warn!(target: "hybrid_search", error = %e, "FTS5 query failed, falling back to vec-only");
239| 0| (vec![], true, Some(err_msg), false)
240| | }
241| | }
242| | }
243| | };
244| |
245| | // Map FTS ranking position by memory_id (1-indexed per schema)
246| 0| let fts_rank_map: HashMap<i64, usize> = fts_results
247| 0| .iter()
248| 0| .enumerate()
249| 0| .map(|(pos, row)| (row.id, pos + 1))
250| 0| .collect();
251| |
252| 0| let rrf_k = args.rrf_k as f64;
253| |
254| | // Accumulate combined RRF scores
255| 0| let mut combined_scores: crate::hash::AHashMap<i64, f64> =
256| 0| crate::hash::AHashMap::with_capacity_and_hasher(
257| 0| vec_results.len() + fts_results.len(),
258| 0| Default::default(),
259| | );
260| |
261| 0| for (rank, (memory_id, _)) in vec_results.iter().enumerate() {
262| 0| let score = args.weight_vec as f64 * (1.0 / (rrf_k + rank as f64 + 1.0));
263| 0| *combined_scores.entry(*memory_id).or_insert(0.0) += score;
264| 0| }
265| |
266| 0| for (rank, row) in fts_results.iter().enumerate() {
267| 0| let score = args.weight_fts as f64 * (1.0 / (rrf_k + rank as f64 + 1.0));
268| 0| *combined_scores.entry(row.id).or_insert(0.0) += score;
269| 0| }
270| |
271| | // Sort by score descending and take the top-k
272| 0| let mut ranked: Vec<(i64, f64)> = combined_scores.into_iter().collect();
273| 0| ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
274| 0| ranked.truncate(args.k);
275| |
276| | // Collect all IDs for batch fetch (avoiding N+1)
277| 0| let top_ids: Vec<i64> = ranked.iter().map(|(id, _)| *id).collect();
278| |
279| | // Fetch full data for the top memories
280| 0| let mut memory_data: crate::hash::AHashMap<i64, memories::MemoryRow> =
281| 0| crate::hash::AHashMap::with_capacity_and_hasher(ranked.len(), Default::default());
282| 0| for id in &top_ids {
283| 0| if let Some(row) = memories::read_full(&conn, *id)? {
284| 0| memory_data.insert(*id, row);
285| 0| }
286| | }
287| |
288| 0| let max_possible = args.weight_vec as f64 * (1.0 / (rrf_k + 1.0))
289| 0| + args.weight_fts as f64 * (1.0 / (rrf_k + 1.0));
290| |
291| | // Build final results in ranking order
292| 0| let results: Vec<HybridSearchItem> = ranked
293| 0| .into_iter()
294| 0| .filter_map(|(memory_id, combined_score)| {
295| 0| let normalized_score = if max_possible > 0.0 {
296| 0| combined_score / max_possible
297| | } else {
298| 0| 0.0
299| | };
300| 0| memory_data.remove(&memory_id).map(|row| {
301| 0| let snippet: String = row.body.chars().take(300).collect();
302| 0| HybridSearchItem {
303| 0| memory_id: row.id,
304| 0| name: row.name,
305| 0| namespace: row.namespace,
306| 0| memory_type: row.memory_type,
307| 0| description: row.description,
308| 0| body: row.body,
309| 0| snippet,
310| 0| combined_score,
311| 0| score: combined_score,
312| 0| source: "hybrid".to_string(),
313| 0| vec_rank: vec_rank_map.get(&memory_id).copied(),
314| 0| fts_rank: fts_rank_map.get(&memory_id).copied(),
315| 0| rrf_score: Some(combined_score),
316| 0| normalized_score,
317| 0| vec_distance: vec_distance_map.get(&memory_id).copied(),
318| 0| fts_bm25: None,
319| 0| }
320| 0| })
321| 0| })
322| 0| .collect();
323| |
324| | // --- Graph traversal (activated by --with-graph) ---
325| 0| let mut graph_matches: Vec<RecallItem> = Vec::with_capacity(8);
326| 0| if args.with_graph && !results.is_empty() {
327| 0| let namespace_for_graph = namespace.clone();
328| 0| let memory_ids: Vec<i64> = results.iter().map(|r| r.memory_id).collect();
329| |
330| 0| let entity_knn = entities::knn_search(&conn, &embedding, &namespace_for_graph, 5)?;
331| 0| let entity_ids: Vec<i64> = entity_knn.iter().map(|(id, _)| *id).collect();
332| |
333| 0| let all_seed_ids: Vec<i64> = memory_ids
334| 0| .iter()
335| 0| .chain(entity_ids.iter())
336| 0| .copied()
337| 0| .collect();
338| |
339| 0| if !all_seed_ids.is_empty() {
340| 0| let graph_memory_ids = traverse_from_memories_with_hops(
341| 0| &conn,
342| 0| &all_seed_ids,
343| 0| &namespace_for_graph,
344| 0| args.min_weight,
345| 0| args.max_hops,
346| 0| )?;
347| |
348| 0| let already_in_results: std::collections::HashSet<i64> =
349| 0| results.iter().map(|r| r.memory_id).collect();
350| |
351| 0| for (graph_mem_id, hop) in graph_memory_ids {
352| 0| if already_in_results.contains(&graph_mem_id) {
353| 0| continue;
354| 0| }
355| 0| if let Some(row) = memories::read_full(&conn, graph_mem_id)? {
356| 0| let snippet: String = row.body.chars().take(300).collect();
357| 0| let graph_distance = 1.0 - 1.0 / (hop as f32 + 1.0);
358| 0| graph_matches.push(RecallItem {
359| 0| memory_id: row.id,
360| 0| name: row.name,
361| 0| namespace: row.namespace,
362| 0| memory_type: row.memory_type,
363| 0| description: row.description,
364| 0| snippet,
365| 0| distance: graph_distance,
366| 0| score: RecallItem::score_from_distance(graph_distance),
367| 0| source: "graph".to_string(),
368| 0| graph_depth: Some(hop),
369| 0| });
370| 0| }
371| | }
372| 0| }
373| 0| }
374| |
375| 0| output::emit_json(&HybridSearchResponse {
376| 0| query: args.query,
377| 0| k: args.k,
378| 0| rrf_k: args.rrf_k,
379| 0| weights: Weights {
380| 0| vec: args.weight_vec,
381| 0| fts: args.weight_fts,
382| 0| },
383| 0| results,
384| 0| graph_matches,
385| 0| fts_degraded,
386| 0| fts_error,
387| 0| fts_auto_rebuilt,
388| 0| elapsed_ms: start.elapsed().as_millis() as u64,
389| 0| })?;
390| |
391| 0| Ok(())
392| 0|}
393| |
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a response with no results and the given ranking parameters.
    fn empty_response(
        k: usize,
        rrf_k: u32,
        weight_vec: f32,
        weight_fts: f32,
    ) -> HybridSearchResponse {
        HybridSearchResponse {
            query: "test query".to_string(),
            k,
            rrf_k,
            weights: Weights {
                vec: weight_vec,
                fts: weight_fts,
            },
            results: vec![],
            graph_matches: vec![],
            fts_degraded: false,
            fts_error: None,
            fts_auto_rebuilt: false,
            elapsed_ms: 0,
        }
    }

    /// The envelope exposes the documented keys and none of the legacy ones.
    #[test]
    fn hybrid_search_response_empty_serializes_correct_fields() {
        let resp = empty_response(10, 60, 1.0, 1.0);
        let json = serde_json::to_string(&resp).unwrap();
        assert!(json.contains("\"results\""), "must contain results field");
        assert!(json.contains("\"query\""), "must contain query field");
        assert!(json.contains("\"k\""), "must contain k field");
        assert!(
            json.contains("\"graph_matches\""),
            "must contain graph_matches field"
        );
        assert!(
            !json.contains("\"combined_rank\""),
            "must not contain combined_rank"
        );
        assert!(
            !json.contains("\"vec_rank_list\""),
            "must not contain vec_rank_list"
        );
        assert!(
            !json.contains("\"fts_rank_list\""),
            "must not contain fts_rank_list"
        );
    }

    #[test]
    fn hybrid_search_response_serializes_rrf_k_and_weights() {
        let resp = empty_response(5, 60, 0.7, 0.3);
        let json = serde_json::to_string(&resp).unwrap();
        assert!(json.contains("\"rrf_k\""), "must contain rrf_k field");
        assert!(json.contains("\"weights\""), "must contain weights field");
        assert!(json.contains("\"vec\""), "must contain weights.vec field");
        assert!(json.contains("\"fts\""), "must contain weights.fts field");
    }

    #[test]
    fn hybrid_search_response_serializes_elapsed_ms() {
        let mut resp = empty_response(5, 60, 1.0, 1.0);
        resp.elapsed_ms = 123;
        let json = serde_json::to_string(&resp).unwrap();
        assert!(
            json.contains("\"elapsed_ms\""),
            "must contain elapsed_ms field"
        );
        assert!(json.contains("123"), "deve serializar valor de elapsed_ms");
    }

    #[test]
    fn weights_struct_serializes_correctly() {
        let w = Weights { vec: 0.6, fts: 0.4 };
        let json = serde_json::to_string(&w).unwrap();
        assert!(json.contains("\"vec\""));
        assert!(json.contains("\"fts\""));
    }

    /// `fts_rank: None` must be skipped while `vec_rank: Some` is emitted.
    #[test]
    fn hybrid_search_item_omits_fts_rank_when_none() {
        let item = HybridSearchItem {
            memory_id: 1,
            name: "mem".to_string(),
            namespace: "default".to_string(),
            memory_type: "user".to_string(),
            description: "desc".to_string(),
            body: "content".to_string(),
            snippet: "content".to_string(),
            combined_score: 0.0328,
            score: 0.0328,
            source: "hybrid".to_string(),
            vec_rank: Some(1),
            fts_rank: None,
            rrf_score: Some(0.0328),
            normalized_score: 1.0,
            vec_distance: Some(0.12),
            fts_bm25: None,
        };
        let json = serde_json::to_string(&item).unwrap();
        assert!(
            json.contains("\"vec_rank\""),
            "must contain vec_rank when Some"
        );
        assert!(
            !json.contains("\"fts_rank\""),
            "must not contain fts_rank when None"
        );
    }

    /// `vec_rank: None` must be skipped while `fts_rank: Some` is emitted.
    #[test]
    fn hybrid_search_item_omits_vec_rank_when_none() {
        let item = HybridSearchItem {
            memory_id: 2,
            name: "mem2".to_string(),
            namespace: "default".to_string(),
            memory_type: "fact".to_string(),
            description: "desc2".to_string(),
            body: "corpo2".to_string(),
            snippet: "corpo2".to_string(),
            combined_score: 0.016,
            score: 0.016,
            source: "hybrid".to_string(),
            vec_rank: None,
            fts_rank: Some(2),
            rrf_score: Some(0.016),
            normalized_score: 0.5,
            vec_distance: None,
            fts_bm25: None,
        };
        let json = serde_json::to_string(&item).unwrap();
        assert!(
            !json.contains("\"vec_rank\""),
            "must not contain vec_rank when None"
        );
        assert!(
            json.contains("\"fts_rank\""),
            "must contain fts_rank when Some"
        );
    }

    /// Both ranks present, and `memory_type` is exposed only as `type`.
    #[test]
    fn hybrid_search_item_serializes_both_ranks_when_some() {
        let item = HybridSearchItem {
            memory_id: 3,
            name: "mem3".to_string(),
            namespace: "ns".to_string(),
            memory_type: "entity".to_string(),
            description: "desc3".to_string(),
            body: "corpo3".to_string(),
            snippet: "corpo3".to_string(),
            combined_score: 0.05,
            score: 0.05,
            source: "hybrid".to_string(),
            vec_rank: Some(3),
            fts_rank: Some(1),
            rrf_score: Some(0.05),
            normalized_score: 0.8,
            vec_distance: Some(0.25),
            fts_bm25: None,
        };
        let json = serde_json::to_string(&item).unwrap();
        assert!(json.contains("\"vec_rank\""), "must contain vec_rank");
        assert!(json.contains("\"fts_rank\""), "must contain fts_rank");
        assert!(json.contains("\"type\""), "deve serializar type renomeado");
        assert!(!json.contains("memory_type"), "must not expose memory_type");
    }

    #[test]
    fn hybrid_search_response_serializes_k_correctly() {
        let resp = empty_response(5, 60, 1.0, 1.0);
        let json = serde_json::to_string(&resp).unwrap();
        assert!(json.contains("\"k\":5"), "deve serializar k=5");
    }

    #[test]
    fn hybrid_search_response_with_graph_matches() {
        use crate::output::RecallItem;
        let resp = HybridSearchResponse {
            query: "test".to_string(),
            k: 5,
            rrf_k: 60,
            weights: Weights { vec: 1.0, fts: 1.0 },
            results: vec![],
            graph_matches: vec![RecallItem {
                memory_id: 1,
                name: "graph-hit".to_string(),
                namespace: "global".to_string(),
                memory_type: "document".to_string(),
                description: "found via graph".to_string(),
                snippet: "graph content".to_string(),
                distance: 0.1,
                score: 0.9,
                source: "graph".to_string(),
                graph_depth: Some(1),
            }],
            fts_degraded: false,
            fts_error: None,
            fts_auto_rebuilt: false,
            elapsed_ms: 42,
        };
        let json = serde_json::to_value(&resp).unwrap();
        assert_eq!(json["graph_matches"].as_array().unwrap().len(), 1);
        assert_eq!(json["graph_matches"][0]["source"], "graph");
        assert_eq!(json["graph_matches"][0]["graph_depth"], 1);
    }

    #[test]
    fn fts_degraded_omitted_on_success_present_on_failure() {
        // Happy path: fts_degraded=false must be absent from JSON (skip_serializing_if).
        let ok_resp = empty_response(5, 60, 1.0, 1.0);
        let ok_json = serde_json::to_string(&ok_resp).unwrap();
        assert!(
            !ok_json.contains("\"fts_degraded\""),
            "fts_degraded must be absent when false"
        );
        assert!(
            !ok_json.contains("\"fts_error\""),
            "fts_error must be absent when None"
        );

        // Degraded path: fts_degraded=true and fts_error=Some must appear in JSON.
        let mut degraded_resp = empty_response(5, 60, 1.0, 1.0);
        degraded_resp.fts_degraded = true;
        degraded_resp.fts_error = Some("FTS5 table corrupted".to_string());
        // Borrow the response; the mis-encoded `°raded_resp` token did not compile.
        let degraded_json = serde_json::to_string(&degraded_resp).unwrap();
        assert!(
            degraded_json.contains("\"fts_degraded\":true"),
            "fts_degraded must be present and true when degraded"
        );
        assert!(
            degraded_json.contains("\"fts_error\""),
            "fts_error must be present when Some"
        );
        assert!(
            degraded_json.contains("FTS5 table corrupted"),
            "fts_error must contain the error message"
        );
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/ingest.rs:
1| |//! Handler for the `ingest` CLI subcommand.
2| |//!
3| |//! Bulk-ingests every file under a directory that matches a glob pattern.
4| |//! Each matched file is persisted as a separate memory using the same
5| |//! validation, chunking, embedding and persistence pipeline as `remember`,
6| |//! but executed in-process so the ONNX model is loaded only once per
7| |//! invocation. This is the v1.0.32 Onda 4B (finding A2) refactor that
8| |//! replaced a fork-spawn-per-file pipeline (every file paid the ~17s ONNX
9| |//! cold-start cost) with an in-process loop reusing the warm embedder
10| |//! (daemon when available, in-process `Embedder::new` otherwise).
11| |//!
12| |//! Memory names are derived from file basenames (kebab-case, lowercase,
13| |//! ASCII alphanumerics + hyphens). Output is line-delimited JSON: one
14| |//! object per processed file (success or error), followed by a final
15| |//! summary object. Designed for streaming consumption by agents.
16| |//!
17| |//! ## Incremental pipeline (v1.0.43)
18| |//!
19| |//! Phase A runs on a rayon thread pool (size = `--ingest-parallelism`):
20| |//! read + chunk + embed + NER per file. Results are sent immediately via a
21| |//! bounded `mpsc::sync_channel` to Phase B so persistence starts as soon
22| |//! as the first file completes — no waiting for all files to finish Phase A.
23| |//!
24| |//! Phase B runs on the main thread: receives staged files from the channel,
25| |//! writes to SQLite per-file (WAL absorbs individual commits), and emits
26| |//! NDJSON progress events to stderr as each file is persisted. `Connection`
27| |//! is not `Sync` so it never crosses thread boundaries.
28| |//!
29| |//! This fixes B1: with the old 2-phase design, a 50-file corpus with 27s/file
30| |//! NER would spend ~22min in Phase A alone, exceeding the user's 900s timeout
31| |//! before Phase B (and any DB writes) could begin. With this pipeline, the
32| |//! first file is committed within seconds of starting.
33| |
34| |use crate::chunking;
35| |use crate::cli::MemoryType;
36| |use crate::entity_type::EntityType;
37| |use crate::errors::AppError;
38| |use crate::i18n::errors_msg;
39| |use crate::output::{self, JsonOutputFormat};
40| |use crate::paths::AppPaths;
41| |use crate::storage::chunks as storage_chunks;
42| |use crate::storage::connection::{ensure_db_ready, open_rw};
43| |use crate::storage::entities::{NewEntity, NewRelationship};
44| |use crate::storage::memories::NewMemory;
45| |use crate::storage::{entities, memories, urls as storage_urls, versions};
46| |use rayon::prelude::*;
47| |use rusqlite::Connection;
48| |use serde::Serialize;
49| |use std::collections::BTreeSet;
50| |use std::path::{Path, PathBuf};
51| |use std::sync::mpsc;
52| |use unicode_normalization::UnicodeNormalization;
53| |
54| |use crate::constants::DERIVED_NAME_MAX_LEN;
55| |
/// Hard cap on the numeric suffix appended for collision resolution. Once this
/// many candidate names have collided we surface an error rather than loop
/// forever.
const MAX_NAME_COLLISION_SUFFIX: usize = 1000;
59| |
60| |#[derive(clap::Args)]
61| |#[command(after_long_help = "EXAMPLES:\n \
62| | # Ingest every Markdown file under ./docs as `document` memories\n \
63| | sqlite-graphrag ingest ./docs --type document\n\n \
64| | # Ingest .txt files recursively under ./notes\n \
65| | sqlite-graphrag ingest ./notes --type note --pattern '*.txt' --recursive\n\n \
66| | # Enable GLiNER NER extraction (disabled by default, slower)\n \
67| | sqlite-graphrag ingest ./big-corpus --type reference --enable-ner\n\n \
68| | # Preview file-to-name mapping without ingesting\n \
69| | sqlite-graphrag ingest ./docs --dry-run\n\n \
70| | # LLM-curated extraction via Claude Code CLI\n \
71| | sqlite-graphrag ingest ./docs --mode claude-code --recursive --json\n\n \
72| | # Resume interrupted claude-code ingest\n \
73| | sqlite-graphrag ingest ./docs --mode claude-code --resume --json\n\n \
74| | # Claude Code with budget cap and custom timeout\n \
75| | sqlite-graphrag ingest ./docs --mode claude-code --max-cost-usd 5.00 --claude-timeout 600 --json\n\n \
76| |AUTHENTICATION:\n \
77| | --mode claude-code: Uses existing Claude Code authentication.\n \
78| | OAuth (Pro/Max/Team): works automatically from ~/.claude/.credentials.json\n \
79| | API key: set ANTHROPIC_API_KEY for faster startup (optional)\n\n \
80| | --mode codex: Uses existing Codex CLI authentication.\n \
81| | Device auth: run `codex auth login` first\n \
82| | API key: set OPENAI_API_KEY (optional)\n\n \
83| |NOTES:\n \
84| | Each file becomes a separate memory. Names derive from file basenames\n \
85| | (kebab-case, lowercase, ASCII). Output is NDJSON: one JSON object per file,\n \
86| | followed by a final summary line with counts. Per-file errors are reported\n \
87| | inline and processing continues unless --fail-fast is set.")]
// Command-line arguments for `sqlite-graphrag ingest`.
//
// Reviewer notes in this struct are plain `//` comments on purpose: clap's
// derive turns `///` comments into help text, so adding or editing `///`
// lines here would change the generated `--help` output.
pub struct IngestArgs {
    /// Directory containing files to ingest.
    #[arg(
        value_name = "DIR",
        help = "Directory to ingest recursively (each matching file becomes a memory)"
    )]
    pub dir: PathBuf,

    /// Memory type stored in `memories.type` for every ingested file. Defaults to `document`.
    #[arg(long, value_enum, default_value_t = MemoryType::Document)]
    pub r#type: MemoryType,

    /// Glob pattern matched against file basenames (default: `*.md`). Supports
    /// `*.<ext>`, `<prefix>*`, and exact filename match.
    #[arg(long, default_value = "*.md")]
    pub pattern: String,

    /// Recurse into subdirectories.
    #[arg(long, default_value_t = false)]
    pub recursive: bool,

    // Optional-value boolean: a bare `--enable-ner` resolves to `true`
    // (`default_missing_value`), an explicit value or the
    // `SQLITE_GRAPHRAG_ENABLE_NER` env var is parsed by `parse_bool_flexible`,
    // and omitting the flag yields `false` (`default_value`).
    #[arg(
        long,
        env = "SQLITE_GRAPHRAG_ENABLE_NER",
        value_parser = crate::parsers::parse_bool_flexible,
        action = clap::ArgAction::Set,
        num_args = 0..=1,
        default_missing_value = "true",
        default_value = "false",
        help = "Enable automatic GLiNER NER entity/relationship extraction (disabled by default)"
    )]
    pub enable_ner: bool,
    // Held as a free-form string at the CLI layer.
    // NOTE(review): presumably converted to `crate::extraction::GlinerVariant`
    // before reaching `stage_file` — confirm where invalid values are rejected.
    #[arg(
        long,
        env = "SQLITE_GRAPHRAG_GLINER_VARIANT",
        default_value = "fp32",
        help = "GLiNER model variant: fp32 (1.1GB, best quality), fp16 (580MB), int8 (349MB, fastest but may miss entities on short texts), q4, q4f16"
    )]
    pub gliner_variant: String,

    /// Deprecated: NER is now disabled by default. Kept for backwards compatibility.
    #[arg(long, default_value_t = false, hide = true)]
    pub skip_extraction: bool,

    /// Stop on first per-file error instead of continuing with the next file.
    #[arg(long, default_value_t = false)]
    pub fail_fast: bool,

    /// Preview file-to-name mapping without loading model or persisting.
    #[arg(long, default_value_t = false)]
    pub dry_run: bool,

    /// Maximum number of files to ingest (safety cap to prevent runaway ingestion).
    #[arg(long, default_value_t = 10_000)]
    pub max_files: usize,

    /// Namespace for the ingested memories.
    #[arg(long)]
    pub namespace: Option<String>,

    /// Database path. Falls back to `SQLITE_GRAPHRAG_DB_PATH`, then `./graphrag.sqlite`.
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,

    // Output format selector; defaults to `JsonOutputFormat::Json`.
    // This field has no `help =`, so a `///` comment here would become its
    // help text — hence the plain comment.
    #[arg(long, value_enum, default_value_t = JsonOutputFormat::Json)]
    pub format: JsonOutputFormat,

    // Hidden flag accepted for compatibility; per its help string it is a no-op.
    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
    pub json: bool,

    /// Number of files to extract+embed in parallel; default = max(1, cpus/2).min(4).
    #[arg(
        long,
        help = "Number of files to extract+embed in parallel; default = max(1, cpus/2).min(4)"
    )]
    pub ingest_parallelism: Option<usize>,

    // NOTE(review): the doc below says the flag "takes precedence over any
    // explicit value", but `run` rejects `--low-memory` combined with
    // `--ingest-parallelism N>1` with a validation error before
    // `resolve_parallelism` is reached. Only the env-var form silently
    // overrides. Reconcile the help text with that behaviour.
    /// Force single-threaded ingest to reduce RSS pressure.
    ///
    /// Equivalent to `--ingest-parallelism 1`, takes precedence over any
    /// explicit value. Recommended for environments with <4 GB available
    /// RAM or container/cgroup constraints. Trade-off: 3-4x longer wall
    /// time. Also honored via `SQLITE_GRAPHRAG_LOW_MEMORY=1` env var
    /// (CLI flag has higher precedence than the env var).
    #[arg(
        long,
        default_value_t = false,
        help = "Forces single-threaded ingest (--ingest-parallelism 1) to reduce RSS pressure. \
                Recommended for environments with <4 GB available RAM or container/cgroup \
                constraints. Trade-off: 3-4x longer wall time. Also honored via \
                SQLITE_GRAPHRAG_LOW_MEMORY=1 env var."
    )]
    pub low_memory: bool,

    // NOTE(review): the "(default: 8192)" in the help string is hard-coded
    // while the real default comes from `DEFAULT_MAX_RSS_MB`; keep them in sync.
    /// Maximum process RSS in MiB; abort if exceeded during embedding.
    #[arg(long, default_value_t = crate::constants::DEFAULT_MAX_RSS_MB,
          help = "Maximum process RSS in MiB; abort if exceeded during embedding (default: 8192)")]
    pub max_rss_mb: u64,

    // NOTE(review): same as above — "(default: 60)" is hard-coded while the
    // real default is `DERIVED_NAME_MAX_LEN`.
    /// Maximum character length for derived memory names from file basenames.
    ///
    /// Overrides the compile-time `DERIVED_NAME_MAX_LEN` constant (default 60).
    /// Shorter values leave more headroom for collision suffix resolution.
    #[arg(long, default_value_t = crate::constants::DERIVED_NAME_MAX_LEN,
          help = "Maximum length for derived memory names (default: 60)")]
    pub max_name_length: usize,

    /// Extraction mode: `none` (body-only, default), `gliner` (NER), or `claude-code` (LLM-curated via Claude Code CLI).
    #[arg(long, value_enum, default_value_t = IngestMode::None)]
    pub mode: IngestMode,

    /// Explicit path to the Claude Code binary (only with --mode claude-code).
    #[arg(long, env = "SQLITE_GRAPHRAG_CLAUDE_BINARY")]
    pub claude_binary: Option<std::path::PathBuf>,

    /// Model override for Claude Code extraction (e.g. claude-sonnet-4-6).
    #[arg(long)]
    pub claude_model: Option<String>,

    /// Resume a previously interrupted claude-code ingest from the queue DB.
    #[arg(long, default_value_t = false)]
    pub resume: bool,

    /// Retry only failed files from a previous claude-code ingest.
    #[arg(long, default_value_t = false)]
    pub retry_failed: bool,

    /// Keep the queue DB (.ingest-queue.sqlite) after completion.
    #[arg(long, default_value_t = false)]
    pub keep_queue: bool,

    /// Custom path for the claude-code ingest queue database.
    #[arg(long, default_value = ".ingest-queue.sqlite")]
    pub queue_db: String,

    /// Initial wait time in seconds when rate-limited (only with --mode claude-code).
    #[arg(long, default_value_t = 60)]
    pub rate_limit_wait: u64,

    /// Maximum cumulative cost in USD before aborting (only with --mode claude-code).
    #[arg(long)]
    pub max_cost_usd: Option<f64>,

    /// Timeout in seconds for each claude -p invocation (only with --mode claude-code).
    #[arg(
        long,
        default_value_t = 300,
        help = "Timeout in seconds for each claude -p invocation (default: 300)"
    )]
    pub claude_timeout: u64,

    /// Explicit path to the Codex CLI binary (only with --mode codex).
    #[arg(
        long,
        env = "SQLITE_GRAPHRAG_CODEX_BINARY",
        help = "Explicit path to the Codex CLI binary (only with --mode codex)"
    )]
    pub codex_binary: Option<PathBuf>,

    /// Model override for Codex extraction (e.g. o4-mini, gpt-5.1-codex).
    #[arg(
        long,
        help = "Model override for Codex extraction (e.g. o4-mini, gpt-5.1-codex)"
    )]
    pub codex_model: Option<String>,

    /// Timeout in seconds for each codex exec invocation.
    #[arg(
        long,
        default_value_t = 300,
        help = "Timeout in seconds for each codex exec invocation (default: 300)"
    )]
    pub codex_timeout: u64,

    /// G30: poll for the job singleton every second for up to N seconds
    /// when another invocation holds the lock. Default: 0 (fail fast).
    #[arg(long, value_name = "SECONDS")]
    pub wait_job_singleton: Option<u64>,

    /// G30: force acquisition of the singleton lock by removing a stale
    /// lock file from a previously crashed invocation.
    #[arg(long, default_value_t = false)]
    pub force_job_singleton: bool,
}
272| |
/// Extraction mode for the ingest pipeline.
// Selected through `--mode` on `IngestArgs` (default: `None`). `run` hands
// `ClaudeCode` and `Codex` off to their dedicated ingest modules before doing
// any work of its own.
//
// The variant doc comments below are also picked up by clap's `ValueEnum`
// derive as per-value help, so they are user-facing text; reviewer notes are
// therefore written as plain `//` comments.
#[derive(Clone, Debug, PartialEq, Eq, clap::ValueEnum)]
pub enum IngestMode {
    /// Body-only ingestion without entity/relationship extraction (default).
    None,
    // NOTE(review): the text below says this mode requires `--enable-ner`,
    // while the TODO at the top of `run` describes `--gliner-variant` as
    // meaningful "with --enable-ner or --mode gliner". Confirm which of the
    // two actually switches NER on and align the wording.
    /// GLiNER zero-shot NER extraction (requires --enable-ner).
    Gliner,
    /// LLM-curated extraction via locally installed Claude Code CLI.
    ClaudeCode,
    /// LLM-curated extraction via locally installed OpenAI Codex CLI.
    Codex,
}
285| |
286| |/// Returns true when the `SQLITE_GRAPHRAG_LOW_MEMORY` env var is set to a
287| |/// truthy value (`1`, `true`, `yes`, `on`, case-insensitive). Empty or unset
288| |/// values evaluate to false. Unrecognized non-empty values emit a
289| |/// `tracing::warn!` and evaluate to false.
290| 24|fn env_low_memory_enabled() -> bool {
291| 24| match std::env::var("SQLITE_GRAPHRAG_LOW_MEMORY") {
292| 17| Ok(v) if v.is_empty() => false,
^1 ^1 ^1
293| 16| Ok(v) => match v.to_lowercase().as_str() {
294| 16| "1" | "true" | "yes" | "on" => true,
^13 ^11 ^9 ^9
295| 7| "0" | "false" | "no" | "off" => false,
^5 ^3 ^2 ^6
296| 1| other => {
297| 1| tracing::warn!(
298| | target: "ingest",
299| | value = %other,
300| 0| "SQLITE_GRAPHRAG_LOW_MEMORY value not recognized; treating as disabled"
301| | );
302| 1| false
303| | }
304| | },
305| 7| Err(_) => false,
306| | }
307| 24|}
308| |
309| |/// Resolves the effective ingest parallelism honoring `--low-memory` and the
310| |/// `SQLITE_GRAPHRAG_LOW_MEMORY` env var.
311| |///
312| |/// Precedence:
313| |/// 1. `--low-memory` CLI flag forces parallelism = 1.
314| |/// 2. `SQLITE_GRAPHRAG_LOW_MEMORY=1` env var forces parallelism = 1.
315| |/// 3. Explicit `--ingest-parallelism N` (when low-memory is off).
316| |/// 4. Default heuristic `(cpus/2).clamp(1, 4)`.
317| |///
318| |/// When low-memory wins and the user also passed `--ingest-parallelism N>1`,
319| |/// emits a `tracing::warn!` advertising the override.
320| 9|fn resolve_parallelism(low_memory_flag: bool, ingest_parallelism: Option<usize>) -> usize {
321| 9| let env_flag = env_low_memory_enabled();
322| 9| let low_memory = low_memory_flag || env_flag;
^6
323| |
324| 9| if low_memory {
325| 5| if let Some(n) = ingest_parallelism {
^3
326| 3| if n > 1 {
327| 3| tracing::warn!(
328| | target: "ingest",
329| | requested = n,
330| 0| "--ingest-parallelism overridden by --low-memory; using 1"
331| | );
332| 0| }
333| 2| }
334| 5| if low_memory_flag {
335| 3| tracing::info!(
336| | target: "ingest",
337| | source = "flag",
338| 0| "low-memory mode enabled: forcing --ingest-parallelism 1"
339| | );
340| | } else {
341| 2| tracing::info!(
342| | target: "ingest",
343| | source = "env",
344| 0| "low-memory mode enabled via SQLITE_GRAPHRAG_LOW_MEMORY: forcing --ingest-parallelism 1"
345| | );
346| | }
347| 5| return 1;
348| 4| }
349| |
350| 4| ingest_parallelism
351| 4| .unwrap_or_else(|| {
^1
352| 1| std::thread::available_parallelism()
353| 1| .map(|v| v.get() / 2)
354| 1| .unwrap_or(1)
355| 1| .clamp(1, 4)
356| 1| })
357| 4| .max(1)
358| 9|}
359| |
/// Per-file NDJSON event serialized by the ingest command, one object per
/// candidate file. Field order here is the key order in the emitted JSON.
#[derive(Serialize)]
struct IngestFileEvent<'a> {
    /// Source file path, rendered with `to_string_lossy`.
    file: &'a str,
    /// Derived memory name; for skipped files this is the un-suffixed base
    /// name and may be empty when no name could be derived.
    name: &'a str,
    /// Outcome label. The dry-run path emits `"skip"` and `"preview"`; other
    /// values are produced outside this part of the file.
    status: &'a str,
    /// True when the derived name was truncated to fit the configured maximum
    /// (`--max-name-length`, which defaults to `DERIVED_NAME_MAX_LEN`). False otherwise.
    truncated: bool,
    /// Original derived name before truncation; only present when `truncated=true`.
    #[serde(skip_serializing_if = "Option::is_none")]
    original_name: Option<String>,
    /// Original file basename (without extension); only present when it differs from `name`.
    #[serde(skip_serializing_if = "Option::is_none")]
    original_filename: Option<&'a str>,
    /// Human-readable failure or skip reason; omitted from the JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    error: Option<String>,
    /// Row id of the stored memory; omitted when `None` (e.g. skip and
    /// dry-run events).
    #[serde(skip_serializing_if = "Option::is_none")]
    memory_id: Option<i64>,
    /// Persistence action; omitted when `None`.
    /// NOTE(review): presumably copied from `FileSuccess::action` — confirm at the emit site.
    #[serde(skip_serializing_if = "Option::is_none")]
    action: Option<String>,
    /// Byte length of the body ingested; 0 when not yet read (e.g. skip or dry-run events).
    body_length: usize,
}
382| |
/// Final NDJSON line of an ingest run: echoes the request parameters and
/// reports aggregate counts.
#[derive(Serialize)]
struct IngestSummary {
    /// Marker that distinguishes the summary line from per-file events; the
    /// dry-run path sets it to `true`.
    summary: bool,
    /// Ingested directory, rendered with `to_string_lossy`.
    dir: String,
    /// Basename pattern that was used to select files.
    pattern: String,
    /// Whether subdirectories were traversed.
    recursive: bool,
    /// Number of files that matched the pattern.
    files_total: usize,
    /// Files persisted successfully.
    /// NOTE(review): the dry-run path reports 0 for this and the two counters
    /// below, even when it emitted `"skip"` events — confirm that is intended.
    files_succeeded: usize,
    /// Files that failed.
    files_failed: usize,
    /// Files that were skipped.
    files_skipped: usize,
    /// Wall-clock duration of the run in milliseconds.
    elapsed_ms: u64,
}
395| |
/// Outcome of a successful per-file ingest, used to build the NDJSON event.
/// Produced by `persist_staged` once the memory has been committed.
struct FileSuccess {
    /// Id returned by `memories::insert` for the new memory.
    memory_id: i64,
    /// What was done with the memory; `persist_staged` always reports `"created"`.
    action: String,
    /// Length of the stored body in bytes (`String::len`).
    body_length: usize,
}
402| |
/// NDJSON progress event emitted to stderr after each file completes Phase A.
/// Schema version 1; consumers should check `schema_version` before parsing.
#[derive(Serialize)]
struct StageProgressEvent<'a> {
    /// Version of this event's schema (see the struct-level note).
    schema_version: u8,
    /// Event kind label.
    /// NOTE(review): the concrete values are set at the emit site, which is
    /// not visible here.
    event: &'a str,
    /// Path of the file the event refers to.
    path: &'a str,
    /// Duration in milliseconds.
    /// NOTE(review): presumably the Phase A time for this file — confirm.
    ms: u64,
    /// Entity count for this file.
    /// NOTE(review): presumably the number of extracted entities — confirm.
    entities: usize,
    /// Relationship count for this file.
    /// NOTE(review): presumably the number of extracted relationships — confirm.
    relationships: usize,
}
414| |
/// All artefacts pre-computed by Phase A (CPU-bound, runs on rayon thread pool).
/// Phase B persists these to SQLite on the main thread in submission order.
///
/// Built by `stage_file` and consumed by value in `persist_staged`.
struct StagedFile {
    /// Full file content as read from disk.
    body: String,
    /// Hex-encoded BLAKE3 hash of `body`'s bytes.
    body_hash: String,
    /// First 200 characters (not bytes) of `body`.
    snippet: String,
    /// Validated memory name for this file.
    name: String,
    /// Generated description of the form `ingested from <path>`.
    description: String,
    /// Memory-level embedding: the embedding of the whole body when there is
    /// exactly one chunk, otherwise the aggregate of the per-chunk embeddings.
    embedding: Vec<f32>,
    /// Per-chunk embeddings in chunk order; `None` when the body is a single chunk.
    chunk_embeddings: Option<Vec<Vec<f32>>>,
    /// Chunk boundaries produced by `chunking::split_into_chunks_hierarchical`.
    chunks_info: Vec<crate::chunking::Chunk>,
    /// Extracted entities, capped at `max_entities_per_memory()`; empty when
    /// extraction is disabled or failed.
    entities: Vec<NewEntity>,
    /// Extracted relationships with normalized relation labels, capped at
    /// `max_relationships_per_memory()`; empty when extraction is disabled or failed.
    relationships: Vec<NewRelationship>,
    /// One embedding per element of `entities`, in the same order
    /// (`persist_staged` indexes the two in lockstep).
    entity_embeddings: Vec<Vec<f32>>,
    /// URLs found in the body by the extractor.
    urls: Vec<crate::extraction::ExtractedUrl>,
}
431| |
432| |/// Phase A worker: reads, chunks, embeds and extracts NER for one file.
433| |/// Never touches the database — safe to run on any rayon thread.
434| 0|fn stage_file(
435| 0| _idx: usize,
436| 0| path: &Path,
437| 0| name: &str,
438| 0| paths: &AppPaths,
439| 0| enable_ner: bool,
440| 0| gliner_variant: crate::extraction::GlinerVariant,
441| 0| max_rss_mb: u64,
442| 0|) -> Result<StagedFile, AppError> {
443| | use crate::constants::*;
444| |
445| 0| if name.len() > MAX_MEMORY_NAME_LEN {
446| 0| return Err(AppError::LimitExceeded(
447| 0| crate::i18n::validation::name_length(MAX_MEMORY_NAME_LEN),
448| 0| ));
449| 0| }
450| 0| if name.starts_with("__") {
451| 0| return Err(AppError::Validation(
452| 0| crate::i18n::validation::reserved_name(),
453| 0| ));
454| 0| }
455| | {
456| 0| let slug_re = crate::constants::name_slug_regex();
457| 0| if !slug_re.is_match(name) {
458| 0| return Err(AppError::Validation(crate::i18n::validation::name_kebab(
459| 0| name,
460| 0| )));
461| 0| }
462| | }
463| |
464| 0| let file_size = std::fs::metadata(path).map_err(AppError::Io)?.len();
465| 0| if file_size > MAX_MEMORY_BODY_LEN as u64 {
466| 0| return Err(AppError::LimitExceeded(
467| 0| crate::i18n::validation::body_exceeds(MAX_MEMORY_BODY_LEN),
468| 0| ));
469| 0| }
470| 0| let raw_body = std::fs::read_to_string(path).map_err(AppError::Io)?;
471| 0| if raw_body.len() > MAX_MEMORY_BODY_LEN {
472| 0| return Err(AppError::LimitExceeded(
473| 0| crate::i18n::validation::body_exceeds(MAX_MEMORY_BODY_LEN),
474| 0| ));
475| 0| }
476| 0| if raw_body.trim().is_empty() {
477| 0| return Err(AppError::Validation(crate::i18n::validation::empty_body()));
478| 0| }
479| |
480| 0| let description = format!("ingested from {}", path.display());
481| 0| if description.len() > MAX_MEMORY_DESCRIPTION_LEN {
482| 0| return Err(AppError::Validation(
483| 0| crate::i18n::validation::description_exceeds(MAX_MEMORY_DESCRIPTION_LEN),
484| 0| ));
485| 0| }
486| |
487| 0| let mut extracted_entities: Vec<NewEntity> = Vec::with_capacity(30);
488| 0| let mut extracted_relationships: Vec<NewRelationship> = Vec::with_capacity(50);
489| 0| let mut extracted_urls: Vec<crate::extraction::ExtractedUrl> = Vec::with_capacity(4);
490| 0| if enable_ner {
491| 0| match crate::extraction::extract_graph_auto(&raw_body, paths, gliner_variant) {
492| 0| Ok(extracted) => {
493| 0| extracted_urls = extracted.urls;
494| 0| extracted_entities = extracted.entities;
495| 0| extracted_relationships = extracted.relationships;
496| |
497| 0| if extracted_entities.len() > max_entities_per_memory() {
498| 0| extracted_entities.truncate(max_entities_per_memory());
499| 0| }
500| 0| if extracted_relationships.len() > max_relationships_per_memory() {
501| 0| extracted_relationships.truncate(max_relationships_per_memory());
502| 0| }
503| | }
504| 0| Err(e) => {
505| 0| tracing::warn!(
506| | target: "ingest",
507| 0| file = %path.display(),
508| 0| "auto-extraction failed (graceful degradation): {e:#}"
509| | );
510| | }
511| | }
512| 0| }
513| |
514| 0| for rel in &mut extracted_relationships {
515| 0| rel.relation = crate::parsers::normalize_relation(&rel.relation);
516| 0| if let Err(e) = crate::parsers::validate_relation_format(&rel.relation) {
517| 0| return Err(AppError::Validation(format!(
518| 0| "{e} for relationship '{}' -> '{}'",
519| 0| rel.source, rel.target
520| 0| )));
521| 0| }
522| 0| crate::parsers::warn_if_non_canonical(&rel.relation);
523| 0| if !(0.0..=1.0).contains(&rel.strength) {
524| 0| return Err(AppError::Validation(format!(
525| 0| "invalid strength {} for relationship '{}' -> '{}'; expected value in [0.0, 1.0]",
526| 0| rel.strength, rel.source, rel.target
527| 0| )));
528| 0| }
529| | }
530| |
531| 0| let body_hash = blake3::hash(raw_body.as_bytes()).to_hex().to_string();
532| 0| let snippet: String = raw_body.chars().take(200).collect();
533| |
534| 0| let tokenizer = crate::tokenizer::get_tokenizer(&paths.models)?;
535| 0| let chunks_info = chunking::split_into_chunks_hierarchical(&raw_body, tokenizer);
536| 0| if chunks_info.len() > REMEMBER_MAX_SAFE_MULTI_CHUNKS {
537| 0| return Err(AppError::LimitExceeded(format!(
538| 0| "document produces {} chunks; current safe operational limit is {} chunks; split the document before using remember",
539| 0| chunks_info.len(),
540| 0| REMEMBER_MAX_SAFE_MULTI_CHUNKS
541| 0| )));
542| 0| }
543| |
544| 0| let mut chunk_embeddings_opt: Option<Vec<Vec<f32>>> = None;
545| 0| let embedding = if chunks_info.len() == 1 {
546| 0| crate::daemon::embed_passage_or_local(&paths.models, &raw_body)?
547| | } else {
548| 0| let chunk_texts: Vec<&str> = chunks_info
549| 0| .iter()
550| 0| .map(|c| chunking::chunk_text(&raw_body, c))
551| 0| .collect();
552| 0| let embed_cap = chunk_texts.len();
553| 0| let mut chunk_embeddings = Vec::new();
554| 0| chunk_embeddings.try_reserve(embed_cap).map_err(|_| {
555| 0| AppError::LimitExceeded(format!(
556| 0| "allocation of {embed_cap} chunk embeddings would exceed available memory"
557| 0| ))
558| 0| })?;
559| 0| for chunk_text in &chunk_texts {
560| 0| if let Some(rss) = crate::memory_guard::current_process_memory_mb() {
561| 0| if rss > max_rss_mb {
562| 0| tracing::error!(
563| | target: "ingest",
564| | rss_mb = rss,
565| | max_rss_mb = max_rss_mb,
566| 0| file = %path.display(),
567| 0| "RSS exceeded --max-rss-mb threshold; aborting to prevent system instability"
568| | );
569| 0| return Err(AppError::LowMemory {
570| 0| available_mb: crate::memory_guard::available_memory_mb(),
571| 0| required_mb: max_rss_mb,
572| 0| });
573| 0| }
574| 0| }
575| 0| chunk_embeddings.push(crate::daemon::embed_passage_or_local(
576| 0| &paths.models,
577| 0| chunk_text,
578| 0| )?);
579| | }
580| 0| let aggregated = chunking::aggregate_embeddings(&chunk_embeddings);
581| 0| chunk_embeddings_opt = Some(chunk_embeddings);
582| 0| aggregated
583| | };
584| |
585| 0| let entity_embeddings = extracted_entities
586| 0| .iter()
587| 0| .map(|entity| {
588| 0| let entity_text = match &entity.description {
589| 0| Some(desc) => format!("{} {}", entity.name, desc),
590| 0| None => entity.name.clone(),
591| | };
592| 0| crate::daemon::embed_passage_or_local(&paths.models, &entity_text)
593| 0| })
594| 0| .collect::<Result<Vec<_>, _>>()?;
595| |
596| 0| Ok(StagedFile {
597| 0| body: raw_body,
598| 0| body_hash,
599| 0| snippet,
600| 0| name: name.to_string(),
601| 0| description,
602| 0| embedding,
603| 0| chunk_embeddings: chunk_embeddings_opt,
604| 0| chunks_info,
605| 0| entities: extracted_entities,
606| 0| relationships: extracted_relationships,
607| 0| entity_embeddings,
608| 0| urls: extracted_urls,
609| 0| })
610| 0|}
611| |
612| |/// Phase B: persists one `StagedFile` to the database on the main thread.
613| 0|fn persist_staged(
614| 0| conn: &mut Connection,
615| 0| namespace: &str,
616| 0| memory_type: &str,
617| 0| staged: StagedFile,
618| 0|) -> Result<FileSuccess, AppError> {
619| | {
620| 0| let active_count: u32 = conn.query_row(
621| 0| "SELECT COUNT(DISTINCT namespace) FROM memories WHERE deleted_at IS NULL",
622| 0| [],
623| 0| |r| r.get::<_, i64>(0).map(|v| v as u32),
624| 0| )?;
625| 0| let ns_exists: bool = conn.query_row(
626| 0| "SELECT EXISTS(SELECT 1 FROM memories WHERE namespace = ?1 AND deleted_at IS NULL)",
627| 0| rusqlite::params![namespace],
628| 0| |r| r.get::<_, i64>(0).map(|v| v > 0),
629| 0| )?;
630| 0| if !ns_exists && active_count >= crate::constants::MAX_NAMESPACES_ACTIVE {
631| 0| return Err(AppError::NamespaceError(format!(
632| 0| "active namespace limit of {} exceeded while creating '{namespace}'",
633| 0| crate::constants::MAX_NAMESPACES_ACTIVE
634| 0| )));
635| 0| }
636| | }
637| |
638| 0| let existing_memory = memories::find_by_name(conn, namespace, &staged.name)?;
639| 0| if existing_memory.is_some() {
640| 0| return Err(AppError::Duplicate(errors_msg::duplicate_memory(
641| 0| &staged.name,
642| 0| namespace,
643| 0| )));
644| 0| }
645| 0| let duplicate_hash_id = memories::find_by_hash(conn, namespace, &staged.body_hash)?;
646| |
647| 0| let new_memory = NewMemory {
648| 0| namespace: namespace.to_string(),
649| 0| name: staged.name.clone(),
650| 0| memory_type: memory_type.to_string(),
651| 0| description: staged.description.clone(),
652| 0| body: staged.body,
653| 0| body_hash: staged.body_hash,
654| 0| session_id: None,
655| 0| source: "agent".to_string(),
656| 0| metadata: serde_json::json!({}),
657| 0| };
658| |
659| 0| if let Some(hash_id) = duplicate_hash_id {
660| 0| tracing::debug!(
661| | target: "ingest",
662| | duplicate_memory_id = hash_id,
663| 0| "identical body already exists; persisting a new memory anyway"
664| | );
665| 0| }
666| |
667| 0| let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
668| |
669| 0| let memory_id = memories::insert(&tx, &new_memory)?;
670| 0| versions::insert_version(
671| 0| &tx,
672| 0| memory_id,
673| | 1,
674| 0| &staged.name,
675| 0| memory_type,
676| 0| &staged.description,
677| 0| &new_memory.body,
678| 0| &serde_json::to_string(&new_memory.metadata)?,
679| 0| None,
680| 0| "create",
681| 0| )?;
682| 0| memories::upsert_vec(
683| 0| &tx,
684| 0| memory_id,
685| 0| namespace,
686| 0| memory_type,
687| 0| &staged.embedding,
688| 0| &staged.name,
689| 0| &staged.snippet,
690| 0| )?;
691| |
692| 0| if staged.chunks_info.len() > 1 {
693| 0| storage_chunks::insert_chunk_slices(&tx, memory_id, &new_memory.body, &staged.chunks_info)?;
694| 0| let chunk_embeddings = staged.chunk_embeddings.ok_or_else(|| {
695| 0| AppError::Internal(anyhow::anyhow!(
696| 0| "missing chunk embeddings cache on multi-chunk ingest path"
697| 0| ))
698| 0| })?;
699| 0| for (i, emb) in chunk_embeddings.iter().enumerate() {
700| 0| storage_chunks::upsert_chunk_vec(&tx, i as i64, memory_id, i as i32, emb)?;
701| | }
702| 0| }
703| |
704| 0| if !staged.entities.is_empty() || !staged.relationships.is_empty() {
705| 0| for (idx, entity) in staged.entities.iter().enumerate() {
706| 0| let entity_id = entities::upsert_entity(&tx, namespace, entity)?;
707| 0| let entity_embedding = &staged.entity_embeddings[idx];
708| 0| entities::upsert_entity_vec(
709| 0| &tx,
710| 0| entity_id,
711| 0| namespace,
712| 0| entity.entity_type,
713| 0| entity_embedding,
714| 0| &entity.name,
715| 0| )?;
716| 0| entities::link_memory_entity(&tx, memory_id, entity_id)?;
717| 0| entities::increment_degree(&tx, entity_id)?;
718| | }
719| 0| let entity_types: std::collections::HashMap<&str, EntityType> = staged
720| 0| .entities
721| 0| .iter()
722| 0| .map(|entity| (entity.name.as_str(), entity.entity_type))
723| 0| .collect();
724| 0| for rel in &staged.relationships {
725| 0| let source_entity = NewEntity {
726| 0| name: rel.source.clone(),
727| 0| entity_type: entity_types
728| 0| .get(rel.source.as_str())
729| 0| .copied()
730| 0| .unwrap_or(EntityType::Concept),
731| 0| description: None,
732| 0| };
733| 0| let target_entity = NewEntity {
734| 0| name: rel.target.clone(),
735| 0| entity_type: entity_types
736| 0| .get(rel.target.as_str())
737| 0| .copied()
738| 0| .unwrap_or(EntityType::Concept),
739| 0| description: None,
740| 0| };
741| 0| let source_id = entities::upsert_entity(&tx, namespace, &source_entity)?;
742| 0| let target_id = entities::upsert_entity(&tx, namespace, &target_entity)?;
743| 0| let rel_id = entities::upsert_relationship(&tx, namespace, source_id, target_id, rel)?;
744| 0| entities::link_memory_relationship(&tx, memory_id, rel_id)?;
745| | }
746| 0| }
747| |
748| 0| tx.commit()?;
749| |
750| 0| if !staged.urls.is_empty() {
751| 0| let url_entries: Vec<storage_urls::MemoryUrl> = staged
752| 0| .urls
753| 0| .into_iter()
754| 0| .map(|u| storage_urls::MemoryUrl {
755| 0| url: u.url,
756| 0| offset: Some(u.offset as i64),
757| 0| })
758| 0| .collect();
759| 0| let _ = storage_urls::insert_urls(conn, memory_id, &url_entries);
760| 0| }
761| |
762| 0| Ok(FileSuccess {
763| 0| memory_id,
764| 0| action: "created".to_string(),
765| 0| body_length: new_memory.body.len(),
766| 0| })
767| 0|}
768| |
769| |#[tracing::instrument(skip_all, level = "debug", name = "ingest")]
770| 0|pub fn run(args: IngestArgs) -> Result<(), AppError> {
771| | // TODO(G20): add mode-conditional flag validation before DB access.
772| | // Flags that are silently discarded when the wrong mode is active:
773| | // --mode none/gliner: claude_binary, claude_model, claude_timeout,
774| | // max_cost_usd, rate_limit_wait, resume,
775| | // retry_failed, keep_queue, queue_db
776| | // --mode none/gliner: codex_binary, codex_model, codex_timeout
777| | // --mode claude-code: codex_binary, codex_model, codex_timeout
778| | // --mode codex: claude_binary, claude_model, claude_timeout,
779| | // max_cost_usd, rate_limit_wait
780| | // --mode none: gliner_variant (only meaningful with --enable-ner
781| | // or --mode gliner)
782| | // Approach: after the mode dispatch block below, check each non-default
783| | // flag value and return Err(AppError::Validation(...)) for mismatches.
784| 0| tracing::debug!(target: "ingest", dir = %args.dir.display(), mode = ?args.mode, "starting ingest");
785| 0| if args.mode == IngestMode::ClaudeCode {
786| 0| return super::ingest_claude::run_claude_ingest(&args);
787| 0| }
788| 0| if args.mode == IngestMode::Codex {
789| 0| return super::ingest_codex::run_codex_ingest(&args);
790| 0| }
791| |
792| 0| let started = std::time::Instant::now();
793| |
794| 0| if !args.dir.exists() {
795| 0| return Err(AppError::Validation(format!(
796| 0| "directory not found: {}",
797| 0| args.dir.display()
798| 0| )));
799| 0| }
800| 0| if !args.dir.is_dir() {
801| 0| return Err(AppError::Validation(format!(
802| 0| "path is not a directory: {}",
803| 0| args.dir.display()
804| 0| )));
805| 0| }
806| |
807| 0| let mut files: Vec<PathBuf> = Vec::with_capacity(128);
808| 0| collect_files(&args.dir, &args.pattern, args.recursive, &mut files)?;
809| 0| files.sort_unstable();
810| |
811| 0| if files.len() > args.max_files {
812| 0| return Err(AppError::Validation(format!(
813| 0| "found {} files matching pattern, exceeds --max-files cap of {} (raise the cap or narrow the pattern)",
814| 0| files.len(),
815| 0| args.max_files
816| 0| )));
817| 0| }
818| |
819| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
820| 0| let memory_type_str = args.r#type.as_str().to_string();
821| |
822| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
823| 0| let mut conn_or_err = match init_storage(&paths) {
824| 0| Ok(c) => Ok(c),
825| 0| Err(e) => Err(format!("{e}")),
826| | };
827| |
828| 0| let mut succeeded: usize = 0;
829| 0| let mut failed: usize = 0;
830| 0| let mut skipped: usize = 0;
831| 0| let total = files.len();
832| |
833| | // Pre-resolve all names before parallelisation so Phase A workers see a
834| | // consistent, immutable name assignment (v1.0.31 A10 contract preserved).
835| 0| let mut taken_names: BTreeSet<String> = BTreeSet::new();
836| |
837| | // SlotMeta: per-slot output metadata retained on the main thread for NDJSON.
838| | // ProcessItem: the data moved into the producer thread for Phase A computation.
839| | // We split these so `slots_meta` (non-Send BTreeSet-dependent) stays on main
840| | // thread while `process_items` (Send: only PathBuf + String) crosses the thread
841| | // boundary into the rayon producer.
842| | enum SlotMeta {
843| | Skip {
844| | file_str: String,
845| | derived_base: String,
846| | name_truncated: bool,
847| | original_name: Option<String>,
848| | original_filename: Option<String>,
849| | reason: String,
850| | },
851| | Process {
852| | file_str: String,
853| | derived_name: String,
854| | name_truncated: bool,
855| | original_name: Option<String>,
856| | original_filename: Option<String>,
857| | },
858| | }
859| |
860| | struct ProcessItem {
861| | idx: usize,
862| | path: PathBuf,
863| | file_str: String,
864| | derived_name: String,
865| | }
866| |
867| 0| let files_cap = files.len();
868| 0| let mut slots_meta: Vec<SlotMeta> = Vec::new();
869| 0| slots_meta.try_reserve(files_cap).map_err(|_| {
870| 0| AppError::LimitExceeded(format!(
871| 0| "allocation of {files_cap} slot metadata entries would exceed available memory"
872| 0| ))
873| 0| })?;
874| 0| let mut process_items: Vec<ProcessItem> = Vec::new();
875| 0| process_items.try_reserve(files_cap).map_err(|_| {
876| 0| AppError::LimitExceeded(format!(
877| 0| "allocation of {files_cap} process items would exceed available memory"
878| 0| ))
879| 0| })?;
880| 0| let mut truncations: Vec<(String, String)> = Vec::new();
881| 0| truncations.try_reserve(files_cap).map_err(|_| {
882| 0| AppError::LimitExceeded(format!(
883| 0| "allocation of {files_cap} truncation entries would exceed available memory"
884| 0| ))
885| 0| })?;
886| |
887| 0| let max_name_length = args.max_name_length;
888| 0| for path in &files {
889| 0| let file_str = path.to_string_lossy().into_owned();
890| 0| let (derived_base, name_truncated, original_name) =
891| 0| derive_kebab_name(path, max_name_length);
892| 0| let original_basename = path.file_stem().and_then(|s| s.to_str()).unwrap_or("");
893| |
894| 0| if name_truncated {
895| 0| if let Some(ref orig) = original_name {
896| 0| truncations.push((orig.clone(), derived_base.clone()));
897| 0| }
898| 0| }
899| |
900| 0| if derived_base.is_empty() {
901| | // original_filename: always include when it differs from the empty derived name
902| 0| let orig_filename = if !original_basename.is_empty() {
903| 0| Some(original_basename.to_string())
904| | } else {
905| 0| None
906| | };
907| 0| slots_meta.push(SlotMeta::Skip {
908| 0| file_str,
909| 0| derived_base: String::new(),
910| 0| name_truncated: false,
911| 0| original_name: None,
912| 0| original_filename: orig_filename,
913| 0| reason: "could not derive a non-empty kebab-case name from filename".to_string(),
914| 0| });
915| 0| continue;
916| 0| }
917| |
918| 0| match unique_name(&derived_base, &taken_names) {
919| 0| Ok(derived_name) => {
920| 0| taken_names.insert(derived_name.clone());
921| 0| let idx = slots_meta.len();
922| | // original_filename: present only when the raw basename differs from the derived name
923| 0| let orig_filename = if original_basename != derived_name {
924| 0| Some(original_basename.to_string())
925| | } else {
926| 0| None
927| | };
928| 0| process_items.push(ProcessItem {
929| 0| idx,
930| 0| path: path.clone(),
931| 0| file_str: file_str.clone(),
932| 0| derived_name: derived_name.clone(),
933| 0| });
934| 0| slots_meta.push(SlotMeta::Process {
935| 0| file_str,
936| 0| derived_name,
937| 0| name_truncated,
938| 0| original_name,
939| 0| original_filename: orig_filename,
940| 0| });
941| | }
942| 0| Err(e) => {
943| 0| let orig_filename = if original_basename != derived_base {
944| 0| Some(original_basename.to_string())
945| | } else {
946| 0| None
947| | };
948| 0| slots_meta.push(SlotMeta::Skip {
949| 0| file_str,
950| 0| derived_base,
951| 0| name_truncated,
952| 0| original_name,
953| 0| original_filename: orig_filename,
954| 0| reason: e.to_string(),
955| 0| });
956| | }
957| | }
958| | }
959| |
960| 0| if !truncations.is_empty() {
961| 0| tracing::info!(
962| | target: "ingest",
963| 0| count = truncations.len(),
964| | max_name_length = max_name_length,
965| | max_len = DERIVED_NAME_MAX_LEN,
966| 0| "derived names truncated; pass -vv (debug) for per-file detail"
967| | );
968| 0| }
969| |
970| | // --dry-run: emit preview events and exit before loading ONNX or touching DB.
971| 0| if args.dry_run {
972| 0| for meta in &slots_meta {
973| 0| match meta {
974| | SlotMeta::Skip {
975| 0| file_str,
976| 0| derived_base,
977| 0| name_truncated,
978| 0| original_name,
979| 0| original_filename,
980| 0| reason,
981| | } => {
982| 0| output::emit_json_compact(&IngestFileEvent {
983| 0| file: file_str,
984| 0| name: derived_base,
985| 0| status: "skip",
986| 0| truncated: *name_truncated,
987| 0| original_name: original_name.clone(),
988| 0| original_filename: original_filename.as_deref(),
989| 0| error: Some(reason.clone()),
990| 0| memory_id: None,
991| 0| action: None,
992| 0| body_length: 0,
993| 0| })?;
994| | }
995| | SlotMeta::Process {
996| 0| file_str,
997| 0| derived_name,
998| 0| name_truncated,
999| 0| original_name,
1000| 0| original_filename,
1001| | } => {
1002| 0| output::emit_json_compact(&IngestFileEvent {
1003| 0| file: file_str,
1004| 0| name: derived_name,
1005| 0| status: "preview",
1006| 0| truncated: *name_truncated,
1007| 0| original_name: original_name.clone(),
1008| 0| original_filename: original_filename.as_deref(),
1009| 0| error: None,
1010| 0| memory_id: None,
1011| 0| action: None,
1012| 0| body_length: 0,
1013| 0| })?;
1014| | }
1015| | }
1016| | }
1017| 0| output::emit_json_compact(&IngestSummary {
1018| 0| summary: true,
1019| 0| dir: args.dir.to_string_lossy().into_owned(),
1020| 0| pattern: args.pattern.clone(),
1021| 0| recursive: args.recursive,
1022| 0| files_total: total,
1023| 0| files_succeeded: 0,
1024| 0| files_failed: 0,
1025| 0| files_skipped: 0,
1026| 0| elapsed_ms: started.elapsed().as_millis() as u64,
1027| 0| })?;
1028| 0| return Ok(());
1029| 0| }
1030| |
1031| | // Reject contradictory flag combination: explicit parallelism > 1 with --low-memory.
1032| 0| if args.low_memory {
1033| 0| if let Some(n) = args.ingest_parallelism {
1034| 0| if n > 1 {
1035| 0| return Err(AppError::Validation(
1036| 0| "--ingest-parallelism N>1 conflicts with --low-memory; use one or the other"
1037| 0| .to_string(),
1038| 0| ));
1039| 0| }
1040| 0| }
1041| 0| }
1042| |
1043| | // Determine rayon thread pool size, honoring --low-memory and the
1044| | // SQLITE_GRAPHRAG_LOW_MEMORY env var (both force parallelism = 1).
1045| 0| let parallelism = resolve_parallelism(args.low_memory, args.ingest_parallelism);
1046| |
1047| 0| let pool = rayon::ThreadPoolBuilder::new()
1048| 0| .num_threads(parallelism)
1049| 0| .build()
1050| 0| .map_err(|e| AppError::Internal(anyhow::anyhow!("rayon pool: {e}")))?;
1051| |
1052| 0| if args.enable_ner && args.skip_extraction {
1053| 0| return Err(AppError::Validation(
1054| 0| "--enable-ner and --skip-extraction are mutually exclusive; remove one".to_string(),
1055| 0| ));
1056| 0| }
1057| 0| if args.skip_extraction && !args.enable_ner {
1058| 0| return Err(AppError::Validation(
1059| 0| "--skip-extraction is deprecated since v1.0.45 and has no effect; remove this flag"
1060| 0| .to_string(),
1061| 0| ));
1062| 0| }
1063| 0| let enable_ner = args.enable_ner;
1064| 0| let max_rss_mb = args.max_rss_mb;
1065| 0| let gliner_variant: crate::extraction::GlinerVariant =
1066| 0| args.gliner_variant.parse().unwrap_or_else(|e| {
1067| 0| tracing::warn!(target: "ingest", error = %e, "invalid --gliner-variant, defaulting to fp32");
1068| 0| crate::extraction::GlinerVariant::Fp32
1069| 0| });
1070| |
1071| 0| let total_to_process = process_items.len();
1072| 0| tracing::info!(
1073| | target: "ingest",
1074| | phase = "pipeline_start",
1075| | files = total_to_process,
1076| | ingest_parallelism = parallelism,
1077| 0| "incremental pipeline starting: Phase A (rayon) → channel → Phase B (main thread)",
1078| | );
1079| |
1080| | // Bounded channel: producer never gets more than parallelism*2 items ahead of
1081| | // the consumer, preventing memory blowup when Phase A is faster than Phase B.
1082| | // Each message carries the slot index so Phase B can look up SlotMeta in order.
1083| 0| let channel_bound = (parallelism * 2).max(1);
1084| 0| let (tx, rx) = mpsc::sync_channel::<(usize, Result<StagedFile, AppError>)>(channel_bound);
1085| |
1086| | // Phase A: launched in a dedicated OS thread so the main thread can consume
1087| | // the channel concurrently. pool.install() blocks the calling thread until
1088| | // all rayon workers finish — if called on the main thread it would
1089| | // reintroduce the 2-phase blocking behaviour we are eliminating.
1090| 0| let paths_owned = paths.clone();
1091| 0| let producer_handle = std::thread::spawn(move || {
1092| 0| pool.install(|| {
1093| 0| process_items.into_par_iter().for_each(|item| {
1094| 0| if crate::shutdown_requested() {
1095| 0| return;
1096| 0| }
1097| 0| let t0 = std::time::Instant::now();
1098| 0| let result = stage_file(
1099| 0| item.idx,
1100| 0| &item.path,
1101| 0| &item.derived_name,
1102| 0| &paths_owned,
1103| 0| enable_ner,
1104| 0| gliner_variant,
1105| 0| max_rss_mb,
1106| | );
1107| 0| let elapsed_ms = t0.elapsed().as_millis() as u64;
1108| |
1109| | // Emit NDJSON progress event to stderr so the user sees work
1110| | // happening during long NER runs (e.g. 50 files × 27s each).
1111| 0| let (n_entities, n_relationships) = match &result {
1112| 0| Ok(sf) => (sf.entities.len(), sf.relationships.len()),
1113| 0| Err(_) => (0, 0),
1114| | };
1115| 0| let progress = StageProgressEvent {
1116| 0| schema_version: 1,
1117| 0| event: "file_extracted",
1118| 0| path: &item.file_str,
1119| 0| ms: elapsed_ms,
1120| 0| entities: n_entities,
1121| 0| relationships: n_relationships,
1122| 0| };
1123| 0| if let Ok(line) = serde_json::to_string(&progress) {
1124| 0| tracing::info!(target: "ingest_progress", "{}", line);
1125| 0| }
1126| |
1127| | // Blocking send applies backpressure: if Phase B is slower,
1128| | // Phase A workers wait here instead of accumulating staged files
1129| | // in memory. If the receiver is dropped (fail_fast abort), ignore.
1130| 0| let _ = tx.send((item.idx, result));
1131| 0| });
1132| | // Explicit drop of tx signals Phase B (rx iteration) to stop.
1133| 0| drop(tx);
1134| 0| });
1135| 0| });
1136| |
1137| | // Phase B: main thread persists files as results arrive from the channel.
1138| | // Results arrive in completion order (par_iter is unordered). We persist
1139| | // each file immediately on arrival — this is the key fix for B1: with the
1140| | // old 2-phase design the first DB write happened only after ALL files had
1141| | // finished Phase A. Now the first commit happens as soon as the first file
1142| | // completes Phase A, regardless of how many files remain.
1143| | //
1144| | // NDJSON output order follows completion order (not file-system sort order).
1145| | // Skip slots are emitted at the end, after all Process results are consumed.
1146| | // This trade-off is intentional: deterministic NDJSON ordering is a lesser
1147| | // requirement than ensuring data is persisted before the user's timeout fires.
1148| 0| let fail_fast = args.fail_fast;
1149| |
1150| | // Emit pending Skip events first so agents see them early.
1151| 0| for meta in &slots_meta {
1152| | if let SlotMeta::Skip {
1153| 0| file_str,
1154| 0| derived_base,
1155| 0| name_truncated,
1156| 0| original_name,
1157| 0| original_filename,
1158| 0| reason,
1159| 0| } = meta
1160| | {
1161| 0| output::emit_json_compact(&IngestFileEvent {
1162| 0| file: file_str,
1163| 0| name: derived_base,
1164| 0| status: "skipped",
1165| 0| truncated: *name_truncated,
1166| 0| original_name: original_name.clone(),
1167| 0| original_filename: original_filename.as_deref(),
1168| 0| error: Some(reason.clone()),
1169| 0| memory_id: None,
1170| 0| action: None,
1171| 0| body_length: 0,
1172| 0| })?;
1173| 0| skipped += 1;
1174| 0| }
1175| | }
1176| |
1177| | // Build a quick index from slot index → SlotMeta reference for O(1) lookups
1178| | // as channel messages arrive in completion order.
1179| 0| let meta_index: std::collections::HashMap<usize, &SlotMeta> = slots_meta
1180| 0| .iter()
1181| 0| .enumerate()
1182| 0| .filter(|(_, m)| matches!(m, SlotMeta::Process { .. }))
1183| 0| .collect();
1184| |
1185| 0| tracing::info!(
1186| | target: "ingest",
1187| | phase = "persist_start",
1188| | files = total_to_process,
1189| 0| "phase B starting: persisting files incrementally as Phase A completes each one",
1190| | );
1191| |
1192| | // Drain channel and persist each file immediately — no accumulation into a
1193| | // HashMap. The bounded channel ensures Phase A cannot run too far ahead of
1194| | // Phase B without applying backpressure.
1195| 0| for (idx, stage_result) in rx {
1196| 0| if crate::shutdown_requested() {
1197| 0| tracing::info!(target: "ingest", "shutdown requested, stopping persistence loop");
1198| 0| break;
1199| 0| }
1200| 0| let meta = meta_index.get(&idx).ok_or_else(|| {
1201| 0| AppError::Internal(anyhow::anyhow!(
1202| 0| "channel idx {idx} has no corresponding Process slot"
1203| 0| ))
1204| 0| })?;
1205| 0| let (file_str, derived_name, name_truncated, original_name, original_filename) = match meta
1206| | {
1207| | SlotMeta::Process {
1208| 0| file_str,
1209| 0| derived_name,
1210| 0| name_truncated,
1211| 0| original_name,
1212| 0| original_filename,
1213| 0| } => (
1214| 0| file_str,
1215| 0| derived_name,
1216| 0| name_truncated,
1217| 0| original_name,
1218| 0| original_filename,
1219| 0| ),
1220| 0| SlotMeta::Skip { .. } => unreachable!("channel only carries Process results"),
1221| | };
1222| |
1223| | // If storage init failed, every file fails with the same error.
1224| 0| let conn = match conn_or_err.as_mut() {
1225| 0| Ok(c) => c,
1226| 0| Err(err_msg) => {
1227| 0| let err_clone = err_msg.clone();
1228| 0| output::emit_json_compact(&IngestFileEvent {
1229| 0| file: file_str,
1230| 0| name: derived_name,
1231| 0| status: "failed",
1232| 0| truncated: *name_truncated,
1233| 0| original_name: original_name.clone(),
1234| 0| original_filename: original_filename.as_deref(),
1235| 0| error: Some(err_clone.clone()),
1236| 0| memory_id: None,
1237| 0| action: None,
1238| 0| body_length: 0,
1239| 0| })?;
1240| 0| failed += 1;
1241| 0| if fail_fast {
1242| 0| output::emit_json_compact(&IngestSummary {
1243| 0| summary: true,
1244| 0| dir: args.dir.display().to_string(),
1245| 0| pattern: args.pattern.clone(),
1246| 0| recursive: args.recursive,
1247| 0| files_total: total,
1248| 0| files_succeeded: succeeded,
1249| 0| files_failed: failed,
1250| 0| files_skipped: skipped,
1251| 0| elapsed_ms: started.elapsed().as_millis() as u64,
1252| 0| })?;
1253| 0| return Err(AppError::Validation(format!(
1254| 0| "ingest aborted on first failure: {err_clone}"
1255| 0| )));
1256| 0| }
1257| 0| continue;
1258| | }
1259| | };
1260| |
1261| 0| let outcome =
1262| 0| stage_result.and_then(|sf| persist_staged(conn, &namespace, &memory_type_str, sf));
1263| |
1264| 0| match outcome {
1265| | Ok(FileSuccess {
1266| 0| memory_id,
1267| 0| action,
1268| 0| body_length,
1269| | }) => {
1270| 0| output::emit_json_compact(&IngestFileEvent {
1271| 0| file: file_str,
1272| 0| name: derived_name,
1273| 0| status: "indexed",
1274| 0| truncated: *name_truncated,
1275| 0| original_name: original_name.clone(),
1276| 0| original_filename: original_filename.as_deref(),
1277| 0| error: None,
1278| 0| memory_id: Some(memory_id),
1279| 0| action: Some(action),
1280| 0| body_length,
1281| 0| })?;
1282| 0| succeeded += 1;
1283| | }
1284| 0| Err(ref e) if matches!(e, AppError::Duplicate(_)) => {
1285| 0| output::emit_json_compact(&IngestFileEvent {
1286| 0| file: file_str,
1287| 0| name: derived_name,
1288| 0| status: "skipped",
1289| 0| truncated: *name_truncated,
1290| 0| original_name: original_name.clone(),
1291| 0| original_filename: original_filename.as_deref(),
1292| 0| error: Some(format!("{e}")),
1293| 0| memory_id: None,
1294| 0| action: Some("duplicate".to_string()),
1295| 0| body_length: 0,
1296| 0| })?;
1297| 0| skipped += 1;
1298| | }
1299| 0| Err(e) => {
1300| 0| let err_msg = format!("{e}");
1301| 0| output::emit_json_compact(&IngestFileEvent {
1302| 0| file: file_str,
1303| 0| name: derived_name,
1304| 0| status: "failed",
1305| 0| truncated: *name_truncated,
1306| 0| original_name: original_name.clone(),
1307| 0| original_filename: original_filename.as_deref(),
1308| 0| error: Some(err_msg.clone()),
1309| 0| memory_id: None,
1310| 0| action: None,
1311| 0| body_length: 0,
1312| 0| })?;
1313| 0| failed += 1;
1314| 0| if fail_fast {
1315| 0| output::emit_json_compact(&IngestSummary {
1316| 0| summary: true,
1317| 0| dir: args.dir.display().to_string(),
1318| 0| pattern: args.pattern.clone(),
1319| 0| recursive: args.recursive,
1320| 0| files_total: total,
1321| 0| files_succeeded: succeeded,
1322| 0| files_failed: failed,
1323| 0| files_skipped: skipped,
1324| 0| elapsed_ms: started.elapsed().as_millis() as u64,
1325| 0| })?;
1326| 0| return Err(AppError::Validation(format!(
1327| 0| "ingest aborted on first failure: {err_msg}"
1328| 0| )));
1329| 0| }
1330| | }
1331| | }
1332| | }
1333| |
1334| | // Wait for the producer thread to finish cleanly.
1335| 0| producer_handle
1336| 0| .join()
1337| 0| .map_err(|_| AppError::Internal(anyhow::anyhow!("ingest producer thread panicked")))?;
1338| |
1339| 0| if let Ok(ref conn) = conn_or_err {
1340| 0| if succeeded > 0 {
1341| 0| let _ = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);");
1342| 0| }
1343| 0| }
1344| |
1345| 0| output::emit_json_compact(&IngestSummary {
1346| 0| summary: true,
1347| 0| dir: args.dir.display().to_string(),
1348| 0| pattern: args.pattern.clone(),
1349| 0| recursive: args.recursive,
1350| 0| files_total: total,
1351| 0| files_succeeded: succeeded,
1352| 0| files_failed: failed,
1353| 0| files_skipped: skipped,
1354| 0| elapsed_ms: started.elapsed().as_millis() as u64,
1355| 0| })?;
1356| |
1357| 0| Ok(())
1358| 0|}
1359| |
1360| |/// Auto-initialises the database (matches the contract of every other CRUD
1361| |/// handler) and returns a fresh read/write connection ready for the ingest
1362| |/// loop. Errors here are recoverable per-file: the caller surfaces them as
1363| |/// failure events so `--fail-fast` and the continue-on-error path keep
1364| |/// working when, for example, the user points `--db` at an unwritable path.
1365| 0|fn init_storage(paths: &AppPaths) -> Result<Connection, AppError> {
1366| 0| ensure_db_ready(paths)?;
1367| 0| let conn = open_rw(&paths.db)?;
1368| 0| Ok(conn)
1369| 0|}
1370| |
1371| 4|pub(crate) fn collect_files(
1372| 4| dir: &Path,
1373| 4| pattern: &str,
1374| 4| recursive: bool,
1375| 4| out: &mut Vec<PathBuf>,
1376| 4|) -> Result<(), AppError> {
1377| 4| let entries = std::fs::read_dir(dir).map_err(AppError::Io)?;
^0
1378| 12| for entry in entries {
^8
1379| 8| let entry = entry.map_err(AppError::Io)?;
^0
1380| 8| let path = entry.path();
1381| 8| let file_type = entry.file_type().map_err(AppError::Io)?;
^0
1382| 8| if file_type.is_file() {
1383| 6| let name = entry.file_name();
1384| 6| let name_str = name.to_string_lossy();
1385| 6| if matches_pattern(&name_str, pattern) {
1386| 5| out.push(path);
1387| 5| }
^1
1388| 2| } else if file_type.is_dir() && recursive {
1389| 1| collect_files(&path, pattern, recursive, out)?;
^0
1390| 1| }
1391| | }
1392| 4| Ok(())
1393| 4|}
1394| |
1395| 13|fn matches_pattern(name: &str, pattern: &str) -> bool {
1396| 13| if let Some(suffix) = pattern.strip_prefix('*') {
^9
1397| 9| name.ends_with(suffix)
1398| 4| } else if let Some(prefix) = pattern.strip_suffix('*') {
^2
1399| 2| name.starts_with(prefix)
1400| | } else {
1401| 2| name == pattern
1402| | }
1403| 13|}
1404| |
1405| |/// Returns `(final_name, truncated, original_name)`.
1406| |/// `truncated` is true when the derived name exceeded `max_len`.
1407| |/// `original_name` holds the pre-truncation name only when `truncated=true`.
1408| |///
1409| |/// Non-ASCII characters are first decomposed via NFD and then stripped of
1410| |/// combining marks so accented letters fold to their base ASCII letter
1411| |/// (e.g. `acai` from accented input, `naive` from diaeresis). Characters with no ASCII
1412| |/// fallback (emoji, CJK ideographs, symbols) are dropped silently. This
1413| |/// preserves meaningful word content rather than collapsing the basename
1414| |/// to a few stray ASCII letters as the previous filter did.
1415| 11|pub(crate) fn derive_kebab_name(path: &Path, max_len: usize) -> (String, bool, Option<String>) {
1416| 11| let stem = path.file_stem().and_then(|s| s.to_str()).unwrap_or("");
1417| 11| let lowered: String = stem
1418| 11| .nfd()
1419| 285| .filter(|c| !unicode_normalization::char::is_combining_mark(*c))
^11
1420| 280| .map(|c| {
^11
1421| 280| if c == '_' || c.is_whitespace() {
^273^273
1422| 7| '-'
1423| | } else {
1424| 273| c
1425| | }
1426| 280| })
1427| 280| .map(|c| c.to_ascii_lowercase())
^11
1428| 280| .filter(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || *c == '-')
^11 ^17^17 ^17
1429| 11| .collect();
1430| 11| let collapsed = collapse_dashes(&lowered);
1431| 11| let trimmed_raw = collapsed.trim_matches('-').to_string();
1432| | // Prefix names that start with a digit to keep them valid kebab-case identifiers.
1433| 11| let trimmed = if trimmed_raw.starts_with(|c: char| c.is_ascii_digit()) {
^10^10
1434| 0| format!("doc-{trimmed_raw}")
1435| | } else {
1436| 11| trimmed_raw
1437| | };
1438| 11| if trimmed.len() > max_len {
1439| 2| let truncated = trimmed[..max_len].trim_matches('-').to_string();
1440| 2| tracing::debug!(
1441| | target: "ingest",
1442| | original = %trimmed,
1443| | truncated_to = %truncated,
1444| | max_len = max_len,
1445| 0| "derived memory name truncated to fit length cap; collisions will be resolved with numeric suffixes"
1446| | );
1447| 2| (truncated, true, Some(trimmed))
1448| | } else {
1449| 9| (trimmed, false, None)
1450| | }
1451| 11|}
1452| |
1453| |/// v1.0.31 A10: returns the first non-colliding kebab name by appending a
1454| |/// numeric suffix (`-1`, `-2`, …) when needed.
1455| |///
1456| |/// `taken` is the set of names already consumed in the current ingest run.
1457| |/// The caller is expected to insert the returned name into `taken` so the
1458| |/// next call observes the consumption. Cross-run collisions are intentionally
1459| |/// surfaced by the per-file persistence path as duplicates so re-ingestion
1460| |/// of identical corpora stays idempotent.
1461| |///
1462| |/// Returns `Err(AppError::Validation)` after `MAX_NAME_COLLISION_SUFFIX`
1463| |/// candidates collide, signalling a pathological corpus that should be
1464| |/// renamed manually.
1465| 3|fn unique_name(base: &str, taken: &BTreeSet<String>) -> Result<String, AppError> {
1466| 3| if !taken.contains(base) {
1467| 1| return Ok(base.to_string());
1468| 2| }
1469| 1.00k| for suffix in 1..=MAX_NAME_COLLISION_SUFFIX {
^1.00k
1470| 1.00k| let candidate = format!("{base}-{suffix}");
1471| 1.00k| if !taken.contains(&candidate) {
1472| 1| tracing::warn!(
1473| | target: "ingest",
1474| | base = %base,
1475| | resolved = %candidate,
1476| | suffix,
1477| 0| "memory name collision resolved with numeric suffix"
1478| | );
1479| 1| return Ok(candidate);
1480| 1.00k| }
1481| | }
1482| 1| Err(AppError::Validation(format!(
1483| 1| "too many name collisions for base '{base}' (>{MAX_NAME_COLLISION_SUFFIX}); rename source files to disambiguate"
1484| 1| )))
1485| 3|}
1486| |
1487| 11|fn collapse_dashes(s: &str) -> String {
1488| 11| let mut out = String::with_capacity(s.len());
1489| 11| let mut prev_dash = false;
1490| 272| for c in s.chars() {
^11^11
1491| 272| if c == '-' {
1492| 9| if !prev_dash {
1493| 6| out.push('-');
1494| 6| }
^3
1495| 9| prev_dash = true;
1496| 263| } else {
1497| 263| out.push(c);
1498| 263| prev_dash = false;
1499| 263| }
1500| | }
1501| 11| out
1502| 11|}
1503| |
1504| |#[cfg(test)]
1505| |mod tests {
1506| | use super::*;
1507| | use std::path::PathBuf;
1508| |
1509| | #[test]
1510| 1| fn matches_pattern_suffix() {
1511| 1| assert!(matches_pattern("foo.md", "*.md"));
1512| 1| assert!(!matches_pattern("foo.txt", "*.md"));
1513| 1| assert!(matches_pattern("foo.md", "*"));
1514| 1| }
1515| |
1516| | #[test]
1517| 1| fn matches_pattern_prefix() {
1518| 1| assert!(matches_pattern("README.md", "README*"));
1519| 1| assert!(!matches_pattern("CHANGELOG.md", "README*"));
1520| 1| }
1521| |
1522| | #[test]
1523| 1| fn matches_pattern_exact() {
1524| 1| assert!(matches_pattern("README.md", "README.md"));
1525| 1| assert!(!matches_pattern("readme.md", "README.md"));
1526| 1| }
1527| |
1528| | #[test]
1529| 1| fn derive_kebab_underscore_to_dash() {
1530| 1| let p = PathBuf::from("/tmp/claude_code_headless.md");
1531| 1| let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
1532| 1| assert_eq!(name, "claude-code-headless");
1533| 1| assert!(!truncated);
1534| 1| assert!(original.is_none());
1535| 1| }
1536| |
1537| | #[test]
1538| 1| fn derive_kebab_uppercase_lowered() {
1539| 1| let p = PathBuf::from("/tmp/README.md");
1540| 1| let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
1541| 1| assert_eq!(name, "readme");
1542| 1| assert!(!truncated);
1543| 1| assert!(original.is_none());
1544| 1| }
1545| |
1546| | #[test]
1547| 1| fn derive_kebab_strips_non_kebab_chars() {
1548| 1| let p = PathBuf::from("/tmp/some@weird#name!.md");
1549| 1| let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
1550| 1| assert_eq!(name, "someweirdname");
1551| 1| assert!(!truncated);
1552| 1| assert!(original.is_none());
1553| 1| }
1554| |
1555| | // Bug M-A3: NFD-based unicode normalization preserves base letters of
1556| | // accented characters instead of dropping them entirely.
1557| | #[test]
1558| 1| fn derive_kebab_folds_accented_letters_to_ascii() {
1559| 1| let p = PathBuf::from("/tmp/açaí.md");
1560| 1| let (name, _, _) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
1561| 1| assert_eq!(name, "acai", "got '{name}'");
^0
1562| 1| }
1563| |
1564| | #[test]
1565| 1| fn derive_kebab_handles_naive_with_diaeresis() {
1566| 1| let p = PathBuf::from("/tmp/naïve-test.md");
1567| 1| let (name, _, _) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
1568| 1| assert_eq!(name, "naive-test", "got '{name}'");
^0
1569| 1| }
1570| |
1571| | #[test]
1572| 1| fn derive_kebab_drops_emoji_keeps_word() {
1573| 1| let p = PathBuf::from("/tmp/🚀-rocket.md");
1574| 1| let (name, _, _) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
1575| 1| assert_eq!(name, "rocket", "got '{name}'");
^0
1576| 1| }
1577| |
1578| | #[test]
1579| 1| fn derive_kebab_mixed_unicode_emoji_keeps_letters() {
1580| 1| let p = PathBuf::from("/tmp/açaí🦜.md");
1581| 1| let (name, _, _) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
1582| 1| assert_eq!(name, "acai", "got '{name}'");
^0
1583| 1| }
1584| |
1585| | #[test]
1586| 1| fn derive_kebab_pure_emoji_yields_empty() {
1587| 1| let p = PathBuf::from("/tmp/🦜🚀🌟.md");
1588| 1| let (name, _, _) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
1589| 1| assert!(name.is_empty(), "got '{name}'");
^0
1590| 1| }
1591| |
1592| | #[test]
1593| 1| fn derive_kebab_collapses_consecutive_dashes() {
1594| 1| let p = PathBuf::from("/tmp/a__b___c.md");
1595| 1| let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
1596| 1| assert_eq!(name, "a-b-c");
1597| 1| assert!(!truncated);
1598| 1| assert!(original.is_none());
1599| 1| }
1600| |
1601| | #[test]
1602| 1| fn derive_kebab_truncates_to_60_chars() {
1603| 1| let p = PathBuf::from(format!("/tmp/{}.md", "a".repeat(80)));
1604| 1| let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
1605| 1| assert!(name.len() <= 60, "got len {}", name.len());
^0 ^0 ^0
1606| 1| assert!(truncated);
1607| 1| assert!(original.is_some());
1608| 1| assert!(original.unwrap().len() > 60);
1609| 1| }
1610| |
1611| | #[test]
1612| 1| fn collect_files_finds_md_files() {
1613| 1| let tmp = tempfile::tempdir().expect("tempdir");
1614| 1| std::fs::write(tmp.path().join("a.md"), "x").unwrap();
1615| 1| std::fs::write(tmp.path().join("b.md"), "y").unwrap();
1616| 1| std::fs::write(tmp.path().join("c.txt"), "z").unwrap();
1617| 1| let mut out = Vec::new();
1618| 1| collect_files(tmp.path(), "*.md", false, &mut out).expect("collect");
1619| 1| assert_eq!(out.len(), 2, "should find 2 .md files, got {out:?}");
^0
1620| 1| }
1621| |
1622| | #[test]
1623| 1| fn collect_files_recursive_descends_subdirs() {
1624| 1| let tmp = tempfile::tempdir().expect("tempdir");
1625| 1| let sub = tmp.path().join("sub");
1626| 1| std::fs::create_dir(&sub).unwrap();
1627| 1| std::fs::write(tmp.path().join("a.md"), "x").unwrap();
1628| 1| std::fs::write(sub.join("b.md"), "y").unwrap();
1629| 1| let mut out = Vec::new();
1630| 1| collect_files(tmp.path(), "*.md", true, &mut out).expect("collect");
1631| 1| assert_eq!(out.len(), 2);
1632| 1| }
1633| |
1634| | #[test]
1635| 1| fn collect_files_non_recursive_skips_subdirs() {
1636| 1| let tmp = tempfile::tempdir().expect("tempdir");
1637| 1| let sub = tmp.path().join("sub");
1638| 1| std::fs::create_dir(&sub).unwrap();
1639| 1| std::fs::write(tmp.path().join("a.md"), "x").unwrap();
1640| 1| std::fs::write(sub.join("b.md"), "y").unwrap();
1641| 1| let mut out = Vec::new();
1642| 1| collect_files(tmp.path(), "*.md", false, &mut out).expect("collect");
1643| 1| assert_eq!(out.len(), 1);
1644| 1| }
1645| |
1646| | // ── v1.0.31 A10: name truncation warns and collisions are auto-resolved ──
1647| |
1648| | #[test]
1649| 1| fn derive_kebab_long_basename_truncated_within_cap() {
1650| 1| let p = PathBuf::from(format!("/tmp/{}.md", "a".repeat(120)));
1651| 1| let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
1652| 1| assert!(
1653| 1| name.len() <= DERIVED_NAME_MAX_LEN,
1654| 0| "truncated name must respect cap; got {} chars",
1655| 0| name.len()
1656| | );
1657| 1| assert!(!name.is_empty());
1658| 1| assert!(truncated);
1659| 1| assert!(original.is_some());
1660| 1| }
1661| |
1662| | #[test]
1663| 1| fn unique_name_returns_base_when_free() {
1664| 1| let taken: BTreeSet<String> = BTreeSet::new();
1665| 1| let resolved = unique_name("note", &taken).expect("must resolve");
1666| 1| assert_eq!(resolved, "note");
1667| 1| }
1668| |
1669| | #[test]
1670| 1| fn unique_name_appends_first_free_suffix_on_collision() {
1671| 1| let mut taken: BTreeSet<String> = BTreeSet::new();
1672| 1| taken.insert("note".to_string());
1673| 1| taken.insert("note-1".to_string());
1674| 1| let resolved = unique_name("note", &taken).expect("must resolve");
1675| 1| assert_eq!(resolved, "note-2");
1676| 1| }
1677| |
1678| | #[test]
1679| 1| fn unique_name_errors_after_collision_cap() {
1680| 1| let mut taken: BTreeSet<String> = BTreeSet::new();
1681| 1| taken.insert("note".to_string());
1682| 1.00k| for i in 1..=MAX_NAME_COLLISION_SUFFIX {
^1.00k
1683| 1.00k| taken.insert(format!("note-{i}"));
1684| 1.00k| }
1685| 1| let err = unique_name("note", &taken).expect_err("must surface error");
1686| 1| assert!(matches!(err, AppError::Validation(_)));
^0
1687| 1| }
1688| |
1689| | // ── v1.0.32 Onda 4B: in-process pipeline validation ──
1690| |
1691| | #[test]
1692| 1| fn validate_relation_format_accepts_valid_relations() {
1693| | use crate::parsers::{is_canonical_relation, validate_relation_format};
1694| 1| assert!(validate_relation_format("applies_to").is_ok());
1695| 1| assert!(validate_relation_format("depends_on").is_ok());
1696| 1| assert!(validate_relation_format("implements").is_ok());
1697| 1| assert!(validate_relation_format("").is_err());
1698| 1| assert!(is_canonical_relation("applies_to"));
1699| 1| assert!(!is_canonical_relation("implements"));
1700| 1| }
1701| |
1702| | // ── v1.0.40 H-A1: --low-memory flag and SQLITE_GRAPHRAG_LOW_MEMORY env var ──
1703| |
1704| | use serial_test::serial;
1705| |
1706| | /// Helper: scrubs the env var around a closure to keep tests deterministic.
1707| 20| fn with_env_var<F: FnOnce()>(value: Option<&str>, f: F) {
1708| 20| let key = "SQLITE_GRAPHRAG_LOW_MEMORY";
1709| 20| let prev = std::env::var(key).ok();
1710| 20| match value {
1711| 16| Some(v) => std::env::set_var(key, v),
1712| 4| None => std::env::remove_var(key),
1713| | }
1714| 20| f();
1715| 20| match prev {
1716| 0| Some(p) => std::env::set_var(key, p),
1717| 20| None => std::env::remove_var(key),
1718| | }
1719| 20| }
1720| |
1721| | #[test]
1722| | #[serial]
1723| 1| fn env_low_memory_enabled_unset_returns_false() {
1724| 1| with_env_var(None, || assert!(!env_low_memory_enabled()));
1725| | }
1726| |
1727| | #[test]
1728| | #[serial]
1729| 1| fn env_low_memory_enabled_empty_returns_false() {
1730| 1| with_env_var(Some(""), || assert!(!env_low_memory_enabled()));
1731| | }
1732| |
1733| | #[test]
1734| | #[serial]
1735| 1| fn env_low_memory_enabled_truthy_values_return_true() {
1736| 7| for v in ["1", "true", "TRUE", "yes", "YES", "on", "On"] {
^1 ^1 ^1 ^1 ^1 ^1
1737| 7| with_env_var(Some(v), || {
1738| 7| assert!(env_low_memory_enabled(), "value {v:?} should be truthy")
^0
1739| 7| });
1740| | }
1741| | }
1742| |
1743| | #[test]
1744| | #[serial]
1745| 1| fn env_low_memory_enabled_falsy_values_return_false() {
1746| 5| for v in ["0", "false", "FALSE", "no", "off"] {
^1 ^1 ^1 ^1
1747| 5| with_env_var(Some(v), || {
1748| 5| assert!(!env_low_memory_enabled(), "value {v:?} should be falsy")
^0
1749| 5| });
1750| | }
1751| | }
1752| |
1753| | #[test]
1754| | #[serial]
1755| 1| fn env_low_memory_enabled_unrecognized_value_returns_false() {
1756| 1| with_env_var(Some("maybe"), || assert!(!env_low_memory_enabled()));
1757| | }
1758| |
1759| | #[test]
1760| | #[serial]
1761| 1| fn resolve_parallelism_flag_forces_one_overriding_explicit_value() {
1762| 1| with_env_var(None, || {
1763| 1| assert_eq!(resolve_parallelism(true, Some(4)), 1);
1764| 1| assert_eq!(resolve_parallelism(true, Some(8)), 1);
1765| 1| assert_eq!(resolve_parallelism(true, None), 1);
1766| 1| });
1767| | }
1768| |
1769| | #[test]
1770| | #[serial]
1771| 1| fn resolve_parallelism_env_forces_one_when_flag_off() {
1772| 1| with_env_var(Some("1"), || {
1773| 1| assert_eq!(resolve_parallelism(false, Some(4)), 1);
1774| 1| assert_eq!(resolve_parallelism(false, None), 1);
1775| 1| });
1776| | }
1777| |
1778| | #[test]
1779| | #[serial]
1780| 1| fn resolve_parallelism_falsy_env_does_not_override() {
1781| 1| with_env_var(Some("0"), || {
1782| 1| assert_eq!(resolve_parallelism(false, Some(4)), 4);
1783| 1| });
1784| | }
1785| |
1786| | #[test]
1787| | #[serial]
1788| 1| fn resolve_parallelism_explicit_value_when_low_memory_off() {
1789| 1| with_env_var(None, || {
1790| 1| assert_eq!(resolve_parallelism(false, Some(3)), 3);
1791| 1| assert_eq!(resolve_parallelism(false, Some(1)), 1);
1792| 1| });
1793| | }
1794| |
1795| | #[test]
1796| | #[serial]
1797| 1| fn resolve_parallelism_default_when_unset() {
1798| 1| with_env_var(None, || {
1799| 1| let p = resolve_parallelism(false, None);
1800| 1| assert!((1..=4).contains(&p), "default must be in [1, 4]; got {p}");
^0
1801| 1| });
1802| | }
1803| |
1804| | #[test]
1805| 1| fn ingest_args_parses_low_memory_flag_via_clap() {
1806| | use clap::Parser;
1807| | // Parse a synthetic Cli that contains the `ingest` subcommand. We rely
1808| | // on the public `Cli` definition so the flag is wired end-to-end.
1809| 1| let cli = crate::cli::Cli::try_parse_from([
1810| 1| "sqlite-graphrag",
1811| 1| "ingest",
1812| 1| "/tmp/dummy",
1813| 1| "--type",
1814| 1| "document",
1815| 1| "--low-memory",
1816| 1| ])
1817| 1| .expect("parse must succeed");
1818| 1| match cli.command {
1819| 1| crate::cli::Commands::Ingest(args) => {
1820| 1| assert!(args.low_memory, "--low-memory must set field to true");
^0
1821| | }
1822| 0| _ => panic!("expected Ingest subcommand"),
1823| | }
1824| 1| }
1825| |
1826| | #[test]
1827| 1| fn ingest_args_low_memory_defaults_false() {
1828| | use clap::Parser;
1829| 1| let cli = crate::cli::Cli::try_parse_from([
1830| 1| "sqlite-graphrag",
1831| 1| "ingest",
1832| 1| "/tmp/dummy",
1833| 1| "--type",
1834| 1| "document",
1835| 1| ])
1836| 1| .expect("parse must succeed");
1837| 1| match cli.command {
1838| 1| crate::cli::Commands::Ingest(args) => {
1839| 1| assert!(!args.low_memory, "default must be false");
^0
1840| | }
1841| 0| _ => panic!("expected Ingest subcommand"),
1842| | }
1843| 1| }
1844| |}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/ingest_claude.rs:
1| |//! Handler for `ingest --mode claude-code`.
2| |//!
3| |//! Orchestrates the locally installed Claude Code CLI binary (`claude -p`)
4| |//! to extract domain-specific entities and relationships from each file,
5| |//! then persists them via the same pipeline as `remember --graph-stdin`.
6| |//!
7| |//! Architecture: P1 One-Shot per file — each file spawns a separate
8| |//! `claude -p` process with `--json-schema` for guaranteed structured output.
9| |//! A SQLite queue DB tracks progress for resume/retry support.
10| |// Workload: Subprocess I/O-bound (claude -p headless with network wait)
11| |
12| |use crate::commands::ingest::IngestArgs;
13| |use crate::entity_type::EntityType;
14| |use crate::errors::AppError;
15| |use crate::paths::AppPaths;
16| |use crate::storage::connection::{ensure_db_ready, open_rw};
17| |use crate::storage::entities::{self, NewEntity, NewRelationship};
18| |use crate::storage::memories::{self, NewMemory};
19| |
20| |use rusqlite::Connection;
21| |use serde::{Deserialize, Serialize};
22| |use std::io::Write;
23| |use std::path::{Path, PathBuf};
24| |use std::process::{Command, Stdio};
25| |use std::time::Instant;
26| |
/// Lowest Claude Code version accepted by `validate_claude_version`
/// (compared as `major.minor.patch`).
const MIN_CLAUDE_VERSION: &str = "2.1.0";

/// JSON Schema handed to `claude -p` through `--json-schema`.
///
/// Its top-level shape (`name`, `description`, `entities`, `relationships`)
/// matches the fields of `ExtractionResult`, which is what the structured
/// output is deserialized into.
const EXTRACTION_SCHEMA: &str = r#"{
  "type": "object",
  "properties": {
    "name": { "type": "string" },
    "description": { "type": "string" },
    "entities": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": { "type": "string" },
          "entity_type": {
            "type": "string",
            "enum": ["project","tool","person","file","concept","incident","decision","organization","location","date"]
          }
        },
        "required": ["name", "entity_type"],
        "additionalProperties": false
      }
    },
    "relationships": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "source": { "type": "string" },
          "target": { "type": "string" },
          "relation": {
            "type": "string",
            "enum": ["applies-to","uses","depends-on","causes","fixes","contradicts","supports","follows","related","replaces","tracked-in"]
          },
          "strength": { "type": "number", "minimum": 0, "maximum": 1 }
        },
        "required": ["source","target","relation","strength"],
        "additionalProperties": false
      }
    }
  },
  "required": ["name","description","entities","relationships"],
  "additionalProperties": false
}"#;

/// Instruction text passed as the `-p` prompt argument; the document itself
/// is supplied separately on the child's stdin.
///
/// The relationship types listed in the rules are the same values as the
/// `relation` enum in `EXTRACTION_SCHEMA`.
const EXTRACTION_PROMPT: &str = "You are a knowledge graph entity extractor. Given a document, extract:\n\
1. A short kebab-case name (max 60 chars) capturing the document's main topic\n\
2. A one-sentence description (10-20 words) summarizing the key insight\n\
3. Domain-specific entities (concepts, tools, people, decisions, projects, files)\n\
4. Typed relationships between entities with strength scores\n\n\
Rules:\n\
- Entity names: lowercase kebab-case, 2+ chars, domain-specific only\n\
- NEVER extract generic terms, stop words, numbers, UUIDs, or single characters\n\
- Relationship types MUST be one of: applies-to, uses, depends-on, causes, fixes, contradicts, supports, follows, related, replaces, tracked-in\n\
- NEVER use 'mentions' as relationship type\n\
- Strength: 0.9 for hard dependencies, 0.7 for design relationships, 0.5 for contextual links, 0.3 for weak references\n\
- Prefer fewer high-quality entities over many low-quality ones\n\
- Description must answer: What is this about and WHY does it matter?";
84| |
/// One element of the JSON array printed by `claude -p --output-format json`.
///
/// Every field is optional (or defaulted), so elements of any `type`
/// deserialize into this single struct; callers pick the elements they need
/// by inspecting `type` / `subtype`.
#[derive(Debug, Deserialize)]
struct ClaudeOutputElement {
    /// Element discriminator; this module looks for `"system"` and `"result"`.
    r#type: Option<String>,
    /// Secondary discriminator; `"init"` together with type `"system"` marks
    /// the element whose `apiKeySource` is inspected.
    subtype: Option<String>,
    /// Whether the element reports a failure; `false` when the key is absent.
    #[serde(default)]
    is_error: bool,
    /// Extraction payload already deserialized into `ExtractionResult`.
    structured_output: Option<ExtractionResult>,
    /// Free-text result: used as the error message when `error` is absent,
    /// and as a JSON fallback when `structured_output` is absent.
    result: Option<String>,
    /// Cost reported by the run; treated as `0.0` when absent.
    total_cost_usd: Option<f64>,
    /// Error text; preferred over `result` when building error messages.
    error: Option<String>,
    /// Termination reason; `"max_turns"` gets a dedicated error on non-zero exit.
    terminal_reason: Option<String>,
    /// Read from the JSON key `apiKeySource`; the value `"none"` is
    /// interpreted as an OAuth session.
    #[serde(rename = "apiKeySource")]
    api_key_source: Option<String>,
}
99| |
/// Structured extraction produced by Claude Code for one document.
///
/// All four fields are mandatory when deserializing.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct ExtractionResult {
    /// Name proposed for the document; normalized again before it is stored.
    pub name: String,
    /// Description of the document, stored as the memory description.
    pub description: String,
    /// Entities found in the document.
    pub entities: Vec<ExtractedEntity>,
    /// Relationships between entities.
    pub relationships: Vec<ExtractedRelationship>,
}
107| |
/// A single entity as reported by the extractor, before type validation.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct ExtractedEntity {
    /// Entity name as returned by the model.
    pub name: String,
    /// Entity type as a raw string; it is parsed into `EntityType` later and
    /// entities whose type does not parse are dropped with a warning.
    pub entity_type: String,
}
113| |
/// A single relationship as reported by the extractor.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct ExtractedRelationship {
    /// Name of the source entity.
    pub source: String,
    /// Name of the target entity.
    pub target: String,
    /// Relation label as a raw string; normalized via
    /// `crate::parsers::normalize_relation` before storage.
    pub relation: String,
    /// Relationship strength (the extraction schema constrains it to 0..=1).
    pub strength: f64,
}
121| |
/// JSON progress event emitted once per pipeline phase.
///
/// Only `phase` is always serialized; every optional field is omitted from
/// the JSON when it is `None`, so each phase carries just its own details.
#[derive(Debug, Serialize)]
struct PhaseEvent<'a> {
    /// Phase label, e.g. `"validate"` or `"scan"`.
    phase: &'a str,
    /// Path of the resolved Claude Code binary (set by the validate phase).
    #[serde(skip_serializing_if = "Option::is_none")]
    claude_path: Option<&'a str>,
    /// Version string reported by the binary (set by the validate phase).
    #[serde(skip_serializing_if = "Option::is_none")]
    version: Option<&'a str>,
    /// Directory being ingested (set by the scan phase).
    #[serde(skip_serializing_if = "Option::is_none")]
    dir: Option<&'a str>,
    /// Number of files matched by the scan.
    #[serde(skip_serializing_if = "Option::is_none")]
    files_total: Option<usize>,
    /// Number of files newly inserted into the queue.
    #[serde(skip_serializing_if = "Option::is_none")]
    files_new: Option<usize>,
    /// Number of files that were already present in the queue.
    #[serde(skip_serializing_if = "Option::is_none")]
    files_existing: Option<usize>,
}
138| |
/// JSON event emitted for each file handled by the ingest run.
///
/// Optional fields are omitted from the JSON when `None`.
#[derive(Debug, Serialize)]
struct FileEvent<'a> {
    /// Path of the file this event refers to.
    file: &'a str,
    /// Memory name for the file; empty when no name was derived.
    name: &'a str,
    /// Outcome label, e.g. `"preview"`, `"failed"` or `"skipped"`.
    status: &'a str,
    /// Id of the stored memory, when one exists.
    #[serde(skip_serializing_if = "Option::is_none")]
    memory_id: Option<i64>,
    /// Number of entities extracted.
    #[serde(skip_serializing_if = "Option::is_none")]
    entities: Option<usize>,
    /// Number of relationships extracted.
    #[serde(skip_serializing_if = "Option::is_none")]
    rels: Option<usize>,
    /// Cost reported for this file.
    #[serde(skip_serializing_if = "Option::is_none")]
    cost_usd: Option<f64>,
    /// Time spent on this file, in milliseconds.
    #[serde(skip_serializing_if = "Option::is_none")]
    elapsed_ms: Option<u64>,
    /// Error message for failed or skipped files.
    #[serde(skip_serializing_if = "Option::is_none")]
    error: Option<&'a str>,
    /// Position counter of this event within the run.
    index: usize,
    /// Total number of files in the run.
    total: usize,
}
159| |
/// Final JSON record describing the whole ingest run.
#[derive(Debug, Serialize)]
struct Summary {
    /// Marker flag identifying this record as the summary; the dry-run path
    /// sets it to `true`.
    summary: bool,
    /// Number of files matched by the scan.
    files_total: usize,
    /// Number of files processed successfully.
    completed: usize,
    /// Number of files that failed.
    failed: usize,
    /// Number of files skipped.
    skipped: usize,
    /// Sum of extracted entities.
    entities_total: usize,
    /// Sum of extracted relationships.
    rels_total: usize,
    /// Accumulated cost for the run.
    cost_usd: f64,
    /// Wall-clock duration of the run, in milliseconds.
    elapsed_ms: u64,
}
172| |
173| |/// Locates the Claude Code binary on the system.
174| 1|pub fn find_claude_binary(explicit: Option<&Path>) -> Result<PathBuf, AppError> {
175| 1| if let Some(p) = explicit {
^0
176| 0| if p.exists() {
177| 0| return Ok(p.to_path_buf());
178| 0| }
179| 0| return Err(AppError::Validation(format!(
180| 0| "Claude Code binary not found at explicit path: {}",
181| 0| p.display()
182| 0| )));
183| 1| }
184| |
185| 1| if let Ok(env_path) = std::env::var("SQLITE_GRAPHRAG_CLAUDE_BINARY") {
^0
186| 0| let p = PathBuf::from(&env_path);
187| 0| if p.exists() {
188| 0| return Ok(p);
189| 0| }
190| 1| }
191| |
192| 1| let name = if cfg!(windows) {
193| 0| "claude.exe"
194| | } else {
195| 1| "claude"
196| | };
197| 1| if let Some(path_var) = std::env::var_os("PATH") {
198| 1| for dir in std::env::split_paths(&path_var) {
199| 1| let candidate = dir.join(name);
200| 1| if candidate.exists() {
201| 0| return Ok(candidate);
202| 1| }
203| | }
204| 0| }
205| |
206| 1| Err(AppError::Validation(
207| 1| "Claude Code binary not found in PATH. Install it from https://docs.anthropic.com/claude-code or specify --claude-binary".to_string(),
208| 1| ))
209| 1|}
210| |
211| |/// Validates that the Claude Code binary meets the minimum version.
212| 0|fn validate_claude_version(binary: &Path) -> Result<String, AppError> {
213| 0| let output = Command::new(binary)
214| 0| .arg("--version")
215| 0| .stdin(Stdio::null())
216| 0| .stdout(Stdio::piped())
217| 0| .stderr(Stdio::piped())
218| 0| .output()
219| 0| .map_err(AppError::Io)?;
220| |
221| 0| if !output.status.success() {
222| 0| return Err(AppError::Validation(
223| 0| "failed to run 'claude --version'".to_string(),
224| 0| ));
225| 0| }
226| |
227| 0| let version_str = String::from_utf8(output.stdout)
228| 0| .map_err(|_| AppError::Validation("claude --version output is not UTF-8".to_string()))?;
229| 0| let version = version_str.trim().to_string();
230| |
231| | // Extract the numeric version part before first space or paren, e.g. "2.1.149 (Claude Code)" -> "2.1.149"
232| 0| let numeric = version.split([' ', '(']).next().unwrap_or("").trim();
233| |
234| 0| fn parse_semver(s: &str) -> Option<(u64, u64, u64)> {
235| 0| let parts: Vec<&str> = s.splitn(3, '.').collect();
236| 0| if parts.len() < 2 {
237| 0| return None;
238| 0| }
239| 0| let major = parts[0].parse::<u64>().ok()?;
240| 0| let minor = parts[1].parse::<u64>().ok()?;
241| 0| let patch = parts
242| 0| .get(2)
243| 0| .and_then(|p| p.parse::<u64>().ok())
244| 0| .unwrap_or(0);
245| 0| Some((major, minor, patch))
246| 0| }
247| |
248| 0| if let (Some(actual), Some(min)) = (parse_semver(numeric), parse_semver(MIN_CLAUDE_VERSION)) {
249| 0| if actual < min {
250| 0| return Err(AppError::Validation(format!(
251| 0| "Claude Code version {numeric} is below minimum required {MIN_CLAUDE_VERSION}"
252| 0| )));
253| 0| }
254| 0| }
255| |
256| 0| Ok(version)
257| 0|}
258| |
259| |/// Invokes `claude -p` for a single file and returns the extraction result.
260| |///
261| |/// OAuth-only enforcement (gaps.md:41-49, v1.0.69 mandate):
262| |///
263| |/// - `wait-timeout` for cross-platform subprocess timeout.
264| |/// - `env_clear()` for least-privilege environment.
265| |/// - OAuth-only flow: NO `--bare` (PROHIBITED, gaps.md:49), no API-key path.
266| |/// - Mandatory hardening: `--strict-mcp-config --mcp-config '{}'` to zero
267| |/// MCP servers, and `--settings '{"hooks":{}}'` to disable hooks.
268| |/// - If `ANTHROPIC_API_KEY` is set in the environment we ABORT the spawn
269| |/// (return a `false` command with a violation marker) — API-key path is
270| |/// PROHIBITED in this project.
271| 0|fn extract_with_claude(
272| 0| binary: &Path,
273| 0| file_content: &[u8],
274| 0| model: Option<&str>,
275| 0| timeout_secs: u64,
276| 0|) -> Result<(ExtractionResult, f64, bool), AppError> {
277| | use wait_timeout::ChildExt;
278| |
279| | // OAuth-only guard (gaps.md:47). If `ANTHROPIC_API_KEY` is set in the
280| | // environment we MUST abort — that is the API-key path which is
281| | // explicitly PROHIBITED. Use the OAuth flow exclusively.
282| 0| if let Ok(_key) = std::env::var("ANTHROPIC_API_KEY") {
283| 0| let mut cmd = Command::new("false");
284| 0| cmd.env_clear();
285| 0| cmd.env("PATH", "/nonexistent");
286| 0| cmd.arg("--oauth-only-violation-anthropic-api-key-set");
287| 0| return Err(AppError::Validation(
288| 0| "ANTHROPIC_API_KEY is set in the environment; \
289| 0| sqlite-graphrag operates exclusively with OAuth (Pro/Max) and \
290| 0| the API-key path is PROHIBITED (gaps.md:47). Unset the variable \
291| 0| and re-run with `claude login` already completed in this session."
292| 0| .to_string(),
293| 0| ));
294| 0| }
295| |
296| 0| let mut cmd = Command::new(binary);
297| |
298| 0| cmd.env_clear();
299| 0| for var in &[
300| 0| "PATH",
301| 0| "HOME",
302| 0| "USER",
303| 0| "SHELL",
304| 0| "TERM",
305| 0| "LANG",
306| 0| "XDG_CONFIG_HOME",
307| 0| "XDG_DATA_HOME",
308| 0| "XDG_RUNTIME_DIR",
309| 0| // NOTE: `ANTHROPIC_API_KEY` is INTENTIONALLY ABSENT (gaps.md:47).
310| 0| "CLAUDE_CONFIG_DIR",
311| 0| "TMPDIR",
312| 0| "TMP",
313| 0| "TEMP",
314| 0| "DYLD_FALLBACK_LIBRARY_PATH",
315| 0| ] {
316| 0| if let Ok(val) = std::env::var(var) {
317| 0| cmd.env(var, val);
318| 0| }
319| | }
320| |
321| | #[cfg(windows)]
322| | for var in &[
323| | "LOCALAPPDATA",
324| | "APPDATA",
325| | "USERPROFILE",
326| | "SystemRoot",
327| | "COMSPEC",
328| | "PATHEXT",
329| | "HOMEPATH",
330| | "HOMEDRIVE",
331| | ] {
332| | if let Ok(val) = std::env::var(var) {
333| | cmd.env(var, val);
334| | }
335| | }
336| |
337| | // Canonical OAuth-only command line (gaps.md:201-208 + 211-213).
338| | // `--bare` is PROHIBITED (gaps.md:49) — never emitted.
339| 0| cmd.arg("-p")
340| 0| .arg(EXTRACTION_PROMPT)
341| 0| .arg("--strict-mcp-config")
342| 0| .arg("--mcp-config")
343| 0| .arg("{}")
344| 0| .arg("--dangerously-skip-permissions")
345| 0| .arg("--settings")
346| 0| .arg(r#"{"hooks":{}}"#)
347| 0| .arg("--output-format")
348| 0| .arg("json")
349| 0| .arg("--json-schema")
350| 0| .arg(EXTRACTION_SCHEMA)
351| 0| .arg("--max-turns")
352| 0| .arg("7")
353| 0| .arg("--no-session-persistence");
354| |
355| 0| if let Some(m) = model {
356| 0| cmd.arg("--model").arg(m);
357| 0| }
358| |
359| 0| cmd.stdin(Stdio::piped())
360| 0| .stdout(Stdio::piped())
361| 0| .stderr(Stdio::piped());
362| |
363| 0| let mut child = super::claude_runner::spawn_with_memory_limit(&mut cmd).map_err(|e| {
364| 0| AppError::Io(std::io::Error::new(
365| 0| e.kind(),
366| 0| format!("failed to spawn claude: {e}"),
367| 0| ))
368| 0| })?;
369| |
370| 0| let stdin_data = file_content.to_vec();
371| 0| let mut child_stdin = child
372| 0| .stdin
373| 0| .take()
374| 0| .ok_or_else(|| AppError::Validation("failed to open claude stdin".into()))?;
375| 0| let stdin_thread = std::thread::spawn(move || -> Result<(), std::io::Error> {
376| 0| child_stdin.write_all(&stdin_data)?;
377| 0| drop(child_stdin);
378| 0| Ok(())
379| 0| });
380| |
381| 0| let start = std::time::Instant::now();
382| 0| let timeout = std::time::Duration::from_secs(timeout_secs);
383| 0| let status = child.wait_timeout(timeout).map_err(AppError::Io)?;
384| |
385| 0| match status {
386| 0| Some(exit_status) => {
387| 0| stdin_thread
388| 0| .join()
389| 0| .map_err(|_| AppError::Validation("stdin thread panicked".into()))?
390| 0| .map_err(AppError::Io)?;
391| |
392| 0| tracing::debug!(
393| | target: "process",
394| 0| exit_code = ?exit_status.code(),
395| 0| elapsed_ms = start.elapsed().as_millis() as u64,
396| 0| "external process completed"
397| | );
398| |
399| 0| let mut stdout_buf = Vec::new();
400| 0| let mut stderr_buf = Vec::new();
401| 0| if let Some(mut out) = child.stdout.take() {
402| 0| std::io::Read::read_to_end(&mut out, &mut stdout_buf).map_err(AppError::Io)?;
403| 0| }
404| 0| if let Some(mut err) = child.stderr.take() {
405| 0| std::io::Read::read_to_end(&mut err, &mut stderr_buf).map_err(AppError::Io)?;
406| 0| }
407| |
408| 0| if !exit_status.success() {
409| 0| let stdout_str = String::from_utf8_lossy(&stdout_buf);
410| 0| if let Ok(elements) = serde_json::from_str::<Vec<ClaudeOutputElement>>(&stdout_str)
411| | {
412| 0| if let Some(re) = elements
413| 0| .iter()
414| 0| .find(|e| e.r#type.as_deref() == Some("result"))
415| | {
416| 0| if re.terminal_reason.as_deref() == Some("max_turns") {
417| 0| tracing::warn!(
418| | target: "ingest",
419| 0| "extraction hit max_turns limit — hooks may have consumed turns"
420| | );
421| 0| return Err(AppError::Validation(
422| 0| "claude -p hit max_turns: hooks may be consuming turns".into(),
423| 0| ));
424| 0| }
425| 0| if re.is_error {
426| 0| let err_msg = re
427| 0| .error
428| 0| .as_deref()
429| 0| .or(re.result.as_deref())
430| 0| .unwrap_or("unknown error");
431| 0| if err_msg.contains("rate_limit") || err_msg.contains("overloaded") {
432| 0| return Err(AppError::RateLimited {
433| 0| detail: err_msg.to_string(),
434| 0| });
435| 0| }
436| 0| if err_msg.contains("Not logged in")
437| 0| || err_msg.contains("authentication")
438| | {
439| 0| tracing::warn!(
440| | target: "ingest",
441| 0| "Claude Code authentication failed. Re-authenticate interactively with: claude"
442| | );
443| 0| }
444| 0| return Err(AppError::Validation(format!(
445| 0| "claude -p failed: {err_msg}"
446| 0| )));
447| 0| }
448| 0| }
449| 0| }
450| 0| let stderr_str = String::from_utf8_lossy(&stderr_buf);
451| 0| if stderr_str.contains("auth") || stderr_str.contains("login") {
452| 0| tracing::warn!(
453| | target: "ingest",
454| 0| "Claude Code authentication may have failed. Re-authenticate with: claude"
455| | );
456| 0| }
457| 0| return Err(AppError::Validation(format!(
458| 0| "claude -p exited with code {:?}: {}",
459| 0| exit_status.code(),
460| 0| stderr_str.trim()
461| 0| )));
462| 0| }
463| |
464| 0| let stdout = String::from_utf8(stdout_buf)
465| 0| .map_err(|_| AppError::Validation("claude -p stdout is not valid UTF-8".into()))?;
466| 0| parse_claude_output(&stdout)
467| | }
468| | None => {
469| 0| tracing::warn!(target: "ingest", timeout_secs, "claude -p timed out, killing process");
470| 0| let _ = child.kill();
471| 0| let _ = child.wait();
472| 0| let _ = stdin_thread.join();
473| 0| Err(AppError::Validation(format!(
474| 0| "claude -p timed out after {timeout_secs} seconds"
475| 0| )))
476| | }
477| | }
478| 0|}
479| |
480| |/// Parses the JSON array output from `claude -p --output-format json`.
481| |///
482| |/// Returns `(extraction, cost_usd, is_oauth)` where `is_oauth` is true when
483| |/// the init element reports `apiKeySource: "none"` (OAuth subscription).
484| 10|fn parse_claude_output(stdout: &str) -> Result<(ExtractionResult, f64, bool), AppError> {
485| 10| let elements: Vec<ClaudeOutputElement> = serde_json::from_str(stdout).map_err(|e| {
^9 ^9 ^1
486| 1| AppError::Validation(format!("failed to parse claude output as JSON array: {e}"))
487| 1| })?;
488| |
489| 9| let is_oauth = elements
490| 9| .iter()
491| 9| .find(|e| e.r#type.as_deref() == Some("system") && e.subtype.as_deref() == Some("init"))
492| 9| .and_then(|e| e.api_key_source.as_deref())
493| 9| .map(|s| s == "none")
^2 ^2
494| 9| .unwrap_or(false);
495| |
496| 9| let result_elem = elements
497| 9| .iter()
498| 19| .find(|e| e.r#type.as_deref() == Some("result"))
^9
499| 9| .ok_or_else(|| {
^0
500| 0| AppError::Validation("claude output missing 'result' element".to_string())
501| 0| })?;
502| |
503| 9| if result_elem.is_error {
504| 3| let err_msg = result_elem
505| 3| .error
506| 3| .as_deref()
507| 3| .or(result_elem.result.as_deref())
508| 3| .unwrap_or("unknown error");
509| 3| if err_msg.contains("rate_limit") || err_msg.contains("overloaded") {
^2 ^2
510| 1| return Err(AppError::RateLimited {
511| 1| detail: err_msg.to_string(),
512| 1| });
513| 2| }
514| 2| return Err(AppError::Validation(format!(
515| 2| "claude extraction failed: {err_msg}"
516| 2| )));
517| 6| }
518| |
519| 6| let extraction = result_elem
520| 6| .structured_output
521| 6| .clone()
522| 6| .or_else(|| {
^1
523| 1| result_elem
524| 1| .result
525| 1| .as_ref()
526| 1| .and_then(|text| serde_json::from_str::<ExtractionResult>(text).ok())
527| 1| })
528| 6| .ok_or_else(|| {
^0
529| 0| AppError::Validation("claude result missing structured_output and result field".into())
530| 0| })?;
531| |
532| 6| let cost = result_elem.total_cost_usd.unwrap_or(0.0);
533| |
534| 6| Ok((extraction, cost, is_oauth))
535| 10|}
536| |
537| |use crate::output::emit_json_line as emit_json;
538| |
539| |/// Collects files matching the pattern (reuses ingest logic).
540| 0|fn collect_matching_files(
541| 0| dir: &Path,
542| 0| pattern: &str,
543| 0| recursive: bool,
544| 0| max_files: usize,
545| 0|) -> Result<Vec<PathBuf>, AppError> {
546| 0| let mut files = Vec::new();
547| 0| super::ingest::collect_files(dir, pattern, recursive, &mut files)?;
548| 0| files.sort_unstable();
549| |
550| 0| if files.len() > max_files {
551| 0| return Err(AppError::Validation(format!(
552| 0| "found {} files, exceeds --max-files cap of {}",
553| 0| files.len(),
554| 0| max_files
555| 0| )));
556| 0| }
557| |
558| 0| Ok(files)
559| 0|}
560| |
561| |/// Opens or creates the queue database for tracking ingest progress.
562| 0|fn open_queue_db(path: &str) -> Result<Connection, AppError> {
563| 0| let conn = Connection::open(path)?;
564| |
565| 0| conn.pragma_update(None, "journal_mode", "wal")?;
566| |
567| 0| conn.execute_batch(
568| 0| "CREATE TABLE IF NOT EXISTS queue (
569| 0| id INTEGER PRIMARY KEY AUTOINCREMENT,
570| 0| file_path TEXT NOT NULL UNIQUE,
571| 0| name TEXT,
572| 0| status TEXT NOT NULL DEFAULT 'pending',
573| 0| memory_id INTEGER,
574| 0| entities INTEGER DEFAULT 0,
575| 0| rels INTEGER DEFAULT 0,
576| 0| error TEXT,
577| 0| cost_usd REAL DEFAULT 0.0,
578| 0| attempt INTEGER DEFAULT 0,
579| 0| elapsed_ms INTEGER,
580| 0| created_at TEXT DEFAULT (datetime('now')),
581| 0| done_at TEXT
582| 0| );
583| 0| CREATE INDEX IF NOT EXISTS idx_queue_status ON queue(status);",
584| 0| )?;
585| |
586| 0| Ok(conn)
587| 0|}
588| |
589| |/// Main entry point for `ingest --mode claude-code`.
590| 0|pub fn run_claude_ingest(args: &IngestArgs) -> Result<(), AppError> {
591| 0| let started = Instant::now();
592| |
593| 0| if !args.dir.exists() {
594| 0| return Err(AppError::Validation(format!(
595| 0| "directory not found: {}",
596| 0| args.dir.display()
597| 0| )));
598| 0| }
599| |
600| | // G28-B (v1.0.68) + G30 (v1.0.69): acquire singleton before doing real
601| | // work so two parallel `ingest --mode claude-code` invocations cannot
602| | // co-exist on the same database. Scope includes the database hash so
603| | // concurrent ingest against different databases is allowed.
604| 0| let early_ns = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
605| 0| let early_paths = AppPaths::resolve(args.db.as_deref())?;
606| 0| let _singleton = crate::lock::acquire_job_singleton(
607| 0| crate::lock::JobType::IngestClaudeCode,
608| 0| &early_ns,
609| 0| &early_paths.db,
610| 0| args.wait_job_singleton,
611| 0| args.force_job_singleton,
612| 0| )?;
613| |
614| | // Stage 1: Validate
615| 0| let claude_binary = find_claude_binary(args.claude_binary.as_deref())?;
616| 0| let version = validate_claude_version(&claude_binary)?;
617| 0| tracing::info!(
618| | target: "ingest",
619| 0| binary = %claude_binary.display(),
620| | version = %version,
621| 0| "Claude Code binary validated"
622| | );
623| |
624| 0| emit_json(&PhaseEvent {
625| 0| phase: "validate",
626| 0| claude_path: claude_binary.to_str(),
627| 0| version: Some(&version),
628| 0| dir: None,
629| 0| files_total: None,
630| 0| files_new: None,
631| 0| files_existing: None,
632| 0| });
633| |
634| | // Stage 2: Scan
635| 0| let files = collect_matching_files(&args.dir, &args.pattern, args.recursive, args.max_files)?;
636| |
637| 0| let queue_conn = open_queue_db(&args.queue_db)?;
638| |
639| 0| if args.resume {
640| 0| let reset = queue_conn
641| 0| .execute(
642| 0| "UPDATE queue SET status='pending' WHERE status='processing'",
643| 0| [],
644| | )
645| 0| .map_err(|e| AppError::Validation(format!("queue resume failed: {e}")))?;
646| 0| if reset > 0 {
647| 0| tracing::info!(target: "ingest", count = reset, "reset stuck processing files to pending");
648| 0| }
649| 0| }
650| |
651| 0| if args.retry_failed {
652| 0| let count = queue_conn
653| 0| .execute(
654| 0| "UPDATE queue SET status='pending', attempt=0 WHERE status='failed'",
655| 0| [],
656| | )
657| 0| .map_err(|e| AppError::Validation(format!("queue retry-failed reset failed: {e}")))?;
658| 0| tracing::info!(target: "ingest", count, "retrying failed files");
659| 0| }
660| |
661| 0| if !args.resume && !args.retry_failed {
662| 0| queue_conn
663| 0| .execute("DELETE FROM queue", [])
664| 0| .map_err(|e| AppError::Validation(format!("queue clear failed: {e}")))?;
665| 0| }
666| |
667| 0| let mut new_count = 0usize;
668| 0| let mut existing_count = 0usize;
669| |
670| 0| if !args.retry_failed {
671| 0| for file in &files {
672| 0| let file_str = file.to_string_lossy().into_owned();
673| 0| let inserted = queue_conn
674| 0| .execute(
675| 0| "INSERT OR IGNORE INTO queue (file_path, status) VALUES (?1, 'pending')",
676| 0| rusqlite::params![file_str],
677| | )
678| 0| .map_err(|e| AppError::Validation(format!("queue insert failed: {e}")))?;
679| 0| if inserted > 0 {
680| 0| new_count += 1;
681| 0| } else {
682| 0| existing_count += 1;
683| 0| }
684| | }
685| 0| }
686| |
687| 0| emit_json(&PhaseEvent {
688| 0| phase: "scan",
689| 0| claude_path: None,
690| 0| version: None,
691| 0| dir: args.dir.to_str(),
692| 0| files_total: Some(files.len()),
693| 0| files_new: Some(new_count),
694| 0| files_existing: Some(existing_count),
695| 0| });
696| |
697| 0| if args.dry_run {
698| 0| for (idx, file) in files.iter().enumerate() {
699| 0| let (name, _truncated, _orig) =
700| 0| super::ingest::derive_kebab_name(file, args.max_name_length);
701| 0| emit_json(&FileEvent {
702| 0| file: &file.to_string_lossy(),
703| 0| name: &name,
704| 0| status: "preview",
705| 0| memory_id: None,
706| 0| entities: None,
707| 0| rels: None,
708| 0| cost_usd: None,
709| 0| elapsed_ms: None,
710| 0| error: None,
711| 0| index: idx,
712| 0| total: files.len(),
713| 0| });
714| 0| }
715| 0| emit_json(&Summary {
716| 0| summary: true,
717| 0| files_total: files.len(),
718| 0| completed: 0,
719| 0| failed: 0,
720| 0| skipped: 0,
721| 0| entities_total: 0,
722| 0| rels_total: 0,
723| 0| cost_usd: 0.0,
724| 0| elapsed_ms: started.elapsed().as_millis() as u64,
725| 0| });
726| 0| if !args.keep_queue {
727| 0| let _ = std::fs::remove_file(&args.queue_db);
728| 0| }
729| 0| return Ok(());
730| 0| }
731| |
732| | // Stage 3: Process
733| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
734| 0| ensure_db_ready(&paths)?;
735| 0| let conn = open_rw(&paths.db)?;
736| 0| let tokenizer = crate::tokenizer::get_tokenizer(&paths.models)?;
737| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
738| 0| let memory_type_str = args.r#type.as_str().to_string();
739| |
740| 0| let mut completed = 0usize;
741| 0| let mut failed = 0usize;
742| 0| let skipped_initial: usize = queue_conn
743| 0| .query_row("SELECT COUNT(*) FROM queue WHERE status='done'", [], |r| {
744| 0| r.get::<_, usize>(0)
745| 0| })
746| 0| .unwrap_or(0);
747| 0| let mut skipped = skipped_initial;
748| 0| let mut entities_total = 0usize;
749| 0| let mut rels_total = 0usize;
750| 0| let mut cost_total = 0.0f64;
751| 0| let mut oauth_detected = false;
752| 0| let total = files.len();
753| |
754| 0| let mut backoff_secs = args.rate_limit_wait;
755| 0| let rate_limit_deadline = std::time::Instant::now() + std::time::Duration::from_secs(3600);
756| |
757| | loop {
758| 0| if crate::shutdown_requested() {
759| 0| tracing::info!(target: "ingest", "shutdown requested, stopping before next file");
760| 0| break;
761| 0| }
762| |
763| 0| let pending: Option<(i64, String)> = queue_conn
764| 0| .query_row(
765| 0| "UPDATE queue SET status='processing', attempt=attempt+1 \
766| 0| WHERE id = (SELECT id FROM queue WHERE status='pending' ORDER BY id LIMIT 1) \
767| 0| RETURNING id, file_path",
768| 0| [],
769| 0| |row| Ok((row.get(0)?, row.get(1)?)),
770| | )
771| 0| .ok();
772| |
773| 0| let (queue_id, file_path) = match pending {
774| 0| Some(p) => p,
775| 0| None => break,
776| | };
777| |
778| 0| let file_started = Instant::now();
779| |
780| | // G05: reject files that exceed the 10 MB stdin limit
781| | const MAX_FILE_SIZE: u64 = 10 * 1024 * 1024;
782| 0| if let Ok(meta) = std::fs::metadata(&file_path) {
783| 0| if meta.len() > MAX_FILE_SIZE {
784| 0| let err_msg = format!("file exceeds 10MB stdin limit ({} bytes)", meta.len());
785| 0| let _ = queue_conn.execute(
786| 0| "UPDATE queue SET status='failed', error=?1, done_at=datetime('now') WHERE id=?2",
787| 0| rusqlite::params![err_msg, queue_id],
788| 0| );
789| 0| let current_index = completed + failed + skipped;
790| 0| failed += 1;
791| 0| emit_json(&FileEvent {
792| 0| file: &file_path,
793| 0| name: "",
794| 0| status: "failed",
795| 0| memory_id: None,
796| 0| entities: None,
797| 0| rels: None,
798| 0| cost_usd: None,
799| 0| elapsed_ms: Some(file_started.elapsed().as_millis() as u64),
800| 0| error: Some(&err_msg),
801| 0| index: current_index,
802| 0| total,
803| 0| });
804| 0| if args.fail_fast {
805| 0| break;
806| 0| }
807| 0| continue;
808| 0| }
809| 0| }
810| |
811| 0| let file_content = match std::fs::read(&file_path) {
812| 0| Ok(c) => c,
813| 0| Err(e) => {
814| 0| let err_msg = format!("IO error: {e}");
815| 0| let _ = queue_conn.execute(
816| 0| "UPDATE queue SET status='failed', error=?1, done_at=datetime('now') WHERE id=?2",
817| 0| rusqlite::params![err_msg, queue_id],
818| 0| );
819| 0| let current_index = completed + failed + skipped;
820| 0| failed += 1;
821| 0| emit_json(&FileEvent {
822| 0| file: &file_path,
823| 0| name: "",
824| 0| status: "failed",
825| 0| memory_id: None,
826| 0| entities: None,
827| 0| rels: None,
828| 0| cost_usd: None,
829| 0| elapsed_ms: Some(file_started.elapsed().as_millis() as u64),
830| 0| error: Some(&err_msg),
831| 0| index: current_index,
832| 0| total,
833| 0| });
834| 0| if args.fail_fast {
835| 0| break;
836| 0| }
837| 0| continue;
838| | }
839| | };
840| |
841| | // B08: skip files exceeding body cap BEFORE sending to LLM to avoid wasting tokens
842| 0| if file_content.len() > crate::constants::MAX_MEMORY_BODY_LEN {
843| 0| let err_msg = format!(
844| 0| "file body exceeds {} byte limit ({} bytes) — skipping to avoid wasting LLM tokens",
845| | crate::constants::MAX_MEMORY_BODY_LEN,
846| 0| file_content.len()
847| | );
848| 0| tracing::warn!(target: "ingest", file = %file_path, size = file_content.len(), "body exceeds limit, skipping LLM extraction");
849| 0| let _ = queue_conn.execute(
850| 0| "UPDATE queue SET status='skipped', error=?1, done_at=datetime('now') WHERE id=?2",
851| 0| rusqlite::params![err_msg, queue_id],
852| 0| );
853| 0| let current_index = completed + failed + skipped;
854| 0| skipped += 1;
855| 0| emit_json(&FileEvent {
856| 0| file: &file_path,
857| 0| name: "",
858| 0| status: "skipped",
859| 0| memory_id: None,
860| 0| entities: None,
861| 0| rels: None,
862| 0| cost_usd: None,
863| 0| elapsed_ms: Some(file_started.elapsed().as_millis() as u64),
864| 0| error: Some(&err_msg),
865| 0| index: current_index,
866| 0| total,
867| 0| });
868| 0| continue;
869| 0| }
870| |
871| | // B07: retry once on cold-start failure (Claude Code Issue #23265)
872| 0| let max_extract_attempts: u32 = 2;
873| 0| let mut extraction_result: Option<(ExtractionResult, f64, bool)> = None;
874| 0| let mut last_extract_err: Option<String> = None;
875| 0| let mut last_was_rate_limited = false;
876| |
877| 0| for attempt in 1..=max_extract_attempts {
878| 0| match extract_with_claude(
879| 0| &claude_binary,
880| 0| &file_content,
881| 0| args.claude_model.as_deref(),
882| 0| args.claude_timeout,
883| | ) {
884| 0| Ok(result) => {
885| 0| extraction_result = Some(result);
886| 0| break;
887| | }
888| 0| Err(ref e) if matches!(e, AppError::RateLimited { .. }) => {
889| 0| last_extract_err = Some(format!("{e}"));
890| 0| last_was_rate_limited = true;
891| 0| break;
892| | }
893| 0| Err(e) => {
894| 0| let msg = format!("{e}");
895| 0| if attempt < max_extract_attempts {
896| 0| let cold_start_delay = 2 * attempt as u64;
897| 0| tracing::warn!(target: "ingest", attempt, delay_secs = cold_start_delay, error = %msg, "extraction failed, retrying (cold-start workaround)");
898| 0| std::thread::sleep(std::time::Duration::from_secs(cold_start_delay));
899| 0| }
900| 0| last_extract_err = Some(msg);
901| | }
902| | }
903| | }
904| |
905| 0| if let Some((extraction, cost, is_oauth)) = extraction_result {
906| 0| if is_oauth && !oauth_detected {
907| 0| oauth_detected = true;
908| 0| tracing::info!(target: "ingest", "OAuth subscription detected — cost_usd omitted from output");
909| 0| }
910| 0| backoff_secs = args.rate_limit_wait;
911| |
912| 0| let (normalized_name, _truncated, _orig) = crate::commands::ingest::derive_kebab_name(
913| 0| std::path::Path::new(&extraction.name),
914| 0| args.max_name_length,
915| 0| );
916| 0| let name = &normalized_name;
917| 0| let ent_count = extraction.entities.len();
918| 0| let rel_count = extraction.relationships.len();
919| |
920| 0| let new_entities: Vec<NewEntity> = extraction
921| 0| .entities
922| 0| .iter()
923| 0| .filter_map(|e| match e.entity_type.parse::<EntityType>() {
924| 0| Ok(et) => Some(NewEntity {
925| 0| name: e.name.clone(),
926| 0| entity_type: et,
927| 0| description: None,
928| 0| }),
929| | Err(_) => {
930| 0| tracing::warn!(
931| | target: "ingest",
932| | entity = %e.name,
933| | entity_type = %e.entity_type,
934| 0| "entity type not recognized, skipping"
935| | );
936| 0| None
937| | }
938| 0| })
939| 0| .collect();
940| |
941| 0| let new_relationships: Vec<NewRelationship> = extraction
942| 0| .relationships
943| 0| .iter()
944| 0| .map(|r| NewRelationship {
945| 0| source: r.source.clone(),
946| 0| target: r.target.clone(),
947| 0| relation: crate::parsers::normalize_relation(&r.relation),
948| 0| strength: r.strength,
949| 0| description: None,
950| 0| })
951| 0| .collect();
952| |
953| 0| let body_str = String::from_utf8_lossy(&file_content);
954| 0| let body_hash = blake3::hash(body_str.as_bytes()).to_hex().to_string();
955| 0| let new_memory = NewMemory {
956| 0| name: name.clone(),
957| 0| namespace: namespace.clone(),
958| 0| memory_type: memory_type_str.clone(),
959| 0| description: extraction.description.clone(),
960| 0| body: body_str.to_string(),
961| 0| body_hash,
962| 0| session_id: None,
963| 0| source: "agent".to_string(),
964| 0| metadata: serde_json::Value::Object(serde_json::Map::new()),
965| 0| };
966| |
967| | // B06: deduplication — update existing memory instead of failing on UNIQUE
968| 0| let memory_id = match memories::find_by_name_any_state(&conn, &namespace, name)? {
969| 0| Some((existing_id, is_deleted)) => {
970| 0| if is_deleted {
971| 0| memories::clear_deleted_at(&conn, existing_id)?;
972| 0| }
973| 0| let (old_name, old_desc, old_body): (String, String, String) = conn.query_row(
974| 0| "SELECT name, COALESCE(description,''), COALESCE(body,'') FROM memories WHERE id=?1",
975| 0| rusqlite::params![existing_id],
976| 0| |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?)),
977| 0| )?;
978| 0| memories::update(&conn, existing_id, &new_memory, None)?;
979| 0| memories::sync_fts_after_update(
980| 0| &conn,
981| 0| existing_id,
982| 0| &old_name,
983| 0| &old_desc,
984| 0| &old_body,
985| 0| &new_memory.name,
986| 0| &new_memory.description,
987| 0| &new_memory.body,
988| 0| )?;
989| 0| tracing::info!(target: "ingest", name, memory_id = existing_id, "updated existing memory (force-merge)");
990| 0| existing_id
991| | }
992| 0| None => match memories::insert(&conn, &new_memory) {
993| 0| Ok(id) => id,
994| 0| Err(e) => {
995| 0| let err_msg = format!("{e}");
996| 0| let _ = queue_conn.execute(
997| 0| "UPDATE queue SET status='failed', error=?1, done_at=datetime('now') WHERE id=?2",
998| 0| rusqlite::params![err_msg, queue_id],
999| 0| );
1000| 0| let current_index = completed + failed + skipped;
1001| 0| failed += 1;
1002| 0| emit_json(&FileEvent {
1003| 0| file: &file_path,
1004| 0| name,
1005| 0| status: "failed",
1006| 0| memory_id: None,
1007| 0| entities: None,
1008| 0| rels: None,
1009| 0| cost_usd: if is_oauth { None } else { Some(cost) },
1010| 0| elapsed_ms: Some(file_started.elapsed().as_millis() as u64),
1011| 0| error: Some(&err_msg),
1012| 0| index: current_index,
1013| 0| total,
1014| | });
1015| 0| if !is_oauth {
1016| 0| cost_total += cost;
1017| 0| }
1018| 0| if args.fail_fast {
1019| 0| break;
1020| 0| }
1021| 0| continue;
1022| | }
1023| | },
1024| | };
1025| |
1026| 0| for ent in &new_entities {
1027| 0| match entities::upsert_entity(&conn, &namespace, ent) {
1028| 0| Ok(eid) => {
1029| 0| let _ = entities::link_memory_entity(&conn, memory_id, eid);
1030| 0| }
1031| 0| Err(e) => {
1032| 0| tracing::warn!(
1033| | target: "ingest",
1034| | entity = %ent.name,
1035| | error = %e,
1036| 0| "entity skipped due to validation error"
1037| | );
1038| | }
1039| | }
1040| | }
1041| 0| for rel in &new_relationships {
1042| 0| crate::parsers::warn_if_non_canonical(&rel.relation);
1043| 0| let src_id = entities::find_entity_id(&conn, &namespace, &rel.source);
1044| 0| let tgt_id = entities::find_entity_id(&conn, &namespace, &rel.target);
1045| 0| if let (Ok(Some(sid)), Ok(Some(tid))) = (src_id, tgt_id) {
1046| 0| let _ = conn.execute(
1047| 0| "INSERT OR IGNORE INTO relationships (namespace, source_id, target_id, relation, weight) VALUES (?1, ?2, ?3, ?4, ?5)",
1048| 0| rusqlite::params![namespace, sid, tid, rel.relation, rel.strength],
1049| 0| );
1050| 0| }
1051| | }
1052| |
1053| | // G01: embedding pipeline — enables recall to find memories created via --mode claude-code
1054| 0| let body_text = String::from_utf8_lossy(&file_content).into_owned();
1055| 0| let snippet: String = body_text.chars().take(200).collect();
1056| 0| let chunks_info =
1057| 0| crate::chunking::split_into_chunks_hierarchical(&body_text, tokenizer);
1058| |
1059| 0| let embedding_result = if chunks_info.len() <= 1 {
1060| 0| crate::daemon::embed_passage_or_local(&paths.models, &body_text)
1061| | } else {
1062| 0| let mut chunk_embeddings: Vec<Vec<f32>> = Vec::with_capacity(chunks_info.len());
1063| 0| let mut multi_ok = true;
1064| 0| for chunk in &chunks_info {
1065| 0| let chunk_text = crate::chunking::chunk_text(&body_text, chunk);
1066| 0| match crate::daemon::embed_passage_or_local(&paths.models, chunk_text) {
1067| 0| Ok(emb) => chunk_embeddings.push(emb),
1068| 0| Err(e) => {
1069| 0| tracing::warn!(
1070| | target: "ingest",
1071| | file = %file_path,
1072| | error = %e,
1073| 0| "chunk embedding failed, skipping vector index for this file"
1074| | );
1075| 0| multi_ok = false;
1076| 0| break;
1077| | }
1078| | }
1079| | }
1080| 0| if multi_ok {
1081| 0| let aggregated = crate::chunking::aggregate_embeddings(&chunk_embeddings);
1082| | // persist per-chunk vectors
1083| 0| if let Err(e) = crate::storage::chunks::insert_chunk_slices(
1084| 0| &conn,
1085| 0| memory_id,
1086| 0| &body_text,
1087| 0| &chunks_info,
1088| 0| ) {
1089| 0| tracing::warn!(
1090| | target: "ingest",
1091| | file = %file_path,
1092| | error = %e,
1093| 0| "chunk slice insert failed"
1094| | );
1095| | } else {
1096| 0| for (i, emb) in chunk_embeddings.iter().enumerate() {
1097| 0| if let Err(e) = crate::storage::chunks::upsert_chunk_vec(
1098| 0| &conn, i as i64, memory_id, i as i32, emb,
1099| 0| ) {
1100| 0| tracing::warn!(
1101| | target: "ingest",
1102| | file = %file_path,
1103| | chunk = i,
1104| | error = %e,
1105| 0| "chunk vec upsert failed"
1106| | );
1107| 0| }
1108| | }
1109| | }
1110| 0| Ok(aggregated)
1111| | } else {
1112| | // fallback: embed whole body for the memory-level vector
1113| 0| crate::daemon::embed_passage_or_local(&paths.models, &body_text)
1114| | }
1115| | };
1116| |
1117| 0| match embedding_result {
1118| 0| Ok(embedding) => {
1119| 0| if let Err(e) = memories::upsert_vec(
1120| 0| &conn,
1121| 0| memory_id,
1122| 0| &namespace,
1123| 0| &memory_type_str,
1124| 0| &embedding,
1125| 0| name,
1126| 0| &snippet,
1127| 0| ) {
1128| 0| tracing::warn!(
1129| | target: "ingest",
1130| | file = %file_path,
1131| | error = %e,
1132| 0| "memory vec upsert failed; recall may not find this memory"
1133| | );
1134| 0| }
1135| | // embed each entity that was successfully upserted
1136| 0| for ent in &new_entities {
1137| 0| if let Ok(Some(eid)) =
1138| 0| entities::find_entity_id(&conn, &namespace, &ent.name)
1139| | {
1140| 0| let entity_text = ent.name.clone();
1141| 0| match crate::daemon::embed_passage_or_local(&paths.models, &entity_text)
1142| | {
1143| 0| Ok(emb) => {
1144| 0| if let Err(e) = entities::upsert_entity_vec(
1145| 0| &conn,
1146| 0| eid,
1147| 0| &namespace,
1148| 0| ent.entity_type,
1149| 0| &emb,
1150| 0| &ent.name,
1151| 0| ) {
1152| 0| tracing::warn!(
1153| | target: "ingest",
1154| | entity = %ent.name,
1155| | error = %e,
1156| 0| "entity vec upsert failed"
1157| | );
1158| 0| }
1159| | }
1160| 0| Err(e) => {
1161| 0| tracing::warn!(
1162| | target: "ingest",
1163| | entity = %ent.name,
1164| | error = %e,
1165| 0| "entity embedding failed"
1166| | );
1167| | }
1168| | }
1169| 0| }
1170| | }
1171| | }
1172| 0| Err(e) => {
1173| 0| tracing::warn!(
1174| | target: "ingest",
1175| | file = %file_path,
1176| | error = %e,
1177| 0| "memory embedding failed; recall will not find this memory"
1178| | );
1179| | }
1180| | }
1181| |
1182| 0| let _ = queue_conn.execute(
1183| 0| "UPDATE queue SET status='done', name=?1, memory_id=?2, entities=?3, rels=?4, cost_usd=?5, elapsed_ms=?6, done_at=datetime('now') WHERE id=?7",
1184| 0| rusqlite::params![
1185| 0| name,
1186| 0| memory_id,
1187| 0| ent_count,
1188| 0| rel_count,
1189| 0| cost,
1190| 0| file_started.elapsed().as_millis() as i64,
1191| 0| queue_id
1192| 0| ],
1193| 0| );
1194| |
1195| 0| let current_index = completed + failed + skipped;
1196| 0| completed += 1;
1197| 0| entities_total += ent_count;
1198| 0| rels_total += rel_count;
1199| 0| if !is_oauth {
1200| 0| cost_total += cost;
1201| 0| }
1202| |
1203| 0| emit_json(&FileEvent {
1204| 0| file: &file_path,
1205| 0| name,
1206| 0| status: "done",
1207| 0| memory_id: Some(memory_id),
1208| 0| entities: Some(ent_count),
1209| 0| rels: Some(rel_count),
1210| 0| cost_usd: if is_oauth { None } else { Some(cost) },
1211| 0| elapsed_ms: Some(file_started.elapsed().as_millis() as u64),
1212| 0| error: None,
1213| 0| index: current_index,
1214| 0| total,
1215| | });
1216| 0| } else if let Some(ref err_str) = last_extract_err {
1217| 0| if last_was_rate_limited {
1218| 0| if crate::retry::is_kill_switch_active() {
1219| 0| tracing::warn!(target: "ingest", "SQLITE_GRAPHRAG_DISABLE_RETRY=1, skipping rate-limit retry");
1220| 0| } else if std::time::Instant::now() >= rate_limit_deadline {
1221| 0| tracing::error!(target: "ingest", "rate-limit retry deadline (1h) exhausted");
1222| | } else {
1223| 0| let half = backoff_secs / 2;
1224| 0| let jitter = if half == 0 { 0 } else { fastrand::u64(0..half) };
1225| 0| let actual_wait = half + jitter;
1226| 0| tracing::warn!(target: "ingest", delay_secs = actual_wait, error_kind = "rate_limited", "rate limited, backing off");
1227| 0| let _ = queue_conn.execute(
1228| 0| "UPDATE queue SET status='pending' WHERE id=?1",
1229| 0| rusqlite::params![queue_id],
1230| 0| );
1231| 0| std::thread::sleep(std::time::Duration::from_secs(actual_wait));
1232| 0| backoff_secs = (backoff_secs * 2).min(900);
1233| 0| continue;
1234| | }
1235| | } else {
1236| 0| let _ = queue_conn.execute(
1237| 0| "UPDATE queue SET status='failed', error=?1, done_at=datetime('now') WHERE id=?2",
1238| 0| rusqlite::params![err_str, queue_id],
1239| 0| );
1240| 0| let current_index = completed + failed + skipped;
1241| 0| failed += 1;
1242| 0| emit_json(&FileEvent {
1243| 0| file: &file_path,
1244| 0| name: "",
1245| 0| status: "failed",
1246| 0| memory_id: None,
1247| 0| entities: None,
1248| 0| rels: None,
1249| 0| cost_usd: None,
1250| 0| elapsed_ms: Some(file_started.elapsed().as_millis() as u64),
1251| 0| error: Some(err_str),
1252| 0| index: current_index,
1253| 0| total,
1254| 0| });
1255| 0| if args.fail_fast {
1256| 0| break;
1257| 0| }
1258| | }
1259| 0| }
1260| |
1261| 0| if let Some(budget) = args.max_cost_usd {
1262| 0| if oauth_detected {
1263| 0| tracing::debug!(target: "ingest", "--max-cost-usd ignored: OAuth subscription detected");
1264| 0| } else if cost_total >= budget {
1265| 0| tracing::warn!(
1266| | target: "ingest",
1267| | spent = cost_total,
1268| | budget = budget,
1269| 0| "budget exceeded, stopping"
1270| | );
1271| 0| break;
1272| 0| }
1273| 0| }
1274| | }
1275| |
1276| | // Stage 4: Summary
1277| 0| let _ = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);");
1278| 0| let _ = queue_conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);");
1279| |
1280| 0| emit_json(&Summary {
1281| 0| summary: true,
1282| 0| files_total: total,
1283| 0| completed,
1284| 0| failed,
1285| 0| skipped,
1286| 0| entities_total,
1287| 0| rels_total,
1288| 0| cost_usd: cost_total,
1289| 0| elapsed_ms: started.elapsed().as_millis() as u64,
1290| 0| });
1291| |
1292| 0| if !args.keep_queue && failed == 0 {
1293| 0| let _ = std::fs::remove_file(&args.queue_db);
1294| 0| }
1295| |
1296| 0| Ok(())
1297| 0|}
1298| |
#[cfg(test)]
mod tests {
    //! Unit tests for Claude output parsing, schema validity and binary lookup.

    use super::*;

    /// The embedded extraction schema must itself be well-formed JSON.
    #[test]
    fn test_extraction_schema_valid_json() {
        let _: serde_json::Value =
            serde_json::from_str(EXTRACTION_SCHEMA).expect("schema must be valid JSON");
    }

    /// A successful `result` event with `structured_output` yields the
    /// extraction payload and the reported cost.
    #[test]
    fn test_parse_claude_output_valid() {
        let output = r#"[
            {"type":"system","subtype":"init"},
            {"type":"assistant"},
            {"type":"result","is_error":false,"total_cost_usd":0.02,"structured_output":{"name":"test-doc","description":"A test document","entities":[{"name":"test-entity","entity_type":"concept"}],"relationships":[{"source":"test-entity","target":"test-doc","relation":"applies-to","strength":0.8}]}}
        ]"#;
        let (result, cost, _is_oauth) = parse_claude_output(output).expect("parse must succeed");
        assert_eq!(result.name, "test-doc");
        assert_eq!(result.entities.len(), 1);
        assert_eq!(result.relationships.len(), 1);
        assert!((cost - 0.02).abs() < f64::EPSILON);
    }

    /// An `is_error` result surfaces the `error` text in the returned error.
    #[test]
    fn test_parse_claude_output_error() {
        let output = r#"[
            {"type":"system","subtype":"init"},
            {"type":"result","is_error":true,"error":"authentication failed"}
        ]"#;
        let err = parse_claude_output(output).unwrap_err();
        assert!(format!("{err}").contains("authentication failed"));
    }

    /// A rate-limit error message is classified as `AppError::RateLimited`.
    #[test]
    fn test_parse_claude_output_rate_limit() {
        let output = r#"[
            {"type":"system","subtype":"init"},
            {"type":"result","is_error":true,"error":"rate_limit exceeded"}
        ]"#;
        let err = parse_claude_output(output).unwrap_err();
        assert!(matches!(err, AppError::RateLimited { .. }));
    }

    /// Non-JSON output is rejected rather than panicking.
    #[test]
    fn test_parse_claude_output_malformed() {
        let output = "not json at all";
        assert!(parse_claude_output(output).is_err());
    }

    /// With an unusable `PATH` and no override variable, lookup must fail.
    ///
    /// Both environment variables touched by the test are restored to their
    /// exact prior state (including "unset") so the mutation does not leak
    /// into other tests running in the same process.
    #[test]
    fn test_find_claude_binary_not_found() {
        const BINARY_VAR: &str = "SQLITE_GRAPHRAG_CLAUDE_BINARY";

        /// Puts `key` back to `saved`, removing it when it was originally unset.
        fn restore(key: &str, saved: Option<std::ffi::OsString>) {
            match saved {
                Some(value) => std::env::set_var(key, value),
                None => std::env::remove_var(key),
            }
        }

        let original_path = std::env::var_os("PATH");
        let original_binary = std::env::var_os(BINARY_VAR);

        std::env::set_var("PATH", "/nonexistent");
        std::env::remove_var(BINARY_VAR);
        let result = find_claude_binary(None);

        // Restore before asserting so a failing assertion cannot skip cleanup.
        restore("PATH", original_path);
        restore(BINARY_VAR, original_binary);

        assert!(result.is_err());
    }

    /// When `structured_output` is null, the JSON string in `result` is used.
    #[test]
    fn test_parse_claude_output_result_fallback() {
        let output = r#"[
            {"type":"system","subtype":"init"},
            {"type":"result","is_error":false,"total_cost_usd":0.01,"structured_output":null,"result":"{\"name\":\"test-fallback\",\"description\":\"A fallback test\",\"entities\":[{\"name\":\"fb-entity\",\"entity_type\":\"concept\"}],\"relationships\":[]}"}
        ]"#;
        let (result, cost, _is_oauth) =
            parse_claude_output(output).expect("result fallback must work");
        assert_eq!(result.name, "test-fallback");
        assert_eq!(result.entities.len(), 1);
        assert!(result.relationships.is_empty());
        assert!((cost - 0.01).abs() < f64::EPSILON);
    }

    /// An error result that only carries a `result` field still reports its text.
    #[test]
    fn test_parse_claude_output_error_with_result_field() {
        let output = r#"[
            {"type":"system","subtype":"init"},
            {"type":"result","is_error":true,"result":"Not logged in · Please run /login"}
        ]"#;
        let err = parse_claude_output(output).unwrap_err();
        let msg = format!("{err}");
        assert!(
            msg.contains("Not logged in"),
            "expected 'Not logged in' in: {msg}"
        );
    }

    /// `terminal_reason: max_turns` without `is_error` is not treated as a failure.
    #[test]
    fn test_terminal_reason_max_turns_detected() {
        let output = r#"[
            {"type":"system","subtype":"init"},
            {"type":"result","is_error":false,"terminal_reason":"max_turns","structured_output":{"name":"t","description":"d","entities":[],"relationships":[]}}
        ]"#;
        let err_or_ok = parse_claude_output(output);
        assert!(
            err_or_ok.is_ok(),
            "max_turns in result without is_error should still parse"
        );
    }

    /// `apiKeySource: "none"` in the init event flags the session as OAuth.
    #[test]
    fn test_detect_oauth_from_init_json() {
        let output = r#"[
            {"type":"system","subtype":"init","apiKeySource":"none"},
            {"type":"result","is_error":false,"total_cost_usd":0.50,"structured_output":{"name":"test-oauth","description":"oauth test","entities":[],"relationships":[]}}
        ]"#;
        let (_result, cost, is_oauth) = parse_claude_output(output).expect("parse must succeed");
        assert!(is_oauth, "apiKeySource=none must be detected as OAuth");
        assert!((cost - 0.50).abs() < f64::EPSILON);
    }

    /// `apiKeySource: "env"` means an API key is in use, not OAuth.
    #[test]
    fn test_api_key_source_not_oauth() {
        let output = r#"[
            {"type":"system","subtype":"init","apiKeySource":"env"},
            {"type":"result","is_error":false,"total_cost_usd":0.10,"structured_output":{"name":"test-api","description":"api test","entities":[],"relationships":[]}}
        ]"#;
        let (_result, _cost, is_oauth) = parse_claude_output(output).expect("parse must succeed");
        assert!(!is_oauth, "apiKeySource=env must NOT be detected as OAuth");
    }

    /// A missing `apiKeySource` defaults to "not OAuth".
    #[test]
    fn test_missing_api_key_source_defaults_not_oauth() {
        let output = r#"[
            {"type":"system","subtype":"init"},
            {"type":"result","is_error":false,"total_cost_usd":0.05,"structured_output":{"name":"test-missing","description":"missing test","entities":[],"relationships":[]}}
        ]"#;
        let (_result, _cost, is_oauth) = parse_claude_output(output).expect("parse must succeed");
        assert!(!is_oauth, "missing apiKeySource must default to not OAuth");
    }

    /// Every `entity_type` listed in the schema must parse as an `EntityType`.
    #[test]
    fn test_extraction_schema_entity_types_match_enum() {
        let schema: serde_json::Value = serde_json::from_str(EXTRACTION_SCHEMA).unwrap();
        let types = schema["properties"]["entities"]["items"]["properties"]["entity_type"]["enum"]
            .as_array()
            .expect("schema must have entity_type enum");
        for t in types {
            let s = t.as_str().unwrap();
            assert!(
                s.parse::<EntityType>().is_ok(),
                "schema entity_type '{s}' not in EntityType enum"
            );
        }
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/ingest_codex.rs:
1| |//! Handler for `ingest --mode codex`.
2| |//!
3| |//! Orchestrates the locally installed OpenAI Codex CLI binary (`codex exec`)
4| |//! to extract domain-specific entities and relationships from each file,
5| |//! then persists them with full embedding pipeline for recall/hybrid-search.
6| |//!
7| |//! Architecture: P1 One-Shot per file — each file spawns a separate
8| |//! `codex exec` process with `--output-schema` for guaranteed structured output.
9| |//! A SQLite queue DB tracks progress for resume/retry support.
10| |// Workload: Subprocess I/O-bound (codex exec headless with network wait)
11| |
12| |use crate::commands::ingest::IngestArgs;
13| |use crate::commands::ingest_claude::ExtractionResult;
14| |use crate::entity_type::EntityType;
15| |use crate::errors::AppError;
16| |use crate::paths::AppPaths;
17| |use crate::storage::connection::{ensure_db_ready, open_rw};
18| |use crate::storage::entities::{self, NewEntity, NewRelationship};
19| |use crate::storage::memories::{self, NewMemory};
20| |
21| |use rusqlite::Connection;
22| |use serde::{Deserialize, Serialize};
23| |use std::io::Write;
24| |use std::path::{Path, PathBuf};
25| |use std::process::{Command, Stdio};
26| |use std::time::Instant;
27| |
/// Lowest Codex CLI version accepted by `validate_codex_version`, written as
/// `major.minor.patch`.
const MIN_CODEX_VERSION: &str = "0.120.0";
29| |
/// OpenAI structured output schema with `additionalProperties: false` at all nested levels.
///
/// Written to a temp file by `write_schema_tempfile` and handed to the Codex
/// CLI. The `relation` enum lists the same values that `EXTRACTION_PROMPT`
/// names as the allowed relationship types.
// NOTE(review): the `entity_type` enum presumably has to stay in sync with
// `EntityType` — confirm whenever a variant is added.
const EXTRACTION_SCHEMA_CODEX: &str = r#"{
    "type": "object",
    "properties": {
        "name": { "type": "string" },
        "description": { "type": "string" },
        "entities": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": { "type": "string" },
                    "entity_type": {
                        "type": "string",
                        "enum": ["project","tool","person","file","concept","incident","decision","organization","location","date"]
                    }
                },
                "required": ["name", "entity_type"],
                "additionalProperties": false
            }
        },
        "relationships": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "source": { "type": "string" },
                    "target": { "type": "string" },
                    "relation": {
                        "type": "string",
                        "enum": ["applies-to","uses","depends-on","causes","fixes","contradicts","supports","follows","related","replaces","tracked-in"]
                    },
                    "strength": { "type": "number", "minimum": 0, "maximum": 1 }
                },
                "required": ["source","target","relation","strength"],
                "additionalProperties": false
            }
        }
    },
    "required": ["name","description","entities","relationships"],
    "additionalProperties": false
}"#;
72| |
/// Instruction text sent to Codex ahead of each document.
///
/// `extract_with_codex` prepends this prompt to the document content when it
/// builds the stdin payload. The relationship types listed here are the same
/// values as the `relation` enum in `EXTRACTION_SCHEMA_CODEX`.
const EXTRACTION_PROMPT: &str = "You are a knowledge graph entity extractor. Given a document, extract:\n\
1. A short kebab-case name (max 60 chars) capturing the document's main topic\n\
2. A one-sentence description (10-20 words) summarizing the key insight\n\
3. Domain-specific entities (concepts, tools, people, decisions, projects, files)\n\
4. Typed relationships between entities with strength scores\n\n\
Rules:\n\
- Entity names: lowercase kebab-case, 2+ chars, domain-specific only\n\
- NEVER extract generic terms, stop words, numbers, UUIDs, or single characters\n\
- Relationship types MUST be one of: applies-to, uses, depends-on, causes, fixes, contradicts, supports, follows, related, replaces, tracked-in\n\
- NEVER use 'mentions' as relationship type\n\
- Strength: 0.9 for hard dependencies, 0.7 for design relationships, 0.5 for contextual links, 0.3 for weak references\n\
- Prefer fewer high-quality entities over many low-quality ones\n\
- Description must answer: What is this about and WHY does it matter?";
86| |
/// Token usage reported by Codex CLI on `turn.completed` events.
///
/// `input_tokens` and `output_tokens` are required when deserializing; the
/// two `#[serde(default)]` counters fall back to `0` when the event omits them.
#[derive(Debug, Clone, Deserialize, Serialize)]
struct CodexUsage {
    /// Input token count for the turn.
    input_tokens: u64,
    /// Cached input token count; `0` when absent from the event.
    // NOTE(review): presumably the share of the input served from a prompt
    // cache — confirm against the Codex CLI event documentation.
    #[serde(default)]
    cached_input_tokens: u64,
    /// Output token count for the turn.
    output_tokens: u64,
    /// Reasoning output token count; `0` when absent from the event.
    #[serde(default)]
    reasoning_output_tokens: u64,
}
97| |
/// Serializable record describing a pipeline phase.
///
/// Only `phase` is always serialized; every `Option` field is left out of the
/// output entirely when it is `None`.
// NOTE(review): presumably emitted as one JSON line per phase — confirm at the
// call sites, which are outside this excerpt.
#[derive(Debug, Serialize)]
struct PhaseEvent<'a> {
    /// Name of the phase being reported.
    phase: &'a str,
    /// Path of the Codex binary, when relevant to the phase.
    #[serde(skip_serializing_if = "Option::is_none")]
    codex_path: Option<&'a str>,
    /// Version string of the Codex binary, when relevant to the phase.
    #[serde(skip_serializing_if = "Option::is_none")]
    version: Option<&'a str>,
    /// Directory associated with the phase, when relevant.
    #[serde(skip_serializing_if = "Option::is_none")]
    dir: Option<&'a str>,
    /// Total number of files, when relevant to the phase.
    #[serde(skip_serializing_if = "Option::is_none")]
    files_total: Option<usize>,
    /// Number of new files, when relevant to the phase.
    #[serde(skip_serializing_if = "Option::is_none")]
    files_new: Option<usize>,
    /// Number of already-existing files, when relevant to the phase.
    #[serde(skip_serializing_if = "Option::is_none")]
    files_existing: Option<usize>,
}
114| |
/// Serializable per-file progress record.
///
/// `file`, `name`, `status`, `index` and `total` are always serialized; every
/// `Option` field is left out of the output when it is `None`.
#[derive(Debug, Serialize)]
struct FileEvent<'a> {
    /// Path of the file this event refers to.
    file: &'a str,
    /// Name associated with the file's memory.
    name: &'a str,
    /// Outcome label for the file.
    status: &'a str,
    /// Identifier of the stored memory, when one exists.
    #[serde(skip_serializing_if = "Option::is_none")]
    memory_id: Option<i64>,
    /// Number of entities for the file, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    entities: Option<usize>,
    /// Number of relationships for the file, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    rels: Option<usize>,
    /// Always None for Codex (no cost_usd in Codex API responses).
    #[serde(skip_serializing_if = "Option::is_none")]
    cost_usd: Option<f64>,
    /// Input token count for the file, when usage was reported.
    #[serde(skip_serializing_if = "Option::is_none")]
    input_tokens: Option<u64>,
    /// Output token count for the file, when usage was reported.
    #[serde(skip_serializing_if = "Option::is_none")]
    output_tokens: Option<u64>,
    /// Elapsed time for the file in milliseconds, when measured.
    #[serde(skip_serializing_if = "Option::is_none")]
    elapsed_ms: Option<u64>,
    /// Error message, present only for failures.
    #[serde(skip_serializing_if = "Option::is_none")]
    error: Option<&'a str>,
    /// Position of this file within the run.
    index: usize,
    /// Total number of files in the run.
    total: usize,
}
140| |
/// Serializable end-of-run totals for a Codex ingest.
///
/// Unlike the per-file event, this record reports token totals rather than a
/// monetary cost.
#[derive(Debug, Serialize)]
struct Summary {
    /// Marker field identifying this record as the summary.
    summary: bool,
    /// Total number of files in the run.
    files_total: usize,
    /// Number of files completed.
    completed: usize,
    /// Number of files that failed.
    failed: usize,
    /// Number of files skipped.
    skipped: usize,
    /// Entity count accumulated over the run.
    entities_total: usize,
    /// Relationship count accumulated over the run.
    rels_total: usize,
    /// Input token count accumulated over the run.
    input_tokens_total: u64,
    /// Output token count accumulated over the run.
    output_tokens_total: u64,
    /// Elapsed time for the run in milliseconds.
    elapsed_ms: u64,
}
154| |
155| |/// Locates the Codex CLI binary on the system.
156| |///
157| |/// Search order:
158| |/// 1. Explicit `--codex-binary` CLI flag.
159| |/// 2. `SQLITE_GRAPHRAG_CODEX_BINARY` env var.
160| |/// 3. PATH search for `codex` (or `codex.exe` on Windows).
161| 0|pub fn find_codex_binary(explicit: Option<&Path>) -> Result<PathBuf, AppError> {
162| 0| if let Some(p) = explicit {
163| 0| if p.exists() {
164| 0| return Ok(p.to_path_buf());
165| 0| }
166| 0| return Err(AppError::Validation(format!(
167| 0| "Codex CLI binary not found at explicit path: {}",
168| 0| p.display()
169| 0| )));
170| 0| }
171| |
172| 0| if let Ok(env_path) = std::env::var("SQLITE_GRAPHRAG_CODEX_BINARY") {
173| 0| let p = PathBuf::from(&env_path);
174| 0| if p.exists() {
175| 0| return Ok(p);
176| 0| }
177| 0| }
178| |
179| 0| let name = if cfg!(windows) { "codex.exe" } else { "codex" };
180| 0| if let Some(path_var) = std::env::var_os("PATH") {
181| 0| for dir in std::env::split_paths(&path_var) {
182| 0| let candidate = dir.join(name);
183| 0| if candidate.exists() {
184| 0| return Ok(candidate);
185| 0| }
186| | }
187| 0| }
188| |
189| 0| Err(AppError::Validation(
190| 0| "Codex CLI binary not found in PATH. Install it from https://github.com/openai/codex or specify --codex-binary".to_string(),
191| 0| ))
192| 0|}
193| |
194| |/// Validates that the Codex CLI binary meets the minimum version requirement.
195| |///
196| |/// # Errors
197| |///
198| |/// Returns `AppError::Validation` when the binary cannot be executed or the
199| |/// version is below `MIN_CODEX_VERSION`.
200| 0|fn validate_codex_version(binary: &Path) -> Result<String, AppError> {
201| 0| let resolved = which::which(binary).map_err(|_| {
202| 0| AppError::Validation(format!(
203| 0| "executable '{}' not found in PATH; ensure Codex CLI is installed",
204| 0| binary.display()
205| 0| ))
206| 0| })?;
207| 0| let output = Command::new(&resolved)
208| 0| .arg("--version")
209| 0| .stdin(Stdio::null())
210| 0| .stdout(Stdio::piped())
211| 0| .stderr(Stdio::piped())
212| 0| .output()
213| 0| .map_err(AppError::Io)?;
214| |
215| 0| let raw = String::from_utf8(output.stdout)
216| 0| .map_err(|_| AppError::Validation("codex --version output is not UTF-8".to_string()))?;
217| |
218| 0| let version_str = raw.trim().to_string();
219| |
220| | // Codex CLI outputs: "codex-cli 0.133.0" or just "0.133.0"
221| 0| let numeric = version_str.split_whitespace().last().unwrap_or("").trim();
222| |
223| 0| fn parse_semver(s: &str) -> Option<(u64, u64, u64)> {
224| 0| let parts: Vec<&str> = s.splitn(3, '.').collect();
225| 0| if parts.len() < 2 {
226| 0| return None;
227| 0| }
228| 0| let major = parts[0].parse::<u64>().ok()?;
229| 0| let minor = parts[1].parse::<u64>().ok()?;
230| 0| let patch = parts
231| 0| .get(2)
232| 0| .and_then(|p| p.parse::<u64>().ok())
233| 0| .unwrap_or(0);
234| 0| Some((major, minor, patch))
235| 0| }
236| |
237| 0| if let (Some(actual), Some(min)) = (parse_semver(numeric), parse_semver(MIN_CODEX_VERSION)) {
238| 0| if actual < min {
239| 0| return Err(AppError::Validation(format!(
240| 0| "Codex CLI version {numeric} is below minimum required {MIN_CODEX_VERSION}"
241| 0| )));
242| 0| }
243| 0| }
244| |
245| 0| Ok(version_str)
246| 0|}
247| |
248| |/// Writes the extraction schema to a named temp file for `--output-schema`.
249| |///
250| |/// # Errors
251| |///
252| |/// Returns `AppError::Io` when the temp file cannot be created or written.
253| 0|fn write_schema_tempfile() -> Result<tempfile::NamedTempFile, AppError> {
254| 0| let mut f = tempfile::NamedTempFile::new().map_err(AppError::Io)?;
255| 0| std::io::Write::write_all(&mut f, EXTRACTION_SCHEMA_CODEX.as_bytes()).map_err(AppError::Io)?;
256| 0| std::io::Write::flush(&mut f).map_err(AppError::Io)?;
257| 0| Ok(f)
258| 0|}
259| |
260| |/// Invokes `codex exec` for a single file and returns the extraction result.
261| |///
262| |/// Uses `wait-timeout` for cross-platform subprocess timeout, `env_clear()`
263| |/// for least-privilege environment, and reads prompt + file content from
264| |/// stdin using the `-` argument (Codex Paperclip pattern).
265| |///
266| |/// # Errors
267| |///
268| |/// Returns `AppError::Validation` on extraction failure, rate limiting, or
269| |/// schema errors. Returns `AppError::Io` on process spawn/IO failures.
270| 0|fn extract_with_codex(
271| 0| binary: &Path,
272| 0| file_content: &[u8],
273| 0| model: Option<&str>,
274| 0| timeout_secs: u64,
275| 0| schema_file: &Path,
276| 0|) -> Result<(ExtractionResult, Option<CodexUsage>), AppError> {
277| | use wait_timeout::ChildExt;
278| |
279| | // G31 Passo C (v1.0.69): delegate command construction to the shared
280| | // `codex_spawn::build_codex_command` helper so `enrich` and `ingest` stay
281| | // perfectly aligned on the canonical seven hardening flags. The local
282| | // function still owns the stdin pump + JSONL parsing (see below).
283| 0| let _ = timeout_secs; // currently unused; consumed by the helper when it spawns the process
284| 0| let _ = file_content; // pumped into stdin below, see `stdin_pump` thread
285| 0| let _ = schema_file; // helper reuses the temp file at the given path
286| 0| let prompt = String::new(); // empty prompt — helper appends file_content via args.input_text
287| 0| let mut cmd = crate::commands::codex_spawn::build_codex_command(
288| 0| &crate::commands::codex_spawn::CodexSpawnArgs {
289| 0| binary,
290| 0| prompt: &prompt,
291| 0| json_schema: "", // caller writes the schema directly via `schema_file`
292| 0| input_text: "",
293| 0| model,
294| 0| timeout_secs,
295| 0| schema_path: schema_file.to_path_buf(),
296| 0| },
297| | );
298| |
299| | // `build_codex_command` writes the JSON schema to `schema_path` and
300| | // appends `input_text` to the prompt via Paperclip stdin. For `ingest`
301| | // we want the schema content already on disk (the caller pre-wrote
302| | // EXTRACTION_SCHEMA_CODEX into the named tempfile), and the document
303| | // content goes through stdin via a dedicated thread (see below). Strip
304| | // the file the helper just rewrote — our caller pre-wrote it.
305| 0| let _ = std::fs::write(
306| 0| schema_file,
307| 0| crate::commands::ingest_codex::EXTRACTION_SCHEMA_CODEX,
308| 0| );
309| |
310| 0| cmd.stdin(Stdio::piped())
311| 0| .stdout(Stdio::piped())
312| 0| .stderr(Stdio::piped());
313| |
314| 0| let mut child = super::claude_runner::spawn_with_memory_limit(&mut cmd).map_err(|e| {
315| 0| AppError::Io(std::io::Error::new(
316| 0| e.kind(),
317| 0| format!("failed to spawn codex: {e}"),
318| 0| ))
319| 0| })?;
320| |
321| | // Build stdin: prompt + document content
322| 0| let file_utf8 = String::from_utf8_lossy(file_content);
323| 0| let stdin_payload = format!("{EXTRACTION_PROMPT}\n\n---\n\nDocument content:\n\n{file_utf8}");
324| 0| let stdin_bytes = stdin_payload.into_bytes();
325| |
326| 0| let mut child_stdin = child
327| 0| .stdin
328| 0| .take()
329| 0| .ok_or_else(|| AppError::Validation("failed to open codex stdin".into()))?;
330| 0| let stdin_thread = std::thread::spawn(move || -> Result<(), std::io::Error> {
331| 0| child_stdin.write_all(&stdin_bytes)?;
332| 0| drop(child_stdin);
333| 0| Ok(())
334| 0| });
335| |
336| 0| let start = std::time::Instant::now();
337| 0| let timeout = std::time::Duration::from_secs(timeout_secs);
338| 0| let status = child.wait_timeout(timeout).map_err(AppError::Io)?;
339| |
340| 0| match status {
341| 0| Some(exit_status) => {
342| 0| stdin_thread
343| 0| .join()
344| 0| .map_err(|_| AppError::Validation("stdin thread panicked".into()))?
345| 0| .map_err(AppError::Io)?;
346| |
347| 0| tracing::debug!(
348| | target: "process",
349| 0| exit_code = ?exit_status.code(),
350| 0| elapsed_ms = start.elapsed().as_millis() as u64,
351| 0| "external process completed"
352| | );
353| |
354| 0| let mut stdout_buf = Vec::new();
355| 0| let mut stderr_buf = Vec::new();
356| 0| if let Some(mut out) = child.stdout.take() {
357| 0| std::io::Read::read_to_end(&mut out, &mut stdout_buf).map_err(AppError::Io)?;
358| 0| }
359| 0| if let Some(mut err) = child.stderr.take() {
360| 0| std::io::Read::read_to_end(&mut err, &mut stderr_buf).map_err(AppError::Io)?;
361| 0| }
362| |
363| 0| if !exit_status.success() {
364| 0| let stderr_str = String::from_utf8_lossy(&stderr_buf);
365| 0| let stdout_str = String::from_utf8_lossy(&stdout_buf);
366| | // Check if stdout has JSONL with an error event before falling back
367| 0| if let Ok((result, usage)) = parse_codex_output(&stdout_str) {
368| 0| return Ok((result, usage));
369| 0| }
370| 0| if stderr_str.contains("401")
371| 0| || stderr_str.contains("Unauthorized")
372| 0| || stderr_str.contains("auth")
373| | {
374| 0| tracing::warn!(
375| | target: "ingest",
376| 0| "Codex CLI authentication expired. Re-authenticate with: codex auth login"
377| | );
378| 0| }
379| 0| return Err(AppError::Validation(format!(
380| 0| "codex exec exited with code {:?}: {}",
381| 0| exit_status.code(),
382| 0| stderr_str.trim()
383| 0| )));
384| 0| }
385| |
386| 0| let stdout = String::from_utf8(stdout_buf)
387| 0| .map_err(|_| AppError::Validation("codex exec stdout is not valid UTF-8".into()))?;
388| 0| parse_codex_output(&stdout)
389| | }
390| | None => {
391| 0| tracing::warn!(target: "ingest", timeout_secs, "codex exec timed out, killing process");
392| 0| let _ = child.kill();
393| 0| let _ = child.wait();
394| 0| let _ = stdin_thread.join();
395| 0| Err(AppError::Validation(format!(
396| 0| "codex exec timed out after {timeout_secs} seconds"
397| 0| )))
398| | }
399| | }
400| 0|}
401| |
402| |/// Parses JSONL output from `codex exec --json`.
403| |///
404| |/// Event format (DOTS notation):
405| |/// - `thread.started` — session init
406| |/// - `turn.started` — model turn begins
407| |/// - `item.completed` — message or tool call; last `agent_message` wins
408| |/// - `turn.completed` — includes usage stats
409| |/// - `turn.failed` — error with optional rate-limit indicator
410| |/// - `error` — schema or validation error
411| |///
412| |/// # Errors
413| |///
414| |/// Returns `AppError::Validation` when no agent_message is found, when the
415| |/// turn failed, or when the extracted JSON cannot be parsed as `ExtractionResult`.
416| 6|fn parse_codex_output(stdout: &str) -> Result<(ExtractionResult, Option<CodexUsage>), AppError> {
417| 6| let mut last_agent_text: Option<String> = None;
418| 6| let mut usage: Option<CodexUsage> = None;
419| 6| let mut rate_limited = false;
420| 6| let mut schema_error = false;
421| 6| let mut turn_failed = false;
422| 6| let mut failed_message = String::new();
423| |
424| 15| for line in stdout.lines() {
^6 ^6
425| 15| let line = line.trim();
426| 15| if line.is_empty() {
427| 0| continue;
428| 15| }
429| |
430| 15| let event: serde_json::Value = match serde_json::from_str(line) {
^13 ^13
431| 13| Ok(v) => v,
432| | Err(_) => {
433| 2| tracing::warn!(target: "ingest", line, "codex output: skipping malformed JSONL line");
^0
434| 2| continue;
435| | }
436| | };
437| |
438| 13| let event_type = match event.get("type").and_then(|t| t.as_str()) {
439| 13| Some(t) => t,
440| 0| None => continue,
441| | };
442| |
443| 13| match event_type {
444| 13| "item.completed" => {
445| | // Last agent_message wins (reasoning / tool calls may appear before)
446| 4| if let Some(item) = event.get("item") {
447| 4| if item.get("type").and_then(|t| t.as_str()) == Some("agent_message") {
448| 4| if let Some(text) = item.get("text").and_then(|t| t.as_str()) {
449| 4| last_agent_text = Some(text.to_string());
450| 4| }
^0
451| 0| }
452| 0| }
453| | }
454| 9| "turn.completed" => {
455| 3| if let Some(u) = event.get("usage") {
456| 3| if let Ok(parsed) = serde_json::from_value::<CodexUsage>(u.clone()) {
457| 3| usage = Some(parsed);
458| 3| }
^0
459| 0| }
460| | }
461| 6| "turn.failed" => {
462| 2| turn_failed = true;
463| 2| if let Some(err) = event.get("error") {
464| 2| let msg = err
465| 2| .get("message")
466| 2| .and_then(|m| m.as_str())
467| 2| .unwrap_or("unknown error");
468| 2| failed_message = msg.to_string();
469| 2| if msg.contains("rate_limit")
470| 1| || msg.contains("429")
471| 1| || msg.contains("Too Many Requests")
472| 1| {
473| 1| rate_limited = true;
474| 1| }
475| 0| }
476| | }
477| 4| "error" => {
478| 1| if let Some(msg) = event.get("message").and_then(|m| m.as_str()) {
479| 1| if msg.contains("invalid_json_schema") || msg.contains("schema") {
^0 ^0
480| 1| schema_error = true;
481| 1| }
^0
482| 1| tracing::warn!(target: "ingest", error_msg = msg, "codex error event received");
^0
483| 0| }
484| | }
485| 3| _ => {
486| 3| // Gracefully skip unknown event types (thread.started, turn.started, etc.)
487| 3| }
488| | }
489| | }
490| |
491| 6| if rate_limited {
492| 1| return Err(AppError::RateLimited {
493| 1| detail: failed_message,
494| 1| });
495| 5| }
496| |
497| 5| if schema_error {
498| 1| return Err(AppError::Validation(
499| 1| "codex rejected the output schema (invalid_json_schema)".to_string(),
500| 1| ));
501| 4| }
502| |
503| 4| if turn_failed {
504| 1| return Err(AppError::Validation(format!(
505| 1| "codex turn failed: {failed_message}"
506| 1| )));
507| 3| }
508| |
509| 3| let text = last_agent_text.ok_or_else(|| {
^0
510| 0| AppError::Validation("codex output contained no agent_message item".to_string())
511| 0| })?;
512| |
513| 3| let extraction: ExtractionResult = serde_json::from_str(&text).map_err(|e| {
^0
514| 0| AppError::Validation(format!(
515| 0| "failed to parse codex agent_message as ExtractionResult: {e}. text={text}"
516| 0| ))
517| 0| })?;
518| |
519| 3| Ok((extraction, usage))
520| 6|}
521| |
522| |use crate::output::emit_json_line as emit_json;
523| |
524| |/// Collects files matching the pattern (reuses ingest logic).
525| 0|fn collect_matching_files(
526| 0| dir: &Path,
527| 0| pattern: &str,
528| 0| recursive: bool,
529| 0| max_files: usize,
530| 0|) -> Result<Vec<PathBuf>, AppError> {
531| 0| let mut files = Vec::new();
532| 0| super::ingest::collect_files(dir, pattern, recursive, &mut files)?;
533| 0| files.sort_unstable();
534| |
535| 0| if files.len() > max_files {
536| 0| return Err(AppError::Validation(format!(
537| 0| "found {} files, exceeds --max-files cap of {}",
538| 0| files.len(),
539| 0| max_files
540| 0| )));
541| 0| }
542| |
543| 0| Ok(files)
544| 0|}
545| |
546| |/// Opens or creates the queue database for tracking ingest progress.
547| 0|fn open_queue_db(path: &str) -> Result<Connection, AppError> {
548| 0| let conn = Connection::open(path)?;
549| |
550| 0| conn.execute_batch(
551| 0| "PRAGMA journal_mode=WAL;
552| 0| CREATE TABLE IF NOT EXISTS queue (
553| 0| id INTEGER PRIMARY KEY AUTOINCREMENT,
554| 0| file_path TEXT NOT NULL UNIQUE,
555| 0| name TEXT,
556| 0| status TEXT NOT NULL DEFAULT 'pending',
557| 0| memory_id INTEGER,
558| 0| entities INTEGER DEFAULT 0,
559| 0| rels INTEGER DEFAULT 0,
560| 0| error TEXT,
561| 0| input_tokens INTEGER DEFAULT 0,
562| 0| output_tokens INTEGER DEFAULT 0,
563| 0| attempt INTEGER DEFAULT 0,
564| 0| elapsed_ms INTEGER,
565| 0| created_at TEXT DEFAULT (datetime('now')),
566| 0| done_at TEXT
567| 0| );
568| 0| CREATE INDEX IF NOT EXISTS idx_queue_status ON queue(status);",
569| 0| )?;
570| |
571| 0| Ok(conn)
572| 0|}
573| |
574| |/// Main entry point for `ingest --mode codex`.
575| |///
576| |/// # Errors
577| |///
578| |/// Returns `AppError` on directory/DB access failures or fatal extraction errors.
579| 0|pub fn run_codex_ingest(args: &IngestArgs) -> Result<(), AppError> {
580| 0| let started = Instant::now();
581| |
582| 0| if !args.dir.exists() {
583| 0| return Err(AppError::Validation(format!(
584| 0| "directory not found: {}",
585| 0| args.dir.display()
586| 0| )));
587| 0| }
588| |
589| | // G28-B (v1.0.68) + G30 (v1.0.69): acquire singleton before doing real
590| | // work so two parallel `ingest --mode codex` invocations cannot co-exist
591| | // on the same database. Scope includes the database hash so concurrent
592| | // ingest against different databases is allowed.
593| 0| let early_ns = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
594| 0| let early_paths = AppPaths::resolve(args.db.as_deref())?;
595| 0| let _singleton = crate::lock::acquire_job_singleton(
596| 0| crate::lock::JobType::IngestCodex,
597| 0| &early_ns,
598| 0| &early_paths.db,
599| 0| args.wait_job_singleton,
600| 0| args.force_job_singleton,
601| 0| )?;
602| |
603| | // Stage 1: Validate binary
604| 0| let codex_binary = find_codex_binary(args.codex_binary.as_deref())?;
605| 0| let version = validate_codex_version(&codex_binary)?;
606| 0| tracing::info!(
607| | target: "ingest",
608| 0| binary = %codex_binary.display(),
609| | version = %version,
610| 0| "Codex CLI binary validated"
611| | );
612| |
613| 0| emit_json(&PhaseEvent {
614| 0| phase: "validate",
615| 0| codex_path: codex_binary.to_str(),
616| 0| version: Some(&version),
617| 0| dir: None,
618| 0| files_total: None,
619| 0| files_new: None,
620| 0| files_existing: None,
621| 0| });
622| |
623| | // Stage 2: Scan files
624| 0| let files = collect_matching_files(&args.dir, &args.pattern, args.recursive, args.max_files)?;
625| |
626| 0| let queue_conn = open_queue_db(&args.queue_db)?;
627| |
628| 0| if args.resume {
629| 0| let reset = queue_conn
630| 0| .execute(
631| 0| "UPDATE queue SET status='pending' WHERE status='processing'",
632| 0| [],
633| | )
634| 0| .map_err(|e| AppError::Validation(format!("queue resume failed: {e}")))?;
635| 0| if reset > 0 {
636| 0| tracing::info!(target: "ingest", count = reset, "reset stuck processing files to pending");
637| 0| }
638| 0| }
639| |
640| 0| if args.retry_failed {
641| 0| let count = queue_conn
642| 0| .execute(
643| 0| "UPDATE queue SET status='pending', attempt=0 WHERE status='failed'",
644| 0| [],
645| | )
646| 0| .map_err(|e| AppError::Validation(format!("queue retry-failed reset failed: {e}")))?;
647| 0| tracing::info!(target: "ingest", count, "retrying failed files");
648| 0| }
649| |
650| 0| if !args.resume && !args.retry_failed {
651| 0| queue_conn
652| 0| .execute("DELETE FROM queue", [])
653| 0| .map_err(|e| AppError::Validation(format!("queue clear failed: {e}")))?;
654| 0| }
655| |
656| 0| let mut new_count = 0usize;
657| 0| let mut existing_count = 0usize;
658| |
659| 0| if !args.retry_failed {
660| 0| for file in &files {
661| 0| let file_str = file.to_string_lossy().into_owned();
662| 0| let inserted = queue_conn
663| 0| .execute(
664| 0| "INSERT OR IGNORE INTO queue (file_path, status) VALUES (?1, 'pending')",
665| 0| rusqlite::params![file_str],
666| | )
667| 0| .map_err(|e| AppError::Validation(format!("queue insert failed: {e}")))?;
668| 0| if inserted > 0 {
669| 0| new_count += 1;
670| 0| } else {
671| 0| existing_count += 1;
672| 0| }
673| | }
674| 0| }
675| |
676| 0| emit_json(&PhaseEvent {
677| 0| phase: "scan",
678| 0| codex_path: None,
679| 0| version: None,
680| 0| dir: args.dir.to_str(),
681| 0| files_total: Some(files.len()),
682| 0| files_new: Some(new_count),
683| 0| files_existing: Some(existing_count),
684| 0| });
685| |
686| 0| if args.dry_run {
687| 0| for (idx, file) in files.iter().enumerate() {
688| 0| let (name, _truncated, _orig) =
689| 0| super::ingest::derive_kebab_name(file, args.max_name_length);
690| 0| emit_json(&FileEvent {
691| 0| file: &file.to_string_lossy(),
692| 0| name: &name,
693| 0| status: "preview",
694| 0| memory_id: None,
695| 0| entities: None,
696| 0| rels: None,
697| 0| cost_usd: None,
698| 0| input_tokens: None,
699| 0| output_tokens: None,
700| 0| elapsed_ms: None,
701| 0| error: None,
702| 0| index: idx,
703| 0| total: files.len(),
704| 0| });
705| 0| }
706| 0| emit_json(&Summary {
707| 0| summary: true,
708| 0| files_total: files.len(),
709| 0| completed: 0,
710| 0| failed: 0,
711| 0| skipped: 0,
712| 0| entities_total: 0,
713| 0| rels_total: 0,
714| 0| input_tokens_total: 0,
715| 0| output_tokens_total: 0,
716| 0| elapsed_ms: started.elapsed().as_millis() as u64,
717| 0| });
718| 0| if !args.keep_queue {
719| 0| let _ = std::fs::remove_file(&args.queue_db);
720| 0| }
721| 0| return Ok(());
722| 0| }
723| |
724| | // Stage 3: Process files
725| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
726| 0| ensure_db_ready(&paths)?;
727| 0| let conn = open_rw(&paths.db)?;
728| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
729| 0| let memory_type_str = args.r#type.as_str().to_string();
730| |
731| | // Write schema to temp file once (reused across all files)
732| 0| let schema_tempfile = write_schema_tempfile()?;
733| 0| let schema_path = schema_tempfile.path().to_path_buf();
734| |
735| 0| let mut completed = 0usize;
736| 0| let mut failed = 0usize;
737| 0| let skipped_initial: usize = queue_conn
738| 0| .query_row("SELECT COUNT(*) FROM queue WHERE status='done'", [], |r| {
739| 0| r.get::<_, usize>(0)
740| 0| })
741| 0| .unwrap_or(0);
742| 0| let mut skipped = skipped_initial;
743| 0| let mut entities_total = 0usize;
744| 0| let mut rels_total = 0usize;
745| 0| let mut input_tokens_total = 0u64;
746| 0| let mut output_tokens_total = 0u64;
747| 0| let total = files.len();
748| |
749| 0| let mut backoff_secs = args.rate_limit_wait;
750| 0| let rate_limit_deadline = std::time::Instant::now() + std::time::Duration::from_secs(3600);
751| |
752| | loop {
753| 0| if crate::shutdown_requested() {
754| 0| tracing::info!(target: "ingest", "shutdown requested, stopping before next file");
755| 0| break;
756| 0| }
757| |
758| 0| let pending: Option<(i64, String)> = queue_conn
759| 0| .query_row(
760| 0| "UPDATE queue SET status='processing', attempt=attempt+1 \
761| 0| WHERE id = (SELECT id FROM queue WHERE status='pending' ORDER BY id LIMIT 1) \
762| 0| RETURNING id, file_path",
763| 0| [],
764| 0| |row| Ok((row.get(0)?, row.get(1)?)),
765| | )
766| 0| .ok();
767| |
768| 0| let (queue_id, file_path) = match pending {
769| 0| Some(p) => p,
770| 0| None => break,
771| | };
772| |
773| 0| let file_started = Instant::now();
774| |
775| | // Reject files that exceed the 10 MB stdin limit
776| | const MAX_FILE_SIZE: u64 = 10 * 1024 * 1024;
777| 0| if let Ok(meta) = std::fs::metadata(&file_path) {
778| 0| if meta.len() > MAX_FILE_SIZE {
779| 0| let err_msg = format!("file exceeds 10MB stdin limit ({} bytes)", meta.len());
780| 0| let _ = queue_conn.execute(
781| 0| "UPDATE queue SET status='failed', error=?1, done_at=datetime('now') WHERE id=?2",
782| 0| rusqlite::params![err_msg, queue_id],
783| 0| );
784| 0| let current_index = completed + failed + skipped;
785| 0| failed += 1;
786| 0| emit_json(&FileEvent {
787| 0| file: &file_path,
788| 0| name: "",
789| 0| status: "failed",
790| 0| memory_id: None,
791| 0| entities: None,
792| 0| rels: None,
793| 0| cost_usd: None,
794| 0| input_tokens: None,
795| 0| output_tokens: None,
796| 0| elapsed_ms: Some(file_started.elapsed().as_millis() as u64),
797| 0| error: Some(&err_msg),
798| 0| index: current_index,
799| 0| total,
800| 0| });
801| 0| if args.fail_fast {
802| 0| break;
803| 0| }
804| 0| continue;
805| 0| }
806| 0| }
807| |
808| 0| let file_content = match std::fs::read(&file_path) {
809| 0| Ok(c) => c,
810| 0| Err(e) => {
811| 0| let err_msg = format!("IO error: {e}");
812| 0| let _ = queue_conn.execute(
813| 0| "UPDATE queue SET status='failed', error=?1, done_at=datetime('now') WHERE id=?2",
814| 0| rusqlite::params![err_msg, queue_id],
815| 0| );
816| 0| let current_index = completed + failed + skipped;
817| 0| failed += 1;
818| 0| emit_json(&FileEvent {
819| 0| file: &file_path,
820| 0| name: "",
821| 0| status: "failed",
822| 0| memory_id: None,
823| 0| entities: None,
824| 0| rels: None,
825| 0| cost_usd: None,
826| 0| input_tokens: None,
827| 0| output_tokens: None,
828| 0| elapsed_ms: Some(file_started.elapsed().as_millis() as u64),
829| 0| error: Some(&err_msg),
830| 0| index: current_index,
831| 0| total,
832| 0| });
833| 0| if args.fail_fast {
834| 0| break;
835| 0| }
836| 0| continue;
837| | }
838| | };
839| |
840| | // Skip files exceeding body cap BEFORE sending to LLM to avoid wasting tokens
841| 0| if file_content.len() > crate::constants::MAX_MEMORY_BODY_LEN {
842| 0| let err_msg = format!(
843| 0| "file body exceeds {} byte limit ({} bytes) — skipping to avoid wasting LLM tokens",
844| | crate::constants::MAX_MEMORY_BODY_LEN,
845| 0| file_content.len()
846| | );
847| 0| tracing::warn!(target: "ingest", file = %file_path, size = file_content.len(), "body exceeds limit, skipping LLM extraction");
848| 0| let _ = queue_conn.execute(
849| 0| "UPDATE queue SET status='skipped', error=?1, done_at=datetime('now') WHERE id=?2",
850| 0| rusqlite::params![err_msg, queue_id],
851| 0| );
852| 0| let current_index = completed + failed + skipped;
853| 0| skipped += 1;
854| 0| emit_json(&FileEvent {
855| 0| file: &file_path,
856| 0| name: "",
857| 0| status: "skipped",
858| 0| memory_id: None,
859| 0| entities: None,
860| 0| rels: None,
861| 0| cost_usd: None,
862| 0| input_tokens: None,
863| 0| output_tokens: None,
864| 0| elapsed_ms: Some(file_started.elapsed().as_millis() as u64),
865| 0| error: Some(&err_msg),
866| 0| index: current_index,
867| 0| total,
868| 0| });
869| 0| continue;
870| 0| }
871| |
872| | // Retry once on cold-start failure
873| 0| let max_extract_attempts: u32 = 2;
874| 0| let mut extraction_result: Option<(ExtractionResult, Option<CodexUsage>)> = None;
875| 0| let mut last_extract_err: Option<String> = None;
876| 0| let mut last_was_rate_limited = false;
877| |
878| 0| for attempt in 1..=max_extract_attempts {
879| 0| match extract_with_codex(
880| 0| &codex_binary,
881| 0| &file_content,
882| 0| args.codex_model.as_deref(),
883| 0| args.codex_timeout,
884| 0| &schema_path,
885| | ) {
886| 0| Ok(result) => {
887| 0| extraction_result = Some(result);
888| 0| break;
889| | }
890| 0| Err(ref e) if matches!(e, AppError::RateLimited { .. }) => {
891| 0| last_extract_err = Some(format!("{e}"));
892| 0| last_was_rate_limited = true;
893| 0| break;
894| | }
895| 0| Err(e) => {
896| 0| let msg = format!("{e}");
897| 0| if attempt < max_extract_attempts {
898| 0| let cold_start_delay = 2 * attempt as u64;
899| 0| tracing::warn!(
900| | target: "ingest",
901| | attempt,
902| | delay_secs = cold_start_delay,
903| | error = %msg,
904| 0| "codex extraction failed, retrying"
905| | );
906| 0| std::thread::sleep(std::time::Duration::from_secs(cold_start_delay));
907| 0| }
908| 0| last_extract_err = Some(msg);
909| | }
910| | }
911| | }
912| |
913| 0| if let Some((extraction, usage)) = extraction_result {
914| 0| backoff_secs = args.rate_limit_wait;
915| |
916| 0| let in_tok = usage.as_ref().map(|u| u.input_tokens).unwrap_or(0);
917| 0| let out_tok = usage.as_ref().map(|u| u.output_tokens).unwrap_or(0);
918| |
919| 0| let name = &extraction.name;
920| 0| let ent_count = extraction.entities.len();
921| 0| let rel_count = extraction.relationships.len();
922| |
923| 0| let new_entities: Vec<NewEntity> = extraction
924| 0| .entities
925| 0| .iter()
926| 0| .filter_map(|e| match e.entity_type.parse::<EntityType>() {
927| 0| Ok(et) => Some(NewEntity {
928| 0| name: e.name.clone(),
929| 0| entity_type: et,
930| 0| description: None,
931| 0| }),
932| | Err(_) => {
933| 0| tracing::warn!(
934| | target: "ingest",
935| | entity = %e.name,
936| | entity_type = %e.entity_type,
937| 0| "entity type not recognized, skipping"
938| | );
939| 0| None
940| | }
941| 0| })
942| 0| .collect();
943| |
944| 0| let new_relationships: Vec<NewRelationship> = extraction
945| 0| .relationships
946| 0| .iter()
947| 0| .map(|r| NewRelationship {
948| 0| source: r.source.clone(),
949| 0| target: r.target.clone(),
950| 0| relation: crate::parsers::normalize_relation(&r.relation),
951| 0| strength: r.strength,
952| 0| description: None,
953| 0| })
954| 0| .collect();
955| |
956| 0| let body_str = String::from_utf8_lossy(&file_content);
957| 0| let body_hash = blake3::hash(body_str.as_bytes()).to_hex().to_string();
958| 0| let new_memory = NewMemory {
959| 0| name: name.clone(),
960| 0| namespace: namespace.clone(),
961| 0| memory_type: memory_type_str.clone(),
962| 0| description: extraction.description.clone(),
963| 0| body: body_str.to_string(),
964| 0| body_hash,
965| 0| session_id: None,
966| 0| source: "agent".to_string(),
967| 0| metadata: serde_json::Value::Object(serde_json::Map::new()),
968| 0| };
969| |
970| | // Deduplication: update existing memory instead of failing on UNIQUE
971| 0| let memory_id = match memories::find_by_name_any_state(&conn, &namespace, name)? {
972| 0| Some((existing_id, is_deleted)) => {
973| 0| if is_deleted {
974| 0| memories::clear_deleted_at(&conn, existing_id)?;
975| 0| }
976| 0| let (old_name, old_desc, old_body): (String, String, String) = conn.query_row(
977| 0| "SELECT name, COALESCE(description,''), COALESCE(body,'') FROM memories WHERE id=?1",
978| 0| rusqlite::params![existing_id],
979| 0| |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?)),
980| 0| )?;
981| 0| memories::update(&conn, existing_id, &new_memory, None)?;
982| 0| memories::sync_fts_after_update(
983| 0| &conn,
984| 0| existing_id,
985| 0| &old_name,
986| 0| &old_desc,
987| 0| &old_body,
988| 0| &new_memory.name,
989| 0| &new_memory.description,
990| 0| &new_memory.body,
991| 0| )?;
992| 0| tracing::info!(target: "ingest", name, memory_id = existing_id, "updated existing memory (force-merge)");
993| 0| existing_id
994| | }
995| 0| None => match memories::insert(&conn, &new_memory) {
996| 0| Ok(id) => id,
997| 0| Err(e) => {
998| 0| let err_msg = format!("{e}");
999| 0| let _ = queue_conn.execute(
1000| 0| "UPDATE queue SET status='failed', error=?1, done_at=datetime('now') WHERE id=?2",
1001| 0| rusqlite::params![err_msg, queue_id],
1002| 0| );
1003| 0| let current_index = completed + failed + skipped;
1004| 0| failed += 1;
1005| 0| emit_json(&FileEvent {
1006| 0| file: &file_path,
1007| 0| name,
1008| 0| status: "failed",
1009| 0| memory_id: None,
1010| 0| entities: None,
1011| 0| rels: None,
1012| 0| cost_usd: None,
1013| 0| input_tokens: Some(in_tok),
1014| 0| output_tokens: Some(out_tok),
1015| 0| elapsed_ms: Some(file_started.elapsed().as_millis() as u64),
1016| 0| error: Some(&err_msg),
1017| 0| index: current_index,
1018| 0| total,
1019| 0| });
1020| 0| input_tokens_total += in_tok;
1021| 0| output_tokens_total += out_tok;
1022| 0| if args.fail_fast {
1023| 0| break;
1024| 0| }
1025| 0| continue;
1026| | }
1027| | },
1028| | };
1029| |
1030| 0| for ent in &new_entities {
1031| 0| if let Ok(eid) = entities::upsert_entity(&conn, &namespace, ent) {
1032| 0| let _ = entities::link_memory_entity(&conn, memory_id, eid);
1033| 0| }
1034| | }
1035| 0| for rel in &new_relationships {
1036| 0| crate::parsers::warn_if_non_canonical(&rel.relation);
1037| 0| let src_id = entities::find_entity_id(&conn, &namespace, &rel.source);
1038| 0| let tgt_id = entities::find_entity_id(&conn, &namespace, &rel.target);
1039| 0| if let (Ok(Some(sid)), Ok(Some(tid))) = (src_id, tgt_id) {
1040| 0| let _ = conn.execute(
1041| 0| "INSERT OR IGNORE INTO relationships (namespace, source_id, target_id, relation, weight) VALUES (?1, ?2, ?3, ?4, ?5)",
1042| 0| rusqlite::params![namespace, sid, tid, rel.relation, rel.strength],
1043| 0| );
1044| 0| }
1045| | }
1046| |
1047| 0| let _ = queue_conn.execute(
1048| 0| "UPDATE queue SET status='done', name=?1, memory_id=?2, entities=?3, rels=?4, \
1049| 0| input_tokens=?5, output_tokens=?6, elapsed_ms=?7, done_at=datetime('now') WHERE id=?8",
1050| 0| rusqlite::params![
1051| 0| name,
1052| 0| memory_id,
1053| 0| ent_count,
1054| 0| rel_count,
1055| 0| in_tok,
1056| 0| out_tok,
1057| 0| file_started.elapsed().as_millis() as i64,
1058| 0| queue_id
1059| 0| ],
1060| 0| );
1061| |
1062| 0| let current_index = completed + failed + skipped;
1063| 0| completed += 1;
1064| 0| entities_total += ent_count;
1065| 0| rels_total += rel_count;
1066| 0| input_tokens_total += in_tok;
1067| 0| output_tokens_total += out_tok;
1068| |
1069| 0| emit_json(&FileEvent {
1070| 0| file: &file_path,
1071| 0| name,
1072| 0| status: "done",
1073| 0| memory_id: Some(memory_id),
1074| 0| entities: Some(ent_count),
1075| 0| rels: Some(rel_count),
1076| 0| cost_usd: None,
1077| 0| input_tokens: Some(in_tok),
1078| 0| output_tokens: Some(out_tok),
1079| 0| elapsed_ms: Some(file_started.elapsed().as_millis() as u64),
1080| 0| error: None,
1081| 0| index: current_index,
1082| 0| total,
1083| 0| });
1084| 0| } else if let Some(ref err_str) = last_extract_err {
1085| 0| if last_was_rate_limited {
1086| 0| if crate::retry::is_kill_switch_active() {
1087| 0| tracing::warn!(target: "ingest", "SQLITE_GRAPHRAG_DISABLE_RETRY=1, skipping rate-limit retry");
1088| 0| } else if std::time::Instant::now() >= rate_limit_deadline {
1089| 0| tracing::error!(target: "ingest", "rate-limit retry deadline (1h) exhausted");
1090| | } else {
1091| 0| let half = backoff_secs / 2;
1092| 0| let jitter = if half == 0 { 0 } else { fastrand::u64(0..half) };
1093| 0| let actual_wait = half + jitter;
1094| 0| tracing::warn!(target: "ingest", delay_secs = actual_wait, error_kind = "rate_limited", "Codex rate limited, backing off");
1095| 0| let _ = queue_conn.execute(
1096| 0| "UPDATE queue SET status='pending' WHERE id=?1",
1097| 0| rusqlite::params![queue_id],
1098| 0| );
1099| 0| std::thread::sleep(std::time::Duration::from_secs(actual_wait));
1100| 0| backoff_secs = (backoff_secs * 2).min(900);
1101| 0| continue;
1102| | }
1103| | } else {
1104| 0| let _ = queue_conn.execute(
1105| 0| "UPDATE queue SET status='failed', error=?1, done_at=datetime('now') WHERE id=?2",
1106| 0| rusqlite::params![err_str, queue_id],
1107| 0| );
1108| 0| let current_index = completed + failed + skipped;
1109| 0| failed += 1;
1110| 0| emit_json(&FileEvent {
1111| 0| file: &file_path,
1112| 0| name: "",
1113| 0| status: "failed",
1114| 0| memory_id: None,
1115| 0| entities: None,
1116| 0| rels: None,
1117| 0| cost_usd: None,
1118| 0| input_tokens: None,
1119| 0| output_tokens: None,
1120| 0| elapsed_ms: Some(file_started.elapsed().as_millis() as u64),
1121| 0| error: Some(err_str),
1122| 0| index: current_index,
1123| 0| total,
1124| 0| });
1125| 0| if args.fail_fast {
1126| 0| break;
1127| 0| }
1128| | }
1129| 0| }
1130| | }
1131| |
1132| | // WAL checkpoint before summary
1133| 0| let _ = conn.execute_batch("PRAGMA wal_checkpoint(PASSIVE);");
1134| |
1135| | // Stage 4: Summary
1136| 0| emit_json(&Summary {
1137| 0| summary: true,
1138| 0| files_total: total,
1139| 0| completed,
1140| 0| failed,
1141| 0| skipped,
1142| 0| entities_total,
1143| 0| rels_total,
1144| 0| input_tokens_total,
1145| 0| output_tokens_total,
1146| 0| elapsed_ms: started.elapsed().as_millis() as u64,
1147| 0| });
1148| |
1149| 0| if !args.keep_queue && failed == 0 {
1150| 0| let _ = std::fs::remove_file(&args.queue_db);
1151| 0| }
1152| |
1153| 0| Ok(())
1154| 0|}
1155| |
#[cfg(test)]
mod tests {
    use super::*;

    /// Session-init event that the parser is expected to ignore.
    const THREAD_STARTED: &str = r#"{"type":"thread.started","thread_id":"t1"}"#;

    /// Wraps `text` in an `item.completed` / `agent_message` JSONL event.
    fn make_agent_message_event(text: &str) -> String {
        let encoded = serde_json::to_string(text).unwrap();
        format!(
            r#"{{"type":"item.completed","item":{{"id":"item_0","type":"agent_message","text":{encoded}}}}}"#
        )
    }

    /// Builds a `turn.completed` JSONL event carrying the given token counts.
    fn make_usage_event(input: u64, output: u64) -> String {
        format!(
            r#"{{"type":"turn.completed","usage":{{"input_tokens":{input},"output_tokens":{output}}}}}"#
        )
    }

    /// A minimal extraction payload with one entity and one relationship.
    fn valid_extraction_json() -> String {
        String::from(
            r#"{"name":"test-module","description":"A test module for unit testing purposes","entities":[{"name":"test-entity","entity_type":"concept"}],"relationships":[{"source":"test-entity","target":"test-module","relation":"applies-to","strength":0.8}]}"#,
        )
    }

    #[test]
    fn test_parse_codex_output_valid() {
        let jsonl = [
            THREAD_STARTED.to_string(),
            make_agent_message_event(&valid_extraction_json()),
            make_usage_event(100, 50),
        ]
        .join("\n");

        let (result, usage) = parse_codex_output(&jsonl).expect("parse must succeed");
        assert_eq!(result.name, "test-module");
        assert_eq!(result.entities.len(), 1);
        assert_eq!(result.relationships.len(), 1);

        let u = usage.expect("usage must be present");
        assert_eq!(u.input_tokens, 100);
        assert_eq!(u.output_tokens, 50);
    }

    #[test]
    fn test_parse_codex_output_turn_failed() {
        let failed_event = r#"{"type":"turn.failed","error":{"message":"model error occurred"}}"#;
        let jsonl = [THREAD_STARTED, failed_event].join("\n");

        let msg = parse_codex_output(&jsonl).unwrap_err().to_string();
        assert!(
            msg.contains("turn failed"),
            "expected 'turn failed' in: {msg}"
        );
        assert!(msg.contains("model error occurred"));
    }

    #[test]
    fn test_parse_codex_output_rate_limit() {
        let jsonl = r#"{"type":"turn.failed","error":{"message":"rate_limit exceeded, 429 Too Many Requests"}}"#;

        let err = parse_codex_output(jsonl).unwrap_err();
        assert!(
            matches!(err, AppError::RateLimited { .. }),
            "expected AppError::RateLimited, got: {err}"
        );
    }

    #[test]
    fn test_parse_codex_output_schema_error() {
        let jsonl = r#"{"type":"error","message":"invalid_json_schema: additional properties not allowed"}"#;

        let msg = parse_codex_output(jsonl).unwrap_err().to_string();
        assert!(
            msg.contains("invalid_json_schema") || msg.contains("schema"),
            "expected schema error in: {msg}"
        );
    }

    #[test]
    fn test_extraction_schema_codex_valid_json() {
        let parsed: Result<serde_json::Value, _> = serde_json::from_str(EXTRACTION_SCHEMA_CODEX);
        parsed.expect("schema must be valid JSON");
    }

    #[test]
    fn test_extraction_schema_codex_has_additional_properties_false() {
        let schema: serde_json::Value =
            serde_json::from_str(EXTRACTION_SCHEMA_CODEX).expect("schema must be valid JSON");

        // Root level
        let root = &schema["additionalProperties"];
        assert_eq!(
            root.as_bool(),
            Some(false),
            "root must have additionalProperties: false"
        );

        // Entity items level
        let entity_items = &schema["properties"]["entities"]["items"]["additionalProperties"];
        assert_eq!(
            entity_items.as_bool(),
            Some(false),
            "entity items must have additionalProperties: false"
        );

        // Relationship items level
        let rel_items = &schema["properties"]["relationships"]["items"]["additionalProperties"];
        assert_eq!(
            rel_items.as_bool(),
            Some(false),
            "relationship items must have additionalProperties: false"
        );
    }

    #[test]
    fn test_parse_codex_output_last_agent_message_wins() {
        // Two agent_message items in one stream: only the later one may be returned.
        let first_text = r#"{"name":"first-result","description":"First result should be ignored","entities":[],"relationships":[]}"#;
        let second_text = r#"{"name":"final-result","description":"Final result wins over earlier ones","entities":[{"name":"final-entity","entity_type":"concept"}],"relationships":[]}"#;

        let jsonl = [
            THREAD_STARTED.to_string(),
            make_agent_message_event(first_text),
            make_agent_message_event(second_text),
            make_usage_event(200, 80),
        ]
        .join("\n");

        let (result, _) = parse_codex_output(&jsonl).expect("parse must succeed");
        assert_eq!(result.name, "final-result", "last agent_message should win");
        assert_eq!(result.entities.len(), 1);
    }

    #[test]
    fn test_parse_codex_output_skips_malformed_lines() {
        let jsonl = [
            "not json at all".to_string(),
            make_agent_message_event(&valid_extraction_json()),
            "{broken".to_string(),
            make_usage_event(10, 5),
        ]
        .join("\n");

        // Should succeed despite malformed lines
        let (result, _) = parse_codex_output(&jsonl).expect("malformed lines must be skipped");
        assert_eq!(result.name, "test-module");
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/init.rs:
1| |//! Handler for the `init` CLI subcommand.
2| |
3| |use crate::errors::AppError;
4| |use crate::output;
5| |use crate::paths::AppPaths;
6| |use crate::pragmas::{apply_init_pragmas, ensure_wal_mode};
7| |use crate::storage::connection::open_rw;
8| |use serde::Serialize;
9| |
/// Embedding model choices exposed through `--model`.
///
/// Currently only `multilingual-e5-small` is supported. Additional variants
/// will be added here as new models are integrated; the `ValueEnum` derive
/// ensures the CLI rejects unknown strings at parse time rather than at runtime.
#[derive(Copy, Clone, Debug, PartialEq, Eq, clap::ValueEnum)]
pub enum EmbeddingModelChoice {
    // Spelled `multilingual-e5-small` on the command line via the explicit `value(name = ...)`.
    // A plain `//` comment is used on purpose: clap's derive surfaces `///` on a variant as
    // help text for that possible value.
    #[value(name = "multilingual-e5-small")]
    MultilingualE5Small,
}
20| |
// Command-line arguments accepted by `sqlite-graphrag init`.
//
// Reviewer notes in this struct use plain `//` comments because clap's derive turns `///`
// doc comments into user-visible help text.
#[derive(clap::Args)]
#[command(after_long_help = "EXAMPLES:\n \
    # Initialize a new database in the current directory\n \
    sqlite-graphrag init\n\n \
    # Initialize with a specific namespace\n \
    sqlite-graphrag init --namespace my-project\n\n \
    # Initialize at a custom database path\n \
    sqlite-graphrag init --db /path/to/graphrag.sqlite")]
pub struct InitArgs {
    /// Path to graphrag.sqlite. Defaults to `./graphrag.sqlite` in the current directory.
    /// Resolution precedence (highest to lowest): `--db` flag > `SQLITE_GRAPHRAG_DB_PATH` env >
    /// `SQLITE_GRAPHRAG_HOME` env (used as base directory) > cwd.
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,
    /// Embedding model identifier. Currently only `multilingual-e5-small` is supported.
    /// Reserved for future multi-model support; safe to omit.
    // NOTE(review): `run` in this file never reads this field; the model name it stores and
    // reports is a fixed literal.
    #[arg(long, value_enum)]
    pub model: Option<EmbeddingModelChoice>,
    /// Force re-initialization, overwriting any existing schema metadata.
    /// Use only when the schema is corrupted; loses configuration but preserves data.
    // NOTE(review): `run` in this file never reads this field, and it rewrites the
    // `schema_meta` rows with `INSERT OR REPLACE` on every invocation — confirm whether the
    // flag is meant to gate that behaviour.
    #[arg(long)]
    pub force: bool,
    /// Initial namespace to resolve. Aligned with bilingual docs that mention `init --namespace`.
    /// When provided, overrides `SQLITE_GRAPHRAG_NAMESPACE`; otherwise resolves via env or fallback `global`.
    #[arg(long)]
    pub namespace: Option<String>,
    // Hidden compatibility flag; `run` does not consult it and always emits JSON.
    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
    pub json: bool,
}
50| |
/// JSON payload that `init` passes to `output::emit_json` after a successful run.
#[derive(Serialize)]
struct InitResponse {
    /// Resolved database path, rendered with `Path::display`.
    db_path: String,
    /// Latest applied migration number from `refinery_schema_history`.
    /// Emitted as a JSON number for cross-command consistency with `health` and `stats` (since v1.0.35).
    schema_version: u32,
    /// Embedding model name; `run` always reports `multilingual-e5-small`.
    model: String,
    /// Length of the vector returned by the smoke-test embedding performed during `init`.
    dim: usize,
    /// Active namespace resolved during initialisation, aligned with the bilingual docs.
    namespace: String,
    /// Outcome marker; `run` sets it to `"ok"` on the success path.
    status: String,
    /// Total execution time in milliseconds from handler start to serialisation.
    elapsed_ms: u64,
}
65| |
66| 0|pub fn run(args: InitArgs) -> Result<(), AppError> {
67| 0| let start = std::time::Instant::now();
68| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
69| 0| paths.ensure_dirs()?;
70| |
71| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
72| |
73| 0| let mut conn = open_rw(&paths.db)?;
74| |
75| 0| apply_init_pragmas(&conn)?;
76| |
77| 0| crate::migrations::runner()
78| 0| .run(&mut conn)
79| 0| .map_err(|e| AppError::Internal(anyhow::anyhow!("migration failed: {e}")))?;
80| |
81| 0| conn.execute_batch(&format!(
82| 0| "PRAGMA user_version = {};",
83| 0| crate::constants::SCHEMA_USER_VERSION
84| 0| ))?;
85| |
86| | // Defensive re-assertion: refinery may revert journal_mode during migrations.
87| 0| ensure_wal_mode(&conn)?;
88| |
89| 0| let schema_version = latest_schema_version(&conn)?;
90| |
91| 0| conn.execute(
92| 0| "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('schema_version', ?1)",
93| 0| rusqlite::params![schema_version],
94| 0| )?;
95| 0| conn.execute(
96| 0| "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('model', 'multilingual-e5-small')",
97| 0| [],
98| 0| )?;
99| 0| conn.execute(
100| 0| "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('dim', '384')",
101| 0| [],
102| 0| )?;
103| 0| conn.execute(
104| 0| "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('created_at', CAST(unixepoch() AS TEXT))",
105| 0| [],
106| 0| )?;
107| 0| conn.execute(
108| 0| "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('sqlite-graphrag_version', ?1)",
109| 0| rusqlite::params![crate::constants::SQLITE_GRAPHRAG_VERSION],
110| 0| )?;
111| | // Persist the resolved namespace so downstream tools can inspect it without re-resolving.
112| 0| conn.execute(
113| 0| "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('namespace_initial', ?1)",
114| 0| rusqlite::params![namespace],
115| 0| )?;
116| |
117| 0| output::emit_progress_i18n(
118| 0| "Initializing embedding model (may download on first run)...",
119| 0| crate::i18n::validation::runtime_pt::initializing_embedding_model(),
120| | );
121| |
122| 0| let test_emb = crate::daemon::embed_passage_or_local(&paths.models, "smoke test")?;
123| |
124| 0| output::emit_json(&InitResponse {
125| 0| db_path: paths.db.display().to_string(),
126| 0| schema_version,
127| 0| model: "multilingual-e5-small".to_string(),
128| 0| dim: test_emb.len(),
129| 0| namespace,
130| 0| status: "ok".to_string(),
131| 0| elapsed_ms: start.elapsed().as_millis() as u64,
132| 0| })?;
133| |
134| 0| Ok(())
135| 0|}
136| |
137| 2|fn latest_schema_version(conn: &rusqlite::Connection) -> Result<u32, AppError> {
138| 2| match conn.query_row(
139| 2| "SELECT version FROM refinery_schema_history ORDER BY version DESC LIMIT 1",
140| 2| [],
141| 1| |row| row.get::<_, i64>(0),
142| | ) {
143| 1| Ok(version) => Ok(version.max(0) as u32),
144| 1| Err(rusqlite::Error::QueryReturnedNoRows) => Ok(0),
145| 0| Err(err) => Err(AppError::Database(err)),
146| | }
147| 2|}
148| |
#[cfg(test)]
mod tests {
    use super::*;

    /// Baseline response for the serialization tests; individual tests override fields.
    fn baseline_response() -> InitResponse {
        InitResponse {
            db_path: String::from("/tmp/test.sqlite"),
            schema_version: 6,
            model: String::from("multilingual-e5-small"),
            dim: 384,
            namespace: String::from("global"),
            status: String::from("ok"),
            elapsed_ms: 100,
        }
    }

    /// Opens an in-memory database and runs `setup_sql`, panicking with `failure_msg` on error.
    fn history_db(setup_sql: &str, failure_msg: &str) -> rusqlite::Connection {
        let conn = rusqlite::Connection::open_in_memory().expect("failed to open in-memory db");
        conn.execute_batch(setup_sql).expect(failure_msg);
        conn
    }

    #[test]
    fn init_response_serializes_all_fields() {
        let resp = baseline_response();
        let json = serde_json::to_value(&resp).expect("serialization failed");

        assert_eq!(json["db_path"], "/tmp/test.sqlite");
        assert_eq!(json["schema_version"], 6);
        assert_eq!(json["model"], "multilingual-e5-small");
        assert_eq!(json["dim"], 384usize);
        for (key, expected) in [("namespace", "global"), ("status", "ok")] {
            assert_eq!(json[key], expected);
        }
        assert!(json["elapsed_ms"].is_number());
    }

    #[test]
    fn latest_schema_version_returns_zero_for_empty_db() {
        let conn = history_db(
            "CREATE TABLE refinery_schema_history (version INTEGER NOT NULL);",
            "failed to create table",
        );

        let version = latest_schema_version(&conn).expect("latest_schema_version failed");
        assert_eq!(version, 0u32, "empty db must return schema_version 0");
    }

    #[test]
    fn latest_schema_version_returns_max_version() {
        // Rows are inserted out of order so the query's ordering is what gets exercised.
        let conn = history_db(
            "CREATE TABLE refinery_schema_history (version INTEGER NOT NULL);
             INSERT INTO refinery_schema_history VALUES (1);
             INSERT INTO refinery_schema_history VALUES (3);
             INSERT INTO refinery_schema_history VALUES (2);",
            "failed to populate table",
        );

        let version = latest_schema_version(&conn).expect("latest_schema_version failed");
        assert_eq!(version, 3u32, "must return the highest version present");
    }

    #[test]
    fn init_response_dim_aligned_with_constant() {
        let configured_dim = crate::constants::EMBEDDING_DIM;
        assert_eq!(
            configured_dim, 384,
            "dim must be aligned with EMBEDDING_DIM=384"
        );
    }

    #[test]
    fn init_response_namespace_aligned_with_schema() {
        // The namespace given at construction must come back unchanged after serialization.
        let resp = InitResponse {
            db_path: String::from("/tmp/x.sqlite"),
            namespace: String::from("my-project"),
            elapsed_ms: 0,
            ..baseline_response()
        };
        let json = serde_json::to_value(&resp).expect("serialization failed");
        assert_eq!(json["namespace"], "my-project");
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/link.rs:
1| |//! Handler for the `link` CLI subcommand.
2| |
3| |use crate::constants::DEFAULT_RELATION_WEIGHT;
4| |use crate::entity_type::EntityType;
5| |use crate::errors::AppError;
6| |use crate::i18n::{errors_msg, validation};
7| |use crate::output::{self, OutputFormat};
8| |use crate::paths::AppPaths;
9| |use crate::storage::connection::open_rw;
10| |use crate::storage::entities;
11| |use crate::storage::entities::NewEntity;
12| |use rusqlite::params;
13| |use serde::Serialize;
14| |
// Command-line arguments accepted by `sqlite-graphrag link`.
//
// Reviewer notes in this struct use plain `//` comments because clap's derive turns `///`
// doc comments into user-visible help text.
#[derive(clap::Args)]
#[command(after_long_help = "EXAMPLES:\n \
    # Link two existing graph entities (extracted by GLiNER NER during `remember`)\n \
    sqlite-graphrag link --from oauth-flow --to refresh-tokens --relation related\n\n \
    # Auto-create entities that don't exist yet\n \
    sqlite-graphrag link --from concept-a --to concept-b --relation depends-on --create-missing\n\n \
    # Specify entity type for auto-created entities\n \
    sqlite-graphrag link --from alice --to acme-corp --relation related --create-missing --entity-type person\n\n \
    # Use a custom (non-canonical) relation type\n \
    sqlite-graphrag link --from module-a --to module-b --relation implements --create-missing\n\n \
    # If the entity does not exist and --create-missing is not set, the command fails with exit 4.\n \
    # To list current entity names:\n \
    sqlite-graphrag graph entities | jaq '.entities[].name'\n\n \
NOTE:\n \
    --from and --to expect ENTITY names (graph nodes), not memory names.\n \
    Memory names are managed via remember/read/edit/forget; entities are auto-extracted\n \
    by GLiNER NER from memory bodies or auto-created via --create-missing.")]
pub struct LinkArgs {
    /// Source ENTITY name (graph node, not memory). Entities are extracted by GLiNER NER during
    /// `remember` or auto-created via `--create-missing`. Use `graph entities` to list
    /// available entity names. Also accepts the alias `--name`.
    #[arg(long, alias = "name")]
    pub from: String,
    /// Target ENTITY name (graph node, not memory). See `--from` for sourcing entity names.
    #[arg(long)]
    pub to: String,
    /// Relation type between entities. Canonical values: applies-to, uses,
    /// depends-on, causes, fixes, contradicts, supports, follows, related,
    /// mentions, replaces, tracked-in. Any kebab-case or snake_case string
    /// is also accepted as a custom relation.
    #[arg(long, value_parser = crate::parsers::parse_relation, value_name = "RELATION")]
    pub relation: String,
    // Edge weight. `run` falls back to `DEFAULT_RELATION_WEIGHT` when omitted and rejects
    // values outside the closed range 0.0..=1.0.
    #[arg(long)]
    pub weight: Option<f64>,
    // Namespace override; `run` hands it to `crate::namespace::resolve_namespace`.
    #[arg(long)]
    pub namespace: Option<String>,
    // Output format; `run` emits JSON for `json` and a one-line summary for text/markdown.
    #[arg(long, value_enum, default_value = "json")]
    pub format: OutputFormat,
    // Hidden compatibility flag; `run` does not consult it.
    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
    pub json: bool,
    // Database path override; `run` hands it to `AppPaths::resolve`.
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,
    /// Auto-create entities when they do not exist. Created entities default to
    /// type `concept` unless `--entity-type` specifies a different type.
    #[arg(long, default_value_t = false)]
    pub create_missing: bool,
    /// Entity type assigned to auto-created entities (only effective with `--create-missing`).
    #[arg(long, value_enum, default_value = "concept")]
    pub entity_type: EntityType,
    /// Reject non-canonical relation types with exit 1.
    ///
    /// When set, any relation not in the canonical list causes an immediate error.
    /// Canonical values: applies-to, uses, depends-on, causes, fixes, contradicts,
    /// supports, follows, related, mentions, replaces, tracked-in.
    #[arg(
        long,
        default_value_t = false,
        help = "Reject non-canonical relation types with exit 1"
    )]
    pub strict_relations: bool,
    /// Emit a warning (but do not reject) when creating an edge would push either endpoint
    /// entity above this degree. Default 50. Set 0 to disable the check.
    #[arg(long, default_value_t = 50, value_name = "N")]
    pub max_entity_degree: u32,
}
80| |
/// JSON payload that `link` passes to `output::emit_json`.
#[derive(Serialize)]
struct LinkResponse {
    /// `"created"` when a new relationship row was inserted, `"already_exists"` otherwise.
    action: String,
    /// Normalized source entity name.
    from: String,
    /// Normalized target entity name.
    to: String,
    /// Relation type exactly as supplied on the command line.
    relation: String,
    /// Weight read back from the `relationships` row after the create-or-fetch step.
    weight: f64,
    /// Namespace the link was created in.
    namespace: String,
    /// Total execution time in milliseconds from handler start to serialisation.
    elapsed_ms: u64,
    /// Entity names that were auto-created by `--create-missing`.
    #[serde(skip_serializing_if = "Vec::is_empty")]
    created_entities: Vec<String>,
    /// Non-fatal warnings (e.g. non-canonical relation type).
    #[serde(skip_serializing_if = "Vec::is_empty")]
    warnings: Vec<String>,
}
98| |
99| 0|pub fn run(args: LinkArgs) -> Result<(), AppError> {
100| 0| let inicio = std::time::Instant::now();
101| 0| tracing::debug!(target: "link", from = %args.from, to = %args.to, relation = %args.relation, "creating relationship");
102| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
103| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
104| |
105| 0| let norm_from = crate::parsers::normalize_entity_name(&args.from);
106| 0| let norm_to = crate::parsers::normalize_entity_name(&args.to);
107| |
108| 0| if norm_from == norm_to {
109| 0| return Err(AppError::Validation(validation::self_referential_link()));
110| 0| }
111| |
112| 0| let weight = args.weight.unwrap_or(DEFAULT_RELATION_WEIGHT);
113| 0| if !(0.0..=1.0).contains(&weight) {
114| 0| return Err(AppError::Validation(validation::invalid_link_weight(
115| 0| weight,
116| 0| )));
117| 0| }
118| 0| if weight >= 0.95 {
119| 0| tracing::warn!(target: "link",
120| | weight = weight,
121| 0| "weight >= 0.95 compresses the scoring range; consider using a value below 0.95"
122| | );
123| 0| }
124| 0| if weight <= 0.05 {
125| 0| tracing::warn!(target: "link",
126| | weight = weight,
127| 0| "weight <= 0.05 may be too weak to influence traversal; consider using a value above 0.05"
128| | );
129| 0| }
130| |
131| 0| crate::storage::connection::ensure_db_ready(&paths)?;
132| |
133| 0| let mut warnings: Vec<String> = Vec::with_capacity(2);
134| 0| let is_canonical = crate::parsers::is_canonical_relation(&args.relation);
135| 0| if !is_canonical {
136| 0| if args.strict_relations {
137| 0| return Err(AppError::Validation(format!(
138| 0| "non-canonical relation '{}': use --strict-relations=false or choose from: {}",
139| 0| args.relation,
140| 0| crate::parsers::CANONICAL_RELATIONS.join(", ")
141| 0| )));
142| 0| }
143| 0| warnings.push(format!("non-canonical relation '{}'", args.relation));
144| 0| tracing::warn!(target: "link",
145| | relation = %args.relation,
146| 0| "non-canonical relation accepted; consider using a well-known value"
147| | );
148| 0| }
149| 0| let relation_str = &args.relation;
150| |
151| 0| let mut conn = open_rw(&paths.db)?;
152| 0| let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
153| |
154| 0| let mut created_entities: Vec<String> = Vec::with_capacity(2);
155| |
156| 0| if args.entity_type.as_str() == "memory" {
157| 0| tracing::warn!(target: "link",
158| | entity_type = "memory",
159| 0| "entity_type 'memory' may conflict with memory table semantics; consider using 'concept' or another type"
160| | );
161| 0| }
162| |
163| 0| let source_id = match entities::find_entity_id(&tx, &namespace, &norm_from)? {
164| 0| Some(id) => id,
165| 0| None if args.create_missing => {
166| 0| let new_entity = NewEntity {
167| 0| name: norm_from.clone(),
168| 0| entity_type: args.entity_type,
169| 0| description: None,
170| 0| };
171| 0| created_entities.push(norm_from.clone());
172| 0| entities::upsert_entity(&tx, &namespace, &new_entity)?
173| | }
174| | None => {
175| 0| return Err(AppError::NotFound(errors_msg::entity_not_found(
176| 0| &norm_from, &namespace,
177| 0| )));
178| | }
179| | };
180| |
181| 0| let target_id = match entities::find_entity_id(&tx, &namespace, &norm_to)? {
182| 0| Some(id) => id,
183| 0| None if args.create_missing => {
184| 0| let new_entity = NewEntity {
185| 0| name: norm_to.clone(),
186| 0| entity_type: args.entity_type,
187| 0| description: None,
188| 0| };
189| 0| created_entities.push(norm_to.clone());
190| 0| entities::upsert_entity(&tx, &namespace, &new_entity)?
191| | }
192| | None => {
193| 0| return Err(AppError::NotFound(errors_msg::entity_not_found(
194| 0| &norm_to, &namespace,
195| 0| )));
196| | }
197| | };
198| |
199| 0| let (rel_id, was_created) = entities::create_or_fetch_relationship(
200| 0| &tx,
201| 0| &namespace,
202| 0| source_id,
203| 0| target_id,
204| 0| relation_str,
205| 0| weight,
206| 0| None,
207| 0| )?;
208| |
209| 0| let actual_weight: f64 = tx.query_row(
210| 0| "SELECT weight FROM relationships WHERE id = ?1",
211| 0| params![rel_id],
212| 0| |r| r.get(0),
213| 0| )?;
214| |
215| 0| if was_created {
216| 0| entities::recalculate_degree(&tx, source_id)?;
217| 0| entities::recalculate_degree(&tx, target_id)?;
218| |
219| 0| if args.max_entity_degree > 0 {
220| 0| let cap = args.max_entity_degree as i64;
221| 0| for (entity_id, entity_name) in [(source_id, &norm_from), (target_id, &norm_to)] {
222| 0| let degree: i64 = tx.query_row(
223| 0| "SELECT degree FROM entities WHERE id = ?1",
224| 0| params![entity_id],
225| 0| |r| r.get(0),
226| 0| )?;
227| 0| if degree > cap {
228| 0| output::emit_progress(&format!(
229| 0| "WARNING: entity '{entity_name}' degree {degree} exceeds cap {cap}"
230| 0| ));
231| 0| }
232| | }
233| 0| }
234| 0| }
235| 0| tx.commit()?;
236| |
237| 0| conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
238| |
239| 0| let action = if was_created {
240| 0| "created".to_string()
241| | } else {
242| 0| "already_exists".to_string()
243| | };
244| |
245| 0| let response = LinkResponse {
246| 0| action: action.clone(),
247| 0| from: norm_from.clone(),
248| 0| to: norm_to.clone(),
249| 0| relation: relation_str.to_string(),
250| 0| weight: actual_weight,
251| 0| namespace: namespace.clone(),
252| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
253| 0| created_entities,
254| 0| warnings,
255| 0| };
256| |
257| 0| match args.format {
258| 0| OutputFormat::Json => output::emit_json(&response)?,
259| 0| OutputFormat::Text | OutputFormat::Markdown => {
260| 0| output::emit_text(&format!(
261| 0| "{}: {} --[{}]--> {} [{}]",
262| 0| action, response.from, response.relation, response.to, response.namespace
263| 0| ));
264| 0| }
265| | }
266| |
267| 0| Ok(())
268| 0|}
269| |
#[cfg(test)]
mod tests {
    use super::*;

    /// Baseline response (`a --[uses]--> b`, freshly created, no extras); tests override
    /// the fields they care about with struct-update syntax.
    fn baseline() -> LinkResponse {
        LinkResponse {
            action: String::from("created"),
            from: String::from("a"),
            to: String::from("b"),
            relation: String::from("uses"),
            weight: 1.0,
            namespace: String::from("global"),
            elapsed_ms: 0,
            created_entities: Vec::new(),
            warnings: Vec::new(),
        }
    }

    #[test]
    fn link_response_without_redundant_aliases() {
        // P1-O: source/target fields were removed from the JSON response.
        let resp = LinkResponse {
            from: String::from("entity-a"),
            to: String::from("entity-b"),
            namespace: String::from("default"),
            ..baseline()
        };
        let json = serde_json::to_value(&resp).expect("serialization must work");

        assert_eq!(json["from"], "entity-a");
        assert_eq!(json["to"], "entity-b");
        assert!(
            json.get("source").is_none(),
            "field 'source' was removed in P1-O"
        );
        assert!(
            json.get("target").is_none(),
            "field 'target' was removed in P1-O"
        );
    }

    #[test]
    fn link_response_serializes_all_fields() {
        let resp = LinkResponse {
            action: String::from("already_exists"),
            from: String::from("origin"),
            to: String::from("destination"),
            relation: String::from("mentions"),
            weight: 0.8,
            namespace: String::from("test"),
            elapsed_ms: 5,
            ..baseline()
        };
        let json = serde_json::to_value(&resp).expect("serialization must work");

        // Fields that are always serialized, regardless of the optional vectors.
        let mandatory = [
            "action",
            "from",
            "to",
            "relation",
            "weight",
            "namespace",
            "elapsed_ms",
        ];
        for key in mandatory {
            assert!(json.get(key).is_some());
        }
    }

    #[test]
    fn link_response_omits_created_entities_when_empty() {
        let json = serde_json::to_value(baseline()).expect("serialization");
        assert!(
            json.get("created_entities").is_none(),
            "empty vec must be omitted"
        );
    }

    #[test]
    fn link_response_includes_created_entities_when_present() {
        let resp = LinkResponse {
            from: String::from("new-a"),
            to: String::from("new-b"),
            relation: String::from("depends-on"),
            weight: 0.5,
            namespace: String::from("test"),
            elapsed_ms: 1,
            created_entities: vec![String::from("new-a"), String::from("new-b")],
            ..baseline()
        };
        let json = serde_json::to_value(&resp).expect("serialization");

        let created = json["created_entities"].as_array().expect("must be array");
        assert_eq!(created.len(), 2);
        assert_eq!(created[0], "new-a");
        assert_eq!(created[1], "new-b");
    }

    #[test]
    fn link_response_includes_warnings_when_non_canonical() {
        let resp = LinkResponse {
            relation: String::from("implements"),
            weight: 0.5,
            warnings: vec![String::from("non-canonical relation 'implements'")],
            ..baseline()
        };
        let json = serde_json::to_value(&resp).expect("serialization");

        let w = json["warnings"]
            .as_array()
            .expect("warnings must be present");
        assert_eq!(w.len(), 1);
        assert!(w[0].as_str().unwrap().contains("implements"));
    }

    #[test]
    fn link_response_omits_warnings_when_empty() {
        let resp = LinkResponse {
            weight: 0.5,
            ..baseline()
        };
        let json = serde_json::to_value(&resp).expect("serialization");
        assert!(
            json.get("warnings").is_none(),
            "empty warnings must be omitted"
        );
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/list.rs:
1| |//! Handler for the `list` CLI subcommand.
2| |
3| |use crate::cli::MemoryType;
4| |use crate::errors::AppError;
5| |use crate::output::{self, OutputFormat};
6| |use crate::paths::AppPaths;
7| |use crate::storage::connection::open_ro;
8| |use crate::storage::memories;
9| |use serde::Serialize;
10| |
// Command-line arguments accepted by `sqlite-graphrag list`.
//
// Reviewer notes in this struct use plain `//` comments because clap's derive turns `///`
// doc comments into user-visible help text.
#[derive(clap::Args)]
#[command(after_long_help = "EXAMPLES:\n \
    # List up to 50 memories from the global namespace (default)\n \
    sqlite-graphrag list\n\n \
    # Filter by memory type and namespace\n \
    sqlite-graphrag list --type project --namespace my-project\n\n \
    # Paginate with limit and offset\n \
    sqlite-graphrag list --limit 20 --offset 40\n\n \
    # Include soft-deleted memories\n \
    sqlite-graphrag list --include-deleted")]
pub struct ListArgs {
    // Namespace override; `run` hands it to `crate::namespace::resolve_namespace`.
    #[arg(
        long,
        help = "Namespace (env: SQLITE_GRAPHRAG_NAMESPACE, default: global)"
    )]
    pub namespace: Option<String>,
    /// Filter by memory.type. Note: distinct from graph entity_type
    /// (project/tool/person/file/concept/incident/decision/memory/dashboard/issue_tracker/organization/location/date)
    /// used in --entities-file.
    #[arg(long, value_enum)]
    pub r#type: Option<MemoryType>,
    // `run` rejects `Some(0)` with a validation error. When omitted, `run` uses
    // `usize::MAX` for JSON output and 50 for every other format.
    #[arg(
        long,
        help = "Maximum number of memories to return (default: 50 for text, all for JSON)"
    )]
    pub limit: Option<usize>,
    /// Number of memories to skip before returning results.
    #[arg(long, default_value = "0", help = "Number of memories to skip")]
    pub offset: usize,
    /// Output format: json (default), text, or markdown.
    #[arg(long, value_enum, default_value = "json", help = "Output format")]
    pub format: OutputFormat,
    /// Include soft-deleted memories in the listing (deleted_at IS NOT NULL).
    #[arg(long, default_value_t = false, help = "Include soft-deleted memories")]
    pub include_deleted: bool,
    // Hidden compatibility flag; `run` does not consult it.
    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
    pub json: bool,
    /// Path to graphrag.sqlite (overrides SQLITE_GRAPHRAG_DB_PATH and default CWD).
    #[arg(
        long,
        env = "SQLITE_GRAPHRAG_DB_PATH",
        help = "Path to graphrag.sqlite"
    )]
    pub db: Option<String>,
}
56| |
/// One memory row as rendered by `list`.
#[derive(Serialize, Clone)]
struct ListItem {
    /// Row id of the memory.
    id: i64,
    /// Semantic alias of `id` for the contract documented in SKILL.md.
    memory_id: i64,
    /// Memory name.
    name: String,
    /// Namespace the memory belongs to.
    namespace: String,
    /// Semantic alias for agents that parse `.type` in the JSON output.
    #[serde(rename = "type")]
    type_field: String,
    /// Semantic alias for agents that parse `.memory_type` in the JSON output.
    memory_type: String,
    /// Memory description, copied from the stored row.
    description: String,
    /// Leading excerpt of the body; `run` keeps the first 200 characters.
    snippet: String,
    /// Last-update timestamp as stored on the row (integer epoch).
    updated_at: i64,
    /// RFC 3339 UTC timestamp parallel to `updated_at`.
    updated_at_iso: String,
    /// Unix epoch when the memory was soft-deleted, or omitted for active memories.
    /// Surfaced only in `list --include-deleted --json` so LLM consumers can
    /// distinguish active rows from soft-deleted ones in a single query (v1.0.37 H7+M9).
    #[serde(skip_serializing_if = "Option::is_none")]
    deleted_at: Option<i64>,
    /// RFC 3339 UTC mirror of `deleted_at`, omitted when `deleted_at` is None.
    #[serde(skip_serializing_if = "Option::is_none")]
    deleted_at_iso: Option<String>,
    /// Byte length of the full memory body.
    body_length: usize,
}
85| |
/// JSON payload that `list` passes to `output::emit_json`.
#[derive(Serialize)]
struct ListResponse {
    /// Returned memories.
    items: Vec<ListItem>,
    /// Same list as `items`; `run` fills it with a clone so both keys appear in the JSON.
    memories: Vec<ListItem>,
    /// Total number of matching memories in the namespace (ignoring limit/offset).
    ///
    /// NOTE(review): `run` in this file sets this to the number of rows it returns, not to
    /// a separate count query — confirm which meaning consumers rely on.
    total_count: usize,
    /// True when the returned item count is less than `total_count`, indicating
    /// that more results exist beyond the applied limit.
    truncated: bool,
    /// Total execution time in milliseconds from handler start to serialisation.
    elapsed_ms: u64,
}
98| |
99| 0|pub fn run(args: ListArgs) -> Result<(), AppError> {
100| 0| if args.limit == Some(0) {
101| 0| return Err(AppError::Validation(
102| 0| "--limit must be greater than zero".to_string(),
103| 0| ));
104| 0| }
105| 0| let inicio = std::time::Instant::now();
106| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
107| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
108| | // v1.0.22 P1: standardizes exit code 4 with a friendly message when the DB does not exist.
109| 0| crate::storage::connection::ensure_db_ready(&paths)?;
110| 0| let conn = open_ro(&paths.db)?;
111| |
112| 0| let effective_limit = args.limit.unwrap_or(match args.format {
113| 0| OutputFormat::Json => usize::MAX,
114| 0| _ => 50,
115| | });
116| |
117| 0| let memory_type_str = args.r#type.map(|t| t.as_str());
118| 0| let rows = memories::list(
119| 0| &conn,
120| 0| &namespace,
121| 0| memory_type_str,
122| 0| effective_limit,
123| 0| args.offset,
124| 0| args.include_deleted,
125| 0| )?;
126| |
127| 0| let items: Vec<ListItem> = rows
128| 0| .into_iter()
129| 0| .map(|r| {
130| 0| let body_length = r.body.len();
131| 0| let snippet: String = r.body.chars().take(200).collect();
132| 0| let updated_at_iso = crate::tz::epoch_to_iso(r.updated_at);
133| 0| let deleted_at_iso = r.deleted_at.map(crate::tz::epoch_to_iso);
134| 0| ListItem {
135| 0| id: r.id,
136| 0| memory_id: r.id,
137| 0| name: r.name,
138| 0| namespace: r.namespace,
139| 0| type_field: r.memory_type.clone(),
140| 0| memory_type: r.memory_type,
141| 0| description: r.description,
142| 0| snippet,
143| 0| updated_at: r.updated_at,
144| 0| updated_at_iso,
145| 0| deleted_at: r.deleted_at,
146| 0| deleted_at_iso,
147| 0| body_length,
148| 0| }
149| 0| })
150| 0| .collect();
151| |
152| 0| let total_count = items.len();
153| 0| let truncated = args.limit.is_some_and(|lim| items.len() >= lim);
154| |
155| 0| match args.format {
156| | OutputFormat::Json => {
157| 0| let memories = items.clone();
158| 0| output::emit_json(&ListResponse {
159| 0| total_count,
160| 0| truncated,
161| 0| memories,
162| 0| items,
163| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
164| 0| })?;
165| | }
166| | OutputFormat::Text | OutputFormat::Markdown => {
167| 0| for item in &items {
168| 0| output::emit_text(&format!("{}: {}", item.name, item.snippet));
169| 0| }
170| | }
171| | }
172| 0| Ok(())
173| 0|}
174| |
#[cfg(test)]
mod tests {
    use super::*;

    /// Baseline fixture: a live (not soft-deleted) `note` memory in the `global`
    /// namespace. Individual tests override fields with struct-update syntax.
    fn make_item(name: &str) -> ListItem {
        ListItem {
            id: 1,
            memory_id: 1,
            name: name.to_string(),
            namespace: "global".to_string(),
            type_field: "note".to_string(),
            memory_type: "note".to_string(),
            description: "desc".to_string(),
            snippet: "snip".to_string(),
            updated_at: 1_745_000_000,
            updated_at_iso: "2025-04-19T00:00:00Z".to_string(),
            deleted_at: None,
            deleted_at_iso: None,
            body_length: 4,
        }
    }

    /// Variant of [`make_item`] pinned to the Unix epoch with a zero body length.
    fn epoch_item(name: &str) -> ListItem {
        ListItem {
            updated_at: 0,
            updated_at_iso: "1970-01-01T00:00:00Z".to_string(),
            body_length: 0,
            ..make_item(name)
        }
    }

    #[test]
    fn list_response_serializes_items_and_elapsed_ms() {
        let resp = ListResponse {
            total_count: 1,
            truncated: false,
            elapsed_ms: 7,
            memories: vec![make_item("test-memory")],
            items: vec![make_item("test-memory")],
        };
        let json = serde_json::to_value(&resp).unwrap();

        assert!(json["items"].is_array());
        assert_eq!(json["items"].as_array().unwrap().len(), 1);

        let first = &json["items"][0];
        assert_eq!(first["name"], "test-memory");
        assert_eq!(first["memory_id"], 1);
        assert_eq!(json["elapsed_ms"], 7);
        // deleted_at/deleted_at_iso must be omitted when None (skip_serializing_if)
        assert!(first.get("deleted_at").is_none());
        assert!(first.get("deleted_at_iso").is_none());
    }

    #[test]
    fn list_item_with_deleted_at_serializes_both_fields() {
        let item = ListItem {
            id: 99,
            memory_id: 99,
            description: "deleted".to_string(),
            deleted_at: Some(1_745_100_000),
            deleted_at_iso: Some("2025-04-20T03:46:40Z".to_string()),
            ..make_item("soft-deleted-memory")
        };
        let json = serde_json::to_value(&item).unwrap();
        assert_eq!(json["deleted_at"], 1_745_100_000_i64);
        assert_eq!(json["deleted_at_iso"], "2025-04-20T03:46:40Z");
    }

    #[test]
    fn list_response_items_empty_serializes_empty_array() {
        let resp = ListResponse {
            total_count: 0,
            truncated: false,
            elapsed_ms: 0,
            memories: Vec::new(),
            items: Vec::new(),
        };
        let json = serde_json::to_value(&resp).unwrap();
        assert!(json["items"].is_array());
        assert_eq!(json["items"].as_array().unwrap().len(), 0);
        assert_eq!(json["elapsed_ms"], 0);
    }

    #[test]
    fn list_item_memory_id_equals_id() {
        let item = ListItem {
            id: 42,
            memory_id: 42,
            namespace: "projeto".to_string(),
            type_field: "fact".to_string(),
            memory_type: "fact".to_string(),
            ..epoch_item("memory-alias")
        };
        let json = serde_json::to_value(&item).unwrap();
        assert_eq!(
            json["id"], json["memory_id"],
            "id e memory_id devem ser iguais"
        );
    }

    #[test]
    fn snippet_truncated_to_200_chars() {
        // Mirrors the `chars().take(200)` truncation on a 300-char ASCII body.
        let long_body = "a".repeat(300);
        let snippet = long_body.chars().take(200).collect::<String>();
        assert_eq!(snippet.len(), 200, "snippet deve ter exatamente 200 chars");
    }

    #[test]
    fn list_item_emits_both_type_and_memory_type() {
        let item = epoch_item("test");
        let json = serde_json::to_value(&item).unwrap();
        assert_eq!(json["type"], "note", "serde rename must produce 'type'");
        assert_eq!(
            json["memory_type"], "note",
            "memory_type must also be present"
        );
    }

    #[test]
    fn updated_at_iso_epoch_zero_yields_valid_utc() {
        // v1.0.68 (test fix): timezone-agnostic — parse the ISO and compare
        // the instant with the Unix epoch.
        let iso = crate::tz::epoch_to_iso(0);
        let parsed = match chrono::DateTime::parse_from_rfc3339(&iso) {
            Ok(dt) => dt,
            Err(e) => panic!("expected RFC3339, got `{iso}`: {e}"),
        };
        assert_eq!(
            parsed.timestamp(),
            chrono::DateTime::UNIX_EPOCH.timestamp(),
            "epoch 0 deve mapear para o instante Unix epoch, obtido: {iso}"
        );
        // NOTE(review): the date part of any RFC 3339 string already contains
        // '-', so this check cannot fail once parsing succeeded — confirm
        // whether it was meant to inspect only the offset suffix.
        let has_sign = iso.contains('+') || iso.contains('-');
        assert!(has_sign, "must contain offset sign, got: {iso}");
    }

    #[test]
    fn body_length_reflects_byte_count() {
        let body = "hello world";
        let item = ListItem {
            snippet: body.chars().take(200).collect(),
            body_length: body.len(),
            ..epoch_item("test")
        };
        let json = serde_json::to_value(&item).unwrap();
        assert_eq!(json["body_length"], body.len());
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/memory_entities.rs:
1| |//! Handler for the `memory-entities` CLI subcommand.
2| |
3| |use crate::errors::AppError;
4| |use crate::output;
5| |use crate::paths::AppPaths;
6| |use crate::storage::connection::open_ro;
7| |use rusqlite::params;
8| |use serde::Serialize;
9| |
// CLI arguments for the `memory-entities` subcommand.
//
// Plain `//` comments are used for the additions below on purpose: clap turns
// `///` doc comments into help text, which would alter the rendered `--help`.
#[derive(clap::Args)]
#[command(
    about = "List entities linked to a memory, or memories linked to an entity",
    after_long_help = "EXAMPLES:\n \
    # List entities connected to a memory\n \
    sqlite-graphrag memory-entities --name my-memory\n\n \
    # Reverse: list memories bound to an entity\n \
    sqlite-graphrag memory-entities --entity rust-lang\n\n \
    # With namespace\n \
    sqlite-graphrag memory-entities --name my-memory --namespace project"
)]
pub struct MemoryEntitiesArgs {
    // Memory name supplied positionally; cannot be combined with `--name`.
    #[arg(value_name = "NAME", conflicts_with = "name", help = "Memory name")]
    pub name_positional: Option<String>,
    // Memory name supplied via `--name`; cannot be combined with `--entity`.
    #[arg(long, conflicts_with_all = ["entity"])]
    pub name: Option<String>,
    /// Entity name — list memories bound to this entity (reverse lookup).
    #[arg(long, conflicts_with_all = ["name", "name_positional"])]
    pub entity: Option<String>,
    // Optional namespace override, passed to `resolve_namespace` by `run`.
    #[arg(
        long,
        help = "Namespace (env: SQLITE_GRAPHRAG_NAMESPACE, default: global)"
    )]
    pub namespace: Option<String>,
    // Hidden flag; `run` never reads it and always emits JSON.
    #[arg(long, hide = true)]
    pub json: bool,
    // Database path; falls back to the SQLITE_GRAPHRAG_DB_PATH env var.
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,
}
39| |
/// One entity linked to a memory, as emitted in the forward lookup response.
#[derive(Serialize)]
struct EntityBinding {
    /// Row id of the entity (`entities.id`).
    entity_id: i64,
    /// Entity name (`entities.name`).
    name: String,
    /// Entity type, read from the `entities.type` column.
    entity_type: String,
}
46| |
/// JSON payload for the forward lookup: entities bound to a single memory.
#[derive(Serialize)]
struct MemoryEntitiesResponse {
    /// Name of the memory that was looked up.
    memory_name: String,
    /// Entities bound to the memory, ordered by entity name.
    entities: Vec<EntityBinding>,
    /// Number of elements in `entities`.
    count: usize,
    /// Elapsed time in milliseconds, measured from the start of the handler.
    elapsed_ms: u64,
}
54| |
/// One memory linked to an entity, as emitted in the reverse lookup response.
#[derive(Serialize)]
struct MemoryBinding {
    /// Row id of the memory (`memories.id`).
    memory_id: i64,
    /// Memory name (`memories.name`).
    name: String,
    /// Memory description (`memories.description`).
    description: String,
    /// Memory type, read from the `memories.type` column.
    memory_type: String,
}
62| |
/// JSON payload for the reverse lookup: memories bound to a single entity.
#[derive(Serialize)]
struct EntityMemoriesResponse {
    /// Name of the entity that was looked up.
    entity_name: String,
    /// Non-deleted memories bound to the entity, ordered by memory name.
    memories: Vec<MemoryBinding>,
    /// Number of elements in `memories`.
    count: usize,
    /// Elapsed time in milliseconds, measured from the start of the handler.
    elapsed_ms: u64,
}
70| |
71| 0|pub fn run(args: MemoryEntitiesArgs) -> Result<(), AppError> {
72| 0| let start = std::time::Instant::now();
73| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
74| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
75| 0| crate::storage::connection::ensure_db_ready(&paths)?;
76| 0| let conn = open_ro(&paths.db)?;
77| |
78| 0| if let Some(entity_name) = args.entity {
79| 0| let entity_id = crate::storage::entities::find_entity_id(&conn, &namespace, &entity_name)?
80| 0| .ok_or_else(|| {
81| 0| AppError::NotFound(crate::i18n::errors_msg::entity_not_found(
82| 0| &entity_name,
83| 0| &namespace,
84| 0| ))
85| 0| })?;
86| |
87| 0| let mut stmt = conn.prepare_cached(
88| 0| "SELECT m.id, m.name, m.description, m.type
89| 0| FROM memory_entities me
90| 0| JOIN memories m ON m.id = me.memory_id
91| 0| WHERE me.entity_id = ?1 AND m.deleted_at IS NULL
92| 0| ORDER BY m.name",
93| 0| )?;
94| |
95| 0| let memories: Vec<MemoryBinding> = stmt
96| 0| .query_map(params![entity_id], |r| {
97| | Ok(MemoryBinding {
98| 0| memory_id: r.get(0)?,
99| 0| name: r.get(1)?,
100| 0| description: r.get(2)?,
101| 0| memory_type: r.get(3)?,
102| | })
103| 0| })?
104| 0| .collect::<Result<Vec<_>, _>>()?;
105| |
106| 0| let count = memories.len();
107| 0| output::emit_json(&EntityMemoriesResponse {
108| 0| entity_name,
109| 0| memories,
110| 0| count,
111| 0| elapsed_ms: start.elapsed().as_millis() as u64,
112| 0| })?;
113| 0| return Ok(());
114| 0| }
115| |
116| 0| let name = args.name_positional.or(args.name).ok_or_else(|| {
117| 0| AppError::Validation(
118| 0| "name required: pass as positional argument, via --name, or use --entity for reverse lookup".to_string(),
119| 0| )
120| 0| })?;
121| |
122| 0| let memory_id: i64 = conn
123| 0| .query_row(
124| 0| "SELECT id FROM memories WHERE namespace = ?1 AND name = ?2 AND deleted_at IS NULL",
125| 0| params![namespace, name],
126| 0| |r| r.get(0),
127| | )
128| 0| .map_err(|_| {
129| 0| AppError::NotFound(crate::i18n::errors_msg::memory_not_found(&name, &namespace))
130| 0| })?;
131| |
132| 0| let mut stmt = conn.prepare_cached(
133| 0| "SELECT e.id, e.name, e.type AS entity_type
134| 0| FROM memory_entities me
135| 0| JOIN entities e ON e.id = me.entity_id
136| 0| WHERE me.memory_id = ?1
137| 0| ORDER BY e.name",
138| 0| )?;
139| |
140| 0| let entities: Vec<EntityBinding> = stmt
141| 0| .query_map(params![memory_id], |r| {
142| | Ok(EntityBinding {
143| 0| entity_id: r.get(0)?,
144| 0| name: r.get(1)?,
145| 0| entity_type: r.get(2)?,
146| | })
147| 0| })?
148| 0| .collect::<Result<Vec<_>, _>>()?;
149| |
150| 0| let count = entities.len();
151| |
152| 0| output::emit_json(&MemoryEntitiesResponse {
153| 0| memory_name: name,
154| 0| entities,
155| 0| count,
156| 0| elapsed_ms: start.elapsed().as_millis() as u64,
157| 0| })?;
158| |
159| 0| Ok(())
160| 0|}
161| |
#[cfg(test)]
mod tests {
    use super::*;

    /// Wraps the given memories in a reverse-lookup response whose `count`
    /// matches the list length.
    fn reverse_response(entity: &str, memories: Vec<MemoryBinding>, elapsed_ms: u64) -> EntityMemoriesResponse {
        EntityMemoriesResponse {
            entity_name: entity.to_string(),
            count: memories.len(),
            memories,
            elapsed_ms,
        }
    }

    #[test]
    fn response_serializes_correctly() {
        let binding = EntityBinding {
            entity_id: 1,
            name: "rust".to_string(),
            entity_type: "concept".to_string(),
        };
        let resp = MemoryEntitiesResponse {
            memory_name: "test-mem".to_string(),
            entities: vec![binding],
            count: 1,
            elapsed_ms: 5,
        };

        let json = serde_json::to_value(&resp).unwrap();
        assert_eq!(json["memory_name"], "test-mem");
        assert_eq!(json["count"], 1);
        assert_eq!(json["entities"][0]["name"], "rust");
    }

    #[test]
    fn entity_memories_response_serializes_correctly() {
        let binding = MemoryBinding {
            memory_id: 42,
            name: "design-auth".to_string(),
            description: "JWT auth design".to_string(),
            memory_type: "decision".to_string(),
        };
        let resp = reverse_response("rust-lang", vec![binding], 3);

        let json = serde_json::to_value(&resp).unwrap();
        assert_eq!(json["entity_name"], "rust-lang");
        assert_eq!(json["count"], 1);

        let first = &json["memories"][0];
        assert_eq!(first["name"], "design-auth");
        assert_eq!(first["memory_type"], "decision");
        assert_eq!(first["memory_id"], 42);
    }

    #[test]
    fn entity_memories_response_empty_list() {
        let resp = reverse_response("orphan-entity", Vec::new(), 1);

        let json = serde_json::to_value(&resp).unwrap();
        assert_eq!(json["entity_name"], "orphan-entity");
        assert_eq!(json["count"], 0);
        assert!(json["memories"].as_array().unwrap().is_empty());
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/merge_entities.rs:
1| |//! Handler for the `merge-entities` CLI subcommand (GAP-19).
2| |//!
3| |//! Merges two or more source entities into a single target entity by:
4| |//! 1. Retargeting all relationships pointing at any source to the target.
5| |//! 2. Deduplicating relationships that become identical after the merge
6| |//! (same source_id + target_id + relation).
7| |//! 3. Retargeting memory_entities bindings.
8| |//! 4. Deleting the now-empty source entity rows.
9| |
10| |use crate::errors::AppError;
11| |use crate::i18n::errors_msg;
12| |use crate::output::{self, OutputFormat};
13| |use crate::paths::AppPaths;
14| |use crate::storage::connection::open_rw;
15| |use crate::storage::entities;
16| |use rusqlite::params;
17| |use serde::Serialize;
18| |
// CLI arguments for the `merge-entities` subcommand.
//
// Plain `//` comments are used for the additions below on purpose: clap turns
// `///` doc comments into help text, which would alter the rendered `--help`.
#[derive(clap::Args)]
#[command(after_long_help = "EXAMPLES:\n \
    # Merge two source entities into a target\n \
    sqlite-graphrag merge-entities --names auth,authentication --into auth-service\n\n \
    # Merge three sources into one target across a namespace\n \
    sqlite-graphrag merge-entities --names svc-a,svc-b,old-svc --into canonical-service --namespace my-project\n\n\
NOTE:\n \
    --names is a comma-separated list of source entity names.\n \
    --into is the target entity name and must already exist.\n \
    Source entities are deleted after the merge; the target is preserved.\n \
    Duplicate relationships (same endpoints + relation) are removed automatically.\n \
    Run `sqlite-graphrag cleanup-orphans` afterwards if sources had no other links.")]
pub struct MergeEntitiesArgs {
    /// Comma-separated list of source entity names to merge into the target.
    #[arg(long, value_delimiter = ',', value_name = "NAMES")]
    pub names: Vec<String>,
    /// Target entity name. Must already exist. All source relationships are redirected here.
    #[arg(long, value_name = "TARGET")]
    pub into: String,
    // Optional namespace override, passed to `resolve_namespace` by `run`.
    #[arg(long)]
    pub namespace: Option<String>,
    // Output format; `run` emits JSON for `Json` and a one-line summary for
    // `Text`/`Markdown`.
    #[arg(long, value_enum, default_value = "json")]
    pub format: OutputFormat,
    // Hidden flag that `run` never reads.
    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
    pub json: bool,
    // Database path; falls back to the SQLITE_GRAPHRAG_DB_PATH env var.
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,
}
47| |
/// Summary of a completed `merge-entities` run, serialised as the command output.
#[derive(Serialize)]
struct MergeEntitiesResponse {
    /// Action label; `run` always sets this to `"merged"`.
    action: String,
    /// Source entity names taken from `--names`.
    sources: Vec<String>,
    /// Name of the entity that absorbed the sources (`--into`).
    target: String,
    /// Namespace the merge was performed in.
    namespace: String,
    /// Number of relationship rows whose `source_id` or `target_id` was
    /// rewritten to the target.
    relationships_moved: usize,
    /// Number of source rows deleted from `entities`.
    entities_removed: usize,
    /// Total execution time in milliseconds from handler start to serialisation.
    elapsed_ms: u64,
}
59| |
60| 0|pub fn run(args: MergeEntitiesArgs) -> Result<(), AppError> {
61| 0| let inicio = std::time::Instant::now();
62| |
63| 0| if args.names.is_empty() {
64| 0| return Err(AppError::Validation(
65| 0| "--names must contain at least one source entity name".to_string(),
66| 0| ));
67| 0| }
68| |
69| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
70| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
71| |
72| 0| crate::storage::connection::ensure_db_ready(&paths)?;
73| |
74| 0| let mut conn = open_rw(&paths.db)?;
75| |
76| | // Resolve target entity ID.
77| 0| let target_id = entities::find_entity_id(&conn, &namespace, &args.into)?
78| 0| .ok_or_else(|| AppError::NotFound(errors_msg::entity_not_found(&args.into, &namespace)))?;
79| |
80| | // Resolve source entity IDs — reject self-referential merge (G21).
81| 0| let mut source_ids: Vec<i64> = Vec::with_capacity(args.names.len());
82| 0| for name in &args.names {
83| 0| if name == &args.into {
84| 0| return Err(AppError::Validation(format!(
85| 0| "source entity '{}' equals target '{}' — self-referential merge is not allowed",
86| 0| name, args.into
87| 0| )));
88| 0| }
89| 0| let id = entities::find_entity_id(&conn, &namespace, name)?
90| 0| .ok_or_else(|| AppError::NotFound(errors_msg::entity_not_found(name, &namespace)))?;
91| 0| if !source_ids.contains(&id) {
92| 0| source_ids.push(id);
93| 0| }
94| | }
95| |
96| 0| if source_ids.is_empty() {
97| 0| return Err(AppError::Validation(
98| 0| "no valid source entities to merge (all names equal the target or were duplicates)"
99| 0| .to_string(),
100| 0| ));
101| 0| }
102| |
103| 0| let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
104| |
105| 0| let mut relationships_moved: usize = 0;
106| |
107| 0| for &src_id in &source_ids {
108| | // Step 1a: redirect source_id, ignoring UNIQUE conflicts.
109| 0| let moved_src = tx.execute(
110| 0| "UPDATE OR IGNORE relationships SET source_id = ?1 WHERE source_id = ?2",
111| 0| params![target_id, src_id],
112| 0| )?;
113| 0| tx.execute(
114| 0| "DELETE FROM relationships WHERE source_id = ?1",
115| 0| params![src_id],
116| 0| )?;
117| | // Step 1b: redirect target_id, ignoring UNIQUE conflicts.
118| 0| let moved_tgt = tx.execute(
119| 0| "UPDATE OR IGNORE relationships SET target_id = ?1 WHERE target_id = ?2",
120| 0| params![target_id, src_id],
121| 0| )?;
122| 0| tx.execute(
123| 0| "DELETE FROM relationships WHERE target_id = ?1",
124| 0| params![src_id],
125| 0| )?;
126| 0| relationships_moved += moved_src + moved_tgt;
127| | }
128| |
129| | // Step 2: remove self-loops introduced by the redirect (target → target).
130| 0| tx.execute("DELETE FROM relationships WHERE source_id = target_id", [])?;
131| |
132| | // Step 3: deduplicate relationships that now share (source, target, relation).
133| | // Safety net — UPDATE OR IGNORE should have handled most duplicates above.
134| 0| tx.execute(
135| 0| "DELETE FROM relationships
136| 0| WHERE id NOT IN (
137| 0| SELECT MIN(id)
138| 0| FROM relationships
139| 0| GROUP BY source_id, target_id, relation
140| 0| )",
141| 0| [],
142| 0| )?;
143| |
144| | // Step 4: retarget memory_entities bindings.
145| | // Use UPDATE OR IGNORE to skip conflicts when memory is already bound to
146| | // target entity. Then DELETE remaining source rows (the conflicting ones
147| | // that UPDATE OR IGNORE skipped). Same pattern as relationships (Step 1).
148| 0| for &src_id in &source_ids {
149| 0| tx.execute(
150| 0| "UPDATE OR IGNORE memory_entities SET entity_id = ?1 WHERE entity_id = ?2",
151| 0| params![target_id, src_id],
152| 0| )?;
153| 0| tx.execute(
154| 0| "DELETE FROM memory_entities WHERE entity_id = ?1",
155| 0| params![src_id],
156| 0| )?;
157| | }
158| |
159| | // Step 5: deduplicate memory_entities bindings (same memory + entity).
160| 0| tx.execute(
161| 0| "DELETE FROM memory_entities
162| 0| WHERE rowid NOT IN (
163| 0| SELECT MIN(rowid)
164| 0| FROM memory_entities
165| 0| GROUP BY memory_id, entity_id
166| 0| )",
167| 0| [],
168| 0| )?;
169| |
170| | // Step 6: delete source entities (vec_entities first — no FK CASCADE on vec0).
171| 0| let mut entities_removed: usize = 0;
172| 0| for &src_id in &source_ids {
173| 0| let _ = tx.execute(
174| 0| "DELETE FROM vec_entities WHERE entity_id = ?1",
175| 0| params![src_id],
176| 0| );
177| 0| let removed = tx.execute("DELETE FROM entities WHERE id = ?1", params![src_id])?;
178| 0| entities_removed += removed;
179| | }
180| |
181| | // Step 7: recalculate degree for target and all adjacent entities.
182| 0| let adjacent_ids: Vec<i64> = {
183| 0| let mut stmt = tx.prepare(
184| 0| "SELECT DISTINCT CASE WHEN source_id = ?1 THEN target_id ELSE source_id END
185| 0| FROM relationships WHERE source_id = ?1 OR target_id = ?1",
186| 0| )?;
187| 0| let ids: Vec<i64> = stmt
188| 0| .query_map(params![target_id], |r| r.get(0))?
189| 0| .collect::<Result<Vec<_>, _>>()?;
190| 0| ids
191| | };
192| 0| entities::recalculate_degree(&tx, target_id)?;
193| 0| for &adj_id in &adjacent_ids {
194| 0| entities::recalculate_degree(&tx, adj_id)?;
195| | }
196| |
197| 0| tx.commit()?;
198| |
199| 0| conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
200| |
201| | // Build the list of sources that were actually processed (excluding target duplicates).
202| 0| let processed_sources: Vec<String> = args
203| 0| .names
204| 0| .iter()
205| 0| .filter(|n| n.as_str() != args.into.as_str())
206| 0| .cloned()
207| 0| .collect();
208| |
209| 0| let response = MergeEntitiesResponse {
210| 0| action: "merged".to_string(),
211| 0| sources: processed_sources,
212| 0| target: args.into.clone(),
213| 0| namespace: namespace.clone(),
214| 0| relationships_moved,
215| 0| entities_removed,
216| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
217| 0| };
218| |
219| 0| match args.format {
220| 0| OutputFormat::Json => output::emit_json(&response)?,
221| 0| OutputFormat::Text | OutputFormat::Markdown => {
222| 0| output::emit_text(&format!(
223| 0| "merged: {} sources into '{}' (relationships_moved={}, entities_removed={}) [{}]",
224| 0| response.sources.len(),
225| 0| response.target,
226| 0| response.relationships_moved,
227| 0| response.entities_removed,
228| 0| response.namespace
229| 0| ));
230| 0| }
231| | }
232| |
233| 0| Ok(())
234| 0|}
235| |
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `"merged"` response from plain literals to keep the tests terse.
    fn merged(
        sources: &[&str],
        target: &str,
        namespace: &str,
        relationships_moved: usize,
        entities_removed: usize,
        elapsed_ms: u64,
    ) -> MergeEntitiesResponse {
        MergeEntitiesResponse {
            action: "merged".to_string(),
            sources: sources.iter().map(|s| s.to_string()).collect(),
            target: target.to_string(),
            namespace: namespace.to_string(),
            relationships_moved,
            entities_removed,
            elapsed_ms,
        }
    }

    #[test]
    fn merge_entities_response_serializes_all_fields() {
        let resp = merged(&["auth", "authentication"], "auth-service", "global", 7, 2, 15);
        let json = serde_json::to_value(&resp).expect("serialization failed");

        assert_eq!(json["action"], "merged");
        assert_eq!(json["target"], "auth-service");
        assert_eq!(json["namespace"], "global");
        assert_eq!(json["relationships_moved"], 7);
        assert_eq!(json["entities_removed"], 2);
        let sources = json["sources"].as_array().expect("must be array");
        assert_eq!(sources.len(), 2);
        assert!(json["elapsed_ms"].is_number());
    }

    #[test]
    fn merge_entities_response_action_is_merged() {
        let resp = merged(&["src"], "tgt", "ns", 0, 1, 0);
        assert_eq!(resp.action, "merged");
    }

    #[test]
    fn merge_entities_response_empty_sources_serializes() {
        let resp = merged(&[], "target", "global", 0, 0, 1);
        let json = serde_json::to_value(&resp).expect("serialization failed");
        let sources = json["sources"].as_array().expect("must be array");
        assert_eq!(sources.len(), 0);
    }

    #[test]
    fn merge_entities_response_with_zero_relationships_moved() {
        let resp = merged(&["src-a"], "tgt", "global", 0, 1, 5);
        let json = serde_json::to_value(&resp).expect("serialization failed");
        assert_eq!(json["relationships_moved"], 0);
        assert_eq!(json["entities_removed"], 1);
    }

    #[test]
    fn merge_entities_response_multiple_sources() {
        let resp = merged(&["a", "b", "c"], "canonical", "proj", 12, 3, 42);
        let json = serde_json::to_value(&resp).expect("serialization failed");
        assert_eq!(json["entities_removed"], 3);
        let sources = json["sources"].as_array().unwrap();
        assert_eq!(sources.len(), 3);
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/migrate.rs:
1| |//! Handler for the `migrate` CLI subcommand.
2| |
3| |use crate::errors::AppError;
4| |use crate::output;
5| |use crate::paths::AppPaths;
6| |use crate::storage::connection::open_rw;
7| |use rusqlite::OptionalExtension;
8| |use serde::Serialize;
9| |
// CLI arguments for the `migrate` subcommand.
//
// Plain `//` comments are used for the additions below on purpose: clap turns
// `///` doc comments into help text, which would alter the rendered `--help`.
#[derive(clap::Args)]
#[command(after_long_help = "EXAMPLES:\n \
    # Apply pending schema migrations\n \
    sqlite-graphrag migrate\n\n \
    # Show already-applied migrations without applying new ones\n \
    sqlite-graphrag migrate --status\n\n \
    # Migrate a database at a custom path\n \
    sqlite-graphrag migrate --db /path/to/graphrag.sqlite")]
pub struct MigrateArgs {
    // Database path; falls back to the SQLITE_GRAPHRAG_DB_PATH env var.
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,
    /// Explicit JSON flag. Accepted as a no-op because output is already JSON by default.
    #[arg(long, default_value_t = false)]
    pub json: bool,
    /// Show already applied migrations without applying new ones.
    #[arg(long, default_value_t = false)]
    pub status: bool,
}
28| |
/// JSON payload emitted after migrations have been applied.
#[derive(Serialize)]
struct MigrateResponse {
    /// Display form of the resolved database path.
    db_path: String,
    /// Latest applied migration number from `refinery_schema_history`.
    /// Emitted as JSON number for cross-command consistency with `health`/`stats`/`init` (since v1.0.35).
    schema_version: u32,
    /// Outcome label; `run` sets this to `"ok"` on success.
    status: String,
    /// Total execution time in milliseconds from handler start to serialisation.
    elapsed_ms: u64,
}
39| |
/// JSON payload emitted by `migrate --status` (no migrations are applied).
#[derive(Serialize)]
struct MigrateStatusResponse {
    /// Display form of the resolved database path.
    db_path: String,
    /// Rows of `refinery_schema_history` in ascending version order; empty
    /// when the history table does not exist.
    applied_migrations: Vec<MigrationEntry>,
    /// Latest applied migration number. JSON number since v1.0.35.
    schema_version: u32,
    /// Elapsed time in milliseconds, measured from the start of the handler.
    elapsed_ms: u64,
}
48| |
/// One row of the `refinery_schema_history` table.
#[derive(Serialize)]
struct MigrationEntry {
    /// Migration version number (`version` column).
    version: i64,
    /// Migration name (`name` column).
    name: String,
    /// Value of the `applied_on` column; `None` when the column is NULL.
    applied_on: Option<String>,
}
55| |
56| 0|pub fn run(args: MigrateArgs) -> Result<(), AppError> {
57| 0| let start = std::time::Instant::now();
58| 0| let _ = args.json; // --json is a no-op because output is already JSON by default
59| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
60| 0| paths.ensure_dirs()?;
61| |
62| 0| let mut conn = open_rw(&paths.db)?;
63| |
64| 0| if args.status {
65| 0| let schema_version = latest_schema_version(&conn).unwrap_or(0);
66| 0| let applied = list_applied_migrations(&conn)?;
67| 0| output::emit_json(&MigrateStatusResponse {
68| 0| db_path: paths.db.display().to_string(),
69| 0| applied_migrations: applied,
70| 0| schema_version,
71| 0| elapsed_ms: start.elapsed().as_millis() as u64,
72| 0| })?;
73| 0| return Ok(());
74| 0| }
75| |
76| 0| crate::migrations::runner()
77| 0| .run(&mut conn)
78| 0| .map_err(|e| AppError::Internal(anyhow::anyhow!("migration failed: {e}")))?;
79| |
80| 0| conn.execute_batch(&format!(
81| 0| "PRAGMA user_version = {};",
82| 0| crate::constants::SCHEMA_USER_VERSION
83| 0| ))?;
84| |
85| 0| let schema_version = latest_schema_version(&conn)?;
86| 0| conn.execute(
87| 0| "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('schema_version', ?1)",
88| 0| rusqlite::params![schema_version],
89| 0| )?;
90| |
91| 0| output::emit_json(&MigrateResponse {
92| 0| db_path: paths.db.display().to_string(),
93| 0| schema_version,
94| 0| status: "ok".to_string(),
95| 0| elapsed_ms: start.elapsed().as_millis() as u64,
96| 0| })?;
97| |
98| 0| Ok(())
99| 0|}
100| |
101| 0|fn list_applied_migrations(conn: &rusqlite::Connection) -> Result<Vec<MigrationEntry>, AppError> {
102| 0| let table_exists: Option<String> = conn
103| 0| .query_row(
104| 0| "SELECT name FROM sqlite_master WHERE type='table' AND name='refinery_schema_history'",
105| 0| [],
106| 0| |r| r.get(0),
107| | )
108| 0| .optional()?;
109| 0| if table_exists.is_none() {
110| 0| return Ok(vec![]);
111| 0| }
112| 0| let mut stmt = conn.prepare_cached(
113| 0| "SELECT version, name, applied_on FROM refinery_schema_history ORDER BY version ASC",
114| 0| )?;
115| 0| let entries = stmt
116| 0| .query_map([], |r| {
117| | Ok(MigrationEntry {
118| 0| version: r.get(0)?,
119| 0| name: r.get(1)?,
120| 0| applied_on: r.get(2)?,
121| | })
122| 0| })?
123| 0| .collect::<Result<Vec<_>, _>>()?;
124| 0| Ok(entries)
125| 0|}
126| |
127| 3|fn latest_schema_version(conn: &rusqlite::Connection) -> Result<u32, AppError> {
128| 3| match conn.query_row(
129| 3| "SELECT version FROM refinery_schema_history ORDER BY version DESC LIMIT 1",
130| 3| [],
131| 1| |row| row.get::<_, i64>(0),
132| | ) {
133| 1| Ok(version) => Ok(version.max(0) as u32),
134| 1| Err(rusqlite::Error::QueryReturnedNoRows) => Ok(0),
135| 1| Err(err) => Err(AppError::Database(err)),
136| | }
137| 3|}
138| |
#[cfg(test)]
mod tests {
    use super::*;
    use rusqlite::Connection;

    /// Opens a pristine in-memory database with no tables at all.
    fn create_db_without_history() -> Connection {
        Connection::open_in_memory().expect("failed to open in-memory db")
    }

    /// Opens an in-memory database whose history table holds one row with `version`.
    fn create_db_with_history(version: i64) -> Connection {
        let db = Connection::open_in_memory().expect("failed to open in-memory db");
        db.execute_batch(
            "CREATE TABLE refinery_schema_history (
                version INTEGER NOT NULL,
                name TEXT,
                applied_on TEXT,
                checksum TEXT
            );",
        )
        .expect("failed to create history table");
        db.execute(
            "INSERT INTO refinery_schema_history (version, name) VALUES (?1, 'V001__init')",
            rusqlite::params![version],
        )
        .expect("failed to insert version");
        db
    }

    #[test]
    fn latest_schema_version_returns_error_without_table() {
        // Without refinery_schema_history table, SQLite returns Unknown (code 1) -> AppError::Database
        let db = create_db_without_history();
        let outcome = latest_schema_version(&db);
        assert!(outcome.is_err(), "must return Err when table does not exist");
    }

    #[test]
    fn latest_schema_version_returns_max_version() {
        let db = create_db_with_history(6);
        assert_eq!(latest_schema_version(&db).unwrap(), 6u32);
    }

    #[test]
    fn migrate_response_serializes_required_fields() {
        let payload = MigrateResponse {
            db_path: "/tmp/test.sqlite".to_string(),
            schema_version: 6,
            status: "ok".to_string(),
            elapsed_ms: 12,
        };
        let json = serde_json::to_value(&payload).unwrap();
        assert_eq!(json["status"], "ok");
        assert_eq!(json["schema_version"], 6);
        assert_eq!(json["db_path"], "/tmp/test.sqlite");
        assert_eq!(json["elapsed_ms"], 12);
    }

    #[test]
    fn latest_schema_version_returns_zero_when_table_empty() {
        let db = Connection::open_in_memory().expect("in-memory db");
        db.execute_batch(
            "CREATE TABLE refinery_schema_history (
                version INTEGER NOT NULL,
                name TEXT
            );",
        )
        .expect("table creation");
        // Table exists but is empty -> QueryReturnedNoRows -> 0
        assert_eq!(latest_schema_version(&db).unwrap(), 0u32);
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/namespace_detect.rs:
1| |//! Handler for the `namespace-detect` CLI subcommand.
2| |
3| |use crate::errors::AppError;
4| |use crate::namespace;
5| |use crate::output;
6| |use serde::Serialize;
7| |
// CLI arguments for the `namespace-detect` subcommand.
//
// Plain `//` comments are used for the additions below on purpose: clap turns
// `///` doc comments into help text, which would alter the rendered `--help`.
#[derive(clap::Args)]
#[command(after_long_help = "EXAMPLES:\n \
    # Resolve namespace using current environment and cwd\n \
    sqlite-graphrag namespace-detect\n\n \
    # Override with an explicit namespace flag\n \
    sqlite-graphrag namespace-detect --namespace my-project\n\n \
    # Resolve via SQLITE_GRAPHRAG_NAMESPACE env var\n \
    SQLITE_GRAPHRAG_NAMESPACE=ci-runner sqlite-graphrag namespace-detect")]
pub struct NamespaceDetectArgs {
    // Optional explicit namespace, forwarded to `detect_namespace` by `run`.
    #[arg(long)]
    pub namespace: Option<String>,
    /// Explicit database path. Accepted as a no-op to preserve the global contract.
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,
    /// Explicit JSON flag. Accepted as a no-op because output is already JSON by default.
    #[arg(long, default_value_t = false)]
    pub json: bool,
}
26| |
27| |#[derive(Serialize)]
28| |struct NamespaceDetectResponse {
29| | namespace: String,
30| | source: namespace::NamespaceSource,
31| | cwd: String,
32| | /// Total execution time in milliseconds from handler start to serialisation.
33| | elapsed_ms: u64,
34| |}
35| |
36| 0|pub fn run(args: NamespaceDetectArgs) -> Result<(), AppError> {
37| 0| let inicio = std::time::Instant::now();
38| 0| let _ = args.db;
39| 0| let _ = args.json; // --json is a no-op because output is already JSON by default
40| 0| let resolution = namespace::detect_namespace(args.namespace.as_deref())?;
41| 0| output::emit_json(&NamespaceDetectResponse {
42| 0| namespace: resolution.namespace,
43| 0| source: resolution.source,
44| 0| cwd: resolution.cwd,
45| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
46| 0| })?;
47| 0| Ok(())
48| 0|}
49| |
50| |#[cfg(test)]
51| |mod tests {
52| | use super::*;
53| | use crate::namespace::NamespaceSource;
54| | use clap::Parser;
55| | use serial_test::serial;
56| |
57| | #[test]
58| | #[serial]
59| 1| fn namespace_detect_default_returns_global_via_detect() {
60| | // Garante que sem flag e sem env, detect_namespace retorna "global"
61| 1| std::env::remove_var("SQLITE_GRAPHRAG_NAMESPACE");
62| 1| let resolution = namespace::detect_namespace(None).unwrap();
63| 1| assert_eq!(resolution.namespace, "global");
64| 1| assert_eq!(resolution.source, NamespaceSource::Default);
65| | }
66| |
67| | #[test]
68| | #[serial]
69| 1| fn namespace_detect_explicit_flag_overrides_env() {
70| 1| std::env::set_var("SQLITE_GRAPHRAG_NAMESPACE", "env-namespace");
71| 1| let resolution = namespace::detect_namespace(Some("flag-namespace")).unwrap();
72| 1| assert_eq!(resolution.namespace, "flag-namespace");
73| 1| assert_eq!(resolution.source, NamespaceSource::ExplicitFlag);
74| 1| std::env::remove_var("SQLITE_GRAPHRAG_NAMESPACE");
75| | }
76| |
77| | #[test]
78| | #[serial]
79| 1| fn namespace_detect_env_var_used_when_no_flag() {
80| 1| std::env::remove_var("SQLITE_GRAPHRAG_NAMESPACE");
81| 1| std::env::set_var("SQLITE_GRAPHRAG_NAMESPACE", "namespace-de-env");
82| 1| let resolution = namespace::detect_namespace(None).unwrap();
83| 1| assert_eq!(resolution.namespace, "namespace-de-env");
84| 1| assert_eq!(resolution.source, NamespaceSource::Environment);
85| 1| std::env::remove_var("SQLITE_GRAPHRAG_NAMESPACE");
86| | }
87| |
88| | #[test]
89| 1| fn namespace_detect_response_serializes_all_fields() {
90| 1| let resp = NamespaceDetectResponse {
91| 1| namespace: "meu-projeto".to_string(),
92| 1| source: NamespaceSource::ExplicitFlag,
93| 1| cwd: "/home/usuario/projeto".to_string(),
94| 1| elapsed_ms: 3,
95| 1| };
96| 1| let json = serde_json::to_value(&resp).unwrap();
97| 1| assert_eq!(json["namespace"], "meu-projeto");
98| 1| assert_eq!(json["source"], "explicit_flag");
99| 1| assert!(json["cwd"].is_string());
100| 1| assert_eq!(json["elapsed_ms"], 3);
101| 1| }
102| |
103| | #[test]
104| 1| fn namespace_source_serializes_in_snake_case() {
105| 1| let cases = vec![
106| 1| (NamespaceSource::ExplicitFlag, "explicit_flag"),
107| 1| (NamespaceSource::Environment, "environment"),
108| 1| (NamespaceSource::Default, "default"),
109| | ];
110| 4| for (source, expected) in cases {
^3 ^3
111| 3| let json = serde_json::to_value(source).unwrap();
112| 3| assert_eq!(
113| | json, expected,
114| 0| "NamespaceSource::{source:?} must serialize as \"{expected}\""
115| | );
116| | }
117| 1| }
118| |
119| | #[test]
120| 1| fn namespace_detect_accepts_db_as_noop() {
121| 1| let cli = crate::cli::Cli::try_parse_from([
122| 1| "sqlite-graphrag",
123| 1| "namespace-detect",
124| 1| "--db",
125| 1| "/tmp/graphrag.sqlite",
126| 1| ])
127| 1| .expect("namespace-detect must accept --db as a no-op");
128| |
129| 1| match cli.command {
130| 1| crate::cli::Commands::NamespaceDetect(args) => {
131| 1| assert_eq!(args.db.as_deref(), Some("/tmp/graphrag.sqlite"));
132| | }
133| 0| _ => unreachable!("unexpected command parsed"),
134| | }
135| 1| }
136| |}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/normalize_entities.rs:
1| |//! Handler for the `normalize-entities` CLI subcommand (GAP-15).
2| |//!
3| |//! Scans all existing entity names in the namespace and normalizes them to
4| |//! kebab-case ASCII using [`crate::parsers::normalize_entity_name`].
5| |//! When a normalized name already exists (collision), the source entity is
6| |//! merged into the target using the same logic as `merge-entities`:
7| |//! relationships are retargeted via `UPDATE OR IGNORE` + `DELETE`, then
8| |//! the source row is removed. Otherwise the entity name is updated in place.
9| |
10| |use crate::errors::AppError;
11| |use crate::output::{self, OutputFormat};
12| |use crate::parsers::normalize_entity_name;
13| |use crate::paths::AppPaths;
14| |use crate::storage::connection::open_rw;
15| |use rusqlite::params;
16| |use serde::Serialize;
17| |
18| |#[derive(clap::Args)]
19| |#[command(after_long_help = "EXAMPLES:\n \
20| | # Preview which entities would be renamed or merged\n \
21| | sqlite-graphrag normalize-entities --dry-run\n\n \
22| | # Apply normalization to all entity names\n \
23| | sqlite-graphrag normalize-entities --yes\n\n \
24| | # Scope to a specific namespace\n \
25| | sqlite-graphrag normalize-entities --yes --namespace my-project\n\n\
26| |NOTE:\n \
27| | When a normalized name already exists, the source entity is merged into\n \
28| | the existing target via relationship retargeting (UPDATE OR IGNORE + DELETE).\n \
29| | Run `cleanup-orphans` afterwards to remove any newly orphaned entities.")]
30| |pub struct NormalizeEntitiesArgs {
31| | /// Preview changes without persisting them.
32| | #[arg(long, conflicts_with = "yes")]
33| | pub dry_run: bool,
34| | /// Apply normalization without interactive confirmation.
35| | #[arg(long, conflicts_with = "dry_run")]
36| | pub yes: bool,
37| | #[arg(long)]
38| | pub namespace: Option<String>,
39| | #[arg(long, value_enum, default_value = "json")]
40| | pub format: OutputFormat,
41| | #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
42| | pub json: bool,
43| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
44| | pub db: Option<String>,
45| |}
46| |
47| |#[derive(Serialize)]
48| |struct NormalizeEntitiesResponse {
49| | /// "normalized" when changes were applied, "dry_run" when only previewed.
50| | action: String,
51| | /// Number of entities whose names were updated in place.
52| | normalized_count: usize,
53| | /// Number of entities that collided with an existing normalized name and
54| | /// were merged into the target.
55| | merged_count: usize,
56| | namespace: String,
57| | /// Total execution time in milliseconds from handler start to serialisation.
58| | elapsed_ms: u64,
59| |}
60| |
61| 0|pub fn run(args: NormalizeEntitiesArgs) -> Result<(), AppError> {
62| 0| let inicio = std::time::Instant::now();
63| |
64| 0| if !args.dry_run && !args.yes {
65| 0| return Err(AppError::Validation(
66| 0| "pass --dry-run to preview or --yes to apply changes".to_string(),
67| 0| ));
68| 0| }
69| |
70| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
71| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
72| |
73| 0| crate::storage::connection::ensure_db_ready(&paths)?;
74| |
75| 0| let mut conn = open_rw(&paths.db)?;
76| |
77| | // Collect all entity (id, name) pairs for the namespace.
78| 0| let entities: Vec<(i64, String)> = {
79| 0| let mut stmt =
80| 0| conn.prepare_cached("SELECT id, name FROM entities WHERE namespace = ?1 ORDER BY id")?;
81| 0| let rows = stmt.query_map(params![namespace], |r| {
82| 0| Ok((r.get::<_, i64>(0)?, r.get::<_, String>(1)?))
83| 0| })?;
84| 0| rows.collect::<Result<Vec<_>, _>>()?
85| | };
86| |
87| | // Compute which names need changing.
88| 0| let to_change: Vec<(i64, String, String)> = entities
89| 0| .iter()
90| 0| .filter_map(|(id, name)| {
91| 0| let normalized = normalize_entity_name(name);
92| 0| if normalized != *name {
93| 0| Some((*id, name.clone(), normalized))
94| | } else {
95| 0| None
96| | }
97| 0| })
98| 0| .collect();
99| |
100| | // G10: classify changes into renames (no collision) and merges (collision).
101| | // A collision occurs when two distinct names normalize to the same target,
102| | // or when the normalized target already exists in the DB as an already-normalized entity.
103| 0| let already_normalized: std::collections::HashSet<String> = entities
104| 0| .iter()
105| 0| .filter(|(_, name)| normalize_entity_name(name) == *name)
106| 0| .map(|(_, name)| name.clone())
107| 0| .collect();
108| |
109| 0| let mut target_groups: std::collections::HashMap<String, usize> =
110| 0| std::collections::HashMap::with_capacity(to_change.len());
111| 0| for (_, _, normalized) in &to_change {
112| 0| *target_groups.entry(normalized.clone()).or_insert(0) += 1;
113| 0| }
114| |
115| 0| let mut merge_count_preview: usize = 0;
116| 0| let mut rename_count_preview: usize = 0;
117| 0| for (target, count) in &target_groups {
118| 0| if *count > 1 || already_normalized.contains(target) {
119| | // All sources in this group will merge into the existing or first entity
120| 0| let extra = if already_normalized.contains(target) {
121| 0| *count // all merge into existing
122| | } else {
123| 0| count - 1 // first one renames, rest merge
124| | };
125| 0| merge_count_preview += extra;
126| 0| rename_count_preview += count - extra;
127| 0| } else {
128| 0| rename_count_preview += 1;
129| 0| }
130| | }
131| |
132| 0| if args.dry_run {
133| 0| let response = NormalizeEntitiesResponse {
134| 0| action: "dry_run".to_string(),
135| 0| normalized_count: rename_count_preview,
136| 0| merged_count: merge_count_preview,
137| 0| namespace,
138| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
139| 0| };
140| 0| match args.format {
141| 0| OutputFormat::Json => output::emit_json(&response)?,
142| 0| OutputFormat::Text | OutputFormat::Markdown => {
143| 0| output::emit_text(&format!(
144| 0| "dry_run: {} entity names would be normalized",
145| 0| response.normalized_count
146| 0| ));
147| 0| }
148| | }
149| 0| return Ok(());
150| 0| }
151| |
152| | // Apply changes inside a transaction.
153| 0| let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
154| |
155| 0| let mut normalized_count: usize = 0;
156| 0| let mut merged_count: usize = 0;
157| |
158| 0| for (src_id, _original_name, normalized) in &to_change {
159| | // Check whether a row with the normalized name already exists.
160| 0| let existing_id: Option<i64> = {
161| 0| let mut stmt =
162| 0| tx.prepare_cached("SELECT id FROM entities WHERE namespace = ?1 AND name = ?2")?;
163| 0| match stmt.query_row(params![namespace, normalized], |r| r.get::<_, i64>(0)) {
164| 0| Ok(id) => Some(id),
165| 0| Err(rusqlite::Error::QueryReturnedNoRows) => None,
166| 0| Err(e) => return Err(AppError::Database(e)),
167| | }
168| | };
169| |
170| 0| match existing_id {
171| 0| Some(target_id) if target_id != *src_id => {
172| | // Collision: merge source into target using UPDATE OR IGNORE + DELETE.
173| | // Step 1a: redirect source_id.
174| 0| tx.execute(
175| 0| "UPDATE OR IGNORE relationships SET source_id = ?1 WHERE source_id = ?2",
176| 0| params![target_id, src_id],
177| 0| )?;
178| 0| tx.execute(
179| 0| "DELETE FROM relationships WHERE source_id = ?1",
180| 0| params![src_id],
181| 0| )?;
182| | // Step 1b: redirect target_id.
183| 0| tx.execute(
184| 0| "UPDATE OR IGNORE relationships SET target_id = ?1 WHERE target_id = ?2",
185| 0| params![target_id, src_id],
186| 0| )?;
187| 0| tx.execute(
188| 0| "DELETE FROM relationships WHERE target_id = ?1",
189| 0| params![src_id],
190| 0| )?;
191| | // Remove self-loops.
192| 0| tx.execute("DELETE FROM relationships WHERE source_id = target_id", [])?;
193| | // Retarget memory_entities bindings.
194| 0| tx.execute(
195| 0| "UPDATE OR IGNORE memory_entities SET entity_id = ?1 WHERE entity_id = ?2",
196| 0| params![target_id, src_id],
197| 0| )?;
198| 0| tx.execute(
199| 0| "DELETE FROM memory_entities WHERE entity_id = ?1",
200| 0| params![src_id],
201| 0| )?;
202| | // Remove the source entity row.
203| 0| tx.execute("DELETE FROM entities WHERE id = ?1", params![src_id])?;
204| | // Recalculate degree for the surviving target.
205| 0| tx.execute(
206| 0| "UPDATE entities
207| 0| SET degree = (SELECT COUNT(*) FROM relationships
208| 0| WHERE source_id = entities.id OR target_id = entities.id)
209| 0| WHERE id = ?1",
210| 0| params![target_id],
211| 0| )?;
212| 0| tracing::info!(target: "normalize_entities",
213| | src_id = src_id,
214| | target_id = target_id,
215| | normalized = normalized,
216| 0| "entity merged into existing normalized target"
217| | );
218| 0| merged_count += 1;
219| | }
220| | _ => {
221| | // No collision: simple rename.
222| 0| tx.execute(
223| 0| "UPDATE entities SET name = ?1, updated_at = unixepoch() WHERE id = ?2",
224| 0| params![normalized, src_id],
225| 0| )?;
226| 0| tracing::info!(target: "normalize_entities",
227| | entity_id = src_id,
228| | normalized = normalized,
229| 0| "entity name normalized"
230| | );
231| 0| normalized_count += 1;
232| | }
233| | }
234| | }
235| |
236| 0| tx.commit()?;
237| 0| conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
238| |
239| 0| let response = NormalizeEntitiesResponse {
240| 0| action: "normalized".to_string(),
241| 0| normalized_count,
242| 0| merged_count,
243| 0| namespace,
244| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
245| 0| };
246| |
247| 0| match args.format {
248| 0| OutputFormat::Json => output::emit_json(&response)?,
249| 0| OutputFormat::Text | OutputFormat::Markdown => {
250| 0| output::emit_text(&format!(
251| 0| "normalized: {} renamed, {} merged",
252| 0| response.normalized_count, response.merged_count
253| 0| ));
254| 0| }
255| | }
256| |
257| 0| Ok(())
258| 0|}
259| |
260| |#[cfg(test)]
261| |mod tests {
262| | use super::*;
263| | use crate::storage::connection::register_vec_extension;
264| | use rusqlite::Connection;
265| | use tempfile::TempDir;
266| |
267| | type TestResult = Result<(), Box<dyn std::error::Error>>;
268| |
269| | /// Opens a temp DB with the full schema applied via migrations.
270| 3| fn setup_db() -> Result<(TempDir, Connection), Box<dyn std::error::Error>> {
271| 3| register_vec_extension();
272| 3| let tmp = TempDir::new()?;
^0
273| 3| let db_path = tmp.path().join("test.db");
274| 3| let mut conn = Connection::open(&db_path)?;
^0
275| 3| crate::migrations::runner().run(&mut conn)?;
^0
276| 3| Ok((tmp, conn))
277| 3| }
278| |
279| | /// Inserts an entity bypassing `upsert_entity` normalization, so tests can
280| | /// seed deliberately un-normalized names.
281| 5| fn insert_entity(conn: &Connection, name: &str) -> Result<i64, Box<dyn std::error::Error>> {
282| | // Bypass upsert_entity normalization to seed raw (un-normalized) names.
283| 5| conn.execute(
284| 5| "INSERT INTO entities (namespace, name, type, description) VALUES ('global', ?1, 'concept', NULL)",
285| 5| params![name],
286| 0| )?;
287| 5| let id: i64 = conn.query_row(
288| 5| "SELECT id FROM entities WHERE namespace = 'global' AND name = ?1",
289| 5| params![name],
290| 5| |r| r.get(0),
291| 0| )?;
292| 5| Ok(id)
293| 5| }
294| |
295| | #[test]
296| 1| fn dry_run_returns_count_without_changes() -> TestResult {
297| 1| let (_tmp, conn) = setup_db()?;
^0
298| 1| insert_entity(&conn, "Hello World")?;
^0
299| 1| insert_entity(&conn, "already-normalized")?;
^0
300| |
301| | // Verify "Hello World" exists.
302| 1| let count: i64 = conn.query_row(
303| 1| "SELECT COUNT(*) FROM entities WHERE name = 'Hello World' AND namespace = 'global'",
304| 1| [],
305| 1| |r| r.get(0),
306| 0| )?;
307| 1| assert_eq!(count, 1, "entity must exist before dry run");
^0
308| |
309| | // dry_run must not modify anything.
310| 1| let count_after: i64 = conn.query_row(
311| 1| "SELECT COUNT(*) FROM entities WHERE name = 'Hello World' AND namespace = 'global'",
312| 1| [],
313| 1| |r| r.get(0),
314| 0| )?;
315| 1| assert_eq!(count_after, 1, "dry run must not rename entities");
^0
316| 1| Ok(())
317| 1| }
318| |
319| | #[test]
320| 1| fn renames_unnormalized_entity_in_place() -> TestResult {
321| 1| let (_tmp, conn) = setup_db()?;
^0
322| 1| let src_id = insert_entity(&conn, "Hello World")?;
^0
323| |
324| | // Apply normalization directly via the internal logic.
325| | {
326| 1| let normalized = normalize_entity_name("Hello World");
327| 1| let existing: Option<i64> = {
328| 1| match conn.query_row(
329| 1| "SELECT id FROM entities WHERE namespace = 'global' AND name = ?1",
330| 1| params![normalized],
331| 0| |r| r.get::<_, i64>(0),
332| | ) {
333| 0| Ok(id) => Some(id),
334| 1| Err(rusqlite::Error::QueryReturnedNoRows) => None,
335| 0| Err(e) => return Err(e.into()),
336| | }
337| | };
338| 1| assert!(existing.is_none(), "no collision expected");
^0
339| 1| conn.execute(
340| 1| "UPDATE entities SET name = ?1 WHERE id = ?2",
341| 1| params![normalized, src_id],
342| 0| )?;
343| | }
344| |
345| 1| let name: String = conn.query_row(
346| 1| "SELECT name FROM entities WHERE id = ?1",
347| 1| params![src_id],
348| 1| |r| r.get(0),
349| 0| )?;
350| 1| assert_eq!(name, "hello-world");
351| 1| Ok(())
352| 1| }
353| |
354| | #[test]
355| 1| fn merges_into_existing_on_collision() -> TestResult {
356| 1| let (_tmp, conn) = setup_db()?;
^0
357| | // Target already exists with the normalized name.
358| 1| let target_id = insert_entity(&conn, "hello-world")?;
^0
359| | // Source has the un-normalized form that normalizes to the same value.
360| 1| let src_id = insert_entity(&conn, "Hello World")?;
^0
361| |
362| | // Insert a relationship attached to src_id.
363| 1| conn.execute(
364| 1| "INSERT INTO relationships (namespace, source_id, target_id, relation, weight)
365| 1| VALUES ('global', ?1, ?1, 'related', 0.5)",
366| 1| params![src_id],
367| 0| )?;
368| |
369| | // Merge: retarget relationships from src → target.
370| 1| conn.execute(
371| 1| "UPDATE OR IGNORE relationships SET source_id = ?1 WHERE source_id = ?2",
372| 1| params![target_id, src_id],
373| 0| )?;
374| 1| conn.execute(
375| 1| "DELETE FROM relationships WHERE source_id = ?1",
376| 1| params![src_id],
377| 0| )?;
378| 1| conn.execute("DELETE FROM entities WHERE id = ?1", params![src_id])?;
^0
379| |
380| | // Source must be gone.
381| 1| let src_exists: i64 = conn.query_row(
382| 1| "SELECT COUNT(*) FROM entities WHERE id = ?1",
383| 1| params![src_id],
384| 1| |r| r.get(0),
385| 0| )?;
386| 1| assert_eq!(src_exists, 0, "source entity must be deleted after merge");
^0
387| |
388| | // Target must still exist.
389| 1| let target_name: String = conn.query_row(
390| 1| "SELECT name FROM entities WHERE id = ?1",
391| 1| params![target_id],
392| 1| |r| r.get(0),
393| 0| )?;
394| 1| assert_eq!(target_name, "hello-world");
395| 1| Ok(())
396| 1| }
397| |
398| | #[test]
399| 1| fn normalize_entities_response_serializes_correctly() {
400| 1| let resp = NormalizeEntitiesResponse {
401| 1| action: "normalized".to_string(),
402| 1| normalized_count: 3,
403| 1| merged_count: 1,
404| 1| namespace: "global".to_string(),
405| 1| elapsed_ms: 42,
406| 1| };
407| 1| let json = serde_json::to_value(&resp).expect("serialization");
408| 1| assert_eq!(json["action"], "normalized");
409| 1| assert_eq!(json["normalized_count"], 3);
410| 1| assert_eq!(json["merged_count"], 1);
411| 1| assert_eq!(json["namespace"], "global");
412| 1| assert!(json["elapsed_ms"].as_u64().is_some());
413| 1| }
414| |
415| | #[test]
416| 1| fn dry_run_response_has_correct_action() {
417| 1| let resp = NormalizeEntitiesResponse {
418| 1| action: "dry_run".to_string(),
419| 1| normalized_count: 5,
420| 1| merged_count: 0,
421| 1| namespace: "test".to_string(),
422| 1| elapsed_ms: 1,
423| 1| };
424| 1| let json = serde_json::to_value(&resp).expect("serialization");
425| 1| assert_eq!(json["action"], "dry_run");
426| 1| }
427| |}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/optimize.rs:
1| |//! Handler for the `optimize` CLI subcommand.
2| |
3| |use crate::commands::fts::check_fts_functional;
4| |use crate::errors::AppError;
5| |use crate::output;
6| |use crate::paths::AppPaths;
7| |use crate::storage::connection::open_rw;
8| |use serde::Serialize;
9| |
10| |#[derive(clap::Args)]
11| |#[command(after_long_help = "EXAMPLES:\n \
12| | # Run PRAGMA optimize on the default database\n \
13| | sqlite-graphrag optimize\n\n \
14| | # Optimize a database at a custom path\n \
15| | sqlite-graphrag optimize --db /path/to/graphrag.sqlite\n\n \
16| | # Skip the FTS5 rebuild even if the index looks unhealthy\n \
17| | sqlite-graphrag optimize --skip-fts\n\n \
18| | # Dry-run: only report FTS5 health status, do not rebuild\n \
19| | sqlite-graphrag optimize --fts-dry-run\n\n \
20| | # Run optimize non-interactively (skip confirmation prompts)\n \
21| | sqlite-graphrag optimize --yes\n\n \
22| | # Force a full FTS5 rebuild even if the index already passes integrity-check\n \
23| | sqlite-graphrag optimize --no-fts-skip-when-functional\n\n \
24| | # Optimize via SQLITE_GRAPHRAG_DB_PATH env var\n \
25| | SQLITE_GRAPHRAG_DB_PATH=/data/graphrag.sqlite sqlite-graphrag optimize")]
26| |pub struct OptimizeArgs {
27| | #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
28| | pub json: bool,
29| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
30| | pub db: Option<String>,
31| | #[arg(long, default_value_t = false, help = "Skip FTS5 index rebuild")]
32| | pub skip_fts: bool,
33| | /// When true (default), the FTS5 rebuild step is skipped when
34| | /// `fts check` reports the index is already functional. Saves 5-15
35| | /// minutes on large databases. Set to false to always rebuild.
36| | #[arg(
37| | long,
38| | default_value_t = true,
39| | help = "Skip FTS5 rebuild when index is already functional (saves minutes on big DBs)"
40| | )]
41| | pub fts_skip_when_functional: bool,
42| | /// G36 Passo 2 (v1.0.69): run `fts check` + `fts stats` only, do not
43| | /// trigger any rebuild. Exit code is 0 when the index is healthy, 1
44| | /// when a rebuild would be recommended.
45| | #[arg(
46| | long,
47| | default_value_t = false,
48| | help = "G36: only run fts check + fts stats, do not rebuild (exit 1 if rebuild recommended)"
49| | )]
50| | pub fts_dry_run: bool,
51| | /// G36 Passo 3 (v1.0.69): emit a tracing::info! progress line every
52| | /// N seconds during the FTS5 rebuild. The FTS5 `rebuild` command is
53| | /// synchronous and does not call the SQLite progress handler, so the
54| | /// progress is sampled at the configured interval. Use 0 to disable.
55| | #[arg(
56| | long,
57| | default_value_t = 30,
58| | help = "G36: emit progress line every N seconds during FTS5 rebuild (0 to disable)"
59| | )]
60| | pub fts_progress: u64,
61| | /// G36 Passo 4 (v1.0.69): skip all confirmation prompts. Required
62| | /// for non-interactive CI/CD pipelines that cannot answer `y/N`.
63| | #[arg(
64| | long,
65| | default_value_t = false,
66| | help = "G36: skip confirmation prompts (required for non-interactive CI)"
67| | )]
68| | pub yes: bool,
69| |}
70| |
71| |#[derive(Serialize)]
72| |struct OptimizeResponse {
73| | db_path: String,
74| | status: String,
75| | /// True when the FTS5 index was rebuilt during this optimize run.
76| | fts_rebuilt: bool,
77| | /// True when the FTS5 rebuild was skipped because the index was already healthy.
78| | fts_skipped_functional: bool,
79| | /// True when FTS5 was detected as unhealthy AND the rebuild was attempted.
80| | fts_unhealthy: bool,
81| | /// Number of FTS5 rows indexed during the rebuild (G36 progress observability).
82| | fts_rows_indexed: Option<i64>,
83| | /// Total execution time in milliseconds from handler start to serialisation.
84| | elapsed_ms: u64,
85| |}
86| |
87| 1|pub fn run(args: OptimizeArgs) -> Result<(), AppError> {
88| 1| let inicio = std::time::Instant::now();
89| 1| let paths = AppPaths::resolve(args.db.as_deref())?;
^0
90| |
91| 1| crate::storage::connection::ensure_db_ready(&paths)?;
^0
92| |
93| 1| let conn = open_rw(&paths.db)?;
^0
94| 1| conn.execute_batch("PRAGMA optimize;")?;
^0
95| |
96| | // G36: pre-check FTS5 health before triggering a multi-minute rebuild.
97| 1| let fts_functional = if !args.skip_fts {
98| 1| check_fts_functional(&conn).unwrap_or(false)
99| | } else {
100| 0| false
101| | };
102| |
103| | // G36 Passo 2 (v1.0.69): dry-run path. Run fts check + fts stats, emit
104| | // JSON envelope, and return exit 1 when a rebuild would be recommended.
105| 1| if args.fts_dry_run {
106| 0| let recommend_rebuild = !fts_functional;
107| 0| output::emit_json(&OptimizeResponse {
108| 0| db_path: paths.db.display().to_string(),
109| 0| status: if recommend_rebuild {
110| 0| "rebuild_recommended".to_string()
111| | } else {
112| 0| "ok".to_string()
113| | },
114| | fts_rebuilt: false,
115| | fts_skipped_functional: false,
116| 0| fts_unhealthy: !fts_functional,
117| 0| fts_rows_indexed: None,
118| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
119| 0| })?;
120| 0| if recommend_rebuild {
121| 0| std::process::exit(1);
122| 0| }
123| 0| return Ok(());
124| 1| }
125| |
126| 1| let (fts_rebuilt, fts_skipped_functional, fts_unhealthy, fts_rows_indexed) = if args.skip_fts {
127| 0| (false, false, false, None)
128| 1| } else if args.fts_skip_when_functional && fts_functional {
129| 1| tracing::info!(target: "optimize",
130| 0| "FTS5 index already functional; skipping rebuild (use --no-fts-skip-when-functional to override)"
131| | );
132| 1| (false, true, false, None)
133| | } else {
134| 0| if !fts_functional {
135| 0| tracing::warn!(target: "optimize",
136| 0| "FTS5 index reported unhealthy; running full rebuild"
137| | );
138| 0| }
139| | // Capture row count BEFORE rebuild so we can report progress.
140| | // (FTS5 rebuild is synchronous; a true callback would require
141| | // `sqlite3_progress_handler` which the FTS5 'rebuild' command
142| | // does not respect. We sample the row count after.)
143| 0| let before: i64 = conn
144| 0| .query_row("SELECT COUNT(*) FROM fts_memories", [], |r| r.get(0))
145| 0| .unwrap_or(0);
146| | // G36 Passo 3 (v1.0.69): spawn a lightweight background thread that
147| | // emits a tracing::info! progress line every `args.fts_progress`
148| | // seconds while the rebuild is in flight. The FTS5 rebuild command
149| | // is synchronous and does not call the SQLite progress handler, so
150| | // the only observability we can add is a row-count poll from a
151| | // background thread. We open a SEPARATE read-only connection
152| | // because `rusqlite::Connection` is not `Sync` and the rebuild
153| | // holds the main connection exclusively. Default 30s; 0 disables.
154| 0| let progress_thread = if args.fts_progress > 0 {
155| 0| let interval = std::time::Duration::from_secs(args.fts_progress);
156| 0| let db_path = paths.db.clone();
157| 0| let child = std::thread::spawn(move || loop {
158| 0| std::thread::sleep(interval);
159| 0| let count: i64 = match crate::storage::connection::open_ro(&db_path) {
160| 0| Ok(c) => c
161| 0| .query_row("SELECT COUNT(*) FROM fts_memories", [], |r| r.get(0))
162| 0| .unwrap_or(-1),
163| 0| Err(_) => -1,
164| | };
165| 0| tracing::info!(target: "optimize", fts_rows = count, "FTS5 rebuild progress sample");
166| | });
167| 0| Some(child)
168| | } else {
169| 0| None
170| | };
171| 0| let rebuilt_ok = conn
172| 0| .execute_batch("INSERT INTO fts_memories(fts_memories) VALUES('rebuild');")
173| 0| .is_ok();
174| 0| if let Some(handle) = progress_thread {
175| 0| // The thread runs forever in a sleep loop; we leak it on
176| 0| // purpose because (a) it terminates when the process exits
177| 0| // and (b) we cannot safely join without a stop signal channel
178| 0| // which would add complexity not warranted for a 30s sampler.
179| 0| std::mem::forget(handle);
180| 0| }
181| 0| let after: i64 = if rebuilt_ok {
182| 0| conn.query_row("SELECT COUNT(*) FROM fts_memories", [], |r| r.get(0))
183| 0| .unwrap_or(0)
184| | } else {
185| 0| 0
186| | };
187| | // G36 progress: rows_indexed == after - before. Emitted as a
188| | // tracing::info! line so operators following logs see the
189| | // rebuild magnitude without needing NDJSON streaming.
190| 0| tracing::info!(target: "optimize", before, after, "FTS5 rebuild complete");
191| 0| (rebuilt_ok, false, !fts_functional, Some(after - before))
192| | };
193| |
194| | // G36 Passo 4 (v1.0.69): --yes flag is currently honored for forward
195| | // compatibility — every interactive prompt path in optimize must
196| | // check this flag and skip the prompt when set. As of v1.0.69 there
197| | // are no interactive prompts in optimize (the user is told up front
198| | // via the after_long_help), but the flag is reserved so future
199| | // confirmations can be added without breaking the CLI contract.
200| 1| let _ = args.yes;
201| |
202| 1| output::emit_json(&OptimizeResponse {
203| 1| db_path: paths.db.display().to_string(),
204| 1| status: "ok".to_string(),
205| 1| fts_rebuilt,
206| 1| fts_skipped_functional,
207| 1| fts_unhealthy,
208| 1| fts_rows_indexed,
209| 1| elapsed_ms: inicio.elapsed().as_millis() as u64,
210| 1| })?;
^0
211| |
212| 1| Ok(())
213| 1|}
214| |
215| |#[cfg(test)]
216| |mod tests {
217| | use super::*;
218| | use serial_test::serial;
219| | use tempfile::TempDir;
220| |
221| | #[test]
222| 1| fn optimize_response_serializes_required_fields() {
223| 1| let resp = OptimizeResponse {
224| 1| db_path: "/tmp/graphrag.sqlite".to_string(),
225| 1| status: "ok".to_string(),
226| 1| fts_rebuilt: false,
227| 1| fts_rows_indexed: None,
228| 1| fts_skipped_functional: false,
229| 1| fts_unhealthy: false,
230| 1| elapsed_ms: 5,
231| 1| };
232| 1| let json = serde_json::to_value(&resp).unwrap();
233| 1| assert_eq!(json["status"], "ok");
234| 1| assert_eq!(json["db_path"], "/tmp/graphrag.sqlite");
235| 1| assert_eq!(json["elapsed_ms"], 5);
236| 1| }
237| |
238| | #[test]
239| | #[serial]
240| 1| fn optimize_auto_inits_when_db_missing() {
241| 1| let dir = TempDir::new().unwrap();
242| 1| let db_path = dir.path().join("missing.sqlite");
243| | // SAFETY: `#[serial]` guarantees single-threaded execution.
244| 1| unsafe {
245| 1| std::env::set_var("SQLITE_GRAPHRAG_DB_PATH", db_path.to_str().unwrap());
246| 1| std::env::set_var("LOG_LEVEL", "error");
247| 1| }
248| |
249| 1| let args = OptimizeArgs {
250| 1| json: false,
251| 1| db: Some(db_path.to_string_lossy().into_owned()),
252| 1| skip_fts: false,
253| 1| fts_skip_when_functional: true,
254| 1| fts_dry_run: false,
255| 1| fts_progress: 30,
256| 1| yes: true,
257| 1| };
258| 1| let result = run(args);
259| 1| assert!(
260| 1| result.is_ok(),
261| 0| "auto-init must succeed and PRAGMA optimize must run on the fresh database, got {result:?}"
262| | );
263| 1| assert!(
264| 1| db_path.exists(),
265| 0| "auto-init must create the database file at {}",
266| 0| db_path.display()
267| | );
268| | // SAFETY: `#[serial]` guarantees single-threaded execution.
269| 1| unsafe {
270| 1| std::env::remove_var("SQLITE_GRAPHRAG_DB_PATH");
271| 1| std::env::remove_var("LOG_LEVEL");
272| 1| }
273| | }
274| |
275| | #[test]
276| 1| fn optimize_response_status_ok_fixo() {
277| 1| let resp = OptimizeResponse {
278| 1| db_path: "/qualquer/caminho".to_string(),
279| 1| status: "ok".to_string(),
280| 1| fts_rebuilt: false,
281| 1| fts_rows_indexed: None,
282| 1| fts_skipped_functional: false,
283| 1| fts_unhealthy: false,
284| 1| elapsed_ms: 0,
285| 1| };
286| 1| let json = serde_json::to_value(&resp).unwrap();
287| 1| assert_eq!(json["status"], "ok", "status deve ser sempre 'ok'");
^0
288| 1| }
289| |
290| | #[test]
291| 1| fn optimize_response_serializes_all_fields() {
292| 1| let resp = OptimizeResponse {
293| 1| db_path: "/data/x.sqlite".into(),
294| 1| status: "ok".into(),
295| 1| fts_rebuilt: true,
296| 1| fts_rows_indexed: Some(0),
297| 1| fts_skipped_functional: false,
298| 1| fts_unhealthy: true,
299| 1| elapsed_ms: 120,
300| 1| };
301| 1| let v = serde_json::to_value(&resp).unwrap();
302| 1| assert_eq!(v["db_path"], "/data/x.sqlite");
303| 1| assert_eq!(v["status"], "ok");
304| 1| assert_eq!(v["fts_rebuilt"], true);
305| 1| assert_eq!(v["fts_skipped_functional"], false);
306| 1| assert_eq!(v["fts_unhealthy"], true);
307| 1| assert_eq!(v["elapsed_ms"], 120u64);
308| 1| }
309| |
310| | #[test]
311| 1| fn optimize_response_includes_fts_flags() {
312| | // G36: operator must be able to distinguish (a) rebuilt, (b) skipped-healthy,
313| | // (c) skipped-by-flag from (d) attempted-but-failed. The response
314| | // exposes fts_rebuilt, fts_skipped_functional, fts_unhealthy booleans.
315| 1| let resp = OptimizeResponse {
316| 1| db_path: "/x".into(),
317| 1| status: "ok".into(),
318| 1| fts_rebuilt: true,
319| 1| fts_rows_indexed: Some(0),
320| 1| fts_skipped_functional: false,
321| 1| fts_unhealthy: true,
322| 1| elapsed_ms: 1,
323| 1| };
324| 1| let v = serde_json::to_value(&resp).unwrap();
325| 1| assert_eq!(v["fts_rebuilt"], true);
326| 1| assert_eq!(v["fts_skipped_functional"], false);
327| 1| assert_eq!(v["fts_unhealthy"], true);
328| 1| }
329| |}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/prune_ner.rs:
1| |//! Handler for the `prune-ner` CLI subcommand.
2| |//!
3| |//! Removes NER bindings (rows in `memory_entities`) for a single entity or for
4| |//! all entities in the namespace. Useful for cleaning up low-quality automatic
5| |//! extractions without touching the entities or memories themselves.
6| |
7| |use crate::errors::AppError;
8| |use crate::output::{self, OutputFormat};
9| |use crate::paths::AppPaths;
10| |use crate::storage::connection::open_rw;
11| |use serde::Serialize;
12| |
13| |#[derive(clap::Args)]
14| |#[command(after_long_help = "EXAMPLES:\n \
15| | # Preview bindings that would be removed for a single entity\n \
16| | sqlite-graphrag prune-ner --entity jwt-token --dry-run\n\n \
17| | # Remove all NER bindings for a single entity\n \
18| | sqlite-graphrag prune-ner --entity jwt-token --yes\n\n \
19| | # Remove ALL NER bindings in the current namespace\n \
20| | sqlite-graphrag prune-ner --all --yes\n\n \
21| |NOTE:\n \
22| | This command deletes rows from memory_entities (the link table between\n \
23| | memories and extracted entities). The entities and memories themselves\n \
24| | are not deleted. Use cleanup-orphans afterwards to remove entity nodes\n \
25| | that have no remaining links.")]
26| |pub struct PruneNerArgs {
27| | /// Entity name whose bindings should be removed.
28| | /// Mutually exclusive with --all.
29| | #[arg(long, conflicts_with = "all", value_name = "NAME")]
30| | pub entity: Option<String>,
31| |
32| | /// Remove all NER bindings in the namespace. Mutually exclusive with --entity.
33| | #[arg(long, conflicts_with = "entity", default_value_t = false)]
34| | pub all: bool,
35| |
36| | #[arg(long)]
37| | pub namespace: Option<String>,
38| |
39| | /// Preview count without deleting.
40| | #[arg(long, default_value_t = false)]
41| | pub dry_run: bool,
42| |
43| | /// Skip confirmation for destructive operation.
44| | #[arg(long, default_value_t = false)]
45| | pub yes: bool,
46| |
47| | #[arg(long, value_enum, default_value = "json")]
48| | pub format: OutputFormat,
49| |
50| | #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
51| | pub json: bool,
52| |
53| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
54| | pub db: Option<String>,
55| |}
56| |
57| |#[derive(Serialize)]
58| |struct PruneNerResponse {
59| | action: String,
60| | bindings_removed: usize,
61| | namespace: String,
62| | /// Entity name targeted, when `--entity` was used.
63| | #[serde(skip_serializing_if = "Option::is_none")]
64| | entity: Option<String>,
65| | /// Total execution time in milliseconds from handler start to serialisation.
66| | elapsed_ms: u64,
67| |}
68| |
69| 0|pub fn run(args: PruneNerArgs) -> Result<(), AppError> {
70| 0| let inicio = std::time::Instant::now();
71| |
72| 0| if args.entity.is_none() && !args.all {
73| 0| return Err(AppError::Validation(
74| 0| "either --entity <NAME> or --all must be specified".to_string(),
75| 0| ));
76| 0| }
77| |
78| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
79| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
80| |
81| 0| crate::storage::connection::ensure_db_ready(&paths)?;
82| |
83| 0| let mut conn = open_rw(&paths.db)?;
84| |
85| | // Count how many rows would be affected.
86| 0| let count: usize = if let Some(ref entity_name) = args.entity {
87| 0| conn.query_row(
88| 0| "SELECT COUNT(*) FROM memory_entities me
89| 0| JOIN entities e ON e.id = me.entity_id
90| 0| WHERE e.name = ?1 AND e.namespace = ?2",
91| 0| rusqlite::params![entity_name, namespace],
92| 0| |r| r.get::<_, i64>(0).map(|v| v as usize),
93| 0| )?
94| | } else {
95| 0| conn.query_row(
96| 0| "SELECT COUNT(*) FROM memory_entities me
97| 0| JOIN entities e ON e.id = me.entity_id
98| 0| WHERE e.namespace = ?1",
99| 0| rusqlite::params![namespace],
100| 0| |r| r.get::<_, i64>(0).map(|v| v as usize),
101| 0| )?
102| | };
103| |
104| 0| if args.dry_run {
105| 0| let response = PruneNerResponse {
106| 0| action: "dry_run".to_string(),
107| 0| bindings_removed: count,
108| 0| namespace: namespace.clone(),
109| 0| entity: args.entity.clone(),
110| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
111| 0| };
112| |
113| 0| match args.format {
114| 0| OutputFormat::Json => output::emit_json(&response)?,
115| 0| OutputFormat::Text | OutputFormat::Markdown => {
116| 0| output::emit_text(&format!(
117| 0| "dry_run: {count} NER bindings would be removed [{namespace}]"
118| 0| ));
119| 0| }
120| | }
121| |
122| 0| return Ok(());
123| 0| }
124| |
125| 0| if !args.yes {
126| 0| let response = PruneNerResponse {
127| 0| action: "aborted".to_string(),
128| 0| bindings_removed: count,
129| 0| namespace: namespace.clone(),
130| 0| entity: args.entity.clone(),
131| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
132| 0| };
133| |
134| 0| match args.format {
135| 0| OutputFormat::Json => output::emit_json(&response)?,
136| 0| OutputFormat::Text | OutputFormat::Markdown => {
137| 0| output::emit_text(&format!(
138| 0| "aborted: {count} NER bindings would be removed; pass --yes to confirm [{namespace}]"
139| 0| ));
140| 0| }
141| | }
142| |
143| 0| return Ok(());
144| 0| }
145| |
146| | // Destructive path: COUNT + DELETE in same transaction for consistency.
147| 0| let removed: usize = if let Some(ref entity_name) = args.entity {
148| | // Normalize to match the normalized stored entity names.
149| 0| let entity_name = crate::parsers::normalize_entity_name(entity_name);
150| 0| let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
151| 0| let n = tx.execute(
152| 0| "DELETE FROM memory_entities WHERE entity_id IN (
153| 0| SELECT id FROM entities WHERE name = ?1 AND namespace = ?2
154| 0| )",
155| 0| rusqlite::params![entity_name, namespace],
156| 0| )?;
157| 0| tx.commit()?;
158| 0| n
159| | } else {
160| 0| let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
161| 0| let n = tx.execute(
162| 0| "DELETE FROM memory_entities WHERE entity_id IN (
163| 0| SELECT id FROM entities WHERE namespace = ?1
164| 0| )",
165| 0| rusqlite::params![namespace],
166| 0| )?;
167| 0| tx.commit()?;
168| 0| n
169| | };
170| |
171| 0| conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
172| |
173| 0| tracing::info!(target: "prune_ner",
174| | removed = removed,
175| | namespace = %namespace,
176| | entity = ?args.entity,
177| 0| "NER bindings pruned"
178| | );
179| |
180| 0| let response = PruneNerResponse {
181| 0| action: "pruned".to_string(),
182| 0| bindings_removed: removed,
183| 0| namespace: namespace.clone(),
184| 0| entity: args.entity.clone(),
185| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
186| 0| };
187| |
188| 0| match args.format {
189| 0| OutputFormat::Json => output::emit_json(&response)?,
190| 0| OutputFormat::Text | OutputFormat::Markdown => {
191| 0| output::emit_text(&format!(
192| 0| "pruned: {removed} NER bindings removed [{namespace}]"
193| 0| ));
194| 0| }
195| | }
196| |
197| 0| Ok(())
198| 0|}
199| |
200| |#[cfg(test)]
201| |mod tests {
202| | use super::*;
203| |
204| | #[test]
205| 1| fn prune_ner_response_dry_run_serializes_correctly() {
206| 1| let resp = PruneNerResponse {
207| 1| action: "dry_run".to_string(),
208| 1| bindings_removed: 42,
209| 1| namespace: "global".to_string(),
210| 1| entity: Some("jwt-token".to_string()),
211| 1| elapsed_ms: 5,
212| 1| };
213| 1| let json = serde_json::to_value(&resp).expect("serialization failed");
214| 1| assert_eq!(json["action"], "dry_run");
215| 1| assert_eq!(json["bindings_removed"], 42);
216| 1| assert_eq!(json["entity"], "jwt-token");
217| 1| assert_eq!(json["namespace"], "global");
218| 1| }
219| |
220| | #[test]
221| 1| fn prune_ner_response_pruned_all_omits_entity() {
222| 1| let resp = PruneNerResponse {
223| 1| action: "pruned".to_string(),
224| 1| bindings_removed: 200,
225| 1| namespace: "project-x".to_string(),
226| 1| entity: None,
227| 1| elapsed_ms: 15,
228| 1| };
229| 1| let json = serde_json::to_value(&resp).expect("serialization failed");
230| 1| assert_eq!(json["action"], "pruned");
231| 1| assert_eq!(json["bindings_removed"], 200);
232| 1| assert!(
233| 1| json.get("entity").is_none(),
234| 0| "entity must be omitted when None"
235| | );
236| 1| }
237| |
238| | #[test]
239| 1| fn prune_ner_response_aborted_includes_count() {
240| 1| let resp = PruneNerResponse {
241| 1| action: "aborted".to_string(),
242| 1| bindings_removed: 10,
243| 1| namespace: "global".to_string(),
244| 1| entity: None,
245| 1| elapsed_ms: 1,
246| 1| };
247| 1| let json = serde_json::to_value(&resp).expect("serialization failed");
248| 1| assert_eq!(json["action"], "aborted");
249| 1| assert_eq!(json["bindings_removed"], 10);
250| 1| assert!(json["elapsed_ms"].is_number());
251| 1| }
252| |
253| | #[test]
254| 1| fn prune_ner_response_zero_bindings() {
255| 1| let resp = PruneNerResponse {
256| 1| action: "pruned".to_string(),
257| 1| bindings_removed: 0,
258| 1| namespace: "global".to_string(),
259| 1| entity: Some("nonexistent".to_string()),
260| 1| elapsed_ms: 2,
261| 1| };
262| 1| let json = serde_json::to_value(&resp).expect("serialization failed");
263| 1| assert_eq!(json["bindings_removed"], 0);
264| 1| }
265| |}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/prune_relations.rs:
1| |//! Handler for the `prune-relations` CLI subcommand.
2| |
3| |use crate::errors::AppError;
4| |use crate::i18n;
5| |use crate::output::{self, OutputFormat};
6| |use crate::paths::AppPaths;
7| |use crate::storage::connection::open_rw;
8| |use crate::storage::entities;
9| |use serde::Serialize;
10| |
11| |#[derive(clap::Args)]
12| |#[command(after_long_help = "EXAMPLES:\n \
13| | # Preview how many 'mentions' relations would be removed\n \
14| | sqlite-graphrag prune-relations --relation mentions --dry-run\n\n \
15| | # Remove all 'mentions' relations without confirmation prompt\n \
16| | sqlite-graphrag prune-relations --relation mentions --yes\n\n\
17| |NOTE:\n \
18| | This command permanently deletes relationships. Use --dry-run first.\n \
19| | Entity degree counts are automatically recalculated after pruning.")]
20| |pub struct PruneRelationsArgs {
21| | /// Relation type to delete (e.g. mentions, related, uses).
22| | /// Accepts canonical and custom kebab-case/snake_case values.
23| | #[arg(long, value_parser = crate::parsers::parse_relation, value_name = "RELATION")]
24| | pub relation: String,
25| | #[arg(long)]
26| | pub namespace: Option<String>,
27| | /// Preview count without deleting.
28| | #[arg(long)]
29| | pub dry_run: bool,
30| | /// Skip confirmation for destructive operation.
31| | #[arg(long)]
32| | pub yes: bool,
33| | /// Show affected entity names during --dry-run preview.
34| | #[arg(long, default_value_t = false)]
35| | pub show_entities: bool,
36| | #[arg(long, value_enum, default_value = "json")]
37| | pub format: OutputFormat,
38| | #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
39| | pub json: bool,
40| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
41| | pub db: Option<String>,
42| |}
43| |
44| |#[derive(Serialize)]
45| |struct PruneRelationsResponse {
46| | action: String,
47| | relation: String,
48| | count: usize,
49| | entities_affected: usize,
50| | namespace: String,
51| | /// Total execution time in milliseconds from handler start to serialisation.
52| | elapsed_ms: u64,
53| | #[serde(skip_serializing_if = "Option::is_none")]
54| | affected_entity_names: Option<Vec<String>>,
55| |}
56| |
57| 0|pub fn run(args: PruneRelationsArgs) -> Result<(), AppError> {
58| 0| let inicio = std::time::Instant::now();
59| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
60| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
61| |
62| 0| crate::storage::connection::ensure_db_ready(&paths)?;
63| |
64| 0| crate::parsers::warn_if_non_canonical(&args.relation);
65| |
66| 0| let mut conn = open_rw(&paths.db)?;
67| |
68| 0| if args.dry_run {
69| 0| let count = entities::count_relationships_by_relation(&conn, &namespace, &args.relation)?;
70| |
71| 0| let affected_names = if args.show_entities {
72| 0| Some(entities::list_entity_names_by_relation(
73| 0| &conn,
74| 0| &namespace,
75| 0| &args.relation,
76| 0| )?)
77| | } else {
78| 0| None
79| | };
80| |
81| 0| let entities_affected_count = affected_names.as_ref().map_or(0, |v| v.len());
82| |
83| 0| output::emit_progress(&i18n::prune_dry_run(count, &args.relation));
84| |
85| 0| let response = PruneRelationsResponse {
86| 0| action: "dry_run".to_string(),
87| 0| relation: args.relation.clone(),
88| 0| count,
89| 0| entities_affected: entities_affected_count,
90| 0| namespace: namespace.clone(),
91| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
92| 0| affected_entity_names: affected_names,
93| 0| };
94| |
95| 0| match args.format {
96| 0| OutputFormat::Json => output::emit_json(&response)?,
97| 0| OutputFormat::Text | OutputFormat::Markdown => {
98| 0| output::emit_text(&format!(
99| 0| "dry_run: {} '{}' relations would be removed [{}]",
100| 0| response.count, response.relation, response.namespace
101| 0| ));
102| 0| }
103| | }
104| |
105| 0| return Ok(());
106| 0| }
107| |
108| 0| if !args.yes {
109| 0| output::emit_progress(&i18n::prune_requires_yes());
110| |
111| 0| let count = entities::count_relationships_by_relation(&conn, &namespace, &args.relation)?;
112| |
113| 0| let response = PruneRelationsResponse {
114| 0| action: "aborted".to_string(),
115| 0| relation: args.relation.clone(),
116| 0| count,
117| 0| entities_affected: 0,
118| 0| namespace: namespace.clone(),
119| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
120| 0| affected_entity_names: None,
121| 0| };
122| |
123| 0| match args.format {
124| 0| OutputFormat::Json => output::emit_json(&response)?,
125| 0| OutputFormat::Text | OutputFormat::Markdown => {
126| 0| output::emit_text(&format!(
127| 0| "aborted: {} '{}' relations would be removed; pass --yes to confirm [{}]",
128| 0| response.count, response.relation, response.namespace
129| 0| ));
130| 0| }
131| | }
132| |
133| 0| return Ok(());
134| 0| }
135| |
136| | // Destructive path: delete relationships.
137| 0| let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
138| 0| let (count, entity_ids) =
139| 0| entities::delete_relationships_by_relation(&tx, &namespace, &args.relation)?;
140| 0| tx.commit()?;
141| |
142| | // Run ANALYZE to refresh query planner statistics after bulk deletion.
143| 0| conn.execute_batch("ANALYZE relationships; ANALYZE memory_relationships;")?;
144| 0| conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
145| |
146| 0| output::emit_progress(&i18n::relations_pruned(count, &args.relation, &namespace));
147| |
148| 0| let response = PruneRelationsResponse {
149| 0| action: "pruned".to_string(),
150| 0| relation: args.relation.clone(),
151| 0| count,
152| 0| entities_affected: entity_ids.len(),
153| 0| namespace: namespace.clone(),
154| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
155| 0| affected_entity_names: None,
156| 0| };
157| |
158| 0| match args.format {
159| 0| OutputFormat::Json => output::emit_json(&response)?,
160| 0| OutputFormat::Text | OutputFormat::Markdown => {
161| 0| output::emit_text(&format!(
162| 0| "pruned: {} '{}' relations removed, {} entities affected [{}]",
163| 0| response.count, response.relation, response.entities_affected, response.namespace
164| 0| ));
165| 0| }
166| | }
167| |
168| 0| Ok(())
169| 0|}
170| |
171| |#[cfg(test)]
172| |mod tests {
173| | use super::*;
174| |
175| | #[test]
176| 1| fn prune_response_serializes_all_fields() {
177| 1| let resp = PruneRelationsResponse {
178| 1| action: "pruned".to_string(),
179| 1| relation: "mentions".to_string(),
180| 1| count: 3451,
181| 1| entities_affected: 200,
182| 1| namespace: "global".to_string(),
183| 1| elapsed_ms: 42,
184| 1| affected_entity_names: None,
185| 1| };
186| 1| let json = serde_json::to_value(&resp).expect("serialization failed");
187| 1| assert_eq!(json["action"], "pruned");
188| 1| assert_eq!(json["relation"], "mentions");
189| 1| assert_eq!(json["count"], 3451);
190| 1| assert_eq!(json["entities_affected"], 200);
191| 1| assert_eq!(json["namespace"], "global");
192| 1| assert!(json["elapsed_ms"].is_number());
193| 1| }
194| |
195| | #[test]
196| 1| fn prune_response_action_dry_run() {
197| 1| let resp = PruneRelationsResponse {
198| 1| action: "dry_run".to_string(),
199| 1| relation: "mentions".to_string(),
200| 1| count: 100,
201| 1| entities_affected: 0,
202| 1| namespace: "test".to_string(),
203| 1| elapsed_ms: 5,
204| 1| affected_entity_names: None,
205| 1| };
206| 1| let json = serde_json::to_value(&resp).expect("serialization failed");
207| 1| assert_eq!(json["action"], "dry_run");
208| 1| assert_eq!(
209| 1| json["entities_affected"], 0,
210| 0| "dry_run must report zero entities_affected"
211| | );
212| 1| }
213| |
214| | #[test]
215| 1| fn prune_response_action_pruned() {
216| 1| let resp = PruneRelationsResponse {
217| 1| action: "pruned".to_string(),
218| 1| relation: "uses".to_string(),
219| 1| count: 50,
220| 1| entities_affected: 10,
221| 1| namespace: "my-project".to_string(),
222| 1| elapsed_ms: 120,
223| 1| affected_entity_names: None,
224| 1| };
225| 1| let json = serde_json::to_value(&resp).expect("serialization failed");
226| 1| assert_eq!(json["action"], "pruned");
227| 1| assert!(json["count"].as_u64().unwrap() > 0);
228| 1| assert!(json["entities_affected"].as_u64().unwrap() > 0);
229| 1| }
230| |
231| | #[test]
232| 1| fn prune_response_zero_count_when_nothing_to_prune() {
233| 1| let resp = PruneRelationsResponse {
234| 1| action: "pruned".to_string(),
235| 1| relation: "nonexistent".to_string(),
236| 1| count: 0,
237| 1| entities_affected: 0,
238| 1| namespace: "global".to_string(),
239| 1| elapsed_ms: 1,
240| 1| affected_entity_names: None,
241| 1| };
242| 1| let json = serde_json::to_value(&resp).expect("serialization failed");
243| 1| assert_eq!(json["count"], 0);
244| 1| assert_eq!(json["entities_affected"], 0);
245| 1| }
246| |
247| | #[test]
248| 1| fn prune_response_verbose_includes_entity_names() {
249| 1| let resp = PruneRelationsResponse {
250| 1| action: "dry_run".to_string(),
251| 1| relation: "mentions".to_string(),
252| 1| count: 10,
253| 1| entities_affected: 3,
254| 1| namespace: "global".to_string(),
255| 1| elapsed_ms: 5,
256| 1| affected_entity_names: Some(vec!["alpha".into(), "beta".into(), "gamma".into()]),
257| 1| };
258| 1| let json = serde_json::to_value(&resp).expect("serialization failed");
259| 1| let names = json["affected_entity_names"]
260| 1| .as_array()
261| 1| .expect("must be array");
262| 1| assert_eq!(names.len(), 3);
263| 1| }
264| |
265| | #[test]
266| 1| fn prune_response_no_verbose_omits_entity_names() {
267| 1| let resp = PruneRelationsResponse {
268| 1| action: "dry_run".to_string(),
269| 1| relation: "mentions".to_string(),
270| 1| count: 10,
271| 1| entities_affected: 0,
272| 1| namespace: "global".to_string(),
273| 1| elapsed_ms: 5,
274| 1| affected_entity_names: None,
275| 1| };
276| 1| let json = serde_json::to_value(&resp).expect("serialization failed");
277| 1| assert!(
278| 1| json.get("affected_entity_names").is_none(),
279| 0| "must be omitted when None"
280| | );
281| 1| }
282| |
283| | #[test]
284| 1| fn prune_response_action_values_are_exhaustive() {
285| 4| for action in &["pruned", "dry_run", "aborted"] {
^3
286| 3| let resp = PruneRelationsResponse {
287| 3| action: action.to_string(),
288| 3| relation: "mentions".to_string(),
289| 3| count: 0,
290| 3| entities_affected: 0,
291| 3| namespace: "global".to_string(),
292| 3| elapsed_ms: 0,
293| 3| affected_entity_names: None,
294| 3| };
295| 3| let json = serde_json::to_value(&resp).expect("serialization");
296| 3| assert_eq!(json["action"], *action);
297| | }
298| 1| }
299| |}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/purge.rs:
1| |//! Handler for the `purge` CLI subcommand.
2| |
3| |use crate::errors::AppError;
4| |use crate::i18n::errors_msg;
5| |use crate::output;
6| |use crate::paths::AppPaths;
7| |use crate::storage::connection::open_rw;
8| |use serde::Serialize;
9| |
10| |#[derive(clap::Args)]
11| |#[command(after_long_help = "EXAMPLES:\n \
12| | # Permanently delete soft-deleted memories older than 90 days (default retention)\n \
13| | sqlite-graphrag purge\n\n \
14| | # Custom retention window in days\n \
15| | sqlite-graphrag purge --retention-days 30\n\n \
16| | # Purge ALL soft-deleted memories regardless of age\n \
17| | sqlite-graphrag purge --retention-days 0\n\n \
18| | # Preview what would be purged without deleting\n \
19| | sqlite-graphrag purge --dry-run\n\n \
20| | # Purge a specific memory by name\n \
21| | sqlite-graphrag purge --name old-memory --namespace my-project\n\n\
22| |NOTES:\n \
23| | `--yes` only confirms intent and does NOT override `--retention-days`.\n \
24| | To wipe every soft-deleted memory immediately, pair `--yes` with `--retention-days 0`.")]
25| |pub struct PurgeArgs {
26| | #[arg(long)]
27| | pub name: Option<String>,
28| | /// Namespace to purge. Defaults to the contextual namespace (SQLITE_GRAPHRAG_NAMESPACE env var or "global").
29| | #[arg(long)]
30| | pub namespace: Option<String>,
31| | /// Retention days: memories with deleted_at older than (now - retention_days*86400) will be
32| | /// permanently removed. Default: PURGE_RETENTION_DAYS_DEFAULT (90). Use 0 to purge all
33| | /// soft-deleted memories regardless of age. Alias: `--max-age-days`.
34| | #[arg(
35| | long,
36| | alias = "days",
37| | alias = "max-age-days",
38| | value_name = "DAYS",
39| | default_value_t = crate::constants::PURGE_RETENTION_DAYS_DEFAULT
40| | )]
41| | pub retention_days: u32,
42| | /// [DEPRECATED in v2.0.0] Legacy alias — use --retention-days instead.
43| | #[arg(long, hide = true)]
44| | pub older_than_seconds: Option<u64>,
45| | /// Does not execute DELETE: computes and reports what WOULD be purged.
46| | #[arg(long, default_value_t = false)]
47| | pub dry_run: bool,
48| | /// Confirms destructive intent for tools that require explicit acknowledgement.
49| | /// Does NOT override `--retention-days`: combine with `--retention-days 0` to wipe
50| | /// every soft-deleted memory regardless of age.
51| | #[arg(long, default_value_t = false)]
52| | pub yes: bool,
53| | #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
54| | pub json: bool,
55| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
56| | pub db: Option<String>,
57| |}
58| |
59| |#[derive(Serialize)]
60| |pub struct PurgeResponse {
61| | pub action: String,
62| | pub purged_count: usize,
63| | pub bytes_freed: i64,
64| | pub oldest_deleted_at: Option<i64>,
65| | pub retention_days_used: u32,
66| | pub dry_run: bool,
67| | pub namespace: Option<String>,
68| | pub cutoff_epoch: i64,
69| | pub warnings: Vec<String>,
70| | /// Total execution time in milliseconds from handler start to serialisation.
71| | pub elapsed_ms: u64,
72| | /// Human-readable explanation surfaced when nothing was purged so callers
73| | /// understand the retention semantics. Present only when
74| | /// `purged_count == 0` (M2 in v1.0.32) — kept absent otherwise to preserve
75| | /// the existing JSON contract.
76| | #[serde(skip_serializing_if = "Option::is_none")]
77| | pub message: Option<String>,
78| |}
79| |
80| |/// Permanently delete soft-deleted memories that have exceeded the retention window.
81| |///
82| |/// Only memories with `deleted_at IS NOT NULL AND deleted_at <= cutoff_epoch` are affected.
83| |/// When `--dry-run` is set the DELETE is skipped and the response reflects candidates only.
84| 0|pub fn run(args: PurgeArgs) -> Result<(), AppError> {
85| 0| let inicio = std::time::Instant::now();
86| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
87| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
88| |
89| 0| crate::storage::connection::ensure_db_ready(&paths)?;
90| |
91| 0| let mut warnings: Vec<String> = Vec::with_capacity(1);
92| 0| let now = current_epoch()?;
93| |
94| 0| let cutoff_epoch = if let Some(secs) = args.older_than_seconds {
95| 0| warnings.push(
96| 0| "--older-than-seconds is deprecated; use --retention-days in v2.0.0+".to_string(),
97| | );
98| 0| now - secs as i64
99| | } else {
100| 0| now - (args.retention_days as i64) * 86_400
101| | };
102| |
103| 0| let namespace_opt: Option<&str> = Some(namespace.as_str());
104| |
105| 0| let mut conn = open_rw(&paths.db)?;
106| |
107| 0| let (bytes_freed, oldest_deleted_at, candidates_count) =
108| 0| compute_metrics(&conn, cutoff_epoch, namespace_opt, args.name.as_deref())?;
109| |
110| 0| if candidates_count == 0 && args.name.is_some() {
111| 0| return Err(AppError::NotFound(
112| 0| errors_msg::soft_deleted_memory_not_found(
113| 0| args.name.as_deref().unwrap_or_default(),
114| 0| &namespace,
115| 0| ),
116| 0| ));
117| 0| }
118| |
119| 0| if !args.dry_run {
120| 0| let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
121| 0| execute_purge(
122| 0| &tx,
123| 0| &namespace,
124| 0| args.name.as_deref(),
125| 0| cutoff_epoch,
126| 0| &mut warnings,
127| 0| )?;
128| 0| tx.commit()?;
129| 0| conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
130| 0| }
131| |
132| 0| let message = if candidates_count == 0 {
133| 0| Some(format!(
134| 0| "no soft-deleted memories older than {retention_days} day(s); use --retention-days 0 to purge all soft-deleted memories regardless of age",
135| 0| retention_days = args.retention_days
136| 0| ))
137| | } else {
138| 0| None
139| | };
140| |
141| 0| output::emit_json(&PurgeResponse {
142| 0| action: if args.dry_run {
143| 0| "dry_run".to_string()
144| | } else {
145| 0| "purged".to_string()
146| | },
147| 0| purged_count: candidates_count,
148| 0| bytes_freed,
149| 0| oldest_deleted_at,
150| 0| retention_days_used: args.retention_days,
151| 0| dry_run: args.dry_run,
152| 0| namespace: Some(namespace),
153| 0| cutoff_epoch,
154| 0| warnings,
155| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
156| 0| message,
157| 0| })?;
158| |
159| 0| Ok(())
160| 0|}
161| |
162| 4|fn current_epoch() -> Result<i64, AppError> {
163| 4| let now = std::time::SystemTime::now()
164| 4| .duration_since(std::time::UNIX_EPOCH)
165| 4| .map_err(|err| AppError::Internal(anyhow::anyhow!("system clock error: {err}")))?;
^0 ^0 ^0
166| 4| Ok(now.as_secs() as i64)
167| 4|}
168| |
169| 5|fn compute_metrics(
170| 5| conn: &rusqlite::Connection,
171| 5| cutoff_epoch: i64,
172| 5| namespace_opt: Option<&str>,
173| 5| name: Option<&str>,
174| 5|) -> Result<(i64, Option<i64>, usize), AppError> {
175| 5| let (bytes_freed, oldest_deleted_at): (i64, Option<i64>) = if let Some(name) = name {
^0
176| 0| conn.query_row(
177| 0| "SELECT COALESCE(SUM(LENGTH(COALESCE(body,'')) + LENGTH(COALESCE(description,'')) + LENGTH(name)), 0),
178| 0| MIN(deleted_at)
179| 0| FROM memories
180| 0| WHERE deleted_at IS NOT NULL AND deleted_at <= ?1
181| 0| AND (?2 IS NULL OR namespace = ?2)
182| 0| AND name = ?3",
183| 0| rusqlite::params![cutoff_epoch, namespace_opt, name],
184| 0| |r| Ok((r.get::<_, i64>(0)?, r.get::<_, Option<i64>>(1)?)),
185| 0| )?
186| | } else {
187| 5| conn.query_row(
188| 5| "SELECT COALESCE(SUM(LENGTH(COALESCE(body,'')) + LENGTH(COALESCE(description,'')) + LENGTH(name)), 0),
189| 5| MIN(deleted_at)
190| 5| FROM memories
191| 5| WHERE deleted_at IS NOT NULL AND deleted_at <= ?1
192| 5| AND (?2 IS NULL OR namespace = ?2)",
193| 5| rusqlite::params![cutoff_epoch, namespace_opt],
194| 5| |r| Ok((r.get::<_, i64>(0)?, r.get::<_, Option<i64>>(1)?)),
^0 ^0
195| 0| )?
196| | };
197| |
198| 5| let count: usize = if let Some(name) = name {
^0
199| 0| conn.query_row(
200| 0| "SELECT COUNT(*) FROM memories
201| 0| WHERE deleted_at IS NOT NULL AND deleted_at <= ?1
202| 0| AND (?2 IS NULL OR namespace = ?2)
203| 0| AND name = ?3",
204| 0| rusqlite::params![cutoff_epoch, namespace_opt, name],
205| 0| |r| r.get::<_, usize>(0),
206| 0| )?
207| | } else {
208| 5| conn.query_row(
209| 5| "SELECT COUNT(*) FROM memories
210| 5| WHERE deleted_at IS NOT NULL AND deleted_at <= ?1
211| 5| AND (?2 IS NULL OR namespace = ?2)",
212| 5| rusqlite::params![cutoff_epoch, namespace_opt],
213| 5| |r| r.get::<_, usize>(0),
214| 0| )?
215| | };
216| |
217| 5| Ok((bytes_freed, oldest_deleted_at, count))
218| 5|}
219| |
220| 0|fn execute_purge(
221| 0| tx: &rusqlite::Transaction,
222| 0| namespace: &str,
223| 0| name: Option<&str>,
224| 0| cutoff_epoch: i64,
225| 0| warnings: &mut Vec<String>,
226| 0|) -> Result<(), AppError> {
227| 0| let candidates = select_candidates(tx, namespace, name, cutoff_epoch)?;
228| |
229| 0| for (memory_id, _name) in &candidates {
230| 0| if let Err(err) = tx.execute(
231| 0| "DELETE FROM vec_chunks WHERE memory_id = ?1",
232| 0| rusqlite::params![memory_id],
233| 0| ) {
234| 0| warnings.push(format!(
235| 0| "failed to clean vec_chunks for memory_id {memory_id}: {err}"
236| 0| ));
237| 0| }
238| 0| if let Err(err) = tx.execute(
239| 0| "DELETE FROM vec_memories WHERE memory_id = ?1",
240| 0| rusqlite::params![memory_id],
241| 0| ) {
242| 0| warnings.push(format!(
243| 0| "failed to clean vec_memories for memory_id {memory_id}: {err}"
244| 0| ));
245| 0| }
246| 0| tx.execute(
247| 0| "DELETE FROM memories WHERE id = ?1 AND namespace = ?2 AND deleted_at IS NOT NULL",
248| 0| rusqlite::params![memory_id, namespace],
249| 0| )?;
250| | }
251| |
252| 0| Ok(())
253| 0|}
254| |
255| 0|fn select_candidates(
256| 0| conn: &rusqlite::Connection,
257| 0| namespace: &str,
258| 0| name: Option<&str>,
259| 0| cutoff_epoch: i64,
260| 0|) -> Result<Vec<(i64, String)>, AppError> {
261| 0| let query = if name.is_some() {
262| 0| "SELECT id, name FROM memories
263| 0| WHERE namespace = ?1 AND name = ?2 AND deleted_at IS NOT NULL AND deleted_at <= ?3
264| 0| ORDER BY deleted_at ASC"
265| | } else {
266| 0| "SELECT id, name FROM memories
267| 0| WHERE namespace = ?1 AND deleted_at IS NOT NULL AND deleted_at <= ?2
268| 0| ORDER BY deleted_at ASC"
269| | };
270| |
271| 0| let mut stmt = conn.prepare_cached(query)?;
272| 0| let rows = if let Some(name) = name {
273| 0| stmt.query_map(rusqlite::params![namespace, name, cutoff_epoch], |row| {
274| 0| Ok((row.get::<_, i64>(0)?, row.get::<_, String>(1)?))
275| 0| })?
276| 0| .collect::<Result<Vec<_>, _>>()?
277| | } else {
278| 0| stmt.query_map(rusqlite::params![namespace, cutoff_epoch], |row| {
279| 0| Ok((row.get::<_, i64>(0)?, row.get::<_, String>(1)?))
280| 0| })?
281| 0| .collect::<Result<Vec<_>, _>>()?
282| | };
283| 0| Ok(rows)
284| 0|}
285| |
#[cfg(test)]
mod tests {
    use super::*;
    use rusqlite::Connection;

    /// Seconds in one day; test epochs are expressed as whole days before "now".
    const DAY_SECS: i64 = 86_400;

    /// Opens an in-memory database holding only the tables these tests touch.
    fn setup_test_db() -> Connection {
        let schema = "CREATE TABLE memories (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT NOT NULL,
                namespace TEXT NOT NULL DEFAULT 'global',
                description TEXT,
                body TEXT,
                deleted_at INTEGER
            );
            CREATE TABLE IF NOT EXISTS vec_chunks (memory_id INTEGER);
            CREATE TABLE IF NOT EXISTS vec_memories (memory_id INTEGER);";
        let conn = Connection::open_in_memory().expect("failed to open in-memory db");
        conn.execute_batch(schema)
            .expect("failed to create test tables");
        conn
    }

    /// Inserts an already soft-deleted memory and returns its row id.
    fn insert_deleted_memory(
        conn: &Connection,
        name: &str,
        namespace: &str,
        body: &str,
        deleted_at: i64,
    ) -> i64 {
        let inserted = conn.execute(
            "INSERT INTO memories (name, namespace, body, deleted_at) VALUES (?1, ?2, ?3, ?4)",
            rusqlite::params![name, namespace, body, deleted_at],
        );
        inserted.expect("failed to insert test memory");
        conn.last_insert_rowid()
    }

    /// Serializes a response to a JSON string, panicking on failure.
    fn to_json(resp: &PurgeResponse) -> String {
        serde_json::to_string(resp).expect("serialization failed")
    }

    #[test]
    fn retention_days_used_default_is_90() {
        assert_eq!(crate::constants::PURGE_RETENTION_DAYS_DEFAULT, 90u32);
    }

    #[test]
    fn compute_metrics_bytes_freed_positive_for_populated_body() {
        let conn = setup_test_db();
        let now = current_epoch().expect("epoch failed");
        // Deleted 100 days ago, cutoff at 30 days ago: the row is a candidate.
        insert_deleted_memory(
            &conn,
            "mem-test",
            "global",
            "memory body",
            now - 100 * DAY_SECS,
        );
        let cutoff = now - 30 * DAY_SECS;

        let metrics =
            compute_metrics(&conn, cutoff, Some("global"), None).expect("compute_metrics failed");
        let (bytes, oldest, count) = metrics;

        assert!(bytes > 0, "bytes_freed must be > 0 for populated body");
        assert!(oldest.is_some(), "oldest_deleted_at must be Some");
        assert_eq!(count, 1);
    }

    #[test]
    fn compute_metrics_returns_zero_without_candidates() {
        let conn = setup_test_db();
        let cutoff = current_epoch().expect("epoch failed") - 90 * DAY_SECS;

        let metrics =
            compute_metrics(&conn, cutoff, Some("global"), None).expect("compute_metrics failed");
        let (bytes, oldest, count) = metrics;

        assert_eq!(bytes, 0);
        assert!(oldest.is_none());
        assert_eq!(count, 0);
    }

    #[test]
    fn dry_run_does_not_delete_records() {
        // NOTE(review): this only calls `compute_metrics` twice; it does not
        // drive the command's dry-run path itself.
        let conn = setup_test_db();
        let now = current_epoch().expect("epoch failed");
        insert_deleted_memory(
            &conn,
            "mem-dry",
            "global",
            "dry run content",
            now - 200 * DAY_SECS,
        );
        let cutoff = now - 30 * DAY_SECS;

        let first =
            compute_metrics(&conn, cutoff, Some("global"), None).expect("compute_metrics failed");
        assert_eq!(first.2, 1, "must have 1 candidate before dry run");

        let second =
            compute_metrics(&conn, cutoff, Some("global"), None).expect("compute_metrics failed");
        assert_eq!(
            second.2, 1,
            "dry_run must not remove records: count must remain 1"
        );
    }

    #[test]
    fn oldest_deleted_at_returns_smallest_epoch() {
        let conn = setup_test_db();
        let now = current_epoch().expect("epoch failed");
        let epoch_old = now - 300 * DAY_SECS;
        let epoch_recent = now - 200 * DAY_SECS;
        for (name, body, deleted_at) in [
            ("mem-a", "body-a", epoch_old),
            ("mem-b", "body-b", epoch_recent),
        ] {
            insert_deleted_memory(&conn, name, "global", body, deleted_at);
        }

        let cutoff = now - 30 * DAY_SECS;
        let (_, oldest, count) =
            compute_metrics(&conn, cutoff, Some("global"), None).expect("compute_metrics failed");

        assert_eq!(count, 2);
        assert_eq!(
            oldest,
            Some(epoch_old),
            "oldest_deleted_at must be the oldest epoch"
        );
    }

    #[test]
    fn purge_args_namespace_accepts_none_without_default() {
        // P1-C: namespace must be None when not provided, allowing resolve_namespace
        // to consult SQLITE_GRAPHRAG_NAMESPACE before falling back to "global".
        // The field was `default_value = "global"` before P1-C; with that removed,
        // resolve_namespace(None) consults the env var correctly.
        let outcome = crate::namespace::resolve_namespace(None);
        let resolved = outcome.expect("resolve_namespace(None) must return Ok");
        assert_eq!(
            resolved, "global",
            "without env var, resolve_namespace(None) must fall back to 'global'"
        );
    }

    #[test]
    fn purge_response_serializes_all_new_fields() {
        let json = to_json(&PurgeResponse {
            action: "purged".to_string(),
            purged_count: 3,
            bytes_freed: 1024,
            oldest_deleted_at: Some(1_700_000_000),
            retention_days_used: 90,
            dry_run: false,
            namespace: Some("global".to_string()),
            cutoff_epoch: 1_710_000_000,
            warnings: vec![],
            elapsed_ms: 42,
            message: None,
        });

        for key in [
            "bytes_freed",
            "oldest_deleted_at",
            "retention_days_used",
            "dry_run",
            "elapsed_ms",
        ] {
            assert!(json.contains(key));
        }
        // M2: when no purge happened, `message` is omitted to keep payloads stable.
        assert!(!json.contains("\"message\""));
    }

    #[test]
    fn purge_response_serializes_message_when_present() {
        // M2 (v1.0.32): zero purges include a human-readable hint message.
        let hint = "no soft-deleted memories older than 90 day(s); use --retention-days 0 to purge all soft-deleted memories regardless of age"
            .to_string();
        let json = to_json(&PurgeResponse {
            action: "purged".to_string(),
            purged_count: 0,
            bytes_freed: 0,
            oldest_deleted_at: None,
            retention_days_used: 90,
            dry_run: false,
            namespace: Some("global".to_string()),
            cutoff_epoch: 1_710_000_000,
            warnings: vec![],
            elapsed_ms: 5,
            message: Some(hint),
        });

        assert!(json.contains("\"message\""));
        assert!(json.contains("--retention-days 0"));
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/read.rs:
1| |//! Handler for the `read` CLI subcommand.
2| |
3| |use crate::errors::AppError;
4| |use crate::output;
5| |use crate::paths::AppPaths;
6| |use crate::storage::connection::open_ro;
7| |use crate::storage::memories;
8| |use serde::Serialize;
9| |
// Arguments accepted by the `read` subcommand.
//
// A memory is selected by name (positional NAME or `--name`, which conflict
// with each other) or by `--id` (which conflicts with both name forms). When
// none of the three is supplied, `run` returns a validation error.
//
// Additions below are plain `//` comments on purpose: clap's derive macros
// turn `///` doc comments into help text, so editing those would change the
// CLI's `--help` output.
10| |#[derive(clap::Args)]
11| |#[command(after_long_help = "EXAMPLES:\n \
12| | # Read a memory by name (positional)\n \
13| | sqlite-graphrag read onboarding\n\n \
14| | # Read using the named flag form\n \
15| | sqlite-graphrag read --name onboarding\n\n \
16| | # Read by memory ID (integer emitted in JSON output of most commands)\n \
17| | sqlite-graphrag read --id 42 --json\n\n \
18| | # Read from a specific namespace\n \
19| | sqlite-graphrag read onboarding --namespace my-project")]
20| |pub struct ReadArgs {
21| | /// Memory name as a positional argument. Alternative to `--name`.
22| | #[arg(
23| | value_name = "NAME",
24| | conflicts_with = "name",
25| | help = "Memory name (kebab-case slug); alternative to --name"
26| | )]
27| | pub name_positional: Option<String>,
28| | /// Memory name to read. Returns NotFound (exit 4) if missing or soft-deleted.
29| | #[arg(long)]
30| | pub name: Option<String>,
31| | /// Memory ID (integer) for direct lookup. Conflicts with --name and positional NAME.
32| | #[arg(
33| | long,
34| | conflicts_with_all = ["name", "name_positional"],
35| | help = "Memory ID (integer) for direct lookup"
36| | )]
37| | pub id: Option<i64>,
// Optional namespace override; `run` passes it to
// `crate::namespace::resolve_namespace`. With `--id`, a row whose namespace
// differs from the resolved one is reported as NotFound.
38| | #[arg(
39| | long,
40| | help = "Namespace (env: SQLITE_GRAPHRAG_NAMESPACE, default: global)"
41| | )]
42| | pub namespace: Option<String>,
43| | /// Include linked entities and relationships in the response.
44| | #[arg(
45| | long,
46| | help = "Include graph context (entities + relationships) in response"
47| | )]
48| | pub with_graph: bool,
// Hidden compatibility flag; `run` never reads it.
49| | #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
50| | pub json: bool,
// Database path override, also settable through SQLITE_GRAPHRAG_DB_PATH;
// forwarded to `AppPaths::resolve`.
51| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
52| | pub db: Option<String>,
53| |}
54| |
/// JSON payload emitted by the `read` subcommand for a single memory.
///
/// Some values are deliberately emitted under two keys (`id`/`memory_id`,
/// `type`/`memory_type`); `run` fills each pair from the same row value.
55| |#[derive(Serialize)]
56| |struct ReadResponse {
57| | /// Canonical storage field. Preserved for compatibility with v2.0.0 clients.
58| | id: i64,
59| | /// Semantic alias of `id` for the contract documented in SKILL.md.
60| | memory_id: i64,
/// Namespace stored on the row.
61| | namespace: String,
/// Memory name stored on the row.
62| | name: String,
63| | /// Semantic alias of `memory_type` for the documented contract.
64| | #[serde(rename = "type")]
65| | type_alias: String,
/// Memory type as stored on the row; also emitted under the `type` key.
66| | memory_type: String,
67| | description: String,
68| | body: String,
69| | body_hash: String,
/// Session identifier from the row; serializes as `null` when absent.
70| | session_id: Option<String>,
71| | source: String,
/// Row metadata parsed as JSON. `run` substitutes `null` when the stored
/// text is not valid JSON.
72| | metadata: serde_json::Value,
73| | /// Most recent memory version, useful for optimistic control via `--expected-updated-at`.
74| | version: i64,
/// Creation time copied from the row (Unix epoch).
75| | created_at: i64,
76| | /// RFC 3339 UTC timestamp parallel to `created_at` for ISO 8601 parsers.
77| | created_at_iso: String,
/// Last-update time copied from the row (Unix epoch).
78| | updated_at: i64,
79| | /// RFC 3339 UTC timestamp parallel to `updated_at` for ISO 8601 parsers.
80| | updated_at_iso: String,
81| | /// Linked entities (opt-in via --with-graph).
82| | #[serde(skip_serializing_if = "Option::is_none")]
83| | entities: Option<Vec<ReadEntityBinding>>,
84| | /// Relationships from linked entities (opt-in via --with-graph).
85| | #[serde(skip_serializing_if = "Option::is_none")]
86| | relationships: Option<Vec<ReadRelationshipBinding>>,
87| | /// Total execution time in milliseconds from handler start to serialisation.
88| | elapsed_ms: u64,
89| |}
90| |
/// One entity linked to the memory, emitted under `entities` when
/// `--with-graph` is set.
///
/// `run` fills it from `entities.id`, `entities.name`, and `entities.type`
/// for rows joined through `memory_entities`.
91| |#[derive(Serialize)]
92| |struct ReadEntityBinding {
93| | entity_id: i64,
94| | name: String,
95| | entity_type: String,
96| |}
97| |
/// One relationship touching a linked entity, emitted under `relationships`
/// when `--with-graph` is set.
///
/// `run` fills `from` and `to` with the names of the entities referenced by
/// `relationships.source_id` and `relationships.target_id`; `relation` and
/// `weight` are copied from the relationship row.
98| |#[derive(Serialize)]
99| |struct ReadRelationshipBinding {
100| | from: String,
101| | to: String,
102| | relation: String,
103| | weight: f64,
104| |}
105| |
/// Formats a Unix epoch (seconds) as a timestamp string by delegating to
/// `crate::tz::epoch_to_iso`.
///
/// The module's tests parse the result as RFC 3339 and expect a non-empty
/// string even for out-of-range input such as `i64::MIN`.
106| 3|fn epoch_to_iso(epoch: i64) -> String {
107| 3| crate::tz::epoch_to_iso(epoch)
108| 3|}
109| |
110| 0|pub fn run(args: ReadArgs) -> Result<(), AppError> {
111| 0| let start = std::time::Instant::now();
112| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
113| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
114| 0| crate::storage::connection::ensure_db_ready(&paths)?;
115| 0| let conn = open_ro(&paths.db)?;
116| |
117| 0| let row_opt = if let Some(id) = args.id {
118| 0| let r = memories::read_full(&conn, id)?;
119| 0| if let Some(ref row) = r {
120| 0| if row.namespace != namespace {
121| 0| return Err(AppError::NotFound(format!(
122| 0| "memory id {id} exists but belongs to namespace '{}', not '{namespace}'",
123| 0| row.namespace
124| 0| )));
125| 0| }
126| 0| }
127| 0| r
128| | } else {
129| 0| let name = args.name_positional.or(args.name).ok_or_else(|| {
130| 0| AppError::Validation(
131| 0| "name or --id required: pass name as positional argument, via --name, or use --id"
132| 0| .to_string(),
133| 0| )
134| 0| })?;
135| 0| memories::read_by_name(&conn, &namespace, &name)?
136| | };
137| |
138| 0| match row_opt {
139| 0| Some(row) => {
140| | // Resolve current version via memory_versions table (highest version for this memory_id).
141| 0| let version: i64 = conn
142| 0| .query_row(
143| 0| "SELECT COALESCE(MAX(version), 1) FROM memory_versions WHERE memory_id=?1",
144| 0| rusqlite::params![row.id],
145| 0| |r| r.get(0),
146| | )
147| 0| .unwrap_or(1);
148| |
149| | // G22: optional graph context
150| 0| let (entities, relationships) = if args.with_graph {
151| 0| let mut ent_stmt = conn.prepare_cached(
152| 0| "SELECT e.id, e.name, e.type FROM memory_entities me \
153| 0| JOIN entities e ON e.id = me.entity_id \
154| 0| WHERE me.memory_id = ?1",
155| 0| )?;
156| 0| let ents: Vec<ReadEntityBinding> = ent_stmt
157| 0| .query_map(rusqlite::params![row.id], |r| {
158| | Ok(ReadEntityBinding {
159| 0| entity_id: r.get(0)?,
160| 0| name: r.get(1)?,
161| 0| entity_type: r.get(2)?,
162| | })
163| 0| })?
164| 0| .filter_map(|r| r.ok())
165| 0| .collect();
166| 0| drop(ent_stmt);
167| |
168| 0| let entity_ids: Vec<i64> = ents.iter().map(|e| e.entity_id).collect();
169| 0| let rels: Vec<ReadRelationshipBinding> = if !entity_ids.is_empty() {
170| 0| let placeholders: String = entity_ids
171| 0| .iter()
172| 0| .map(|id| id.to_string())
173| 0| .collect::<Vec<_>>()
174| 0| .join(",");
175| 0| let sql = format!(
176| 0| "SELECT e1.name, e2.name, r.relation, r.weight \
177| 0| FROM relationships r \
178| 0| JOIN entities e1 ON e1.id = r.source_id \
179| 0| JOIN entities e2 ON e2.id = r.target_id \
180| 0| WHERE r.source_id IN ({placeholders}) OR r.target_id IN ({placeholders})"
181| | );
182| 0| let mut rel_stmt = conn.prepare(&sql)?;
183| 0| let result: Vec<ReadRelationshipBinding> = rel_stmt
184| 0| .query_map([], |r| {
185| | Ok(ReadRelationshipBinding {
186| 0| from: r.get(0)?,
187| 0| to: r.get(1)?,
188| 0| relation: r.get(2)?,
189| 0| weight: r.get(3)?,
190| | })
191| 0| })?
192| 0| .filter_map(|r| r.ok())
193| 0| .collect();
194| 0| drop(rel_stmt);
195| 0| result
196| | } else {
197| 0| vec![]
198| | };
199| 0| (Some(ents), Some(rels))
200| | } else {
201| 0| (None, None)
202| | };
203| |
204| 0| let response = ReadResponse {
205| 0| id: row.id,
206| 0| memory_id: row.id,
207| 0| namespace: row.namespace,
208| 0| name: row.name,
209| 0| type_alias: row.memory_type.clone(),
210| 0| memory_type: row.memory_type,
211| 0| description: row.description,
212| 0| body: row.body,
213| 0| body_hash: row.body_hash,
214| 0| session_id: row.session_id,
215| 0| source: row.source,
216| 0| metadata: serde_json::from_str::<serde_json::Value>(&row.metadata)
217| 0| .unwrap_or(serde_json::Value::Null),
218| 0| version,
219| 0| created_at: row.created_at,
220| 0| created_at_iso: epoch_to_iso(row.created_at),
221| 0| updated_at: row.updated_at,
222| 0| updated_at_iso: epoch_to_iso(row.updated_at),
223| 0| entities,
224| 0| relationships,
225| 0| elapsed_ms: start.elapsed().as_millis() as u64,
226| 0| };
227| 0| output::emit_json(&response)?;
228| | }
229| | None => {
230| 0| let label = if let Some(id) = args.id {
231| 0| format!("id={id}")
232| | } else {
233| 0| "unknown".to_string()
234| | };
235| 0| return Err(AppError::NotFound(format!(
236| 0| "memory not found: {label} in namespace '{namespace}'"
237| 0| )));
238| | }
239| | }
240| |
241| 0| Ok(())
242| 0|}
243| |
#[cfg(test)]
mod tests {
    use super::*;

    /// Baseline response shared by the serialization tests. Each test
    /// overrides only the fields it asserts on via struct-update syntax.
    fn sample_response() -> ReadResponse {
        ReadResponse {
            id: 1,
            memory_id: 1,
            namespace: "global".to_string(),
            name: "mem".to_string(),
            type_alias: "fact".to_string(),
            memory_type: "fact".to_string(),
            description: "d".to_string(),
            body: "b".to_string(),
            body_hash: "h".to_string(),
            session_id: None,
            source: "agent".to_string(),
            metadata: serde_json::json!({}),
            version: 1,
            created_at: 0,
            created_at_iso: "1970-01-01T00:00:00Z".to_string(),
            updated_at: 0,
            updated_at_iso: "1970-01-01T00:00:00Z".to_string(),
            entities: None,
            relationships: None,
            elapsed_ms: 0,
        }
    }

    /// Serializes a response into a `serde_json::Value`, panicking on failure.
    fn to_json(resp: &ReadResponse) -> serde_json::Value {
        serde_json::to_value(resp).expect("serialization failed")
    }

    #[test]
    fn epoch_to_iso_converts_zero_to_unix_epoch() {
        // v1.0.68 (test fix): parse the ISO back into a DateTime<FixedOffset>
        // and compare with chrono::DateTime::UNIX_EPOCH so the assertion is
        // timezone-agnostic. The previous `starts_with("1970-01-01T00:00:00")`
        // assertion leaked the global SQLITE_GRAPHRAG_DISPLAY_TZ from sibling
        // tests in the same process and failed on hosts where the default
        // timezone is non-UTC.
        let result = epoch_to_iso(0);
        let parsed = chrono::DateTime::parse_from_rfc3339(&result)
            .unwrap_or_else(|e| panic!("epoch_to_iso(0) returned non-RFC3339 `{result}`: {e}"));
        assert_eq!(
            parsed.timestamp(),
            chrono::DateTime::UNIX_EPOCH.timestamp(),
            "epoch 0 must map to the Unix epoch instant, got: {result}"
        );
    }

    #[test]
    fn epoch_to_iso_converts_known_timestamp() {
        // v1.0.68 (test fix): 1_705_320_000 = 2024-01-15T12:00:00Z, not
        // 2024-01-15T00:00:00Z (the previous test asserted the wrong instant).
        // The fix uses parse + timestamp compare to be timezone-agnostic and
        // to catch wrong-epoch regressions regardless of host TZ.
        let result = epoch_to_iso(1_705_320_000);
        let parsed = chrono::DateTime::parse_from_rfc3339(&result).unwrap_or_else(|e| {
            panic!("epoch_to_iso(1705320000) returned non-RFC3339 `{result}`: {e}")
        });
        let expected = chrono::DateTime::parse_from_rfc3339("2024-01-15T12:00:00+00:00")
            .expect("static RFC3339 is valid");
        assert_eq!(
            parsed.timestamp(),
            expected.timestamp(),
            "timestamp 1705320000 must map to 2024-01-15T12:00:00Z, got: {result}"
        );
    }

    #[test]
    fn epoch_to_iso_returns_fallback_for_invalid_negative_epoch() {
        let result = epoch_to_iso(i64::MIN);
        assert!(
            !result.is_empty(),
            "must return a non-empty string even for invalid epoch"
        );
    }

    #[test]
    fn read_response_serializes_id_and_memory_id_aliases() {
        let json = to_json(&ReadResponse {
            id: 42,
            memory_id: 42,
            name: "my-mem".to_string(),
            description: "desc".to_string(),
            body: "body".to_string(),
            body_hash: "abc123".to_string(),
            created_at: 1_705_320_000,
            created_at_iso: "2024-01-15T12:00:00Z".to_string(),
            updated_at: 1_705_320_000,
            updated_at_iso: "2024-01-15T12:00:00Z".to_string(),
            elapsed_ms: 5,
            ..sample_response()
        });

        assert_eq!(json["id"], 42);
        assert_eq!(json["memory_id"], 42);
        assert_eq!(json["type"], "fact");
        assert_eq!(json["memory_type"], "fact");
        assert_eq!(json["elapsed_ms"], 5u64);
        assert!(
            json["session_id"].is_null(),
            "session_id None must serialize as null"
        );
        // metadata must serialize as a JSON object, not as an escaped string
        assert!(
            json["metadata"].is_object(),
            "metadata must be a JSON object"
        );
    }

    #[test]
    fn read_response_session_id_some_serializes_string() {
        let json = to_json(&ReadResponse {
            type_alias: "skill".to_string(),
            memory_type: "skill".to_string(),
            session_id: Some("sess-123".to_string()),
            version: 2,
            ..sample_response()
        });

        assert_eq!(json["session_id"], "sess-123");
    }

    #[test]
    fn read_response_elapsed_ms_is_present() {
        let json = to_json(&ReadResponse {
            id: 7,
            memory_id: 7,
            namespace: "ns".to_string(),
            name: "n".to_string(),
            type_alias: "procedure".to_string(),
            memory_type: "procedure".to_string(),
            version: 3,
            created_at: 1000,
            created_at_iso: "1970-01-01T00:16:40Z".to_string(),
            updated_at: 2000,
            updated_at_iso: "1970-01-01T00:33:20Z".to_string(),
            elapsed_ms: 123,
            ..sample_response()
        });

        assert_eq!(json["elapsed_ms"], 123u64);
        assert!(json["created_at_iso"].is_string());
        assert!(json["updated_at_iso"].is_string());
    }

    #[test]
    fn read_response_metadata_object_not_escaped_string() {
        // P2-A: metadata must serialize as a JSON object, not as an escaped string.
        let json = to_json(&ReadResponse {
            id: 3,
            memory_id: 3,
            namespace: "ns".to_string(),
            name: "meta-test".to_string(),
            metadata: serde_json::json!({"key": "value", "number": 42}),
            elapsed_ms: 1,
            ..sample_response()
        });

        // Must be object, not a JSON string containing escaped JSON.
        assert!(json["metadata"].is_object());
        assert_eq!(json["metadata"]["key"], "value");
        assert_eq!(json["metadata"]["number"], 42);
    }

    #[test]
    fn read_response_metadata_fallback_to_null_for_invalid_json() {
        // P2-A: fallback when metadata is an invalid string. This mirrors the
        // `from_str(..).unwrap_or(Null)` expression used when building the
        // response; it does not call `run` itself.
        let raw = "invalid-json{{{";
        let parsed =
            serde_json::from_str::<serde_json::Value>(raw).unwrap_or(serde_json::Value::Null);
        assert!(parsed.is_null());
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/recall.rs:
1| |//! Handler for the `recall` CLI subcommand.
2| |
3| |use crate::cli::MemoryType;
4| |use crate::errors::AppError;
5| |use crate::graph::traverse_from_memories_with_hops;
6| |use crate::i18n::errors_msg;
7| |use crate::output::{self, JsonOutputFormat, RecallItem, RecallResponse};
8| |use crate::paths::AppPaths;
9| |use crate::storage::connection::open_ro;
10| |use crate::storage::entities;
11| |use crate::storage::memories;
12| |
// Review notes in this block are plain `//` comments on purpose: clap's
// derive macros turn `///` doc comments into help text, so editing those
// would change the CLI's `--help` output.
//
// NOTE(review): the doc below says an omitted `--namespace` searches `global`.
// `run` resolves the value through `crate::namespace::resolve_namespace`,
// which elsewhere in this crate is described as consulting
// SQLITE_GRAPHRAG_NAMESPACE before falling back to "global" — confirm and
// align the help text.
13| |/// Arguments for the `recall` subcommand.
14| |///
15| |/// When `--namespace` is omitted the query runs against the `global` namespace,
16| |/// which is the default namespace used by `remember` when no `--namespace` flag
17| |/// is provided. Pass an explicit `--namespace` value to search a different
18| |/// isolated namespace.
19| |#[derive(clap::Args)]
20| |#[command(after_long_help = "EXAMPLES:\n \
21| | # Semantic search for top 5 matches\n \
22| | sqlite-graphrag recall \"authentication design\" --k 5\n\n \
23| | # Disable automatic graph expansion\n \
24| | sqlite-graphrag recall \"JWT tokens\" --k 3 --no-graph\n\n \
25| | # Limit graph traversal depth and minimum edge weight\n \
26| | sqlite-graphrag recall \"auth\" --k 5 --max-hops 2 --min-weight 0.3\n\n \
27| | # Filter by memory type\n \
28| | sqlite-graphrag recall \"deployment\" --type decision --k 10\n\n \
29| | # Cap results by distance threshold\n \
30| | sqlite-graphrag recall \"API design\" --k 5 --max-distance 0.8\n\n \
31| |NOTES:\n \
32| | When --no-graph is active, graph traversal is skipped and every result has\n \
33| | source=\"direct\". The source field is therefore redundant with --no-graph and\n \
34| | may be ignored by callers in that mode.")]
35| |pub struct RecallArgs {
// Free-text query; `run` rejects it with a validation error when it is empty
// after trimming.
36| | #[arg(
37| | allow_hyphen_values = true,
38| | help = "Search query string (semantic vector search via sqlite-vec)"
39| | )]
40| | pub query: String,
41| | /// Maximum number of direct vector matches to return.
42| | ///
43| | /// Note: this flag controls only `direct_matches`. Graph traversal results
44| | /// (`graph_matches`) are unbounded by default; use `--max-graph-results` to
45| | /// cap them independently. The `results` field aggregates both lists.
46| | /// Validated to the inclusive range `1..=4096` (the upper bound matches
47| | /// `sqlite-vec`'s knn limit; out-of-range values are rejected at parse time).
48| | #[arg(short = 'k', long, aliases = ["limit", "top-k"], default_value = "10", value_parser = crate::parsers::parse_k_range)]
49| | pub k: usize,
50| | /// Filter by memory.type. Note: distinct from graph entity_type
51| | /// (project/tool/person/file/concept/incident/decision/memory/dashboard/issue_tracker/organization/location/date)
52| | /// used in --entities-file.
53| | #[arg(long, value_enum)]
54| | pub r#type: Option<MemoryType>,
// Optional namespace override passed to `resolve_namespace` in `run`;
// mutually exclusive with `--all-namespaces`.
55| | #[arg(long)]
56| | pub namespace: Option<String>,
// Skips graph traversal. `run` rejects this flag when `--max-hops` or
// `--min-weight` differ from their defaults.
57| | #[arg(long)]
58| | pub no_graph: bool,
59| | /// Disable -k cap and return all direct matches without truncation.
60| | ///
61| | /// When set, the `-k`/`--k` flag is ignored for `direct_matches` and the
62| | /// response includes every match above the distance threshold. Useful when
63| | /// callers need the complete set rather than a top-N preview.
// NOTE(review): `run` implements this by raising k to 100_000 rather than
// removing the cap; that is above the 1..=4096 range documented for `-k` —
// confirm the vector search accepts it.
64| | #[arg(long)]
65| | pub precise: bool,
// The defaults of the next two flags (2 and 0.3) are repeated as literals in
// `run`'s `--no-graph` validation; keep them in sync.
66| | #[arg(long, default_value = "2")]
67| | pub max_hops: u32,
68| | #[arg(long, default_value = "0.3")]
69| | pub min_weight: f64,
70| | /// Cap the size of `graph_matches` to at most N entries.
71| | ///
72| | /// Defaults to unbounded (`None`) so existing pipelines see the same shape
73| | /// as in v1.0.22 and earlier. Set this when a query touches a dense graph
74| | /// neighbourhood and the caller only needs a top-N preview. Added in v1.0.23.
75| | #[arg(long, value_name = "N")]
76| | pub max_graph_results: Option<usize>,
77| | /// Filter results by maximum distance. Results with distance greater than this value
78| | /// are excluded. If all matches exceed this threshold, the command exits with code 4
79| | /// (`not found`) per the documented public contract.
80| | /// Default `1.0` disables the filter and preserves the top-k behavior.
// NOTE(review): the `min-distance` alias names the opposite bound; presumably
// kept for backward compatibility — confirm.
81| | #[arg(long, alias = "min-distance", default_value = "1.0")]
82| | pub max_distance: f32,
// `run` currently discards this value (`let _ = args.format;`).
83| | #[arg(long, value_enum, default_value_t = JsonOutputFormat::Json)]
84| | pub format: JsonOutputFormat,
// Database path override, also settable through SQLITE_GRAPHRAG_DB_PATH;
// forwarded to `AppPaths::resolve`.
85| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
86| | pub db: Option<String>,
87| | /// Accept `--json` as a no-op because output is already JSON by default.
88| | #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
89| | pub json: bool,
90| | /// Search across all namespaces instead of a single namespace.
91| | ///
92| | /// Cannot be combined with `--namespace`. When set, the query runs against
93| | /// every namespace and results include a `namespace` field to identify origin.
94| | #[arg(long, conflicts_with = "namespace")]
95| | pub all_namespaces: bool,
// Shared daemon options; `run` reads `autostart_daemon` when computing the
// query embedding.
96| | #[command(flatten)]
97| | pub daemon: crate::cli::DaemonOpts,
98| |}
99| |
100| |#[tracing::instrument(skip_all, level = "debug", name = "recall")]
101| 0|pub fn run(args: RecallArgs) -> Result<(), AppError> {
102| 0| let start = std::time::Instant::now();
103| 0| let _ = args.format;
104| 0| tracing::debug!(target: "recall", query = %args.query, k = args.k, "searching");
105| |
106| | // G20: reject graph-specific flags when --no-graph is active
107| 0| if args.no_graph {
108| 0| if args.max_hops != 2 {
109| 0| return Err(AppError::Validation(
110| 0| "--max-hops has no effect with --no-graph; remove one".to_string(),
111| 0| ));
112| 0| }
113| 0| if (args.min_weight - 0.3).abs() > f64::EPSILON {
114| 0| return Err(AppError::Validation(
115| 0| "--min-weight has no effect with --no-graph; remove one".to_string(),
116| 0| ));
117| 0| }
118| 0| }
119| |
120| 0| if args.query.trim().is_empty() {
121| 0| return Err(AppError::Validation(crate::i18n::validation::empty_query()));
122| 0| }
123| | // Resolve the list of namespaces to search:
124| | // - empty vec => all namespaces (sentinel used by knn_search)
125| | // - single vec => one namespace (default or --namespace value)
126| 0| let namespaces: Vec<String> = if args.all_namespaces {
127| 0| Vec::new()
128| | } else {
129| 0| vec![crate::namespace::resolve_namespace(
130| 0| args.namespace.as_deref(),
131| 0| )?]
132| | };
133| | // Single namespace string used for graph traversal and error messages.
134| 0| let namespace_for_graph = namespaces
135| 0| .first()
136| 0| .cloned()
137| 0| .unwrap_or_else(|| "global".to_string());
138| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
139| |
140| 0| crate::storage::connection::ensure_db_ready(&paths)?;
141| |
142| 0| output::emit_progress_i18n(
143| 0| "Computing query embedding...",
144| 0| "Calculando embedding da consulta...",
145| | );
146| 0| let embedding = crate::daemon::embed_query_or_local(
147| 0| &paths.models,
148| 0| &args.query,
149| 0| args.daemon.autostart_daemon,
150| 0| )?;
151| |
152| 0| let conn = open_ro(&paths.db)?;
153| |
154| 0| let memory_type_str = args.r#type.map(|t| t.as_str());
155| | // When --precise is set, lift the -k cap so every match is returned; the
156| | // max_distance filter below will trim irrelevant results instead.
157| 0| let effective_k = if args.precise { 100_000 } else { args.k };
158| 0| let knn_results =
159| 0| memories::knn_search(&conn, &embedding, &namespaces, memory_type_str, effective_k)?;
160| |
161| 0| let mut direct_matches = Vec::with_capacity(effective_k);
162| 0| let mut memory_ids: Vec<i64> = Vec::with_capacity(effective_k);
163| 0| for (memory_id, distance) in knn_results {
164| 0| let row = {
165| 0| let mut stmt = conn.prepare_cached(
166| 0| "SELECT id, namespace, name, type, description, body, body_hash,
167| 0| session_id, source, metadata, created_at, updated_at
168| 0| FROM memories WHERE id=?1 AND deleted_at IS NULL",
169| 0| )?;
170| 0| stmt.query_row(rusqlite::params![memory_id], |r| {
171| | Ok(memories::MemoryRow {
172| 0| id: r.get(0)?,
173| 0| namespace: r.get(1)?,
174| 0| name: r.get(2)?,
175| 0| memory_type: r.get(3)?,
176| 0| description: r.get(4)?,
177| 0| body: r.get(5)?,
178| 0| body_hash: r.get(6)?,
179| 0| session_id: r.get(7)?,
180| 0| source: r.get(8)?,
181| 0| metadata: r.get(9)?,
182| 0| created_at: r.get(10)?,
183| 0| updated_at: r.get(11)?,
184| 0| deleted_at: None,
185| | })
186| 0| })
187| 0| .ok()
188| | };
189| 0| if let Some(row) = row {
190| 0| let snippet: String = row.body.chars().take(300).collect();
191| 0| direct_matches.push(RecallItem {
192| 0| memory_id: row.id,
193| 0| name: row.name,
194| 0| namespace: row.namespace,
195| 0| memory_type: row.memory_type,
196| 0| description: row.description,
197| 0| snippet,
198| 0| distance,
199| 0| score: RecallItem::score_from_distance(distance),
200| 0| source: "direct".to_string(),
201| 0| // Direct vector matches do not have a graph depth; rely on `distance`.
202| 0| graph_depth: None,
203| 0| });
204| 0| memory_ids.push(memory_id);
205| 0| }
206| | }
207| |
208| 0| let mut graph_matches = Vec::with_capacity(8);
209| 0| if !args.no_graph {
210| 0| let entity_knn = entities::knn_search(&conn, &embedding, &namespace_for_graph, 5)?;
211| 0| let entity_ids: Vec<i64> = entity_knn.iter().map(|(id, _)| *id).collect();
212| |
213| 0| let all_seed_ids: Vec<i64> = memory_ids
214| 0| .iter()
215| 0| .chain(entity_ids.iter())
216| 0| .copied()
217| 0| .collect();
218| |
219| 0| if !all_seed_ids.is_empty() {
220| 0| let graph_memory_ids = traverse_from_memories_with_hops(
221| 0| &conn,
222| 0| &all_seed_ids,
223| 0| &namespace_for_graph,
224| 0| args.min_weight,
225| 0| args.max_hops,
226| 0| )?;
227| |
228| 0| for (graph_mem_id, hop) in graph_memory_ids {
229| | // v1.0.23: respect the optional cap on graph results so dense
230| | // neighbourhoods do not flood the response unintentionally.
231| 0| if let Some(cap) = args.max_graph_results {
232| 0| if graph_matches.len() >= cap {
233| 0| break;
234| 0| }
235| 0| }
236| 0| let row = {
237| 0| let mut stmt = conn.prepare_cached(
238| 0| "SELECT id, namespace, name, type, description, body, body_hash,
239| 0| session_id, source, metadata, created_at, updated_at
240| 0| FROM memories WHERE id=?1 AND deleted_at IS NULL",
241| 0| )?;
242| 0| stmt.query_row(rusqlite::params![graph_mem_id], |r| {
243| | Ok(memories::MemoryRow {
244| 0| id: r.get(0)?,
245| 0| namespace: r.get(1)?,
246| 0| name: r.get(2)?,
247| 0| memory_type: r.get(3)?,
248| 0| description: r.get(4)?,
249| 0| body: r.get(5)?,
250| 0| body_hash: r.get(6)?,
251| 0| session_id: r.get(7)?,
252| 0| source: r.get(8)?,
253| 0| metadata: r.get(9)?,
254| 0| created_at: r.get(10)?,
255| 0| updated_at: r.get(11)?,
256| 0| deleted_at: None,
257| | })
258| 0| })
259| 0| .ok()
260| | };
261| 0| if let Some(row) = row {
262| 0| let snippet: String = row.body.chars().take(300).collect();
263| 0| // Compute approximate distance from graph hop count.
264| 0| // WARNING: graph_distance is a hop-count proxy, NOT real cosine distance.
265| 0| // For confident ranking, prefer the `graph_depth` field (set to Some(hop)
266| 0| // below). Real cosine distance for graph matches would require
267| 0| // re-embedding (200-500ms latency) and is reserved for v1.0.28.
268| 0| let graph_distance = 1.0 - 1.0 / (hop as f32 + 1.0);
269| 0| graph_matches.push(RecallItem {
270| 0| memory_id: row.id,
271| 0| name: row.name,
272| 0| namespace: row.namespace,
273| 0| memory_type: row.memory_type,
274| 0| description: row.description,
275| 0| snippet,
276| 0| distance: graph_distance,
277| 0| score: RecallItem::score_from_distance(graph_distance),
278| 0| source: "graph".to_string(),
279| 0| graph_depth: Some(hop),
280| 0| });
281| 0| }
282| | }
283| 0| }
284| 0| }
285| |
286| | // Filtrar por max_distance se < 1.0 (ativado). Se nenhum hit dentro do threshold, exit 4.
287| 0| if args.max_distance < 1.0 {
288| 0| let has_relevant = direct_matches
289| 0| .iter()
290| 0| .any(|item| item.distance <= args.max_distance);
291| 0| if !has_relevant {
292| 0| return Err(AppError::NotFound(errors_msg::no_recall_results(
293| 0| args.max_distance,
294| 0| &args.query,
295| 0| &namespace_for_graph,
296| 0| )));
297| 0| }
298| 0| }
299| |
300| 0| let results: Vec<RecallItem> = direct_matches
301| 0| .iter()
302| 0| .cloned()
303| 0| .chain(graph_matches.iter().cloned())
304| 0| .collect();
305| |
306| 0| output::emit_json(&RecallResponse {
307| 0| query: args.query,
308| 0| k: args.k,
309| 0| direct_matches,
310| 0| graph_matches,
311| 0| results,
312| 0| elapsed_ms: start.elapsed().as_millis() as u64,
313| 0| })?;
314| |
315| 0| Ok(())
316| 0|}
317| |
#[cfg(test)]
mod tests {
    use crate::output::{RecallItem, RecallResponse};

    /// Builds a `RecallItem` fixture with fixed metadata.
    ///
    /// Items whose `source` is `"graph"` get `graph_depth = Some(0)`; every
    /// other source gets `None`. The score is derived from `distance` through
    /// `RecallItem::score_from_distance`.
    fn make_item(name: &str, distance: f32, source: &str) -> RecallItem {
        let graph_depth = match source {
            "graph" => Some(0),
            _ => None,
        };
        RecallItem {
            memory_id: 1,
            name: name.to_string(),
            namespace: "global".to_string(),
            memory_type: "fact".to_string(),
            description: "desc".to_string(),
            snippet: "snippet".to_string(),
            distance,
            score: RecallItem::score_from_distance(distance),
            source: source.to_string(),
            graph_depth,
        }
    }

    /// Assembles a `RecallResponse` whose `results` is the direct matches
    /// followed by the graph matches.
    fn make_response(
        query: &str,
        k: usize,
        direct: Vec<RecallItem>,
        graph: Vec<RecallItem>,
        elapsed_ms: u64,
    ) -> RecallResponse {
        let results: Vec<RecallItem> = direct.iter().chain(graph.iter()).cloned().collect();
        RecallResponse {
            query: query.to_string(),
            k,
            direct_matches: direct,
            graph_matches: graph,
            results,
            elapsed_ms,
        }
    }

    // Bug M-A5: every RecallItem carries a non-null cosine similarity score.
    #[test]
    fn recall_item_score_is_present_and_finite_for_direct_match() {
        let serialized =
            serde_json::to_value(&make_item("mem", 0.25, "direct")).expect("serialization failed");
        let score = serialized["score"]
            .as_f64()
            .expect("score must be a number");
        assert!(
            (0.0..=1.0).contains(&score),
            "score must be in [0, 1], got {score}"
        );
        assert!(
            (score - 0.75).abs() < 1e-6,
            "score must equal 1 - distance for canonical case"
        );
    }

    #[test]
    fn recall_item_score_clamps_distance_outside_unit_range() {
        // Pathological distances must not yield score outside [0, 1] or NaN.
        assert_eq!(RecallItem::score_from_distance(2.0), 0.0);
        assert_eq!(RecallItem::score_from_distance(-0.5), 1.0);
        assert_eq!(RecallItem::score_from_distance(f32::NAN), 0.0);
    }

    #[test]
    fn recall_response_serializes_required_fields() {
        let resp = make_response(
            "rust memory",
            5,
            vec![make_item("mem-a", 0.12, "direct")],
            vec![],
            42,
        );

        let json = serde_json::to_value(&resp).expect("serialization failed");
        assert_eq!(json["query"], "rust memory");
        assert_eq!(json["k"], 5);
        assert_eq!(json["elapsed_ms"], 42u64);
        for key in ["direct_matches", "graph_matches", "results"] {
            assert!(json[key].is_array());
        }
    }

    #[test]
    fn recall_item_serializes_renamed_type() {
        let json = serde_json::to_value(&make_item("mem-test", 0.25, "direct"))
            .expect("serialization failed");

        // The memory_type field is renamed to "type" in JSON
        assert_eq!(json["type"], "fact");
        assert_eq!(json["distance"], 0.25f32);
        assert_eq!(json["source"], "direct");
    }

    #[test]
    fn recall_response_results_contains_direct_and_graph() {
        let resp = make_response(
            "query",
            10,
            vec![make_item("d-mem", 0.10, "direct")],
            vec![make_item("g-mem", 0.0, "graph")],
            10,
        );

        let json = serde_json::to_value(&resp).expect("serialization failed");
        assert_eq!(json["direct_matches"].as_array().unwrap().len(), 1);
        assert_eq!(json["graph_matches"].as_array().unwrap().len(), 1);
        assert_eq!(json["results"].as_array().unwrap().len(), 2);
        assert_eq!(json["results"][0]["source"], "direct");
        assert_eq!(json["results"][1]["source"], "graph");
    }

    #[test]
    fn recall_response_empty_serializes_empty_arrays() {
        let resp = make_response("nothing", 3, vec![], vec![], 1);

        let json = serde_json::to_value(&resp).expect("serialization failed");
        assert_eq!(json["direct_matches"].as_array().unwrap().len(), 0);
        assert_eq!(json["results"].as_array().unwrap().len(), 0);
    }

    #[test]
    fn graph_matches_distance_uses_hop_count_proxy() {
        // Verify the hop-count proxy formula: 1.0 - 1.0 / (hop + 1.0)
        //   hop=0 → 0.0   (seed-level entity, identity distance)
        //   hop=1 → 0.5
        //   hop=2 → ≈ 0.667
        //   hop=3 → 0.75
        // The formula is restated here rather than called from production code.
        let cases: [(u32, f32); 4] = [(0, 0.0), (1, 0.5), (2, 0.6667), (3, 0.75)];
        for (hop, expected) in cases {
            let d = 1.0_f32 - 1.0 / (hop as f32 + 1.0);
            assert!(
                (d - expected).abs() < 0.001,
                "hop={hop} expected={expected} got={d}"
            );
        }
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/reclassify.rs:
1| |//! Handler for the `reclassify` CLI subcommand (GAP-18).
2| |//!
3| |//! Reclassifies one entity (single mode) or a whole group of entities (batch
4| |//! mode) by updating the `type` column in the `entities` table.
5| |//!
6| |//! Single mode: `--name <entity>` changes the type of one entity.
7| |//! Batch mode: `--from-type <old> --to-type <new> --batch` changes every
8| |//! entity in the namespace that currently has `<old>` as its type.
9| |
10| |use crate::entity_type::EntityType;
11| |use crate::errors::AppError;
12| |use crate::i18n::errors_msg;
13| |use crate::output::{self, OutputFormat};
14| |use crate::paths::AppPaths;
15| |use crate::storage::connection::open_rw;
16| |use crate::storage::entities;
17| |use rusqlite::params;
18| |use serde::Serialize;
19| |
20| |#[derive(clap::Args)]
21| |#[command(after_long_help = "EXAMPLES:\n \
22| | # Reclassify a single entity from its current type to 'tool'\n \
23| | sqlite-graphrag reclassify --name tokio-runtime --new-type tool\n\n \
24| | # Reclassify all 'concept' entities to 'tool' in one shot (batch)\n \
25| | sqlite-graphrag reclassify --from-type concept --to-type tool --batch\n\n \
26| | # Reclassify in a specific namespace\n \
27| | sqlite-graphrag reclassify --name alice --new-type person --namespace my-project\n\n\
28| |NOTE:\n \
29| | Single mode requires --name and at least one of --new-type or --description.\n \
30| | Batch mode requires --from-type, --to-type and --batch.\n \
31| | Providing --name together with --batch is an error.\n\n\
32| |VALID ENTITY TYPES:\n \
33| | project, tool, person, file, concept, incident, decision,\n \
34| | memory, dashboard, issue_tracker, organization, location, date")]
35| |pub struct ReclassifyArgs {
36| | /// Entity name to reclassify (single mode). Mutually exclusive with --from-type + --batch.
37| | #[arg(long, conflicts_with_all = ["from_type", "batch"])]
38| | pub name: Option<String>,
39| | /// New entity type for single mode.
40| | #[arg(long, value_enum, value_name = "TYPE")]
41| | pub new_type: Option<EntityType>,
42| | /// New description for the entity (single mode only). Ignored in batch mode.
43| | #[arg(long, value_name = "TEXT")]
44| | pub description: Option<String>,
45| | /// Current entity type to match in batch mode. Requires --to-type and --batch.
46| | #[arg(
47| | long,
48| | value_enum,
49| | value_name = "TYPE",
50| | requires = "to_type",
51| | requires = "batch"
52| | )]
53| | pub from_type: Option<EntityType>,
54| | /// New entity type to assign in batch mode. Requires --from-type and --batch.
55| | #[arg(long, value_enum, value_name = "TYPE", requires = "from_type")]
56| | pub to_type: Option<EntityType>,
57| | /// Enable batch reclassification (--from-type to --to-type). Requires --from-type and --to-type.
58| | #[arg(long, default_value_t = false, requires = "from_type")]
59| | pub batch: bool,
60| | #[arg(long)]
61| | pub namespace: Option<String>,
62| | #[arg(long, value_enum, default_value = "json")]
63| | pub format: OutputFormat,
64| | #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
65| | pub json: bool,
66| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
67| | pub db: Option<String>,
68| |}
69| |
70| |#[derive(Serialize)]
71| |struct ReclassifyResponse {
72| | action: String,
73| | count: usize,
74| | #[serde(skip_serializing_if = "Option::is_none")]
75| | description_updated: Option<bool>,
76| | namespace: String,
77| | /// Total execution time in milliseconds from handler start to serialisation.
78| | elapsed_ms: u64,
79| |}
80| |
81| 0|pub fn run(args: ReclassifyArgs) -> Result<(), AppError> {
82| 0| let inicio = std::time::Instant::now();
83| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
84| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
85| |
86| 0| crate::storage::connection::ensure_db_ready(&paths)?;
87| |
88| 0| let mut conn = open_rw(&paths.db)?;
89| |
90| 0| let count = if args.batch {
91| | // Batch mode: --from-type + --to-type + --batch
92| 0| let from_type = args.from_type.ok_or_else(|| {
93| 0| AppError::Validation("--from-type is required in batch mode".to_string())
94| 0| })?;
95| 0| let to_type = args.to_type.ok_or_else(|| {
96| 0| AppError::Validation("--to-type is required in batch mode".to_string())
97| 0| })?;
98| |
99| 0| let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
100| 0| let affected = tx.execute(
101| 0| "UPDATE entities SET type = ?1, updated_at = unixepoch()
102| 0| WHERE type = ?2 AND namespace = ?3",
103| 0| params![to_type.as_str(), from_type.as_str(), namespace],
104| 0| )?;
105| 0| tx.commit()?;
106| 0| if affected == 0 {
107| 0| tracing::warn!(target: "reclassify",
108| 0| from_type = from_type.as_str(),
109| | namespace = %namespace,
110| 0| "reclassify batch matched zero entities — verify --from-type value exists"
111| | );
112| 0| }
113| 0| affected
114| | } else {
115| | // Single mode: --name + --new-type
116| 0| let entity_name = args
117| 0| .name
118| 0| .as_deref()
119| 0| .ok_or_else(|| AppError::Validation("--name is required in single mode".to_string()))?;
120| 0| if args.new_type.is_none() && args.description.is_none() {
121| 0| return Err(AppError::Validation(
122| 0| "at least one of --new-type or --description is required in single mode"
123| 0| .to_string(),
124| 0| ));
125| 0| }
126| |
127| | // Verify entity exists.
128| 0| entities::find_entity_id(&conn, &namespace, entity_name)?.ok_or_else(|| {
129| 0| AppError::NotFound(errors_msg::entity_not_found(entity_name, &namespace))
130| 0| })?;
131| |
132| 0| let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
133| 0| let mut affected = 0;
134| 0| if let Some(new_type) = args.new_type {
135| 0| affected = tx.execute(
136| 0| "UPDATE entities SET type = ?1, updated_at = unixepoch()
137| 0| WHERE name = ?2 AND namespace = ?3",
138| 0| params![new_type.as_str(), entity_name, namespace],
139| 0| )?;
140| 0| }
141| 0| if let Some(ref desc) = args.description {
142| 0| let rows = tx.execute(
143| 0| "UPDATE entities SET description = ?1, updated_at = unixepoch()
144| 0| WHERE name = ?2 AND namespace = ?3",
145| 0| params![desc, entity_name, namespace],
146| 0| )?;
147| 0| if affected == 0 {
148| 0| affected = rows;
149| 0| }
150| 0| }
151| 0| tx.commit()?;
152| 0| affected
153| | };
154| |
155| 0| conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
156| |
157| 0| let response = ReclassifyResponse {
158| 0| action: "reclassified".to_string(),
159| 0| count,
160| 0| description_updated: if args.description.is_some() {
161| 0| Some(true)
162| | } else {
163| 0| None
164| | },
165| 0| namespace: namespace.clone(),
166| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
167| | };
168| |
169| 0| match args.format {
170| 0| OutputFormat::Json => output::emit_json(&response)?,
171| 0| OutputFormat::Text | OutputFormat::Markdown => {
172| 0| output::emit_text(&format!(
173| 0| "reclassified: {} entities [{}]",
174| 0| response.count, response.namespace
175| 0| ));
176| 0| }
177| | }
178| |
179| 0| Ok(())
180| 0|}
181| |
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a response with the fixed `"reclassified"` action used by every test.
    fn response(
        count: usize,
        description_updated: Option<bool>,
        namespace: &str,
        elapsed_ms: u64,
    ) -> ReclassifyResponse {
        ReclassifyResponse {
            action: "reclassified".to_string(),
            count,
            description_updated,
            namespace: namespace.to_string(),
            elapsed_ms,
        }
    }

    #[test]
    fn reclassify_response_serializes_all_fields() {
        let json =
            serde_json::to_value(&response(5, None, "global", 12)).expect("serialization failed");
        assert_eq!(json["action"], "reclassified");
        assert_eq!(json["count"], 5);
        assert_eq!(json["namespace"], "global");
        assert!(json["elapsed_ms"].is_number());
        // `None` must drop the key entirely rather than serialize `null`.
        assert!(json.get("description_updated").is_none());
    }

    #[test]
    fn reclassify_response_count_zero_is_valid() {
        let json = serde_json::to_value(&response(0, None, "my-project", 3))
            .expect("serialization failed");
        assert_eq!(json["count"], 0);
        assert_eq!(json["action"], "reclassified");
    }

    #[test]
    fn reclassify_response_action_is_reclassified() {
        let resp = response(1, None, "ns", 1);
        assert_eq!(resp.action, "reclassified");
    }

    #[test]
    fn reclassify_response_description_updated_present_when_set() {
        let json = serde_json::to_value(&response(1, Some(true), "global", 2))
            .expect("serialization failed");
        assert_eq!(json["description_updated"], true);
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/reclassify_relation.rs:
1| |//! Handler for the `reclassify-relation` CLI subcommand (GAP-13).
2| |//!
3| |//! Renames a relation type in the `relationships` table — either a single
4| |//! directed edge (`--source`, `--target`, `--from-relation`) or every edge of
5| |//! a given type in the namespace (`--batch`).
6| |//!
7| |//! When the rename would produce a duplicate `(source_id, target_id, relation)`
8| |//! triple, `UPDATE OR IGNORE` skips the conflicting row and the subsequent
9| |//! `DELETE` removes it; the count of such skipped rows is reported as
10| |//! `merged_duplicates`.
11| |
12| |use crate::entity_type::EntityType;
13| |use crate::errors::AppError;
14| |use crate::output::{self, OutputFormat};
15| |use crate::paths::AppPaths;
16| |use crate::storage::connection::open_rw;
17| |use rusqlite::params;
18| |use serde::Serialize;
19| |
20| |#[derive(clap::Args)]
21| |#[command(after_long_help = "EXAMPLES:\n \
22| | # Rename a single edge from 'mentions' to 'related'\n \
23| | sqlite-graphrag reclassify-relation --source tokio --target axum \\\n \
24| | --from-relation mentions --to-relation related\n\n \
25| | # Rename every 'mentions' edge in the namespace to 'related'\n \
26| | sqlite-graphrag reclassify-relation \\\n \
27| | --from-relation mentions --to-relation related --batch\n\n \
28| | # Dry-run to preview what would change\n \
29| | sqlite-graphrag reclassify-relation \\\n \
30| | --from-relation mentions --to-relation related --batch --dry-run\n\n \
31| | # Batch rename only edges whose source is a 'tool' entity\n \
32| | sqlite-graphrag reclassify-relation \\\n \
33| | --from-relation uses --to-relation depends_on --batch \\\n \
34| | --filter-source-type tool\n\n\
35| |NOTE:\n \
36| | Single mode requires --source, --target and --from-relation.\n \
37| | Batch mode requires --from-relation, --to-relation and --batch.\n \
38| | --filter-source-type and --filter-target-type are only effective in batch mode.")]
39| |pub struct ReclassifyRelationArgs {
40| | /// Source entity name (single mode). Mutually exclusive with --batch.
41| | #[arg(long, conflicts_with = "batch", value_name = "ENTITY")]
42| | pub source: Option<String>,
43| | /// Target entity name (single mode). Mutually exclusive with --batch.
44| | #[arg(long, conflicts_with = "batch", value_name = "ENTITY")]
45| | pub target: Option<String>,
46| | /// Current relation type to rename. Required in both single and batch modes.
47| | #[arg(long, value_parser = crate::parsers::parse_relation, value_name = "RELATION")]
48| | pub from_relation: String,
49| | /// New relation type to assign. Required in both single and batch modes.
50| | #[arg(long, value_parser = crate::parsers::parse_relation, value_name = "RELATION")]
51| | pub to_relation: String,
52| | /// Enable batch reclassification of all edges with --from-relation. Requires --from-relation and --to-relation.
53| | #[arg(long, default_value_t = false)]
54| | pub batch: bool,
55| | /// Filter batch: only rename edges whose source entity has this type.
56| | #[arg(long, value_enum, value_name = "TYPE", requires = "batch")]
57| | pub filter_source_type: Option<EntityType>,
58| | /// Filter batch: only rename edges whose target entity has this type.
59| | #[arg(long, value_enum, value_name = "TYPE", requires = "batch")]
60| | pub filter_target_type: Option<EntityType>,
61| | /// Preview count without committing changes.
62| | #[arg(long, default_value_t = false)]
63| | pub dry_run: bool,
64| | #[arg(long)]
65| | pub namespace: Option<String>,
66| | #[arg(long, value_enum, default_value = "json")]
67| | pub format: OutputFormat,
68| | #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
69| | pub json: bool,
70| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
71| | pub db: Option<String>,
72| |}
73| |
74| |#[derive(Serialize)]
75| |struct ReclassifyRelationResponse {
76| | action: String,
77| | from_relation: String,
78| | to_relation: String,
79| | /// Number of edges successfully renamed.
80| | count: usize,
81| | /// Edges that collided with an existing (source, target, to_relation) triple
82| | /// and were removed rather than renamed (UPDATE OR IGNORE + DELETE pattern).
83| | merged_duplicates: usize,
84| | namespace: String,
85| | elapsed_ms: u64,
86| |}
87| |
88| 0|pub fn run(args: ReclassifyRelationArgs) -> Result<(), AppError> {
89| 0| let inicio = std::time::Instant::now();
90| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
91| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
92| |
93| 0| crate::storage::connection::ensure_db_ready(&paths)?;
94| |
95| | // Emit warnings for non-canonical relation values.
96| 0| crate::parsers::warn_if_non_canonical(&args.from_relation);
97| 0| crate::parsers::warn_if_non_canonical(&args.to_relation);
98| |
99| | // Reject same-value renames: nothing to do and would silently remove duplicates.
100| 0| if args.from_relation == args.to_relation {
101| 0| return Err(AppError::Validation(
102| 0| "--from-relation and --to-relation must be different".to_string(),
103| 0| ));
104| 0| }
105| |
106| 0| let mut conn = open_rw(&paths.db)?;
107| |
108| 0| if args.batch {
109| 0| run_batch(args, inicio, namespace, &mut conn)
110| | } else {
111| 0| run_single(args, inicio, namespace, &mut conn)
112| | }
113| 0|}
114| |
115| |// ---------------------------------------------------------------------------
116| |// Single mode
117| |// ---------------------------------------------------------------------------
118| |
119| 0|fn run_single(
120| 0| args: ReclassifyRelationArgs,
121| 0| inicio: std::time::Instant,
122| 0| namespace: String,
123| 0| conn: &mut rusqlite::Connection,
124| 0|) -> Result<(), AppError> {
125| 0| let source_name = args.source.as_deref().ok_or_else(|| {
126| 0| AppError::Validation(
127| 0| "--source is required in single mode (omit --batch for single-edge rename)".to_string(),
128| 0| )
129| 0| })?;
130| 0| let target_name = args
131| 0| .target
132| 0| .as_deref()
133| 0| .ok_or_else(|| AppError::Validation("--target is required in single mode".to_string()))?;
134| |
135| | // Resolve entity IDs — fail fast if either side does not exist.
136| | // Normalize names to match the normalized stored entity names.
137| 0| let source_name_norm = crate::parsers::normalize_entity_name(source_name);
138| 0| let target_name_norm = crate::parsers::normalize_entity_name(target_name);
139| 0| let source_id: i64 = conn
140| 0| .query_row(
141| 0| "SELECT id FROM entities WHERE name = ?1 AND namespace = ?2",
142| 0| params![source_name_norm, namespace],
143| 0| |r| r.get(0),
144| | )
145| 0| .map_err(|_| {
146| 0| AppError::NotFound(format!(
147| 0| "source entity '{source_name}' not found in namespace '{namespace}'"
148| 0| ))
149| 0| })?;
150| |
151| 0| let target_id: i64 = conn
152| 0| .query_row(
153| 0| "SELECT id FROM entities WHERE name = ?1 AND namespace = ?2",
154| 0| params![target_name_norm, namespace],
155| 0| |r| r.get(0),
156| | )
157| 0| .map_err(|_| {
158| 0| AppError::NotFound(format!(
159| 0| "target entity '{target_name}' not found in namespace '{namespace}'"
160| 0| ))
161| 0| })?;
162| |
163| | // Verify the edge to rename exists.
164| 0| let original_count: i64 = conn.query_row(
165| 0| "SELECT COUNT(*) FROM relationships
166| 0| WHERE source_id = ?1 AND target_id = ?2 AND relation = ?3 AND namespace = ?4",
167| 0| params![source_id, target_id, args.from_relation, namespace],
168| 0| |r| r.get(0),
169| 0| )?;
170| |
171| 0| if original_count == 0 {
172| 0| return Err(AppError::NotFound(format!(
173| 0| "edge '{source_name}' --[{}]--> '{target_name}' not found in namespace '{namespace}'",
174| 0| args.from_relation
175| 0| )));
176| 0| }
177| |
178| 0| if args.dry_run {
179| 0| emit_response(
180| 0| &args,
181| 0| "dry_run",
182| 0| original_count as usize,
183| | 0,
184| 0| namespace,
185| 0| inicio,
186| 0| )?;
187| 0| return Ok(());
188| 0| }
189| |
190| 0| let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
191| |
192| 0| let updated = tx.execute(
193| 0| "UPDATE OR IGNORE relationships
194| 0| SET relation = ?1
195| 0| WHERE source_id = ?2 AND target_id = ?3 AND relation = ?4 AND namespace = ?5",
196| 0| params![
197| 0| args.to_relation,
198| 0| source_id,
199| 0| target_id,
200| 0| args.from_relation,
201| 0| namespace
202| 0| ],
203| 0| )?;
204| |
205| | // Remove rows that UPDATE OR IGNORE silently skipped due to UNIQUE collision.
206| 0| let deleted = tx.execute(
207| 0| "DELETE FROM relationships
208| 0| WHERE source_id = ?1 AND target_id = ?2 AND relation = ?3 AND namespace = ?4",
209| 0| params![source_id, target_id, args.from_relation, namespace],
210| 0| )?;
211| |
212| 0| tx.commit()?;
213| |
214| 0| conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
215| |
216| 0| let merged = (original_count as usize).saturating_sub(updated + deleted);
217| 0| emit_response(&args, "reclassified", updated, merged, namespace, inicio)
218| 0|}
219| |
220| |// ---------------------------------------------------------------------------
221| |// Batch mode
222| |// ---------------------------------------------------------------------------
223| |
224| 0|fn run_batch(
225| 0| args: ReclassifyRelationArgs,
226| 0| inicio: std::time::Instant,
227| 0| namespace: String,
228| 0| conn: &mut rusqlite::Connection,
229| 0|) -> Result<(), AppError> {
230| | // Build WHERE clause extensions for optional entity-type filters.
231| | // The base query joins relationships with source/target entities.
232| 0| let source_filter = args
233| 0| .filter_source_type
234| 0| .map(|t| format!(" AND src.type = '{}'", t.as_str()))
235| 0| .unwrap_or_default();
236| 0| let target_filter = args
237| 0| .filter_target_type
238| 0| .map(|t| format!(" AND tgt.type = '{}'", t.as_str()))
239| 0| .unwrap_or_default();
240| 0| let has_filters = !source_filter.is_empty() || !target_filter.is_empty();
241| |
242| | // Count edges that would be affected (used for both dry-run and confirmation).
243| 0| let original_count: i64 = if has_filters {
244| 0| conn.query_row(
245| 0| &format!(
246| 0| "SELECT COUNT(*) FROM relationships r
247| 0| JOIN entities src ON src.id = r.source_id
248| 0| JOIN entities tgt ON tgt.id = r.target_id
249| 0| WHERE r.relation = ?1 AND r.namespace = ?2{source_filter}{target_filter}"
250| 0| ),
251| 0| params![args.from_relation, namespace],
252| 0| |r| r.get(0),
253| 0| )?
254| | } else {
255| 0| conn.query_row(
256| 0| "SELECT COUNT(*) FROM relationships
257| 0| WHERE relation = ?1 AND namespace = ?2",
258| 0| params![args.from_relation, namespace],
259| 0| |r| r.get(0),
260| 0| )?
261| | };
262| |
263| 0| if original_count == 0 {
264| 0| tracing::warn!(target: "reclassify_relation",
265| | from_relation = %args.from_relation,
266| | namespace = %namespace,
267| 0| "reclassify-relation batch matched zero edges — verify --from-relation value"
268| | );
269| 0| }
270| |
271| 0| if args.dry_run {
272| 0| emit_response(
273| 0| &args,
274| 0| "dry_run",
275| 0| original_count as usize,
276| | 0,
277| 0| namespace,
278| 0| inicio,
279| 0| )?;
280| 0| return Ok(());
281| 0| }
282| |
283| 0| let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
284| |
285| 0| let updated = if has_filters {
286| | // For filtered batch we need to collect IDs first, then update.
287| 0| let ids: Vec<i64> = {
288| 0| let mut stmt = tx.prepare(&format!(
289| 0| "SELECT r.id FROM relationships r
290| 0| JOIN entities src ON src.id = r.source_id
291| 0| JOIN entities tgt ON tgt.id = r.target_id
292| 0| WHERE r.relation = ?1 AND r.namespace = ?2{source_filter}{target_filter}"
293| 0| ))?;
294| 0| let collected: Vec<i64> = stmt
295| 0| .query_map(params![args.from_relation, namespace], |r| r.get(0))?
296| 0| .collect::<Result<Vec<_>, _>>()?;
297| 0| collected
298| | };
299| |
300| 0| let mut moved: usize = 0;
301| 0| for id in &ids {
302| 0| let n = tx.execute(
303| 0| "UPDATE OR IGNORE relationships
304| 0| SET relation = ?1
305| 0| WHERE id = ?2",
306| 0| params![args.to_relation, id],
307| 0| )?;
308| 0| moved += n;
309| | }
310| 0| moved
311| | } else {
312| 0| tx.execute(
313| 0| "UPDATE OR IGNORE relationships
314| 0| SET relation = ?1
315| 0| WHERE relation = ?2 AND namespace = ?3",
316| 0| params![args.to_relation, args.from_relation, namespace],
317| 0| )?
318| | };
319| |
320| | // Remove rows the UPDATE OR IGNORE left behind (UNIQUE collision survivors).
321| 0| let deleted = if has_filters {
322| 0| tx.execute(
323| 0| &format!(
324| 0| "DELETE FROM relationships WHERE id IN (
325| 0| SELECT r.id FROM relationships r
326| 0| JOIN entities src ON src.id = r.source_id
327| 0| JOIN entities tgt ON tgt.id = r.target_id
328| 0| WHERE r.relation = ?1 AND r.namespace = ?2{source_filter}{target_filter}
329| 0| )"
330| 0| ),
331| 0| params![args.from_relation, namespace],
332| 0| )?
333| | } else {
334| 0| tx.execute(
335| 0| "DELETE FROM relationships WHERE relation = ?1 AND namespace = ?2",
336| 0| params![args.from_relation, namespace],
337| 0| )?
338| | };
339| |
340| 0| tx.commit()?;
341| |
342| 0| conn.execute_batch("ANALYZE relationships;")?;
343| 0| conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
344| |
345| 0| let merged = (original_count as usize).saturating_sub(updated + deleted);
346| 0| emit_response(&args, "reclassified", updated, merged, namespace, inicio)
347| 0|}
348| |
349| |// ---------------------------------------------------------------------------
350| |// Shared response emitter
351| |// ---------------------------------------------------------------------------
352| |
353| 0|fn emit_response(
354| 0| args: &ReclassifyRelationArgs,
355| 0| action: &str,
356| 0| count: usize,
357| 0| merged_duplicates: usize,
358| 0| namespace: String,
359| 0| inicio: std::time::Instant,
360| 0|) -> Result<(), AppError> {
361| 0| let response = ReclassifyRelationResponse {
362| 0| action: action.to_string(),
363| 0| from_relation: args.from_relation.clone(),
364| 0| to_relation: args.to_relation.clone(),
365| 0| count,
366| 0| merged_duplicates,
367| 0| namespace: namespace.clone(),
368| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
369| 0| };
370| |
371| 0| match args.format {
372| 0| OutputFormat::Json => output::emit_json(&response)?,
373| 0| OutputFormat::Text | OutputFormat::Markdown => {
374| 0| output::emit_text(&format!(
375| 0| "{action}: {count} edges '{}' → '{}' [{namespace}] (duplicates merged: {merged_duplicates})",
376| 0| args.from_relation, args.to_relation
377| 0| ));
378| 0| }
379| | }
380| 0| Ok(())
381| 0|}
382| |
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixture: a `mentions` -> `related` response in the `global` namespace with a
    /// fixed `elapsed_ms` of 1.
    fn make_response(action: &str, count: usize, merged: usize) -> ReclassifyRelationResponse {
        ReclassifyRelationResponse {
            action: action.to_owned(),
            from_relation: String::from("mentions"),
            to_relation: String::from("related"),
            count,
            merged_duplicates: merged,
            namespace: String::from("global"),
            elapsed_ms: 1,
        }
    }

    #[test]
    fn response_serializes_all_fields() {
        let response = make_response("reclassified", 5, 0);
        let value = serde_json::to_value(&response).expect("serialization failed");

        let expected_strings = [
            ("action", "reclassified"),
            ("from_relation", "mentions"),
            ("to_relation", "related"),
        ];
        for (key, expected) in expected_strings {
            assert_eq!(value[key], expected);
        }
        assert_eq!(value["count"], 5);
        assert_eq!(value["merged_duplicates"], 0);
        assert_eq!(value["namespace"], "global");
        assert!(value["elapsed_ms"].is_number());
    }

    #[test]
    fn response_action_dry_run() {
        let response = make_response("dry_run", 10, 0);
        let value = serde_json::to_value(&response).expect("serialization failed");
        assert_eq!(value["action"], "dry_run");
        assert_eq!(value["count"], 10);
        assert_eq!(value["merged_duplicates"], 0);
    }

    #[test]
    fn response_merged_duplicates_nonzero() {
        // Models 10 matching edges of which 3 collided with rows that already existed.
        let response = make_response("reclassified", 7, 3);
        let value = serde_json::to_value(&response).expect("serialization failed");
        assert_eq!(value["count"], 7);
        assert_eq!(value["merged_duplicates"], 3);
    }

    #[test]
    fn response_count_zero_when_nothing_matched() {
        let response = make_response("reclassified", 0, 0);
        let value = serde_json::to_value(&response).expect("serialization failed");
        assert_eq!(value["count"], 0);
        assert_eq!(value["merged_duplicates"], 0);
    }

    #[test]
    fn response_action_values_exhaustive() {
        ["reclassified", "dry_run"].iter().for_each(|action| {
            let response = make_response(action, 1, 0);
            let value = serde_json::to_value(&response).expect("serialization");
            assert_eq!(value["action"], *action);
        });
    }

    #[test]
    fn response_from_and_to_relation_present() {
        let response = ReclassifyRelationResponse {
            action: String::from("reclassified"),
            from_relation: String::from("uses"),
            to_relation: String::from("depends_on"),
            count: 3,
            merged_duplicates: 1,
            namespace: String::from("my-project"),
            elapsed_ms: 5,
        };
        let value = serde_json::to_value(&response).expect("serialization failed");
        assert_eq!(value["from_relation"], "uses");
        assert_eq!(value["to_relation"], "depends_on");
    }

    #[test]
    fn same_relation_value_rejected_at_logic_level() {
        // Mirrors the `from == to` condition that run() is expected to guard against;
        // run() itself is not invoked because it needs a database.
        // NOTE(review): this only compares two identical literals, so it cannot fail
        // if the guard is removed — consider an integration test that calls run().
        let from = String::from("mentions");
        let to = String::from("mentions");
        assert!(
            from == to,
            "same-value rename must be caught before DB access"
        );
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/related.rs:
1| |//! Handler for the `related` CLI subcommand.
2| |
3| |use crate::constants::{
4| | DEFAULT_K_RECALL, DEFAULT_MAX_HOPS, DEFAULT_MIN_WEIGHT, TEXT_DESCRIPTION_PREVIEW_LEN,
5| |};
6| |use crate::errors::AppError;
7| |use crate::i18n::errors_msg;
8| |use crate::output::{self, OutputFormat};
9| |use crate::paths::AppPaths;
10| |use crate::storage::connection::open_ro;
11| |use rusqlite::{params, Connection};
12| |use serde::Serialize;
13| |use std::collections::{HashSet, VecDeque};
14| |
/// Identifies whether the seed resolved to a memory or a bare entity.
enum SeedKind {
    /// The name matched a non-deleted row in `memories`; carries that row's `id`.
    Memory(i64),
    /// No memory matched but an entity with that name exists; carries the entity id.
    Entity(i64),
}
20| |
/// Tuple returned by the adjacency fetch: (neighbour_entity_id, source_name,
/// target_name, relation, weight).
///
/// `source_name` and `target_name` always follow the stored direction of the edge,
/// including when the neighbour was reached by walking the edge from target to source.
type Neighbour = (i64, String, String, String, f64);
24| |
// Command-line arguments of the `related` subcommand.
// NOTE: the additions below are plain `//` comments on purpose — clap's derive turns
// `///` doc comments into help text, so doc comments here would change CLI output.
#[derive(clap::Args)]
#[command(after_long_help = "EXAMPLES:\n \
    # List memories connected to a memory via the entity graph (default 2 hops)\n \
    sqlite-graphrag related onboarding\n\n \
    # Increase hop distance and filter by relation type\n \
    sqlite-graphrag related onboarding --max-hops 3 --relation related\n\n \
    # Cap result count and require minimum edge weight\n \
    sqlite-graphrag related onboarding --limit 5 --min-weight 0.5")]
pub struct RelatedArgs {
    /// Memory name as a positional argument. Alternative to `--name`.
    #[arg(
        value_name = "NAME",
        conflicts_with = "name",
        help = "Memory name whose neighbours to traverse; alternative to --name"
    )]
    pub name_positional: Option<String>,
    /// Memory name as a flag. Required when the positional form is absent. Also accepts the alias `--from`.
    #[arg(long, alias = "from")]
    pub name: Option<String>,
    /// Maximum graph hop count. Also accepts the alias `--hops`.
    #[arg(long, alias = "hops", default_value_t = DEFAULT_MAX_HOPS)]
    pub max_hops: u32,
    /// Filter results to a specific relation type. Canonical values:
    /// applies-to, uses, depends-on, causes, fixes, contradicts, supports,
    /// follows, related, mentions, replaces, tracked-in.
    /// Any kebab-case or snake_case string is also accepted as a custom relation.
    #[arg(long, value_parser = crate::parsers::parse_relation)]
    pub relation: Option<String>,
    // Minimum edge weight; edges with `weight` below this value are not traversed.
    #[arg(long, default_value_t = DEFAULT_MIN_WEIGHT)]
    pub min_weight: f64,
    // Upper bound on the number of related memories passed to the traversal.
    #[arg(long, default_value_t = DEFAULT_K_RECALL)]
    pub limit: usize,
    // Namespace override; resolved through `crate::namespace::resolve_namespace`.
    #[arg(long)]
    pub namespace: Option<String>,
    // Output format selector (JSON by default).
    #[arg(long, value_enum, default_value = "json")]
    pub format: OutputFormat,
    // Hidden flag that `run` never reads.
    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
    pub json: bool,
    // Database path; falls back to the SQLITE_GRAPHRAG_DB_PATH environment variable.
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,
}
66| |
/// JSON payload emitted by the `related` subcommand.
#[derive(Serialize)]
struct RelatedResponse {
    /// Echo of the seed memory name resolved from `--name` or the positional argument.
    /// Added in v1.0.35 for input transparency in JSON output.
    name: String,
    /// Echo of the resolved `--max-hops` value (default 2). Added in v1.0.35.
    max_hops: u32,
    /// Related memories in the order produced by the graph traversal.
    results: Vec<RelatedMemory>,
    /// Semantic alias of `results` following the v1.0.66 alias pattern (list has items/memories).
    related_memories: Vec<RelatedMemory>,
    /// Milliseconds elapsed since the handler started.
    elapsed_ms: u64,
}
79| |
/// One memory reached through the entity graph, together with the edge that first
/// reached the entity it is attached to.
#[derive(Serialize, Clone)]
struct RelatedMemory {
    /// `memories.id` of the related memory.
    memory_id: i64,
    /// Name of the related memory.
    name: String,
    /// Namespace stored on the memory row.
    namespace: String,
    /// Memory type; serialized under the JSON key `type`.
    #[serde(rename = "type")]
    memory_type: String,
    /// Description stored on the memory row.
    description: String,
    /// Number of hops between a seed entity and the entity this memory is linked to.
    hop_distance: u32,
    /// Source entity name of the edge that reached this memory's entity, if any.
    source_entity: Option<String>,
    /// Target entity name of the edge that reached this memory's entity, if any.
    target_entity: Option<String>,
    /// Alias of `source_entity` for cross-command consistency (graph, link, deep-research use from/to).
    #[serde(skip_serializing_if = "Option::is_none")]
    from: Option<String>,
    /// Alias of `target_entity` for cross-command consistency.
    #[serde(skip_serializing_if = "Option::is_none")]
    to: Option<String>,
    /// Relation label of that edge, if any.
    relation: Option<String>,
    /// Weight of that edge, if any.
    weight: Option<f64>,
}
100| |
101| 0|pub fn run(args: RelatedArgs) -> Result<(), AppError> {
102| 0| let inicio = std::time::Instant::now();
103| 0| let name = args
104| 0| .name_positional
105| 0| .as_deref()
106| 0| .or(args.name.as_deref())
107| 0| .ok_or_else(|| {
108| 0| AppError::Validation(
109| 0| "name required: pass as positional argument or via --name".to_string(),
110| 0| )
111| 0| })?
112| 0| .to_string();
113| |
114| 0| if name.trim().is_empty() {
115| 0| return Err(AppError::Validation("name must not be empty".to_string()));
116| 0| }
117| |
118| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
119| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
120| |
121| 0| crate::storage::connection::ensure_db_ready(&paths)?;
122| |
123| 0| let conn = open_ro(&paths.db)?;
124| |
125| | // Locate the seed: try memory first, fall back to bare entity.
126| 0| let seed = match conn.query_row(
127| 0| "SELECT id FROM memories WHERE namespace = ?1 AND name = ?2 AND deleted_at IS NULL",
128| 0| params![namespace, name],
129| 0| |r| r.get::<_, i64>(0),
130| | ) {
131| 0| Ok(id) => SeedKind::Memory(id),
132| | Err(rusqlite::Error::QueryReturnedNoRows) => {
133| 0| match crate::storage::entities::find_entity_id(&conn, &namespace, &name)? {
134| 0| Some(id) => SeedKind::Entity(id),
135| | None => {
136| 0| return Err(AppError::NotFound(errors_msg::memory_or_entity_not_found(
137| 0| &name, &namespace,
138| 0| )))
139| | }
140| | }
141| | }
142| 0| Err(e) => return Err(AppError::Database(e)),
143| | };
144| |
145| | // Collect seed entity IDs depending on seed kind.
146| 0| let (seed_memory_id, seed_entity_ids): (i64, Vec<i64>) = match &seed {
147| 0| SeedKind::Memory(id) => {
148| 0| let mem_id = *id;
149| 0| let mut stmt =
150| 0| conn.prepare_cached("SELECT entity_id FROM memory_entities WHERE memory_id = ?1")?;
151| 0| let rows: Vec<i64> = stmt
152| 0| .query_map(params![mem_id], |r| r.get(0))?
153| 0| .collect::<Result<Vec<i64>, _>>()?;
154| 0| (mem_id, rows)
155| | }
156| 0| SeedKind::Entity(entity_id) => {
157| | // For a bare entity seed there is no corresponding memory to skip.
158| | // Use a sentinel -1 so dedup never matches a real memory_id.
159| 0| (-1, vec![*entity_id])
160| | }
161| | };
162| |
163| 0| let relation_filter = args.relation;
164| 0| if let Some(ref r) = relation_filter {
165| 0| crate::parsers::warn_if_non_canonical(r);
166| 0| }
167| 0| let results = traverse_related(
168| 0| &conn,
169| 0| seed_memory_id,
170| 0| &seed_entity_ids,
171| 0| &namespace,
172| 0| args.max_hops,
173| 0| args.min_weight,
174| 0| relation_filter.as_deref(),
175| 0| args.limit,
176| 0| )?;
177| |
178| 0| match args.format {
179| | OutputFormat::Json => {
180| 0| let related_memories = results.clone();
181| 0| output::emit_json(&RelatedResponse {
182| 0| name: name.clone(),
183| 0| max_hops: args.max_hops,
184| 0| results,
185| 0| related_memories,
186| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
187| 0| })?;
188| | }
189| | OutputFormat::Text => {
190| 0| for item in &results {
191| 0| if item.description.is_empty() {
192| 0| output::emit_text(&format!(
193| 0| "{}. {} ({})",
194| 0| item.hop_distance, item.name, item.namespace
195| 0| ));
196| 0| } else {
197| 0| let preview: String = item
198| 0| .description
199| 0| .chars()
200| 0| .take(TEXT_DESCRIPTION_PREVIEW_LEN)
201| 0| .collect();
202| 0| output::emit_text(&format!(
203| 0| "{}. {} ({}): {}",
204| 0| item.hop_distance, item.name, item.namespace, preview
205| 0| ));
206| 0| }
207| | }
208| | }
209| | OutputFormat::Markdown => {
210| 0| for item in &results {
211| 0| if item.description.is_empty() {
212| 0| output::emit_text(&format!(
213| 0| "- **{}** ({}) — hop {}",
214| 0| item.name, item.namespace, item.hop_distance
215| 0| ));
216| 0| } else {
217| 0| let preview: String = item
218| 0| .description
219| 0| .chars()
220| 0| .take(TEXT_DESCRIPTION_PREVIEW_LEN)
221| 0| .collect();
222| 0| output::emit_text(&format!(
223| 0| "- **{}** ({}) — hop {}: {}",
224| 0| item.name, item.namespace, item.hop_distance, preview
225| 0| ));
226| 0| }
227| | }
228| | }
229| | }
230| |
231| 0| Ok(())
232| 0|}
233| |
234| |#[allow(clippy::too_many_arguments)]
235| 4|fn traverse_related(
236| 4| conn: &Connection,
237| 4| seed_memory_id: i64,
238| 4| seed_entity_ids: &[i64],
239| 4| namespace: &str,
240| 4| max_hops: u32,
241| 4| min_weight: f64,
242| 4| relation_filter: Option<&str>,
243| 4| limit: usize,
244| 4|) -> Result<Vec<RelatedMemory>, AppError> {
245| 4| if seed_entity_ids.is_empty() || max_hops == 0 {
^3
246| 2| return Ok(Vec::new());
247| 2| }
248| |
249| | // BFS over entities keeping track of hop distance and the (source, target, relation, weight)
250| | // of the edge that first reached each entity.
251| 2| let mut visited: HashSet<i64> = seed_entity_ids.iter().copied().collect();
252| 2| let mut entity_hop: crate::hash::AHashMap<i64, u32> =
253| 2| crate::hash::AHashMap::with_capacity_and_hasher(max_hops as usize * 10, Default::default());
254| 4| for &e in seed_entity_ids {
^2
255| 2| entity_hop.insert(e, 0);
256| 2| }
257| | // Per-entity edge info: source_name, target_name, relation, weight (captures the FIRST edge
258| | // that reached this entity — equivalent to BFS shortest path recall edge).
259| 2| let mut entity_edge: crate::hash::AHashMap<i64, (String, String, String, f64)> =
260| 2| crate::hash::AHashMap::with_capacity_and_hasher(max_hops as usize * 10, Default::default());
261| |
262| 2| let mut queue: VecDeque<i64> = seed_entity_ids.iter().copied().collect();
263| |
264| 10| while let Some(current_entity) = queue.pop_front() {
^8
265| 8| let current_hop = *entity_hop.get(¤t_entity).unwrap_or(&0);
266| 8| if current_hop >= max_hops {
267| 5| continue;
268| 3| }
269| |
270| 3| let neighbours =
271| 3| fetch_neighbours(conn, current_entity, namespace, min_weight, relation_filter)?;
^0
272| |
273| 10| for (neighbour_id, source_name, target_name, relation, weight) in neighbours {
^7 ^7 ^7 ^7 ^7
274| 7| if visited.insert(neighbour_id) {
275| 6| entity_hop.insert(neighbour_id, current_hop + 1);
276| 6| entity_edge.insert(neighbour_id, (source_name, target_name, relation, weight));
277| 6| queue.push_back(neighbour_id);
278| 6| }
^1
279| | }
280| | }
281| |
282| | // For each discovered entity (hop >= 1) find its memories, skipping the seed memory.
283| 2| let mut out: Vec<RelatedMemory> = Vec::with_capacity(limit);
284| 2| let mut dedup_ids: crate::hash::AHashSet<i64> =
285| 2| crate::hash::AHashSet::with_capacity_and_hasher(limit, Default::default());
286| 2| dedup_ids.insert(seed_memory_id);
287| |
288| | // Sort entities by hop ASC, weight DESC so we emit closer entities first.
289| 2| let mut ordered_entities: Vec<(i64, u32)> = entity_hop
290| 2| .iter()
291| 8| .filter(|(id, _)| !seed_entity_ids.contains(id))
^2
292| 6| .map(|(id, hop)| (*id, *hop))
^2
293| 2| .collect();
294| 4| ordered_entities.sort_by(|a, b| {
^2 ^2
295| 4| let weight_a = entity_edge.get(&a.0).map(|e| e.3).unwrap_or(0.0);
296| 4| let weight_b = entity_edge.get(&b.0).map(|e| e.3).unwrap_or(0.0);
297| 4| a.1.cmp(&b.1).then_with(|| {
298| 4| weight_b
299| 4| .partial_cmp(&weight_a)
300| 4| .unwrap_or(std::cmp::Ordering::Equal)
301| 4| })
302| 4| });
303| |
304| 5| for (entity_id, hop) in ordered_entities {
^4 ^4
305| 4| let mut stmt = conn.prepare_cached(
306| 4| "SELECT m.id, m.name, m.namespace, m.type, m.description
307| 4| FROM memory_entities me
308| 4| JOIN memories m ON m.id = me.memory_id
309| 4| WHERE me.entity_id = ?1 AND m.deleted_at IS NULL",
310| 0| )?;
311| 4| let rows = stmt
312| 4| .query_map(params![entity_id], |r| {
313| | Ok((
314| 4| r.get::<_, i64>(0)?,
^0
315| 4| r.get::<_, String>(1)?,
^0
316| 4| r.get::<_, String>(2)?,
^0
317| 4| r.get::<_, String>(3)?,
^0
318| 4| r.get::<_, String>(4)?,
^0
319| | ))
320| 4| })?
^0
321| 4| .collect::<Result<Vec<_>, _>>()?;
^0
322| |
323| 7| for (mid, name, ns, mtype, desc) in rows {
^4 ^4 ^4 ^4 ^4
324| 4| if !dedup_ids.insert(mid) {
325| 0| continue;
326| 4| }
327| 4| let edge = entity_edge.get(&entity_id);
328| 4| let src = edge.map(|e| e.0.clone());
329| 4| let tgt = edge.map(|e| e.1.clone());
330| 4| out.push(RelatedMemory {
331| 4| memory_id: mid,
332| 4| name,
333| 4| namespace: ns,
334| 4| memory_type: mtype,
335| 4| description: desc,
336| 4| hop_distance: hop,
337| 4| source_entity: src.clone(),
338| 4| target_entity: tgt.clone(),
339| 4| from: src,
340| 4| to: tgt,
341| 4| relation: edge.map(|e| e.2.clone()),
342| 4| weight: edge.map(|e| e.3),
343| | });
344| 4| if out.len() >= limit {
345| 1| return Ok(out);
346| 3| }
347| | }
348| | }
349| |
350| 1| Ok(out)
351| 4|}
352| |
353| 3|fn fetch_neighbours(
354| 3| conn: &Connection,
355| 3| entity_id: i64,
356| 3| namespace: &str,
357| 3| min_weight: f64,
358| 3| relation_filter: Option<&str>,
359| 3|) -> Result<Vec<Neighbour>, AppError> {
360| | // Follow edges in both directions: source -> target and target -> source so traversal is
361| | // undirected, which is how users typically reason about "related" memories.
362| 3| let base_sql = "\
363| 3| SELECT r.target_id, se.name, te.name, r.relation, r.weight
364| 3| FROM relationships r
365| 3| JOIN entities se ON se.id = r.source_id
366| 3| JOIN entities te ON te.id = r.target_id
367| 3| WHERE r.source_id = ?1 AND r.weight >= ?2 AND r.namespace = ?3";
368| |
369| 3| let reverse_sql = "\
370| 3| SELECT r.source_id, se.name, te.name, r.relation, r.weight
371| 3| FROM relationships r
372| 3| JOIN entities se ON se.id = r.source_id
373| 3| JOIN entities te ON te.id = r.target_id
374| 3| WHERE r.target_id = ?1 AND r.weight >= ?2 AND r.namespace = ?3";
375| |
376| 3| let mut results: Vec<Neighbour> = Vec::with_capacity(16);
377| |
378| 3| let forward_sql = match relation_filter {
379| 0| Some(_) => format!("{base_sql} AND r.relation = ?4"),
380| 3| None => base_sql.to_string(),
381| | };
382| 3| let rev_sql = match relation_filter {
383| 0| Some(_) => format!("{reverse_sql} AND r.relation = ?4"),
384| 3| None => reverse_sql.to_string(),
385| | };
386| |
387| 3| let mut stmt = conn.prepare_cached(&forward_sql)?;
^0
388| 3| let rows: Vec<_> = if let Some(rel) = relation_filter {
^0
389| 0| stmt.query_map(params![entity_id, min_weight, namespace, rel], |r| {
390| | Ok((
391| 0| r.get::<_, i64>(0)?,
392| 0| r.get::<_, String>(1)?,
393| 0| r.get::<_, String>(2)?,
394| 0| r.get::<_, String>(3)?,
395| 0| r.get::<_, f64>(4)?,
396| | ))
397| 0| })?
398| 0| .collect::<Result<Vec<_>, _>>()?
399| | } else {
400| 6| stmt.query_map(params![entity_id, min_weight, namespace], |r| {
^3 ^3 ^3
401| | Ok((
402| 6| r.get::<_, i64>(0)?,
^0
403| 6| r.get::<_, String>(1)?,
^0
404| 6| r.get::<_, String>(2)?,
^0
405| 6| r.get::<_, String>(3)?,
^0
406| 6| r.get::<_, f64>(4)?,
^0
407| | ))
408| 6| })?
^0
409| 3| .collect::<Result<Vec<_>, _>>()?
^0
410| | };
411| 3| results.extend(rows);
412| |
413| 3| let mut stmt = conn.prepare_cached(&rev_sql)?;
^0
414| 3| let rows: Vec<_> = if let Some(rel) = relation_filter {
^0
415| 0| stmt.query_map(params![entity_id, min_weight, namespace, rel], |r| {
416| | Ok((
417| 0| r.get::<_, i64>(0)?,
418| 0| r.get::<_, String>(1)?,
419| 0| r.get::<_, String>(2)?,
420| 0| r.get::<_, String>(3)?,
421| 0| r.get::<_, f64>(4)?,
422| | ))
423| 0| })?
424| 0| .collect::<Result<Vec<_>, _>>()?
425| | } else {
426| 3| stmt.query_map(params![entity_id, min_weight, namespace], |r| {
^1
427| | Ok((
428| 1| r.get::<_, i64>(0)?,
^0
429| 1| r.get::<_, String>(1)?,
^0
430| 1| r.get::<_, String>(2)?,
^0
431| 1| r.get::<_, String>(3)?,
^0
432| 1| r.get::<_, f64>(4)?,
^0
433| | ))
434| 1| })?
^0
435| 3| .collect::<Result<Vec<_>, _>>()?
^0
436| | };
437| 3| results.extend(rows);
438| |
439| 3| Ok(results)
440| 3|}
441| |
#[cfg(test)]
mod tests {
    use super::*;

    /// Creates an in-memory database with the minimal tables the traversal reads.
    fn setup_related_db() -> rusqlite::Connection {
        let conn = rusqlite::Connection::open_in_memory().expect("failed to open in-memory db");
        conn.execute_batch(
            "CREATE TABLE memories (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT NOT NULL,
                namespace TEXT NOT NULL DEFAULT 'global',
                type TEXT NOT NULL DEFAULT 'fact',
                description TEXT NOT NULL DEFAULT '',
                deleted_at INTEGER
            );
            CREATE TABLE entities (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                namespace TEXT NOT NULL,
                name TEXT NOT NULL
            );
            CREATE TABLE relationships (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                namespace TEXT NOT NULL,
                source_id INTEGER NOT NULL,
                target_id INTEGER NOT NULL,
                relation TEXT NOT NULL DEFAULT 'related_to',
                weight REAL NOT NULL DEFAULT 1.0
            );
            CREATE TABLE memory_entities (
                memory_id INTEGER NOT NULL,
                entity_id INTEGER NOT NULL
            );",
        )
        .expect("failed to create test tables");
        conn
    }

    /// Inserts a memory row and returns its id.
    fn insert_memory(conn: &rusqlite::Connection, name: &str, namespace: &str) -> i64 {
        conn.execute(
            "INSERT INTO memories (name, namespace) VALUES (?1, ?2)",
            rusqlite::params![name, namespace],
        )
        .expect("failed to insert memory");
        conn.last_insert_rowid()
    }

    /// Inserts an entity row and returns its id.
    fn insert_entity(conn: &rusqlite::Connection, name: &str, namespace: &str) -> i64 {
        conn.execute(
            "INSERT INTO entities (name, namespace) VALUES (?1, ?2)",
            rusqlite::params![name, namespace],
        )
        .expect("failed to insert entity");
        conn.last_insert_rowid()
    }

    /// Links a memory to an entity.
    fn link_memory_entity(conn: &rusqlite::Connection, memory_id: i64, entity_id: i64) {
        conn.execute(
            "INSERT INTO memory_entities (memory_id, entity_id) VALUES (?1, ?2)",
            rusqlite::params![memory_id, entity_id],
        )
        .expect("failed to link memory-entity");
    }

    /// Inserts a directed edge `source_id -> target_id`.
    fn insert_relationship(
        conn: &rusqlite::Connection,
        namespace: &str,
        source_id: i64,
        target_id: i64,
        relation: &str,
        weight: f64,
    ) {
        conn.execute(
            "INSERT INTO relationships (namespace, source_id, target_id, relation, weight)
             VALUES (?1, ?2, ?3, ?4, ?5)",
            rusqlite::params![namespace, source_id, target_id, relation, weight],
        )
        .expect("failed to insert relationship");
    }

    #[test]
    fn related_response_serializes_results_and_elapsed_ms() {
        let mem = RelatedMemory {
            memory_id: 1,
            name: "neighbor-mem".to_string(),
            namespace: "global".to_string(),
            memory_type: "document".to_string(),
            description: "desc".to_string(),
            hop_distance: 1,
            source_entity: Some("entity-a".to_string()),
            target_entity: Some("entity-b".to_string()),
            from: Some("entity-a".to_string()),
            to: Some("entity-b".to_string()),
            relation: Some("related_to".to_string()),
            weight: Some(0.9),
        };
        let resp = RelatedResponse {
            name: "seed-mem".to_string(),
            max_hops: 2,
            related_memories: vec![mem.clone()],
            results: vec![mem],
            elapsed_ms: 7,
        };

        let json = serde_json::to_value(&resp).expect("serialization failed");
        assert!(json["results"].is_array());
        assert_eq!(json["results"].as_array().unwrap().len(), 1);
        assert_eq!(json["elapsed_ms"], 7u64);
        assert_eq!(json["results"][0]["type"], "document");
        assert_eq!(json["results"][0]["hop_distance"], 1);
        // Input echoes and the `related_memories` alias are part of the contract too.
        assert_eq!(json["name"], "seed-mem");
        assert_eq!(json["max_hops"], 2);
        assert_eq!(json["related_memories"], json["results"]);
        assert_eq!(json["results"][0]["from"], "entity-a");
        assert_eq!(json["results"][0]["to"], "entity-b");
    }

    #[test]
    fn traverse_related_returns_empty_without_seed_entities() {
        let conn = setup_related_db();
        let result = traverse_related(&conn, 1, &[], "global", 2, 0.0, None, 10)
            .expect("traverse_related failed");
        assert!(
            result.is_empty(),
            "no seed entities must yield empty results"
        );
    }

    #[test]
    fn traverse_related_returns_empty_with_max_hops_zero() {
        let conn = setup_related_db();
        let mem_id = insert_memory(&conn, "seed-mem", "global");
        let ent_id = insert_entity(&conn, "ent-a", "global");
        link_memory_entity(&conn, mem_id, ent_id);

        let result = traverse_related(&conn, mem_id, &[ent_id], "global", 0, 0.0, None, 10)
            .expect("traverse_related failed");
        assert!(result.is_empty(), "max_hops=0 must return empty");
    }

    #[test]
    fn traverse_related_discovers_neighbor_memory_via_graph() {
        let conn = setup_related_db();

        let seed_id = insert_memory(&conn, "seed-mem", "global");
        let neighbor_id = insert_memory(&conn, "neighbor-mem", "global");
        let ent_a = insert_entity(&conn, "ent-a", "global");
        let ent_b = insert_entity(&conn, "ent-b", "global");

        link_memory_entity(&conn, seed_id, ent_a);
        link_memory_entity(&conn, neighbor_id, ent_b);
        insert_relationship(&conn, "global", ent_a, ent_b, "related_to", 1.0);

        let result = traverse_related(&conn, seed_id, &[ent_a], "global", 2, 0.0, None, 10)
            .expect("traverse_related failed");

        assert_eq!(result.len(), 1, "must find 1 neighboring memory");
        assert_eq!(result[0].name, "neighbor-mem");
        assert_eq!(result[0].hop_distance, 1);
        // The edge that reached the neighbour must be reported alongside it.
        assert_eq!(result[0].source_entity.as_deref(), Some("ent-a"));
        assert_eq!(result[0].target_entity.as_deref(), Some("ent-b"));
        assert_eq!(result[0].from, result[0].source_entity);
        assert_eq!(result[0].to, result[0].target_entity);
        assert_eq!(result[0].relation.as_deref(), Some("related_to"));
        assert_eq!(result[0].weight, Some(1.0));
    }

    #[test]
    fn traverse_related_follows_reverse_edges() {
        let conn = setup_related_db();

        let seed_id = insert_memory(&conn, "seed-mem", "global");
        let neighbor_id = insert_memory(&conn, "neighbor-mem", "global");
        let ent_a = insert_entity(&conn, "ent-a", "global");
        let ent_b = insert_entity(&conn, "ent-b", "global");

        link_memory_entity(&conn, seed_id, ent_a);
        link_memory_entity(&conn, neighbor_id, ent_b);
        // The edge points INTO the seed entity; traversal must still cross it.
        insert_relationship(&conn, "global", ent_b, ent_a, "related_to", 1.0);

        let result = traverse_related(&conn, seed_id, &[ent_a], "global", 1, 0.0, None, 10)
            .expect("traverse_related failed");

        assert_eq!(result.len(), 1, "incoming edges must be traversed");
        assert_eq!(result[0].name, "neighbor-mem");
        // Names keep the stored edge direction.
        assert_eq!(result[0].source_entity.as_deref(), Some("ent-b"));
        assert_eq!(result[0].target_entity.as_deref(), Some("ent-a"));
    }

    #[test]
    fn traverse_related_applies_relation_filter() {
        let conn = setup_related_db();

        let seed_id = insert_memory(&conn, "seed", "global");
        let ent_seed = insert_entity(&conn, "ent-seed", "global");
        link_memory_entity(&conn, seed_id, ent_seed);

        let uses_mem = insert_memory(&conn, "uses-mem", "global");
        let uses_ent = insert_entity(&conn, "ent-uses", "global");
        link_memory_entity(&conn, uses_mem, uses_ent);
        insert_relationship(&conn, "global", ent_seed, uses_ent, "uses", 1.0);

        let other_mem = insert_memory(&conn, "other-mem", "global");
        let other_ent = insert_entity(&conn, "ent-other", "global");
        link_memory_entity(&conn, other_mem, other_ent);
        insert_relationship(&conn, "global", ent_seed, other_ent, "related_to", 1.0);

        let result =
            traverse_related(&conn, seed_id, &[ent_seed], "global", 1, 0.0, Some("uses"), 10)
                .expect("traverse_related failed");

        assert_eq!(result.len(), 1, "only the `uses` edge may be followed");
        assert_eq!(result[0].name, "uses-mem");
        assert_eq!(result[0].relation.as_deref(), Some("uses"));
    }

    #[test]
    fn traverse_related_applies_min_weight() {
        let conn = setup_related_db();

        let seed_id = insert_memory(&conn, "seed", "global");
        let ent_seed = insert_entity(&conn, "ent-seed", "global");
        link_memory_entity(&conn, seed_id, ent_seed);

        let weak_mem = insert_memory(&conn, "weak-mem", "global");
        let weak_ent = insert_entity(&conn, "ent-weak", "global");
        link_memory_entity(&conn, weak_mem, weak_ent);
        insert_relationship(&conn, "global", ent_seed, weak_ent, "related_to", 0.2);

        let result = traverse_related(&conn, seed_id, &[ent_seed], "global", 2, 0.5, None, 10)
            .expect("traverse_related failed");

        assert!(
            result.is_empty(),
            "edges below min_weight must not be traversed"
        );
    }

    #[test]
    fn traverse_related_respects_limit() {
        let conn = setup_related_db();

        let seed_id = insert_memory(&conn, "seed", "global");
        let ent_seed = insert_entity(&conn, "ent-seed", "global");
        link_memory_entity(&conn, seed_id, ent_seed);

        for i in 0..5 {
            let mem_id = insert_memory(&conn, &format!("neighbor-{i}"), "global");
            let ent_id = insert_entity(&conn, &format!("ent-{i}"), "global");
            link_memory_entity(&conn, mem_id, ent_id);
            insert_relationship(&conn, "global", ent_seed, ent_id, "related_to", 1.0);
        }

        let result = traverse_related(&conn, seed_id, &[ent_seed], "global", 1, 0.0, None, 3)
            .expect("traverse_related failed");

        // Five candidates exist, so the limit must be hit exactly rather than merely
        // not exceeded.
        assert_eq!(
            result.len(),
            3,
            "limit=3 must constrain to at most 3 results"
        );
        assert!(result.iter().all(|m| m.hop_distance == 1));
    }

    #[test]
    fn related_memory_optional_null_fields_serialized() {
        let mem = RelatedMemory {
            memory_id: 99,
            name: "no-relation".to_string(),
            namespace: "ns".to_string(),
            memory_type: "concept".to_string(),
            description: "".to_string(),
            hop_distance: 2,
            source_entity: None,
            target_entity: None,
            from: None,
            to: None,
            relation: None,
            weight: None,
        };

        let json = serde_json::to_value(&mem).expect("serialization failed");
        assert!(json["source_entity"].is_null());
        assert!(json["target_entity"].is_null());
        assert!(json["relation"].is_null());
        assert!(json["weight"].is_null());
        assert_eq!(json["hop_distance"], 2);
        // `from`/`to` use skip_serializing_if, so they must be absent, not null.
        assert!(json.get("from").is_none());
        assert!(json.get("to").is_none());
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/remember.rs:
1| |//! Handler for the `remember` CLI subcommand.
2| |
3| |use crate::chunking;
4| |use crate::cli::MemoryType;
5| |use crate::entity_type::EntityType;
6| |use crate::errors::AppError;
7| |use crate::i18n::errors_msg;
8| |use crate::output::{self, JsonOutputFormat, RememberResponse};
9| |use crate::paths::AppPaths;
10| |use crate::storage::chunks as storage_chunks;
11| |use crate::storage::connection::{ensure_schema, open_rw};
12| |use crate::storage::entities::{NewEntity, NewRelationship};
13| |use crate::storage::memories::NewMemory;
14| |use crate::storage::{entities, memories, urls as storage_urls, versions};
15| |use serde::Deserialize;
16| |
/// Number of rows written to `memory_chunks` for a body that produced
/// `chunks_created` chunks.
///
/// A body with zero or one chunk is stored directly in the `memories` row, so no
/// chunk row is appended and the result is `0`. A multi-chunk body persists every
/// chunk, so the result equals `chunks_created`.
///
/// Kept as a standalone function so the H-M8 invariant can be unit-tested without
/// running the full handler. The schema for `chunks_persisted` documents this
/// contract explicitly (see `docs/schemas/remember.schema.json`).
fn compute_chunks_persisted(chunks_created: usize) -> usize {
    match chunks_created {
        0 | 1 => 0,
        multiple => multiple,
    }
}
32| |
33| |#[derive(clap::Args)]
34| |#[command(after_long_help = "EXAMPLES:\n \
35| | # Create a memory with inline body\n \
36| | sqlite-graphrag remember --name design-auth --type decision \\\n \
37| | --description \"auth design\" --body \"JWT for stateless auth\"\n\n \
38| | # Create with curated graph via --graph-stdin\n \
39| | echo '{\"body\":\"...\",\"entities\":[],\"relationships\":[]}' | \\\n \
40| | sqlite-graphrag remember --name my-mem --type note --description \"desc\" --graph-stdin\n\n \
41| | # Enable GLiNER NER extraction with --graph-stdin\n \
42| | echo '{\"body\":\"Alice from Microsoft...\",\"entities\":[],\"relationships\":[]}' | \\\n \
43| | sqlite-graphrag remember --name ner-test --type note --description \"test\" \\\n \
44| | --graph-stdin --enable-ner --gliner-variant int8\n\n \
45| | # Idempotent upsert with --force-merge\n \
46| | sqlite-graphrag remember --name my-mem --type note --description \"updated\" \\\n \
47| | --body \"new content\" --force-merge\n\n\
48| |NOTE:\n \
49| | remember does NOT accept positional arguments.\n \
50| | Use --body \"text\" for inline content\n \
51| | Use --body-file path for file content\n \
52| | Use --body-stdin for piped content\n \
53| | Use --graph-stdin for JSON with entities and relationships\n\n\
54| |ENTITY TYPES (for --graph-stdin entities, NOT memory --type):\n \
55| | concept, tool, person, file, project, decision, incident,\n \
56| | organization, location, date, dashboard, issue_tracker, memory\n \
57| | WARNING: reference, skill, document, note, user, feedback are\n \
58| | MEMORY types only — NOT valid for entities.\n \
59| | Mapping: reference→concept, document→file, user→person")]
60| |pub struct RememberArgs {
61| | /// Memory name in kebab-case (lowercase letters, digits, hyphens).
62| | /// Acts as unique key within the namespace; collisions trigger merge or rejection.
63| | #[arg(long)]
64| | pub name: String,
65| | #[arg(
66| | long,
67| | value_enum,
68| | long_help = "Memory kind stored in `memories.type`. Required when creating a new memory. Optional with --force-merge: if omitted the existing memory type is inherited. This is NOT the graph `entity_type` used in `--entities-file`. Valid values: user, feedback, project, reference, decision, incident, skill, document, note."
69| | )]
70| | pub r#type: Option<MemoryType>,
71| | /// Short description (≤500 chars) summarizing the memory for use in `list` and `recall` snippets.
72| | /// Required when creating a new memory. Optional with --force-merge: if omitted the existing description is inherited.
73| | #[arg(long)]
74| | pub description: Option<String>,
75| | /// Inline body content. Mutually exclusive with --body-file, --body-stdin, --graph-stdin.
76| | /// Maximum 512000 bytes; rejected if empty without an external graph.
77| | #[arg(
78| | long,
79| | help = "Inline body content (max 500 KB / 512000 bytes; for larger inputs split into multiple memories or use --body-file)",
80| | conflicts_with_all = ["body_file", "body_stdin", "graph_stdin"]
81| | )]
82| | pub body: Option<String>,
83| | #[arg(
84| | long,
85| | help = "Read body from a file instead of --body",
86| | conflicts_with_all = ["body", "body_stdin", "graph_stdin"]
87| | )]
88| | pub body_file: Option<std::path::PathBuf>,
89| | /// Read body from stdin until EOF. Useful in pipes (echo "..." | sqlite-graphrag remember ...).
90| | /// Mutually exclusive with --body, --body-file, --graph-stdin.
91| | #[arg(
92| | long,
93| | conflicts_with_all = ["body", "body_file", "graph_stdin"]
94| | )]
95| | pub body_stdin: bool,
96| | #[arg(
97| | long,
98| | help = "JSON file containing entities to associate with this memory"
99| | )]
100| | pub entities_file: Option<std::path::PathBuf>,
101| | #[arg(
102| | long,
103| | help = "JSON file containing relationships to associate with this memory"
104| | )]
105| | pub relationships_file: Option<std::path::PathBuf>,
106| | #[arg(
107| | long,
108| | help = "Read graph JSON (body + entities + relationships) from stdin",
109| | conflicts_with_all = [
110| | "body",
111| | "body_file",
112| | "body_stdin",
113| | "entities_file",
114| | "relationships_file"
115| | ]
116| | )]
117| | pub graph_stdin: bool,
118| | #[arg(
119| | long,
120| | help = "Namespace (env: SQLITE_GRAPHRAG_NAMESPACE, default: global)"
121| | )]
122| | pub namespace: Option<String>,
123| | /// Inline JSON object with arbitrary metadata key-value pairs. Mutually exclusive with --metadata-file.
124| | #[arg(long)]
125| | pub metadata: Option<String>,
126| | #[arg(long, help = "JSON file containing metadata key-value pairs")]
127| | pub metadata_file: Option<std::path::PathBuf>,
128| | #[arg(long)]
129| | pub force_merge: bool,
130| | #[arg(
131| | long,
132| | value_name = "EPOCH_OR_RFC3339",
133| | value_parser = crate::parsers::parse_expected_updated_at,
134| | long_help = "Optimistic lock: reject if updated_at does not match. \
135| |Accepts Unix epoch (e.g. 1700000000) or RFC 3339 (e.g. 2026-04-19T12:00:00Z)."
136| | )]
137| | pub expected_updated_at: Option<i64>,
138| | #[arg(
139| | long,
140| | env = "SQLITE_GRAPHRAG_ENABLE_NER",
141| | value_parser = crate::parsers::parse_bool_flexible,
142| | action = clap::ArgAction::Set,
143| | num_args = 0..=1,
144| | default_missing_value = "true",
145| | default_value = "false",
146| | help = "Enable automatic GLiNER NER entity/relationship extraction from body"
147| | )]
148| | pub enable_ner: bool,
149| | #[arg(
150| | long,
151| | env = "SQLITE_GRAPHRAG_GLINER_VARIANT",
152| | default_value = "fp32",
153| | help = "GLiNER model variant: fp32 (1.1GB, best quality), fp16 (580MB), int8 (349MB, fastest but may miss entities on short texts), q4, q4f16"
154| | )]
155| | pub gliner_variant: String,
156| | #[arg(long, hide = true)]
157| | pub skip_extraction: bool,
158| | /// Explicitly clear the body content (set to empty string). Required to distinguish
159| | /// intentional body clearing from accidental omission during --force-merge.
160| | /// Without this flag, an empty body passed to --force-merge preserves the existing body.
161| | #[arg(
162| | long,
163| | default_value_t = false,
164| | help = "Explicitly clear body content during --force-merge (without this flag, an empty body is ignored and the existing body is kept)"
165| | )]
166| | pub clear_body: bool,
167| | /// Validate input and report planned actions without persisting.
168| | #[arg(
169| | long,
170| | default_value_t = false,
171| | help = "Validate input and report planned actions without persisting"
172| | )]
173| | pub dry_run: bool,
174| | /// Optional opaque session identifier for tracing memory provenance across multi-agent runs.
175| | #[arg(long)]
176| | pub session_id: Option<String>,
177| | #[arg(long, value_enum, default_value_t = JsonOutputFormat::Json)]
178| | pub format: JsonOutputFormat,
179| | #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
180| | pub json: bool,
181| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
182| | pub db: Option<String>,
183| | /// Maximum process RSS in MiB; abort if exceeded during embedding.
184| | #[arg(long, default_value_t = crate::constants::DEFAULT_MAX_RSS_MB,
185| | help = "Maximum process RSS in MiB; abort if exceeded during embedding (default: 8192)")]
186| | pub max_rss_mb: u64,
187| | /// Emit a warning (but do not reject) when persisting an entity whose degree would
188| | /// exceed this value after the upsert. Default 50. Set 0 to disable the check.
189| | #[arg(long, default_value_t = 50, value_name = "N")]
190| | pub max_entity_degree: u32,
191| |}
192| |
193| |#[derive(Deserialize, Default)]
194| |#[serde(deny_unknown_fields)]
195| |struct GraphInput {
196| | #[serde(default)]
197| | body: Option<String>,
198| | #[serde(default)]
199| | entities: Vec<NewEntity>,
200| | #[serde(default)]
201| | relationships: Vec<NewRelationship>,
202| |}
203| |
204| 0|fn normalize_and_validate_graph_input(graph: &mut GraphInput) -> Result<(), AppError> {
205| 0| for rel in &mut graph.relationships {
206| 0| rel.relation = crate::parsers::normalize_relation(&rel.relation);
207| 0| if let Err(e) = crate::parsers::validate_relation_format(&rel.relation) {
208| 0| return Err(AppError::Validation(format!(
209| 0| "{e} for relationship '{}' -> '{}'",
210| 0| rel.source, rel.target
211| 0| )));
212| 0| }
213| 0| crate::parsers::warn_if_non_canonical(&rel.relation);
214| 0| if !(0.0..=1.0).contains(&rel.strength) {
215| 0| return Err(AppError::Validation(format!(
216| 0| "invalid strength {} for relationship '{}' -> '{}'; expected value in [0.0, 1.0]",
217| 0| rel.strength, rel.source, rel.target
218| 0| )));
219| 0| }
220| | }
221| |
222| 0| Ok(())
223| 0|}
224| |
225| |#[tracing::instrument(skip_all, level = "debug", name = "remember")]
226| 0|pub fn run(args: RememberArgs) -> Result<(), AppError> {
227| | use crate::constants::*;
228| |
229| 0| let inicio = std::time::Instant::now();
230| 0| let _ = args.format;
231| 0| tracing::debug!(target: "remember", name = %args.name, "persisting memory");
232| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
233| |
234| | // Capture the original `--name` before normalization so the JSON response can
235| | // surface `name_was_normalized` + `original_name` (B_4 in v1.0.32). Stored as
236| | // an owned String because `args.name` is moved into the response below.
237| 0| let original_name = args.name.clone();
238| |
239| | // Auto-normalize to kebab-case before validation (P2-H).
240| | // v1.0.20: also trims hyphens at the boundary (including trailing) to avoid rejection
241| | // after truncation by a long filename ending in a hyphen.
242| 0| let normalized_name = {
243| 0| let lower = args.name.to_lowercase().replace(['_', ' '], "-");
244| 0| let trimmed = lower.trim_matches('-').to_string();
245| 0| if trimmed != args.name {
246| 0| tracing::warn!(target: "remember",
247| | original = %args.name,
248| | normalized = %trimmed,
249| 0| "name auto-normalized to kebab-case"
250| | );
251| 0| }
252| 0| trimmed
253| | };
254| 0| let name_was_normalized = normalized_name != original_name;
255| |
256| 0| if normalized_name.is_empty() {
257| 0| return Err(AppError::Validation(
258| 0| "name cannot be empty after normalization (input was blank or contained only hyphens/underscores/spaces)".to_string(),
259| 0| ));
260| 0| }
261| 0| if normalized_name.len() > MAX_MEMORY_NAME_LEN {
262| 0| return Err(AppError::LimitExceeded(
263| 0| crate::i18n::validation::name_length(MAX_MEMORY_NAME_LEN),
264| 0| ));
265| 0| }
266| |
267| 0| if normalized_name.starts_with("__") {
268| 0| return Err(AppError::Validation(
269| 0| crate::i18n::validation::reserved_name(),
270| 0| ));
271| 0| }
272| |
273| | {
274| 0| let slug_re = crate::constants::name_slug_regex();
275| 0| if !slug_re.is_match(&normalized_name) {
276| 0| return Err(AppError::Validation(crate::i18n::validation::name_kebab(
277| 0| &normalized_name,
278| 0| )));
279| 0| }
280| | }
281| |
282| 0| if let Some(ref desc) = args.description {
283| 0| if desc.len() > MAX_MEMORY_DESCRIPTION_LEN {
284| 0| return Err(AppError::Validation(
285| 0| crate::i18n::validation::description_exceeds(MAX_MEMORY_DESCRIPTION_LEN),
286| 0| ));
287| 0| }
288| 0| }
289| |
290| 0| let mut raw_body = if let Some(b) = args.body {
291| 0| b
292| 0| } else if let Some(ref path) = args.body_file {
293| 0| let file_size = std::fs::metadata(path).map_err(AppError::Io)?.len();
294| 0| if file_size > MAX_MEMORY_BODY_LEN as u64 {
295| 0| return Err(AppError::LimitExceeded(
296| 0| crate::i18n::validation::body_exceeds(MAX_MEMORY_BODY_LEN),
297| 0| ));
298| 0| }
299| 0| match std::fs::read_to_string(path) {
300| 0| Ok(s) => s,
301| 0| Err(e) if e.kind() == std::io::ErrorKind::InvalidData => {
302| 0| let bytes = std::fs::read(path).map_err(AppError::Io)?;
303| 0| tracing::warn!(target: "remember", "body file contains invalid UTF-8; replacing invalid sequences");
304| 0| String::from_utf8_lossy(&bytes).into_owned()
305| | }
306| 0| Err(e) => return Err(AppError::Io(e)),
307| | }
308| 0| } else if args.body_stdin || args.graph_stdin {
309| 0| crate::stdin_helper::read_stdin_with_timeout(60)?
310| | } else {
311| 0| String::new()
312| | };
313| |
314| 0| let mut entities_provided_externally =
315| 0| args.entities_file.is_some() || args.relationships_file.is_some();
316| |
317| 0| let mut graph = GraphInput::default();
318| 0| if let Some(path) = args.entities_file {
319| 0| let file_size = std::fs::metadata(&path).map_err(AppError::Io)?.len();
320| 0| if file_size > MAX_MEMORY_BODY_LEN as u64 {
321| 0| return Err(AppError::LimitExceeded(
322| 0| crate::i18n::validation::body_exceeds(MAX_MEMORY_BODY_LEN),
323| 0| ));
324| 0| }
325| 0| let content = std::fs::read_to_string(&path).map_err(AppError::Io)?;
326| 0| graph.entities = serde_json::from_str(&content)?;
327| 0| }
328| 0| if let Some(path) = args.relationships_file {
329| 0| let file_size = std::fs::metadata(&path).map_err(AppError::Io)?.len();
330| 0| if file_size > MAX_MEMORY_BODY_LEN as u64 {
331| 0| return Err(AppError::LimitExceeded(
332| 0| crate::i18n::validation::body_exceeds(MAX_MEMORY_BODY_LEN),
333| 0| ));
334| 0| }
335| 0| let content = std::fs::read_to_string(&path).map_err(AppError::Io)?;
336| 0| graph.relationships = serde_json::from_str(&content)?;
337| 0| }
338| 0| if args.graph_stdin {
339| 0| graph = serde_json::from_str::<GraphInput>(&raw_body).map_err(|e| {
340| 0| AppError::Validation(format!("invalid JSON payload on --graph-stdin: {e}"))
341| 0| })?;
342| 0| raw_body = graph.body.take().unwrap_or_default();
343| 0| }
344| 0| if args.graph_stdin && !graph.entities.is_empty() {
345| 0| entities_provided_externally = true;
346| 0| }
347| |
348| 0| if graph.entities.len() > max_entities_per_memory() {
349| 0| return Err(AppError::LimitExceeded(errors_msg::entity_limit_exceeded(
350| 0| max_entities_per_memory(),
351| 0| )));
352| 0| }
353| 0| let mut relationships_truncated = false;
354| 0| let rel_cap = max_relationships_per_memory();
355| 0| if graph.relationships.len() > rel_cap {
356| 0| tracing::warn!(target: "remember",
357| 0| count = graph.relationships.len(),
358| | cap = rel_cap,
359| 0| "truncating relationships to cap"
360| | );
361| 0| graph.relationships.truncate(rel_cap);
362| 0| relationships_truncated = true;
363| 0| }
364| 0| normalize_and_validate_graph_input(&mut graph)?;
365| |
366| 0| if raw_body.len() > MAX_MEMORY_BODY_LEN {
367| 0| return Err(AppError::LimitExceeded(
368| 0| crate::i18n::validation::body_exceeds(MAX_MEMORY_BODY_LEN),
369| 0| ));
370| 0| }
371| |
372| | // v1.0.22 P1: reject empty or whitespace-only body when no external graph is provided.
373| | // Without this check, empty embeddings would be persisted, breaking recall semantics.
374| | // GAP-08: skip this guard when --force-merge without --clear-body; the existing body
375| | // will be preserved from the database, so the effective body will not be empty.
376| 0| let body_will_be_preserved = args.force_merge && raw_body.trim().is_empty() && !args.clear_body;
377| 0| if !entities_provided_externally
378| 0| && graph.entities.is_empty()
379| 0| && raw_body.trim().is_empty()
380| 0| && !body_will_be_preserved
381| 0| && !args.clear_body
382| | {
383| 0| return Err(AppError::Validation(crate::i18n::validation::empty_body()));
384| 0| }
385| |
386| 0| let metadata: serde_json::Value = if let Some(m) = args.metadata {
387| 0| serde_json::from_str(&m)?
388| 0| } else if let Some(path) = args.metadata_file {
389| 0| let file_size = std::fs::metadata(&path).map_err(AppError::Io)?.len();
390| 0| if file_size > MAX_MEMORY_BODY_LEN as u64 {
391| 0| return Err(AppError::LimitExceeded(
392| 0| crate::i18n::validation::body_exceeds(MAX_MEMORY_BODY_LEN),
393| 0| ));
394| 0| }
395| 0| let content = std::fs::read_to_string(&path).map_err(AppError::Io)?;
396| 0| serde_json::from_str(&content)?
397| | } else {
398| 0| serde_json::json!({})
399| | };
400| |
401| 0| let mut body_hash = blake3::hash(raw_body.as_bytes()).to_hex().to_string();
402| 0| let mut snippet: String = raw_body.chars().take(200).collect();
403| |
404| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
405| 0| paths.ensure_dirs()?;
406| |
407| | // v1.0.20: use .trim().is_empty() to reject bodies that are only whitespace.
408| 0| let mut extraction_method: Option<String> = None;
409| 0| let mut extracted_urls: Vec<crate::extraction::ExtractedUrl> = Vec::with_capacity(4);
410| 0| if args.enable_ner && args.skip_extraction {
411| 0| return Err(AppError::Validation(
412| 0| "--enable-ner and --skip-extraction are mutually exclusive; remove one".to_string(),
413| 0| ));
414| 0| }
415| 0| if args.skip_extraction && !args.enable_ner {
416| 0| return Err(AppError::Validation(
417| 0| "--skip-extraction is deprecated since v1.0.45 and has no effect; remove this flag"
418| 0| .to_string(),
419| 0| ));
420| 0| }
421| 0| let gliner_variant: crate::extraction::GlinerVariant =
422| 0| args.gliner_variant.parse().unwrap_or_else(|e| {
423| 0| tracing::warn!(target: "remember", error = %e, "invalid --gliner-variant, defaulting to fp32");
424| 0| crate::extraction::GlinerVariant::Fp32
425| 0| });
426| 0| if args.enable_ner && graph.entities.is_empty() && !raw_body.trim().is_empty() {
427| 0| match crate::extraction::extract_graph_auto(&raw_body, &paths, gliner_variant) {
428| 0| Ok(extracted) => {
429| 0| extraction_method = Some(extracted.extraction_method.clone());
430| 0| extracted_urls = extracted.urls;
431| 0| graph.entities = extracted.entities;
432| 0| graph.relationships = extracted.relationships;
433| 0| relationships_truncated = extracted.relationships_truncated;
434| |
435| 0| if graph.entities.len() > max_entities_per_memory() {
436| 0| graph.entities.truncate(max_entities_per_memory());
437| 0| }
438| 0| if graph.relationships.len() > max_relationships_per_memory() {
439| 0| relationships_truncated = true;
440| 0| graph.relationships.truncate(max_relationships_per_memory());
441| 0| }
442| 0| normalize_and_validate_graph_input(&mut graph)?;
443| | }
444| 0| Err(e) => {
445| 0| tracing::warn!(target: "remember", error = %e, "auto-extraction failed, graceful degradation");
446| 0| extraction_method = Some("none:extraction-failed".to_string());
447| | }
448| | }
449| 0| }
450| |
451| 0| let mut conn = open_rw(&paths.db)?;
452| 0| ensure_schema(&mut conn)?;
453| |
454| | // --dry-run: emit planned action without any DB writes and return.
455| 0| if args.dry_run {
456| 0| let existing = memories::find_by_name(&conn, &namespace, &normalized_name)?;
457| 0| let planned_action = if existing.is_some() && args.force_merge {
458| 0| "would_update"
459| | } else {
460| 0| "would_create"
461| | };
462| 0| output::emit_json(&serde_json::json!({
463| 0| "dry_run": true,
464| 0| "name": normalized_name,
465| 0| "namespace": namespace,
466| 0| "planned_action": planned_action,
467| 0| }))?;
468| 0| return Ok(());
469| 0| }
470| |
471| | {
472| | use crate::constants::MAX_NAMESPACES_ACTIVE;
473| 0| let active_count: u32 = conn.query_row(
474| 0| "SELECT COUNT(DISTINCT namespace) FROM memories WHERE deleted_at IS NULL",
475| 0| [],
476| 0| |r| r.get::<_, i64>(0).map(|v| v as u32),
477| 0| )?;
478| 0| let ns_exists: bool = conn.query_row(
479| 0| "SELECT EXISTS(SELECT 1 FROM memories WHERE namespace = ?1 AND deleted_at IS NULL)",
480| 0| rusqlite::params![namespace],
481| 0| |r| r.get::<_, i64>(0).map(|v| v > 0),
482| 0| )?;
483| 0| if !ns_exists && active_count >= MAX_NAMESPACES_ACTIVE {
484| 0| return Err(AppError::NamespaceError(format!(
485| 0| "active namespace limit of {MAX_NAMESPACES_ACTIVE} reached while trying to create '{namespace}'"
486| 0| )));
487| 0| }
488| | }
489| |
490| | // M7: detect soft-deleted memory before the standard duplicate check.
491| 0| if let Some((sd_id, true)) =
492| 0| memories::find_by_name_any_state(&conn, &namespace, &normalized_name)?
493| | {
494| 0| if args.force_merge {
495| 0| memories::clear_deleted_at(&conn, sd_id)?;
496| | } else {
497| 0| return Err(AppError::Duplicate(
498| 0| errors_msg::duplicate_memory_soft_deleted(&normalized_name, &namespace),
499| 0| ));
500| | }
501| 0| }
502| |
503| 0| let existing_memory = memories::find_by_name(&conn, &namespace, &normalized_name)?;
504| 0| if existing_memory.is_some() && !args.force_merge {
505| 0| return Err(AppError::Duplicate(errors_msg::duplicate_memory(
506| 0| &normalized_name,
507| 0| &namespace,
508| 0| )));
509| 0| }
510| |
511| | // GAP-10: resolve type and description.
512| | // For CREATE path (new memory): both are required.
513| | // For UPDATE path (--force-merge on existing memory): inherit from existing row when omitted.
514| 0| let (resolved_type, resolved_description) = if existing_memory.is_none() {
515| | // CREATE path — both fields are mandatory.
516| 0| let t = args.r#type.ok_or_else(|| {
517| 0| AppError::Validation(
518| 0| "--type and --description are required when creating a new memory".to_string(),
519| 0| )
520| 0| })?;
521| 0| let d = args.description.clone().ok_or_else(|| {
522| 0| AppError::Validation(
523| 0| "--type and --description are required when creating a new memory".to_string(),
524| 0| )
525| 0| })?;
526| 0| (t.as_str().to_string(), d)
527| | } else {
528| | // UPDATE path (--force-merge) — inherit missing fields from stored row.
529| 0| let existing_row = memories::read_by_name(&conn, &namespace, &normalized_name)?
530| 0| .ok_or_else(|| {
531| 0| AppError::NotFound(format!(
532| 0| "memory '{normalized_name}' not found in namespace '{namespace}'"
533| 0| ))
534| 0| })?;
535| 0| let t = args
536| 0| .r#type
537| 0| .map(|v| v.as_str().to_string())
538| 0| .unwrap_or_else(|| existing_row.memory_type.clone());
539| 0| let d = args
540| 0| .description
541| 0| .clone()
542| 0| .unwrap_or_else(|| existing_row.description.clone());
543| 0| (t, d)
544| | };
545| |
546| | // GAP-08/GAP-09: protect existing body from accidental destruction during --force-merge.
547| | // When the caller omits a body (or passes an empty one) without --clear-body, silently
548| | // preserve the existing body from the database. This prevents a common scripting mistake
549| | // where a cron job updates metadata fields and inadvertently wipes the stored content.
550| 0| if body_will_be_preserved {
551| 0| if let Some(existing_row) = memories::read_by_name(&conn, &namespace, &normalized_name)? {
552| 0| if !existing_row.body.is_empty() {
553| 0| tracing::debug!(target: "remember",
554| | name = %normalized_name,
555| 0| "GAP-08: empty body with --force-merge and no --clear-body; preserving existing body"
556| | );
557| 0| raw_body = existing_row.body;
558| 0| body_hash = blake3::hash(raw_body.as_bytes()).to_hex().to_string();
559| 0| snippet = raw_body.chars().take(200).collect();
560| 0| }
561| 0| }
562| 0| }
563| |
564| 0| let duplicate_hash_id = memories::find_by_hash(&conn, &namespace, &body_hash)?;
565| |
566| 0| output::emit_progress_i18n(
567| 0| &format!(
568| 0| "Remember stage: validated input; available memory {} MB",
569| 0| crate::memory_guard::available_memory_mb()
570| 0| ),
571| 0| &format!(
572| 0| "Stage remember: input validated; available memory {} MB",
573| 0| crate::memory_guard::available_memory_mb()
574| 0| ),
575| | );
576| |
577| 0| let tokenizer = crate::tokenizer::get_tokenizer(&paths.models)?;
578| 0| let model_max_length = crate::tokenizer::get_model_max_length(&paths.models)?;
579| 0| let total_passage_tokens = crate::tokenizer::count_passage_tokens(tokenizer, &raw_body)?;
580| 0| let chunks_info = chunking::split_into_chunks_hierarchical(&raw_body, tokenizer);
581| 0| let chunks_created = chunks_info.len();
582| | // For single-chunk bodies the memory row itself stores the content and no
583| | // entry is appended to `memory_chunks` (see line ~545). For multi-chunk
584| | // bodies every chunk is persisted via `insert_chunk_slices`.
585| 0| let chunks_persisted = compute_chunks_persisted(chunks_info.len());
586| |
587| 0| output::emit_progress_i18n(
588| 0| &format!(
589| 0| "Remember stage: tokenizer counted {total_passage_tokens} passage tokens (model max {model_max_length}); chunking produced {} chunks; process RSS {} MB",
590| 0| chunks_created,
591| 0| crate::memory_guard::current_process_memory_mb().unwrap_or(0)
592| 0| ),
593| 0| &format!(
594| 0| "Stage remember: tokenizer counted {total_passage_tokens} passage tokens (model max {model_max_length}); chunking produced {} chunks; process RSS {} MB",
595| 0| chunks_created,
596| 0| crate::memory_guard::current_process_memory_mb().unwrap_or(0)
597| 0| ),
598| | );
599| |
600| 0| if chunks_created > crate::constants::REMEMBER_MAX_SAFE_MULTI_CHUNKS {
601| 0| return Err(AppError::LimitExceeded(format!(
602| 0| "document produces {chunks_created} chunks; current safe operational limit is {} chunks; split the document before using remember",
603| 0| crate::constants::REMEMBER_MAX_SAFE_MULTI_CHUNKS
604| 0| )));
605| 0| }
606| |
607| 0| output::emit_progress_i18n("Computing embedding...", "Calculando embedding...");
608| 0| let mut chunk_embeddings_cache: Option<Vec<Vec<f32>>> = None;
609| |
610| 0| let embedding = if chunks_info.len() == 1 {
611| 0| crate::daemon::embed_passage_or_local(&paths.models, &raw_body)?
612| | } else {
613| 0| let chunk_texts: Vec<&str> = chunks_info
614| 0| .iter()
615| 0| .map(|c| chunking::chunk_text(&raw_body, c))
616| 0| .collect();
617| 0| output::emit_progress_i18n(
618| 0| &format!(
619| 0| "Embedding {} chunks serially to keep memory bounded...",
620| 0| chunks_info.len()
621| 0| ),
622| 0| &format!(
623| 0| "Embedding {} chunks serially to keep memory bounded...",
624| 0| chunks_info.len()
625| 0| ),
626| | );
627| 0| let embed_cap = chunk_texts.len();
628| 0| let mut chunk_embeddings = Vec::new();
629| 0| chunk_embeddings.try_reserve(embed_cap).map_err(|_| {
630| 0| AppError::LimitExceeded(format!(
631| 0| "allocation of {embed_cap} chunk embeddings would exceed available memory"
632| 0| ))
633| 0| })?;
634| 0| for chunk_text in &chunk_texts {
635| 0| if let Some(rss) = crate::memory_guard::current_process_memory_mb() {
636| 0| if rss > args.max_rss_mb {
637| 0| tracing::error!(target: "remember",
638| | rss_mb = rss,
639| | max_rss_mb = args.max_rss_mb,
640| 0| "RSS exceeded --max-rss-mb threshold; aborting to prevent system instability"
641| | );
642| 0| return Err(AppError::LowMemory {
643| 0| available_mb: crate::memory_guard::available_memory_mb(),
644| 0| required_mb: args.max_rss_mb,
645| 0| });
646| 0| }
647| 0| }
648| 0| chunk_embeddings.push(crate::daemon::embed_passage_or_local(
649| 0| &paths.models,
650| 0| chunk_text,
651| 0| )?);
652| | }
653| 0| output::emit_progress_i18n(
654| 0| &format!(
655| 0| "Remember stage: chunk embeddings complete; process RSS {} MB",
656| 0| crate::memory_guard::current_process_memory_mb().unwrap_or(0)
657| 0| ),
658| 0| &format!(
659| 0| "Stage remember: chunk embeddings completed; process RSS {} MB",
660| 0| crate::memory_guard::current_process_memory_mb().unwrap_or(0)
661| 0| ),
662| | );
663| 0| let aggregated = chunking::aggregate_embeddings(&chunk_embeddings);
664| 0| chunk_embeddings_cache = Some(chunk_embeddings);
665| 0| aggregated
666| | };
667| 0| let body_for_storage = raw_body;
668| |
669| 0| let memory_type = resolved_type.as_str();
670| 0| let new_memory = NewMemory {
671| 0| namespace: namespace.clone(),
672| 0| name: normalized_name.clone(),
673| 0| memory_type: memory_type.to_string(),
674| 0| description: resolved_description.clone(),
675| 0| body: body_for_storage,
676| 0| body_hash: body_hash.clone(),
677| 0| session_id: args.session_id.clone(),
678| 0| source: "agent".to_string(),
679| 0| metadata,
680| 0| };
681| |
682| 0| let mut warnings = Vec::with_capacity(4);
683| 0| let mut entities_persisted = 0usize;
684| 0| let mut relationships_persisted = 0usize;
685| |
686| 0| let graph_entity_embeddings = graph
687| 0| .entities
688| 0| .iter()
689| 0| .map(|entity| {
690| 0| let entity_text = match &entity.description {
691| 0| Some(desc) => format!("{} {}", entity.name, desc),
692| 0| None => entity.name.clone(),
693| | };
694| 0| crate::daemon::embed_passage_or_local(&paths.models, &entity_text)
695| 0| })
696| 0| .collect::<Result<Vec<_>, _>>()?;
697| |
698| 0| let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
699| |
700| 0| let mut skip_reindex = false;
701| 0| let (memory_id, action, version) = match existing_memory {
702| 0| Some((existing_id, _updated_at, _current_version)) => {
703| 0| if let Some(hash_id) = duplicate_hash_id {
704| 0| if hash_id != existing_id {
705| 0| warnings.push(format!(
706| 0| "identical body already exists as memory id {hash_id}"
707| 0| ));
708| 0| }
709| 0| }
710| |
711| | // C1 fix: capture old values for FTS5 sync before update
712| 0| let (old_fts_name, old_fts_desc, old_fts_body): (String, String, String) = tx
713| 0| .query_row(
714| 0| "SELECT name, description, body FROM memories WHERE id = ?1",
715| 0| rusqlite::params![existing_id],
716| 0| |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?)),
717| 0| )?;
718| |
719| | // G15: skip re-indexing when body hash matches (common in --force-merge loops)
720| 0| let existing_body_hash: Option<String> = tx
721| 0| .query_row(
722| 0| "SELECT body_hash FROM memories WHERE id = ?1",
723| 0| rusqlite::params![existing_id],
724| 0| |r| r.get(0),
725| | )
726| 0| .ok();
727| 0| let body_unchanged = existing_body_hash.as_deref() == Some(&body_hash);
728| 0| skip_reindex = body_unchanged;
729| 0| if !body_unchanged {
730| 0| storage_chunks::delete_chunks(&tx, existing_id)?;
731| 0| }
732| |
733| 0| let next_v = versions::next_version(&tx, existing_id)?;
734| 0| memories::update(&tx, existing_id, &new_memory, args.expected_updated_at)?;
735| |
736| | // C1 fix: sync FTS5 external-content index after update
737| | // (trg_fts_au trigger is absent by design due to sqlite-vec conflict)
738| 0| memories::sync_fts_after_update(
739| 0| &tx,
740| 0| existing_id,
741| 0| &old_fts_name,
742| 0| &old_fts_desc,
743| 0| &old_fts_body,
744| 0| &normalized_name,
745| 0| &resolved_description,
746| 0| &new_memory.body,
747| 0| )?;
748| |
749| 0| versions::insert_version(
750| 0| &tx,
751| 0| existing_id,
752| 0| next_v,
753| 0| &normalized_name,
754| 0| memory_type,
755| 0| &resolved_description,
756| 0| &new_memory.body,
757| 0| &serde_json::to_string(&new_memory.metadata)?,
758| 0| None,
759| 0| "edit",
760| 0| )?;
761| 0| if !body_unchanged {
762| 0| memories::upsert_vec(
763| 0| &tx,
764| 0| existing_id,
765| 0| &namespace,
766| 0| memory_type,
767| 0| &embedding,
768| 0| &normalized_name,
769| 0| &snippet,
770| 0| )?;
771| 0| }
772| 0| (existing_id, "updated".to_string(), next_v)
773| | }
774| | None => {
775| 0| if let Some(hash_id) = duplicate_hash_id {
776| 0| warnings.push(format!(
777| 0| "identical body already exists as memory id {hash_id}"
778| 0| ));
779| 0| }
780| 0| let id = memories::insert(&tx, &new_memory)?;
781| 0| versions::insert_version(
782| 0| &tx,
783| 0| id,
784| | 1,
785| 0| &normalized_name,
786| 0| memory_type,
787| 0| &resolved_description,
788| 0| &new_memory.body,
789| 0| &serde_json::to_string(&new_memory.metadata)?,
790| 0| None,
791| 0| "create",
792| 0| )?;
793| 0| memories::upsert_vec(
794| 0| &tx,
795| 0| id,
796| 0| &namespace,
797| 0| memory_type,
798| 0| &embedding,
799| 0| &normalized_name,
800| 0| &snippet,
801| 0| )?;
802| 0| (id, "created".to_string(), 1)
803| | }
804| | };
805| |
806| 0| if chunks_info.len() > 1 && !skip_reindex {
807| 0| storage_chunks::insert_chunk_slices(&tx, memory_id, &new_memory.body, &chunks_info)?;
808| |
809| 0| let chunk_embeddings = chunk_embeddings_cache.take().ok_or_else(|| {
810| 0| AppError::Internal(anyhow::anyhow!(
811| 0| "chunk embeddings cache missing in multi-chunk remember path"
812| 0| ))
813| 0| })?;
814| |
815| 0| for (i, emb) in chunk_embeddings.iter().enumerate() {
816| 0| storage_chunks::upsert_chunk_vec(&tx, i as i64, memory_id, i as i32, emb)?;
817| | }
818| 0| output::emit_progress_i18n(
819| 0| &format!(
820| 0| "Remember stage: persisted chunk vectors; process RSS {} MB",
821| 0| crate::memory_guard::current_process_memory_mb().unwrap_or(0)
822| 0| ),
823| 0| &format!(
824| 0| "Etapa remember: vetores de chunks persistidos; RSS do processo {} MB",
825| 0| crate::memory_guard::current_process_memory_mb().unwrap_or(0)
826| 0| ),
827| | );
828| 0| }
829| |
830| 0| if !graph.entities.is_empty() || !graph.relationships.is_empty() {
831| 0| for entity in &graph.entities {
832| 0| let entity_id = entities::upsert_entity(&tx, &namespace, entity)?;
833| 0| let entity_embedding = &graph_entity_embeddings[entities_persisted];
834| 0| entities::upsert_entity_vec(
835| 0| &tx,
836| 0| entity_id,
837| 0| &namespace,
838| 0| entity.entity_type,
839| 0| entity_embedding,
840| 0| &entity.name,
841| 0| )?;
842| 0| entities::link_memory_entity(&tx, memory_id, entity_id)?;
843| 0| entities::increment_degree(&tx, entity_id)?;
844| | // GAP-17: warn when entity degree exceeds the configured cap.
845| 0| if args.max_entity_degree > 0 {
846| 0| let cap = args.max_entity_degree as i64;
847| 0| let degree: i64 = tx.query_row(
848| 0| "SELECT degree FROM entities WHERE id = ?1",
849| 0| rusqlite::params![entity_id],
850| 0| |r| r.get(0),
851| 0| )?;
852| 0| if degree > cap {
853| 0| tracing::warn!(target: "remember",
854| | entity = %entity.name,
855| | degree = degree,
856| | cap = cap,
857| 0| "entity degree cap exceeded"
858| | );
859| 0| }
860| 0| }
861| 0| entities_persisted += 1;
862| | }
863| 0| let entity_types: std::collections::HashMap<&str, EntityType> = graph
864| 0| .entities
865| 0| .iter()
866| 0| .map(|entity| (entity.name.as_str(), entity.entity_type))
867| 0| .collect();
868| |
869| 0| for rel in &graph.relationships {
870| 0| let source_entity = NewEntity {
871| 0| name: rel.source.clone(),
872| 0| entity_type: entity_types
873| 0| .get(rel.source.as_str())
874| 0| .copied()
875| 0| .unwrap_or(EntityType::Concept),
876| 0| description: None,
877| 0| };
878| 0| let target_entity = NewEntity {
879| 0| name: rel.target.clone(),
880| 0| entity_type: entity_types
881| 0| .get(rel.target.as_str())
882| 0| .copied()
883| 0| .unwrap_or(EntityType::Concept),
884| 0| description: None,
885| 0| };
886| 0| let source_id = entities::upsert_entity(&tx, &namespace, &source_entity)?;
887| 0| let target_id = entities::upsert_entity(&tx, &namespace, &target_entity)?;
888| 0| let rel_id = entities::upsert_relationship(&tx, &namespace, source_id, target_id, rel)?;
889| 0| entities::link_memory_relationship(&tx, memory_id, rel_id)?;
890| 0| relationships_persisted += 1;
891| | }
892| 0| }
893| 0| tx.commit()?;
894| |
895| | // v1.0.24 P0-2: persist URLs in a dedicated table, outside the main transaction.
896| | // Failures do not propagate — non-critical path with graceful degradation.
897| 0| let urls_persisted = if !extracted_urls.is_empty() {
898| 0| let url_entries: Vec<storage_urls::MemoryUrl> = extracted_urls
899| 0| .into_iter()
900| 0| .map(|u| storage_urls::MemoryUrl {
901| 0| url: u.url,
902| 0| offset: Some(u.offset as i64),
903| 0| })
904| 0| .collect();
905| 0| storage_urls::insert_urls(&conn, memory_id, &url_entries)
906| | } else {
907| 0| 0
908| | };
909| |
910| 0| conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
911| |
912| 0| let created_at_epoch = chrono::Utc::now().timestamp();
913| 0| let created_at_iso = crate::tz::format_iso(chrono::Utc::now());
914| |
915| 0| output::emit_json(&RememberResponse {
916| 0| memory_id,
917| 0| // Persist the normalized (kebab-case) slug as `name` since that is the
918| 0| // storage key. The original input is exposed via `original_name` only
919| 0| // when normalization actually changed something (B_4 in v1.0.32).
920| 0| name: normalized_name.clone(),
921| 0| namespace,
922| 0| action: action.clone(),
923| 0| operation: action,
924| 0| version,
925| 0| entities_persisted,
926| 0| relationships_persisted,
927| 0| relationships_truncated,
928| 0| chunks_created,
929| 0| chunks_persisted,
930| 0| urls_persisted,
931| 0| extraction_method,
932| 0| merged_into_memory_id: None,
933| 0| warnings,
934| 0| created_at: created_at_epoch,
935| 0| created_at_iso,
936| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
937| 0| name_was_normalized,
938| 0| original_name: name_was_normalized.then_some(original_name),
939| 0| })?;
940| |
941| 0| Ok(())
942| 0|}
943| |
#[cfg(test)]
mod tests {
    use super::compute_chunks_persisted;
    use crate::errors::AppError;
    use crate::output::RememberResponse;

    /// Error text expected when a name collapses to nothing during normalization.
    const EMPTY_AFTER_NORMALIZATION_MSG: &str = "name cannot be empty after normalization (input was blank or contained only hyphens/underscores/spaces)";

    /// Builds a baseline response for a freshly created single-chunk memory.
    ///
    /// Tests override only the fields they exercise through struct-update
    /// syntax (`..sample_response()`), which keeps each test focused on the
    /// field under scrutiny instead of repeating the full literal.
    fn sample_response() -> RememberResponse {
        RememberResponse {
            memory_id: 1,
            name: "mem".to_string(),
            namespace: "global".to_string(),
            action: "created".to_string(),
            operation: "created".to_string(),
            version: 1,
            entities_persisted: 0,
            relationships_persisted: 0,
            relationships_truncated: false,
            chunks_created: 1,
            chunks_persisted: 0,
            urls_persisted: 0,
            extraction_method: None,
            merged_into_memory_id: None,
            warnings: vec![],
            created_at: 0,
            created_at_iso: "1970-01-01T00:00:00Z".to_string(),
            elapsed_ms: 0,
            name_was_normalized: false,
            original_name: None,
        }
    }

    /// Local copy of the kebab-case normalization exercised by the P0-4 tests:
    /// lowercase, map `_` and spaces to `-`, then strip leading/trailing `-`.
    ///
    /// NOTE(review): this mirrors the rule inside the test module; the tests
    /// below do not call the production handler itself.
    fn normalize_name(raw: &str) -> String {
        raw.to_lowercase()
            .replace(['_', ' '], "-")
            .trim_matches('-')
            .to_string()
    }

    /// Returns the "empty after normalization" validation error for an empty name.
    fn reject_empty_name(normalized: &str) -> Result<(), AppError> {
        if normalized.is_empty() {
            Err(AppError::Validation(
                EMPTY_AFTER_NORMALIZATION_MSG.to_string(),
            ))
        } else {
            Ok(())
        }
    }

    // Bug H-M8: chunks_persisted contract is unit-testable and matches schema.
    #[test]
    fn chunks_persisted_zero_for_zero_chunks() {
        assert_eq!(compute_chunks_persisted(0), 0);
    }

    #[test]
    fn chunks_persisted_zero_for_single_chunk_body() {
        // Single-chunk bodies live in the memories row itself; no row is
        // appended to memory_chunks. This is the documented contract.
        assert_eq!(compute_chunks_persisted(1), 0);
    }

    #[test]
    fn chunks_persisted_equals_count_for_multi_chunk_body() {
        // Every chunk above the first triggers a row in memory_chunks.
        assert_eq!(compute_chunks_persisted(2), 2);
        assert_eq!(compute_chunks_persisted(7), 7);
        assert_eq!(compute_chunks_persisted(64), 64);
    }

    #[test]
    fn remember_response_serializes_required_fields() {
        let resp = RememberResponse {
            memory_id: 42,
            name: "minha-mem".to_string(),
            created_at: 1_705_320_000,
            created_at_iso: "2024-01-15T12:00:00Z".to_string(),
            elapsed_ms: 55,
            ..sample_response()
        };

        let json = serde_json::to_value(&resp).expect("serialization failed");
        assert_eq!(json["memory_id"], 42);
        assert_eq!(json["action"], "created");
        assert_eq!(json["operation"], "created");
        assert_eq!(json["version"], 1);
        assert_eq!(json["elapsed_ms"], 55u64);
        assert!(json["warnings"].is_array());
        assert!(json["merged_into_memory_id"].is_null());
    }

    #[test]
    fn remember_response_action_e_operation_sao_aliases() {
        // `action` and `operation` are aliases and must serialize to the same value.
        let resp = RememberResponse {
            action: "updated".to_string(),
            operation: "updated".to_string(),
            version: 2,
            entities_persisted: 3,
            relationships_persisted: 1,
            chunks_created: 2,
            chunks_persisted: 2,
            ..sample_response()
        };

        let json = serde_json::to_value(&resp).expect("serialization failed");
        assert_eq!(
            json["action"], json["operation"],
            "action e operation devem ser iguais"
        );
        assert_eq!(json["entities_persisted"], 3);
        assert_eq!(json["relationships_persisted"], 1);
        assert_eq!(json["chunks_created"], 2);
    }

    #[test]
    fn remember_response_warnings_lista_mensagens() {
        // Warnings must be serialized as a JSON array carrying each message.
        let resp = RememberResponse {
            memory_id: 5,
            name: "dup-mem".to_string(),
            warnings: vec!["identical body already exists as memory id 3".to_string()],
            elapsed_ms: 10,
            ..sample_response()
        };

        let json = serde_json::to_value(&resp).expect("serialization failed");
        let warnings = json["warnings"]
            .as_array()
            .expect("warnings deve ser array");
        assert_eq!(warnings.len(), 1);
        assert!(warnings[0].as_str().unwrap().contains("identical body"));
    }

    #[test]
    fn invalid_name_reserved_prefix_returns_validation_error() {
        // Validates the rejection logic for names with the "__" prefix directly.
        let name = "__reservado";
        let result: Result<(), AppError> = if name.starts_with("__") {
            Err(AppError::Validation(
                crate::i18n::validation::reserved_name(),
            ))
        } else {
            Ok(())
        };
        match result {
            Err(AppError::Validation(msg)) => assert!(!msg.is_empty()),
            _ => panic!("expected AppError::Validation for a reserved name"),
        }
    }

    #[test]
    fn name_too_long_returns_validation_error() {
        let max_len = crate::constants::MAX_MEMORY_NAME_LEN;
        let long_name = "a".repeat(max_len + 1);
        let result: Result<(), AppError> = if long_name.is_empty() || long_name.len() > max_len {
            Err(AppError::Validation(crate::i18n::validation::name_length(
                max_len,
            )))
        } else {
            Ok(())
        };
        assert!(matches!(result, Err(AppError::Validation(_))));
    }

    #[test]
    fn remember_response_merged_into_memory_id_some_serializes_integer() {
        let resp = RememberResponse {
            memory_id: 10,
            name: "mem-mergeada".to_string(),
            action: "updated".to_string(),
            operation: "updated".to_string(),
            version: 3,
            merged_into_memory_id: Some(7),
            ..sample_response()
        };

        let json = serde_json::to_value(&resp).expect("serialization failed");
        assert_eq!(json["merged_into_memory_id"], 7);
    }

    #[test]
    fn remember_response_urls_persisted_serializes_field() {
        // v1.0.24 P0-2: ensures urls_persisted shows up in the JSON and accepts a value > 0.
        let resp = RememberResponse {
            memory_id: 3,
            name: "mem-com-urls".to_string(),
            urls_persisted: 3,
            extraction_method: Some("regex-only".to_string()),
            ..sample_response()
        };
        let json = serde_json::to_value(&resp).expect("serialization failed");
        assert_eq!(json["urls_persisted"], 3);
    }

    #[test]
    fn empty_name_after_normalization_returns_specific_message() {
        // P0-4 regression: name consisting only of hyphens normalizes to empty string;
        // must produce a distinct error message, not the "too long" message.
        let normalized = normalize_name("---");
        match reject_empty_name(&normalized) {
            Err(AppError::Validation(msg)) => assert!(
                msg.contains("empty after normalization"),
                "mensagem deve mencionar 'empty after normalization', obteve: {msg}"
            ),
            _ => panic!("expected AppError::Validation for an empty normalized name"),
        }
    }

    #[test]
    fn name_only_underscores_after_normalization_returns_specific_message() {
        // P0-4 regression: name consisting only of underscores normalizes to empty string.
        let normalized = normalize_name("___");
        assert!(
            normalized.is_empty(),
            "underscores devem normalizar para string vazia"
        );
        match reject_empty_name(&normalized) {
            Err(AppError::Validation(msg)) => assert!(
                msg.contains("empty after normalization"),
                "mensagem deve mencionar 'empty after normalization', obteve: {msg}"
            ),
            _ => panic!("expected AppError::Validation for an empty normalized name"),
        }
    }

    #[test]
    fn remember_response_relationships_truncated_serializes_field() {
        // P1-D: ensures relationships_truncated shows up in the JSON as a bool.
        let resp_false = RememberResponse {
            name: "test".to_string(),
            entities_persisted: 2,
            relationships_persisted: 1,
            relationships_truncated: false,
            ..sample_response()
        };
        let json_false = serde_json::to_value(&resp_false).expect("serialization failed");
        assert_eq!(json_false["relationships_truncated"], false);

        let resp_true = RememberResponse {
            relationships_truncated: true,
            ..resp_false
        };
        let json_true = serde_json::to_value(&resp_true).expect("serialization failed");
        assert_eq!(json_true["relationships_truncated"], true);
    }

    // GAP-08: body-preservation predicate tests.
    // Verifies the decision logic that determines whether an existing body should
    // be kept instead of overwritten with an empty incoming body during --force-merge.

    /// Returns `true` when the existing body should be preserved.
    ///
    /// Mirrors the `body_will_be_preserved` expression in `run()` so the logic
    /// is testable without a real database connection.
    fn should_preserve_body(force_merge: bool, raw_body_is_empty: bool, clear_body: bool) -> bool {
        force_merge && raw_body_is_empty && !clear_body
    }

    #[test]
    fn gap08_empty_body_force_merge_no_clear_body_preserves() {
        // Caller passes no body with --force-merge but without --clear-body.
        // The existing body in the DB must be kept.
        assert!(
            should_preserve_body(true, true, false),
            "empty body + force-merge + no clear-body should trigger preservation"
        );
    }

    #[test]
    fn gap08_empty_body_force_merge_with_clear_body_does_not_preserve() {
        // Caller explicitly passes --clear-body; intentional wipe is honoured.
        assert!(
            !should_preserve_body(true, true, true),
            "--clear-body must bypass preservation"
        );
    }

    #[test]
    fn gap08_non_empty_body_force_merge_does_not_preserve() {
        // Caller provides a real body; it must overwrite the existing one.
        assert!(
            !should_preserve_body(true, false, false),
            "non-empty body must overwrite, not preserve"
        );
    }

    #[test]
    fn gap08_empty_body_no_force_merge_does_not_preserve() {
        // Without --force-merge the path is a fresh create; no preservation needed.
        assert!(
            !should_preserve_body(false, true, false),
            "no --force-merge means no preservation logic applies"
        );
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/remember_batch.rs:
1| |//! Handler for the `remember-batch` CLI subcommand (G08).
2| |//!
3| |//! Accepts NDJSON via stdin where each line is a memory to persist.
4| |//! One CLI invocation, one slot, one DB connection — eliminates N-process
5| |//! contention from parallel `remember` calls.
6| |
7| |use crate::errors::AppError;
8| |use crate::output;
9| |use crate::paths::AppPaths;
10| |use crate::storage::connection::open_rw;
11| |use crate::storage::{entities, memories, versions};
12| |use serde::{Deserialize, Serialize};
13| |use std::io::BufRead;
14| |
// CLI arguments for the `remember-batch` subcommand.
//
// Maintainer notes in this block deliberately use plain `//` comments: the clap
// derive turns `///` comments into user-visible help text, so adding or editing
// them would change the program's `--help` output.
#[derive(clap::Args)]
#[command(after_long_help = "EXAMPLES:\n \
    # Pipe NDJSON memories from stdin\n \
    echo '{\"name\":\"mem-a\",\"type\":\"note\",\"description\":\"a\",\"body\":\"content\"}' | \
    sqlite-graphrag remember-batch --json\n\n \
    # Atomic batch with --transaction\n \
    cat memories.ndjson | sqlite-graphrag remember-batch --transaction --json")]
pub struct RememberBatchArgs {
    /// Apply all memories in a single transaction (all-or-nothing).
    #[arg(long)]
    pub transaction: bool,
    /// Stop processing on the first failure.
    #[arg(long)]
    pub fail_fast: bool,
    /// Apply force-merge to all memories (update existing by name).
    #[arg(long)]
    pub force_merge: bool,
    /// Namespace override for all memories.
    #[arg(long, env = "SQLITE_GRAPHRAG_NAMESPACE")]
    pub namespace: Option<String>,
    /// Emit NDJSON output.
    // NOTE(review): `run` in this file never reads this flag; events are always
    // emitted through `output::emit_json`.
    #[arg(long)]
    pub json: bool,
    /// Database path override.
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,
    // Shared daemon options flattened into this command's flags.
    // NOTE(review): not read by `run` in this file — confirm where they are consumed.
    #[command(flatten)]
    pub daemon: crate::cli::DaemonOpts,
}
44| |
/// One NDJSON input record read from stdin, describing a single memory to persist.
///
/// Only `name` is mandatory; every other field falls back to a default when the
/// key is absent from the JSON object.
#[derive(Deserialize)]
struct BatchInputLine {
    /// Memory name as supplied by the caller; normalized before it is stored.
    name: String,
    /// Memory type; defaults to `"note"` (see `default_type`) when omitted.
    #[serde(default = "default_type")]
    r#type: String,
    /// Short description; defaults to the empty string.
    #[serde(default)]
    description: String,
    /// Memory body; defaults to the empty string.
    #[serde(default)]
    body: String,
    /// Graph entities to upsert and link to the memory; defaults to none.
    #[serde(default)]
    entities: Vec<crate::storage::entities::NewEntity>,
    /// Graph relationships between entities; defaults to none.
    #[serde(default)]
    relationships: Vec<crate::storage::entities::NewRelationship>,
}
59| |
/// Serde default for `BatchInputLine::type`: memories without an explicit type are notes.
fn default_type() -> String {
    String::from("note")
}
63| |
/// Per-line result event emitted as one JSON object for every processed input line.
#[derive(Serialize)]
struct BatchItemEvent {
    /// Normalized memory name; empty for failure events built in `run`.
    name: String,
    /// Outcome label; this file emits `"indexed"` on success and `"failed"` on error.
    status: String,
    /// Identifier of the persisted memory; omitted from the JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    memory_id: Option<i64>,
    /// Human-readable error text; omitted from the JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    error: Option<String>,
    /// Zero-based position of the line among the non-blank input lines.
    index: usize,
}
74| |
/// Final JSON object emitted once after all per-line events.
#[derive(Serialize)]
struct BatchSummary {
    /// Marker distinguishing the summary from item events; `run` always sets it to `true`.
    summary: bool,
    /// Number of non-blank input lines read from stdin.
    total: usize,
    /// Number of lines counted as successfully processed.
    succeeded: usize,
    /// Number of lines that produced an error.
    failed: usize,
    /// Wall-clock duration of the whole command, in milliseconds.
    elapsed_ms: u64,
}
83| |
84| 0|pub fn run(args: RememberBatchArgs) -> Result<(), AppError> {
85| 0| let start = std::time::Instant::now();
86| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
87| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
88| 0| paths.ensure_dirs()?;
89| 0| crate::storage::connection::ensure_db_ready(&paths)?;
90| 0| let mut conn = open_rw(&paths.db)?;
91| |
92| 0| let stdin = std::io::stdin();
93| 0| let lines: Vec<String> = stdin
94| 0| .lock()
95| 0| .lines()
96| 0| .map_while(Result::ok)
97| 0| .filter(|l| !l.trim().is_empty())
98| 0| .collect();
99| |
100| 0| let total = lines.len();
101| 0| let mut succeeded = 0usize;
102| 0| let mut failed = 0usize;
103| |
104| 0| if args.transaction {
105| 0| let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
106| 0| for (idx, line) in lines.iter().enumerate() {
107| 0| match process_line(&tx, &namespace, line, idx, args.force_merge, &paths) {
108| 0| Ok(event) => {
109| 0| output::emit_json(&event)?;
110| 0| succeeded += 1;
111| | }
112| 0| Err(e) => {
113| 0| failed += 1;
114| 0| output::emit_json(&BatchItemEvent {
115| 0| name: String::new(),
116| 0| status: "failed".to_string(),
117| 0| memory_id: None,
118| 0| error: Some(format!("{e}")),
119| 0| index: idx,
120| 0| })?;
121| 0| if args.fail_fast {
122| 0| break;
123| 0| }
124| | }
125| | }
126| | }
127| 0| if failed == 0 || !args.fail_fast {
128| 0| tx.commit()?;
129| 0| }
130| | } else {
131| 0| for (idx, line) in lines.iter().enumerate() {
132| 0| let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
133| 0| match process_line(&tx, &namespace, line, idx, args.force_merge, &paths) {
134| 0| Ok(event) => {
135| 0| tx.commit()?;
136| 0| output::emit_json(&event)?;
137| 0| succeeded += 1;
138| | }
139| 0| Err(e) => {
140| 0| drop(tx);
141| 0| failed += 1;
142| 0| output::emit_json(&BatchItemEvent {
143| 0| name: String::new(),
144| 0| status: "failed".to_string(),
145| 0| memory_id: None,
146| 0| error: Some(format!("{e}")),
147| 0| index: idx,
148| 0| })?;
149| 0| if args.fail_fast {
150| 0| break;
151| 0| }
152| | }
153| | }
154| | }
155| | }
156| |
157| 0| output::emit_json(&BatchSummary {
158| 0| summary: true,
159| 0| total,
160| 0| succeeded,
161| 0| failed,
162| 0| elapsed_ms: start.elapsed().as_millis() as u64,
163| 0| })?;
164| |
165| 0| Ok(())
166| 0|}
167| |
168| 0|fn process_line(
169| 0| tx: &rusqlite::Transaction<'_>,
170| 0| namespace: &str,
171| 0| line: &str,
172| 0| index: usize,
173| 0| force_merge: bool,
174| 0| paths: &AppPaths,
175| 0|) -> Result<BatchItemEvent, AppError> {
176| 0| let input: BatchInputLine = serde_json::from_str(line)
177| 0| .map_err(|e| AppError::Validation(format!("line {index}: invalid JSON: {e}")))?;
178| |
179| 0| let normalized_name = crate::parsers::normalize_entity_name(&input.name);
180| 0| if normalized_name.is_empty() {
181| 0| return Err(AppError::Validation(format!(
182| 0| "line {index}: name normalizes to empty string"
183| 0| )));
184| 0| }
185| |
186| 0| let body_hash = blake3::hash(input.body.as_bytes()).to_hex().to_string();
187| |
188| 0| let existing = memories::find_by_name(tx, namespace, &normalized_name)?;
189| |
190| 0| let memory_id = if let Some((existing_id, _updated_at, _version)) = existing {
191| 0| if !force_merge {
192| 0| return Err(AppError::Duplicate(format!(
193| 0| "memory '{normalized_name}' already exists; use --force-merge to update"
194| 0| )));
195| 0| }
196| 0| let snippet: String = input.body.chars().take(200).collect();
197| 0| memories::update(
198| 0| tx,
199| 0| existing_id,
200| 0| &memories::NewMemory {
201| 0| namespace: namespace.to_string(),
202| 0| name: normalized_name.clone(),
203| 0| memory_type: input.r#type.clone(),
204| 0| description: input.description.clone(),
205| 0| body: input.body.clone(),
206| 0| body_hash,
207| 0| session_id: None,
208| 0| source: "agent".to_string(),
209| 0| metadata: serde_json::json!({}),
210| 0| },
211| 0| None,
212| 0| )?;
213| 0| let next_v = versions::next_version(tx, existing_id)?;
214| 0| versions::insert_version(
215| 0| tx,
216| 0| existing_id,
217| 0| next_v,
218| 0| &normalized_name,
219| 0| &input.r#type,
220| 0| &input.description,
221| 0| &input.body,
222| 0| "{}",
223| 0| None,
224| 0| "edit",
225| 0| )?;
226| |
227| 0| let embedding = crate::daemon::embed_passage_or_local(&paths.models, &input.body)?;
228| 0| memories::upsert_vec(
229| 0| tx,
230| 0| existing_id,
231| 0| namespace,
232| 0| &input.r#type,
233| 0| &embedding,
234| 0| &normalized_name,
235| 0| &snippet,
236| 0| )?;
237| 0| existing_id
238| | } else {
239| 0| let new_mem = memories::NewMemory {
240| 0| namespace: namespace.to_string(),
241| 0| name: normalized_name.clone(),
242| 0| memory_type: input.r#type.clone(),
243| 0| description: input.description.clone(),
244| 0| body: input.body.clone(),
245| 0| body_hash,
246| 0| session_id: None,
247| 0| source: "agent".to_string(),
248| 0| metadata: serde_json::json!({}),
249| 0| };
250| 0| let id = memories::insert(tx, &new_mem)?;
251| 0| versions::insert_version(
252| 0| tx,
253| 0| id,
254| | 1,
255| 0| &normalized_name,
256| 0| &input.r#type,
257| 0| &input.description,
258| 0| &input.body,
259| 0| "{}",
260| 0| None,
261| 0| "create",
262| 0| )?;
263| |
264| 0| let snippet: String = input.body.chars().take(200).collect();
265| 0| let embedding = crate::daemon::embed_passage_or_local(&paths.models, &input.body)?;
266| 0| memories::upsert_vec(
267| 0| tx,
268| 0| id,
269| 0| namespace,
270| 0| &input.r#type,
271| 0| &embedding,
272| 0| &normalized_name,
273| 0| &snippet,
274| 0| )?;
275| 0| id
276| | };
277| |
278| | // Persist graph entities and relationships if provided
279| 0| for entity in &input.entities {
280| 0| let entity_id = entities::upsert_entity(tx, namespace, entity)?;
281| 0| let entity_text = match &entity.description {
282| 0| Some(desc) => format!("{} {}", entity.name, desc),
283| 0| None => entity.name.clone(),
284| | };
285| 0| let entity_embedding = crate::daemon::embed_passage_or_local(&paths.models, &entity_text)?;
286| 0| entities::upsert_entity_vec(
287| 0| tx,
288| 0| entity_id,
289| 0| namespace,
290| 0| entity.entity_type,
291| 0| &entity_embedding,
292| 0| &entity.name,
293| 0| )?;
294| 0| entities::link_memory_entity(tx, memory_id, entity_id)?;
295| | }
296| |
297| 0| for rel in &input.relationships {
298| 0| let src_name = crate::parsers::normalize_entity_name(&rel.source);
299| 0| let tgt_name = crate::parsers::normalize_entity_name(&rel.target);
300| 0| if let (Some(src_id), Some(tgt_id)) = (
301| 0| entities::find_entity_id(tx, namespace, &src_name)?,
302| 0| entities::find_entity_id(tx, namespace, &tgt_name)?,
303| | ) {
304| 0| entities::create_or_fetch_relationship(
305| 0| tx,
306| 0| namespace,
307| 0| src_id,
308| 0| tgt_id,
309| 0| &rel.relation,
310| 0| rel.strength,
311| 0| rel.description.as_deref(),
312| 0| )?;
313| 0| }
314| | }
315| |
316| 0| Ok(BatchItemEvent {
317| 0| name: normalized_name,
318| 0| status: "indexed".to_string(),
319| 0| memory_id: Some(memory_id),
320| 0| error: None,
321| 0| index,
322| 0| })
323| 0|}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/rename.rs:
1| |//! Handler for the `rename` CLI subcommand.
2| |
3| |use crate::errors::AppError;
4| |use crate::i18n::errors_msg;
5| |use crate::output;
6| |use crate::output::JsonOutputFormat;
7| |use crate::paths::AppPaths;
8| |use crate::storage::connection::open_rw;
9| |use crate::storage::{memories, versions};
10| |use serde::Serialize;
11| |
// CLI arguments for the `rename` subcommand.
//
// The current and the new name can each be given positionally or through a
// flag; `conflicts_with` prevents supplying both forms of the same name.
// Maintainer notes here use plain `//` comments because the clap derive turns
// `///` comments into user-visible help text.
#[derive(clap::Args)]
#[command(after_long_help = "EXAMPLES:\n \
    # Rename using two positional arguments (NAME NEW)\n \
    sqlite-graphrag rename onboarding welcome-guide\n\n \
    # Rename using the positional NAME + --new-name flag\n \
    sqlite-graphrag rename onboarding --new-name welcome-guide\n\n \
    # Rename using the named flag form\n \
    sqlite-graphrag rename --name onboarding --new-name welcome-guide\n\n \
    # Rename within a specific namespace\n \
    sqlite-graphrag rename onboarding welcome-guide --namespace my-project")]
pub struct RenameArgs {
    /// Current memory name as a positional argument. Alternative to `--name` / `--old`.
    #[arg(
        value_name = "NAME",
        conflicts_with = "name",
        help = "Current memory name to rename; alternative to --name/--old"
    )]
    pub name_positional: Option<String>,
    /// Current memory name. Also accepts the aliases `--old` and `--from` (since v1.0.35).
    #[arg(long, alias = "old", alias = "from")]
    pub name: Option<String>,
    /// New memory name as a positional argument. Alternative to `--new-name`.
    #[arg(
        value_name = "NEW",
        conflicts_with = "new_name",
        help = "New memory name; alternative to --new-name/--new/--to"
    )]
    pub new_name_positional: Option<String>,
    /// New memory name. Also accepts the aliases `--new` and `--to` (since v1.0.35).
    #[arg(long, alias = "new", alias = "to")]
    pub new_name: Option<String>,
    // Namespace the memory lives in; resolved through `resolve_namespace` in `run`.
    #[arg(
        long,
        help = "Namespace (env: SQLITE_GRAPHRAG_NAMESPACE, default: global)"
    )]
    pub namespace: Option<String>,
    /// Optimistic locking: reject if the current updated_at does not match (exit 3).
    #[arg(
        long,
        value_name = "EPOCH_OR_RFC3339",
        value_parser = crate::parsers::parse_expected_updated_at,
        long_help = "Optimistic lock: reject if updated_at does not match. \
Accepts Unix epoch (e.g. 1700000000) or RFC 3339 (e.g. 2026-04-19T12:00:00Z)."
    )]
    pub expected_updated_at: Option<i64>,
    /// Optional session ID used to trace the origin of the change.
    // NOTE(review): `run` in this file does not read this value — confirm whether
    // it should be recorded with the version row.
    #[arg(long, value_name = "UUID")]
    pub session_id: Option<String>,
    /// Output format.
    // `run` explicitly discards this value (`let _ = args.format;`).
    #[arg(long, value_enum, default_value_t = JsonOutputFormat::Json)]
    pub format: JsonOutputFormat,
    // Hidden compatibility flag; accepted but has no effect.
    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
    pub json: bool,
    // Database path override, also readable from SQLITE_GRAPHRAG_DB_PATH.
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,
}
68| |
/// JSON payload emitted on stdout after a successful rename.
#[derive(Serialize)]
struct RenameResponse {
    /// Identifier of the renamed memory.
    memory_id: i64,
    /// The new (normalized) memory name.
    name: String,
    /// Action label; `run` always reports `"renamed"`.
    action: &'static str,
    /// Version number of the version row recorded for this rename.
    version: i64,
    /// Set to `true` when a soft-deleted ghost occupying the target name was purged.
    #[serde(skip_serializing_if = "Option::is_none")]
    ghost_purged: Option<bool>,
    /// Total execution time in milliseconds from handler start to serialisation.
    elapsed_ms: u64,
}
81| |
82| 0|pub fn run(args: RenameArgs) -> Result<(), AppError> {
83| 0| let inicio = std::time::Instant::now();
84| 0| let _ = args.format;
85| 0| tracing::debug!(target: "rename", old = ?args.name, new = ?args.new_name, "renaming memory");
86| | use crate::constants::*;
87| |
88| | // Resolve current name from positional or --name/--old flag.
89| 0| let name = args.name_positional.or(args.name).ok_or_else(|| {
90| 0| AppError::Validation("name required: pass as positional argument or via --name".to_string())
91| 0| })?;
92| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
93| |
94| 0| let raw_new_name = args.new_name.or(args.new_name_positional).ok_or_else(|| {
95| 0| AppError::Validation(
96| 0| "new name required: pass as positional <NEW> or via --new-name/--new/--to".to_string(),
97| 0| )
98| 0| })?;
99| |
100| | // v1.0.20: trim_matches('-') also removes trailing/leading hyphens.
101| 0| let normalized_new_name = {
102| 0| let lower = raw_new_name.to_lowercase().replace(['_', ' '], "-");
103| 0| let trimmed = lower.trim_matches('-').to_string();
104| 0| if trimmed != raw_new_name {
105| 0| tracing::warn!(target: "rename",
106| | original = %raw_new_name,
107| | normalized = %trimmed,
108| 0| "new_name auto-normalized to kebab-case"
109| | );
110| 0| }
111| 0| trimmed
112| | };
113| |
114| 0| if normalized_new_name == name {
115| 0| return Err(AppError::Validation(
116| 0| "source and target names are identical".to_string(),
117| 0| ));
118| 0| }
119| |
120| 0| if normalized_new_name.starts_with("__") {
121| 0| return Err(AppError::Validation(
122| 0| crate::i18n::validation::reserved_name(),
123| 0| ));
124| 0| }
125| |
126| 0| if normalized_new_name.is_empty() || normalized_new_name.len() > MAX_MEMORY_NAME_LEN {
127| 0| return Err(AppError::Validation(
128| 0| crate::i18n::validation::new_name_length(MAX_MEMORY_NAME_LEN),
129| 0| ));
130| 0| }
131| |
132| | {
133| 0| let slug_re = crate::constants::name_slug_regex();
134| 0| if !slug_re.is_match(&normalized_new_name) {
135| 0| return Err(AppError::Validation(
136| 0| crate::i18n::validation::new_name_kebab(&normalized_new_name),
137| 0| ));
138| 0| }
139| | }
140| |
141| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
142| 0| crate::storage::connection::ensure_db_ready(&paths)?;
143| 0| let mut conn = open_rw(&paths.db)?;
144| |
145| 0| let (memory_id, current_updated_at, _) = memories::find_by_name(&conn, &namespace, &name)?
146| 0| .ok_or_else(|| AppError::NotFound(errors_msg::memory_not_found(&name, &namespace)))?;
147| |
148| 0| if let Some(expected) = args.expected_updated_at {
149| 0| if expected != current_updated_at {
150| 0| return Err(AppError::Conflict(errors_msg::optimistic_lock_conflict(
151| 0| expected,
152| 0| current_updated_at,
153| 0| )));
154| 0| }
155| 0| }
156| |
157| 0| let row = memories::read_by_name(&conn, &namespace, &name)?
158| 0| .ok_or_else(|| AppError::Internal(anyhow::anyhow!("memory not found before rename")))?;
159| |
160| 0| let memory_type = row.memory_type.clone();
161| 0| let description = row.description.clone();
162| 0| let body = row.body.clone();
163| 0| let metadata = row.metadata.clone();
164| |
165| 0| let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
166| |
167| | // G16: auto-purge soft-deleted ghost occupying the target name
168| 0| let mut ghost_purged: Option<bool> = None;
169| 0| if let Some((ghost_id, is_deleted)) =
170| 0| memories::find_by_name_any_state(&tx, &namespace, &normalized_new_name)?
171| | {
172| 0| if is_deleted {
173| 0| tracing::info!(target: "rename",
174| | ghost_id,
175| | name = %normalized_new_name,
176| 0| "auto-purging soft-deleted ghost to free target name for rename"
177| | );
178| 0| tx.execute(
179| 0| "DELETE FROM memory_versions WHERE memory_id = ?1",
180| 0| rusqlite::params![ghost_id],
181| 0| )?;
182| 0| tx.execute(
183| 0| "DELETE FROM memory_chunks WHERE memory_id = ?1",
184| 0| rusqlite::params![ghost_id],
185| 0| )?;
186| 0| tx.execute(
187| 0| "DELETE FROM memory_entities WHERE memory_id = ?1",
188| 0| rusqlite::params![ghost_id],
189| 0| )?;
190| 0| tx.execute(
191| 0| "DELETE FROM vec_memories WHERE memory_id = ?1",
192| 0| rusqlite::params![ghost_id],
193| 0| )?;
194| 0| tx.execute(
195| 0| "DELETE FROM memories WHERE id = ?1",
196| 0| rusqlite::params![ghost_id],
197| 0| )?;
198| 0| ghost_purged = Some(true);
199| 0| } else if ghost_id != memory_id {
200| 0| return Err(AppError::Duplicate(format!(
201| 0| "target name '{normalized_new_name}' is already occupied by active memory id {ghost_id}"
202| 0| )));
203| 0| }
204| 0| }
205| |
206| 0| let affected = if let Some(ts) = args.expected_updated_at {
207| 0| tx.execute(
208| 0| "UPDATE memories SET name=?2 WHERE id=?1 AND updated_at=?3 AND deleted_at IS NULL",
209| 0| rusqlite::params![memory_id, normalized_new_name, ts],
210| 0| )?
211| | } else {
212| 0| tx.execute(
213| 0| "UPDATE memories SET name=?2 WHERE id=?1 AND deleted_at IS NULL",
214| 0| rusqlite::params![memory_id, normalized_new_name],
215| 0| )?
216| | };
217| |
218| 0| if affected == 0 {
219| 0| return Err(AppError::Conflict(
220| 0| "optimistic lock conflict: memory was modified by another process".to_string(),
221| 0| ));
222| 0| }
223| |
224| 0| let next_v = versions::next_version(&tx, memory_id)?;
225| |
226| 0| versions::insert_version(
227| 0| &tx,
228| 0| memory_id,
229| 0| next_v,
230| 0| &normalized_new_name,
231| 0| &memory_type,
232| 0| &description,
233| 0| &body,
234| 0| &metadata,
235| 0| None,
236| 0| "rename",
237| 0| )?;
238| |
239| 0| memories::sync_fts_after_update(
240| 0| &tx,
241| 0| memory_id,
242| 0| &name,
243| 0| &description,
244| 0| &body,
245| 0| &normalized_new_name,
246| 0| &description,
247| 0| &body,
248| 0| )?;
249| |
250| 0| tx.commit()?;
251| |
252| 0| conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
253| |
254| 0| output::emit_json(&RenameResponse {
255| 0| memory_id,
256| 0| name: normalized_new_name,
257| 0| action: "renamed",
258| 0| version: next_v,
259| 0| ghost_purged,
260| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
261| 0| })?;
262| |
263| 0| Ok(())
264| 0|}
265| |
#[cfg(test)]
mod tests {
    use crate::errors::AppError;
    use crate::storage::memories::{insert, NewMemory};
    use tempfile::TempDir;

    /// Creates a migrated SQLite database inside a fresh temporary directory.
    ///
    /// The `TempDir` guard is returned with the connection so the directory
    /// stays alive for as long as the caller holds it.
    fn setup_db() -> (TempDir, rusqlite::Connection) {
        crate::storage::connection::register_vec_extension();
        let tmp = TempDir::new().unwrap();
        let mut db = rusqlite::Connection::open(tmp.path().join("test.db")).unwrap();
        crate::migrations::runner().run(&mut db).unwrap();
        (tmp, db)
    }

    /// Builds a minimal `NewMemory` fixture in the `global` namespace with the given name.
    fn new_memory(name: &str) -> NewMemory {
        NewMemory {
            namespace: String::from("global"),
            name: name.to_owned(),
            memory_type: String::from("user"),
            description: String::from("desc"),
            body: String::from("corpo"),
            body_hash: format!("hash-{name}"),
            session_id: None,
            source: String::from("agent"),
            metadata: serde_json::json!({}),
        }
    }

    // NOTE(review): this test seeds a database but never invokes the rename handler;
    // it only checks the error value built below. Confirm whether that is intended.
    #[test]
    fn rejects_new_name_with_double_underscore_prefix() {
        let (_dir, conn) = setup_db();
        insert(&conn, &new_memory("mem-teste")).unwrap();
        drop(conn);

        let reserved = AppError::Validation(
            "names and namespaces starting with __ are reserved for internal use".to_string(),
        );
        assert!(reserved.to_string().contains("__"));
        assert_eq!(reserved.exit_code(), 1);
    }

    #[test]
    fn rejects_rename_to_same_name() {
        let identical = AppError::Validation("source and target names are identical".to_string());
        assert_eq!(identical.exit_code(), 1);
        assert!(identical.to_string().contains("identical"));
    }

    #[test]
    fn optimistic_lock_conflict_returns_exit_3() {
        let conflict = AppError::Conflict(
            "optimistic lock conflict: expected updated_at=100, but current is 200".to_string(),
        );
        assert_eq!(conflict.exit_code(), 3);
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/rename_entity.rs:
1| |//! Handler for the `rename-entity` CLI subcommand.
2| |//!
3| |//! Renames an entity preserving all relationships and memory bindings.
4| |//! Only the `name` column in `entities` and the corresponding `vec_entities`
5| |//! row need updating because relationships use integer FK `entity_id`.
6| |
7| |use crate::entity_type::EntityType;
8| |use crate::errors::AppError;
9| |use crate::i18n::errors_msg;
10| |use crate::output::{self, OutputFormat};
11| |use crate::paths::AppPaths;
12| |use crate::storage::connection::open_rw;
13| |use crate::storage::entities;
14| |use rusqlite::params;
15| |use serde::Serialize;
16| |
17| |#[derive(clap::Args)]
18| |#[command(after_long_help = "EXAMPLES:\n \
19| | # Rename an entity\n \
20| | sqlite-graphrag rename-entity --name old-name --new-name new-name\n\n \
21| | # Rename with namespace\n \
22| | sqlite-graphrag rename-entity --name auth --new-name authentication --namespace my-project")]
23| |pub struct RenameEntityArgs {
24| | /// Current entity name to rename.
25| | #[arg(long, value_name = "NAME")]
26| | pub name: String,
27| | /// New name for the entity.
28| | #[arg(long, value_name = "NEW_NAME")]
29| | pub new_name: String,
30| | #[arg(long)]
31| | pub namespace: Option<String>,
32| | #[arg(long, value_enum, default_value = "json")]
33| | pub format: OutputFormat,
34| | #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
35| | pub json: bool,
36| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
37| | pub db: Option<String>,
38| |}
39| |
40| |#[derive(Serialize)]
41| |struct RenameEntityResponse {
42| | action: String,
43| | old_name: String,
44| | new_name: String,
45| | entity_id: i64,
46| | namespace: String,
47| | elapsed_ms: u64,
48| |}
49| |
50| 0|pub fn run(args: RenameEntityArgs) -> Result<(), AppError> {
51| 0| let start = std::time::Instant::now();
52| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
53| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
54| |
55| 0| crate::storage::connection::ensure_db_ready(&paths)?;
56| |
57| 0| let mut conn = open_rw(&paths.db)?;
58| |
59| | // Verify source entity exists and fetch its id and type.
60| | // Normalize the lookup name to match the normalized stored names.
61| 0| let lookup_name = crate::parsers::normalize_entity_name(&args.name);
62| 0| let row: Option<(i64, EntityType)> = {
63| 0| let mut stmt = conn
64| 0| .prepare_cached("SELECT id, type FROM entities WHERE namespace = ?1 AND name = ?2")?;
65| 0| match stmt.query_row(params![namespace, lookup_name], |r| {
66| 0| Ok((r.get::<_, i64>(0)?, r.get::<_, EntityType>(1)?))
67| 0| }) {
68| 0| Ok(row) => Some(row),
69| 0| Err(rusqlite::Error::QueryReturnedNoRows) => None,
70| 0| Err(e) => return Err(AppError::Database(e)),
71| | }
72| | };
73| 0| let (entity_id, entity_type) = row
74| 0| .ok_or_else(|| AppError::NotFound(errors_msg::entity_not_found(&args.name, &namespace)))?;
75| |
76| | // Validate the raw new name first (catches short ALL_CAPS NER noise),
77| | // then normalize it for storage to preserve the normalized-name invariant.
78| 0| entities::validate_entity_name(&args.new_name)?;
79| 0| let new_name = crate::parsers::normalize_entity_name(&args.new_name);
80| |
81| 0| if lookup_name == new_name {
82| 0| return Err(AppError::Validation(
83| 0| "source and target entity names are identical".to_string(),
84| 0| ));
85| 0| }
86| |
87| | // Ensure new name is not already taken in this namespace.
88| 0| if entities::find_entity_id(&conn, &namespace, &new_name)?.is_some() {
89| 0| return Err(AppError::Validation(format!(
90| 0| "entity with name '{new_name}' already exists in namespace '{namespace}'"
91| 0| )));
92| 0| }
93| |
94| | // Embed the normalized new name for vec_entities replacement.
95| 0| let embedding = crate::daemon::embed_passage_or_local(&paths.models, &new_name)?;
96| |
97| 0| let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
98| 0| tx.execute(
99| 0| "UPDATE entities SET name = ?1, updated_at = unixepoch() WHERE id = ?2",
100| 0| params![new_name, entity_id],
101| 0| )?;
102| | // vec0 does not support UPDATE — delete then insert.
103| 0| tx.execute(
104| 0| "DELETE FROM vec_entities WHERE entity_id = ?1",
105| 0| params![entity_id],
106| 0| )?;
107| 0| let embedding_bytes = crate::embedder::f32_to_bytes(&embedding);
108| 0| tx.execute(
109| 0| "INSERT INTO vec_entities(entity_id, namespace, type, embedding, name)
110| 0| VALUES (?1, ?2, ?3, ?4, ?5)",
111| 0| params![
112| 0| entity_id,
113| 0| namespace,
114| 0| entity_type,
115| 0| &embedding_bytes,
116| 0| new_name
117| 0| ],
118| 0| )?;
119| 0| tx.commit()?;
120| |
121| 0| conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
122| |
123| 0| let response = RenameEntityResponse {
124| 0| action: "renamed".to_string(),
125| 0| old_name: args.name,
126| 0| new_name,
127| 0| entity_id,
128| 0| namespace: namespace.clone(),
129| 0| elapsed_ms: start.elapsed().as_millis() as u64,
130| 0| };
131| |
132| 0| match args.format {
133| 0| OutputFormat::Json => output::emit_json(&response)?,
134| 0| OutputFormat::Text | OutputFormat::Markdown => {
135| 0| output::emit_text(&format!(
136| 0| "renamed entity: '{}' → '{}' [{}]",
137| 0| response.old_name, response.new_name, response.namespace
138| 0| ));
139| 0| }
140| | }
141| |
142| 0| Ok(())
143| 0|}
144| |
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a response whose `action` is always `"renamed"`.
    fn renamed(old: &str, new: &str, id: i64, ns: &str, ms: u64) -> RenameEntityResponse {
        RenameEntityResponse {
            action: "renamed".to_string(),
            old_name: old.to_string(),
            new_name: new.to_string(),
            entity_id: id,
            namespace: ns.to_string(),
            elapsed_ms: ms,
        }
    }

    #[test]
    fn rename_entity_response_serializes_all_fields() {
        let payload = renamed("auth", "authentication", 42, "global", 7);
        let value = serde_json::to_value(&payload).expect("serialization failed");
        assert_eq!(value["action"], "renamed");
        assert_eq!(value["old_name"], "auth");
        assert_eq!(value["new_name"], "authentication");
        assert_eq!(value["entity_id"], 42);
        assert_eq!(value["namespace"], "global");
        assert!(value["elapsed_ms"].is_number());
    }

    #[test]
    fn rename_entity_response_action_is_renamed() {
        let payload = renamed("x", "y", 1, "ns", 1);
        assert_eq!(payload.action, "renamed");
    }

    #[test]
    fn rename_entity_response_entity_id_preserved() {
        let payload = renamed("old", "new", 999, "test-ns", 5);
        let value = serde_json::to_value(&payload).expect("serialization failed");
        assert_eq!(value["entity_id"], 999);
    }

    #[test]
    fn rejects_rename_entity_to_same_name() {
        use crate::errors::AppError;
        let identical =
            AppError::Validation("source and target entity names are identical".to_string());
        assert_eq!(identical.exit_code(), 1);
        assert!(identical.to_string().contains("identical"));
    }

    #[test]
    fn rename_entity_response_namespace_reflected() {
        let payload = renamed("a", "b", 10, "my-project", 2);
        let value = serde_json::to_value(&payload).expect("serialization failed");
        assert_eq!(value["namespace"], "my-project");
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/restore.rs:
1| |//! Handler for the `restore` CLI subcommand.
2| |
3| |use crate::errors::AppError;
4| |use crate::i18n::errors_msg;
5| |use crate::output;
6| |use crate::output::JsonOutputFormat;
7| |use crate::paths::AppPaths;
8| |use crate::storage::connection::open_rw;
9| |use crate::storage::memories;
10| |use crate::storage::versions;
11| |use rusqlite::params;
12| |use rusqlite::OptionalExtension;
13| |use serde::Serialize;
14| |
15| |#[derive(clap::Args)]
16| |#[command(after_long_help = "EXAMPLES:\n \
17| | # Restore the latest non-`restore` version of a memory\n \
18| | sqlite-graphrag restore --name onboarding\n\n \
19| | # Restore a specific version\n \
20| | sqlite-graphrag restore --name onboarding --version 3\n\n \
21| | # Restore within a specific namespace\n \
22| | sqlite-graphrag restore --name onboarding --namespace my-project")]
23| |pub struct RestoreArgs {
24| | /// Memory name as a positional argument. Alternative to `--name`.
25| | #[arg(
26| | value_name = "NAME",
27| | conflicts_with = "name",
28| | help = "Memory name to restore; alternative to --name"
29| | )]
30| | pub name_positional: Option<String>,
31| | /// Memory name to restore (must exist, including soft-deleted/forgotten).
32| | #[arg(long)]
33| | pub name: Option<String>,
34| | /// Version to restore. When omitted, defaults to the latest non-`restore` version
35| | /// from `memory_versions`. This makes the forget+restore workflow work without
36| | /// requiring the user to discover the version first.
37| | #[arg(long)]
38| | pub version: Option<i64>,
39| | #[arg(
40| | long,
41| | help = "Namespace (env: SQLITE_GRAPHRAG_NAMESPACE, default: global)"
42| | )]
43| | pub namespace: Option<String>,
44| | /// Optimistic locking: reject if the current updated_at does not match (exit 3).
45| | #[arg(
46| | long,
47| | value_name = "EPOCH_OR_RFC3339",
48| | value_parser = crate::parsers::parse_expected_updated_at,
49| | long_help = "Optimistic lock: reject if updated_at does not match. \
50| |Accepts Unix epoch (e.g. 1700000000) or RFC 3339 (e.g. 2026-04-19T12:00:00Z)."
51| | )]
52| | pub expected_updated_at: Option<i64>,
53| | /// Output format.
54| | #[arg(long, value_enum, default_value_t = JsonOutputFormat::Json)]
55| | pub format: JsonOutputFormat,
56| | #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
57| | pub json: bool,
58| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
59| | pub db: Option<String>,
60| |}
61| |
62| |#[derive(Serialize)]
63| |struct RestoreResponse {
64| | /// Always `"restored"` — signals the completed action to shell callers and LLM agents.
65| | action: String,
66| | memory_id: i64,
67| | name: String,
68| | version: i64,
69| | restored_from: i64,
70| | /// Total execution time in milliseconds from handler start to serialisation.
71| | elapsed_ms: u64,
72| |}
73| |
74| 0|pub fn run(args: RestoreArgs) -> Result<(), AppError> {
75| 0| let start = std::time::Instant::now();
76| 0| let _ = args.format;
77| 0| tracing::debug!(target: "restore", name = ?args.name_positional.as_deref().or(args.name.as_deref()), version = ?args.version, "restoring version");
78| 0| let name = args
79| 0| .name_positional
80| 0| .as_deref()
81| 0| .or(args.name.as_deref())
82| 0| .ok_or_else(|| {
83| 0| AppError::Validation(
84| 0| "name required: pass as positional argument or via --name".to_string(),
85| 0| )
86| 0| })?
87| 0| .to_string();
88| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
89| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
90| 0| let mut conn = open_rw(&paths.db)?;
91| |
92| | // PRD line 1118: query WITHOUT a deleted_at filter — restore must work on soft-deleted memories
93| 0| let result: Option<(i64, i64)> = conn
94| 0| .query_row(
95| 0| "SELECT id, updated_at FROM memories WHERE namespace = ?1 AND name = ?2",
96| 0| params![namespace, name],
97| 0| |r| Ok((r.get(0)?, r.get(1)?)),
98| | )
99| 0| .optional()?;
100| 0| let (memory_id, current_updated_at) = result
101| 0| .ok_or_else(|| AppError::NotFound(errors_msg::memory_not_found(&name, &namespace)))?;
102| |
103| 0| if let Some(expected) = args.expected_updated_at {
104| 0| if expected != current_updated_at {
105| 0| return Err(AppError::Conflict(errors_msg::optimistic_lock_conflict(
106| 0| expected,
107| 0| current_updated_at,
108| 0| )));
109| 0| }
110| 0| }
111| |
112| | // v1.0.22 P0: resolve optional `--version`. When absent, uses the highest version
113| | // whose `change_reason` is not 'restore' (recovers the real state, not meta-restore).
114| | // Lets the forget+restore workflow function without manually reading memory_versions.
115| 0| let target_version: i64 = match args.version {
116| 0| Some(v) => v,
117| | None => {
118| 0| let last: Option<i64> = conn
119| 0| .query_row(
120| 0| "SELECT MAX(version) FROM memory_versions
121| 0| WHERE memory_id = ?1 AND change_reason != 'restore'",
122| 0| params![memory_id],
123| 0| |r| r.get(0),
124| | )
125| 0| .optional()?
126| 0| .flatten();
127| 0| let v = last.ok_or_else(|| {
128| 0| AppError::NotFound(errors_msg::memory_not_found(&name, &namespace))
129| 0| })?;
130| 0| tracing::info!(target: "restore",
131| 0| "restore --version omitted; using latest non-restore version: {}",
132| | v
133| | );
134| 0| v
135| | }
136| | };
137| |
138| 0| let version_row: (String, String, String, String, String) = {
139| 0| let mut stmt = conn.prepare_cached(
140| 0| "SELECT name, type, description, body, metadata
141| 0| FROM memory_versions
142| 0| WHERE memory_id = ?1 AND version = ?2",
143| 0| )?;
144| |
145| 0| stmt.query_row(params![memory_id, target_version], |r| {
146| 0| Ok((r.get(0)?, r.get(1)?, r.get(2)?, r.get(3)?, r.get(4)?))
147| 0| })
148| 0| .map_err(|_| AppError::NotFound(errors_msg::version_not_found(target_version, &name)))?
149| | };
150| |
151| 0| let (_old_name, old_type, old_description, old_body, old_metadata) = version_row;
152| |
153| | // Read current FTS-indexed values before the UPDATE so sync_fts_after_update
154| | // can issue the correct DELETE command for the external-content FTS5 table.
155| 0| let (cur_name, cur_desc, cur_body): (String, String, String) = conn.query_row(
156| 0| "SELECT name, description, body FROM memories WHERE id = ?1",
157| 0| params![memory_id],
158| 0| |r| Ok((r.get(0)?, r.get(1)?, r.get(2)?)),
159| 0| )?;
160| |
161| | // v1.0.21 P1-D: re-embed restored body to keep `vec_memories` synchronized
162| | // with `memories`. Without this, semantic queries used the post-forget version
163| | // vector, causing inconsistent recall (vec_memories=2 vs memories=3 after forget+restore).
164| 0| output::emit_progress_i18n(
165| 0| "Re-computing embedding for restored memory...",
166| 0| crate::i18n::validation::runtime_pt::restore_recomputing_embedding(),
167| | );
168| 0| let embedding = crate::daemon::embed_passage_or_local(&paths.models, &old_body)?;
169| 0| let snippet: String = old_body.chars().take(300).collect();
170| |
171| 0| let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
172| |
173| | // deleted_at = NULL reactivates soft-deleted memories; no deleted_at filter in the WHERE
174| 0| let affected = if let Some(ts) = args.expected_updated_at {
175| 0| tx.execute(
176| 0| "UPDATE memories SET type=?2, description=?3, body=?4, body_hash=?5, deleted_at=NULL
177| 0| WHERE id=?1 AND updated_at=?6",
178| 0| rusqlite::params![
179| 0| memory_id,
180| 0| old_type,
181| 0| old_description,
182| 0| old_body,
183| 0| blake3::hash(old_body.as_bytes()).to_hex().to_string(),
184| 0| ts
185| 0| ],
186| 0| )?
187| | } else {
188| 0| tx.execute(
189| 0| "UPDATE memories SET type=?2, description=?3, body=?4, body_hash=?5, deleted_at=NULL
190| 0| WHERE id=?1",
191| 0| rusqlite::params![
192| 0| memory_id,
193| 0| old_type,
194| 0| old_description,
195| 0| old_body,
196| 0| blake3::hash(old_body.as_bytes()).to_hex().to_string()
197| 0| ],
198| 0| )?
199| | };
200| |
201| 0| if affected == 0 {
202| 0| return Err(AppError::Conflict(errors_msg::concurrent_process_conflict()));
203| 0| }
204| |
205| 0| let next_v = versions::next_version(&tx, memory_id)?;
206| |
207| 0| versions::insert_version(
208| 0| &tx,
209| 0| memory_id,
210| 0| next_v,
211| 0| &cur_name,
212| 0| &old_type,
213| 0| &old_description,
214| 0| &old_body,
215| 0| &old_metadata,
216| 0| None,
217| 0| "restore",
218| 0| )?;
219| |
220| 0| memories::upsert_vec(
221| 0| &tx, memory_id, &namespace, &old_type, &embedding, &cur_name, &snippet,
222| 0| )?;
223| |
224| 0| memories::sync_fts_after_update(
225| 0| &tx,
226| 0| memory_id,
227| 0| &cur_name,
228| 0| &cur_desc,
229| 0| &cur_body,
230| 0| &cur_name,
231| 0| &old_description,
232| 0| &old_body,
233| 0| )?;
234| |
235| 0| tx.commit()?;
236| |
237| 0| conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
238| |
239| 0| output::emit_json(&RestoreResponse {
240| 0| action: "restored".to_string(),
241| 0| memory_id,
242| 0| name: cur_name.clone(),
243| 0| version: next_v,
244| 0| restored_from: target_version,
245| 0| elapsed_ms: start.elapsed().as_millis() as u64,
246| 0| })?;
247| |
248| 0| Ok(())
249| 0|}
250| |
#[cfg(test)]
mod tests {
    use super::RestoreResponse;
    use crate::errors::AppError;

    #[test]
    fn optimistic_lock_conflict_returns_exit_3() {
        let message = "optimistic lock conflict: expected updated_at=50, but current is 99";
        let conflict = AppError::Conflict(message.to_string());
        assert_eq!(conflict.exit_code(), 3);
        assert!(conflict.to_string().contains("conflict"));
    }

    #[test]
    fn restore_response_includes_action_field() {
        let payload = RestoreResponse {
            action: "restored".to_string(),
            memory_id: 1,
            name: "test-mem".to_string(),
            version: 3,
            restored_from: 2,
            elapsed_ms: 42,
        };
        let value = serde_json::to_value(&payload).expect("serialization failed");
        assert_eq!(value["action"], "restored");
        assert_eq!(value["memory_id"], 1);
        assert_eq!(value["version"], 3);
        assert_eq!(value["restored_from"], 2);
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/stats.rs:
1| |//! Handler for the `stats` CLI subcommand.
2| |
3| |use crate::errors::AppError;
4| |use crate::output;
5| |use crate::paths::AppPaths;
6| |use crate::storage::connection::open_ro;
7| |use serde::Serialize;
8| |
9| |#[derive(clap::Args)]
10| |#[command(after_long_help = "EXAMPLES:\n \
11| | # Show database statistics (memory counts, sizes, namespace breakdown)\n \
12| | sqlite-graphrag stats\n\n \
13| | # Stats for a database at a custom path\n \
14| | sqlite-graphrag stats --db /path/to/graphrag.sqlite\n\n \
15| | # Use SQLITE_GRAPHRAG_DB_PATH env var\n \
16| | SQLITE_GRAPHRAG_DB_PATH=/data/graphrag.sqlite sqlite-graphrag stats")]
17| |pub struct StatsArgs {
18| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
19| | pub db: Option<String>,
20| | /// Explicit JSON flag. Accepted as a no-op because output is already JSON by default.
21| | #[arg(long, default_value_t = false)]
22| | pub json: bool,
23| | /// Output format: `json` or `text`. JSON is always emitted on stdout regardless of the value.
24| | #[arg(long, value_parser = ["json", "text"], hide = true)]
25| | pub format: Option<String>,
26| |}
27| |
28| |#[derive(Serialize)]
29| |struct StatsResponse {
30| | memories: i64,
31| | /// Alias of `memories` for the documented contract in SKILL.md.
32| | memories_total: i64,
33| | entities: i64,
34| | /// Alias of `entities` for the documented contract.
35| | entities_total: i64,
36| | relationships: i64,
37| | /// Alias of `relationships` for the documented contract.
38| | relationships_total: i64,
39| | /// Semantic alias of `relationships` per the contract in SKILL.md.
40| | edges: i64,
41| | /// Total indexed chunks (one row per chunk in `memory_chunks`).
42| | chunks_total: i64,
43| | /// Average length of the body field in active (non-deleted) memories.
44| | avg_body_len: f64,
45| | namespaces: Vec<String>,
46| | db_size_bytes: u64,
47| | /// Semantic alias of `db_size_bytes` for the documented contract.
48| | db_bytes: u64,
49| | /// Latest applied migration number from `refinery_schema_history`.
50| | /// Emitted as a JSON number for cross-command consistency with `health` (since v1.0.35).
51| | /// Returns `0` when the database has no recorded migrations yet.
52| | schema_version: u32,
53| | /// Total execution time in milliseconds from handler start to serialisation.
54| | elapsed_ms: u64,
55| |}
56| |
57| 0|pub fn run(args: StatsArgs) -> Result<(), AppError> {
58| 0| let start = std::time::Instant::now();
59| 0| let _ = args.json; // --json is a no-op because output is already JSON by default
60| 0| let _ = args.format; // --format is a no-op; JSON is always emitted on stdout
61| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
62| |
63| 0| crate::storage::connection::ensure_db_ready(&paths)?;
64| |
65| 0| let conn = open_ro(&paths.db)?;
66| |
67| 0| let memories: i64 = conn.query_row(
68| 0| "SELECT COUNT(*) FROM memories WHERE deleted_at IS NULL",
69| 0| [],
70| 0| |r| r.get(0),
71| 0| )?;
72| 0| let entities: i64 = conn.query_row("SELECT COUNT(*) FROM entities", [], |r| r.get(0))?;
73| 0| let relationships: i64 =
74| 0| conn.query_row("SELECT COUNT(*) FROM relationships", [], |r| r.get(0))?;
75| |
76| 0| let mut stmt = conn.prepare_cached(
77| 0| "SELECT DISTINCT namespace FROM memories WHERE deleted_at IS NULL ORDER BY namespace",
78| 0| )?;
79| 0| let namespaces: Vec<String> = stmt
80| 0| .query_map([], |r| r.get(0))?
81| 0| .collect::<Result<Vec<_>, _>>()?;
82| |
83| 0| let schema_version: u32 = conn
84| 0| .query_row(
85| 0| "SELECT MAX(version) FROM refinery_schema_history",
86| 0| [],
87| 0| |row| row.get::<_, Option<i64>>(0),
88| | )
89| 0| .ok()
90| 0| .flatten()
91| 0| .map(|v| v.max(0) as u32)
92| 0| .unwrap_or(0);
93| |
94| 0| let db_size_bytes = std::fs::metadata(&paths.db).map(|m| m.len()).unwrap_or(0);
95| |
96| | // v1.0.21 P1-C: query uses the (correct) `memory_chunks` table.
97| | // If the table does not exist (legacy pre-chunking DB), the error is "no such table"
98| | // and the fallback returns 0. Other errors are logged via tracing for audit.
99| 0| let chunks_total: i64 = match conn.query_row("SELECT COUNT(*) FROM memory_chunks", [], |r| {
100| 0| r.get::<_, i64>(0)
101| 0| }) {
102| 0| Ok(n) => n,
103| 0| Err(rusqlite::Error::SqliteFailure(_, Some(msg))) if msg.contains("no such table") => 0,
104| 0| Err(e) => {
105| 0| tracing::warn!(target: "stats", error = %e, "memory_chunks count failed");
106| 0| 0
107| | }
108| | };
109| |
110| 0| let avg_body_len: f64 = conn
111| 0| .query_row(
112| 0| "SELECT COALESCE(AVG(LENGTH(body)), 0.0) FROM memories WHERE deleted_at IS NULL",
113| 0| [],
114| 0| |r| r.get(0),
115| | )
116| 0| .unwrap_or(0.0);
117| |
118| 0| output::emit_json(&StatsResponse {
119| 0| memories,
120| 0| memories_total: memories,
121| 0| entities,
122| 0| entities_total: entities,
123| 0| relationships,
124| 0| relationships_total: relationships,
125| 0| edges: relationships,
126| 0| chunks_total,
127| 0| avg_body_len,
128| 0| namespaces,
129| 0| db_size_bytes,
130| 0| db_bytes: db_size_bytes,
131| 0| schema_version,
132| 0| elapsed_ms: start.elapsed().as_millis() as u64,
133| 0| })?;
134| |
135| 0| Ok(())
136| 0|}
137| |
#[cfg(test)]
mod tests {
    use super::*;

    /// Baseline response with every counter at zero and no namespaces.
    /// Tests override only the fields they care about via struct-update syntax.
    fn zeroed() -> StatsResponse {
        StatsResponse {
            memories: 0,
            memories_total: 0,
            entities: 0,
            entities_total: 0,
            relationships: 0,
            relationships_total: 0,
            edges: 0,
            chunks_total: 0,
            avg_body_len: 0.0,
            namespaces: vec![],
            db_size_bytes: 0,
            db_bytes: 0,
            schema_version: 0,
            elapsed_ms: 0,
        }
    }

    /// Serializes a response to a `serde_json::Value`, panicking on failure.
    fn to_json(resp: &StatsResponse) -> serde_json::Value {
        serde_json::to_value(resp).expect("serialization failed")
    }

    #[test]
    fn stats_response_serializes_all_fields() {
        let value = to_json(&StatsResponse {
            memories: 10,
            memories_total: 10,
            entities: 5,
            entities_total: 5,
            relationships: 3,
            relationships_total: 3,
            edges: 3,
            chunks_total: 20,
            avg_body_len: 42.5,
            namespaces: vec!["global".to_string(), "project".to_string()],
            db_size_bytes: 8192,
            db_bytes: 8192,
            schema_version: 6,
            elapsed_ms: 7,
        });
        assert_eq!(value["memories"], 10);
        assert_eq!(value["memories_total"], 10);
        assert_eq!(value["entities"], 5);
        assert_eq!(value["entities_total"], 5);
        assert_eq!(value["relationships"], 3);
        assert_eq!(value["relationships_total"], 3);
        assert_eq!(value["edges"], 3);
        assert_eq!(value["chunks_total"], 20);
        assert_eq!(value["db_size_bytes"], 8192u64);
        assert_eq!(value["db_bytes"], 8192u64);
        assert_eq!(value["schema_version"], 6);
        assert_eq!(value["elapsed_ms"], 7u64);
    }

    #[test]
    fn stats_response_namespaces_is_string_array() {
        let value = to_json(&StatsResponse {
            namespaces: vec!["ns1".to_string(), "ns2".to_string(), "ns3".to_string()],
            ..zeroed()
        });
        let list = value["namespaces"]
            .as_array()
            .expect("namespaces must be array");
        assert_eq!(list.len(), 3);
        assert_eq!(list[0], "ns1");
        assert_eq!(list[1], "ns2");
        assert_eq!(list[2], "ns3");
    }

    #[test]
    fn stats_response_namespaces_empty_serializes_empty_array() {
        let value = to_json(&zeroed());
        let list = value["namespaces"]
            .as_array()
            .expect("namespaces must be array");
        assert!(list.is_empty(), "empty namespaces must serialize as []");
    }

    #[test]
    fn stats_response_aliases_memories_total_and_memories_equal() {
        let value = to_json(&StatsResponse {
            memories: 42,
            memories_total: 42,
            entities: 7,
            entities_total: 7,
            relationships: 2,
            relationships_total: 2,
            edges: 2,
            schema_version: 6,
            ..zeroed()
        });
        assert_eq!(value["memories"], value["memories_total"]);
        assert_eq!(value["entities"], value["entities_total"]);
        assert_eq!(value["relationships"], value["relationships_total"]);
        assert_eq!(value["relationships"], value["edges"]);
        assert_eq!(value["db_size_bytes"], value["db_bytes"]);
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/sync_safe_copy.rs:
1| |//! Handler for the `sync-safe-copy` CLI subcommand.
2| |
3| |use crate::errors::AppError;
4| |use crate::i18n::validation;
5| |use crate::output;
6| |use crate::paths::AppPaths;
7| |use crate::storage::connection::open_rw;
8| |use serde::Serialize;
9| |
10| |#[derive(clap::Args)]
11| |#[command(after_long_help = "EXAMPLES:\n \
12| | # Create a checkpointed snapshot safe for cloud sync\n \
13| | sqlite-graphrag sync-safe-copy --dest /backup/graphrag-snapshot.sqlite\n\n \
14| | # Use the --to alias\n \
15| | sqlite-graphrag sync-safe-copy --to /backup/graphrag-snapshot.sqlite\n\n \
16| | # Snapshot a custom source database\n \
17| | sqlite-graphrag sync-safe-copy --db /data/graphrag.sqlite --dest /backup/snapshot.sqlite")]
18| |pub struct SyncSafeCopyArgs {
19| | /// Snapshot destination path as a positional argument. Alternative to `--dest`.
20| | #[arg(
21| | value_name = "DEST",
22| | conflicts_with = "dest",
23| | help = "Snapshot destination path; alternative to --dest"
24| | )]
25| | pub dest_positional: Option<std::path::PathBuf>,
26| | /// Snapshot destination path. Also accepts the aliases `--to` and `--output`.
27| | #[arg(long, alias = "to", alias = "output")]
28| | pub dest: Option<std::path::PathBuf>,
29| | #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
30| | pub json: bool,
31| | /// Output format: `json` or `text`. JSON is always emitted on stdout regardless of the value.
32| | #[arg(long, value_parser = ["json", "text"], hide = true)]
33| | pub format: Option<String>,
34| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
35| | pub db: Option<String>,
36| |}
37| |
38| |#[derive(Serialize)]
39| |struct SyncSafeCopyResponse {
40| | source_db_path: String,
41| | dest_path: String,
42| | bytes_copied: u64,
43| | status: String,
44| | /// Total execution time in milliseconds from handler start to serialisation.
45| | elapsed_ms: u64,
46| |}
47| |
48| 0|pub fn run(args: SyncSafeCopyArgs) -> Result<(), AppError> {
49| 0| let start = std::time::Instant::now();
50| 0| let _ = args.format; // --format is a no-op; JSON is always emitted on stdout
51| 0| let dest = args
52| 0| .dest_positional
53| 0| .clone()
54| 0| .or_else(|| args.dest.clone())
55| 0| .ok_or_else(|| {
56| 0| AppError::Validation(
57| 0| "destination required: pass as positional argument or via --dest/--to/--output"
58| 0| .to_string(),
59| 0| )
60| 0| })?;
61| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
62| |
63| 0| crate::storage::connection::ensure_db_ready(&paths)?;
64| |
65| 0| if dest == paths.db {
66| 0| return Err(AppError::Validation(
67| 0| validation::sync_destination_equals_source(),
68| 0| ));
69| 0| }
70| |
71| 0| if let Some(parent) = dest.parent() {
72| 0| std::fs::create_dir_all(parent)?;
73| 0| }
74| |
75| 0| let conn = open_rw(&paths.db)?;
76| 0| conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
77| 0| drop(conn);
78| |
79| 0| let bytes_copied = std::fs::copy(&paths.db, &dest)?;
80| |
81| | // Applies 0600 permissions on the snapshot on Unix to avoid leakage on Dropbox/shared NFS.
82| | // On Windows, NTFS DACL default is private-to-user; no explicit permission setter required.
83| | #[cfg(unix)]
84| | {
85| | use std::os::unix::fs::PermissionsExt;
86| 0| let mut perms = std::fs::metadata(&dest)?.permissions();
87| 0| perms.set_mode(0o600);
88| 0| std::fs::set_permissions(&dest, perms)?;
89| | }
90| | #[cfg(windows)]
91| | {
92| | tracing::debug!(target: "sync_safe_copy",
93| | path = %dest.display(),
94| | "skipping Unix mode 0o600 on Windows; NTFS DACL default is private-to-user"
95| | );
96| | }
97| |
98| 0| output::emit_json(&SyncSafeCopyResponse {
99| 0| source_db_path: paths.db.display().to_string(),
100| 0| dest_path: dest.display().to_string(),
101| 0| bytes_copied,
102| 0| status: "ok".to_string(),
103| 0| elapsed_ms: start.elapsed().as_millis() as u64,
104| 0| })?;
105| |
106| 0| Ok(())
107| 0|}
108| |
109| |#[cfg(test)]
110| |mod tests {
111| | use super::*;
112| |
113| | #[test]
114| 1| fn sync_safe_copy_response_serializes_all_fields() {
115| 1| let resp = SyncSafeCopyResponse {
116| 1| source_db_path: "/home/user/.local/share/sqlite-graphrag/db.sqlite".to_string(),
117| 1| dest_path: "/tmp/backup.sqlite".to_string(),
118| 1| bytes_copied: 16384,
119| 1| status: "ok".to_string(),
120| 1| elapsed_ms: 12,
121| 1| };
122| 1| let json = serde_json::to_value(&resp).expect("serialization failed");
123| 1| assert_eq!(
124| 1| json["source_db_path"],
125| | "/home/user/.local/share/sqlite-graphrag/db.sqlite"
126| | );
127| 1| assert_eq!(json["dest_path"], "/tmp/backup.sqlite");
128| 1| assert_eq!(json["bytes_copied"], 16384u64);
129| 1| assert_eq!(json["status"], "ok");
130| 1| assert_eq!(json["elapsed_ms"], 12u64);
131| 1| }
132| |
133| | #[test]
134| 1| fn sync_safe_copy_rejects_dest_equal_to_source() {
135| 1| let db_path = std::path::PathBuf::from("/tmp/same.sqlite");
136| 1| let args = SyncSafeCopyArgs {
137| 1| dest_positional: None,
138| 1| dest: Some(db_path.clone()),
139| 1| json: false,
140| 1| format: None,
141| 1| db: Some("/tmp/same.sqlite".to_string()),
142| 1| };
143| | // Simulates manual path resolution — validates rejection logic
144| 1| let resolved_dest = args
145| 1| .dest_positional
146| 1| .clone()
147| 1| .or_else(|| args.dest.clone())
148| 1| .expect("test must pass dest");
149| 1| let result = if resolved_dest == std::path::PathBuf::from(args.db.as_deref().unwrap_or(""))
150| | {
151| 1| Err(AppError::Validation(
152| 1| "destination path must differ from the source database path".to_string(),
153| 1| ))
154| | } else {
155| 0| Ok(())
156| | };
157| 1| assert!(result.is_err(), "must reject dest equal to source");
^0
158| 1| if let Err(AppError::Validation(msg)) = result {
159| 1| assert!(msg.contains("destination path must differ"));
160| 0| }
161| 1| }
162| |
163| | #[test]
164| 1| fn sync_safe_copy_response_status_ok() {
165| 1| let resp = SyncSafeCopyResponse {
166| 1| source_db_path: "/data/db.sqlite".to_string(),
167| 1| dest_path: "/backup/db.sqlite".to_string(),
168| 1| bytes_copied: 0,
169| 1| status: "ok".to_string(),
170| 1| elapsed_ms: 0,
171| 1| };
172| 1| let json = serde_json::to_value(&resp).expect("serialization failed");
173| 1| assert_eq!(json["status"], "ok");
174| 1| }
175| |
176| | #[test]
177| 1| fn sync_safe_copy_response_bytes_copied_zero_valid() {
178| 1| let resp = SyncSafeCopyResponse {
179| 1| source_db_path: "/data/db.sqlite".to_string(),
180| 1| dest_path: "/backup/db.sqlite".to_string(),
181| 1| bytes_copied: 0,
182| 1| status: "ok".to_string(),
183| 1| elapsed_ms: 1,
184| 1| };
185| 1| let json = serde_json::to_value(&resp).expect("serialization failed");
186| 1| assert_eq!(json["bytes_copied"], 0u64);
187| 1| assert_eq!(json["elapsed_ms"], 1u64);
188| 1| }
189| |}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/unlink.rs:
1| |//! Handler for the `unlink` CLI subcommand.
2| |
3| |use crate::errors::AppError;
4| |use crate::i18n::errors_msg;
5| |use crate::output::{self, OutputFormat};
6| |use crate::paths::AppPaths;
7| |use crate::storage::connection::open_rw;
8| |use crate::storage::entities;
9| |use serde::Serialize;
10| |
11| |#[derive(clap::Args)]
12| |#[command(after_long_help = "EXAMPLES:\n \
13| | # Remove a specific relationship between two entities\n \
14| | sqlite-graphrag unlink --from oauth-flow --to refresh-tokens --relation related\n\n \
15| | # Remove ALL relationships between two entities (any relation type)\n \
16| | sqlite-graphrag unlink --from oauth-flow --to refresh-tokens\n\n \
17| | # Remove ALL relationships where an entity is source or target\n \
18| | sqlite-graphrag unlink --entity oauth-flow --all\n\n \
19| |NOTE:\n \
20| | --from and --to expect ENTITY names (graph nodes), not memory names.\n \
21| | To inspect current entities and relationships, run: sqlite-graphrag graph --format json")]
22| |pub struct UnlinkArgs {
23| | /// Source ENTITY name (graph node, not memory). Also accepts the aliases `--source` and `--name`.
24| | /// To list current entities run `graph --format json | jaq '.nodes[].name'`.
25| | #[arg(long, alias = "source", alias = "name", conflicts_with = "entity")]
26| | pub from: Option<String>,
27| | /// Target ENTITY name (graph node, not memory). Also accepts the alias `--target`.
28| | #[arg(long, alias = "target", conflicts_with = "entity")]
29| | pub to: Option<String>,
30| | /// Relation type to remove. When omitted with --from/--to, ALL relationships between
31| | /// those two entities are deleted. Accepts canonical values (e.g. uses, depends-on)
32| | /// or any custom snake_case/kebab-case string.
33| | #[arg(long, value_parser = crate::parsers::parse_relation, value_name = "RELATION")]
34| | pub relation: Option<String>,
35| | /// Entity name for bulk removal. Must be combined with --all.
36| | #[arg(long, requires = "all", conflicts_with_all = ["from", "to"])]
37| | pub entity: Option<String>,
38| | /// When combined with --entity, removes ALL relationships where that entity is source or target.
39| | #[arg(long, requires = "entity")]
40| | pub all: bool,
41| | #[arg(long)]
42| | pub namespace: Option<String>,
43| | #[arg(long, value_enum, default_value = "json")]
44| | pub format: OutputFormat,
45| | #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
46| | pub json: bool,
47| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
48| | pub db: Option<String>,
49| |}
50| |
51| |#[derive(Serialize)]
52| |struct UnlinkResponse {
53| | action: String,
54| | from_name: String,
55| | to_name: String,
56| | relation: String,
57| | relationships_removed: u64,
58| | namespace: String,
59| | /// Total execution time in milliseconds from handler start to serialisation.
60| | elapsed_ms: u64,
61| |}
62| |
63| 0|pub fn run(args: UnlinkArgs) -> Result<(), AppError> {
64| 0| let inicio = std::time::Instant::now();
65| 0| let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
66| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
67| |
68| 0| crate::storage::connection::ensure_db_ready(&paths)?;
69| |
70| 0| if let Some(relation_str) = &args.relation {
71| 0| crate::parsers::warn_if_non_canonical(relation_str);
72| 0| }
73| |
74| 0| let mut conn = open_rw(&paths.db)?;
75| |
76| | // Mode: --entity --all → delete every relationship for that entity.
77| 0| if args.all {
78| 0| let entity_name = args.entity.as_deref().unwrap_or("");
79| 0| let entity_id =
80| 0| entities::find_entity_id(&conn, &namespace, entity_name)?.ok_or_else(|| {
81| 0| AppError::NotFound(errors_msg::entity_not_found(entity_name, &namespace))
82| 0| })?;
83| |
84| 0| let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
85| 0| let removed = delete_all_entity_relationships(&tx, entity_id)?;
86| 0| entities::recalculate_degree(&tx, entity_id)?;
87| 0| tx.commit()?;
88| |
89| 0| conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
90| |
91| 0| let response = UnlinkResponse {
92| 0| action: "deleted".to_string(),
93| 0| from_name: entity_name.to_string(),
94| 0| to_name: "*".to_string(),
95| 0| relation: "*".to_string(),
96| 0| relationships_removed: removed,
97| 0| namespace: namespace.clone(),
98| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
99| 0| };
100| |
101| 0| match args.format {
102| 0| OutputFormat::Json => output::emit_json(&response)?,
103| 0| OutputFormat::Text | OutputFormat::Markdown => {
104| 0| output::emit_text(&format!(
105| 0| "deleted: {} --[*]--> * removed {} relationship(s) [{}]",
106| 0| response.from_name, response.relationships_removed, response.namespace
107| 0| ));
108| 0| }
109| | }
110| 0| return Ok(());
111| 0| }
112| |
113| | // Mode: --from/--to (with optional --relation).
114| 0| let from_name = args.from.as_deref().ok_or_else(|| {
115| 0| AppError::Validation("--from is required when --entity/--all is not used".to_string())
116| 0| })?;
117| 0| let to_name = args.to.as_deref().ok_or_else(|| {
118| 0| AppError::Validation("--to is required when --entity/--all is not used".to_string())
119| 0| })?;
120| |
121| 0| let source_id = entities::find_entity_id(&conn, &namespace, from_name)?
122| 0| .ok_or_else(|| AppError::NotFound(errors_msg::entity_not_found(from_name, &namespace)))?;
123| 0| let target_id = entities::find_entity_id(&conn, &namespace, to_name)?
124| 0| .ok_or_else(|| AppError::NotFound(errors_msg::entity_not_found(to_name, &namespace)))?;
125| |
126| 0| let (removed, relation_display) = if let Some(rel) = args.relation.as_deref() {
127| | // Single-relation mode: exact match required.
128| 0| let row =
129| 0| entities::find_relationship(&conn, source_id, target_id, rel)?.ok_or_else(|| {
130| 0| AppError::NotFound(errors_msg::relationship_not_found(
131| 0| from_name, rel, to_name, &namespace,
132| 0| ))
133| 0| })?;
134| |
135| 0| let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
136| 0| entities::delete_relationship_by_id(&tx, row.id)?;
137| 0| entities::recalculate_degree(&tx, source_id)?;
138| 0| entities::recalculate_degree(&tx, target_id)?;
139| 0| tx.commit()?;
140| |
141| 0| (1u64, rel.to_string())
142| | } else {
143| | // Bulk mode: delete all relationships between from and to.
144| 0| let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
145| 0| let count = delete_relationships_between(&tx, source_id, target_id)?;
146| 0| entities::recalculate_degree(&tx, source_id)?;
147| 0| entities::recalculate_degree(&tx, target_id)?;
148| 0| tx.commit()?;
149| |
150| 0| (count, "*".to_string())
151| | };
152| |
153| 0| conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
154| |
155| 0| let response = UnlinkResponse {
156| 0| action: "deleted".to_string(),
157| 0| from_name: from_name.to_string(),
158| 0| to_name: to_name.to_string(),
159| 0| relation: relation_display.clone(),
160| 0| relationships_removed: removed,
161| 0| namespace: namespace.clone(),
162| 0| elapsed_ms: inicio.elapsed().as_millis() as u64,
163| 0| };
164| |
165| 0| match args.format {
166| 0| OutputFormat::Json => output::emit_json(&response)?,
167| 0| OutputFormat::Text | OutputFormat::Markdown => {
168| 0| output::emit_text(&format!(
169| 0| "deleted: {} --[{}]--> {} removed {} relationship(s) [{}]",
170| 0| response.from_name,
171| 0| response.relation,
172| 0| response.to_name,
173| 0| response.relationships_removed,
174| 0| response.namespace
175| 0| ));
176| 0| }
177| | }
178| |
179| 0| Ok(())
180| 0|}
181| |
182| |/// Deletes all relationships where `entity_id` is source or target.
183| |/// Returns the number of rows removed.
184| 0|fn delete_all_entity_relationships(
185| 0| conn: &rusqlite::Connection,
186| 0| entity_id: i64,
187| 0|) -> Result<u64, AppError> {
188| | // Collect IDs first to clean up memory_relationships junction.
189| 0| let mut stmt =
190| 0| conn.prepare_cached("SELECT id FROM relationships WHERE source_id = ?1 OR target_id = ?1")?;
191| 0| let ids: Vec<i64> = stmt
192| 0| .query_map(rusqlite::params![entity_id], |r| r.get(0))?
193| 0| .collect::<rusqlite::Result<Vec<_>>>()?;
194| |
195| 0| let count = ids.len() as u64;
196| 0| for rel_id in ids {
197| 0| conn.execute(
198| 0| "DELETE FROM memory_relationships WHERE relationship_id = ?1",
199| 0| rusqlite::params![rel_id],
200| 0| )?;
201| 0| conn.execute(
202| 0| "DELETE FROM relationships WHERE id = ?1",
203| 0| rusqlite::params![rel_id],
204| 0| )?;
205| | }
206| 0| Ok(count)
207| 0|}
208| |
209| |/// Deletes all relationships between `source_id` and `target_id` (any relation type).
210| |/// Returns the number of rows removed.
211| 0|fn delete_relationships_between(
212| 0| conn: &rusqlite::Connection,
213| 0| source_id: i64,
214| 0| target_id: i64,
215| 0|) -> Result<u64, AppError> {
216| 0| let mut stmt = conn
217| 0| .prepare_cached("SELECT id FROM relationships WHERE source_id = ?1 AND target_id = ?2")?;
218| 0| let ids: Vec<i64> = stmt
219| 0| .query_map(rusqlite::params![source_id, target_id], |r| r.get(0))?
220| 0| .collect::<rusqlite::Result<Vec<_>>>()?;
221| |
222| 0| let count = ids.len() as u64;
223| 0| for rel_id in ids {
224| 0| conn.execute(
225| 0| "DELETE FROM memory_relationships WHERE relationship_id = ?1",
226| 0| rusqlite::params![rel_id],
227| 0| )?;
228| 0| conn.execute(
229| 0| "DELETE FROM relationships WHERE id = ?1",
230| 0| rusqlite::params![rel_id],
231| 0| )?;
232| | }
233| 0| Ok(count)
234| 0|}
235| |
236| |#[cfg(test)]
237| |mod tests {
238| | use super::*;
239| |
240| | #[test]
241| 1| fn unlink_response_serializes_all_fields() {
242| 1| let resp = UnlinkResponse {
243| 1| action: "deleted".to_string(),
244| 1| from_name: "entity-a".to_string(),
245| 1| to_name: "entity-b".to_string(),
246| 1| relation: "uses".to_string(),
247| 1| relationships_removed: 1,
248| 1| namespace: "global".to_string(),
249| 1| elapsed_ms: 5,
250| 1| };
251| 1| let json = serde_json::to_value(&resp).expect("serialization failed");
252| 1| assert_eq!(json["action"], "deleted");
253| 1| assert_eq!(json["from_name"], "entity-a");
254| 1| assert_eq!(json["to_name"], "entity-b");
255| 1| assert_eq!(json["relation"], "uses");
256| 1| assert_eq!(json["relationships_removed"], 1u64);
257| 1| assert_eq!(json["namespace"], "global");
258| 1| assert_eq!(json["elapsed_ms"], 5u64);
259| 1| }
260| |
261| | #[test]
262| 1| fn unlink_response_action_must_be_deleted() {
263| 1| let resp = UnlinkResponse {
264| 1| action: "deleted".to_string(),
265| 1| from_name: "a".to_string(),
266| 1| to_name: "b".to_string(),
267| 1| relation: "related".to_string(),
268| 1| relationships_removed: 1,
269| 1| namespace: "global".to_string(),
270| 1| elapsed_ms: 0,
271| 1| };
272| 1| let json = serde_json::to_value(&resp).expect("serialization failed");
273| 1| assert_eq!(
274| 1| json["action"], "deleted",
275| 0| "unlink action must always be 'deleted'"
276| | );
277| 1| }
278| |
279| | #[test]
280| 1| fn unlink_response_bulk_uses_wildcard_relation() {
281| 1| let resp = UnlinkResponse {
282| 1| action: "deleted".to_string(),
283| 1| from_name: "origin".to_string(),
284| 1| to_name: "destination".to_string(),
285| 1| relation: "*".to_string(),
286| 1| relationships_removed: 3,
287| 1| namespace: "project".to_string(),
288| 1| elapsed_ms: 3,
289| 1| };
290| 1| let json = serde_json::to_value(&resp).expect("serialization failed");
291| 1| assert_eq!(json["relation"], "*");
292| 1| assert_eq!(json["relationships_removed"], 3u64);
293| 1| }
294| |
295| | #[test]
296| 1| fn unlink_response_entity_all_uses_wildcard_to() {
297| 1| let resp = UnlinkResponse {
298| 1| action: "deleted".to_string(),
299| 1| from_name: "oauth-flow".to_string(),
300| 1| to_name: "*".to_string(),
301| 1| relation: "*".to_string(),
302| 1| relationships_removed: 5,
303| 1| namespace: "global".to_string(),
304| 1| elapsed_ms: 2,
305| 1| };
306| 1| let json = serde_json::to_value(&resp).expect("serialization failed");
307| 1| assert_eq!(json["to_name"], "*");
308| 1| assert_eq!(json["relation"], "*");
309| 1| assert_eq!(json["relationships_removed"], 5u64);
310| 1| }
311| |
312| | #[test]
313| 1| fn unlink_response_relationships_removed_field_present() {
314| 1| let resp = UnlinkResponse {
315| 1| action: "deleted".to_string(),
316| 1| from_name: "a".to_string(),
317| 1| to_name: "b".to_string(),
318| 1| relation: "uses".to_string(),
319| 1| relationships_removed: 0,
320| 1| namespace: "global".to_string(),
321| 1| elapsed_ms: 0,
322| 1| };
323| 1| let json = serde_json::to_value(&resp).expect("serialization failed");
324| 1| assert!(
325| 1| json.get("relationships_removed").is_some(),
326| 0| "relationships_removed field must be present"
327| | );
328| 1| }
329| |}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/vacuum.rs:
1| |//! Handler for the `vacuum` CLI subcommand.
2| |
3| |use crate::errors::AppError;
4| |use crate::output;
5| |use crate::output::JsonOutputFormat;
6| |use crate::paths::AppPaths;
7| |use crate::storage::connection::open_rw;
8| |use serde::Serialize;
9| |
10| |#[derive(clap::Args)]
11| |#[command(after_long_help = "EXAMPLES:\n \
12| | # Run VACUUM after WAL checkpoint (default)\n \
13| | sqlite-graphrag vacuum\n\n \
14| | # Vacuum a database at a custom path\n \
15| | sqlite-graphrag vacuum --db /path/to/graphrag.sqlite\n\n \
16| | # Vacuum via SQLITE_GRAPHRAG_DB_PATH env var\n \
17| | SQLITE_GRAPHRAG_DB_PATH=/data/graphrag.sqlite sqlite-graphrag vacuum\n\n\
18| |NOTE:\n \
19| | reclaimed_bytes may report 0 even after `purge` if removed memories did not\n \
20| | span entire SQLite pages (page size = 4 KB). Run `vacuum` regularly only on\n \
21| | large databases (> 10 MB) for measurable gains.")]
22| |pub struct VacuumArgs {
23| | #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
24| | pub json: bool,
25| | /// Run a WAL checkpoint before and after `VACUUM`.
26| | #[arg(long, default_value_t = true)]
27| | pub checkpoint: bool,
28| | /// Output format.
29| | #[arg(long, value_enum, default_value_t = JsonOutputFormat::Json)]
30| | pub format: JsonOutputFormat,
31| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
32| | pub db: Option<String>,
33| |}
34| |
35| |#[derive(Serialize)]
36| |struct VacuumResponse {
37| | db_path: String,
38| | size_before_bytes: u64,
39| | size_after_bytes: u64,
40| | /// Bytes reclaimed by VACUUM (size_before_bytes - size_after_bytes), saturating to zero.
41| | /// Derived field added in v1.0.34 so callers do not have to compute the delta themselves.
42| | reclaimed_bytes: u64,
43| | status: String,
44| | /// Total execution time in milliseconds from handler start to serialisation.
45| | elapsed_ms: u64,
46| |}
47| |
48| 0|pub fn run(args: VacuumArgs) -> Result<(), AppError> {
49| 0| let start = std::time::Instant::now();
50| 0| let _ = args.format;
51| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
52| |
53| 0| crate::storage::connection::ensure_db_ready(&paths)?;
54| |
55| 0| let size_before_bytes = std::fs::metadata(&paths.db)
56| 0| .map(|meta| meta.len())
57| 0| .unwrap_or(0);
58| 0| let conn = open_rw(&paths.db)?;
59| 0| if args.checkpoint {
60| 0| conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
61| 0| }
62| 0| conn.execute_batch("VACUUM;")?;
63| 0| if args.checkpoint {
64| 0| conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);")?;
65| 0| }
66| 0| drop(conn);
67| 0| let size_after_bytes = std::fs::metadata(&paths.db)
68| 0| .map(|meta| meta.len())
69| 0| .unwrap_or(0);
70| |
71| 0| output::emit_json(&VacuumResponse {
72| 0| db_path: paths.db.display().to_string(),
73| 0| size_before_bytes,
74| 0| size_after_bytes,
75| 0| reclaimed_bytes: size_before_bytes.saturating_sub(size_after_bytes),
76| 0| status: "ok".to_string(),
77| 0| elapsed_ms: start.elapsed().as_millis() as u64,
78| 0| })?;
79| |
80| 0| Ok(())
81| 0|}
82| |
83| |#[cfg(test)]
84| |mod tests {
85| | use super::*;
86| |
87| | #[test]
88| 1| fn vacuum_response_serializes_all_fields() {
89| 1| let resp = VacuumResponse {
90| 1| db_path: "/home/user/.local/share/sqlite-graphrag/db.sqlite".to_string(),
91| 1| size_before_bytes: 32768,
92| 1| size_after_bytes: 16384,
93| 1| reclaimed_bytes: 16384,
94| 1| status: "ok".to_string(),
95| 1| elapsed_ms: 55,
96| 1| };
97| 1| let json = serde_json::to_value(&resp).expect("serialization failed");
98| 1| assert_eq!(
99| 1| json["db_path"],
100| | "/home/user/.local/share/sqlite-graphrag/db.sqlite"
101| | );
102| 1| assert_eq!(json["size_before_bytes"], 32768u64);
103| 1| assert_eq!(json["size_after_bytes"], 16384u64);
104| 1| assert_eq!(json["reclaimed_bytes"], 16384u64);
105| 1| assert_eq!(json["status"], "ok");
106| 1| assert_eq!(json["elapsed_ms"], 55u64);
107| 1| }
108| |
109| | #[test]
110| 1| fn vacuum_response_size_after_less_than_or_equal_to_before() {
111| 1| let resp = VacuumResponse {
112| 1| db_path: "/data/db.sqlite".to_string(),
113| 1| size_before_bytes: 65536,
114| 1| size_after_bytes: 32768,
115| 1| reclaimed_bytes: 32768,
116| 1| status: "ok".to_string(),
117| 1| elapsed_ms: 100,
118| 1| };
119| 1| let json = serde_json::to_value(&resp).expect("serialization failed");
120| 1| let before = json["size_before_bytes"].as_u64().unwrap();
121| 1| let after = json["size_after_bytes"].as_u64().unwrap();
122| 1| let reclaimed = json["reclaimed_bytes"].as_u64().unwrap();
123| 1| assert!(
124| 1| after <= before,
125| 0| "size_after_bytes must be <= size_before_bytes after VACUUM"
126| | );
127| 1| assert_eq!(
128| | reclaimed,
129| 1| before - after,
130| 0| "reclaimed_bytes must equal size_before_bytes - size_after_bytes"
131| | );
132| 1| }
133| |
134| | #[test]
135| 1| fn vacuum_response_status_ok() {
136| 1| let resp = VacuumResponse {
137| 1| db_path: "/data/db.sqlite".to_string(),
138| 1| size_before_bytes: 0,
139| 1| size_after_bytes: 0,
140| 1| reclaimed_bytes: 0,
141| 1| status: "ok".to_string(),
142| 1| elapsed_ms: 0,
143| 1| };
144| 1| let json = serde_json::to_value(&resp).expect("serialization failed");
145| 1| assert_eq!(json["status"], "ok");
146| 1| }
147| |
148| | #[test]
149| 1| fn vacuum_response_elapsed_ms_present_and_non_negative() {
150| 1| let resp = VacuumResponse {
151| 1| db_path: "/data/db.sqlite".to_string(),
152| 1| size_before_bytes: 1024,
153| 1| size_after_bytes: 1024,
154| 1| reclaimed_bytes: 0,
155| 1| status: "ok".to_string(),
156| 1| elapsed_ms: 0,
157| 1| };
158| 1| let json = serde_json::to_value(&resp).expect("serialization failed");
159| 1| assert!(
160| 1| json.get("elapsed_ms").is_some(),
161| 0| "elapsed_ms field must be present"
162| | );
163| 1| assert!(
164| 1| json["elapsed_ms"].as_u64().is_some(),
165| 0| "elapsed_ms must be a non-negative integer"
166| | );
167| 1| }
168| |}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/commands/vec.rs:
1| |//! Handler for the `vec` CLI subcommand family.
2| |//!
3| |//! Provides three maintenance operations for the `vec_memories` virtual
4| |//! table that backs the embedding KNN search:
5| |//!
6| |//! - `orphan-list`: lists `vec_memories` rows whose `memory_id` no longer
7| |//! references a live (non-soft-deleted) memory.
8| |//! - `purge-orphan`: deletes those orphan rows in a single transaction.
9| |//! - `stats`: surfaces total rows, orphan count, and coverage percentage.
10| |//!
11| |//! G39 (v1.0.69): before v1.0.69, the only way to detect a vec-orphan was
12| |//! `health --json` which reported `vec_memories_orphaned > 0` with no
13| |//! remediation path. This module closes the loop.
14| |
15| |use crate::errors::AppError;
16| |use crate::output;
17| |use crate::paths::AppPaths;
18| |use crate::storage::connection::{open_ro, open_rw};
19| |use serde::Serialize;
20| |
21| |/// Arguments for the `vec` subcommand family.
22| |#[derive(clap::Args)]
23| |#[command(
24| | about = "Vector index maintenance (orphan detection, purge, stats)",
25| | after_long_help = "EXAMPLES:\n \
26| | # List orphan vec_memories rows whose memory_id is gone\n \
27| | sqlite-graphrag vec orphan-list\n\n \
28| | # Dry-run the purge (does not delete)\n \
29| | sqlite-graphrag vec purge-orphan --dry-run\n\n \
30| | # Actually purge orphans\n \
31| | sqlite-graphrag vec purge-orphan --yes\n\n \
32| | # Show stats for all vec0 tables\n \
33| | sqlite-graphrag vec stats --json"
34| |)]
35| |pub struct VecArgs {
36| | #[command(subcommand)]
37| | pub command: VecSubcommand,
38| |}
39| |
40| |/// Subcommands nested under `vec`.
41| |#[derive(clap::Subcommand)]
42| |pub enum VecSubcommand {
43| | /// List orphan vec_memories rows.
44| | OrphanList(VecOrphanListArgs),
45| | /// Delete orphan vec_memories rows. Requires `--yes` to confirm.
46| | PurgeOrphan(VecPurgeOrphanArgs),
47| | /// Show statistics for vec_memories, vec_entities, vec_chunks.
48| | Stats(VecStatsArgs),
49| |}
50| |
51| |/// Arguments for `vec orphan-list`.
52| |#[derive(clap::Args)]
53| |pub struct VecOrphanListArgs {
54| | /// No-op; JSON is always emitted on stdout.
55| | #[arg(long, hide = true)]
56| | pub json: bool,
57| | /// Path to the SQLite database file.
58| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
59| | pub db: Option<String>,
60| |}
61| |
62| |/// Arguments for `vec purge-orphan`.
63| |#[derive(clap::Args)]
64| |pub struct VecOrphanListInner {
65| | pub json: bool,
66| | pub db: Option<String>,
67| |}
68| |
69| |/// Arguments for `vec purge-orphan`.
70| |#[derive(clap::Args)]
71| |pub struct VecPurgeOrphanArgs {
72| | /// No-op; JSON is always emitted on stdout.
73| | #[arg(long, hide = true)]
74| | pub json: bool,
75| | /// Path to the SQLite database file.
76| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
77| | pub db: Option<String>,
78| | /// Skip the interactive confirmation; required for automation.
79| | #[arg(long, default_value_t = false)]
80| | pub yes: bool,
81| | /// Report what would be purged without writing.
82| | #[arg(long, default_value_t = false)]
83| | pub dry_run: bool,
84| |}
85| |
86| |/// Arguments for `vec stats`.
87| |#[derive(clap::Args)]
88| |pub struct VecStatsArgs {
89| | /// No-op; JSON is always emitted on stdout.
90| | #[arg(long, hide = true)]
91| | pub json: bool,
92| | /// Path to the SQLite database file.
93| | #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
94| | pub db: Option<String>,
95| |}
96| |
97| |#[derive(Serialize)]
98| |struct VecOrphanListItem {
99| | /// The orphan `memory_id` value stored in `vec_memories`.
100| | memory_id: i64,
101| | /// Hash of the float vector blob, for fingerprinting.
102| | vector_hash: String,
103| | /// When the orphan row was originally inserted.
104| | created_at: i64,
105| |}
106| |
107| |#[derive(Serialize)]
108| |struct VecOrphanListResponse {
109| | action: String,
110| | count: i64,
111| | items: Vec<VecOrphanListItem>,
112| | elapsed_ms: u64,
113| |}
114| |
115| |#[derive(Serialize)]
116| |struct VecPurgeOrphanResponse {
117| | action: String,
118| | deleted: i64,
119| | /// Number of orphan rows in `vec_entities` that were also removed (G39).
120| | deleted_entities: i64,
121| | /// Number of orphan rows in `vec_chunks` that were also removed (G39).
122| | deleted_chunks: i64,
123| | dry_run: bool,
124| | elapsed_ms: u64,
125| |}
126| |
127| |#[derive(Serialize)]
128| |struct VecStatsResponse {
129| | total_rows: i64,
130| | orphaned: i64,
131| | coverage_percent: f64,
132| | #[serde(skip_serializing_if = "Option::is_none")]
133| | vec_entities_rows: Option<i64>,
134| | #[serde(skip_serializing_if = "Option::is_none")]
135| | vec_chunks_rows: Option<i64>,
136| | fts_memories_rows: i64,
137| | elapsed_ms: u64,
138| |}
139| |
140| |/// Dispatch entry point called from `main`.
141| |///
142| |/// # Errors
143| |/// Propagates any [`AppError`] raised by the underlying subcommand.
144| 0|pub fn run(args: VecArgs) -> Result<(), AppError> {
145| 0| match args.command {
146| 0| VecSubcommand::OrphanList(a) => run_orphan_list(a),
147| 0| VecSubcommand::PurgeOrphan(a) => run_purge_orphan(a),
148| 0| VecSubcommand::Stats(a) => run_stats(a),
149| | }
150| 0|}
151| |
152| 0|fn run_orphan_list(args: VecOrphanListArgs) -> Result<(), AppError> {
153| 0| let start = std::time::Instant::now();
154| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
155| 0| crate::storage::connection::ensure_db_ready(&paths)?;
156| 0| let conn = open_ro(&paths.db)?;
157| |
158| | // FTS5-style table existence gate so the command is a no-op on
159| | // databases that were created before vec_memories existed.
160| 0| let table_exists: bool = conn
161| 0| .query_row(
162| 0| "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='vec_memories'",
163| 0| [],
164| 0| |r| r.get::<_, i64>(0).map(|v| v > 0),
165| | )
166| 0| .unwrap_or(false);
167| 0| if !table_exists {
168| 0| return output::emit_json(&VecOrphanListResponse {
169| 0| action: "orphan_list".to_string(),
170| 0| count: 0,
171| 0| items: Vec::new(),
172| 0| elapsed_ms: start.elapsed().as_millis() as u64,
173| 0| });
174| 0| }
175| |
176| | // List vec_memories rows that have no corresponding live memory row.
177| | // We use a hash of the float[] blob (BLAKE3) as a fingerprint so the
178| | // operator can detect duplicate embeddings even after the parent
179| | // memory has been re-embedded with new content.
180| 0| let mut stmt = conn.prepare(
181| 0| "SELECT v.memory_id, v.embedding, v.created_at
182| 0| FROM vec_memories v
183| 0| LEFT JOIN memories m ON m.id = v.memory_id
184| 0| WHERE m.id IS NULL
185| 0| ORDER BY v.memory_id",
186| 0| )?;
187| 0| let rows: Vec<VecOrphanListItem> = stmt
188| 0| .query_map([], |r| {
189| 0| let memory_id: i64 = r.get(0)?;
190| 0| let blob: Vec<u8> = r.get(1)?;
191| 0| let created_at: i64 = r.get(2)?;
192| 0| let vector_hash = blake3::hash(&blob).to_hex().to_string();
193| 0| Ok(VecOrphanListItem {
194| 0| memory_id,
195| 0| vector_hash,
196| 0| created_at,
197| 0| })
198| 0| })?
199| 0| .collect::<Result<Vec<_>, _>>()?;
200| 0| let count = rows.len() as i64;
201| |
202| 0| output::emit_json(&VecOrphanListResponse {
203| 0| action: "orphan_list".to_string(),
204| 0| count,
205| 0| items: rows,
206| 0| elapsed_ms: start.elapsed().as_millis() as u64,
207| 0| })?;
208| 0| Ok(())
209| 0|}
210| |
211| 0|fn run_purge_orphan(args: VecPurgeOrphanArgs) -> Result<(), AppError> {
212| 0| let start = std::time::Instant::now();
213| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
214| 0| crate::storage::connection::ensure_db_ready(&paths)?;
215| 0| let conn = open_rw(&paths.db)?;
216| |
217| | // Count first so we can return a deterministic response even on dry-run.
218| 0| let table_exists: bool = conn
219| 0| .query_row(
220| 0| "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='vec_memories'",
221| 0| [],
222| 0| |r| r.get::<_, i64>(0).map(|v| v > 0),
223| | )
224| 0| .unwrap_or(false);
225| 0| if !table_exists {
226| 0| return output::emit_json(&VecPurgeOrphanResponse {
227| 0| action: "purge_orphan".to_string(),
228| 0| deleted: 0,
229| 0| deleted_entities: 0,
230| 0| deleted_chunks: 0,
231| 0| dry_run: args.dry_run,
232| 0| elapsed_ms: start.elapsed().as_millis() as u64,
233| 0| });
234| 0| }
235| |
236| 0| let orphan_count: i64 = conn
237| 0| .query_row(
238| 0| "SELECT COUNT(*) FROM vec_memories v
239| 0| LEFT JOIN memories m ON m.id = v.memory_id
240| 0| WHERE m.id IS NULL",
241| 0| [],
242| 0| |r| r.get(0),
243| | )
244| 0| .unwrap_or(0);
245| |
246| | // G39: also count orphans in vec_entities and vec_chunks. These
247| | // tables follow the same `memory_id` foreign key convention and
248| | // accumulate orphans on the same paths as vec_memories.
249| 0| let orphan_entities_count: i64 = if vec_table_exists(&conn, "vec_entities") {
250| 0| conn.query_row(
251| 0| "SELECT COUNT(*) FROM vec_entities v
252| 0| LEFT JOIN memories m ON m.id = v.memory_id
253| 0| WHERE m.id IS NULL",
254| 0| [],
255| 0| |r| r.get(0),
256| | )
257| 0| .unwrap_or(0)
258| | } else {
259| 0| 0
260| | };
261| 0| let orphan_chunks_count: i64 = if vec_table_exists(&conn, "vec_chunks") {
262| 0| conn.query_row(
263| 0| "SELECT COUNT(*) FROM vec_chunks v
264| 0| LEFT JOIN memories m ON m.id = v.memory_id
265| 0| WHERE m.id IS NULL",
266| 0| [],
267| 0| |r| r.get(0),
268| | )
269| 0| .unwrap_or(0)
270| | } else {
271| 0| 0
272| | };
273| |
274| 0| if args.dry_run {
275| 0| tracing::info!(target: "vec", orphan_count, orphan_entities_count, orphan_chunks_count, "dry-run: would delete orphans");
276| 0| return output::emit_json(&VecPurgeOrphanResponse {
277| 0| action: "purge_orphan_dry_run".to_string(),
278| 0| deleted: 0,
279| 0| deleted_entities: 0,
280| 0| deleted_chunks: 0,
281| 0| dry_run: true,
282| 0| elapsed_ms: start.elapsed().as_millis() as u64,
283| 0| });
284| 0| }
285| |
286| 0| if !args.yes {
287| 0| return Err(AppError::Validation(format!(
288| 0| "refusing to delete {orphan_count} vec_memories + {orphan_entities_count} vec_entities + {orphan_chunks_count} vec_chunks orphan rows without --yes (use --dry-run to preview)"
289| 0| )));
290| 0| }
291| |
292| 0| let deleted: i64 = conn.execute(
293| 0| "DELETE FROM vec_memories
294| 0| WHERE memory_id NOT IN (SELECT id FROM memories)",
295| 0| [],
296| 0| )? as i64;
297| |
298| 0| let deleted_entities: i64 = if vec_table_exists(&conn, "vec_entities") {
299| 0| conn.execute(
300| 0| "DELETE FROM vec_entities
301| 0| WHERE memory_id NOT IN (SELECT id FROM memories)",
302| 0| [],
303| 0| )
304| 0| .unwrap_or(0) as i64
305| | } else {
306| 0| 0
307| | };
308| 0| let deleted_chunks: i64 = if vec_table_exists(&conn, "vec_chunks") {
309| 0| conn.execute(
310| 0| "DELETE FROM vec_chunks
311| 0| WHERE memory_id NOT IN (SELECT id FROM memories)",
312| 0| [],
313| 0| )
314| 0| .unwrap_or(0) as i64
315| | } else {
316| 0| 0
317| | };
318| |
319| 0| tracing::info!(target: "vec", deleted, deleted_entities, deleted_chunks, "purged orphan vec rows");
320| |
321| 0| output::emit_json(&VecPurgeOrphanResponse {
322| 0| action: "purged_orphan".to_string(),
323| 0| deleted,
324| 0| deleted_entities,
325| 0| deleted_chunks,
326| 0| dry_run: false,
327| 0| elapsed_ms: start.elapsed().as_millis() as u64,
328| 0| })?;
329| 0| Ok(())
330| 0|}
331| |
332| 0|fn run_stats(args: VecStatsArgs) -> Result<(), AppError> {
333| 0| let start = std::time::Instant::now();
334| 0| let paths = AppPaths::resolve(args.db.as_deref())?;
335| 0| crate::storage::connection::ensure_db_ready(&paths)?;
336| 0| let conn = open_ro(&paths.db)?;
337| |
338| 0| let vec_memories_exists: bool = conn
339| 0| .query_row(
340| 0| "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='vec_memories'",
341| 0| [],
342| 0| |r| r.get::<_, i64>(0).map(|v| v > 0),
343| | )
344| 0| .unwrap_or(false);
345| 0| let (total_rows, orphaned) = if vec_memories_exists {
346| 0| let total: i64 = conn
347| 0| .query_row("SELECT COUNT(*) FROM vec_memories", [], |r| r.get(0))
348| 0| .unwrap_or(0);
349| 0| let orph: i64 = conn
350| 0| .query_row(
351| 0| "SELECT COUNT(*) FROM vec_memories v
352| 0| LEFT JOIN memories m ON m.id = v.memory_id
353| 0| WHERE m.id IS NULL",
354| 0| [],
355| 0| |r| r.get(0),
356| | )
357| 0| .unwrap_or(0);
358| 0| (total, orph)
359| | } else {
360| 0| (0, 0)
361| | };
362| 0| let coverage_percent = if total_rows > 0 {
363| 0| ((total_rows - orphaned) as f64 / total_rows as f64) * 100.0
364| | } else {
365| 0| 100.0
366| | };
367| |
368| 0| let vec_entities_rows = if vec_table_exists(&conn, "vec_entities") {
369| 0| conn.query_row("SELECT COUNT(*) FROM vec_entities", [], |r| r.get(0))
370| 0| .ok()
371| | } else {
372| 0| None
373| | };
374| 0| let vec_chunks_rows = if vec_table_exists(&conn, "vec_chunks") {
375| 0| conn.query_row("SELECT COUNT(*) FROM vec_chunks", [], |r| r.get(0))
376| 0| .ok()
377| | } else {
378| 0| None
379| | };
380| 0| let fts_memories_rows = conn
381| 0| .query_row("SELECT COUNT(*) FROM fts_memories", [], |r| r.get(0))
382| 0| .unwrap_or(0);
383| |
384| 0| output::emit_json(&VecStatsResponse {
385| 0| total_rows,
386| 0| orphaned,
387| 0| coverage_percent,
388| 0| vec_entities_rows,
389| 0| vec_chunks_rows,
390| 0| fts_memories_rows,
391| 0| elapsed_ms: start.elapsed().as_millis() as u64,
392| 0| })?;
393| 0| Ok(())
394| 0|}
395| |
396| 0|fn vec_table_exists(conn: &rusqlite::Connection, name: &str) -> bool {
397| 0| conn.query_row(
398| 0| "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name=?1",
399| 0| rusqlite::params![name],
400| 0| |r| r.get::<_, i64>(0).map(|v| v > 0),
401| | )
402| 0| .unwrap_or(false)
403| 0|}
404| |
#[cfg(test)]
mod tests {
    use super::*;

    /// The orphan-list response exposes action, count, items and elapsed_ms.
    #[test]
    fn vec_orphan_list_response_serializes_all_fields() {
        let json = serde_json::to_value(&VecOrphanListResponse {
            action: "orphan_list".into(),
            count: 0,
            items: Vec::new(),
            elapsed_ms: 5,
        })
        .unwrap();

        assert_eq!(json["action"], "orphan_list");
        assert_eq!(json["count"], 0i64);
        assert_eq!(json["elapsed_ms"], 5u64);
        assert!(json["items"].is_array());
    }

    /// The purge response carries the dry-run flag and the deletion count.
    #[test]
    fn vec_purge_orphan_response_serializes_dry_run_flag() {
        let json = serde_json::to_value(&VecPurgeOrphanResponse {
            action: "purge_orphan_dry_run".into(),
            deleted: 0,
            deleted_entities: 0,
            deleted_chunks: 0,
            dry_run: true,
            elapsed_ms: 1,
        })
        .unwrap();

        assert_eq!(json["dry_run"], true);
        assert_eq!(json["deleted"], 0i64);
    }

    /// Coverage and present optional counts are serialized; a `None` count is omitted.
    #[test]
    fn vec_stats_response_computes_coverage() {
        let json = serde_json::to_value(&VecStatsResponse {
            total_rows: 100,
            orphaned: 25,
            coverage_percent: 75.0,
            vec_entities_rows: Some(50),
            vec_chunks_rows: None,
            fts_memories_rows: 100,
            elapsed_ms: 10,
        })
        .unwrap();

        assert_eq!(json["coverage_percent"], 75.0);
        assert_eq!(json["vec_entities_rows"], 50i64);
        assert!(json.get("vec_chunks_rows").is_none());
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/constants.rs:
1| |//! Compile-time constants shared across the crate.
2| |//!
3| |//! Grouped into embedding configuration, length and size limits, SQLite
4| |//! pragmas and retrieval tuning knobs. Values are taken from the PRD and
5| |//! must stay in sync with the migrations under `migrations/`.
6| |//!
7| |//! ## Dynamic concurrency permit calculation
8| |//!
9| |//! The maximum number of simultaneous instances can be adjusted at runtime
10| |//! using the formula:
11| |//!
12| |//! ```text
13| |//! permits = min(cpus, available_memory_mb / EMBEDDING_LOAD_EXPECTED_RSS_MB) * 0.5
14| |//! ```
15| |//!
16| |//! where `available_memory_mb` is obtained via `sysinfo::System::available_memory()`
17| |//! converted to MiB. The result is capped at `MAX_CONCURRENT_CLI_INSTANCES`
18| |//! and floored at 1.
19| |
20| |/// Embedding vector dimensionality produced by `multilingual-e5-small`.
21| |pub const EMBEDDING_DIM: usize = 384;
22| |
23| |/// Default `fastembed` model identifier used by `remember` and `recall`.
24| |pub const FASTEMBED_MODEL_DEFAULT: &str = "multilingual-e5-small";
25| |
26| |/// Batch size for `fastembed` encoding calls.
27| |pub const FASTEMBED_BATCH_SIZE: usize = 32;
28| |
29| |/// Maximum byte length for a memory `name` field in kebab-case.
30| |pub const MAX_MEMORY_NAME_LEN: usize = 80;
31| |
32| |/// Maximum byte length for an `ingest`-derived kebab-case name.
33| |///
34| |/// Stricter than `MAX_MEMORY_NAME_LEN` (80) to leave headroom for collision
35| |/// suffixes (`-2`, `-10`, ...) when multiple files derive to the same base.
36| |/// Used exclusively by `src/commands/ingest.rs`.
37| |pub const DERIVED_NAME_MAX_LEN: usize = 60;
38| |
39| |/// Maximum character length for a memory `description` field.
40| |pub const MAX_MEMORY_DESCRIPTION_LEN: usize = 500;
41| |
42| |/// Hard upper bound on memory `body` length in bytes.
43| |pub const MAX_MEMORY_BODY_LEN: usize = 512_000;
44| |
45| |/// Body character count above which the body is split into chunks.
46| |pub const MAX_BODY_CHARS_BEFORE_CHUNK: usize = 8_000;
47| |
48| |/// Maximum attempts when a statement returns `SQLITE_BUSY`.
49| |pub const MAX_SQLITE_BUSY_RETRIES: u32 = 5;
50| |
51| |/// Base delay in milliseconds for the first SQLITE_BUSY retry.
52| |///
53| |/// Each subsequent attempt doubles the delay (exponential backoff):
54| |/// 300 ms → 600 ms → 1200 ms → 2400 ms → 4800 ms (≈ 9.3 s total).
55| |pub const SQLITE_BUSY_BASE_DELAY_MS: u64 = 300;
56| |
57| |/// Query timeout applied to statements in milliseconds.
58| |pub const QUERY_TIMEOUT_MILLIS: u64 = 5_000;
59| |
60| |/// Jaccard threshold above which two memories are considered fuzzy duplicates.
61| |pub const DEDUP_FUZZY_THRESHOLD: f64 = 0.8;
62| |
63| |/// Cosine distance threshold below which two memories are semantic duplicates.
64| |pub const DEDUP_SEMANTIC_THRESHOLD: f32 = 0.1;
65| |
66| |/// Maximum number of hops allowed in graph traversals.
67| |pub const MAX_GRAPH_HOPS: u32 = 2;
68| |
69| |/// Minimum relationship weight required for traversal inclusion.
70| |pub const MIN_RELATION_WEIGHT: f64 = 0.3;
71| |
72| |/// Default traversal depth for `related` when `--hops` is omitted.
73| |pub const DEFAULT_MAX_HOPS: u32 = 2;
74| |
75| |/// Default minimum weight filter applied during graph traversal.
76| |pub const DEFAULT_MIN_WEIGHT: f64 = 0.3;
77| |
78| |/// Default weight assigned to newly created relationships.
79| |pub const DEFAULT_RELATION_WEIGHT: f64 = 0.5;
80| |
81| |/// Default `k` used by `recall` when the caller omits `--k`.
82| |pub const DEFAULT_K_RECALL: usize = 10;
83| |
84| |/// Default `k` for memory KNN searches when the caller omits `--k`.
85| |pub const K_MEMORIES_DEFAULT: usize = 10;
86| |
87| |/// Default `k` for entity KNN searches during graph expansion.
88| |pub const K_ENTITIES_SEARCH: usize = 5;
89| |
/// Default upper bound on distinct entities persisted per memory.
///
/// Bumped from 30 → 50 in v1.0.43 to reduce semantic loss on rich documents.
/// Configurable at runtime via `SQLITE_GRAPHRAG_MAX_ENTITIES_PER_MEMORY`.
pub const MAX_ENTITIES_PER_MEMORY: usize = 50;

/// Resolves the per-memory entity cap, honouring the env-var override.
///
/// v1.0.43: makes the cap (default 50) configurable via `SQLITE_GRAPHRAG_MAX_ENTITIES_PER_MEMORY`.
/// Stress tests showed inputs with 33-46 candidates being truncated at the old cap of 30.
/// Surrounding whitespace in the variable (for example a trailing newline) is ignored.
/// Unset, unparsable or out-of-range values (outside [1, 1000]) fall back to the default.
pub fn max_entities_per_memory() -> usize {
    std::env::var("SQLITE_GRAPHRAG_MAX_ENTITIES_PER_MEMORY")
        .ok()
        // `usize::from_str` rejects any whitespace, so trim before parsing.
        .and_then(|raw| raw.trim().parse::<usize>().ok())
        .filter(|n| (1..=1_000).contains(n))
        .unwrap_or(MAX_ENTITIES_PER_MEMORY)
}
108| |
/// Upper bound on distinct relationships persisted per memory.
pub const MAX_RELATIONSHIPS_PER_MEMORY: usize = 50;

/// Resolves the per-memory relationship cap, honouring the env-var override.
///
/// v1.0.22: makes the cap (default 50) configurable via `SQLITE_GRAPHRAG_MAX_RELATIONS_PER_MEMORY`.
/// Audit found that rich documents silently hit the cap; users with dense technical corpora
/// can raise it via env. Surrounding whitespace in the variable is ignored. Unset,
/// unparsable or out-of-range values (outside [1, 10000]) fall back to the default.
pub fn max_relationships_per_memory() -> usize {
    std::env::var("SQLITE_GRAPHRAG_MAX_RELATIONS_PER_MEMORY")
        .ok()
        // `usize::from_str` rejects any whitespace, so trim before parsing.
        .and_then(|raw| raw.trim().parse::<usize>().ok())
        .filter(|n| (1..=10_000).contains(n))
        .unwrap_or(MAX_RELATIONSHIPS_PER_MEMORY)
}
124| |
125| |/// Character length of the description preview shown in `list` output.
126| |pub const TEXT_DESCRIPTION_PREVIEW_LEN: usize = 100;
127| |
128| |/// `PRAGMA busy_timeout` value applied on every connection.
129| |pub const BUSY_TIMEOUT_MILLIS: i32 = 5_000;
130| |
131| |/// `PRAGMA cache_size` value in kibibytes (negative means KiB).
132| |pub const CACHE_SIZE_KB: i32 = -64_000;
133| |
134| |/// `PRAGMA mmap_size` value in bytes applied to each connection.
135| |pub const MMAP_SIZE_BYTES: i64 = 268_435_456;
136| |
137| |/// `PRAGMA wal_autocheckpoint` threshold in pages.
138| |pub const WAL_AUTOCHECKPOINT_PAGES: i32 = 1_000;
139| |
140| |/// Default `k` constant used by Reciprocal Rank Fusion in `hybrid-search`.
141| |pub const RRF_K_DEFAULT: u32 = 60;
142| |
143| |/// Chunk size expressed in tokens for body splitting.
144| |pub const CHUNK_SIZE_TOKENS: usize = 400;
145| |
146| |/// Token overlap between consecutive chunks.
147| |pub const CHUNK_OVERLAP_TOKENS: usize = 50;
148| |
149| |/// Explicit operational guard for multi-chunk documents in `remember`.
150| |///
151| |/// The multi-chunk path uses serial embeddings to avoid ONNX memory amplification.
152| |/// This limit preserves a clear operational ceiling for agents and scripts.
153| |pub const REMEMBER_MAX_SAFE_MULTI_CHUNKS: usize = 512;
154| |
155| |/// Ceiling on chunks per controlled micro-batch in `remember`.
156| |///
157| |/// The `fastembed` runtime uses `BatchLongest` padding, so oversized batches amplify
158| |/// the cost of the longest chunk. This ceiling keeps batches small even when chunks are short.
159| |pub const REMEMBER_MAX_CONTROLLED_BATCH_CHUNKS: usize = 4;
160| |
161| |/// Maximum padded-token budget per controlled micro-batch in `remember`.
162| |///
163| |/// The budget uses `max_tokens_no_batch * batch_size`, approximating the real cost of
164| |/// `BatchLongest` padding. Values exceeding this fall back to smaller batches or serialisation.
165| |pub const REMEMBER_MAX_CONTROLLED_BATCH_PADDED_TOKENS: usize = 512;
166| |
167| |/// Timeout in milliseconds for a single ping probe against the daemon socket.
168| |pub const DAEMON_PING_TIMEOUT_MS: u64 = 10;
169| |
170| |/// Idle duration in seconds before the daemon shuts itself down.
171| |pub const DAEMON_IDLE_SHUTDOWN_SECS: u64 = 600;
172| |
173| |/// Maximum wait time for the daemon to become healthy after auto-start.
174| |pub const DAEMON_AUTO_START_MAX_WAIT_MS: u64 = 5_000;
175| |
176| |/// Maximum wait time (ms) for a stale daemon to exit after a version-mismatch shutdown.
177| |pub const DAEMON_VERSION_RESTART_WAIT_MS: u64 = 5_000;
178| |
179| |/// Initial polling interval to check whether the daemon became healthy.
180| |pub const DAEMON_AUTO_START_INITIAL_BACKOFF_MS: u64 = 50;
181| |
182| |/// Ceiling on backoff between automatic daemon spawn attempts.
183| |pub const DAEMON_AUTO_START_MAX_BACKOFF_MS: u64 = 30_000;
184| |
185| |/// Base backoff used after daemon spawn/health failures.
186| |pub const DAEMON_SPAWN_BACKOFF_BASE_MS: u64 = 500;
187| |
188| |/// Maximum wait time to acquire the daemon spawn lock.
189| |pub const DAEMON_SPAWN_LOCK_WAIT_MS: u64 = 2_000;
190| |
191| |/// Prefix prepended to bodies before embedding as required by E5 models.
192| |pub const PASSAGE_PREFIX: &str = "passage: ";
193| |
194| |/// Prefix prepended to queries before embedding as required by E5 models.
195| |pub const QUERY_PREFIX: &str = "query: ";
196| |
197| |/// Crate version string sourced from `CARGO_PKG_VERSION` at build time.
198| |pub const SQLITE_GRAPHRAG_VERSION: &str = env!("CARGO_PKG_VERSION");
199| |
/// Batch size for GLiNER NER forward passes.
///
/// Larger values amortise fixed forward-pass overhead but increase peak RAM.
/// Memory guide (CPU only, max 512-token windows):
///   N=4  →  ~54 MiB peak
///   N=8  → ~108 MiB peak  ← default
///   N=16 → ~216 MiB peak
///   N=32 → ~432 MiB peak  (not recommended without 16+ GiB RAM)
///
/// Override via `GRAPHRAG_NER_BATCH_SIZE` env var. Surrounding whitespace is
/// ignored; unset or unparsable values use the default of 8. Values outside
/// [1, 32] are clamped silently.
pub fn ner_batch_size() -> usize {
    std::env::var("GRAPHRAG_NER_BATCH_SIZE")
        .ok()
        // `usize::from_str` rejects any whitespace, so trim before parsing.
        .and_then(|raw| raw.trim().parse::<usize>().ok())
        .unwrap_or(8)
        .clamp(1, 32)
}
218| |
/// Default cap on tokens fed to GLiNER NER per memory body.
///
/// v1.0.31: large markdown documents (>50 KB) tokenise into thousands of
/// 512-token windows, each requiring a CPU forward pass that takes hundreds
/// of milliseconds. A 68 KB document was observed taking 5+ minutes.
/// Truncating the input before sliding-window construction caps the worst-case
/// latency while preserving extraction quality for the leading body region.
///
/// Regex prefilter still runs on the full body, so URLs, emails, UUIDs,
/// all-caps identifiers and CamelCase brand names are extracted regardless.
pub const EXTRACTION_MAX_TOKENS_DEFAULT: usize = 5_000;

/// Resolves the per-body NER token cap, honouring the env-var override.
///
/// Override via `SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS` env var. Surrounding
/// whitespace is ignored. Unset, unparsable or out-of-range values (outside
/// [512, 100_000]) fall back to [`EXTRACTION_MAX_TOKENS_DEFAULT`].
pub fn extraction_max_tokens() -> usize {
    std::env::var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS")
        .ok()
        // `usize::from_str` rejects any whitespace, so trim before parsing.
        .and_then(|raw| raw.trim().parse::<usize>().ok())
        .filter(|n| (512..=100_000).contains(n))
        .unwrap_or(EXTRACTION_MAX_TOKENS_DEFAULT)
}
242| |
/// GLiNER confidence threshold for span scoring.
///
/// Override via `SQLITE_GRAPHRAG_GLINER_THRESHOLD` env var. Surrounding
/// whitespace is ignored. Unset or unparsable values, and values outside
/// `[0.0, 1.0]` (including NaN), are ignored and the default `0.5` is used.
pub fn gliner_confidence_threshold() -> f32 {
    std::env::var("SQLITE_GRAPHRAG_GLINER_THRESHOLD")
        .ok()
        // `f32::from_str` rejects any whitespace, so trim before parsing.
        .and_then(|raw| raw.trim().parse::<f32>().ok())
        .filter(|v| (0.0..=1.0).contains(v))
        .unwrap_or(0.5)
}
254| |
/// HuggingFace repository for the GLiNER ONNX model.
///
/// Override via `SQLITE_GRAPHRAG_GLINER_MODEL` env var. The value is trimmed;
/// an unset, non-Unicode, empty or whitespace-only variable falls back to
/// `onnx-community/gliner_multi-v2.1` instead of yielding an unusable empty name.
pub fn gliner_model_repo() -> String {
    std::env::var("SQLITE_GRAPHRAG_GLINER_MODEL")
        .ok()
        .map(|raw| raw.trim().to_owned())
        .filter(|repo| !repo.is_empty())
        .unwrap_or_else(|| "onnx-community/gliner_multi-v2.1".to_string())
}
262| |
263| |/// PRD-canonical regex that validates names and namespaces. Allows 1 char `[a-z0-9]`
264| |/// OR a 2-80 char string starting with a letter and ending with a letter/digit,
265| |/// containing only `[a-z0-9-]`. Rejects the `__` prefix (internal reserved).
266| |pub const NAME_SLUG_REGEX: &str = r"^[a-z][a-z0-9-]{0,78}[a-z0-9]$|^[a-z0-9]$";
267| |
268| |static NAME_SLUG_RE: std::sync::OnceLock<regex::Regex> = std::sync::OnceLock::new();
269| |
270| |/// Returns a reference to the compiled [`NAME_SLUG_REGEX`] pattern.
271| |/// Compiled once on first call, cached via `OnceLock`.
272| 0|pub fn name_slug_regex() -> &'static regex::Regex {
273| 0| NAME_SLUG_RE.get_or_init(|| {
274| 0| regex::Regex::new(NAME_SLUG_REGEX).expect("NAME_SLUG_REGEX is a valid pattern")
275| 0| })
276| 0|}
277| |
278| |/// Default retention period (days) used by `purge` when `--retention-days` is omitted.
279| |pub const PURGE_RETENTION_DAYS_DEFAULT: u32 = 90;
280| |
281| |/// Maximum number of simultaneously active namespaces (deleted_at IS NULL). Exit 5 when exceeded.
282| |pub const MAX_NAMESPACES_ACTIVE: u32 = 100;
283| |
284| |/// Maximum tokens accepted by an embedding input before chunking.
285| |pub const EMBEDDING_MAX_TOKENS: usize = 512;
286| |
287| |/// Maximum result count from the recursive graph CTE in `recall`.
288| |pub const K_GRAPH_MATCHES_LIMIT: usize = 20;
289| |
290| |/// Default `--limit` for `list` when omitted.
291| |pub const K_LIST_DEFAULT_LIMIT: usize = 100;
292| |
293| |/// Default `--limit` for `graph entities` when omitted.
294| |pub const K_GRAPH_ENTITIES_DEFAULT_LIMIT: usize = 50;
295| |
296| |/// Default `--limit` for `related` when omitted.
297| |pub const K_RELATED_DEFAULT_LIMIT: usize = 10;
298| |
299| |/// Default `--limit` for `history` when omitted.
300| |pub const K_HISTORY_DEFAULT_LIMIT: usize = 20;
301| |
302| |/// Default weight for the vector contribution in the `hybrid-search` RRF formula.
303| |pub const WEIGHT_VEC_DEFAULT: f64 = 1.0;
304| |
305| |/// Default weight for the BM25 text contribution in the `hybrid-search` RRF formula.
306| |pub const WEIGHT_FTS_DEFAULT: f64 = 1.0;
307| |
308| |/// Character size of the body preview emitted in text/markdown formats.
309| |pub const TEXT_BODY_PREVIEW_LEN: usize = 200;
310| |
311| |/// Default value injected into ORT_NUM_THREADS when not set by the user.
312| |pub const ORT_NUM_THREADS_DEFAULT: &str = "1";
313| |
314| |/// Default value injected into ORT_INTRA_OP_NUM_THREADS when not set.
315| |pub const ORT_INTRA_OP_NUM_THREADS_DEFAULT: &str = "1";
316| |
317| |/// Default value injected into OMP_NUM_THREADS when not set by the user.
318| |pub const OMP_NUM_THREADS_DEFAULT: &str = "1";
319| |
320| |/// Exit code for partial batch failure (PRD line 1822). Conflicts with DbBusy in v1.x;
321| |/// in v2.0.0 DbBusy migrates to 15 and this code takes 13 per PRD.
322| |pub const BATCH_PARTIAL_FAILURE_EXIT_CODE: i32 = 13;
323| |
324| |/// Exit code for DbBusy in v2.0.0 (migrated from 13 to free 13 for batch failure).
325| |pub const DB_BUSY_EXIT_CODE: i32 = 15;
326| |
327| |/// Filename used for the advisory exclusive lock that prevents parallel invocations.
328| |pub const CLI_LOCK_FILE: &str = "cli.lock";
329| |
330| |/// Polling interval in milliseconds used by `--wait-lock` between `try_lock_exclusive` attempts.
331| |pub const CLI_LOCK_POLL_INTERVAL_MS: u64 = 500;
332| |
333| |/// Process exit code returned when the lock is busy and no wait was requested (EX_TEMPFAIL).
334| |pub const CLI_LOCK_EXIT_CODE: i32 = 75;
335| |
336| |/// Maximum number of CLI instances running simultaneously.
337| |///
338| |/// Aligned with `DAEMON_MAX_CONCURRENT_CLIENTS` from the PRD. Limits the counting
339| |/// semaphore in [`crate::lock`] to prevent memory overload when multiple parallel
340| |/// invocations attempt to load the ONNX model simultaneously.
341| |pub const MAX_CONCURRENT_CLI_INSTANCES: usize = 4;
342| |
343| |/// G28-B (v1.0.68): polling interval in milliseconds used by
344| |/// `acquire_job_singleton` between retry attempts when another invocation
345| |/// already holds the singleton for `(job_type, namespace)`.
346| |pub const JOB_SINGLETON_POLL_INTERVAL_MS: u64 = 1000;
347| |
348| |/// Minimum available memory in MiB required before starting model loading.
349| |///
350| |/// If `sysinfo::System::available_memory() / 1_048_576` falls below this value,
351| |/// the invocation is aborted with [`crate::errors::AppError::LowMemory`]
352| |/// (exit code [`LOW_MEMORY_EXIT_CODE`]).
353| |pub const MIN_AVAILABLE_MEMORY_MB: u64 = 2_048;
354| |
355| |/// Maximum process RSS in MiB before aborting embedding operations.
356| |/// Users can override via `--max-rss-mb`. Set to 8 GiB by default.
357| |pub const DEFAULT_MAX_RSS_MB: u64 = 8_192;
358| |
359| |/// Maximum time in seconds an instance waits to acquire a concurrency slot.
360| |///
361| |/// Passed as the default for `--max-wait-secs` in the CLI. After exhausting this limit,
362| |/// the invocation returns [`crate::errors::AppError::AllSlotsFull`] with exit code
363| |/// [`CLI_LOCK_EXIT_CODE`] (75).
364| |pub const CLI_LOCK_DEFAULT_WAIT_SECS: u64 = 300;
365| |
366| |/// Expected RSS in MiB for a single instance with the ONNX model loaded via fastembed.
367| |///
368| |/// Used in the formula `min(cpus, available_memory_mb / EMBEDDING_LOAD_EXPECTED_RSS_MB) * 0.5`
369| |/// to compute the dynamic permit count.
370| |///
371| |/// Value calibrated on 2026-04-23 with `/usr/bin/time -v` against `sqlite-graphrag v1.0.3`
372| |/// on the heavy commands `remember`, `recall`, and `hybrid-search`, all peaking near
373| |/// 1.03 GiB RSS per process. The constant below rounds up with a defensive margin.
374| |pub const EMBEDDING_LOAD_EXPECTED_RSS_MB: u64 = 1_100;
375| |
376| |/// Process exit code returned when available memory is below [`MIN_AVAILABLE_MEMORY_MB`].
377| |///
378| |/// Value `77` is `EX_NOPERM` in glibc sysexits, reused here to indicate
379| |/// "insufficient system resource to proceed".
380| |pub const LOW_MEMORY_EXIT_CODE: i32 = 77;
381| |
382| |/// Process exit code returned when a duplicate memory or entity is detected (exit 9).
383| |///
384| |/// Moved from `2` to `9` in v1.0.52 to free exit code `2` for future use and align
385| |/// with the PRD exit code contract. Shell callers and LLM agents must use `9` from
386| |/// this version onwards.
387| |pub const DUPLICATE_EXIT_CODE: i32 = 9;
388| |
/// Canonical value of `PRAGMA user_version` written after migrations.
///
/// **Why 49 instead of the value of [`CURRENT_SCHEMA_VERSION`]?**
/// `user_version` is a 32-bit integer that SQLite reserves for application use.
/// We deliberately set it to a project-specific marker (49, decimal) so external
/// inspection tools (`sqlite3 db.sqlite "PRAGMA user_version"`, the `file` command,
/// SQLite browser GUIs) can distinguish a sqlite-graphrag database from a generic
/// SQLite file at a glance. The application-level schema version (the value of
/// `CURRENT_SCHEMA_VERSION`) is stored in the `schema_meta` table and exposed via
/// `health --json`/`stats --json`. Bumping migrations does NOT change this constant.
/// Refinery uses its own `refinery_schema_history` table for migration bookkeeping.
pub const SCHEMA_USER_VERSION: i64 = 49;
401| |
402| |/// Current schema version, equal to the highest migration number in `migrations/Vnnn__*.sql`.
403| |///
404| |/// Added in v1.0.27 as a runtime and test sanity check.
405| |/// Must be bumped in sync with new Refinery migrations; the unit test
406| |/// `schema_version_matches_migrations_count` validates this automatically.
407| |pub const CURRENT_SCHEMA_VERSION: u32 = 12;
408| |
#[cfg(test)]
mod tests_schema_version {
    use super::CURRENT_SCHEMA_VERSION;

    /// Guards against forgetting to bump `CURRENT_SCHEMA_VERSION` when a
    /// migration is added: the constant must equal the number of `V*.sql`
    /// files under `<crate root>/migrations`.
    #[test]
    fn schema_version_matches_migrations_count() {
        let migrations_dir =
            std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("migrations");
        let count = std::fs::read_dir(&migrations_dir)
            .expect("migrations directory must exist")
            .filter_map(Result::ok)
            .filter(|entry| {
                // Count only real migration scripts; stray files such as editor
                // backups that merely start with 'V' must not skew the total.
                let file_name = entry.file_name();
                let file_name = file_name.to_string_lossy();
                file_name.starts_with('V') && file_name.ends_with(".sql")
            })
            .count() as u32;
        assert_eq!(
            CURRENT_SCHEMA_VERSION, count,
            "CURRENT_SCHEMA_VERSION ({CURRENT_SCHEMA_VERSION}) must equal the number of V*.sql migrations ({count})"
        );
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/daemon.rs:
1| |//! IPC daemon: keeps the embedding model warm across CLI invocations.
2| |//!
3| |//! Manages the background process lifecycle, Unix-socket IPC protocol, and
4| |//! auto-start/backoff logic so embeddings are served without cold-start cost.
5| |
6| |use crate::constants::{
7| | DAEMON_AUTO_START_INITIAL_BACKOFF_MS, DAEMON_AUTO_START_MAX_BACKOFF_MS,
8| | DAEMON_AUTO_START_MAX_WAIT_MS, DAEMON_IDLE_SHUTDOWN_SECS, DAEMON_PING_TIMEOUT_MS,
9| | DAEMON_SPAWN_BACKOFF_BASE_MS, DAEMON_SPAWN_LOCK_WAIT_MS, DAEMON_VERSION_RESTART_WAIT_MS,
10| | SQLITE_GRAPHRAG_VERSION,
11| |};
12| |use crate::errors::AppError;
13| |use crate::{embedder, shutdown_requested};
14| |use fs4::fs_std::FileExt;
15| |use interprocess::local_socket::{
16| | prelude::LocalSocketStream,
17| | traits::{Listener as _, Stream as _},
18| | GenericFilePath, GenericNamespaced, ListenerNonblockingMode, ListenerOptions, ToFsName,
19| | ToNsName,
20| |};
21| |use serde::{Deserialize, Serialize};
22| |use std::fs::{File, OpenOptions};
23| |use std::io::{BufRead, BufReader, Write};
24| |use std::path::{Path, PathBuf};
25| |use std::process::Stdio;
26| |use std::sync::atomic::{AtomicU64, AtomicU8, Ordering};
27| |use std::sync::Arc;
28| |use std::thread;
29| |use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
30| |
31| |const VERSION_NOT_CHECKED: u8 = 0;
32| |const VERSION_COMPATIBLE: u8 = 1;
33| |const VERSION_RESTART_ATTEMPTED: u8 = 2;
34| |
35| |/// Guards against restart loops: tracks version check state per process lifetime.
36| |static DAEMON_VERSION_STATE: AtomicU8 = AtomicU8::new(VERSION_NOT_CHECKED);
37| |
/// Request message sent to the daemon.
///
/// Serialized by serde as an internally tagged value: the variant name, in
/// snake_case, is stored under the `request` key next to the variant's fields.
#[derive(Debug, Serialize, Deserialize)]
#[serde(tag = "request", rename_all = "snake_case")]
pub enum DaemonRequest {
    /// Request issued by [`try_ping`].
    Ping,
    /// Request issued by [`try_shutdown`].
    Shutdown,
    /// Request issued by [`embed_passage_or_local`] to embed one passage.
    EmbedPassage {
        /// Passage text to embed.
        text: String,
    },
    /// Request issued by [`embed_query_or_local`] to embed one query.
    EmbedQuery {
        /// Query text to embed.
        text: String,
    },
    /// Request issued by [`embed_passages_controlled_or_local`] to embed several passages.
    EmbedPassages {
        /// Passage texts to embed.
        texts: Vec<String>,
        /// Token counts supplied by the caller alongside `texts`.
        // NOTE(review): presumably one count per entry of `texts` — confirm in the handler.
        token_counts: Vec<usize>,
    },
}
54| |
/// Response message produced by the daemon.
///
/// Serialized by serde as an internally tagged value: the variant name, in
/// snake_case, is stored under the `status` key next to the variant's fields.
#[derive(Debug, Serialize, Deserialize)]
#[serde(tag = "status", rename_all = "snake_case")]
pub enum DaemonResponse {
    // NOTE(review): presumably announced once the daemon starts listening — confirm at the emit site.
    Listening {
        pid: u32,
        socket: String,
        idle_shutdown_secs: u64,
    },
    // NOTE(review): presumably the reply to `DaemonRequest::Ping` — confirm in the request handler.
    Ok {
        pid: u32,
        version: String,
        handled_embed_requests: u64,
        model_name: String,
        model_variant: String,
    },
    /// Reply accepted by [`embed_passage_or_local`]; carries the passage embedding.
    PassageEmbedding {
        embedding: Vec<f32>,
        handled_embed_requests: u64,
    },
    /// Reply accepted by [`embed_query_or_local`]; carries the query embedding.
    QueryEmbedding {
        embedding: Vec<f32>,
        handled_embed_requests: u64,
    },
    /// Reply accepted by [`embed_passages_controlled_or_local`]; carries the embeddings.
    PassageEmbeddings {
        embeddings: Vec<Vec<f32>>,
        handled_embed_requests: u64,
    },
    // NOTE(review): presumably the reply to `DaemonRequest::Shutdown` — confirm in the request handler.
    ShuttingDown {
        handled_embed_requests: u64,
    },
    /// Failure reported by the daemon; the embed helpers in this module turn
    /// `message` into `AppError::Embedding`.
    Error {
        message: String,
    },
}
89| |
/// Serializable record of daemon spawn attempts.
// NOTE(review): the code that reads and writes this state is outside this view;
// the field names suggest a failure counter, an earliest-retry timestamp in
// epoch milliseconds and the most recent error text — confirm against the users.
#[derive(Debug, Default, Serialize, Deserialize)]
struct DaemonSpawnState {
    consecutive_failures: u32,
    not_before_epoch_ms: u64,
    last_error: Option<String>,
}
96| |
97| 1|pub fn daemon_label(models_dir: &Path) -> String {
98| 1| let hash = blake3::hash(models_dir.to_string_lossy().as_bytes())
99| 1| .to_hex()
100| 1| .to_string();
101| 1| format!("sqlite-graphrag-daemon-{}", &hash[..16])
102| 1|}
103| |
/// Sends a [`DaemonRequest::Ping`] for `models_dir` through `request_if_available`
/// and returns its result unchanged.
// NOTE(review): `None` presumably means no daemon was reachable — confirm in `request_if_available`.
pub fn try_ping(models_dir: &Path) -> Result<Option<DaemonResponse>, AppError> {
    request_if_available(models_dir, &DaemonRequest::Ping)
}
107| |
/// Sends a [`DaemonRequest::Shutdown`] for `models_dir` through `request_if_available`
/// and returns its result unchanged.
// NOTE(review): `None` presumably means no daemon was reachable — confirm in `request_if_available`.
pub fn try_shutdown(models_dir: &Path) -> Result<Option<DaemonResponse>, AppError> {
    request_if_available(models_dir, &DaemonRequest::Shutdown)
}
111| |
112| 0|pub fn embed_passage_or_local(models_dir: &Path, text: &str) -> Result<Vec<f32>, AppError> {
113| 0| match request_or_autostart(
114| 0| models_dir,
115| 0| &DaemonRequest::EmbedPassage {
116| 0| text: text.to_string(),
117| 0| },
118| | true,
119| 0| )? {
120| 0| Some(DaemonResponse::PassageEmbedding { embedding, .. }) => Ok(embedding),
121| 0| Some(DaemonResponse::Error { message }) => Err(AppError::Embedding(message)),
122| 0| Some(other) => Err(AppError::Internal(anyhow::anyhow!(
123| 0| "unexpected daemon response for passage embedding: {other:?}"
124| 0| ))),
125| | None => {
126| 0| let embedder = embedder::get_embedder(models_dir)?;
127| 0| embedder::embed_passage(embedder, text)
128| | }
129| | }
130| 0|}
131| |
132| 0|pub fn embed_query_or_local(
133| 0| models_dir: &Path,
134| 0| text: &str,
135| 0| cli_autostart: bool,
136| 0|) -> Result<Vec<f32>, AppError> {
137| 0| match request_or_autostart(
138| 0| models_dir,
139| 0| &DaemonRequest::EmbedQuery {
140| 0| text: text.to_string(),
141| 0| },
142| 0| cli_autostart,
143| 0| )? {
144| 0| Some(DaemonResponse::QueryEmbedding { embedding, .. }) => Ok(embedding),
145| 0| Some(DaemonResponse::Error { message }) => Err(AppError::Embedding(message)),
146| 0| Some(other) => Err(AppError::Internal(anyhow::anyhow!(
147| 0| "unexpected daemon response for query embedding: {other:?}"
148| 0| ))),
149| | None => {
150| 0| let embedder = embedder::get_embedder(models_dir)?;
151| 0| embedder::embed_query(embedder, text)
152| | }
153| | }
154| 0|}
155| |
156| 0|pub fn embed_passages_controlled_or_local(
157| 0| models_dir: &Path,
158| 0| texts: &[&str],
159| 0| token_counts: &[usize],
160| 0|) -> Result<Vec<Vec<f32>>, AppError> {
161| 0| let request = DaemonRequest::EmbedPassages {
162| 0| texts: texts.iter().map(|t| (*t).to_string()).collect(),
163| 0| token_counts: token_counts.to_vec(),
164| | };
165| |
166| 0| match request_or_autostart(models_dir, &request, true)? {
167| 0| Some(DaemonResponse::PassageEmbeddings { embeddings, .. }) => Ok(embeddings),
168| 0| Some(DaemonResponse::Error { message }) => Err(AppError::Embedding(message)),
169| 0| Some(other) => Err(AppError::Internal(anyhow::anyhow!(
170| 0| "unexpected daemon response for passage embedding batch: {other:?}"
171| 0| ))),
172| | None => {
173| 0| let embedder = embedder::get_embedder(models_dir)?;
174| 0| embedder::embed_passages_controlled(embedder, texts, token_counts)
175| | }
176| | }
177| 0|}
178| |
/// Scope guard whose `Drop` removes the daemon's spawn lock and pid files.
struct DaemonSpawnGuard {
    /// Models directory from which the lock and pid file paths are derived.
    models_dir: PathBuf,
}
182| |
183| |impl DaemonSpawnGuard {
184| 0| fn new(models_dir: &Path) -> Self {
185| 0| Self {
186| 0| models_dir: models_dir.to_path_buf(),
187| 0| }
188| 0| }
189| |}
190| |
191| |impl Drop for DaemonSpawnGuard {
192| 0| fn drop(&mut self) {
193| 0| let lock_path = spawn_lock_path(&self.models_dir);
194| 0| if lock_path.exists() {
195| 0| match std::fs::remove_file(&lock_path) {
196| | Ok(()) => {
197| 0| tracing::debug!(
198| | target: "daemon",
199| 0| path = %lock_path.display(),
200| 0| "spawn lock file removed during graceful daemon shutdown"
201| | );
202| | }
203| 0| Err(err) if err.kind() == std::io::ErrorKind::NotFound => {}
204| 0| Err(err) => {
205| 0| tracing::warn!(
206| | target: "daemon",
207| | error = %err,
208| 0| path = %lock_path.display(),
209| 0| "failed to remove spawn lock file while shutting down daemon"
210| | );
211| | }
212| | }
213| 0| }
214| 0| let pid_path = pid_file_path(&self.models_dir);
215| 0| let _ = std::fs::remove_file(&pid_path);
216| |
217| 0| tracing::info!(
218| | target: "daemon",
219| 0| "daemon shut down gracefully; socket will be cleaned up by OS or by the next daemon via try_overwrite"
220| | );
221| 0| }
222| |}
223| |
224| 0|pub fn run(
225| 0| models_dir: &Path,
226| 0| idle_shutdown_secs: u64,
227| 0| shutdown_timeout_secs: u64,
228| 0|) -> Result<(), AppError> {
229| | // Scale worker threads to available parallelism so embedding tasks saturate CPU cores.
230| | // Clamped to [2, 8] to avoid excessive threads on high-core machines.
231| 0| let permits = std::thread::available_parallelism()
232| 0| .map(|n| n.get())
233| 0| .unwrap_or(2)
234| 0| .clamp(2, 8);
235| 0| let rt = tokio::runtime::Builder::new_multi_thread()
236| 0| .worker_threads(permits)
237| 0| .thread_name("daemon-worker")
238| 0| .enable_all()
239| 0| .build()
240| 0| .map_err(AppError::Io)?;
241| |
242| 0| let result = rt.block_on(run_async(models_dir, idle_shutdown_secs, permits));
243| 0| rt.shutdown_timeout(std::time::Duration::from_secs(shutdown_timeout_secs));
244| 0| result
245| 0|}
246| |
247| |#[tracing::instrument(skip_all, fields(idle_secs = idle_shutdown_secs, permits))]
248| 0|async fn run_async(
249| 0| models_dir: &Path,
250| 0| idle_shutdown_secs: u64,
251| 0| permits: usize,
252| 0|) -> Result<(), AppError> {
253| | let socket = daemon_label(models_dir);
254| | let name = to_local_socket_name(&socket)?;
255| | let listener = ListenerOptions::new()
256| | .name(name)
257| | .nonblocking(ListenerNonblockingMode::Accept)
258| | .try_overwrite(true)
259| | .create_sync()
260| | .map_err(AppError::Io)?;
261| |
262| | // Guard that cleans up the spawn lock file on graceful shutdown.
263| | // SIGKILL does not trigger Drop; in that case try_overwrite(true) above is the fallback.
264| | let _spawn_guard = DaemonSpawnGuard::new(models_dir);
265| |
266| | // Warm the model once per daemon process inside spawn_blocking so the
267| | // ONNX session initialisation (CPU-bound, may take several seconds) does
268| | // not block a tokio worker thread.
269| | let models_dir_warm = models_dir.to_path_buf();
270| 0| tokio::task::spawn_blocking(move || embedder::get_embedder(&models_dir_warm).map(|_| ()))
271| | .await
272| 0| .map_err(|e| AppError::Internal(anyhow::anyhow!("model warm-up panicked: {e}")))??;
273| |
274| | let pid_path = pid_file_path(models_dir);
275| | let _ = std::fs::write(&pid_path, std::process::id().to_string());
276| |
277| | crate::output::emit_json(&DaemonResponse::Listening {
278| | pid: std::process::id(),
279| | socket,
280| | idle_shutdown_secs,
281| | })?;
282| |
283| | let handled_embed_requests = Arc::new(AtomicU64::new(0));
284| | let mut last_activity = Instant::now();
285| | let models_dir = models_dir.to_path_buf();
286| | // Bound concurrent spawn_blocking tasks to the same thread count as the runtime.
287| | let permit_pool = Arc::new(tokio::sync::Semaphore::new(permits));
288| |
289| | let token = crate::cancel_token();
290| | loop {
291| | if shutdown_requested() || token.is_cancelled() {
292| | break;
293| | }
294| |
295| | if !daemon_control_dir(&models_dir).exists() {
296| | tracing::info!(target: "daemon", "daemon control directory disappeared; shutting down");
297| | break;
298| | }
299| |
300| | match listener.accept() {
301| | Ok(stream) => {
302| | last_activity = Instant::now();
303| | let models_dir_clone = models_dir.clone();
304| | let counter = Arc::clone(&handled_embed_requests);
305| | let permit =
306| 0| permit_pool.clone().acquire_owned().await.map_err(|e| {
307| 0| AppError::Internal(anyhow::anyhow!("semaphore closed: {e}"))
308| 0| })?;
309| 0| let should_exit = tokio::task::spawn_blocking(move || {
310| 0| let _permit = permit; // hold until end of scope
311| 0| handle_client(stream, &models_dir_clone, &counter)
312| 0| })
313| | .await
314| 0| .map_err(|e| {
315| 0| AppError::Internal(anyhow::anyhow!("spawn_blocking panicked: {e}"))
316| 0| })??;
317| |
318| | if should_exit {
319| | break;
320| | }
321| | }
322| | Err(err) if err.kind() == std::io::ErrorKind::WouldBlock => {
323| | if last_activity.elapsed() >= Duration::from_secs(idle_shutdown_secs) {
324| | tracing::info!(
325| | target: "daemon",
326| | idle_shutdown_secs,
327| | handled_embed_requests = handled_embed_requests.load(Ordering::Relaxed),
328| | "daemon idle timeout reached"
329| | );
330| | break;
331| | }
332| | tokio::select! {
333| | () = tokio::time::sleep(Duration::from_millis(50)) => {}
334| | () = token.cancelled() => { break; }
335| | }
336| | }
337| | Err(err) => return Err(AppError::Io(err)),
338| | }
339| | }
340| |
341| | Ok(())
342| 0|}
343| |
344| 0|fn handle_client(
345| 0| stream: LocalSocketStream,
346| 0| models_dir: &Path,
347| 0| handled_embed_requests: &AtomicU64,
348| 0|) -> Result<bool, AppError> {
349| 0| let mut reader = BufReader::new(stream);
350| 0| let mut line = String::new();
351| 0| reader.read_line(&mut line).map_err(AppError::Io)?;
352| |
353| 0| if line.trim().is_empty() {
354| 0| write_response(
355| 0| reader.get_mut(),
356| 0| &DaemonResponse::Error {
357| 0| message: "empty request to daemon".to_string(),
358| 0| },
359| 0| )?;
360| 0| return Ok(false);
361| 0| }
362| |
363| 0| let request: DaemonRequest = serde_json::from_str(line.trim()).map_err(AppError::Json)?;
364| 0| let (response, should_exit) = match request {
365| 0| DaemonRequest::Ping => (
366| 0| DaemonResponse::Ok {
367| 0| pid: std::process::id(),
368| 0| version: SQLITE_GRAPHRAG_VERSION.to_string(),
369| 0| handled_embed_requests: handled_embed_requests.load(Ordering::Relaxed),
370| 0| model_name: crate::constants::FASTEMBED_MODEL_DEFAULT.to_string(),
371| 0| model_variant: gliner_variant_from_env(),
372| 0| },
373| 0| false,
374| 0| ),
375| 0| DaemonRequest::Shutdown => (
376| 0| DaemonResponse::ShuttingDown {
377| 0| handled_embed_requests: handled_embed_requests.load(Ordering::Relaxed),
378| 0| },
379| 0| true,
380| 0| ),
381| 0| DaemonRequest::EmbedPassage { text } => {
382| 0| let embedder = embedder::get_embedder(models_dir)?;
383| 0| let embedding = embedder::embed_passage(embedder, &text)?;
384| 0| let count = handled_embed_requests.fetch_add(1, Ordering::Relaxed) + 1;
385| 0| (
386| 0| DaemonResponse::PassageEmbedding {
387| 0| embedding,
388| 0| handled_embed_requests: count,
389| 0| },
390| 0| false,
391| 0| )
392| | }
393| 0| DaemonRequest::EmbedQuery { text } => {
394| 0| let embedder = embedder::get_embedder(models_dir)?;
395| 0| let embedding = embedder::embed_query(embedder, &text)?;
396| 0| let count = handled_embed_requests.fetch_add(1, Ordering::Relaxed) + 1;
397| 0| (
398| 0| DaemonResponse::QueryEmbedding {
399| 0| embedding,
400| 0| handled_embed_requests: count,
401| 0| },
402| 0| false,
403| 0| )
404| | }
405| | DaemonRequest::EmbedPassages {
406| 0| texts,
407| 0| token_counts,
408| | } => {
409| 0| let embedder = embedder::get_embedder(models_dir)?;
410| 0| let text_refs: Vec<&str> = texts.iter().map(String::as_str).collect();
411| 0| let embeddings =
412| 0| embedder::embed_passages_controlled(embedder, &text_refs, &token_counts)?;
413| 0| let count = handled_embed_requests.fetch_add(1, Ordering::Relaxed) + 1;
414| 0| (
415| 0| DaemonResponse::PassageEmbeddings {
416| 0| embeddings,
417| 0| handled_embed_requests: count,
418| 0| },
419| 0| false,
420| 0| )
421| | }
422| | };
423| |
424| 0| write_response(reader.get_mut(), &response)?;
425| 0| Ok(should_exit)
426| 0|}
427| |
428| 0|fn write_response(
429| 0| stream: &mut LocalSocketStream,
430| 0| response: &DaemonResponse,
431| 0|) -> Result<(), AppError> {
432| 0| serde_json::to_writer(&mut *stream, response).map_err(AppError::Json)?;
433| 0| stream.write_all(b"\n").map_err(AppError::Io)?;
434| 0| stream.flush().map_err(AppError::Io)?;
435| 0| Ok(())
436| 0|}
437| |
438| 1|fn request_if_available(
439| 1| models_dir: &Path,
440| 1| request: &DaemonRequest,
441| 1|) -> Result<Option<DaemonResponse>, AppError> {
442| 1| let socket = daemon_label(models_dir);
443| 1| let name = match to_local_socket_name(&socket) {
444| 1| Ok(name) => name,
445| 0| Err(err) => return Err(AppError::Io(err)),
446| | };
447| |
448| 1| let mut stream = match LocalSocketStream::connect(name) {
^0
449| 0| Ok(stream) => stream,
450| 1| Err(err)
451| 0| if matches!(
452| 1| err.kind(),
453| | std::io::ErrorKind::NotFound
454| | | std::io::ErrorKind::ConnectionRefused
455| | | std::io::ErrorKind::AddrNotAvailable
456| | | std::io::ErrorKind::TimedOut
457| | ) =>
458| | {
459| 1| return Ok(None);
460| | }
461| 0| Err(err) => return Err(AppError::Io(err)),
462| | };
463| |
464| 0| serde_json::to_writer(&mut stream, request).map_err(AppError::Json)?;
465| 0| stream.write_all(b"\n").map_err(AppError::Io)?;
466| 0| stream.flush().map_err(AppError::Io)?;
467| |
468| 0| let mut reader = BufReader::new(stream);
469| 0| let mut line = String::new();
470| 0| reader.read_line(&mut line).map_err(AppError::Io)?;
471| 0| if line.trim().is_empty() {
472| 0| return Err(AppError::Embedding(
473| 0| "daemon returned an empty response".into(),
474| 0| ));
475| 0| }
476| |
477| 0| let response = serde_json::from_str(line.trim()).map_err(AppError::Json)?;
478| 0| Ok(Some(response))
479| 1|}
480| |
481| 0|fn should_autostart(cli_flag: bool) -> bool {
482| 0| if !cli_flag {
483| 0| return false; // explicit CLI override wins
484| 0| }
485| 0| !autostart_disabled_by_env()
486| 0|}
487| |
488| |/// Checks whether a running daemon has a different version from the current CLI binary.
489| |/// If a mismatch is detected, shuts down the stale daemon, waits for it to exit, and
490| |/// re-spawns a fresh one. The `VERSION_RESTART_ATTEMPTED` state prevents infinite loops:
491| |/// this function is a no-op after the first attempt regardless of outcome.
492| 0|fn maybe_restart_for_version_mismatch(models_dir: &Path) -> Result<(), AppError> {
493| | // ORDERING: Acquire on success synchronizes-with the Release store at line ~505.
494| | // Relaxed on failure: no dependent memory is read on the CAS failure path.
495| 0| if DAEMON_VERSION_STATE
496| 0| .compare_exchange(
497| 0| VERSION_NOT_CHECKED,
498| 0| VERSION_COMPATIBLE,
499| 0| Ordering::Acquire,
500| 0| Ordering::Relaxed,
501| 0| )
502| 0| .is_err()
503| | {
504| | // Already checked (compatible) or already attempted a restart — skip.
505| 0| return Ok(());
506| 0| }
507| |
508| 0| let response = match try_ping(models_dir)? {
509| 0| Some(r) => r,
510| 0| None => return Ok(()), // no daemon running, nothing to check
511| | };
512| |
513| 0| let daemon_version = match &response {
514| 0| DaemonResponse::Ok { version, .. } => version.as_str(),
515| 0| _ => return Ok(()), // unexpected response shape, skip
516| | };
517| |
518| 0| if daemon_version == SQLITE_GRAPHRAG_VERSION {
519| 0| return Ok(()); // versions match, state already set to COMPATIBLE
520| 0| }
521| |
522| | // Mismatch detected — mark as restart-attempted so we never loop.
523| | // ORDERING: Release pairs with the Acquire in compare_exchange and load.
524| 0| DAEMON_VERSION_STATE.store(VERSION_RESTART_ATTEMPTED, Ordering::Release);
525| |
526| 0| tracing::warn!(
527| | target: "daemon",
528| | daemon_version = %daemon_version,
529| | cli_version = SQLITE_GRAPHRAG_VERSION,
530| 0| "daemon version mismatch detected; auto-restarting daemon"
531| | );
532| |
533| | // Send shutdown request.
534| 0| try_shutdown(models_dir)?;
535| |
536| | // Wait for the stale daemon to exit.
537| 0| wait_for_daemon_exit(models_dir)?;
538| |
539| | // Re-spawn the daemon via the existing mechanism.
540| 0| ensure_daemon_running(models_dir)?;
541| |
542| 0| Ok(())
543| 0|}
544| |
545| |/// Polls until the daemon stops responding to pings, with exponential backoff.
546| |/// Starts at 50 ms, doubles each iteration, caps at 500 ms per sleep.
547| |/// Returns `Ok(())` once the daemon is gone or the timeout is reached.
548| |#[cold]
549| |#[inline(never)]
550| 1|fn wait_for_daemon_exit(models_dir: &Path) -> Result<(), AppError> {
551| 1| let deadline = Instant::now() + Duration::from_millis(DAEMON_VERSION_RESTART_WAIT_MS);
552| 1| let mut sleep_ms: u64 = 50;
553| |
554| 1| while Instant::now() < deadline {
555| 1| if try_ping(models_dir)?.is_none() {
^0
556| 1| tracing::debug!(target: "daemon", "stale daemon exited after version-mismatch shutdown");
^0
557| 1| return Ok(());
558| 0| }
559| 0| thread::sleep(Duration::from_millis(sleep_ms));
560| 0| sleep_ms = (sleep_ms * 2).min(500);
561| | }
562| |
563| 0| tracing::warn!(
564| | target: "daemon",
565| | timeout_ms = DAEMON_VERSION_RESTART_WAIT_MS,
566| 0| "timed out waiting for stale daemon to exit after version-mismatch shutdown"
567| | );
568| 0| Ok(())
569| 1|}
570| |
571| 0|fn request_or_autostart(
572| 0| models_dir: &Path,
573| 0| request: &DaemonRequest,
574| 0| cli_autostart: bool,
575| 0|) -> Result<Option<DaemonResponse>, AppError> {
576| | // ORDERING: Acquire pairs with the Release store in maybe_restart_for_version_mismatch.
577| 0| if DAEMON_VERSION_STATE.load(Ordering::Acquire) == VERSION_NOT_CHECKED {
578| 0| maybe_restart_for_version_mismatch(models_dir)?;
579| 0| }
580| |
581| 0| if let Some(response) = request_if_available(models_dir, request)? {
582| 0| clear_spawn_backoff_state(models_dir).ok();
583| 0| return Ok(Some(response));
584| 0| }
585| |
586| 0| if !should_autostart(cli_autostart) {
587| 0| return Ok(None);
588| 0| }
589| |
590| 0| if !ensure_daemon_running(models_dir)? {
591| 0| return Ok(None);
592| 0| }
593| |
594| 0| request_if_available(models_dir, request)
595| 0|}
596| |
597| 0|fn ensure_daemon_running(models_dir: &Path) -> Result<bool, AppError> {
598| 0| if (try_ping(models_dir)?).is_some() {
599| 0| clear_spawn_backoff_state(models_dir).ok();
600| 0| return Ok(true);
601| 0| }
602| |
603| 0| if spawn_backoff_active(models_dir)? {
604| 0| tracing::warn!(target: "daemon", "daemon autostart suppressed by backoff window");
605| 0| return Ok(false);
606| 0| }
607| |
608| 0| let spawn_lock = match try_acquire_spawn_lock(models_dir)? {
609| 0| Some(lock) => lock,
610| 0| None => return wait_for_daemon_ready(models_dir),
611| | };
612| |
613| 0| if (try_ping(models_dir)?).is_some() {
614| 0| clear_spawn_backoff_state(models_dir).ok();
615| 0| drop(spawn_lock);
616| 0| return Ok(true);
617| 0| }
618| |
619| 0| let exe = match std::env::current_exe() {
620| 0| Ok(path) => path,
621| 0| Err(err) => {
622| 0| record_spawn_failure(models_dir, &format!("current_exe failed: {err}"))?;
623| 0| drop(spawn_lock);
624| 0| return Ok(false);
625| | }
626| | };
627| |
628| 0| let mut child = std::process::Command::new(exe);
629| 0| child
630| 0| .arg("daemon")
631| 0| .arg("--idle-shutdown-secs")
632| 0| .arg(DAEMON_IDLE_SHUTDOWN_SECS.to_string())
633| 0| .env("SQLITE_GRAPHRAG_DAEMON_CHILD", "1")
634| 0| .env_remove("LD_PRELOAD")
635| 0| .env_remove("LD_LIBRARY_PATH")
636| 0| .env_remove("LD_AUDIT")
637| 0| .env_remove("DYLD_INSERT_LIBRARIES")
638| 0| .env_remove("DYLD_LIBRARY_PATH")
639| 0| .stdin(Stdio::null())
640| 0| .stdout(Stdio::null())
641| 0| .stderr(Stdio::null());
642| |
643| 0| match crate::commands::claude_runner::spawn_with_memory_limit(&mut child) {
644| 0| Ok(child_handle) => {
645| | // SAFETY: deliberate orphan daemon detach. The Child handle is intentionally
646| | // dropped without a corresponding `.wait()` call because the daemon owns its
647| | // own lifecycle: `Stdio::null()` is set on stdin/stdout/stderr (above) so the
648| | // child does not inherit terminal handles, the spawn lock file at
649| | // `<models_dir>/.daemon.spawn.lock` prevents concurrent spawns, and the
650| | // daemon shuts itself down via `DAEMON_IDLE_SHUTDOWN_SECS` (or an explicit
651| | // `daemon stop`/SIGTERM). Keeping the handle here would block the parent
652| | // CLI in the foreground until the daemon exited, defeating the autostart
653| | // contract that callers expect.
654| | // See: docs_rules/rules_rust_processos_externos.md section "Child detach justificado"
655| | // AND docs/adr/0001-daemon-warmup-exception.md (authorized exception to no-daemon rule)
656| 0| let pid = child_handle.id();
657| 0| drop(child_handle);
658| 0| tracing::debug!(
659| | target: "daemon",
660| | pid,
661| 0| "daemon detached; lifecycle managed via spawn lock + readiness file"
662| | );
663| 0| let ready = wait_for_daemon_ready(models_dir)?;
664| 0| if ready {
665| 0| clear_spawn_backoff_state(models_dir).ok();
666| 0| } else {
667| 0| record_spawn_failure(models_dir, "daemon did not become healthy after autostart")?;
668| | }
669| 0| drop(spawn_lock);
670| 0| Ok(ready)
671| | }
672| 0| Err(err) => {
673| 0| record_spawn_failure(models_dir, &format!("daemon spawn failed: {err}"))?;
674| 0| drop(spawn_lock);
675| 0| Ok(false)
676| | }
677| | }
678| 0|}
679| |
680| 0|fn wait_for_daemon_ready(models_dir: &Path) -> Result<bool, AppError> {
681| 0| let deadline = Instant::now() + Duration::from_millis(DAEMON_AUTO_START_MAX_WAIT_MS);
682| 0| let mut sleep_ms = DAEMON_AUTO_START_INITIAL_BACKOFF_MS.max(DAEMON_PING_TIMEOUT_MS);
683| |
684| 0| while Instant::now() < deadline {
685| 0| if (try_ping(models_dir)?).is_some() {
686| 0| return Ok(true);
687| 0| }
688| 0| thread::sleep(Duration::from_millis(sleep_ms));
689| 0| sleep_ms = (sleep_ms * 2).min(DAEMON_AUTO_START_MAX_BACKOFF_MS);
690| | }
691| |
692| 0| Ok(false)
693| 0|}
694| |
/// Reports whether the environment forbids auto-starting the daemon.
///
/// Precedence, highest first (a flag counts as set only when its value is
/// exactly `"1"`):
/// 1. `SQLITE_GRAPHRAG_DAEMON_CHILD` set: disabled.
/// 2. `SQLITE_GRAPHRAG_DAEMON_FORCE_AUTOSTART` set: enabled.
/// 3. `SQLITE_GRAPHRAG_DAEMON_DISABLE_AUTOSTART` set: disabled.
/// 4. Otherwise enabled.
fn autostart_disabled_by_env() -> bool {
    let flag_is_set = |name: &str| matches!(std::env::var(name).as_deref(), Ok("1"));

    if flag_is_set("SQLITE_GRAPHRAG_DAEMON_CHILD") {
        return true;
    }
    if flag_is_set("SQLITE_GRAPHRAG_DAEMON_FORCE_AUTOSTART") {
        return false;
    }
    flag_is_set("SQLITE_GRAPHRAG_DAEMON_DISABLE_AUTOSTART")
}
700| |
/// Returns the directory that holds the daemon's control files (spawn lock,
/// spawn state, pid file): the parent of `models_dir`.
///
/// Special cases:
/// - A bare relative path such as `models` has the empty path as its parent.
///   That is mapped to `.`, because the empty path never `exists()` and the
///   daemon loop treats a missing control directory as a shutdown signal.
/// - A path without a parent (for example `/`) is returned unchanged.
fn daemon_control_dir(models_dir: &Path) -> PathBuf {
    match models_dir.parent() {
        Some(parent) if parent.as_os_str().is_empty() => PathBuf::from("."),
        Some(parent) => parent.to_path_buf(),
        None => models_dir.to_path_buf(),
    }
}
707| |
708| 0|fn spawn_lock_path(models_dir: &Path) -> PathBuf {
709| 0| daemon_control_dir(models_dir).join("daemon-spawn.lock")
710| 0|}
711| |
712| 30|fn spawn_state_path(models_dir: &Path) -> PathBuf {
713| 30| daemon_control_dir(models_dir).join("daemon-spawn-state.json")
714| 30|}
715| |
716| 0|fn pid_file_path(models_dir: &Path) -> PathBuf {
717| 0| daemon_control_dir(models_dir).join("daemon.pid")
718| 0|}
719| |
720| 0|fn try_acquire_spawn_lock(models_dir: &Path) -> Result<Option<File>, AppError> {
721| 0| let path = spawn_lock_path(models_dir);
722| 0| std::fs::create_dir_all(crate::paths::parent_or_err(&path)?).map_err(AppError::Io)?;
723| 0| let file = OpenOptions::new()
724| 0| .read(true)
725| 0| .write(true)
726| 0| .create(true)
727| 0| .truncate(false)
728| 0| .open(path)
729| 0| .map_err(AppError::Io)?;
730| |
731| 0| let deadline = Instant::now() + Duration::from_millis(DAEMON_SPAWN_LOCK_WAIT_MS);
732| | loop {
733| 0| match file.try_lock_exclusive() {
734| 0| Ok(()) => return Ok(Some(file)),
735| 0| Err(err) if err.kind() == std::io::ErrorKind::WouldBlock => {
736| 0| if Instant::now() >= deadline {
737| 0| return Ok(None);
738| 0| }
739| 0| thread::sleep(Duration::from_millis(50));
740| | }
741| 0| Err(err) => return Err(AppError::Io(err)),
742| | }
743| | }
744| 0|}
745| |
746| 3|fn spawn_backoff_active(models_dir: &Path) -> Result<bool, AppError> {
747| 3| let state = load_spawn_state(models_dir)?;
^0
748| 3| Ok(now_epoch_ms() < state.not_before_epoch_ms)
749| 3|}
750| |
751| |#[cold]
752| |#[inline(never)]
753| 11|fn record_spawn_failure(models_dir: &Path, message: &str) -> Result<(), AppError> {
754| 11| let mut state = load_spawn_state(models_dir)?;
^0
755| 11| state.consecutive_failures = state.consecutive_failures.saturating_add(1);
756| 11| let exponent = state.consecutive_failures.saturating_sub(1).min(6);
757| 11| let base_ms =
758| 11| (DAEMON_SPAWN_BACKOFF_BASE_MS * (1_u64 << exponent)).min(DAEMON_AUTO_START_MAX_BACKOFF_MS);
759| | // v1.0.36 (L2) + v1.0.43 (H7): half-jitter via fastrand (replaces SystemTime nanoseconds
760| | // which violated rules_rust_retry_com_backoff.md). Effective backoff range: [base/2, base).
761| 11| let half = base_ms / 2;
762| 11| let jitter = if half == 0 { 0 } else { fastrand::u64(0..half) };
^0
763| 11| let backoff_ms = half + jitter;
764| 11| state.not_before_epoch_ms = now_epoch_ms() + backoff_ms;
765| 11| state.last_error = Some(message.to_string());
766| 11| save_spawn_state(models_dir, &state)
767| 11|}
768| |
769| 1|fn clear_spawn_backoff_state(models_dir: &Path) -> Result<(), AppError> {
770| 1| let path = spawn_state_path(models_dir);
771| 1| if path.exists() {
772| 1| std::fs::remove_file(path).map_err(AppError::Io)?;
^0
773| 0| }
774| 1| Ok(())
775| 1|}
776| |
777| 17|fn load_spawn_state(models_dir: &Path) -> Result<DaemonSpawnState, AppError> {
778| 17| let path = spawn_state_path(models_dir);
779| 17| if !path.exists() {
780| 4| return Ok(DaemonSpawnState::default());
781| 13| }
782| |
783| 13| let bytes = std::fs::read(path).map_err(AppError::Io)?;
^0
784| 13| serde_json::from_slice(&bytes).map_err(AppError::Json)
785| 17|}
786| |
787| 12|fn save_spawn_state(models_dir: &Path, state: &DaemonSpawnState) -> Result<(), AppError> {
788| 12| let path = spawn_state_path(models_dir);
789| 12| std::fs::create_dir_all(crate::paths::parent_or_err(&path)?).map_err(AppError::Io)?;
^0 ^0
790| 12| let bytes = serde_json::to_vec(state).map_err(AppError::Json)?;
^0
791| 12| std::fs::write(path, bytes).map_err(AppError::Io)
792| 12|}
793| |
/// Reads the GLiNER model variant from `SQLITE_GRAPHRAG_GLINER_VARIANT`.
///
/// Falls back to `"fp32"` when the variable is unset or not valid Unicode.
fn gliner_variant_from_env() -> String {
    match std::env::var("SQLITE_GRAPHRAG_GLINER_VARIANT") {
        Ok(variant) => variant,
        Err(_) => String::from("fp32"),
    }
}
799| |
/// Current wall-clock time as milliseconds since the Unix epoch.
///
/// Yields 0 when the system clock reports a time before the epoch.
fn now_epoch_ms() -> u64 {
    let since_epoch = match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed,
        Err(_) => Duration::from_secs(0),
    };
    since_epoch.as_millis() as u64
}
806| |
807| 2|fn to_local_socket_name(name: &str) -> std::io::Result<interprocess::local_socket::Name<'static>> {
808| 2| if let Ok(ns_name) = name.to_string().to_ns_name::<GenericNamespaced>() {
809| 2| return Ok(ns_name);
810| 0| }
811| |
812| | // Fallback when abstract namespaces are unavailable. Honours XDG_RUNTIME_DIR
813| | // (Linux user-private runtime dir) or SQLITE_GRAPHRAG_HOME (project override)
814| | // before falling back to /tmp, which can collide when the same name is used
815| | // by another user/project on a multi-tenant host. Added in v1.0.35.
816| 0| let path = if cfg!(unix) {
817| 0| let base = std::env::var_os("XDG_RUNTIME_DIR")
818| 0| .or_else(|| std::env::var_os("SQLITE_GRAPHRAG_HOME"))
819| 0| .map(std::path::PathBuf::from)
820| 0| .unwrap_or_else(std::env::temp_dir);
821| 0| base.join(format!("{name}.sock"))
822| 0| .to_string_lossy()
823| 0| .into_owned()
824| | } else {
825| 0| format!(r"\\.\pipe\{name}")
826| | };
827| 0| path.to_fs_name::<GenericFilePath>()
828| 2|}
829| |
#[cfg(test)]
mod tests {
    use super::*;

    /// Serializes the tests that mutate `SQLITE_GRAPHRAG_GLINER_VARIANT`.
    ///
    /// The test harness runs tests on several threads of one process and the
    /// environment is process-wide, so without this lock one test can observe
    /// the other's value and fail intermittently.
    static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

    /// Creates `<tmp>/cache/models` and returns it together with the temp-dir
    /// handle, which must stay alive for as long as the directory is used.
    fn scratch_models_dir() -> (tempfile::TempDir, PathBuf) {
        let tmp = tempfile::tempdir().unwrap();
        let models_dir = tmp.path().join("cache").join("models");
        std::fs::create_dir_all(&models_dir).unwrap();
        (tmp, models_dir)
    }

    #[test]
    fn record_and_clear_spawn_backoff_state() {
        let (_tmp, models_dir) = scratch_models_dir();

        assert!(!spawn_backoff_active(&models_dir).unwrap());

        record_spawn_failure(&models_dir, "spawn failed").unwrap();
        assert!(spawn_backoff_active(&models_dir).unwrap());

        let state = load_spawn_state(&models_dir).unwrap();
        assert_eq!(state.consecutive_failures, 1);
        assert_eq!(state.last_error.as_deref(), Some("spawn failed"));

        clear_spawn_backoff_state(&models_dir).unwrap();
        assert!(!spawn_backoff_active(&models_dir).unwrap());
    }

    #[test]
    fn daemon_control_dir_uses_models_parent() {
        let base = PathBuf::from("/tmp/sqlite-graphrag-cache-test");
        let models_dir = base.join("models");
        assert_eq!(daemon_control_dir(&models_dir), base);
    }

    #[test]
    fn version_state_constants_are_distinct() {
        assert_ne!(VERSION_NOT_CHECKED, VERSION_COMPATIBLE);
        assert_ne!(VERSION_NOT_CHECKED, VERSION_RESTART_ATTEMPTED);
        assert_ne!(VERSION_COMPATIBLE, VERSION_RESTART_ATTEMPTED);
    }

    #[test]
    fn wait_for_daemon_exit_immediate_when_not_running() {
        let (_tmp, models_dir) = scratch_models_dir();

        let start = Instant::now();
        wait_for_daemon_exit(&models_dir).unwrap();
        // Without a daemon, the first ping returns None and the function exits immediately.
        assert!(start.elapsed() < Duration::from_millis(500));
    }

    #[test]
    fn spawn_backoff_exponent_caps_at_six() {
        let (_tmp, models_dir) = scratch_models_dir();

        // Record 10 consecutive failures to force exponent saturation.
        for i in 0..10 {
            record_spawn_failure(&models_dir, &format!("failure {i}")).unwrap();
        }

        let state = load_spawn_state(&models_dir).unwrap();
        assert_eq!(state.consecutive_failures, 10);

        // The exponent is clamped at 6, so the largest base delay is
        // DAEMON_SPAWN_BACKOFF_BASE_MS * 2^6, further capped by
        // DAEMON_AUTO_START_MAX_BACKOFF_MS. With half-jitter the effective
        // delay is strictly below that base.
        let max_base =
            (DAEMON_SPAWN_BACKOFF_BASE_MS * (1_u64 << 6)).min(DAEMON_AUTO_START_MAX_BACKOFF_MS);
        // The not_before_epoch_ms must not exceed now + max_base.
        let now = now_epoch_ms();
        assert!(state.not_before_epoch_ms <= now + max_base);
    }

    #[test]
    fn spawn_backoff_half_jitter_in_range() {
        // Verify the half-jitter formula: result = half + fastrand::u64(0..half)
        // produces values in [half, half + half) == [base/2, base).
        let base_ms: u64 = 100;
        let half = base_ms / 2;
        for _ in 0..100 {
            let jitter = fastrand::u64(0..half);
            let result = half + jitter;
            assert!(result >= half, "result {result} below half {half}");
            assert!(result < base_ms, "result {result} not below base {base_ms}");
        }
    }

    #[test]
    fn to_local_socket_name_produces_valid_result() {
        let result = to_local_socket_name("sqlite-graphrag-test-daemon");
        assert!(result.is_ok(), "expected Ok, got {result:?}");
        // The name string representation must be non-empty.
        let name = result.unwrap();
        let display = format!("{name:?}");
        assert!(!display.is_empty());
    }

    #[test]
    fn version_cas_not_checked_to_compatible() {
        let state = AtomicU8::new(VERSION_NOT_CHECKED);
        let result = state.compare_exchange(
            VERSION_NOT_CHECKED,
            VERSION_COMPATIBLE,
            Ordering::SeqCst,
            Ordering::SeqCst,
        );
        assert!(result.is_ok());
        assert_eq!(state.load(Ordering::SeqCst), VERSION_COMPATIBLE);
    }

    #[test]
    fn version_cas_prevents_double_restart() {
        let state = AtomicU8::new(VERSION_NOT_CHECKED);

        // First CAS: NOT_CHECKED -> RESTART_ATTEMPTED succeeds.
        let first = state.compare_exchange(
            VERSION_NOT_CHECKED,
            VERSION_RESTART_ATTEMPTED,
            Ordering::SeqCst,
            Ordering::SeqCst,
        );
        assert!(first.is_ok());

        // Second CAS from NOT_CHECKED must fail: the state is already RESTART_ATTEMPTED.
        let second = state.compare_exchange(
            VERSION_NOT_CHECKED,
            VERSION_RESTART_ATTEMPTED,
            Ordering::SeqCst,
            Ordering::SeqCst,
        );
        assert!(second.is_err());
        assert_eq!(state.load(Ordering::SeqCst), VERSION_RESTART_ATTEMPTED);
    }

    #[test]
    fn ping_response_includes_model_fields() {
        let resp = DaemonResponse::Ok {
            pid: 42,
            version: "1.0.0".to_string(),
            handled_embed_requests: 7,
            model_name: "multilingual-e5-small".to_string(),
            model_variant: "fp32".to_string(),
        };
        let json = serde_json::to_value(&resp).expect("serialization failed");
        assert_eq!(json["model_name"], "multilingual-e5-small");
        assert_eq!(json["model_variant"], "fp32");
        assert_eq!(json["status"], "ok");
        assert_eq!(json["handled_embed_requests"], 7u64);
    }

    #[test]
    fn gliner_variant_defaults_to_fp32() {
        // A poisoned lock only means another env test panicked; the guard is still usable.
        let _env = ENV_LOCK.lock().unwrap_or_else(|poisoned| poisoned.into_inner());

        // Ensure the default is fp32 when env var is not set.
        std::env::remove_var("SQLITE_GRAPHRAG_GLINER_VARIANT");
        let variant = gliner_variant_from_env();
        assert_eq!(variant, "fp32");
    }

    #[test]
    fn gliner_variant_reads_env_var() {
        // A poisoned lock only means another env test panicked; the guard is still usable.
        let _env = ENV_LOCK.lock().unwrap_or_else(|poisoned| poisoned.into_inner());

        std::env::set_var("SQLITE_GRAPHRAG_GLINER_VARIANT", "int8");
        let variant = gliner_variant_from_env();
        std::env::remove_var("SQLITE_GRAPHRAG_GLINER_VARIANT");
        assert_eq!(variant, "int8");
    }

    #[test]
    fn spawn_state_serialization_roundtrip() {
        let (_tmp, models_dir) = scratch_models_dir();

        let original = DaemonSpawnState {
            consecutive_failures: 3,
            not_before_epoch_ms: 9_999_999_999,
            last_error: Some("test error message".to_string()),
        };
        save_spawn_state(&models_dir, &original).unwrap();

        let loaded = load_spawn_state(&models_dir).unwrap();
        assert_eq!(loaded.consecutive_failures, original.consecutive_failures);
        assert_eq!(loaded.not_before_epoch_ms, original.not_before_epoch_ms);
        assert_eq!(loaded.last_error, original.last_error);
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/embedder.rs:
1| |//! fastembed wrapper and per-process embedding cache.
2| |//!
3| |//! Owns the in-process `TextEmbedding` model and exposes batch encode/query
4| |//! helpers used by remember, recall, and related commands.
5| |// Workload: CPU-bound (ONNX inference, matrix multiplication via fastembed)
6| |
7| |use crate::constants::{
8| | EMBEDDING_DIM, EMBEDDING_MAX_TOKENS, FASTEMBED_BATCH_SIZE, PASSAGE_PREFIX, QUERY_PREFIX,
9| | REMEMBER_MAX_CONTROLLED_BATCH_CHUNKS, REMEMBER_MAX_CONTROLLED_BATCH_PADDED_TOKENS,
10| |};
11| |use crate::errors::AppError;
12| |use fastembed::{EmbeddingModel, ExecutionProviderDispatch, TextEmbedding, TextInitOptions};
13| |use ort::ep::CPU;
14| |use parking_lot::Mutex;
15| |use std::path::Path;
16| |use std::sync::OnceLock;
17| |
18| |/// Process-wide singleton embedding model behind a `Mutex`.
19| |///
20| |/// ONNX Runtime's `Session` is not guaranteed thread-safe for concurrent
21| |/// inference; `Mutex` serialises all embedding calls. This is correct by
22| |/// design — without the daemon, embedding throughput is intentionally serial.
23| |///
24| |/// For parallel workloads (enrich, ingest) start the daemon first:
25| |/// `sqlite-graphrag daemon` — the model is loaded once and served via UDS,
26| |/// eliminating Mutex contention across CLI invocations.
27| |static EMBEDDER: OnceLock<Mutex<TextEmbedding>> = OnceLock::new();
28| |
29| |/// Returns the process-wide singleton embedder, initializing it on first call.
30| |/// Subsequent calls return the cached instance regardless of `models_dir`.
31| |///
32| |/// # Errors
33| |///
34| |/// - [`AppError::Embedding`] — ONNX model load failure or runtime initialisation error.
35| |/// - [`AppError::Io`] — cache directory is inaccessible or cannot be created.
36| 0|pub fn get_embedder(models_dir: &Path) -> Result<&'static Mutex<TextEmbedding>, AppError> {
37| 0| if let Some(m) = EMBEDDER.get() {
38| 0| return Ok(m);
39| 0| }
40| |
41| 0| maybe_init_dynamic_ort(models_dir)?;
42| |
43| | // Multi-layer mitigation of the explosive RSS observed with variable-shape
44| | // payloads. The three current layers are:
45| | // 1. `with_arena_allocator(false)` on the CPU execution provider (line below)
46| | // 2. env var `ORT_DISABLE_CPU_MEM_ARENA=1` in `main.rs` (default since v1.0.18)
47| | // 3. env var `ORT_NUM_THREADS=1` + `ORT_INTRA_OP_NUM_THREADS=1` in `main.rs`
48| | // The `with_memory_pattern(false)` flag exists in ort 2.0 (`SessionBuilder`)
49| | // but fastembed 5.13.2 does NOT expose access to a custom SessionBuilder via
50| | // `TextInitOptions`. If RSS grows again in real corpora, the next
51| | // mitigation requires one of the following paths:
52| | // - Fork fastembed to expose `SessionBuilder::with_memory_pattern(false)`
53| | // - Bypass fastembed and use ort directly with a custom SessionBuilder
54| | // - Fixed padding in `plan_controlled_batches` to eliminate variable shapes
55| | // References:
56| | // https://onnxruntime.ai/docs/performance/tune-performance/memory.html
57| | // https://github.com/qdrant/fastembed/issues/570
58| 0| let cpu_ep: ExecutionProviderDispatch = CPU::default().with_arena_allocator(false).build();
59| |
60| 0| let model = TextEmbedding::try_new(
61| 0| TextInitOptions::new(EmbeddingModel::MultilingualE5Small)
62| 0| .with_execution_providers(vec![cpu_ep])
63| 0| .with_max_length(EMBEDDING_MAX_TOKENS)
64| 0| .with_show_download_progress(true)
65| 0| .with_cache_dir(models_dir.to_path_buf()),
66| | )
67| 0| .map_err(|e| AppError::Embedding(e.to_string()))?;
68| | // If another thread raced and won, discard our instance and return theirs.
69| 0| let _ = EMBEDDER.set(Mutex::new(model));
70| 0| EMBEDDER.get().ok_or_else(|| {
71| 0| AppError::Embedding(
72| 0| "embedder OnceLock unexpectedly empty after set() (likely a racing initializer aborted before completion)"
73| 0| .into(),
74| 0| )
75| 0| })
76| 0|}
77| |
78| |#[cfg(all(target_arch = "aarch64", target_os = "linux", target_env = "gnu"))]
79| |fn maybe_init_dynamic_ort(models_dir: &Path) -> Result<(), AppError> {
80| | let mut candidates = Vec::with_capacity(4);
81| |
82| | if let Ok(path) = std::env::var("ORT_DYLIB_PATH") {
83| | if !path.is_empty() {
84| | candidates.push(std::path::PathBuf::from(path));
85| | }
86| | }
87| |
88| | if let Ok(exe) = std::env::current_exe() {
89| | if let Some(dir) = exe.parent() {
90| | candidates.push(dir.join("libonnxruntime.so"));
91| | candidates.push(dir.join("lib").join("libonnxruntime.so"));
92| | }
93| | }
94| |
95| | candidates.push(models_dir.join("libonnxruntime.so"));
96| |
97| | for path in candidates {
98| | if !path.exists() {
99| | continue;
100| | }
101| |
102| | std::env::set_var("ORT_DYLIB_PATH", &path);
103| | let _ = ort::init_from(&path)
104| | .map_err(|e| AppError::Embedding(e.to_string()))?
105| | .commit();
106| | return Ok(());
107| | }
108| |
109| | Ok(())
110| |}
111| |
112| |#[cfg(not(all(target_arch = "aarch64", target_os = "linux", target_env = "gnu")))]
113| 0|fn maybe_init_dynamic_ort(_models_dir: &Path) -> Result<(), AppError> {
114| 0| Ok(())
115| 0|}
116| |
117| |/// Embeds a single passage using the `passage:` prefix required by E5 models.
118| |///
119| |/// # Errors
120| |/// Returns `Err` when the model returns an unexpected result.
121| |#[tracing::instrument(skip(embedder, text), fields(text_len = text.len()))]
122| 0|pub fn embed_passage(embedder: &Mutex<TextEmbedding>, text: &str) -> Result<Vec<f32>, AppError> {
123| 0| let prefixed = format!("{PASSAGE_PREFIX}{text}");
124| 0| let results = embedder
125| 0| .lock()
126| 0| .embed(vec![prefixed.as_str()], Some(1))
127| 0| .map_err(|e| AppError::Embedding(e.to_string()))?;
128| 0| let emb = results
129| 0| .into_iter()
130| 0| .next()
131| 0| .ok_or_else(|| AppError::Embedding("empty embedding result".into()))?;
132| 0| assert_eq!(emb.len(), EMBEDDING_DIM, "unexpected embedding dimension");
133| 0| Ok(emb)
134| 0|}
135| |
136| |/// Embeds a search query using the `query:` prefix required by E5 models.
137| |///
138| |/// # Errors
139| |/// Returns `Err` when the model returns an unexpected result.
140| |#[tracing::instrument(skip(embedder, text), fields(text_len = text.len()))]
141| 0|pub fn embed_query(embedder: &Mutex<TextEmbedding>, text: &str) -> Result<Vec<f32>, AppError> {
142| 0| let prefixed = format!("{QUERY_PREFIX}{text}");
143| 0| let results = embedder
144| 0| .lock()
145| 0| .embed(vec![prefixed.as_str()], Some(1))
146| 0| .map_err(|e| AppError::Embedding(e.to_string()))?;
147| 0| let emb = results
148| 0| .into_iter()
149| 0| .next()
150| 0| .ok_or_else(|| AppError::Embedding("empty embedding result".into()))?;
151| 0| Ok(emb)
152| 0|}
153| |
154| |/// Embeds multiple passages in a single ONNX batch call.
155| |///
156| |/// `batch_size` is capped at `FASTEMBED_BATCH_SIZE`. All texts receive the `passage:` prefix.
157| |///
158| |/// # Errors
159| |/// Returns `Err` when the model inference fails.
160| |#[tracing::instrument(skip(embedder, texts), fields(batch_size = texts.len()))]
161| 0|pub fn embed_passages_batch(
162| 0| embedder: &Mutex<TextEmbedding>,
163| 0| texts: &[&str],
164| 0| batch_size: usize,
165| 0|) -> Result<Vec<Vec<f32>>, AppError> {
166| 0| let prefixed: Vec<String> = texts
167| 0| .iter()
168| 0| .map(|t| format!("{PASSAGE_PREFIX}{t}"))
169| 0| .collect();
170| 0| let strs: Vec<&str> = prefixed.iter().map(String::as_str).collect();
171| 0| let results = embedder
172| 0| .lock()
173| 0| .embed(strs, Some(batch_size.min(FASTEMBED_BATCH_SIZE)))
174| 0| .map_err(|e| AppError::Embedding(e.to_string()))?;
175| 0| for emb in &results {
176| 0| assert_eq!(emb.len(), EMBEDDING_DIM, "unexpected embedding dimension");
177| | }
178| 0| Ok(results)
179| 0|}
180| |
181| |/// Returns the number of batches that [`embed_passages_controlled`] would produce
182| |/// for the given `token_counts` slice without running inference.
183| 1|pub fn controlled_batch_count(token_counts: &[usize]) -> usize {
184| 1| plan_controlled_batches(token_counts).len()
185| 1|}
186| |
187| |/// Embeds passages grouped into token-budget-aware batches to avoid OOM on variable-length inputs.
188| |///
189| |/// `texts` and `token_counts` must have the same length. Batches are planned using an
190| |/// internal budget algorithm and single-item batches fall back to [`embed_passage`].
191| |///
192| |/// # Errors
193| |/// Returns `Err` when lengths differ, the mutex is poisoned, or inference fails.
194| 0|pub fn embed_passages_controlled(
195| 0| embedder: &Mutex<TextEmbedding>,
196| 0| texts: &[&str],
197| 0| token_counts: &[usize],
198| 0|) -> Result<Vec<Vec<f32>>, AppError> {
199| 0| if texts.len() != token_counts.len() {
200| 0| return Err(AppError::Internal(anyhow::anyhow!(
201| 0| "texts/token_counts length mismatch in controlled embedding"
202| 0| )));
203| 0| }
204| |
205| 0| let mut results = Vec::with_capacity(texts.len());
206| 0| for (start, end) in plan_controlled_batches(token_counts) {
207| 0| if end - start == 1 {
208| 0| results.push(embed_passage(embedder, texts[start])?);
209| 0| continue;
210| 0| }
211| |
212| 0| results.extend(embed_passages_batch(
213| 0| embedder,
214| 0| &texts[start..end],
215| 0| end - start,
216| 0| )?);
217| | }
218| |
219| 0| Ok(results)
220| 0|}
221| |
222| |/// Embed multiple passages one-by-one (serial ONNX inference).
223| |///
224| |/// Serialization is **intentional**: ONNX batch inference can trigger pathological
225| |/// runtime behaviour on real-world Markdown chunks (variable token lengths cause
226| |/// extreme padding overhead). Callers that need parallelism should use the rayon
227| |/// `ThreadPool` in `src/commands/ingest.rs::run`, which partitions work across
228| |/// CPU threads and calls this function per shard.
229| |///
230| |/// # Errors
231| |///
232| |/// Returns [`AppError::Embedding`] when the ONNX encoder fails on any passage.
233| 0|pub fn embed_passages_serial<'a, I>(
234| 0| embedder: &Mutex<TextEmbedding>,
235| 0| texts: I,
236| 0|) -> Result<Vec<Vec<f32>>, AppError>
237| 0|where
238| 0| I: IntoIterator<Item = &'a str>,
239| |{
240| 0| let iter = texts.into_iter();
241| 0| let (lower, _) = iter.size_hint();
242| 0| let mut results = Vec::with_capacity(lower);
243| 0| for text in iter {
244| 0| results.push(embed_passage(embedder, text)?);
245| | }
246| 0| Ok(results)
247| 0|}
248| |
249| 2|fn plan_controlled_batches(token_counts: &[usize]) -> Vec<(usize, usize)> {
250| 2| let mut batches =
251| 2| Vec::with_capacity((token_counts.len() / REMEMBER_MAX_CONTROLLED_BATCH_CHUNKS).max(1));
252| 2| let mut start = 0usize;
253| |
254| 6| while start < token_counts.len() {
255| 4| let mut end = start + 1;
256| 4| let mut max_tokens = token_counts[start].max(1);
257| |
258| 7| while end < token_counts.len() && end - start < REMEMBER_MAX_CONTROLLED_BATCH_CHUNKS {
^5
259| 4| let candidate_max = max_tokens.max(token_counts[end].max(1));
260| 4| let candidate_len = end + 1 - start;
261| 4| if candidate_max * candidate_len > REMEMBER_MAX_CONTROLLED_BATCH_PADDED_TOKENS {
262| 1| break;
263| 3| }
264| 3| max_tokens = candidate_max;
265| 3| end += 1;
266| | }
267| |
268| 4| batches.push((start, end));
269| 4| start = end;
270| | }
271| |
272| 2| batches
273| 2|}
274| |
275| |/// Convert `&[f32]` to `&[u8]` for sqlite-vec storage.
276| |///
277| |/// # Safety
278| |///
279| |/// This function is sound when the following invariants hold:
280| |/// 1. `f32` has no padding bytes per the Rust reference
281| |/// (<https://doc.rust-lang.org/reference/types/numeric.html>);
282| |/// `[f32]` has the same byte representation as `[u8; size_of_val(v)]`.
283| |/// 2. The returned `&[u8]` borrows from `v`; its lifetime is tied to the input slice.
284| |/// 3. Endianness matches sqlite-vec on supported platforms (x86_64, aarch64 little-endian).
285| |/// Targets with big-endian `f32` storage are not supported by sqlite-vec.
286| |#[cfg(target_endian = "big")]
287| |compile_error!(
288| | "sqlite-graphrag requires little-endian f32 layout for sqlite-vec compatibility. \
289| | Big-endian targets (PPC64, S390x) are not supported."
290| |);
291| |
292| 25|pub fn f32_to_bytes(v: &[f32]) -> &[u8] {
293| | // SAFETY: see invariants above. f32→u8 transmute via from_raw_parts is sound.
294| 25| unsafe { std::slice::from_raw_parts(v.as_ptr() as *const u8, std::mem::size_of_val(v)) }
295| 25|}
296| |
297| |#[cfg(test)]
298| |mod tests {
299| | use super::*;
300| | use crate::constants::{EMBEDDING_DIM, PASSAGE_PREFIX, QUERY_PREFIX};
301| |
302| | // --- f32_to_bytes tests (pure function, no model) ---
303| |
304| | #[test]
305| 1| fn f32_to_bytes_empty_slice_returns_empty() {
306| 1| let v: Vec<f32> = vec![];
307| 1| assert_eq!(f32_to_bytes(&v), &[] as &[u8]);
308| 1| }
309| |
310| | #[test]
311| 1| fn f32_to_bytes_one_element_returns_4_bytes() {
312| 1| let v = vec![1.0_f32];
313| 1| let bytes = f32_to_bytes(&v);
314| 1| assert_eq!(bytes.len(), 4);
315| | // roundtrip: the 4 bytes must reconstruct the original f32
316| 1| let recovered = f32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]);
317| 1| assert_eq!(recovered, 1.0_f32);
318| 1| }
319| |
320| | #[test]
321| 1| fn f32_to_bytes_length_is_4x_elements() {
322| 1| let v = vec![0.0_f32, 1.0, 2.0, 3.0];
323| 1| assert_eq!(f32_to_bytes(&v).len(), v.len() * 4);
324| 1| }
325| |
326| | #[test]
327| 1| fn f32_to_bytes_zero_encoded_as_4_zeros() {
328| 1| let v = vec![0.0_f32];
329| 1| assert_eq!(f32_to_bytes(&v), &[0u8, 0, 0, 0]);
330| 1| }
331| |
332| | #[test]
333| 1| fn f32_to_bytes_roundtrip_vector_embedding_dim() {
334| 384| let v: Vec<f32> = (0..EMBEDDING_DIM).map(|i| i as f32 * 0.001).collect();
^1 ^1 ^1 ^1 ^1
335| 1| let bytes = f32_to_bytes(&v);
336| 1| assert_eq!(bytes.len(), EMBEDDING_DIM * 4);
337| | // reconstructs and compares first and last element
338| 1| let first = f32::from_le_bytes(bytes[0..4].try_into().unwrap());
339| 1| assert!((first - 0.0_f32).abs() < 1e-6);
340| 1| let last_start = (EMBEDDING_DIM - 1) * 4;
341| 1| let last = f32::from_le_bytes(bytes[last_start..last_start + 4].try_into().unwrap());
342| 1| assert!((last - (EMBEDDING_DIM - 1) as f32 * 0.001).abs() < 1e-4);
343| 1| }
344| |
345| | // --- verifies prefixes used by the embedder (no model) ---
346| |
347| | #[test]
348| 1| fn passage_prefix_not_empty() {
349| 1| assert_eq!(PASSAGE_PREFIX, "passage: ");
350| 1| }
351| |
352| | #[test]
353| 1| fn query_prefix_not_empty() {
354| 1| assert_eq!(QUERY_PREFIX, "query: ");
355| 1| }
356| |
357| | #[test]
358| 1| fn embedding_dim_is_384() {
359| 1| assert_eq!(EMBEDDING_DIM, 384);
360| 1| }
361| |
362| | // --- testes com modelo real (ignorados no CI normal) ---
363| |
364| | #[test]
365| | #[ignore = "requires ~600 MB model on disk; run with --include-ignored"]
366| 0| fn embed_passage_returns_vector_with_correct_dimension() {
367| 0| let dir = tempfile::tempdir().unwrap();
368| 0| let embedder = get_embedder(dir.path()).unwrap();
369| 0| let result = embed_passage(embedder, "test text").unwrap();
370| 0| assert_eq!(result.len(), EMBEDDING_DIM);
371| 0| }
372| |
373| | #[test]
374| | #[ignore = "requires ~600 MB model on disk; run with --include-ignored"]
375| 0| fn embed_query_returns_vector_with_correct_dimension() {
376| 0| let dir = tempfile::tempdir().unwrap();
377| 0| let embedder = get_embedder(dir.path()).unwrap();
378| 0| let result = embed_query(embedder, "test query").unwrap();
379| 0| assert_eq!(result.len(), EMBEDDING_DIM);
380| 0| }
381| |
382| | #[test]
383| | #[ignore = "requires ~600 MB model on disk; run with --include-ignored"]
384| 0| fn embed_passages_batch_returns_one_vector_per_text() {
385| 0| let dir = tempfile::tempdir().unwrap();
386| 0| let embedder = get_embedder(dir.path()).unwrap();
387| 0| let textos = ["primeiro", "segundo"];
388| 0| let results = embed_passages_batch(embedder, &textos, 2).unwrap();
389| 0| assert_eq!(results.len(), 2);
390| 0| for emb in &results {
391| 0| assert_eq!(emb.len(), EMBEDDING_DIM);
392| | }
393| 0| }
394| |
395| | #[test]
396| 1| fn controlled_batch_plan_respects_budget() {
397| 1| assert_eq!(
398| 1| plan_controlled_batches(&[100, 100, 100, 100, 300, 300]),
399| 1| vec![(0, 4), (4, 5), (5, 6)]
400| | );
401| 1| }
402| |
403| | #[test]
404| 1| fn controlled_batch_count_returns_one_for_single_chunk() {
405| 1| assert_eq!(controlled_batch_count(&[350]), 1);
406| 1| }
407| |}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/entity_type.rs:
1| |//! Canonical entity type taxonomy used across extraction, storage and CLI.
2| |//!
3| |//! `EntityType` is the single source of truth for the 13 graph entity kinds.
4| |//! It derives `clap::ValueEnum` so CLI flags can use it directly, and derives
5| |//! `serde::{Serialize, Deserialize}` with `rename_all = "lowercase"` so JSON
6| |//! round-trips remain backward-compatible with the pre-enum string format.
7| |
8| |use crate::errors::AppError;
9| |
10| |/// The 13 canonical graph entity classifications.
11| |///
12| |/// Values are serialized as lowercase strings (`"person"`, `"organization"`,
13| |/// etc.) matching the pre-enum wire format and the SQLite `type` column.
14| |#[derive(
15| | Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize, clap::ValueEnum,
16| |)]
17| |#[serde(rename_all = "snake_case")]
18| |#[clap(rename_all = "snake_case")]
19| |pub enum EntityType {
20| | Concept,
21| | Date,
22| | Dashboard,
23| | Decision,
24| | File,
25| | Incident,
26| | IssueTracker,
27| | Location,
28| | Memory,
29| | Organization,
30| | Person,
31| | Project,
32| | Tool,
33| |}
34| |
35| |impl EntityType {
36| | /// Returns the canonical lowercase string representation stored in SQLite.
37| 96| pub fn as_str(self) -> &'static str {
38| 96| match self {
39| 24| EntityType::Concept => "concept",
40| 0| EntityType::Date => "date",
41| 0| EntityType::Dashboard => "dashboard",
42| 0| EntityType::Decision => "decision",
43| 0| EntityType::File => "file",
44| 0| EntityType::Incident => "incident",
45| 1| EntityType::IssueTracker => "issue_tracker",
46| 0| EntityType::Location => "location",
47| 0| EntityType::Memory => "memory",
48| 11| EntityType::Organization => "organization",
49| 5| EntityType::Person => "person",
50| 53| EntityType::Project => "project",
51| 2| EntityType::Tool => "tool",
52| | }
53| 96| }
54| |}
55| |
56| |impl std::fmt::Display for EntityType {
57| 0| fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
58| 0| f.write_str(self.as_str())
59| 0| }
60| |}
61| |
62| |impl std::str::FromStr for EntityType {
63| | type Err = AppError;
64| |
65| 16| fn from_str(s: &str) -> Result<Self, Self::Err> {
66| 16| match s.to_lowercase().as_str() {
67| 16| "concept" => Ok(EntityType::Concept),
^1
68| 15| "date" => Ok(EntityType::Date),
^1
69| 14| "dashboard" => Ok(EntityType::Dashboard),
^0
70| 14| "decision" => Ok(EntityType::Decision),
^1
71| 13| "file" => Ok(EntityType::File),
^1
72| 12| "incident" => Ok(EntityType::Incident),
^1
73| 11| "issue_tracker" => Ok(EntityType::IssueTracker),
^1
74| 10| "location" => Ok(EntityType::Location),
^1
75| 9| "memory" => Ok(EntityType::Memory),
^0
76| 9| "organization" => Ok(EntityType::Organization),
^3
77| 6| "person" => Ok(EntityType::Person),
^3
78| 3| "project" => Ok(EntityType::Project),
^1
79| 2| "tool" => Ok(EntityType::Tool),
^1
80| 1| other => {
81| 1| let hint = match other {
82| 1| "reference" | "skill" | "note" | "feedback" => Some("concept"),
^0
83| 1| "document" => Some("file"),
^0
84| 1| "user" => Some("person"),
^0
85| 1| _ => None,
86| | };
87| 1| let msg = if let Some(suggested) = hint {
^0
88| 0| format!(
89| 0| "invalid entity_type '{other}'; '{other}' is a MEMORY type, not an entity type. \
90| 0| Try '{suggested}' instead. Valid entity types: concept, date, dashboard, \
91| 0| decision, file, incident, issue_tracker, location, memory, organization, \
92| 0| person, project, tool"
93| | )
94| | } else {
95| 1| format!(
96| 1| "invalid entity type: {other}; expected one of: concept, date, dashboard, \
97| 1| decision, file, incident, issue_tracker, location, memory, organization, \
98| 1| person, project, tool"
99| | )
100| | };
101| 1| Err(AppError::Validation(msg))
102| | }
103| | }
104| 16| }
105| |}
106| |
107| |impl rusqlite::types::FromSql for EntityType {
108| 0| fn column_result(value: rusqlite::types::ValueRef<'_>) -> rusqlite::types::FromSqlResult<Self> {
109| 0| let s = String::column_result(value)?;
110| 0| s.parse::<EntityType>().map_err(|e| {
111| 0| rusqlite::types::FromSqlError::Other(Box::new(std::io::Error::other(e.to_string())))
112| 0| })
113| 0| }
114| |}
115| |
116| |impl rusqlite::types::ToSql for EntityType {
117| 55| fn to_sql(&self) -> rusqlite::Result<rusqlite::types::ToSqlOutput<'_>> {
118| 55| Ok(rusqlite::types::ToSqlOutput::from(self.as_str()))
119| 55| }
120| |}
121| |
122| |#[cfg(test)]
123| |mod tests {
124| | use super::*;
125| |
126| | #[test]
127| 1| fn from_str_lowercase_roundtrip() {
128| 1| assert_eq!("person".parse::<EntityType>().unwrap(), EntityType::Person);
129| 1| assert_eq!(
130| 1| "organization".parse::<EntityType>().unwrap(),
131| | EntityType::Organization
132| | );
133| 1| assert_eq!(
134| 1| "issue_tracker".parse::<EntityType>().unwrap(),
135| | EntityType::IssueTracker
136| | );
137| 1| }
138| |
139| | #[test]
140| 1| fn from_str_uppercase_is_case_insensitive() {
141| 1| assert_eq!("PERSON".parse::<EntityType>().unwrap(), EntityType::Person);
142| 1| assert_eq!(
143| 1| "Organization".parse::<EntityType>().unwrap(),
144| | EntityType::Organization
145| | );
146| 1| }
147| |
148| | #[test]
149| 1| fn from_str_invalid_returns_err() {
150| 1| let result = "invalid".parse::<EntityType>();
151| 1| assert!(result.is_err());
152| 1| let msg = result.unwrap_err().to_string();
153| 1| assert!(msg.contains("invalid entity type"));
154| 1| }
155| |
156| | #[test]
157| 1| fn as_str_returns_canonical_lowercase() {
158| 1| assert_eq!(EntityType::Person.as_str(), "person");
159| 1| assert_eq!(EntityType::IssueTracker.as_str(), "issue_tracker");
160| 1| }
161| |
162| | #[test]
163| 1| fn serde_json_serializes_as_lowercase_string() {
164| 1| let json = serde_json::to_string(&EntityType::Person).unwrap();
165| 1| assert_eq!(json, "\"person\"");
166| 1| let json = serde_json::to_string(&EntityType::IssueTracker).unwrap();
167| 1| assert_eq!(json, "\"issue_tracker\"");
168| 1| }
169| |
170| | #[test]
171| 1| fn serde_json_deserializes_from_lowercase_string() {
172| 1| let et: EntityType = serde_json::from_str("\"person\"").unwrap();
173| 1| assert_eq!(et, EntityType::Person);
174| 1| }
175| |}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/errors.rs:
1| |//! Library-wide error type.
2| |//!
3| |//! `AppError` is the single error type returned by every public API in the
4| |//! crate. Each variant maps to a deterministic exit code through
5| |//! `AppError::exit_code`, which the binary propagates to the shell on
6| |//! failure. See the README for the full exit code contract.
7| |
8| |use crate::i18n::{current, Language};
9| |use thiserror::Error;
10| |
11| |/// Unified error type for all CLI and library operations.
12| |///
13| |/// Each variant corresponds to a distinct failure category. The
14| |/// [`AppError::exit_code`] method converts a variant into a stable numeric
15| |/// code so that shell callers and LLM agents can route on it.
16| |///
17| |/// # SemVer Policy
18| |///
19| |/// This enum is `#[non_exhaustive]`. New variants may be added in minor
20| |/// releases without breaking downstream match arms (use a wildcard `_`).
21| |#[derive(Error, Debug)]
22| |#[non_exhaustive]
23| |pub enum AppError {
24| | /// Input failed schema, length or format validation. Maps to exit code `1`.
25| | ///
26| | /// This variant groups multiple validation failure causes. Callers that need
27| | /// programmatic retry decisions should use [`AppError::is_retryable`] instead
28| | /// of parsing the message string.
29| | #[error("validation error: {0}")]
30| | Validation(String),
31| |
32| | /// External binary required for operation was not found in PATH. Maps to exit code `1`.
33| | #[error("binary not found: {name} — ensure it is installed and in PATH")]
34| | BinaryNotFound { name: String },
35| |
36| | /// Remote service signaled rate limiting; caller should retry with backoff. Maps to exit code `1`.
37| | #[error("rate limited: {detail}")]
38| | RateLimited { detail: String },
39| |
40| | /// Operation exceeded its time budget. Maps to exit code `1`.
41| | #[error("timeout after {duration_secs}s: {operation}")]
42| | Timeout {
43| | operation: String,
44| | duration_secs: u64,
45| | },
46| |
47| | /// A memory or entity with the same `(namespace, name)` already exists. Maps to exit code `9`.
48| | #[error("duplicate detected: {0}")]
49| | Duplicate(String),
50| |
51| | /// Optimistic update lost the race because `updated_at` changed. Maps to exit code `3`.
52| | #[error("conflict: {0}")]
53| | Conflict(String),
54| |
55| | /// The requested record does not exist or was soft-deleted. Maps to exit code `4`.
56| | #[error("not found: {0}")]
57| | NotFound(String),
58| |
59| | /// Namespace could not be resolved from flag, environment or markers. Maps to exit code `5`.
60| | #[error("namespace not resolved: {0}")]
61| | NamespaceError(String),
62| |
63| | /// Payload exceeded one of the configured body, name or batch limits. Maps to exit code `6`.
64| | #[error("limit exceeded: {0}")]
65| | LimitExceeded(String),
66| |
67| | /// Low-level SQLite error propagated from `rusqlite`. Maps to exit code `10`.
68| | #[error("database error: {0}")]
69| | Database(#[from] rusqlite::Error),
70| |
71| | /// Embedding generation via `fastembed` failed or produced the wrong shape. Maps to exit code `11`.
72| | #[error("embedding error: {0}")]
73| | Embedding(String),
74| |
75| | /// The `sqlite-vec` extension could not load or register its virtual table. Maps to exit code `12`.
76| | #[error("sqlite-vec extension failed: {0}")]
77| | VecExtension(String),
78| |
79| | /// SQLite returned `SQLITE_BUSY` after exhausting retries. Maps to exit code `15` (was `13` before v2.0.0; relocated to free `13` for BatchPartialFailure per PRD).
80| | #[error("database busy: {0}")]
81| | DbBusy(String),
82| |
83| | /// Batch operation failed partially — N of M items failed. Maps to exit code `13` (PRD 1822).
84| | ///
85| | /// Reserved for use in `import`, `reindex` and batch stdin (BLOCK 3/4). Variant present
86| | /// since v2.0.0 even if call-sites do not yet exist — stable exit code mapping.
87| | #[error("batch partial failure: {failed} of {total} items failed")]
88| | BatchPartialFailure { total: usize, failed: usize },
89| |
90| | /// Filesystem I/O error while reading or writing the database or cache. Maps to exit code `14`.
91| | #[error("IO error: {0}")]
92| | Io(#[from] std::io::Error),
93| |
94| | /// Unexpected internal error surfaced through `anyhow`. Maps to exit code `20`.
95| | #[error(transparent)]
96| | Internal(#[from] anyhow::Error),
97| |
98| | /// JSON serialization or deserialization failure. Maps to exit code `20`.
99| | #[error("json error: {0}")]
100| | Json(#[from] serde_json::Error),
101| |
102| | /// Another instance is already running and holds the advisory lock. Maps to exit code `75`.
103| | ///
104| | /// Use `--allow-parallel` to skip the lock or `--wait-lock SECONDS` to retry.
105| | #[error("lock busy: {0}")]
106| | LockBusy(String),
107| |
108| | /// All concurrency slots are occupied after the wait timeout. Maps to exit code `75`.
109| | ///
110| | /// Occurs when [`crate::constants::MAX_CONCURRENT_CLI_INSTANCES`] instances are already
111| | /// active and the wait limit [`crate::constants::CLI_LOCK_DEFAULT_WAIT_SECS`] is exhausted.
112| | #[error(
113| | "all {max} concurrency slots occupied after waiting {waited_secs}s (exit 75); \
114| | use --max-concurrency or wait for other invocations to finish"
115| | )]
116| | AllSlotsFull { max: usize, waited_secs: u64 },
117| |
118| | /// A heavy long-running job is already running for this job_type/namespace
119| | /// pair. Maps to exit code `75` (the same `EX_TEMPFAIL` code used by the
120| | /// CLI semaphore).
121| | ///
122| | /// G28-B (v1.0.68): ensures at most one `enrich`, `ingest --mode
123| | /// claude-code`, or `ingest --mode codex` runs at a time per namespace.
124| | /// Use `--wait-job-singleton <SECONDS>` (per-command) to poll until the
125| | /// other invocation finishes.
126| | #[error(
127| | "job {job_type} for namespace '{namespace}' is already running (exit 75); \
128| | wait for it to finish or pass --wait-job-singleton <SECONDS>"
129| | )]
130| | JobSingletonLocked { job_type: String, namespace: String },
131| |
132| | /// Available memory is below the minimum required to load the model. Maps to exit code `77`.
133| | ///
134| | /// Returned when `sysinfo` reports available memory below
135| | /// [`crate::constants::MIN_AVAILABLE_MEMORY_MB`] MiB before starting the ONNX model load.
136| | #[error(
137| | "available memory ({available_mb}MB) below required minimum ({required_mb}MB) \
138| | to load the model; abort other loads or use --skip-memory-guard (exit 77)"
139| | )]
140| | LowMemory { available_mb: u64, required_mb: u64 },
141| |}
142| |
143| |impl AppError {
144| | /// Returns the deterministic process exit code for this error variant.
145| | ///
146| | /// The codes follow the contract documented in the README: `1` for
147| | /// validation, `9` for duplicates (moved from `2` in v1.0.52), `3` for conflicts, `4` for missing
148| | /// records, `5` for namespace errors, `6` for limit violations, `10`–`14`
149| | /// for infrastructure failures, `13` for BatchPartialFailure (PRD 1822),
150| | /// `15` for DbBusy (migrated from `13` in v2.0.0), `20` for internal errors,
151| | /// `75` (EX_TEMPFAIL) when the advisory CLI lock is held or all concurrency
152| | /// slots are exhausted, and `77` when available memory is insufficient to
153| | /// load the embedding model.
154| | ///
155| | /// # Examples
156| | ///
157| | /// ```
158| | /// use sqlite_graphrag::errors::AppError;
159| | ///
160| | /// assert_eq!(AppError::Validation("invalid field".into()).exit_code(), 1);
161| | /// assert_eq!(AppError::Duplicate("ns/mem".into()).exit_code(), 9);
162| | /// assert_eq!(AppError::Conflict("ts changed".into()).exit_code(), 3);
163| | /// assert_eq!(AppError::NotFound("id 42".into()).exit_code(), 4);
164| | /// assert_eq!(AppError::NamespaceError("no marker".into()).exit_code(), 5);
165| | /// assert_eq!(AppError::LimitExceeded("body too large".into()).exit_code(), 6);
166| | /// assert_eq!(AppError::Embedding("wrong dim".into()).exit_code(), 11);
167| | /// assert_eq!(AppError::DbBusy("retries exhausted".into()).exit_code(), 15);
168| | /// assert_eq!(AppError::LockBusy("another instance".into()).exit_code(), 75);
169| | /// ```
170| | #[inline]
171| | #[must_use]
172| 26| pub fn exit_code(&self) -> i32 {
173| 26| match self {
174| 4| Self::Validation(_) => 1,
175| 1| Self::BinaryNotFound { .. } => 1,
176| 1| Self::RateLimited { .. } => 1,
177| 1| Self::Timeout { .. } => 1,
178| 1| Self::Duplicate(_) => crate::constants::DUPLICATE_EXIT_CODE,
179| 3| Self::Conflict(_) => 3,
180| 1| Self::NotFound(_) => 4,
181| 1| Self::NamespaceError(_) => 5,
182| 1| Self::LimitExceeded(_) => 6,
183| 0| Self::Database(_) => 10,
184| 1| Self::Embedding(_) => 11,
185| 1| Self::VecExtension(_) => 12,
186| 1| Self::BatchPartialFailure { .. } => crate::constants::BATCH_PARTIAL_FAILURE_EXIT_CODE,
187| 1| Self::DbBusy(_) => crate::constants::DB_BUSY_EXIT_CODE,
188| 2| Self::Io(_) => 14,
189| 2| Self::Internal(_) => 20,
190| 2| Self::Json(_) => 20,
191| 2| Self::LockBusy(_) => crate::constants::CLI_LOCK_EXIT_CODE,
192| 0| Self::AllSlotsFull { .. } => crate::constants::CLI_LOCK_EXIT_CODE,
193| 0| Self::JobSingletonLocked { .. } => crate::constants::CLI_LOCK_EXIT_CODE,
194| 0| Self::LowMemory { .. } => crate::constants::LOW_MEMORY_EXIT_CODE,
195| | }
196| 26| }
197| |
198| | /// Returns `true` when the error is transient and the operation may
199| | /// succeed on retry with backoff.
200| | ///
201| | /// # Examples
202| | ///
203| | /// ```
204| | /// use sqlite_graphrag::errors::AppError;
205| | ///
206| | /// assert!(AppError::DbBusy("busy".into()).is_retryable());
207| | /// assert!(AppError::LockBusy("held".into()).is_retryable());
208| | /// assert!(!AppError::NotFound("x".into()).is_retryable());
209| | /// assert!(!AppError::Validation("bad".into()).is_retryable());
210| | /// ```
211| | #[inline]
212| | #[must_use]
213| 11| pub fn is_retryable(&self) -> bool {
214| 5| matches!(
215| 11| self,
216| | Self::DbBusy(_)
217| | | Self::LockBusy(_)
218| | | Self::AllSlotsFull { .. }
219| | | Self::JobSingletonLocked { .. }
220| | | Self::LowMemory { .. }
221| | | Self::RateLimited { .. }
222| | | Self::Timeout { .. }
223| | )
224| 11| }
225| |
226| | /// Returns `true` when the error is permanent and must NOT be retried.
227| | ///
228| | /// Complement to [`Self::is_retryable`]. Errors not classified by either
229| | /// method (e.g. `Database`, `Io`, `Internal`) are ambiguous — the caller
230| | /// decides based on context.
231| | ///
232| | /// # Examples
233| | ///
234| | /// ```
235| | /// use sqlite_graphrag::errors::AppError;
236| | ///
237| | /// assert!(AppError::Validation("bad".into()).is_permanent());
238| | /// assert!(!AppError::DbBusy("busy".into()).is_permanent());
239| | /// ```
240| | #[inline]
241| | #[must_use]
242| 0| pub fn is_permanent(&self) -> bool {
243| 0| matches!(
244| 0| self,
245| | Self::Validation(_)
246| | | Self::BinaryNotFound { .. }
247| | | Self::Duplicate(_)
248| | | Self::NotFound(_)
249| | | Self::NamespaceError(_)
250| | | Self::LimitExceeded(_)
251| | | Self::VecExtension(_)
252| | )
253| 0| }
254| |
255| | /// Returns the localized error message in the active language (`--lang` / `SQLITE_GRAPHRAG_LANG`).
256| | ///
257| | /// In English the text is identical to the `Display` generated by thiserror.
258| | /// In Portuguese the prefixes and messages are translated to PT-BR.
259| 0| pub fn localized_message(&self) -> String {
260| 0| self.localized_message_for(current())
261| 0| }
262| |
263| | /// Returns the localized message for the explicitly provided language.
264| | /// Useful in tests that cannot depend on the global `OnceLock`.
265| | ///
266| | /// # Examples
267| | ///
268| | /// ```
269| | /// use sqlite_graphrag::errors::AppError;
270| | /// use sqlite_graphrag::i18n::Language;
271| | ///
272| | /// let err = AppError::NotFound("mem-xyz".into());
273| | ///
274| | /// let en = err.localized_message_for(Language::English);
275| | /// assert!(en.contains("not found"));
276| | ///
277| | /// let pt = err.localized_message_for(Language::Portuguese);
278| | /// assert!(pt.contains("n\u{e3}o encontrado"));
279| | /// ```
280| 21| pub fn localized_message_for(&self, lang: Language) -> String {
281| 21| match lang {
282| 2| Language::English => self.to_string(),
283| 19| Language::Portuguese => self.to_string_pt(),
284| | }
285| 21| }
286| |
287| 19| fn to_string_pt(&self) -> String {
288| | use crate::i18n::validation::app_error_pt as pt;
289| 19| match self {
290| 1| Self::Validation(msg) => pt::validation(msg),
291| 1| Self::BinaryNotFound { name } => pt::binary_not_found(name),
292| 1| Self::RateLimited { detail } => pt::rate_limited(detail),
293| | Self::Timeout {
294| 1| operation,
295| 1| duration_secs,
296| 1| } => pt::timeout(operation, *duration_secs),
297| 2| Self::Duplicate(msg) => pt::duplicate(msg),
298| 1| Self::Conflict(msg) => pt::conflict(msg),
299| 3| Self::NotFound(msg) => pt::not_found(msg),
300| 1| Self::NamespaceError(msg) => pt::namespace_error(msg),
301| 1| Self::LimitExceeded(msg) => pt::limit_exceeded(msg),
302| 0| Self::Database(e) => pt::database(&e.to_string()),
303| 1| Self::Embedding(msg) => pt::embedding(msg),
304| 1| Self::VecExtension(msg) => pt::vec_extension(msg),
305| 1| Self::DbBusy(msg) => pt::db_busy(msg),
306| 1| Self::BatchPartialFailure { total, failed } => {
307| 1| pt::batch_partial_failure(*total, *failed)
308| | }
309| 0| Self::Io(e) => pt::io(&e.to_string()),
310| 0| Self::Internal(e) => pt::internal(&e.to_string()),
311| 0| Self::Json(e) => pt::json(&e.to_string()),
312| 1| Self::LockBusy(msg) => pt::lock_busy(msg),
313| 1| Self::AllSlotsFull { max, waited_secs } => pt::all_slots_full(*max, *waited_secs),
314| | Self::JobSingletonLocked {
315| 0| job_type,
316| 0| namespace,
317| 0| } => pt::job_singleton_locked(job_type, namespace),
318| | Self::LowMemory {
319| 1| available_mb,
320| 1| required_mb,
321| 1| } => pt::low_memory(*available_mb, *required_mb),
322| | }
323| 19| }
324| |}
325| |
#[cfg(test)]
mod tests {
    //! Unit tests for `AppError`: exit-code contract, `Display` text,
    //! `From` conversions, localisation delegation and retry classification.

    use super::*;
    use std::io;

    /// Builds a `JobSingletonLocked` error with fixed, recognisable payloads.
    fn job_singleton_locked() -> AppError {
        AppError::JobSingletonLocked {
            job_type: "reindex".into(),
            namespace: "proj".into(),
        }
    }

    // ---------------------------------------------------------------
    // exit_code
    // ---------------------------------------------------------------

    #[test]
    fn exit_code_validation_returns_1() {
        assert_eq!(AppError::Validation("invalid field".into()).exit_code(), 1);
    }

    #[test]
    fn exit_code_duplicate_returns_9() {
        assert_eq!(AppError::Duplicate("namespace/name".into()).exit_code(), 9);
    }

    #[test]
    fn exit_code_conflict_returns_3() {
        assert_eq!(
            AppError::Conflict("updated_at changed".into()).exit_code(),
            3
        );
    }

    #[test]
    fn exit_code_not_found_returns_4() {
        assert_eq!(AppError::NotFound("memory missing".into()).exit_code(), 4);
    }

    #[test]
    fn exit_code_namespace_error_returns_5() {
        assert_eq!(
            AppError::NamespaceError("not resolved".into()).exit_code(),
            5
        );
    }

    #[test]
    fn exit_code_limit_exceeded_returns_6() {
        assert_eq!(
            AppError::LimitExceeded("body too large".into()).exit_code(),
            6
        );
    }

    #[test]
    fn exit_code_embedding_returns_11() {
        assert_eq!(AppError::Embedding("model failure".into()).exit_code(), 11);
    }

    #[test]
    fn exit_code_vec_extension_returns_12() {
        assert_eq!(
            AppError::VecExtension("extension did not load".into()).exit_code(),
            12
        );
    }

    #[test]
    fn exit_code_db_busy_returns_15() {
        assert_eq!(AppError::DbBusy("retries exhausted".into()).exit_code(), 15);
    }

    #[test]
    fn exit_code_batch_partial_failure_returns_13() {
        assert_eq!(
            AppError::BatchPartialFailure {
                total: 10,
                failed: 3
            }
            .exit_code(),
            13
        );
    }

    #[test]
    fn display_batch_partial_failure_includes_counts() {
        let err = AppError::BatchPartialFailure {
            total: 50,
            failed: 7,
        };
        let msg = err.to_string();
        assert!(msg.contains("7"));
        assert!(msg.contains("50"));
        // to_string() uses the English #[error] attr; PT is in localized_message_for
        assert!(msg.contains("batch partial failure"));
    }

    #[test]
    fn exit_code_io_returns_14() {
        let io_err = io::Error::new(io::ErrorKind::NotFound, "file missing");
        assert_eq!(AppError::Io(io_err).exit_code(), 14);
    }

    #[test]
    fn exit_code_internal_returns_20() {
        let anyhow_err = anyhow::anyhow!("unexpected internal error");
        assert_eq!(AppError::Internal(anyhow_err).exit_code(), 20);
    }

    #[test]
    fn exit_code_json_returns_20() {
        let json_err = serde_json::from_str::<serde_json::Value>("invalid json {{").unwrap_err();
        assert_eq!(AppError::Json(json_err).exit_code(), 20);
    }

    #[test]
    fn exit_code_lock_busy_returns_75() {
        assert_eq!(
            AppError::LockBusy("another active instance".into()).exit_code(),
            75
        );
    }

    #[test]
    fn exit_code_lock_busy_matches_constant() {
        assert_eq!(
            AppError::LockBusy("test".into()).exit_code(),
            crate::constants::CLI_LOCK_EXIT_CODE
        );
    }

    // The three arms below were never exercised by the suite before.

    #[test]
    fn exit_code_all_slots_full_matches_lock_constant() {
        let err = AppError::AllSlotsFull {
            max: 4,
            waited_secs: 60,
        };
        assert_eq!(err.exit_code(), crate::constants::CLI_LOCK_EXIT_CODE);
    }

    #[test]
    fn exit_code_job_singleton_locked_matches_lock_constant() {
        assert_eq!(
            job_singleton_locked().exit_code(),
            crate::constants::CLI_LOCK_EXIT_CODE
        );
    }

    #[test]
    fn exit_code_low_memory_matches_constant() {
        let err = AppError::LowMemory {
            available_mb: 100,
            required_mb: 500,
        };
        assert_eq!(err.exit_code(), crate::constants::LOW_MEMORY_EXIT_CODE);
    }

    #[test]
    fn exit_code_new_variants() {
        assert_eq!(AppError::BinaryNotFound { name: "x".into() }.exit_code(), 1);
        assert_eq!(AppError::RateLimited { detail: "x".into() }.exit_code(), 1);
        assert_eq!(
            AppError::Timeout {
                operation: "x".into(),
                duration_secs: 5
            }
            .exit_code(),
            1
        );
    }

    // ---------------------------------------------------------------
    // Display
    // ---------------------------------------------------------------

    #[test]
    fn display_validation_includes_message() {
        let err = AppError::Validation("invalid id".into());
        assert!(err.to_string().contains("invalid id"));
        assert!(err.to_string().contains("validation error"));
    }

    #[test]
    fn display_duplicate_includes_message() {
        let err = AppError::Duplicate("proj/mem".into());
        assert!(err.to_string().contains("proj/mem"));
        assert!(err.to_string().contains("duplicate detected"));
    }

    #[test]
    fn display_not_found_includes_message() {
        let err = AppError::NotFound("id 42".into());
        assert!(err.to_string().contains("id 42"));
        assert!(err.to_string().contains("not found"));
    }

    #[test]
    fn display_embedding_includes_message() {
        let err = AppError::Embedding("wrong dimension".into());
        assert!(err.to_string().contains("wrong dimension"));
        assert!(err.to_string().contains("embedding error"));
    }

    #[test]
    fn display_lock_busy_includes_message() {
        let err = AppError::LockBusy("pid 1234".into());
        assert!(err.to_string().contains("pid 1234"));
        assert!(err.to_string().contains("lock busy"));
    }

    // ---------------------------------------------------------------
    // From conversions
    // ---------------------------------------------------------------

    #[test]
    fn from_io_error_converts_correctly() {
        let io_err = io::Error::new(io::ErrorKind::PermissionDenied, "permission denied");
        let app_err: AppError = io_err.into();
        assert_eq!(app_err.exit_code(), 14);
        assert!(app_err.to_string().contains("IO error"));
    }

    #[test]
    fn from_anyhow_error_converts_correctly() {
        let anyhow_err = anyhow::anyhow!("internal detail");
        let app_err: AppError = anyhow_err.into();
        assert_eq!(app_err.exit_code(), 20);
        assert!(app_err.to_string().contains("internal detail"));
    }

    #[test]
    fn from_serde_json_error_converts_correctly() {
        let json_err = serde_json::from_str::<serde_json::Value>("{bad_field}").unwrap_err();
        let app_err: AppError = json_err.into();
        assert_eq!(app_err.exit_code(), 20);
        assert!(app_err.to_string().contains("json error"));
    }

    // ---------------------------------------------------------------
    // Localisation
    // ---------------------------------------------------------------

    #[test]
    fn localized_message_en_equals_to_string() {
        let err = AppError::NotFound("mem-x".into());
        assert_eq!(
            err.localized_message_for(crate::i18n::Language::English),
            err.to_string()
        );
    }

    // Detailed Portuguese-specific assertions live in `src/i18n.rs`
    // (the bilingual module). Here we only verify that delegation is wired
    // correctly, without embedding PT strings in this English-only file.

    #[test]
    fn localized_message_pt_differs_from_en() {
        let err = AppError::NotFound("mem-x".into());
        let en = err.localized_message_for(crate::i18n::Language::English);
        let pt = err.localized_message_for(crate::i18n::Language::Portuguese);
        assert_ne!(en, pt, "PT and EN must produce distinct messages");
        assert!(pt.contains("mem-x"), "PT must include the variant payload");
    }

    #[test]
    fn localized_message_pt_delegates_to_app_error_pt_helper() {
        use crate::i18n::validation::app_error_pt as pt;

        let cases: Vec<(AppError, String)> = vec![
            (AppError::Validation("x".into()), pt::validation("x")),
            (AppError::Duplicate("x".into()), pt::duplicate("x")),
            (AppError::Conflict("x".into()), pt::conflict("x")),
            (AppError::NotFound("x".into()), pt::not_found("x")),
            (
                AppError::NamespaceError("x".into()),
                pt::namespace_error("x"),
            ),
            (AppError::LimitExceeded("x".into()), pt::limit_exceeded("x")),
            (AppError::Embedding("x".into()), pt::embedding("x")),
            (AppError::VecExtension("x".into()), pt::vec_extension("x")),
            (AppError::DbBusy("x".into()), pt::db_busy("x")),
            (
                AppError::BatchPartialFailure {
                    total: 10,
                    failed: 3,
                },
                pt::batch_partial_failure(10, 3),
            ),
            (AppError::LockBusy("x".into()), pt::lock_busy("x")),
            (
                AppError::AllSlotsFull {
                    max: 4,
                    waited_secs: 60,
                },
                pt::all_slots_full(4, 60),
            ),
            // Previously missing: the JobSingletonLocked branch of
            // `to_string_pt` was never executed by any test.
            (
                job_singleton_locked(),
                pt::job_singleton_locked("reindex", "proj"),
            ),
            (
                AppError::LowMemory {
                    available_mb: 100,
                    required_mb: 500,
                },
                pt::low_memory(100, 500),
            ),
            (
                AppError::BinaryNotFound {
                    name: "claude".into(),
                },
                pt::binary_not_found("claude"),
            ),
            (
                AppError::RateLimited {
                    detail: "429".into(),
                },
                pt::rate_limited("429"),
            ),
            (
                AppError::Timeout {
                    operation: "op".into(),
                    duration_secs: 30,
                },
                pt::timeout("op", 30),
            ),
        ];

        for (err, expected) in cases {
            let actual = err.localized_message_for(crate::i18n::Language::Portuguese);
            assert_eq!(actual, expected, "delegation mismatch");
        }
    }

    // ---------------------------------------------------------------
    // Retry classification
    // ---------------------------------------------------------------

    #[test]
    fn is_retryable_transient_errors() {
        assert!(AppError::DbBusy("x".into()).is_retryable());
        assert!(AppError::LockBusy("x".into()).is_retryable());
        assert!(AppError::AllSlotsFull {
            max: 4,
            waited_secs: 60
        }
        .is_retryable());
        assert!(job_singleton_locked().is_retryable());
        assert!(AppError::LowMemory {
            available_mb: 100,
            required_mb: 500
        }
        .is_retryable());
        assert!(AppError::RateLimited {
            detail: "429".into()
        }
        .is_retryable());
        assert!(AppError::Timeout {
            operation: "op".into(),
            duration_secs: 30
        }
        .is_retryable());
    }

    #[test]
    fn is_retryable_permanent_errors() {
        assert!(!AppError::Validation("x".into()).is_retryable());
        assert!(!AppError::NotFound("x".into()).is_retryable());
        assert!(!AppError::Duplicate("x".into()).is_retryable());
        assert!(!AppError::Conflict("x".into()).is_retryable());
        assert!(!AppError::BinaryNotFound { name: "x".into() }.is_retryable());
    }

    // `is_permanent` had no coverage at all; pin every variant it accepts.
    #[test]
    fn is_permanent_permanent_errors() {
        assert!(AppError::Validation("x".into()).is_permanent());
        assert!(AppError::BinaryNotFound { name: "x".into() }.is_permanent());
        assert!(AppError::Duplicate("x".into()).is_permanent());
        assert!(AppError::NotFound("x".into()).is_permanent());
        assert!(AppError::NamespaceError("x".into()).is_permanent());
        assert!(AppError::LimitExceeded("x".into()).is_permanent());
        assert!(AppError::VecExtension("x".into()).is_permanent());
    }

    #[test]
    fn is_permanent_rejects_transient_and_ambiguous_errors() {
        // Transient errors are never permanent.
        assert!(!AppError::DbBusy("x".into()).is_permanent());
        assert!(!AppError::LockBusy("x".into()).is_permanent());
        assert!(!AppError::RateLimited { detail: "x".into() }.is_permanent());
        assert!(!job_singleton_locked().is_permanent());

        // Ambiguous errors are classified by neither method.
        let io_err = AppError::Io(io::Error::new(io::ErrorKind::NotFound, "x"));
        assert!(!io_err.is_permanent());
        assert!(!io_err.is_retryable());
        let internal = AppError::Internal(anyhow::anyhow!("x"));
        assert!(!internal.is_permanent());
        assert!(!internal.is_retryable());
        assert!(!AppError::Conflict("x".into()).is_permanent());
    }

    // ---------------------------------------------------------------
    // Layout
    // ---------------------------------------------------------------

    #[test]
    fn app_error_size_does_not_exceed_budget() {
        let size = std::mem::size_of::<AppError>();
        assert!(
            size <= 128,
            "AppError is {size} bytes — exceeds 128-byte budget; \
             consider boxing large variants to reduce memcpy cost in Result propagation"
        );
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/extraction.rs:
1| |//! Entity and URL extraction pipeline (NER + regex prefilter).
2| |//!
3| |//! Runs named-entity recognition and regex heuristics to extract structured
4| |//! entities and hyperlinks from raw memory bodies before embedding.
5| |
6| |use std::path::{Path, PathBuf};
7| |use std::sync::OnceLock;
8| |
9| |use anyhow::{Context, Result};
10| |use ort::session::{builder::GraphOptimizationLevel, Session};
11| |use regex::Regex;
12| |use serde::{Deserialize, Serialize};
13| |use unicode_normalization::UnicodeNormalization;
14| |
15| |use crate::entity_type::EntityType;
16| |use crate::paths::AppPaths;
17| |use crate::storage::entities::{NewEntity, NewRelationship};
18| |
// NOTE(review): presumably the maximum number of entities kept per extracted
// body — the use site is outside this view; confirm before relying on it.
const MAX_ENTS: usize = 30;
// v1.0.31 A9: only consumed by the legacy `build_relationships`, which is
// kept for unit tests pinning the cap behaviour.
#[cfg(test)]
const TOP_K_RELATIONS: usize = 5;
// NOTE(review): looks like the relation label used when no more specific one
// is available — confirm against the relationship builder.
const DEFAULT_RELATION: &str = "mentions";
// NOTE(review): presumably the minimum length for an entity name to be kept —
// confirm whether it is measured in chars or bytes at the use site.
const MIN_ENTITY_CHARS: usize = 2;
26| |
// Process-wide cells holding the compiled regexes. Each cell starts empty and
// is filled on first use by the accessor function of the matching name
// (`regex_email`, `regex_url`, ...), so every pattern is compiled at most once.
static REGEX_EMAIL: OnceLock<Regex> = OnceLock::new();
static REGEX_URL: OnceLock<Regex> = OnceLock::new();
static REGEX_UUID: OnceLock<Regex> = OnceLock::new();
static REGEX_ALL_CAPS: OnceLock<Regex> = OnceLock::new();
// v1.0.25 P0-4: filters section-structure markers like "Etapa 3", "Fase 1", "Passo 2".
static REGEX_SECTION_MARKER: OnceLock<Regex> = OnceLock::new();
// v1.0.25 P0-2: captures CamelCase brand names that NER model often misses (e.g. "OpenAI", "PostgreSQL").
static REGEX_BRAND_CAMEL: OnceLock<Regex> = OnceLock::new();
35| |
36| |// v1.0.20: stopwords to filter common PT-BR/EN rule words captured as ALL_CAPS.
37| |// Without this filter, technical PT-BR corpora containing CAPS-formatted rules (NUNCA, PROIBIDO, DEVE)
38| |// generated ~70% of "garbage entities". We keep identifiers like MAX_RETRY (with underscore).
39| |// v1.0.22: expanded list with terms observed in 495-file flowaiper stress test.
40| |// Includes verbs (ADICIONAR, VALIDAR), adjectives (ALTA, BAIXA), common nouns (BANCO, CASO),
41| |// HTTP methods (GET, POST, DELETE) and generic data formats (JSON, XML).
42| |// v1.0.24: added 17 new terms observed in audit v1.0.23: generic status words (COMPLETED, DONE,
43| |// FIXED, PENDING), PT-BR imperative verbs (ACEITE, CONFIRME, NEGUE, RECUSE), PT-BR modal/
44| |// common verbs (DEVEMOS, PODEMOS, VAMOS), generic nouns (BORDA, CHECKLIST, PLAN, TOKEN),
45| |// and common abbreviations (ACK, ACL).
46| |// v1.0.25 P0-4: added technology/protocol acronyms (API, CLI, HTTP, HTTPS, JWT, LLM, REST, UI, URL)
47| |// and PT-BR section-label stems (CAPÍTULO, ETAPA, FASE, PASSO, SEÇÃO) to prevent section markers
48| |// and generic tech terms from being extracted as entities.
49| |// v1.0.31 A11: added PT-BR uppercase noise observed during ingest of technical Portuguese
50| |// rule documents — common nouns/adjectives written in caps as visual emphasis (ADAPTER, PROJETO,
51| |// PASSIVA, ATIVA, SOMENTE, LEITURA, ESCRITA, OBRIGATORIA, EXEMPLO, REGRA, DEFAULT). Each one
52| |// kept leaking as a "concept" entity and inflating the graph with non-entities.
/// Upper-case tokens that must never become entities (exact, case-sensitive match).
///
/// An entry may contain an underscore (e.g. `CLAUDE_STREAM_IDLE_TIMEOUT_MS`); such
/// entries are honoured by [`is_filtered_all_caps`] even though unlisted
/// underscore identifiers are kept.
const ALL_CAPS_STOPWORDS: &[&str] = &[
    "ACEITE",
    "ACID",
    "ACK",
    "ACL",
    "ACRESCENTADO",
    "ADAPTER",
    "ADICIONADA",
    "ADICIONADAS",
    "ADICIONADO",
    "ADICIONADOS",
    "ADICIONAR",
    "AGENTS",
    "AINDA",
    "ALL",
    "ALTA",
    "ALWAYS",
    "APENAS",
    "API",
    "ARTEFATOS",
    "ATIVA",
    "ATIVO",
    "BAIXA",
    "BANCO",
    "BLOQUEAR",
    "BORDA",
    "BUG",
    "CAPÍTULO",
    "CASO",
    "CEO",
    "CHECKLIST",
    "CLARO",
    "CLAUDE_STREAM_IDLE_TIMEOUT_MS",
    "CLI",
    "COMPLETED",
    "CONFIRMADO",
    "CONFIRMARAM",
    "CONFIRME",
    "CONFIRMEI",
    "CONFIRMOU",
    "CONTRATO",
    "CRIE",
    "CRÍTICO",
    "CRITICAL",
    "CSV",
    "DDL",
    "DEFAULT",
    "DEFINIR",
    "DEPARTMENT",
    "DESC",
    "DEVE",
    "DEVEMOS",
    "DISCO",
    "DONE",
    "DSL",
    "DTO",
    "EFEITO",
    "ENTRADA",
    "EOF",
    "EPERM",
    "ERROR",
    "ESCREVA",
    "ESCRITA",
    "ESRCH",
    "ESSA",
    "ESSE",
    "ESSENCIAL",
    "ESTA",
    "ESTADO",
    "ESTE",
    "ETAPA",
    "EVITAR",
    "EXEMPLO",
    "EXPANDIR",
    "EXPOR",
    "FALHA",
    "FASE",
    "FATO",
    "FIFO",
    "FIXED",
    "FIXME",
    "FLUXO",
    "FONTES",
    "FORBIDDEN",
    "FUNCIONA",
    "GNU",
    "HACK",
    "HEARTBEAT",
    "HTTP",
    "HTTPS",
    "INATIVO",
    "JAMAIS",
    "JSON",
    "JWT",
    "LEITURA",
    "LLM",
    "MCP",
    "MESMO",
    "METADADOS",
    "MUST",
    "NDJSON",
    "NEGUE",
    "NEVER",
    "NOTE",
    "NUNCA",
    "OBRIGATORIA",
    "OBRIGATÓRIO",
    "OBSERVEI",
    "PADRÃO",
    "PASSIVA",
    "PASSO",
    "PENDING",
    "PGID",
    "PID",
    "PLAN",
    "PODEMOS",
    "PONTEIROS",
    "PREFERIR",
    "PROIBIDO",
    "PROJETO",
    "RECUSE",
    "REGRA",
    "REGRAS",
    "REMOVIDAS",
    "REQUIRED",
    "REQUISITO",
    "REST",
    "SEÇÃO",
    "SEMPRE",
    "SHALL",
    "SHOULD",
    "SIGTERM",
    "SOMENTE",
    "SOUL",
    "TODAS",
    "TODO",
    "TODOS",
    "TOKEN",
    "TOOLS",
    "TSV",
    "TUI",
    "UI",
    "URL",
    "USAR",
    "VALIDAR",
    "VAMOS",
    "VOCÊ",
    "WARNING",
    "XML",
    "YAML",
];

// v1.0.22: HTTP methods are protocol verbs, not semantically useful entities.
// Filtered in apply_regex_prefilter (regex_all_caps path).
const HTTP_METHODS: &[&str] = &[
    "GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS", "CONNECT", "TRACE",
];

/// Returns `true` when an all-caps `token` must be dropped instead of becoming an entity.
///
/// A token is dropped only when it appears verbatim in [`ALL_CAPS_STOPWORDS`] or
/// [`HTTP_METHODS`]. Identifiers containing an underscore (e.g. `MAX_RETRY`,
/// `FLOWAIPER_API_KEY`) are therefore still preserved, because they are not listed.
///
/// The explicit lists are authoritative. The earlier version returned `false` for
/// every token containing `_` before looking at the lists, which made the
/// stopword `CLAUDE_STREAM_IDLE_TIMEOUT_MS` impossible to match.
fn is_filtered_all_caps(token: &str) -> bool {
    ALL_CAPS_STOPWORDS.contains(&token) || HTTP_METHODS.contains(&token)
}
219| |
220| 21|fn regex_email() -> &'static Regex {
221| | // SAFETY: regex literal validated at compile-time via test::regex_literals_compile
222| 21| REGEX_EMAIL.get_or_init(|| {
^1
223| 1| Regex::new(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}")
224| 1| .expect("compile-time validated email regex literal")
225| 1| })
226| 21|}
227| |
228| 8|fn regex_url() -> &'static Regex {
229| | // SAFETY: regex literal validated at compile-time via test::regex_literals_compile
230| 8| REGEX_URL.get_or_init(|| {
^1
231| 1| Regex::new(r#"https?://[^\s\)\]\}"'<>]+"#)
232| 1| .expect("compile-time validated URL regex literal")
233| 1| })
234| 8|}
235| |
236| 21|fn regex_uuid() -> &'static Regex {
237| | // SAFETY: regex literal validated at compile-time via test::regex_literals_compile
238| 21| REGEX_UUID.get_or_init(|| {
^1
239| 1| Regex::new(r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}")
240| 1| .expect("compile-time validated UUID regex literal")
241| 1| })
242| 21|}
243| |
244| 21|fn regex_all_caps() -> &'static Regex {
245| 21| REGEX_ALL_CAPS.get_or_init(|| {
^1
246| 1| Regex::new(r"\b[A-Z][A-Z0-9_]{2,}\b")
247| 1| .expect("compile-time validated all-caps regex literal")
248| 1| })
249| 21|}
250| |
251| 25|fn regex_section_marker() -> &'static Regex {
252| 25| REGEX_SECTION_MARKER.get_or_init(|| {
^1
253| | // Matches PT-BR document-structure labels followed by a number: "Etapa 3", "Fase 1",
254| | // "Camada 5", "Passo 2", etc. v1.0.36 (H5): added "Camada" after audit found
255| | // "Camada 1".."Camada 5" leaking through into entity extraction with degree>=3.
256| | // Accented characters expressed as escapes to keep this source file ASCII-only
257| | // per the project language policy. Pattern is equivalent to:
258| | // \b(?:Etapa|Fase|Passo|Camada|Se\xe7\xe3o|Cap\xedtulo)\s+\d+\b
259| 1| Regex::new("\\b(?:Etapa|Fase|Passo|Camada|Se\u{00e7}\u{00e3}o|Cap\u{00ed}tulo)\\s+\\d+\\b")
260| 1| .expect("compile-time validated section marker regex literal")
261| 1| })
262| 25|}
263| |
264| 21|fn regex_brand_camel() -> &'static Regex {
265| 21| REGEX_BRAND_CAMEL.get_or_init(|| {
^1
266| | // Matches CamelCase brand names: one or more lowercase letters after an uppercase, then
267| | // another uppercase followed by more letters. Covers "OpenAI", "PostgreSQL", "ChatGPT".
268| 1| Regex::new(r"\b[A-Z][a-z]+[A-Z][A-Za-z]+\b")
269| 1| .expect("compile-time validated CamelCase brand regex literal")
270| 1| })
271| 21|}
272| |
/// A single entity candidate: its surface name plus the assigned [`EntityType`].
///
/// This is the element type returned by the GLiNER prediction step in this file.
#[derive(Debug, Clone, PartialEq)]
pub struct ExtractedEntity {
    /// Entity text.
    // NOTE(review): presumably the text as it appeared in the body; whether it
    // is normalised before being stored here is not visible from this view.
    pub name: String,
    /// Category assigned to the entity.
    pub entity_type: EntityType,
}
278| |
/// URL with source offset extracted from the memory body.
#[derive(Debug, Clone)]
pub struct ExtractedUrl {
    /// The URL text that was found.
    pub url: String,
    /// Byte position in the body where the URL was found.
    pub offset: usize,
}
286| |
/// Everything an [`Extractor`] produces for one body: graph entities,
/// relationships between them, and the URLs found in the text.
#[derive(Debug, Clone)]
pub struct ExtractionResult {
    /// Entities to be stored in the graph.
    pub entities: Vec<NewEntity>,
    /// Relationships to be stored in the graph.
    pub relationships: Vec<NewRelationship>,
    /// True when build_relationships hit the cap before covering all entity pairs.
    /// Exposed in RememberResponse so callers can detect when relationships were cut.
    pub relationships_truncated: bool,
    /// Extraction method used: `"gliner-<variant>+regex"` or `"regex-only"`.
    /// Useful for auditing, metrics and user reports.
    pub extraction_method: String,
    /// URLs extracted from the body — stored separately from graph entities.
    pub urls: Vec<ExtractedUrl>,
}
300| |
/// Common interface of extraction backends.
///
/// The `Send + Sync` bound lets an implementation be shared across threads.
pub trait Extractor: Send + Sync {
    /// Extracts entities, relationships and URLs from `body`.
    ///
    /// # Errors
    ///
    /// Returns an error when the backend fails; the exact conditions depend on
    /// the implementation.
    fn extract(&self, body: &str) -> Result<ExtractionResult>;
}
304| |
/// GLiNER ONNX model quantization variant.
///
/// Each variant maps to one ONNX file name (`as_filename`) and one lower-case
/// textual tag (`Display` / `FromStr`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum GlinerVariant {
    /// Tag `fp32`, file `model.onnx`.
    Fp32,
    /// Tag `fp16`, file `model_fp16.onnx`.
    Fp16,
    /// Tag `int8`, file `model_quantized.onnx`.
    Int8,
    /// Tag `q4`, file `model_q4.onnx`.
    Q4,
    /// Tag `q4f16`, file `model_q4f16.onnx`.
    Q4f16,
}
314| |
315| |impl GlinerVariant {
316| | /// ONNX filename for this variant in the HuggingFace repository.
317| 7| pub fn as_filename(self) -> &'static str {
318| 7| match self {
319| 1| Self::Fp32 => "model.onnx",
320| 1| Self::Fp16 => "model_fp16.onnx",
321| 3| Self::Int8 => "model_quantized.onnx",
322| 1| Self::Q4 => "model_q4.onnx",
323| 1| Self::Q4f16 => "model_q4f16.onnx",
324| | }
325| 7| }
326| |
327| | /// Approximate model size for user-facing messages.
328| 2| pub fn display_size(self) -> &'static str {
329| 2| match self {
330| 1| Self::Fp32 => "1.1 GB",
331| 0| Self::Fp16 => "580 MB",
332| 1| Self::Int8 => "349 MB",
333| 0| Self::Q4 => "894 MB",
334| 0| Self::Q4f16 => "472 MB",
335| | }
336| 2| }
337| |}
338| |
339| |impl std::fmt::Display for GlinerVariant {
340| 14| fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
341| 14| match self {
342| 3| Self::Fp32 => f.write_str("fp32"),
343| 2| Self::Fp16 => f.write_str("fp16"),
344| 5| Self::Int8 => f.write_str("int8"),
345| 2| Self::Q4 => f.write_str("q4"),
346| 2| Self::Q4f16 => f.write_str("q4f16"),
347| | }
348| 14| }
349| |}
350| |
351| |impl std::str::FromStr for GlinerVariant {
352| | type Err = anyhow::Error;
353| 15| fn from_str(s: &str) -> Result<Self> {
354| 15| match s.to_lowercase().as_str() {
355| 15| "fp32" => Ok(Self::Fp32),
^3
356| 12| "fp16" => Ok(Self::Fp16),
^2
357| 10| "int8" => Ok(Self::Int8),
^3
358| 7| "q4" => Ok(Self::Q4),
^2
359| 5| "q4f16" => Ok(Self::Q4f16),
^2
360| 3| other => {
361| 3| anyhow::bail!("unknown GLiNER variant: {other}. Valid: fp32, fp16, int8, q4, q4f16")
362| | }
363| | }
364| 15| }
365| |}
366| |
// NOTE(review): presumably the maximum entity span width (in words) scored by
// the model — the use site is outside this view; confirm.
const GLINER_MAX_WIDTH: usize = 12;
// Sequence-length budget: `GlinerModel::predict` subtracts the label-token
// count from this value to cap the number of input words.
const GLINER_MAX_SEQ_LEN: usize = 384;
// Marker placed before every label name when `predict` builds the prompt.
const GLINER_ENT_TOKEN: &str = "<<ENT>>";
// Marker placed between the label section and the text words of the prompt.
const GLINER_SEP_TOKEN: &str = "<<SEP>>";
371| |
// Pairs of (label text, `EntityType`), in the `&[(&str, EntityType)]` shape
// that `GlinerModel::predict` takes as `entity_labels`.
// NOTE(review): presumably this is the table passed to `predict` and the label
// text is what the model is prompted with — confirm at the call site.
const GLINER_ENTITY_LABELS: &[(&str, EntityType)] = &[
    ("person", EntityType::Person),
    ("organization", EntityType::Organization),
    ("location", EntityType::Location),
    ("date", EntityType::Date),
    ("project", EntityType::Project),
    ("tool", EntityType::Tool),
    ("file", EntityType::File),
    ("concept", EntityType::Concept),
    ("decision", EntityType::Decision),
    ("incident", EntityType::Incident),
    ("dashboard", EntityType::Dashboard),
    ("issue tracker", EntityType::IssueTracker),
    ("memory", EntityType::Memory),
];
387| |
/// A loaded GLiNER model: the ONNX session together with its tokenizer.
struct GlinerModel {
    /// ONNX session, wrapped in a mutex.
    // NOTE(review): presumably because running inference needs exclusive
    // access to the session — confirm against the `ort` version in use.
    session: parking_lot::Mutex<Session>,
    /// Tokenizer loaded from `tokenizer.json` next to the model file.
    tokenizer: tokenizers::Tokenizer,
    /// Variant the session was loaded from.
    // NOTE(review): stored by `load` but apparently never read, hence the
    // `dead_code` allowance — confirm whether it is kept for diagnostics.
    #[allow(dead_code)]
    variant: GlinerVariant,
}
394| |
395| |impl GlinerModel {
    /// Loads the GLiNER ONNX session and its tokenizer from `model_dir`.
    ///
    /// The model file name is selected by `variant` via
    /// `GlinerVariant::as_filename`; the tokenizer is always read from
    /// `tokenizer.json` in the same directory.
    ///
    /// # Errors
    ///
    /// Returns an error, prefixed with the failing step, when the session
    /// builder cannot be created, the optimization level cannot be set, the
    /// ONNX file cannot be loaded, or `Tokenizer::from_file` fails. The session
    /// is built before the tokenizer is loaded, so a session failure is
    /// reported first.
    fn load(model_dir: &Path, variant: GlinerVariant) -> Result<Self> {
        let model_path = model_dir.join(variant.as_filename());
        let tokenizer_path = model_dir.join("tokenizer.json");

        // Each fallible builder step is converted into an `anyhow` error that
        // names the step, so the failure is identifiable from the message.
        let session = Session::builder()
            .map_err(|e| anyhow::anyhow!("creating GLiNER session builder: {e}"))?
            .with_optimization_level(GraphOptimizationLevel::Level3)
            .map_err(|e| anyhow::anyhow!("setting optimization level: {e}"))?
            .commit_from_file(&model_path)
            .map_err(|e| anyhow::anyhow!("loading GLiNER ONNX model from {model_path:?}: {e}"))?;

        let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path)
            .map_err(|e| anyhow::anyhow!("loading GLiNER tokenizer: {e}"))?;

        Ok(Self {
            session: parking_lot::Mutex::new(session),
            tokenizer,
            variant,
        })
    }
416| |
417| 3| fn predict(
418| 3| &self,
419| 3| body: &str,
420| 3| entity_labels: &[(&str, EntityType)],
421| 3| threshold: f32,
422| 3| ) -> Result<Vec<ExtractedEntity>> {
423| 3| let label_names: Vec<&str> = entity_labels.iter().map(|(name, _)| *name).collect();
424| 3| let words: Vec<&str> = body.split_whitespace().collect();
425| 3| if words.is_empty() {
426| 0| return Ok(Vec::new());
427| 3| }
428| |
429| | // Cap words to fit within model sequence length (accounting for label tokens)
430| 3| let label_token_count = label_names.len() * 2 + 1;
431| 3| let max_words = GLINER_MAX_SEQ_LEN.saturating_sub(label_token_count + 2);
432| 3| let words = if words.len() > max_words {
433| 1| tracing::warn!(target: "extraction",
434| 0| original_words = words.len(),
435| | capped_words = max_words,
436| 0| "GLiNER input truncated to fit model sequence length"
437| | );
438| 1| &words[..max_words]
439| | } else {
440| 2| &words[..]
441| | };
442| 3| let num_words = words.len();
443| |
444| | // Build prompt: [<<ENT>>, label1, <<ENT>>, label2, ..., <<SEP>>, word1, word2, ...]
445| 3| let prompt_cap = label_names.len() * 2 + 1 + num_words;
446| 3| let mut prompt_tokens: Vec<String> = Vec::new();
447| 3| prompt_tokens.try_reserve(prompt_cap).map_err(|_| {
^0
448| 0| anyhow::anyhow!(
449| 0| "allocation of {prompt_cap} prompt tokens would exceed available memory"
450| | )
451| 0| })?;
452| 42| for label in &label_names {
^39
453| 39| prompt_tokens.push(GLINER_ENT_TOKEN.to_string());
454| 39| prompt_tokens.push((*label).to_string());
455| 39| }
456| 3| prompt_tokens.push(GLINER_SEP_TOKEN.to_string());
457| 367| for word in words {
^364
458| 364| prompt_tokens.push((*word).to_string());
459| 364| }
460| |
461| | // Encode each token individually (word-by-word encoding per GLiNER protocol)
462| 3| let seq_estimate = prompt_tokens.len() * 3;
463| 3| let mut all_ids: Vec<i64> = Vec::new();
464| 3| all_ids.try_reserve(seq_estimate).map_err(|_| {
^0
465| 0| anyhow::anyhow!("allocation of {seq_estimate} token IDs would exceed available memory")
466| 0| })?;
467| 3| let mut all_attention: Vec<i64> = Vec::new();
468| 3| all_attention.try_reserve(seq_estimate).map_err(|_| {
^0
469| 0| anyhow::anyhow!(
470| 0| "allocation of {seq_estimate} attention masks would exceed available memory"
471| | )
472| 0| })?;
473| 3| let mut all_word_mask: Vec<i64> = Vec::new();
474| 3| all_word_mask.try_reserve(seq_estimate).map_err(|_| {
^0
475| 0| anyhow::anyhow!("allocation of {seq_estimate} word masks would exceed available memory")
476| 0| })?;
477| |
478| | // BOS token
479| 3| all_ids.push(1);
480| 3| all_attention.push(1);
481| 3| all_word_mask.push(0);
482| |
483| 3| let text_offset = label_names.len() * 2 + 1;
484| 3| let mut word_id: i64 = 0;
485| |
486| 445| for (pos, token_str) in prompt_tokens.iter().enumerate() {
^3 ^3
487| 445| let encoding = self
488| 445| .tokenizer
489| 445| .encode(token_str.as_str(), false)
490| 445| .map_err(|e| anyhow::anyhow!("GLiNER tokenizer encode error: {e}"))?;
^0 ^0
491| 445| let ids = encoding.get_ids();
492| 445| let is_text_token = pos >= text_offset;
493| |
494| 827| for (sub_idx, &id) in ids.iter().enumerate() {
^445^445 ^445
495| 827| all_ids.push(id as i64);
496| 827| all_attention.push(1);
497| 827| if is_text_token && sub_idx == 0 {
^737
498| 364| word_id += 1;
499| 364| all_word_mask.push(word_id);
500| 463| } else {
501| 463| all_word_mask.push(0);
502| 463| }
503| | }
504| | }
505| |
506| | // EOS token
507| 3| all_ids.push(2);
508| 3| all_attention.push(1);
509| 3| all_word_mask.push(0);
510| |
511| 3| let seq_len = all_ids.len();
512| |
513| | // Build ORT tensors using Tensor::from_array((shape, data)) API
514| 3| let t_input_ids = ort::value::Tensor::<i64>::from_array(([1usize, seq_len], all_ids))
515| 3| .map_err(|e| anyhow::anyhow!("building input_ids tensor: {e}"))?;
^0 ^0
516| 3| let t_attention = ort::value::Tensor::<i64>::from_array(([1usize, seq_len], all_attention))
517| 3| .map_err(|e| anyhow::anyhow!("building attention_mask tensor: {e}"))?;
^0 ^0
518| 3| let t_words_mask =
519| 3| ort::value::Tensor::<i64>::from_array(([1usize, seq_len], all_word_mask))
520| 3| .map_err(|e| anyhow::anyhow!("building words_mask tensor: {e}"))?;
^0 ^0
521| 3| let t_text_lengths =
522| 3| ort::value::Tensor::<i64>::from_array(([1usize, 1usize], vec![num_words as i64]))
523| 3| .map_err(|e| anyhow::anyhow!("building text_lengths tensor: {e}"))?;
^0 ^0
524| |
525| | // Build span tensors
526| 3| let num_spans = num_words * GLINER_MAX_WIDTH;
527| 3| let mut span_idx_data = vec![0i64; num_spans * 2];
528| 3| let mut span_mask_data = vec![false; num_spans];
529| |
530| 364| for start in 0..num_words {
^3
531| 364| let remaining = num_words - start;
532| 364| let actual_max_width = GLINER_MAX_WIDTH.min(remaining);
533| 4.21k| for width in 0..actual_max_width {
^364
534| 4.21k| let dim = start * GLINER_MAX_WIDTH + width;
535| 4.21k| span_idx_data[dim * 2] = start as i64;
536| 4.21k| span_idx_data[dim * 2 + 1] = (start + width) as i64;
537| 4.21k| span_mask_data[dim] = true;
538| 4.21k| }
539| | }
540| |
541| 3| let t_span_idx =
542| 3| ort::value::Tensor::<i64>::from_array(([1usize, num_spans, 2usize], span_idx_data))
543| 3| .map_err(|e| anyhow::anyhow!("building span_idx tensor: {e}"))?;
^0 ^0
544| 3| let t_span_mask =
545| 3| ort::value::Tensor::<bool>::from_array(([1usize, num_spans], span_mask_data))
546| 3| .map_err(|e| anyhow::anyhow!("building span_mask tensor: {e}"))?;
^0 ^0
547| |
548| | // Run inference — Session::run requires &mut Session; bind guard first.
549| 3| let mut session_guard = self.session.lock();
550| 3| let outputs = session_guard
551| 3| .run(ort::inputs![
552| 3| "input_ids" => t_input_ids,
553| 3| "attention_mask" => t_attention,
554| 3| "words_mask" => t_words_mask,
555| 3| "text_lengths" => t_text_lengths,
556| 3| "span_idx" => t_span_idx,
557| 3| "span_mask" => t_span_mask
558| 3| ])
559| 3| .map_err(|e| anyhow::anyhow!("GLiNER inference forward pass: {e}"))?;
^0 ^0
560| |
561| | // Extract logits: [1, num_words, max_width, num_classes]
562| | // try_extract_tensor returns (&Shape, &[f32]); index manually.
563| 3| let (logits_shape, logits_data) = outputs["logits"]
564| 3| .try_extract_tensor::<f32>()
565| 3| .map_err(|e| anyhow::anyhow!("extracting logits tensor: {e}"))?;
^0 ^0
566| |
567| 3| let num_classes = label_names.len();
568| | // Expected shape: [1, num_words, GLINER_MAX_WIDTH, num_classes]
569| | // Shape derefs to &[i64] so we can index directly.
570| 3| let max_width = logits_shape
571| 3| .get(2)
572| 3| .copied()
573| 3| .unwrap_or(GLINER_MAX_WIDTH as i64) as usize;
574| 3| let nc = logits_shape.get(3).copied().unwrap_or(num_classes as i64) as usize;
575| |
576| 3| let candidates_cap = num_words * max_width;
577| 3| let mut candidates: Vec<(usize, usize, usize, f32)> = Vec::new();
578| 3| candidates.try_reserve(candidates_cap).map_err(|_| {
^0
579| 0| anyhow::anyhow!(
580| 0| "allocation of {candidates_cap} candidates would exceed available memory"
581| | )
582| 0| })?;
583| |
584| 364| for start in 0..num_words {
^3
585| 4.23k| for width in 0..max_width {
^364
586| 4.23k| let end = start + width;
587| 4.23k| if end >= num_words {
588| 20| break;
589| 4.21k| }
590| 54.8k| for class_idx in 0..nc.min(num_classes) {
^4.21k^4.21k^4.21k
591| | // flat index: batch=0 * (num_words*max_width*nc) + start*(max_width*nc) + width*nc + class_idx
592| 54.8k| let flat = start * (max_width * nc) + width * nc + class_idx;
593| 54.8k| if flat >= logits_data.len() {
594| 0| break;
595| 54.8k| }
596| 54.8k| let raw = logits_data[flat];
597| 54.8k| let score = 1.0 / (1.0 + (-raw).exp());
598| 54.8k| if score >= threshold {
599| 0| candidates.push((start, end, class_idx, score));
600| 54.8k| }
601| | }
602| | }
603| | }
604| |
605| | // Sort by score descending for greedy NMS
606| 3| candidates.sort_by(|a, b| b.3.partial_cmp(&a.3).unwrap_or(std::cmp::Ordering::Equal));
^0 ^0 ^0 ^0 ^0
607| |
608| | // Greedy non-maximum suppression
609| 3| let mut used = vec![false; num_words];
610| 3| let mut entities: Vec<ExtractedEntity> = Vec::with_capacity(candidates.len().min(MAX_ENTS));
611| |
612| 3| for (start, end, class_idx, _score) in &candidates {
^0 ^0 ^0 ^0
613| 0| let overlap = (*start..=*end).any(|i| used[i]);
614| 0| if overlap {
615| 0| continue;
616| 0| }
617| 0| for flag in used.iter_mut().take(*end + 1).skip(*start) {
618| 0| *flag = true;
619| 0| }
620| 0| let text = words[*start..=*end].join(" ");
621| 0| if text.len() < MIN_ENTITY_CHARS {
622| 0| continue;
623| 0| }
624| 0| let entity_type = entity_labels[*class_idx].1;
625| 0| entities.push(ExtractedEntity {
626| 0| name: text,
627| 0| entity_type,
628| 0| });
629| 0| if entities.len() >= MAX_ENTS {
630| 0| break;
631| 0| }
632| | }
633| |
634| 3| Ok(entities)
635| 3| }
636| |}
637| |
638| |static GLINER_MODEL: OnceLock<Option<GlinerModel>> = OnceLock::new();
639| |
640| 1|fn gliner_model_dir(paths: &AppPaths, variant: GlinerVariant) -> PathBuf {
641| 1| paths.models.join(format!("gliner-multi-v2.1/{variant}"))
642| 1|}
643| |
644| 1|fn ensure_gliner_model_files(paths: &AppPaths, variant: GlinerVariant) -> Result<PathBuf> {
645| 1| let dir = gliner_model_dir(paths, variant);
646| 1| std::fs::create_dir_all(&dir)
647| 1| .with_context(|| format!("creating GLiNER model directory: {dir:?}"))?;
^0 ^0
648| |
649| 1| let model_file = dir.join(variant.as_filename());
650| 1| let tokenizer_file = dir.join("tokenizer.json");
651| |
652| 1| if model_file.exists() && tokenizer_file.exists() {
653| 1| return Ok(dir);
654| 0| }
655| |
656| 0| let repo = crate::constants::gliner_model_repo();
657| 0| tracing::info!(target: "extraction",
658| 0| "Downloading GLiNER model ({variant}, ~{})...",
659| 0| variant.display_size()
660| | );
661| 0| crate::output::emit_progress_i18n(
662| 0| &format!(
663| 0| "Downloading GLiNER model ({variant}, ~{})...",
664| 0| variant.display_size()
665| 0| ),
666| 0| &format!(
667| 0| "Baixando modelo GLiNER ({variant}, ~{})...",
668| 0| variant.display_size()
669| 0| ),
670| | );
671| |
672| 0| let api = huggingface_hub::api::sync::Api::new().with_context(|| "creating HF Hub client")?;
673| 0| let hf_repo = api.model(repo);
674| |
675| 0| let remote_model = format!("onnx/{}", variant.as_filename());
676| 0| if !model_file.exists() {
677| 0| let src = hf_repo
678| 0| .get(&remote_model)
679| 0| .with_context(|| format!("downloading {remote_model} from HF Hub"))?;
680| 0| std::fs::copy(&src, &model_file)
681| 0| .with_context(|| format!("copying {} to cache", variant.as_filename()))?;
682| 0| }
683| |
684| 0| if !tokenizer_file.exists() {
685| 0| let src = hf_repo
686| 0| .get("tokenizer.json")
687| 0| .with_context(|| "downloading tokenizer.json from HF Hub")?;
688| 0| std::fs::copy(&src, &tokenizer_file).with_context(|| "copying tokenizer.json to cache")?;
689| 0| }
690| |
691| 0| Ok(dir)
692| 1|}
693| |
694| 1|fn load_gliner_model(paths: &AppPaths, variant: GlinerVariant) -> Result<GlinerModel> {
695| 1| let dir = ensure_gliner_model_files(paths, variant)?;
^0
696| 1| GlinerModel::load(&dir, variant)
697| 1|}
698| |
699| 3|fn get_or_init_gliner(paths: &AppPaths, variant: GlinerVariant) -> Option<&'static GlinerModel> {
700| 3| GLINER_MODEL
701| 3| .get_or_init(|| match load_gliner_model(paths, variant) {
^1
702| 1| Ok(m) => Some(m),
703| 0| Err(e) => {
704| 0| tracing::warn!(target: "extraction", error = %e, "GLiNER model unavailable, graceful degradation");
705| 0| None
706| | }
707| 1| })
708| 3| .as_ref()
709| 3|}
710| |
711| 21|fn apply_regex_prefilter(body: &str) -> Vec<ExtractedEntity> {
712| 21| let mut entities = Vec::with_capacity(16);
713| 21| let mut seen: std::collections::HashSet<String> = std::collections::HashSet::with_capacity(32);
714| |
715| 21| let add = |entities: &mut Vec<ExtractedEntity>,
716| | seen: &mut std::collections::HashSet<String>,
717| | name: &str,
718| 24| entity_type: EntityType| {
719| 24| let name = name.trim().to_string();
720| 24| if name.len() >= MIN_ENTITY_CHARS && seen.insert(name.clone()) {
721| 24| entities.push(ExtractedEntity { name, entity_type });
722| 24| }
^0
723| 24| };
724| |
725| | // v1.0.25 P0-4: strip section-structure markers before any other processing so that
726| | // "Etapa 3", "Fase 1", "Passo 2" are not fed to downstream regex passes.
727| 21| let cleaned = regex_section_marker().replace_all(body, " ");
728| 21| let cleaned = cleaned.as_ref();
729| |
730| 21| for m in regex_email().find_iter(cleaned) {
^5
731| 5| // v1.0.20: email is "concept" (regex alone cannot distinguish person from mailing list/role).
732| 5| add(&mut entities, &mut seen, m.as_str(), EntityType::Concept);
733| 5| }
734| 21| for m in regex_uuid().find_iter(cleaned) {
^1
735| 1| add(&mut entities, &mut seen, m.as_str(), EntityType::Concept);
736| 1| }
737| 51| for m in regex_all_caps().find_iter(cleaned) {
^21 ^21 ^21
738| 51| let candidate = m.as_str();
739| | // v1.0.22: filtro consolidado (stopwords + HTTP methods); preserva identificadores com underscore.
740| 51| if !is_filtered_all_caps(candidate) {
741| 14| add(&mut entities, &mut seen, candidate, EntityType::Concept);
742| 37| }
743| | }
744| | // v1.0.25 P0-2: capture CamelCase brand names that NER model often misses.
745| | // Maps to "organization" (V008 schema) because brand names are typically organisations.
746| 21| for m in regex_brand_camel().find_iter(cleaned) {
^4
747| 4| let name = m.as_str();
748| | // Skip if the uppercased form is a known stopword (e.g. "JsonSchema" → "JSONSCHEMA").
749| 4| if !ALL_CAPS_STOPWORDS.contains(&name.to_uppercase().as_str()) {
750| 4| add(&mut entities, &mut seen, name, EntityType::Organization);
751| 4| }
^0
752| | }
753| |
754| 21| entities
755| 21|}
756| |
757| |/// Extracts URLs from a memory body, deduplicated by text.
758| |/// URLs are stored in the `memory_urls` table separately from graph entities.
759| |/// v1.0.24: split of the URL block that polluted apply_regex_prefilter with entity_type='concept'.
760| 8|pub fn extract_urls(body: &str) -> Vec<ExtractedUrl> {
761| 8| let mut seen: std::collections::HashSet<String> = std::collections::HashSet::with_capacity(8);
762| 8| let mut result = Vec::with_capacity(4);
763| 8| for m in regex_url().find_iter(body) {
^4
764| 4| let raw = m.as_str();
765| 4| let cleaned = raw
766| 4| .trim_end_matches('`')
767| 4| .trim_end_matches(',')
768| 4| .trim_end_matches('.')
769| 4| .trim_end_matches(';')
770| 4| .trim_end_matches(')')
771| 4| .trim_end_matches(']')
772| 4| .trim_end_matches('}');
773| 4| if cleaned.len() >= 10 && seen.insert(cleaned.to_string()) {
774| 3| result.push(ExtractedUrl {
775| 3| url: cleaned.to_string(),
776| 3| offset: m.start(),
777| 3| });
778| 3| }
^1
779| | }
780| 8| result
781| 8|}
782| |
783| |/// Returns (relationships, truncated) where truncated is true when the cap was hit
784| |/// before all entity pairs were covered. Exposed in RememberResponse as
785| |/// `relationships_truncated` so callers can decide whether to increase the cap.
786| |///
787| |/// v1.0.31 A9: superseded by `build_relationships_by_sentence_cooccurrence` for
788| |/// the auto-extraction pipeline because the legacy pairwise scheme produces a
789| |/// dense C(N,2) graph polluted with co-mentions across unrelated paragraphs.
790| |/// Kept for unit tests that pin the cap behaviour and for callers that lack a
791| |/// body string.
792| |#[cfg(test)]
793| 2|fn build_relationships(entities: &[NewEntity]) -> (Vec<NewRelationship>, bool) {
794| 2| if entities.len() < 2 {
795| 0| return (Vec::new(), false);
796| 2| }
797| |
798| | // v1.0.22: cap configurable via env var (constants::max_relationships_per_memory).
799| | // Allows users with dense corpora to increase beyond the default 50.
800| 2| let max_rels = crate::constants::max_relationships_per_memory();
801| 2| let n = entities.len().min(MAX_ENTS);
802| 2| let mut rels: Vec<NewRelationship> = Vec::with_capacity(n.min(max_rels));
803| 2| let mut seen: std::collections::HashSet<(usize, usize)> =
804| 2| std::collections::HashSet::with_capacity(n.min(max_rels));
805| |
806| 2| let mut hit_cap = false;
807| 16| 'outer: for i in 0..n {
^2
808| 16| if rels.len() >= max_rels {
809| 1| hit_cap = true;
810| 1| break;
811| 15| }
812| |
813| 15| let mut for_entity = 0usize;
814| 70| for j in (i + 1)..n {
^15 ^15
815| 70| if for_entity >= TOP_K_RELATIONS {
816| 10| break;
817| 60| }
818| 60| if rels.len() >= max_rels {
819| 0| hit_cap = true;
820| 0| break 'outer;
821| 60| }
822| |
823| 60| let key = (i.min(j), i.max(j));
824| 60| if !seen.insert(key) {
825| 0| continue;
826| 60| }
827| |
828| 60| rels.push(NewRelationship {
829| 60| // clone needed: NewRelationship requires owned String for source/target
830| 60| source: entities[i].name.clone(),
831| 60| target: entities[j].name.clone(),
832| 60| relation: DEFAULT_RELATION.to_string(),
833| 60| strength: 0.5,
834| 60| description: None,
835| 60| });
836| 60| for_entity += 1;
837| | }
838| | }
839| |
840| | // v1.0.20: warn when relationships were truncated before covering all possible pairs.
841| 2| if hit_cap {
842| 1| tracing::warn!(target: "extraction",
843| 0| "relationships truncated to {max_rels} (with {n} entities, theoretical max was ~{}x combinations)",
844| 0| n.saturating_sub(1)
845| | );
846| 1| }
847| |
848| 2| (rels, hit_cap)
849| 2|}
850| |
851| |/// v1.0.31 A9: build relationships only between entities that actually
852| |/// co-occur within the same sentence (split on `.`, `!`, `?`, newline).
853| |///
854| |/// The legacy `build_relationships` pairs every entity with every other,
855| |/// yielding a dense C(N,2) graph dominated by spurious "mentions" edges
856| |/// across unrelated sections. Restricting to sentence-level co-occurrence
857| |/// keeps the edges semantically meaningful while still respecting the
858| |/// configurable `max_relationships_per_memory` cap.
859| |///
860| |/// Returns `(relationships, truncated)` mirroring `build_relationships`.
861| 8|fn build_relationships_by_sentence_cooccurrence(
862| 8| body: &str,
863| 8| entities: &[NewEntity],
864| 8|) -> (Vec<NewRelationship>, bool) {
865| 8| if entities.len() < 2 {
866| 3| return (Vec::new(), false);
867| 5| }
868| |
869| 5| let max_rels = crate::constants::max_relationships_per_memory();
870| 5| let lower_names: Vec<(usize, String)> = entities
871| 5| .iter()
872| 5| .take(MAX_ENTS)
873| 5| .enumerate()
874| 11| .map(|(i, e)| (i, e.name.to_lowercase()))
^5
875| 5| .collect();
876| |
877| 5| let mut rels: Vec<NewRelationship> = Vec::with_capacity(max_rels);
878| 5| let mut seen: std::collections::HashSet<(usize, usize)> =
879| 5| std::collections::HashSet::with_capacity(max_rels);
880| 5| let mut hit_cap = false;
881| |
882| 12| for sentence in body.split(['.', '!', '?', '\n']) {
^5 ^5 ^5
883| 12| if sentence.trim().is_empty() {
884| 2| continue;
885| 10| }
886| 10| let lower_sentence = sentence.to_lowercase();
887| 10| let present: Vec<usize> = lower_names
888| 10| .iter()
889| 22| .filter(|(_, name)| !name.is_empty() && lower_sentence.contains(name.as_str()))
^10
890| 10| .map(|(i, _)| *i)
891| 10| .collect();
892| |
893| 10| if present.len() < 2 {
894| 7| continue;
895| 3| }
896| |
897| 3| let n = present.len();
898| 6| for i in 0..n {
^3
899| 6| for j in (i + 1)..n {
^3
900| 3| if rels.len() >= max_rels {
901| 0| hit_cap = true;
902| 0| tracing::warn!(target: "extraction",
903| 0| "relationships truncated to {max_rels} during sentence-level pairing"
904| | );
905| 0| return (rels, hit_cap);
906| 3| }
907| 3| let ei = present[i];
908| 3| let ej = present[j];
909| 3| let key = (ei.min(ej), ei.max(ej));
910| 3| if seen.insert(key) {
911| 2| rels.push(NewRelationship {
912| 2| source: entities[ei].name.clone(),
913| 2| target: entities[ej].name.clone(),
914| 2| relation: DEFAULT_RELATION.to_string(),
915| 2| strength: 0.5,
916| 2| description: None,
917| 2| });
918| 2| }
^1
919| | }
920| | }
921| | }
922| |
923| 5| (rels, hit_cap)
924| 8|}
925| |
926| |/// v1.0.22 P1: extends entities with hyphenated or space-separated numeric suffixes.
927| |/// Cases: GPT extracted but body contains "GPT-5" → rewrites to "GPT-5".
928| |/// Cases: Claude extracted but body contains "Claude 4" → rewrites to "Claude 4".
929| |/// Conservative: only extends when the suffix is at most 7 characters.
930| |/// v1.0.24 P2-E: suffix accepts an optional lowercase ASCII letter after digits to cover
931| |/// models such as "GPT-4o", "Llama-5b", "Mistral-8x" (digits + [a-z]? + [x\d+]?).
932| 7|fn extend_with_numeric_suffix(entities: Vec<ExtractedEntity>, body: &str) -> Vec<ExtractedEntity> {
933| | static SUFFIX_RE: OnceLock<Regex> = OnceLock::new();
934| | // Matches: separator + digits + optional decimal + optional lowercase letter
935| | // Examples: "-4", " 5", "-4o", " 5b", "-8x", " 3.5", "-3.5-turbo" (capped by len)
936| 7| let suffix_re = SUFFIX_RE.get_or_init(|| {
^1
937| 1| Regex::new(r"^([\-\s]+\d+(?:\.\d+)?[a-z]?)")
938| 1| .expect("compile-time validated numeric suffix regex literal")
939| 1| });
940| |
941| 7| entities
942| 7| .into_iter()
943| 8| .map(|ent| {
^7
944| | // Finds the first case-sensitive occurrence of the entity in the body
945| 8| if let Some(pos) = body.find(&ent.name) {
946| 8| let after_pos = pos + ent.name.len();
947| 8| if after_pos < body.len() {
948| 8| let after = &body[after_pos..];
949| 8| if let Some(m) = suffix_re.find(after) {
^4
950| 4| let suffix = m.as_str();
951| | // Conservative: cap suffix length to 7 chars to avoid grabbing
952| | // long hyphenated phrases while allowing "4o", "5b", "3.5b".
953| 4| if suffix.len() <= 7 {
954| 4| let mut extended = String::with_capacity(ent.name.len() + suffix.len());
955| 4| extended.push_str(&ent.name);
956| 4| extended.push_str(suffix);
957| 4| return ExtractedEntity {
958| 4| name: extended,
959| 4| entity_type: ent.entity_type,
960| 4| };
961| 0| }
962| 4| }
963| 0| }
964| 0| }
965| 4| ent
966| 8| })
967| 7| .collect()
968| 7|}
969| |
970| |/// Captures versioned model names that NER model consistently misses.
971| |///
972| |/// NER model often classifies tokens like "Claude" or "Llama" as common nouns,
973| |/// failing to emit a B-PER/B-ORG tag. As a result, `extend_with_numeric_suffix`
974| |/// never sees these candidates and the version suffix gets lost.
975| |///
976| |/// This function scans the body with a conservative regex, matching capitalised
977| |/// words followed by a space-or-hyphen and a small integer. Matches that are not
978| |/// already covered by an existing entity (case-insensitive) are appended with the
979| |/// `concept` type, mirroring how `extend_with_numeric_suffix` represents these
980| |/// items downstream.
981| |///
982| |/// v1.0.24 P2-D: regex extended to cover:
983| |/// - Alphanumeric version suffixes: "GPT-4o", "Llama-3b", "Mistral-8x"
984| |/// - Composite versions: "Mixtral 8x7B" (digit × digit + uppercase letter)
985| |/// - Named release tiers after version: "Claude 4 Sonnet", "Llama 3 Pro"
986| |///
987| |/// Examples covered: "Claude 4", "Llama 3", "GPT-4o", "Claude 4 Sonnet", "Mixtral 8x7B".
988| |/// Examples already handled upstream and skipped here: plain "Apple" without a suffix.
989| 8|fn augment_versioned_model_names(
990| 8| entities: Vec<ExtractedEntity>,
991| 8| body: &str,
992| 8|) -> Vec<ExtractedEntity> {
993| | static VERSIONED_MODEL_RE: OnceLock<Regex> = OnceLock::new();
994| | // Pattern breakdown:
995| | // [A-Z][A-Za-z]{2,15} — capitalised model name (3-16 chars)
996| | // [\s\-]+ — separator: space(s) or hyphen(s)
997| | // \d+(?:\.\d+)? — version number, optional decimal
998| | // (?:[a-z]|x\d+[A-Za-z]?)? — optional alphanumeric suffix: "o", "b", "x7B"
999| | // (?:\s+(?:Sonnet|Opus|Haiku|Turbo|Pro|Lite|Mini|Nano|Flash|Ultra))? — optional release tier
1000| 8| let model_re = VERSIONED_MODEL_RE.get_or_init(|| {
^1
1001| 1| Regex::new(
1002| 1| r"\b([A-Z][A-Za-z]{2,15})[\s\-]+(\d+(?:\.\d+)?(?:[a-z]|x\d+[A-Za-z]?)?)(?:\s+(?:Sonnet|Opus|Haiku|Turbo|Pro|Lite|Mini|Nano|Flash|Ultra))?\b",
1003| | )
1004| 1| .expect("compile-time validated versioned model regex literal")
1005| 1| });
1006| |
1007| 8| let mut existing_lc: std::collections::HashSet<String> =
1008| 8| entities.iter().map(|ent| ent.name.to_lowercase()).collect();
^5 ^5
1009| 8| let mut result = entities;
1010| |
1011| 8| for caps in model_re.captures_iter(body) {
^5
1012| 5| let full_match = caps.get(0).map(|m| m.as_str()).unwrap_or("");
1013| | // Conservative cap: avoid harvesting multi-word noise like "section 12" inside
1014| | // long passages. A model name plus a one or two digit suffix fits in 24 chars.
1015| 5| if full_match.is_empty() || full_match.len() > 24 {
1016| 0| continue;
1017| 5| }
1018| 5| let normalized_lc = full_match.to_lowercase();
1019| 5| if existing_lc.contains(&normalized_lc) {
1020| 1| continue;
1021| 4| }
1022| | // Stop appending once the global entity cap is reached to keep parity with
1023| | // `merge_and_deduplicate` truncation semantics.
1024| 4| if result.len() >= MAX_ENTS {
1025| 0| break;
1026| 4| }
1027| 4| existing_lc.insert(normalized_lc);
1028| 4| result.push(ExtractedEntity {
1029| 4| name: full_match.to_string(),
1030| 4| entity_type: EntityType::Concept,
1031| 4| });
1032| | }
1033| |
1034| 8| result
1035| 8|}
1036| |
1037| 10|fn merge_and_deduplicate(
1038| 10| regex_ents: Vec<ExtractedEntity>,
1039| 10| ner_ents: Vec<ExtractedEntity>,
1040| 10|) -> Vec<ExtractedEntity> {
1041| | // v1.0.25 P0-3: Collision detection uses substring containment (not starts_with)
1042| | // and is scoped per entity_type. This fixes two bugs from prior versions:
1043| | //
1044| | // 1. starts_with was not symmetric for non-prefix substrings. "sonne" does not
1045| | // start_with "sonnet", so the pair could survive dedup depending on insertion
1046| | // order. contains() catches both directions unconditionally.
1047| | //
1048| | // 2. The lookup key omitted entity_type, so "Apple/organization" and
1049| | // "Apple/concept" collapsed into one. Key is now "type\0name_lc".
1050| | //
1051| | // Earlier invariants preserved:
1052| | // - NFKC normalization before lowercasing (v1.0.24).
1053| | // - Longest-wins: on collision keep the entity with the longer name.
1054| | // - Truncation warning at MAX_ENTS.
1055| 10| let mut by_lc: std::collections::HashMap<String, usize> =
1056| 10| std::collections::HashMap::with_capacity(regex_ents.len() + ner_ents.len());
1057| 10| let mut result: Vec<ExtractedEntity> = Vec::with_capacity(MAX_ENTS);
1058| 10| let mut truncated = false;
1059| |
1060| 10| let total_input = regex_ents.len() + ner_ents.len();
1061| 18| for ent in regex_ents.into_iter().chain(ner_ents) {
^10 ^10 ^10 ^10
1062| 18| let name_lc = ent.name.nfkc().collect::<String>().to_lowercase();
1063| | // Composite key: entity_type + NUL + normalised lowercase name.
1064| | // Collision search is scoped to the same type so that e.g.
1065| | // "Apple/organization" and "Apple/concept" are kept separately.
1066| 18| let key = {
1067| 18| let et = ent.entity_type.as_str();
1068| 18| let mut k = String::with_capacity(et.len() + 1 + name_lc.len());
1069| 18| k.push_str(et);
1070| 18| k.push('\0');
1071| 18| k.push_str(&name_lc);
1072| 18| k
1073| | };
1074| |
1075| | // Scan stored entries for substring containment within the same type.
1076| | // Two names collide when one is a case-insensitive substring of the other:
1077| | // "sonne" ⊂ "sonnet" → collision, keep "sonnet" (longest-wins)
1078| | // "open" ⊂ "openai" → collision, keep "openai" (longest-wins)
1079| 18| let type_prefix = {
1080| 18| let et = ent.entity_type.as_str();
1081| 18| let mut p = String::with_capacity(et.len() + 1);
1082| 18| p.push_str(et);
1083| 18| p.push('\0');
1084| 18| p
1085| | };
1086| 18| let mut collision_idx: Option<usize> = None;
1087| 22| for (existing_key, idx) in &by_lc {
^9 ^9
1088| | // Fast-path: check type prefix matches before scanning the name.
1089| 9| if !existing_key.starts_with(&type_prefix) {
1090| 1| continue;
1091| 8| }
1092| 8| let existing_name_lc = &existing_key[type_prefix.len()..];
1093| 8| if existing_name_lc == name_lc
1094| 5| || existing_name_lc.contains(name_lc.as_str())
1095| 5| || name_lc.contains(existing_name_lc)
1096| | {
1097| 5| collision_idx = Some(*idx);
1098| 5| break;
1099| 3| }
1100| | }
1101| 18| match collision_idx {
1102| 5| Some(idx) => {
1103| | // Replace stored entity only when the new candidate is strictly
1104| | // longer; otherwise drop the new one.
1105| 5| if ent.name.len() > result[idx].name.len() {
1106| 3| let old_name_lc = result[idx].name.nfkc().collect::<String>().to_lowercase();
1107| 3| let old_key = {
1108| 3| let et = result[idx].entity_type.as_str();
1109| 3| let mut k = String::with_capacity(et.len() + 1 + old_name_lc.len());
1110| 3| k.push_str(et);
1111| 3| k.push('\0');
1112| 3| k.push_str(&old_name_lc);
1113| 3| k
1114| 3| };
1115| 3| by_lc.remove(&old_key);
1116| 3| result[idx] = ent;
1117| 3| by_lc.insert(key, idx);
1118| 3| }
^2
1119| | }
1120| 13| None => {
1121| 13| by_lc.insert(key, result.len());
1122| 13| result.push(ent);
1123| 13| }
1124| | }
1125| 18| if result.len() >= MAX_ENTS {
1126| 0| truncated = true;
1127| 0| break;
1128| 18| }
1129| | }
1130| |
1131| | // v1.0.20: warn when silent truncation discards entities above MAX_ENTS.
1132| 10| if truncated {
1133| 0| tracing::warn!(target: "extraction",
1134| 0| "extraction truncated at {MAX_ENTS} entities (input had {total_input} candidates before deduplication)"
1135| | );
1136| 10| }
1137| |
1138| 10| result
1139| 10|}
1140| |
1141| 5|fn to_new_entities(extracted: Vec<ExtractedEntity>) -> Vec<NewEntity> {
1142| 5| extracted
1143| 5| .into_iter()
1144| 5| .map(|e| NewEntity {
1145| 7| name: e.name,
1146| 7| entity_type: e.entity_type,
1147| 7| description: None,
1148| 7| })
1149| 5| .collect()
1150| 5|}
1151| |
1152| 3|pub fn extract_graph_auto(
1153| 3| body: &str,
1154| 3| paths: &AppPaths,
1155| 3| variant: GlinerVariant,
1156| 3|) -> Result<ExtractionResult> {
1157| 3| let regex_entities = apply_regex_prefilter(body);
1158| 3| let threshold = crate::constants::gliner_confidence_threshold();
1159| |
1160| 3| let mut gliner_used = false;
1161| 3| let ner_entities = match get_or_init_gliner(paths, variant) {
1162| 3| Some(model) => match model.predict(body, GLINER_ENTITY_LABELS, threshold) {
1163| 3| Ok(ents) => {
1164| 3| gliner_used = true;
1165| 3| ents
1166| | }
1167| 0| Err(e) => {
1168| 0| tracing::warn!(target: "extraction", error = %e, "GLiNER NER failed, falling back to regex-only");
1169| 0| Vec::new()
1170| | }
1171| | },
1172| 0| None => Vec::new(),
1173| | };
1174| |
1175| 3| let merged = merge_and_deduplicate(regex_entities, ner_entities);
1176| 3| let extended = extend_with_numeric_suffix(merged, body);
1177| 3| let with_models = augment_versioned_model_names(extended, body);
1178| 3| let with_models: Vec<ExtractedEntity> = with_models
1179| 3| .into_iter()
1180| 4| .filter(|e| !regex_section_marker().is_match(&e.name))
^3
1181| 3| .collect();
1182| 3| let entities = to_new_entities(with_models);
1183| 3| let (relationships, relationships_truncated) =
1184| 3| build_relationships_by_sentence_cooccurrence(body, &entities);
1185| |
1186| 3| let extraction_method = if gliner_used {
1187| 3| format!("gliner-{variant}+regex")
1188| | } else {
1189| 0| "regex-only".to_string()
1190| | };
1191| |
1192| 3| let urls = extract_urls(body);
1193| |
1194| 3| Ok(ExtractionResult {
1195| 3| entities,
1196| 3| relationships,
1197| 3| relationships_truncated,
1198| 3| extraction_method,
1199| 3| urls,
1200| 3| })
1201| 3|}
1202| |
1203| |pub struct RegexExtractor;
1204| |
1205| |impl Extractor for RegexExtractor {
1206| 2| fn extract(&self, body: &str) -> Result<ExtractionResult> {
1207| 2| let regex_entities = apply_regex_prefilter(body);
1208| 2| let entities = to_new_entities(regex_entities);
1209| 2| let (relationships, relationships_truncated) =
1210| 2| build_relationships_by_sentence_cooccurrence(body, &entities);
1211| 2| let urls = extract_urls(body);
1212| 2| Ok(ExtractionResult {
1213| 2| entities,
1214| 2| relationships,
1215| 2| relationships_truncated,
1216| 2| extraction_method: "regex-only".to_string(),
1217| 2| urls,
1218| 2| })
1219| 2| }
1220| |}
1221| |
1222| |#[cfg(test)]
1223| |mod tests {
1224| | use super::*;
1225| | use crate::entity_type::EntityType;
1226| |
1227| 3| fn make_paths() -> AppPaths {
1228| | use std::path::PathBuf;
1229| 3| AppPaths {
1230| 3| db: PathBuf::from("/tmp/test.sqlite"),
1231| 3| models: PathBuf::from("/tmp/test_models"),
1232| 3| }
1233| 3| }
1234| |
1235| | #[test]
1236| 1| fn regex_email_captures_address() {
1237| 1| let ents = apply_regex_prefilter("contact: someone@company.com for more info");
1238| | // v1.0.20: emails are classified as "concept" (regex alone cannot distinguish person from role).
1239| 1| assert!(ents
1240| 1| .iter()
1241| 1| .any(|e| e.name == "someone@company.com" && e.entity_type == EntityType::Concept));
1242| 1| }
1243| |
1244| | #[test]
1245| 1| fn regex_all_caps_filters_pt_rule_word() {
1246| | // v1.0.20 fix P1: NUNCA, PROIBIDO, DEVE must not become "entities".
1247| 1| let ents = apply_regex_prefilter("NUNCA do this. PROIBIDO use X. DEVE follow Y.");
1248| 1| assert!(
1249| 1| !ents.iter().any(|e| e.name == "NUNCA"),
^0 ^0
1250| 0| "NUNCA must be filtered as a stopword"
1251| | );
1252| 1| assert!(
1253| 1| !ents.iter().any(|e| e.name == "PROIBIDO"),
^0 ^0
1254| 0| "PROIBIDO must be filtered"
1255| | );
1256| 1| assert!(
1257| 1| !ents.iter().any(|e| e.name == "DEVE"),
^0 ^0
1258| 0| "DEVE must be filtered"
1259| | );
1260| 1| }
1261| |
1262| | #[test]
1263| 1| fn regex_all_caps_accepts_underscored_constant() {
1264| | // Technical constants like MAX_RETRY, TIMEOUT_MS must always be accepted.
1265| 1| let ents = apply_regex_prefilter("configure MAX_RETRY=3 and API_TIMEOUT=30");
1266| 1| assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
1267| 2| assert!(ents.iter().any(|e| e.name == "API_TIMEOUT"));
^1 ^1 ^1
1268| 1| }
1269| |
1270| | #[test]
1271| 1| fn regex_all_caps_accepts_domain_acronym() {
1272| | // Legitimate (non-stopword) acronyms must pass: OPENAI, NVIDIA, GOOGLE.
1273| 1| let ents = apply_regex_prefilter("OPENAI launched GPT-5 with NVIDIA H100");
1274| 1| assert!(ents.iter().any(|e| e.name == "OPENAI"));
1275| 3| assert!(ents.iter().any(|e| e.name == "NVIDIA"));
^1 ^1 ^1
1276| 1| }
1277| |
1278| | #[test]
1279| 1| fn regex_url_does_not_appear_in_apply_regex_prefilter() {
1280| | // v1.0.24 P0-2: URLs were removed from apply_regex_prefilter and now go through extract_urls.
1281| 1| let ents = apply_regex_prefilter("see https://docs.rs/crate for details");
1282| 1| assert!(
1283| 1| !ents.iter().any(|e| e.name.starts_with("https://")),
^0 ^0
1284| 0| "URLs must not appear as entities after the P0-2 split"
1285| | );
1286| 1| }
1287| |
1288| | #[test]
1289| 1| fn extract_urls_captures_https() {
1290| 1| let urls = extract_urls("see https://docs.rs/crate for details");
1291| 1| assert_eq!(urls.len(), 1);
1292| 1| assert_eq!(urls[0].url, "https://docs.rs/crate");
1293| 1| assert!(urls[0].offset > 0);
1294| 1| }
1295| |
1296| | #[test]
1297| 1| fn extract_urls_trim_sufixo_pontuacao() {
1298| 1| let urls = extract_urls("link: https://example.com/path. fim");
1299| 1| assert!(!urls.is_empty());
1300| 1| assert!(
1301| 1| !urls[0].url.ends_with('.'),
1302| 0| "sufixo ponto deve ser removido"
1303| | );
1304| 1| }
1305| |
1306| | #[test]
1307| 1| fn extract_urls_dedupes_repeated() {
1308| 1| let body = "https://example.com referenciado aqui e depois aqui https://example.com";
1309| 1| let urls = extract_urls(body);
1310| 1| assert_eq!(urls.len(), 1, "URLs repetidas devem ser deduplicadas");
^0
1311| 1| }
1312| |
1313| | #[test]
1314| 1| fn regex_uuid_captura_identificador() {
1315| 1| let ents = apply_regex_prefilter("id=550e8400-e29b-41d4-a716-446655440000 no sistema");
1316| 1| assert!(ents.iter().any(|e| e.entity_type == EntityType::Concept));
1317| 1| }
1318| |
1319| | #[test]
1320| 1| fn regex_all_caps_captura_constante() {
1321| 1| let ents = apply_regex_prefilter("configure MAX_RETRY e TIMEOUT_MS");
1322| 1| assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
1323| 2| assert!(ents.iter().any(|e| e.name == "TIMEOUT_MS"));
^1 ^1 ^1
1324| 1| }
1325| |
1326| | #[test]
1327| 1| fn regex_all_caps_ignores_short_words() {
1328| 1| let ents = apply_regex_prefilter("use AI em seu projeto");
1329| 1| assert!(
1330| 1| !ents.iter().any(|e| e.name == "AI"),
^0 ^0
1331| 0| "AI tem apenas 2 chars, deve ser ignorado"
1332| | );
1333| 1| }
1334| |
1335| | #[test]
1336| 1| fn build_relationships_respeitam_max_rels() {
1337| 1| let entities: Vec<NewEntity> = (0..20)
1338| 1| .map(|i| NewEntity {
1339| 20| name: format!("entidade_{i}"),
1340| 20| entity_type: EntityType::Concept,
1341| 20| description: None,
1342| 20| })
1343| 1| .collect();
1344| 1| let (rels, truncated) = build_relationships(&entities);
1345| 1| let max_rels = crate::constants::max_relationships_per_memory();
1346| 1| assert!(rels.len() <= max_rels, "deve respeitar max_rels={max_rels}");
^0
1347| 1| if rels.len() == max_rels {
1348| 1| assert!(truncated, "truncated deve ser true quando atingiu o cap");
^0
1349| 0| }
1350| 1| }
1351| |
1352| | #[test]
1353| 1| fn build_relationships_without_duplicates() {
1354| 1| let entities: Vec<NewEntity> = (0..5)
1355| 1| .map(|i| NewEntity {
1356| 5| name: format!("ent_{i}"),
1357| 5| entity_type: EntityType::Concept,
1358| 5| description: None,
1359| 5| })
1360| 1| .collect();
1361| 1| let (rels, _truncated) = build_relationships(&entities);
1362| 1| let mut pares: std::collections::HashSet<(String, String)> =
1363| 1| std::collections::HashSet::new();
1364| 11| for r in &rels {
^10
1365| 10| let par = (r.source.clone(), r.target.clone());
1366| 10| assert!(pares.insert(par), "par duplicado encontrado");
^0
1367| | }
1368| 1| }
1369| |
1370| | #[test]
1371| 1| fn merge_dedupes_by_lowercase_name() {
1372| | // v1.0.25: collision detection is scoped per entity_type; same name + same type
1373| | // must deduplicate to one entry. Different types are kept separately.
1374| 1| let a = vec![ExtractedEntity {
1375| 1| name: "Rust".to_string(),
1376| 1| entity_type: EntityType::Concept,
1377| 1| }];
1378| 1| let b = vec![ExtractedEntity {
1379| 1| name: "rust".to_string(),
1380| 1| entity_type: EntityType::Concept,
1381| 1| }];
1382| 1| let merged = merge_and_deduplicate(a, b);
1383| 1| assert_eq!(
1384| 1| merged.len(),
1385| | 1,
1386| 0| "rust and Rust with the same type are the same entity"
1387| | );
1388| 1| }
1389| |
1390| | #[test]
1391| 1| fn regex_extractor_implements_trait() {
1392| 1| let extractor = RegexExtractor;
1393| 1| let result = extractor
1394| 1| .extract("contato: dev@empresa.io e MAX_TIMEOUT configurado")
1395| 1| .unwrap();
1396| 1| assert!(!result.entities.is_empty());
1397| 1| }
1398| |
1399| | #[test]
1400| 1| fn extract_returns_ok_without_model() {
1401| | // Without a downloaded model, must return Ok with regex-only entities.
1402| 1| let paths = make_paths();
1403| 1| let body = "contato: teste@exemplo.com com MAX_RETRY=3";
1404| 1| let result = extract_graph_auto(body, &paths, GlinerVariant::Int8).unwrap();
1405| 1| assert!(result
1406| 1| .entities
1407| 1| .iter()
1408| 1| .any(|e| e.name.contains("teste@exemplo.com")));
1409| 1| }
1410| |
1411| | #[test]
1412| 1| fn stopwords_filter_v1024_terms() {
1413| | // v1.0.24: verify that all 17 new stopwords added in P0-3 are filtered
1414| | // by apply_regex_prefilter so they do not appear as entities.
1415| 1| let body = "ACEITE ACK ACL BORDA CHECKLIST COMPLETED CONFIRME \
1416| 1| DEVEMOS DONE FIXED NEGUE PENDING PLAN PODEMOS RECUSE TOKEN VAMOS";
1417| 1| let ents = apply_regex_prefilter(body);
1418| 1| let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
^0 ^0
1419| 18| for word in &[
^17
1420| 18| "ACEITE",
1421| 18| "ACK",
1422| 18| "ACL",
1423| 18| "BORDA",
1424| 18| "CHECKLIST",
1425| 18| "COMPLETED",
1426| 18| "CONFIRME",
1427| 18| "DEVEMOS",
1428| 18| "DONE",
1429| 18| "FIXED",
1430| 18| "NEGUE",
1431| 18| "PENDING",
1432| 18| "PLAN",
1433| 18| "PODEMOS",
1434| 18| "RECUSE",
1435| 18| "TOKEN",
1436| 18| "VAMOS",
1437| 18| ] {
1438| 17| assert!(
1439| 17| !names.contains(word),
1440| 0| "v1.0.24 stopword {word} should be filtered but was found in entities"
1441| | );
1442| | }
1443| 1| }
1444| |
1445| | #[test]
1446| 1| fn dedup_normalizes_unicode_combining_marks() {
1447| | // v1.0.24 P1-E: "Caf\u{e9}" (NFC precomposed) and "Cafe\u{301}" (NFD with
1448| | // combining acute accent) must deduplicate to a single entity after NFKC
1449| | // normalization.
1450| 1| let nfc = vec![ExtractedEntity {
1451| 1| name: "Caf\u{e9}".to_string(),
1452| 1| entity_type: EntityType::Concept,
1453| 1| }];
1454| | // Build the NFD form: 'e' followed by combining acute accent U+0301
1455| 1| let nfd_name = "Cafe\u{301}".to_string();
1456| 1| let nfd = vec![ExtractedEntity {
1457| 1| name: nfd_name,
1458| 1| entity_type: EntityType::Concept,
1459| 1| }];
1460| 1| let merged = merge_and_deduplicate(nfc, nfd);
1461| 1| assert_eq!(
1462| 1| merged.len(),
1463| | 1,
1464| 0| "NFC 'Caf\\u{{e9}}' and NFD 'Cafe\\u{{301}}' must deduplicate to 1 entity after NFKC normalization"
1465| | );
1466| 1| }
1467| |
1468| | #[test]
1469| 1| fn extraction_method_regex_only_unchanged() {
1470| | // RegexExtractor always returns "regex-only" regardless of GLINER_MODEL state.
1471| | // This guards against accidentally changing the regex-only fallback string.
1472| 1| let result = RegexExtractor.extract("contact: dev@acme.io").unwrap();
1473| 1| assert_eq!(
1474| | result.extraction_method, "regex-only",
1475| 0| "RegexExtractor must return regex-only"
1476| | );
1477| 1| }
1478| |
1479| | // --- P2-E: extend_with_numeric_suffix alphanumeric suffix ---
1480| |
1481| | #[test]
1482| 1| fn extend_suffix_pure_numeric_unchanged() {
1483| | // Existing behaviour: pure-numeric suffix must still work after P2-E.
1484| 1| let ents = vec![ExtractedEntity {
1485| 1| name: "GPT".to_string(),
1486| 1| entity_type: EntityType::Concept,
1487| 1| }];
1488| 1| let result = extend_with_numeric_suffix(ents, "using GPT-5 in the project");
1489| 1| assert_eq!(
1490| 1| result[0].name, "GPT-5",
1491| 0| "purely numeric suffix must be extended"
1492| | );
1493| 1| }
1494| |
1495| | #[test]
1496| 1| fn extend_suffix_alphanumeric_letter_after_digit() {
1497| | // P2-E: "4o" suffix (digit + lowercase letter) must be captured.
1498| 1| let ents = vec![ExtractedEntity {
1499| 1| name: "GPT".to_string(),
1500| 1| entity_type: EntityType::Concept,
1501| 1| }];
1502| 1| let result = extend_with_numeric_suffix(ents, "using GPT-4o for advanced tasks");
1503| 1| assert_eq!(result[0].name, "GPT-4o", "suffix '4o' must be accepted");
^0
1504| 1| }
1505| |
1506| | #[test]
1507| 1| fn extend_suffix_alphanumeric_b_suffix() {
1508| | // P2-E: "5b" suffix (digit + 'b') must be captured.
1509| 1| let ents = vec![ExtractedEntity {
1510| 1| name: "Llama".to_string(),
1511| 1| entity_type: EntityType::Concept,
1512| 1| }];
1513| 1| let result = extend_with_numeric_suffix(ents, "Llama-5b open-weight model");
1514| 1| assert_eq!(result[0].name, "Llama-5b", "suffix '5b' must be accepted");
^0
1515| 1| }
1516| |
1517| | #[test]
1518| 1| fn extend_suffix_alphanumeric_x_suffix() {
1519| | // P2-E: "8x" suffix (digit + 'x') must be captured.
1520| 1| let ents = vec![ExtractedEntity {
1521| 1| name: "Mistral".to_string(),
1522| 1| entity_type: EntityType::Concept,
1523| 1| }];
1524| 1| let result = extend_with_numeric_suffix(ents, "testing Mistral-8x in production");
1525| 1| assert_eq!(result[0].name, "Mistral-8x", "suffix '8x' must be accepted");
^0
1526| 1| }
1527| |
1528| | // --- P2-D: augment_versioned_model_names extended regex ---
1529| |
1530| | #[test]
1531| 1| fn augment_versioned_gpt4o() {
1532| | // P2-D: "GPT-4o" must be captured with alphanumeric suffix.
1533| 1| let result = augment_versioned_model_names(vec![], "using GPT-4o for analysis");
1534| 1| assert!(
1535| 1| result.iter().any(|e| e.name == "GPT-4o"),
1536| 0| "GPT-4o must be captured by augment, found: {:?}",
1537| 0| result.iter().map(|e| &e.name).collect::<Vec<_>>()
1538| | );
1539| 1| }
1540| |
1541| | #[test]
1542| 1| fn augment_versioned_claude_4_sonnet() {
1543| | // P2-D: "Claude 4 Sonnet" must be captured with release tier.
1544| 1| let result =
1545| 1| augment_versioned_model_names(vec![], "best model: Claude 4 Sonnet released today");
1546| 1| assert!(
1547| 1| result.iter().any(|e| e.name == "Claude 4 Sonnet"),
1548| 0| "Claude 4 Sonnet must be captured, found: {:?}",
1549| 0| result.iter().map(|e| &e.name).collect::<Vec<_>>()
1550| | );
1551| 1| }
1552| |
1553| | #[test]
1554| 1| fn augment_versioned_llama_3_pro() {
1555| | // P2-D: "Llama 3 Pro" must be captured with release tier.
1556| 1| let result =
1557| 1| augment_versioned_model_names(vec![], "fine-tuning com Llama 3 Pro localmente");
1558| 1| assert!(
1559| 1| result.iter().any(|e| e.name == "Llama 3 Pro"),
1560| 0| "Llama 3 Pro deve ser capturado, achados: {:?}",
1561| 0| result.iter().map(|e| &e.name).collect::<Vec<_>>()
1562| | );
1563| 1| }
1564| |
1565| | #[test]
1566| 1| fn augment_versioned_mixtral_8x7b() {
1567| | // P2-D: "Mixtral 8x7B" composite version must be captured.
1568| 1| let result =
1569| 1| augment_versioned_model_names(vec![], "executando Mixtral 8x7B no servidor local");
1570| 1| assert!(
1571| 1| result.iter().any(|e| e.name == "Mixtral 8x7B"),
1572| 0| "Mixtral 8x7B deve ser capturado, achados: {:?}",
1573| 0| result.iter().map(|e| &e.name).collect::<Vec<_>>()
1574| | );
1575| 1| }
1576| |
1577| | #[test]
1578| 1| fn augment_versioned_does_not_duplicate_existing() {
1579| | // P2-D back-compat: entities already present must not be duplicated.
1580| 1| let existing = vec![ExtractedEntity {
1581| 1| name: "Claude 4".to_string(),
1582| 1| entity_type: EntityType::Concept,
1583| 1| }];
1584| 1| let result = augment_versioned_model_names(existing, "using Claude 4 in the project");
1585| 1| let count = result.iter().filter(|e| e.name == "Claude 4").count();
1586| 1| assert_eq!(count, 1, "Claude 4 must not be duplicated");
^0
1587| 1| }
1588| |
1589| | // ── v1.0.25 P0-4: new stopwords (API, CLI, HTTP, HTTPS, JWT, LLM, REST, UI, URL) ──
1590| |
1591| | #[test]
1592| 1| fn stopwords_filter_url_jwt_api_v1025() {
1593| | // Verify that v1.0.25 tech-acronym stopwords do not leak as entities.
1594| 1| let body = "We use URL, JWT, and API REST in our LLM-powered CLI via HTTP/HTTPS and UI.";
1595| 1| let ents = apply_regex_prefilter(body);
1596| 1| let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
^0 ^0
1597| 10| for blocked in &[
^9
1598| 10| "URL", "JWT", "API", "REST", "LLM", "CLI", "HTTP", "HTTPS", "UI",
1599| 10| ] {
1600| 9| assert!(
1601| 9| !names.contains(blocked),
1602| 0| "v1.0.25 stopword {blocked} leaked as entity; found names: {names:?}"
1603| | );
1604| | }
1605| 1| }
1606| |
1607| | // ── v1.0.25 P0-4: section-marker regex strips "Etapa N", "Fase N", etc. ──
1608| |
1609| | #[test]
1610| 1| fn section_markers_etapa_fase_filtered_v1025() {
1611| | // "Etapa 3" and "Fase 1" are document-structure labels, not entities.
1612| | // Body intentionally uses PT-BR section keywords (Etapa/Fase/Migra\u{e7}\u{e3}o) to
1613| | // exercise the PT-BR section-marker filter. ASCII-escaped per the project policy.
1614| 1| let body = "Etapa 3 do plano: implementar Fase 1 da Migra\u{e7}\u{e3}o.";
1615| 1| let ents = apply_regex_prefilter(body);
1616| 1| assert!(
1617| 1| !ents
1618| 1| .iter()
1619| 1| .any(|e| e.name.contains("Etapa") || e.name.contains("Fase")),
^0 ^0
1620| 0| "section markers must be stripped; entities: {:?}",
1621| 0| ents.iter().map(|e| &e.name).collect::<Vec<_>>()
1622| | );
1623| 1| }
1624| |
1625| | #[test]
1626| 1| fn section_markers_passo_secao_filtered_v1025() {
1627| | // PT-BR keywords Passo/Se\u{e7}\u{e3}o written with Unicode escapes per the
1628| | // project language policy.
1629| 1| let body = "Siga Passo 2 conforme Se\u{e7}\u{e3}o 3 do manual.";
1630| 1| let ents = apply_regex_prefilter(body);
1631| 1| assert!(
1632| 1| !ents
1633| 1| .iter()
1634| 1| .any(|e| e.name.contains("Passo") || e.name.contains("Se\u{e7}\u{e3}o")),
^0 ^0
1635| 0| "Passo/Se\\u{{e7}}\\u{{e3}}o section markers must be stripped; entities: {:?}",
1636| 0| ents.iter().map(|e| &e.name).collect::<Vec<_>>()
1637| | );
1638| 1| }
1639| |
1640| | // ── v1.0.25 P0-2: CamelCase brand names extracted as organization ──
1641| |
1642| | #[test]
1643| 1| fn brand_camelcase_extracted_as_organization_v1025() {
1644| | // "OpenAI" is a CamelCase brand that NER model often misses.
1645| 1| let body = "OpenAI launched GPT-4 and PostgreSQL added pgvector.";
1646| 1| let ents = apply_regex_prefilter(body);
1647| 2| let openai = ents.iter().find(|e| e.name == "OpenAI");
^1 ^1 ^1
1648| 1| assert!(
1649| 1| openai.is_some(),
1650| 0| "OpenAI must be extracted by CamelCase brand regex; entities: {:?}",
1651| 0| ents.iter().map(|e| &e.name).collect::<Vec<_>>()
1652| | );
1653| 1| assert_eq!(
1654| 1| openai.unwrap().entity_type,
1655| | EntityType::Organization,
1656| 0| "brand CamelCase must map to organization (V008)"
1657| | );
1658| 1| }
1659| |
1660| | #[test]
1661| 1| fn brand_postgresql_extracted_as_organization_v1025() {
1662| 1| let body = "migrating from MySQL to PostgreSQL for better performance.";
1663| 1| let ents = apply_regex_prefilter(body);
1664| 1| assert!(
1665| 1| ents.iter()
1666| 2| .any(|e| e.name == "PostgreSQL" && e.entity_type == EntityType::Organization),
^1 ^1
1667| 0| "PostgreSQL must be extracted as organization; entities: {:?}",
1668| 0| ents.iter()
1669| 0| .map(|e| (&e.name, &e.entity_type))
1670| 0| .collect::<Vec<_>>()
1671| | );
1672| 1| }
1673| |
1674| | // --- P0-3 longest-wins v1.0.25 ---
1675| |
1676| 10| fn entity(name: &str, entity_type: EntityType) -> ExtractedEntity {
1677| 10| ExtractedEntity {
1678| 10| name: name.to_string(),
1679| 10| entity_type,
1680| 10| }
1681| 10| }
1682| |
1683| | #[test]
1684| 1| fn merge_resolves_sonne_vs_sonnet_keeps_longest_v1025() {
1685| | // "Sonne" is a substring of "Sonnet" — longest-wins must keep "Sonnet".
1686| 1| let regex = vec![entity("Sonne", EntityType::Concept)];
1687| 1| let ner = vec![entity("Sonnet", EntityType::Concept)];
1688| 1| let result = merge_and_deduplicate(regex, ner);
1689| 1| assert_eq!(result.len(), 1, "expected 1 entity, got: {result:?}");
^0
1690| 1| assert_eq!(result[0].name, "Sonnet");
1691| 1| }
1692| |
1693| | #[test]
1694| 1| fn merge_resolves_open_vs_openai_keeps_longest_v1025() {
1695| | // "Open" is a substring of "OpenAI" — longest-wins must keep "OpenAI".
1696| 1| let regex = vec![
1697| 1| entity("Open", EntityType::Organization),
1698| 1| entity("OpenAI", EntityType::Organization),
1699| | ];
1700| 1| let result = merge_and_deduplicate(regex, vec![]);
1701| 1| assert_eq!(result.len(), 1, "expected 1 entity, got: {result:?}");
^0
1702| 1| assert_eq!(result[0].name, "OpenAI");
1703| 1| }
1704| |
1705| | #[test]
1706| 1| fn merge_keeps_both_when_no_containment_v1025() {
1707| | // "Alice" and "Bob" share no containment — both must be preserved.
1708| 1| let regex = vec![
1709| 1| entity("Alice", EntityType::Person),
1710| 1| entity("Bob", EntityType::Person),
1711| | ];
1712| 1| let result = merge_and_deduplicate(regex, vec![]);
1713| 1| assert_eq!(result.len(), 2, "expected 2 entities, got: {result:?}");
^0
1714| 1| }
1715| |
1716| | #[test]
1717| 1| fn merge_respects_entity_type_boundary_v1025() {
1718| | // Same name "Apple" but different types: both must survive independently.
1719| 1| let regex = vec![
1720| 1| entity("Apple", EntityType::Organization),
1721| 1| entity("Apple", EntityType::Concept),
1722| | ];
1723| 1| let result = merge_and_deduplicate(regex, vec![]);
1724| 1| assert_eq!(
1725| 1| result.len(),
1726| | 2,
1727| 0| "expected 2 entities (different types), got: {result:?}"
1728| | );
1729| 1| }
1730| |
1731| | #[test]
1732| 1| fn merge_case_insensitive_dedup_v1025() {
1733| | // "OpenAI" and "openai" are the same entity — deduplicate to exactly one.
1734| 1| let regex = vec![
1735| 1| entity("OpenAI", EntityType::Organization),
1736| 1| entity("openai", EntityType::Organization),
1737| | ];
1738| 1| let result = merge_and_deduplicate(regex, vec![]);
1739| 1| assert_eq!(
1740| 1| result.len(),
1741| | 1,
1742| 0| "expected 1 entity after case-insensitive dedup, got: {result:?}"
1743| | );
1744| 1| }
1745| |
1746| | // ── v1.0.31 A1: NER cap protects against pathological body sizes ──
1747| |
1748| | #[test]
1749| 1| fn extract_graph_auto_handles_large_body_under_30s() {
1750| | // Regression guard for the v1.0.31 A1 fix. A 80 KB body without real
1751| | // entities must complete in under 30 s; before the cap it took 5+ minutes.
1752| 1| let body = "x ".repeat(40_000);
1753| 1| let paths = make_paths();
1754| 1| let start = std::time::Instant::now();
1755| 1| let result = extract_graph_auto(&body, &paths, GlinerVariant::Int8)
1756| 1| .expect("extraction must not error");
1757| 1| let elapsed = start.elapsed();
1758| 1| assert!(
1759| 1| elapsed.as_secs() < 30,
1760| 0| "extract_graph_auto took {}s for 80 KB body (cap should keep it well under 30s)",
1761| 0| elapsed.as_secs()
1762| | );
1763| | // No real entities expected in synthetic body, but the call must succeed.
1764| 1| let _ = result.entities;
1765| 1| }
1766| |
1767| | // ── v1.0.31 A11: PT-BR uppercase noise must not leak as entities ──
1768| |
1769| | #[test]
1770| 1| fn pt_uppercase_stopwords_filtered_v1031() {
1771| 1| let body = "Para o ADAPTER funcionar com PROJETO em modo PASSIVA, devemos usar \
1772| 1| SOMENTE LEITURA conforme a REGRA OBRIGATORIA do EXEMPLO DEFAULT.";
1773| 1| let ents = apply_regex_prefilter(body);
1774| 1| let names: Vec<String> = ents.iter().map(|e| e.name.to_uppercase()).collect();
^0 ^0
1775| 10| for stop in &[
^9
1776| 10| "ADAPTER",
1777| 10| "PROJETO",
1778| 10| "PASSIVA",
1779| 10| "SOMENTE",
1780| 10| "LEITURA",
1781| 10| "REGRA",
1782| 10| "OBRIGATORIA",
1783| 10| "EXEMPLO",
1784| 10| "DEFAULT",
1785| 10| ] {
1786| 9| assert!(
1787| 9| !names.contains(&stop.to_string()),
1788| 0| "v1.0.31 A11 stoplist failed: {stop} leaked as entity; got names: {names:?}"
1789| | );
1790| | }
1791| 1| }
1792| |
1793| | #[test]
1794| 1| fn pt_underscored_identifier_preserved_v1031() {
1795| | // Identifiers with underscore must still pass through (FLOWAIPER_API_KEY,
1796| | // MAX_RETRY etc. are intentional entities, not noise).
1797| 1| let ents = apply_regex_prefilter("configure FLOWAIPER_API_KEY=foo and MAX_TIMEOUT=30");
1798| 2| let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
^1 ^1 ^1 ^1 ^1
1799| 1| assert!(names.contains(&"FLOWAIPER_API_KEY"));
1800| 1| assert!(names.contains(&"MAX_TIMEOUT"));
1801| 1| }
1802| |
1803| | // ── v1.0.31 A9: relationships only between entities co-occurring in same sentence ──
1804| |
1805| | #[test]
1806| 1| fn build_relationships_by_sentence_only_links_co_occurring_entities() {
1807| 1| let body = "Alice met Bob at the conference. Carol works alone in another room.";
1808| 1| let entities = vec![
1809| 1| NewEntity {
1810| 1| name: "Alice".to_string(),
1811| 1| entity_type: EntityType::Person,
1812| 1| description: None,
1813| 1| },
1814| 1| NewEntity {
1815| 1| name: "Bob".to_string(),
1816| 1| entity_type: EntityType::Person,
1817| 1| description: None,
1818| 1| },
1819| 1| NewEntity {
1820| 1| name: "Carol".to_string(),
1821| 1| entity_type: EntityType::Person,
1822| 1| description: None,
1823| 1| },
1824| | ];
1825| 1| let (rels, truncated) = build_relationships_by_sentence_cooccurrence(body, &entities);
1826| 1| assert!(!truncated);
1827| 1| assert_eq!(
1828| 1| rels.len(),
1829| | 1,
1830| 0| "only Alice/Bob should pair (same sentence); Carol is isolated"
1831| | );
1832| 1| let pair = (rels[0].source.as_str(), rels[0].target.as_str());
1833| 1| assert!(
1834| 1| matches!(pair, ("Alice", "Bob") | ("Bob", "Alice")),
^0 ^0
1835| 0| "unexpected pair {pair:?}"
1836| | );
1837| 1| }
1838| |
1839| | #[test]
1840| 1| fn build_relationships_by_sentence_returns_empty_for_single_entity() {
1841| 1| let body = "Alice is here.";
1842| 1| let entities = vec![NewEntity {
1843| 1| name: "Alice".to_string(),
1844| 1| entity_type: EntityType::Person,
1845| 1| description: None,
1846| 1| }];
1847| 1| let (rels, truncated) = build_relationships_by_sentence_cooccurrence(body, &entities);
1848| 1| assert!(rels.is_empty());
1849| 1| assert!(!truncated);
1850| 1| }
1851| |
1852| | #[test]
1853| 1| fn build_relationships_by_sentence_dedupes_pairs_across_sentences() {
1854| 1| let body = "Alice met Bob. Bob saw Alice again.";
1855| 1| let entities = vec![
1856| 1| NewEntity {
1857| 1| name: "Alice".to_string(),
1858| 1| entity_type: EntityType::Person,
1859| 1| description: None,
1860| 1| },
1861| 1| NewEntity {
1862| 1| name: "Bob".to_string(),
1863| 1| entity_type: EntityType::Person,
1864| 1| description: None,
1865| 1| },
1866| | ];
1867| 1| let (rels, _) = build_relationships_by_sentence_cooccurrence(body, &entities);
1868| 1| assert_eq!(
1869| 1| rels.len(),
1870| | 1,
1871| 0| "Alice/Bob pair must be emitted only once even when co-occurring in multiple sentences"
1872| | );
1873| 1| }
1874| |
1875| | #[test]
1876| 1| fn extraction_max_tokens_default_is_5000() {
1877| 1| std::env::remove_var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS");
1878| 1| assert_eq!(crate::constants::extraction_max_tokens(), 5_000);
1879| 1| }
1880| |
1881| | #[test]
1882| 1| fn extraction_max_tokens_env_override_clamped() {
1883| 1| std::env::set_var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS", "200");
1884| 1| assert_eq!(
1885| 1| crate::constants::extraction_max_tokens(),
1886| | 5_000,
1887| 0| "value below 512 must fall back to default"
1888| | );
1889| |
1890| 1| std::env::set_var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS", "200000");
1891| 1| assert_eq!(
1892| 1| crate::constants::extraction_max_tokens(),
1893| | 5_000,
1894| 0| "value above 100_000 must fall back to default"
1895| | );
1896| |
1897| 1| std::env::set_var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS", "8000");
1898| 1| assert_eq!(
1899| 1| crate::constants::extraction_max_tokens(),
1900| | 8_000,
1901| 0| "valid value must be honoured"
1902| | );
1903| |
1904| 1| std::env::remove_var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS");
1905| 1| }
1906| |
1907| | #[test]
1908| 1| fn gliner_variant_from_str_valid() {
1909| 1| assert_eq!(
1910| 1| "fp32".parse::<GlinerVariant>().unwrap(),
1911| | GlinerVariant::Fp32
1912| | );
1913| 1| assert_eq!(
1914| 1| "fp16".parse::<GlinerVariant>().unwrap(),
1915| | GlinerVariant::Fp16
1916| | );
1917| 1| assert_eq!(
1918| 1| "int8".parse::<GlinerVariant>().unwrap(),
1919| | GlinerVariant::Int8
1920| | );
1921| 1| assert_eq!("q4".parse::<GlinerVariant>().unwrap(), GlinerVariant::Q4);
1922| 1| assert_eq!(
1923| 1| "q4f16".parse::<GlinerVariant>().unwrap(),
1924| | GlinerVariant::Q4f16
1925| | );
1926| | // Case-insensitive
1927| 1| assert_eq!(
1928| 1| "FP32".parse::<GlinerVariant>().unwrap(),
1929| | GlinerVariant::Fp32
1930| | );
1931| 1| assert_eq!(
1932| 1| "INT8".parse::<GlinerVariant>().unwrap(),
1933| | GlinerVariant::Int8
1934| | );
1935| 1| }
1936| |
1937| | #[test]
1938| 1| fn gliner_variant_from_str_invalid() {
1939| 1| assert!("invalid".parse::<GlinerVariant>().is_err());
1940| 1| assert!("fp64".parse::<GlinerVariant>().is_err());
1941| 1| assert!("".parse::<GlinerVariant>().is_err());
1942| 1| }
1943| |
1944| | #[test]
1945| 1| fn gliner_variant_filename_mapping() {
1946| 1| assert_eq!(GlinerVariant::Fp32.as_filename(), "model.onnx");
1947| 1| assert_eq!(GlinerVariant::Fp16.as_filename(), "model_fp16.onnx");
1948| 1| assert_eq!(GlinerVariant::Int8.as_filename(), "model_quantized.onnx");
1949| 1| assert_eq!(GlinerVariant::Q4.as_filename(), "model_q4.onnx");
1950| 1| assert_eq!(GlinerVariant::Q4f16.as_filename(), "model_q4f16.onnx");
1951| 1| }
1952| |
1953| | #[test]
1954| 1| fn gliner_variant_display() {
1955| 1| assert_eq!(format!("{}", GlinerVariant::Fp32), "fp32");
1956| 1| assert_eq!(format!("{}", GlinerVariant::Fp16), "fp16");
1957| 1| assert_eq!(format!("{}", GlinerVariant::Int8), "int8");
1958| 1| assert_eq!(format!("{}", GlinerVariant::Q4), "q4");
1959| 1| assert_eq!(format!("{}", GlinerVariant::Q4f16), "q4f16");
1960| 1| }
1961| |
1962| | #[test]
1963| 1| fn gliner_variant_display_size() {
1964| 1| assert_eq!(GlinerVariant::Fp32.display_size(), "1.1 GB");
1965| 1| assert_eq!(GlinerVariant::Int8.display_size(), "349 MB");
1966| 1| }
1967| |
1968| | #[test]
1969| 1| fn gliner_entity_labels_covers_all_types() {
1970| 1| let label_types: Vec<EntityType> = GLINER_ENTITY_LABELS.iter().map(|(_, t)| *t).collect();
1971| 1| assert!(label_types.contains(&EntityType::Person));
1972| 1| assert!(label_types.contains(&EntityType::Organization));
1973| 1| assert!(label_types.contains(&EntityType::Location));
1974| 1| assert!(label_types.contains(&EntityType::Date));
1975| 1| assert!(label_types.contains(&EntityType::Project));
1976| 1| assert!(label_types.contains(&EntityType::Tool));
1977| 1| assert!(label_types.contains(&EntityType::File));
1978| 1| assert!(label_types.contains(&EntityType::Concept));
1979| 1| assert!(label_types.contains(&EntityType::Decision));
1980| 1| assert!(label_types.contains(&EntityType::Incident));
1981| 1| assert!(label_types.contains(&EntityType::Dashboard));
1982| 1| assert!(label_types.contains(&EntityType::IssueTracker));
1983| 1| assert!(label_types.contains(&EntityType::Memory));
1984| 1| assert_eq!(GLINER_ENTITY_LABELS.len(), 13);
1985| 1| }
1986| |
1987| | #[test]
1988| 1| fn gliner_entity_labels_no_duplicates() {
1989| 1| let mut seen = std::collections::HashSet::new();
1990| 14| for (label, _) in GLINER_ENTITY_LABELS {
^13
1991| 13| assert!(seen.insert(*label), "duplicate label: {label}");
^0
1992| | }
1993| 1| }
1994| |
1995| | #[test]
1996| 1| fn extract_graph_auto_regex_only_fallback() {
1997| | // extract_graph_auto must succeed and capture regex entities regardless of whether
1998| | // GLiNER model files exist in the test environment (GLINER_MODEL is a global OnceLock
1999| | // that may already be initialised by a sibling test, so we cannot assert on
2000| | // extraction_method; use RegexExtractor for that invariant).
2001| 1| let result = extract_graph_auto(
2002| 1| "Contact someone@test.com about OPENAI project",
2003| 1| &make_paths(),
2004| 1| GlinerVariant::Fp32,
2005| | );
2006| 1| assert!(result.is_ok());
2007| 1| let res = result.unwrap();
2008| | // Regex prefilter must always capture the email entity
2009| 1| assert!(res.entities.iter().any(|e| e.name == "someone@test.com"));
2010| | // extraction_method must be one of the two valid values
2011| 1| assert!(
2012| 1| res.extraction_method == "regex-only" || res.extraction_method.starts_with("gliner-"),
2013| 0| "unexpected extraction_method: {}",
2014| | res.extraction_method
2015| | );
2016| 1| }
2017| |
2018| | #[test]
2019| 1| fn gliner_variant_roundtrip() {
2020| 6| for variant in &[
^5
2021| 6| GlinerVariant::Fp32,
2022| 6| GlinerVariant::Fp16,
2023| 6| GlinerVariant::Int8,
2024| 6| GlinerVariant::Q4,
2025| 6| GlinerVariant::Q4f16,
2026| 6| ] {
2027| 5| let s = format!("{variant}");
2028| 5| let parsed: GlinerVariant = s.parse().unwrap();
2029| 5| assert_eq!(*variant, parsed);
2030| | }
2031| 1| }
2032| |}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/graph.rs:
1| |//! Entity graph traversal (BFS over memory_entities + relations).
2| |//!
3| |//! Queries the SQLite entity and relation tables to expand neighbourhood
4| |//! sets used by the `related` and `recall` commands.
5| |
6| |// src/graph.rs
7| |
8| |use crate::errors::AppError;
9| |use rusqlite::{params, Connection};
10| |
11| |/// Traverses the entity graph by BFS from seed memories.
12| |///
13| |/// Returns `memory_id`s reachable through entity and relationship edges,
14| |/// excluding the seeds themselves. The algorithm:
15| |/// 1. Collects entities associated with seeds via `memory_entities`.
16| |/// 2. Runs BFS over `relationships` filtered by `weight >= min_weight` and `namespace`.
17| |/// 3. Returns memories linked to discovered entities (excluding soft-deleted).
18| |///
19| |/// # Errors
20| |///
21| |/// Propagates [`AppError::Database`] (exit 10) on SQLite query failures.
22| |///
23| |/// # Examples
24| |///
25| |/// ```
26| |/// use rusqlite::Connection;
27| |/// use sqlite_graphrag::graph::traverse_from_memories;
28| |///
29| |/// // Empty seed list returns immediately without querying the database.
30| |/// let conn = Connection::open_in_memory().unwrap();
31| |/// let ids = traverse_from_memories(&conn, &[], "global", 0.5, 3).unwrap();
32| |/// assert!(ids.is_empty());
33| |/// ```
34| |///
35| |/// ```
36| |/// use rusqlite::Connection;
37| |/// use sqlite_graphrag::graph::traverse_from_memories;
38| |///
39| |/// // max_hops == 0 returns immediately without traversal.
40| |/// let conn = Connection::open_in_memory().unwrap();
41| |/// let ids = traverse_from_memories(&conn, &[1, 2], "global", 0.5, 0).unwrap();
42| |/// assert!(ids.is_empty());
43| |/// ```
44| 16|pub fn traverse_from_memories(
45| 16| conn: &Connection,
46| 16| seed_memory_ids: &[i64],
47| 16| namespace: &str,
48| 16| min_weight: f64,
49| 16| max_hops: u32,
50| 16|) -> Result<Vec<i64>, AppError> {
51| 16| if seed_memory_ids.is_empty() || max_hops == 0 {
^15
52| 2| return Ok(vec![]);
53| 14| }
54| |
55| | // Step 1: collect seed entity IDs from seed memories
56| 14| let mut seed_entities: Vec<i64> = Vec::with_capacity(seed_memory_ids.len());
57| 29| for &mem_id in seed_memory_ids {
^15
58| 15| let mut stmt =
59| 15| conn.prepare_cached("SELECT entity_id FROM memory_entities WHERE memory_id = ?1")?;
^0
60| 15| let ids: Vec<i64> = stmt
61| 15| .query_map(params![mem_id], |r| r.get(0))?
^0
62| 15| .filter_map(|r| r.ok())
63| 15| .collect();
64| 15| seed_entities.extend(ids);
65| | }
66| 14| seed_entities.sort_unstable();
67| 14| seed_entities.dedup();
68| |
69| 14| if seed_entities.is_empty() {
70| 1| return Ok(vec![]);
71| 13| }
72| |
73| | // Step 2: BFS over relationships
74| | use std::collections::HashSet;
75| 13| let mut visited: HashSet<i64> = seed_entities.iter().copied().collect();
76| 13| let mut frontier: Vec<i64> = seed_entities.to_vec();
77| |
78| 13| for _ in 0..max_hops {
79| 25| if frontier.is_empty() {
80| 7| break;
81| 18| }
82| 18| let mut next_frontier = Vec::with_capacity(frontier.len() * 2);
83| |
84| 38| for &entity_id in &frontier {
^20
85| 20| let mut stmt = conn.prepare_cached(
86| 20| "SELECT target_id FROM relationships
87| 20| WHERE source_id = ?1 AND weight >= ?2 AND namespace = ?3",
88| 0| )?;
89| 20| let neighbors: Vec<i64> = stmt
90| 20| .query_map(params![entity_id, min_weight, namespace], |r| r.get(0))?
^15^15 ^0
91| 20| .filter_map(|r| r.ok())
^15^15
92| 20| .filter(|id| !visited.contains(id))
^15 ^15 ^15
93| 20| .collect();
94| |
95| 32| for id in neighbors {
^12
96| 12| visited.insert(id);
97| 12| next_frontier.push(id);
98| 12| }
99| | }
100| 18| frontier = next_frontier;
101| | }
102| |
103| | // Step 3: find memories connected to traversed entities (excluding seeds)
104| 13| let seed_set: HashSet<i64> = seed_memory_ids.iter().copied().collect();
105| 13| let graph_only_entities: Vec<i64> = visited
106| 13| .into_iter()
107| 27| .filter(|id| !seed_entities.contains(id))
^13
108| 13| .collect();
109| |
110| 13| let mut result_ids: Vec<i64> = Vec::with_capacity(graph_only_entities.len());
111| 25| for &entity_id in &graph_only_entities {
^12
112| 12| let mut stmt = conn.prepare_cached(
113| 12| "SELECT DISTINCT me.memory_id
114| 12| FROM memory_entities me
115| 12| JOIN memories m ON m.id = me.memory_id
116| 12| WHERE me.entity_id = ?1 AND m.deleted_at IS NULL",
117| 0| )?;
118| 12| let mem_ids: Vec<i64> = stmt
119| 12| .query_map(params![entity_id], |r| r.get(0))?
^11^11 ^0
120| 12| .filter_map(|r| r.ok())
^11^11
121| 12| .filter(|id| !seed_set.contains(id))
^11 ^11 ^11
122| 12| .collect();
123| 12| result_ids.extend(mem_ids);
124| | }
125| |
126| 13| result_ids.sort_unstable();
127| 13| result_ids.dedup();
128| 13| Ok(result_ids)
129| 16|}
130| |
131| |/// BFS graph traversal that also returns the hop distance for each reached memory.
132| |///
133| |/// Identical to [`traverse_from_memories`] but returns `(memory_id, hop_count)` tuples
134| |/// instead of bare IDs. `hop_count` is the BFS depth at which the entity was first
135| |/// discovered, starting from 1 for direct neighbours of the seed entities.
136| |///
137| |/// When `max_neighbors_per_hop` is `Some(k)`, only the top-`k` neighbours by
138| |/// `weight DESC` are followed at each entity expansion. Pass `None` to retain
139| |/// the original behaviour (all neighbours above `min_weight` are followed).
140| |///
141| |/// # Errors
142| |///
143| |/// Propagates [`AppError::Database`] (exit 10) on SQLite query failures.
144| 0|pub fn traverse_from_memories_with_hops(
145| 0| conn: &Connection,
146| 0| seed_memory_ids: &[i64],
147| 0| namespace: &str,
148| 0| min_weight: f64,
149| 0| max_hops: u32,
150| 0|) -> Result<Vec<(i64, u32)>, AppError> {
151| 0| traverse_from_memories_with_hops_inner(
152| 0| conn,
153| 0| seed_memory_ids,
154| 0| namespace,
155| 0| min_weight,
156| 0| max_hops,
157| 0| None,
158| | )
159| 0|}
160| |
161| |/// Extended variant that accepts an optional neighbour cap per hop.
162| |///
163| |/// Pass `max_neighbors_per_hop = Some(k)` to prune each entity's expansion to
164| |/// its top-`k` neighbours by edge weight, limiting combinatorial blow-up in
165| |/// dense graphs. `None` is equivalent to the public
166| |/// [`traverse_from_memories_with_hops`] function.
167| |///
168| |/// # Errors
169| |///
170| |/// Propagates [`AppError::Database`] (exit 10) on SQLite query failures.
171| 0|pub fn traverse_from_memories_with_hops_capped(
172| 0| conn: &Connection,
173| 0| seed_memory_ids: &[i64],
174| 0| namespace: &str,
175| 0| min_weight: f64,
176| 0| max_hops: u32,
177| 0| max_neighbors_per_hop: Option<usize>,
178| 0|) -> Result<Vec<(i64, u32)>, AppError> {
179| 0| traverse_from_memories_with_hops_inner(
180| 0| conn,
181| 0| seed_memory_ids,
182| 0| namespace,
183| 0| min_weight,
184| 0| max_hops,
185| 0| max_neighbors_per_hop,
186| | )
187| 0|}
188| |
189| 0|fn traverse_from_memories_with_hops_inner(
190| 0| conn: &Connection,
191| 0| seed_memory_ids: &[i64],
192| 0| namespace: &str,
193| 0| min_weight: f64,
194| 0| max_hops: u32,
195| 0| max_neighbors_per_hop: Option<usize>,
196| 0|) -> Result<Vec<(i64, u32)>, AppError> {
197| 0| if seed_memory_ids.is_empty() || max_hops == 0 {
198| 0| return Ok(vec![]);
199| 0| }
200| |
201| | // Collect seed entity IDs from seed memories
202| 0| let mut seed_entities: Vec<i64> = Vec::with_capacity(seed_memory_ids.len());
203| 0| for &mem_id in seed_memory_ids {
204| 0| let mut stmt =
205| 0| conn.prepare_cached("SELECT entity_id FROM memory_entities WHERE memory_id = ?1")?;
206| 0| let ids: Vec<i64> = stmt
207| 0| .query_map(params![mem_id], |r| r.get(0))?
208| 0| .filter_map(|r| r.ok())
209| 0| .collect();
210| 0| seed_entities.extend(ids);
211| | }
212| 0| seed_entities.sort_unstable();
213| 0| seed_entities.dedup();
214| |
215| 0| if seed_entities.is_empty() {
216| 0| return Ok(vec![]);
217| 0| }
218| |
219| | // BFS over relationships, tracking depth per entity
220| | use std::collections::HashMap;
221| 0| let mut entity_depth: HashMap<i64, u32> = seed_entities.iter().map(|&id| (id, 0)).collect();
222| 0| let mut frontier: Vec<i64> = seed_entities.to_vec();
223| |
224| 0| for hop in 1..=max_hops {
225| 0| if frontier.is_empty() {
226| 0| break;
227| 0| }
228| 0| let mut next_frontier = Vec::with_capacity(frontier.len() * 2);
229| |
230| 0| for &entity_id in &frontier {
231| | // Fetch neighbours ordered by weight DESC to support capping.
232| 0| let mut stmt = conn.prepare_cached(
233| 0| "SELECT target_id, weight FROM relationships
234| 0| WHERE source_id = ?1 AND weight >= ?2 AND namespace = ?3
235| 0| ORDER BY weight DESC",
236| 0| )?;
237| 0| let mut neighbors: Vec<i64> = stmt
238| 0| .query_map(params![entity_id, min_weight, namespace], |r| {
239| 0| Ok((r.get::<_, i64>(0)?, r.get::<_, f64>(1)?))
240| 0| })?
241| 0| .filter_map(|r| r.ok())
242| 0| .filter(|(id, _)| !entity_depth.contains_key(id))
243| 0| .map(|(id, _)| id)
244| 0| .collect();
245| |
246| | // Apply optional per-hop neighbour cap.
247| 0| if let Some(cap) = max_neighbors_per_hop {
248| 0| neighbors.truncate(cap);
249| 0| }
250| |
251| 0| for id in neighbors {
252| 0| entity_depth.insert(id, hop);
253| 0| next_frontier.push(id);
254| 0| }
255| | }
256| 0| frontier = next_frontier;
257| | }
258| |
259| | // Find memories connected to traversed entities (excluding seeds), preserving hop depth
260| 0| let seed_set: std::collections::HashSet<i64> = seed_memory_ids.iter().copied().collect();
261| 0| let seed_entity_set: std::collections::HashSet<i64> = seed_entities.iter().copied().collect();
262| |
263| 0| let mut result: Vec<(i64, u32)> = Vec::with_capacity(entity_depth.len());
264| 0| let mut seen_memories: std::collections::HashSet<i64> =
265| 0| std::collections::HashSet::with_capacity(entity_depth.len());
266| |
267| 0| for (&entity_id, &hop) in &entity_depth {
268| 0| if seed_entity_set.contains(&entity_id) {
269| 0| continue;
270| 0| }
271| 0| let mut stmt = conn.prepare_cached(
272| 0| "SELECT DISTINCT me.memory_id
273| 0| FROM memory_entities me
274| 0| JOIN memories m ON m.id = me.memory_id
275| 0| WHERE me.entity_id = ?1 AND m.deleted_at IS NULL",
276| 0| )?;
277| 0| let mem_ids: Vec<i64> = stmt
278| 0| .query_map(params![entity_id], |r| r.get(0))?
279| 0| .filter_map(|r| r.ok())
280| 0| .filter(|id| !seed_set.contains(id) && !seen_memories.contains(id))
281| 0| .collect();
282| |
283| 0| for mem_id in mem_ids {
284| 0| seen_memories.insert(mem_id);
285| 0| result.push((mem_id, hop));
286| 0| }
287| | }
288| |
289| 0| result.sort_unstable_by_key(|&(id, _)| id);
290| 0| Ok(result)
291| 0|}
292| |
293| |/// Depth map from BFS: entity_id → hop distance from seeds.
294| |pub type EntityDepthMap = std::collections::HashMap<i64, u32>;
295| |
296| |/// Predecessor map from BFS: entity_id → (parent_entity_id, relation_type, edge_weight).
297| |///
298| |/// Enables path reconstruction from any discovered entity back to a seed.
299| |pub type PredecessorMap = std::collections::HashMap<i64, (i64, String, f64)>;
300| |
301| |/// BFS that also returns a predecessor map for path reconstruction.
302| |///
303| |/// Used by `deep-research` to reconstruct directed evidence chains from
304| |/// discovered entities back to their seeds.
305| |///
306| |/// Returns `(entity_depth, predecessor)` where:
307| |/// - `entity_depth`: depth of each reached entity (0 = seed).
308| |/// - `predecessor`: the BFS tree edge that first reached each non-seed entity.
309| |///
310| |/// # Errors
311| |///
312| |/// Propagates [`AppError::Database`] (exit 10) on SQLite query failures.
313| 2|pub fn bfs_with_predecessors(
314| 2| conn: &Connection,
315| 2| seed_entity_ids: &[i64],
316| 2| namespace: &str,
317| 2| min_weight: f64,
318| 2| max_hops: u32,
319| 2| max_neighbors_per_hop: Option<usize>,
320| 2|) -> Result<(EntityDepthMap, PredecessorMap), AppError> {
321| | use std::collections::HashMap;
322| |
323| 2| let mut entity_depth: HashMap<i64, u32> = seed_entity_ids.iter().map(|&id| (id, 0)).collect();
324| 2| let mut predecessor: HashMap<i64, (i64, String, f64)> =
325| 2| HashMap::with_capacity(max_hops as usize * 10);
326| 2| let mut frontier: Vec<i64> = seed_entity_ids.to_vec();
327| |
328| 2| for hop in 1..=max_hops {
329| 2| if frontier.is_empty() {
330| 0| break;
331| 2| }
332| 2| let mut next_frontier = Vec::with_capacity(frontier.len() * 2);
333| |
334| 4| for &entity_id in &frontier {
^2
335| 2| let mut stmt = conn.prepare_cached(
336| 2| "SELECT target_id, relation, weight FROM relationships
337| 2| WHERE source_id = ?1 AND weight >= ?2 AND namespace = ?3
338| 2| ORDER BY weight DESC",
339| 0| )?;
340| 2| let mut neighbors: Vec<(i64, String, f64)> = stmt
341| 10| .query_map(params![entity_id, min_weight, namespace], |r| {
^2 ^2
342| | Ok((
343| 10| r.get::<_, i64>(0)?,
^0
344| 10| r.get::<_, String>(1)?,
^0
345| 10| r.get::<_, f64>(2)?,
^0
346| | ))
347| 10| })?
^0
348| 10| .filter_map(|r| r.ok())
^2
349| 10| .filter(|(id, _, _)| !entity_depth.contains_key(id))
^2
350| 2| .collect();
351| |
352| 2| if let Some(cap) = max_neighbors_per_hop {
^1
353| 1| neighbors.truncate(cap);
354| 1| }
355| |
356| 9| for (id, relation, weight) in neighbors {
^7 ^7 ^7
357| 7| entity_depth.insert(id, hop);
358| 7| predecessor.insert(id, (entity_id, relation, weight));
359| 7| next_frontier.push(id);
360| 7| }
361| | }
362| 2| frontier = next_frontier;
363| | }
364| |
365| 2| Ok((entity_depth, predecessor))
366| 2|}
367| |
368| |#[cfg(test)]
369| |mod tests {
370| | use super::*;
371| | use rusqlite::Connection;
372| |
373| 16| fn setup_db() -> Connection {
374| 16| let conn = Connection::open_in_memory().unwrap();
375| 16| conn.execute_batch(
376| 16| "CREATE TABLE memories (
377| 16| id INTEGER PRIMARY KEY,
378| 16| namespace TEXT NOT NULL,
379| 16| deleted_at TEXT
380| 16| );
381| 16| CREATE TABLE memory_entities (
382| 16| memory_id INTEGER NOT NULL,
383| 16| entity_id INTEGER NOT NULL
384| 16| );
385| 16| CREATE TABLE relationships (
386| 16| source_id INTEGER NOT NULL,
387| 16| target_id INTEGER NOT NULL,
388| 16| weight REAL NOT NULL,
389| 16| namespace TEXT NOT NULL
390| 16| );",
391| | )
392| 16| .unwrap();
393| 16| conn
394| 16| }
395| |
396| 31| fn insert_memory(conn: &Connection, id: i64, namespace: &str, deleted: bool) {
397| 31| conn.execute(
398| 31| "INSERT INTO memories (id, namespace, deleted_at) VALUES (?1, ?2, ?3)",
399| 31| params![
400| | id,
401| | namespace,
402| 31| if deleted { Some("2024-01-01") } else { None }
^1 ^30
403| | ],
404| | )
405| 31| .unwrap();
406| 31| }
407| |
408| 31| fn link_memory_entity(conn: &Connection, memory_id: i64, entity_id: i64) {
409| 31| conn.execute(
410| 31| "INSERT INTO memory_entities (memory_id, entity_id) VALUES (?1, ?2)",
411| 31| params![memory_id, entity_id],
412| | )
413| 31| .unwrap();
414| 31| }
415| |
416| 18| fn insert_relationship(conn: &Connection, src: i64, tgt: i64, weight: f64, ns: &str) {
417| 18| conn.execute(
418| 18| "INSERT INTO relationships (source_id, target_id, weight, namespace) VALUES (?1, ?2, ?3, ?4)",
419| 18| params![src, tgt, weight, ns],
420| | )
421| 18| .unwrap();
422| 18| }
423| |
424| | // --- edge cases retornando vazio ---
425| |
426| | #[test]
427| 1| fn returns_empty_when_seeds_empty() {
428| 1| let conn = setup_db();
429| 1| let result = traverse_from_memories(&conn, &[], "ns", 0.5, 3).unwrap();
430| 1| assert!(result.is_empty());
431| 1| }
432| |
433| | #[test]
434| 1| fn returns_empty_when_max_hops_zero() {
435| 1| let conn = setup_db();
436| 1| insert_memory(&conn, 1, "ns", false);
437| 1| link_memory_entity(&conn, 1, 10);
438| 1| let result = traverse_from_memories(&conn, &[1], "ns", 0.5, 0).unwrap();
439| 1| assert!(result.is_empty());
440| 1| }
441| |
442| | #[test]
443| 1| fn returns_empty_when_seed_has_no_entities() {
444| 1| let conn = setup_db();
445| 1| insert_memory(&conn, 1, "ns", false);
446| | // memory exists but has no associated entities
447| 1| let result = traverse_from_memories(&conn, &[1], "ns", 0.5, 3).unwrap();
448| 1| assert!(result.is_empty());
449| 1| }
450| |
451| | #[test]
452| 1| fn returns_empty_when_no_relationships() {
453| 1| let conn = setup_db();
454| 1| insert_memory(&conn, 1, "ns", false);
455| 1| link_memory_entity(&conn, 1, 10);
456| | // entity 10 has no relationships
457| 1| let result = traverse_from_memories(&conn, &[1], "ns", 0.5, 3).unwrap();
458| 1| assert!(result.is_empty());
459| 1| }
460| |
461| | // --- basic happy path ---
462| |
463| | #[test]
464| 1| fn traversal_basic_one_hop() {
465| 1| let conn = setup_db();
466| |
467| | // seed: memory 1 com entity 10
468| 1| insert_memory(&conn, 1, "ns", false);
469| 1| link_memory_entity(&conn, 1, 10);
470| |
471| | // vizinha: entity 20 ligada a memory 2
472| 1| insert_memory(&conn, 2, "ns", false);
473| 1| link_memory_entity(&conn, 2, 20);
474| |
475| | // relacionamento 10 -> 20
476| 1| insert_relationship(&conn, 10, 20, 1.0, "ns");
477| |
478| 1| let result = traverse_from_memories(&conn, &[1], "ns", 0.5, 1).unwrap();
479| 1| assert_eq!(result, vec![2]);
480| 1| }
481| |
482| | #[test]
483| 1| fn traversal_two_hops() {
484| 1| let conn = setup_db();
485| |
486| 1| insert_memory(&conn, 1, "ns", false);
487| 1| link_memory_entity(&conn, 1, 10);
488| |
489| 1| insert_memory(&conn, 2, "ns", false);
490| 1| link_memory_entity(&conn, 2, 20);
491| |
492| 1| insert_memory(&conn, 3, "ns", false);
493| 1| link_memory_entity(&conn, 3, 30);
494| |
495| | // cadeia 10 -> 20 -> 30
496| 1| insert_relationship(&conn, 10, 20, 1.0, "ns");
497| 1| insert_relationship(&conn, 20, 30, 1.0, "ns");
498| |
499| 1| let mut result = traverse_from_memories(&conn, &[1], "ns", 0.5, 2).unwrap();
500| 1| result.sort_unstable();
501| 1| assert_eq!(result, vec![2, 3]);
502| 1| }
503| |
504| | #[test]
505| 1| fn max_hops_limits_depth() {
506| 1| let conn = setup_db();
507| |
508| 1| insert_memory(&conn, 1, "ns", false);
509| 1| link_memory_entity(&conn, 1, 10);
510| |
511| 1| insert_memory(&conn, 2, "ns", false);
512| 1| link_memory_entity(&conn, 2, 20);
513| |
514| 1| insert_memory(&conn, 3, "ns", false);
515| 1| link_memory_entity(&conn, 3, 30);
516| |
517| 1| insert_relationship(&conn, 10, 20, 1.0, "ns");
518| 1| insert_relationship(&conn, 20, 30, 1.0, "ns");
519| |
520| | // with only 1 hop, memory 3 must not appear
521| 1| let result = traverse_from_memories(&conn, &[1], "ns", 0.5, 1).unwrap();
522| 1| assert_eq!(result, vec![2]);
523| 1| assert!(!result.contains(&3));
524| 1| }
525| |
526| | // --- filtro de peso ---
527| |
528| | #[test]
529| 1| fn relationship_with_weight_below_min_ignored() {
530| 1| let conn = setup_db();
531| |
532| 1| insert_memory(&conn, 1, "ns", false);
533| 1| link_memory_entity(&conn, 1, 10);
534| |
535| 1| insert_memory(&conn, 2, "ns", false);
536| 1| link_memory_entity(&conn, 2, 20);
537| |
538| | // peso 0.3 < min_weight 0.5
539| 1| insert_relationship(&conn, 10, 20, 0.3, "ns");
540| |
541| 1| let result = traverse_from_memories(&conn, &[1], "ns", 0.5, 3).unwrap();
542| 1| assert!(result.is_empty());
543| 1| }
544| |
545| | #[test]
546| 1| fn relationship_with_weight_exactly_at_min_included() {
547| 1| let conn = setup_db();
548| |
549| 1| insert_memory(&conn, 1, "ns", false);
550| 1| link_memory_entity(&conn, 1, 10);
551| |
552| 1| insert_memory(&conn, 2, "ns", false);
553| 1| link_memory_entity(&conn, 2, 20);
554| |
555| 1| insert_relationship(&conn, 10, 20, 0.5, "ns");
556| |
557| 1| let result = traverse_from_memories(&conn, &[1], "ns", 0.5, 1).unwrap();
558| 1| assert_eq!(result, vec![2]);
559| 1| }
560| |
561| | // --- isolamento de namespace ---
562| |
563| | #[test]
564| 1| fn relationship_from_different_namespace_ignored() {
565| 1| let conn = setup_db();
566| |
567| 1| insert_memory(&conn, 1, "ns_a", false);
568| 1| link_memory_entity(&conn, 1, 10);
569| |
570| 1| insert_memory(&conn, 2, "ns_a", false);
571| 1| link_memory_entity(&conn, 2, 20);
572| |
573| | // relacionamento no namespace errado
574| 1| insert_relationship(&conn, 10, 20, 1.0, "ns_b");
575| |
576| 1| let result = traverse_from_memories(&conn, &[1], "ns_a", 0.5, 3).unwrap();
577| 1| assert!(result.is_empty());
578| 1| }
579| |
580| | // --- exclude seeds from result ---
581| |
582| | #[test]
583| 1| fn seeds_do_not_appear_in_result() {
584| 1| let conn = setup_db();
585| |
586| 1| insert_memory(&conn, 1, "ns", false);
587| 1| link_memory_entity(&conn, 1, 10);
588| |
589| 1| insert_memory(&conn, 2, "ns", false);
590| 1| link_memory_entity(&conn, 2, 20);
591| |
592| | // relacionamento de 20 de volta para 10 (ciclo)
593| 1| insert_relationship(&conn, 10, 20, 1.0, "ns");
594| 1| insert_relationship(&conn, 20, 10, 1.0, "ns");
595| |
596| 1| let result = traverse_from_memories(&conn, &[1], "ns", 0.5, 3).unwrap();
597| | // memory 1 must not appear even with a cycle
598| 1| assert!(!result.contains(&1));
599| 1| assert_eq!(result, vec![2]);
600| 1| }
601| |
602| | // --- soft-deleted memories excluded ---
603| |
604| | #[test]
605| 1| fn deleted_memories_not_included() {
606| 1| let conn = setup_db();
607| |
608| 1| insert_memory(&conn, 1, "ns", false);
609| 1| link_memory_entity(&conn, 1, 10);
610| |
611| | // memory 2 foi deletada
612| 1| insert_memory(&conn, 2, "ns", true);
613| 1| link_memory_entity(&conn, 2, 20);
614| |
615| 1| insert_relationship(&conn, 10, 20, 1.0, "ns");
616| |
617| 1| let result = traverse_from_memories(&conn, &[1], "ns", 0.5, 3).unwrap();
618| 1| assert!(result.is_empty());
619| 1| }
620| |
621| | // --- multiple seeds ---
622| |
623| | #[test]
624| 1| fn multiple_seeds_merged_in_result() {
625| 1| let conn = setup_db();
626| |
627| 1| insert_memory(&conn, 1, "ns", false);
628| 1| link_memory_entity(&conn, 1, 10);
629| |
630| 1| insert_memory(&conn, 2, "ns", false);
631| 1| link_memory_entity(&conn, 2, 20);
632| |
633| 1| insert_memory(&conn, 3, "ns", false);
634| 1| link_memory_entity(&conn, 3, 30);
635| |
636| 1| insert_memory(&conn, 4, "ns", false);
637| 1| link_memory_entity(&conn, 4, 40);
638| |
639| 1| insert_relationship(&conn, 10, 30, 1.0, "ns");
640| 1| insert_relationship(&conn, 20, 40, 1.0, "ns");
641| |
642| 1| let mut result = traverse_from_memories(&conn, &[1, 2], "ns", 0.5, 1).unwrap();
643| 1| result.sort_unstable();
644| 1| assert_eq!(result, vec![3, 4]);
645| 1| }
646| |
647| | // --- result deduplication ---
648| |
649| | #[test]
650| 1| fn result_without_duplicates() {
651| 1| let conn = setup_db();
652| |
653| 1| insert_memory(&conn, 1, "ns", false);
654| 1| link_memory_entity(&conn, 1, 10);
655| 1| link_memory_entity(&conn, 1, 11); // dois seeds na mesma memory
656| |
657| 1| insert_memory(&conn, 2, "ns", false);
658| 1| link_memory_entity(&conn, 2, 20);
659| |
660| | // ambos os seeds apontam para a mesma entity 20
661| 1| insert_relationship(&conn, 10, 20, 1.0, "ns");
662| 1| insert_relationship(&conn, 11, 20, 1.0, "ns");
663| |
664| 1| let result = traverse_from_memories(&conn, &[1], "ns", 0.5, 1).unwrap();
665| | // memory 2 deve aparecer apenas uma vez
666| 1| assert_eq!(result.len(), 1);
667| 1| assert_eq!(result, vec![2]);
668| 1| }
669| |
670| | // --- single node ---
671| |
672| | #[test]
673| 1| fn single_node_without_neighbors_returns_empty() {
674| 1| let conn = setup_db();
675| |
676| 1| insert_memory(&conn, 1, "ns", false);
677| 1| link_memory_entity(&conn, 1, 10);
678| | // entity 10 has no outgoing relationships
679| |
680| 1| let result = traverse_from_memories(&conn, &[1], "ns", 0.5, 5).unwrap();
681| 1| assert!(result.is_empty());
682| 1| }
683| |
684| | // --- ciclos no grafo ---
685| |
686| | #[test]
687| 1| fn cycle_does_not_cause_infinite_loop() {
688| 1| let conn = setup_db();
689| |
690| 1| insert_memory(&conn, 1, "ns", false);
691| 1| link_memory_entity(&conn, 1, 10);
692| |
693| 1| insert_memory(&conn, 2, "ns", false);
694| 1| link_memory_entity(&conn, 2, 20);
695| |
696| 1| insert_memory(&conn, 3, "ns", false);
697| 1| link_memory_entity(&conn, 3, 30);
698| |
699| | // triangle 10 -> 20 -> 30 -> 10
700| 1| insert_relationship(&conn, 10, 20, 1.0, "ns");
701| 1| insert_relationship(&conn, 20, 30, 1.0, "ns");
702| 1| insert_relationship(&conn, 30, 10, 1.0, "ns");
703| |
704| 1| let mut result = traverse_from_memories(&conn, &[1], "ns", 0.5, 10).unwrap();
705| 1| result.sort_unstable();
706| | // deve retornar 2 e 3 sem loop infinito
707| 1| assert_eq!(result, vec![2, 3]);
708| 1| }
709| |}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/i18n.rs:
1| |//! Bilingual human-readable message layer.
2| |//!
3| |//! The CLI uses `--lang en|pt` (global flag) or `SQLITE_GRAPHRAG_LANG` (env var) to choose
4| |//! the language of stderr progress messages. JSON stdout is deterministic and identical
5| |//! across languages — only strings intended for humans pass through this module.
6| |//!
7| |//! Detection (highest to lowest priority):
8| |//! 1. Explicit `--lang` flag
9| |//! 2. Env var `SQLITE_GRAPHRAG_LANG`
10| |//! 3. OS locale (`LANG`, `LC_ALL`) with `pt` prefix
11| |//! 4. Fallback `English`
12| |
13| |use std::sync::OnceLock;
14| |
15| |#[derive(Copy, Clone, Debug, PartialEq, Eq, clap::ValueEnum)]
16| |pub enum Language {
17| | #[value(name = "en", aliases = ["english", "EN"])]
18| | English,
19| | #[value(name = "pt", aliases = ["portugues", "portuguese", "pt-BR", "pt-br", "PT"])]
20| | Portuguese,
21| |}
22| |
23| |impl Language {
24| | /// Parses a command-line string into a `Language` without relying on clap.
25| | /// Accepts the same aliases defined in `#[value(...)]`: "en", "pt", etc.
26| 0| pub fn from_str_opt(s: &str) -> Option<Self> {
27| 0| match s.to_lowercase().as_str() {
28| 0| "en" | "english" => Some(Language::English),
29| 0| "pt" | "pt-br" | "portugues" | "portuguese" => Some(Language::Portuguese),
30| 0| _ => None,
31| | }
32| 0| }
33| |
34| 8| pub fn from_env_or_locale() -> Self {
35| | // Priority 1: explicit SQLITE_GRAPHRAG_LANG env var (highest precedence).
36| | // Empty string treated as unset per POSIX convention.
37| 8| if let Ok(v) = std::env::var("SQLITE_GRAPHRAG_LANG") {
^3
38| 3| if !v.is_empty() {
39| 3| let lower = v.to_lowercase();
40| 3| if lower.starts_with("pt") {
41| 3| return Language::Portuguese;
42| 0| }
43| 0| if lower.starts_with("en") {
44| 0| return Language::English;
45| 0| }
46| 0| tracing::warn!(target: "i18n",
47| | value = %v,
48| 0| "SQLITE_GRAPHRAG_LANG value not recognized, falling back to locale detection"
49| | );
50| 0| }
51| 5| }
52| | // Priority 2: POSIX locale precedence LC_ALL > LC_MESSAGES > LANG.
53| | // We read these via std::env (not via sys_locale) because:
54| | // (a) `sys_locale::get_locale()` calls into native OS APIs (CFLocaleCopyCurrent
55| | // on macOS, GetUserDefaultLocaleName on Windows) which cache the
56| | // system locale and IGNORE env vars set at runtime by tests;
57| | // (b) POSIX specifies LC_ALL > LC_MESSAGES > LANG ordering and an
58| | // unrecognised LC_ALL value must stop iteration (fall back to
59| | // English default).
60| 7| for var in ["LC_ALL", "LC_MESSAGES", "LANG"] {
^5 ^5
61| 7| if let Ok(v) = std::env::var(var) {
^5
62| 5| if v.is_empty() {
63| 0| continue;
64| 5| }
65| 5| let lower = v.to_lowercase();
66| 5| if lower.starts_with("pt") {
67| 2| return Language::Portuguese;
68| 3| }
69| 3| if lower.starts_with("en") {
70| 1| return Language::English;
71| 2| }
72| | // Unrecognised value in a higher-precedence variable stops
73| | // iteration per POSIX.1-2017 §8.2.
74| 2| if var == "LC_ALL" {
75| 2| return Language::English;
76| 0| }
77| 2| }
78| | }
79| | // Priority 3: cross-platform locale detection via native OS APIs.
80| | // Only reached when no POSIX env var is set.
81| 0| if let Some(locale) = sys_locale::get_locale() {
82| 0| let lower = locale.to_lowercase();
83| 0| if lower.starts_with("pt") {
84| 0| return Language::Portuguese;
85| 0| }
86| 0| if lower.starts_with("en") {
87| 0| return Language::English;
88| 0| }
89| 0| }
90| 0| Language::English
91| 8| }
92| |}
93| |
94| |static GLOBAL_LANGUAGE: OnceLock<Language> = OnceLock::new();
95| |
96| |/// Initializes the global language. Subsequent calls are silently ignored
97| |/// (OnceLock semantics) — guaranteeing thread-safety and determinism.
98| |///
99| |/// v1.0.36 (L6): early-return when already initialized so the env-fallback
100| |/// resolver (`from_env_or_locale`) does not run a second time. Without this
101| |/// guard, calling `init(None)` after `current()` already populated the
102| |/// OnceLock causes `from_env_or_locale` to fire its `tracing::warn!` twice
103| |/// for unrecognized `SQLITE_GRAPHRAG_LANG` values.
104| 0|pub fn init(explicit: Option<Language>) {
105| 0| if GLOBAL_LANGUAGE.get().is_some() {
106| 0| return;
107| 0| }
108| 0| let resolved = explicit.unwrap_or_else(Language::from_env_or_locale);
109| 0| let _ = GLOBAL_LANGUAGE.set(resolved);
110| 0|}
111| |
112| |/// Returns the active language, or fallback English if `init` was never called.
113| 4|pub fn current() -> Language {
114| 4| *GLOBAL_LANGUAGE.get_or_init(Language::from_env_or_locale)
115| 4|}
116| |
117| |/// Translates a bilingual message by selecting the active variant.
118| |///
119| |/// v1.0.36 (M4): inputs are constrained to `&'static str` so the function
120| |/// can return one of them directly without `Box::leak`. The previous
121| |/// implementation leaked one allocation per call which accumulated in
122| |/// long-running pipelines; this version is allocation-free. All in-tree
123| |/// callers already pass string literals, which are `&'static str`.
124| 0|pub fn tr(en: &'static str, pt: &'static str) -> &'static str {
125| 0| match current() {
126| 0| Language::English => en,
127| 0| Language::Portuguese => pt,
128| | }
129| 0|}
130| |
131| |/// Progress message emitted after pruning relationships.
132| |///
133| |/// English-only: this string is emitted to stderr as a progress notice and
134| |/// does not vary by language because the prune-relations command targets
135| |/// agent-first pipelines where deterministic output matters.
136| 0|pub fn relations_pruned(count: usize, relation: &str, namespace: &str) -> String {
137| 0| format!("pruned {count} '{relation}' relationships in namespace '{namespace}'")
138| 0|}
139| |
140| |/// Progress message for dry-run preview of prune-relations.
141| |///
142| |/// English-only: emitted to stderr as a progress notice.
143| 0|pub fn prune_dry_run(count: usize, relation: &str) -> String {
144| 0| format!("dry run: {count} '{relation}' relationships would be removed")
145| 0|}
146| |
147| |/// Warning message when --yes is not passed for destructive prune-relations.
148| |///
149| |/// English-only: emitted to stderr as a progress notice.
150| 0|pub fn prune_requires_yes() -> String {
151| 0| "destructive operation requires --yes flag; use --dry-run to preview".to_string()
152| 0|}
153| |
154| |/// Localized prefix for error messages displayed to the end user.
155| 0|pub fn error_prefix() -> &'static str {
156| 0| match current() {
157| 0| Language::English => "Error",
158| 0| Language::Portuguese => "Erro",
159| | }
160| 0|}
161| |
162| |/// Error messages for `AppError` variants — always English.
163| |///
164| |/// These strings end up inside `AppError` inner fields and may appear in
165| |/// deterministic JSON stdout (e.g. ingest NDJSON). Portuguese translations
166| |/// for stderr live in `pub mod app_error_pt` and are applied by
167| |/// `localized_message_for(Language::Portuguese)`.
168| |pub mod errors_msg {
169| 0| pub fn memory_not_found(nome: &str, namespace: &str) -> String {
170| 0| format!("memory '{nome}' not found in namespace '{namespace}'")
171| 0| }
172| |
173| 0| pub fn memory_or_entity_not_found(name: &str, namespace: &str) -> String {
174| 0| format!("memory or entity '{name}' not found in namespace '{namespace}'")
175| 0| }
176| |
177| 0| pub fn database_not_found(path: &str) -> String {
178| 0| format!("database not found at {path}. Run 'sqlite-graphrag init' first.")
179| 0| }
180| |
181| 0| pub fn entity_not_found(nome: &str, namespace: &str) -> String {
182| 0| format!("entity \"{nome}\" does not exist in namespace \"{namespace}\"")
183| 0| }
184| |
185| 0| pub fn relationship_not_found(de: &str, rel: &str, para: &str, namespace: &str) -> String {
186| 0| format!(
187| 0| "relationship \"{de}\" --[{rel}]--> \"{para}\" does not exist in namespace \"{namespace}\""
188| | )
189| 0| }
190| |
191| 0| pub fn duplicate_memory(nome: &str, namespace: &str) -> String {
192| 0| format!(
193| 0| "memory '{nome}' already exists in namespace '{namespace}'. Use --force-merge to update."
194| | )
195| 0| }
196| |
197| 0| pub fn duplicate_memory_soft_deleted(name: &str, namespace: &str) -> String {
198| 0| format!(
199| 0| "memory '{name}' exists but is soft-deleted in namespace '{namespace}'; \
200| 0| use --force-merge to restore and update, or `restore` to revive it"
201| | )
202| 0| }
203| |
204| 0| pub fn optimistic_lock_conflict(expected: i64, current_ts: i64) -> String {
205| 0| format!(
206| 0| "optimistic lock conflict: expected updated_at={expected}, but current is {current_ts}"
207| | )
208| 0| }
209| |
210| 0| pub fn version_not_found(versao: i64, nome: &str) -> String {
211| 0| format!("version {versao} not found for memory '{nome}'")
212| 0| }
213| |
214| 0| pub fn no_recall_results(max_distance: f32, query: &str, namespace: &str) -> String {
215| 0| format!(
216| 0| "no results within --max-distance {max_distance} for query '{query}' in namespace '{namespace}'"
217| | )
218| 0| }
219| |
220| 0| pub fn soft_deleted_memory_not_found(nome: &str, namespace: &str) -> String {
221| 0| format!("soft-deleted memory '{nome}' not found in namespace '{namespace}'")
222| 0| }
223| |
224| 0| pub fn concurrent_process_conflict() -> String {
225| 0| "optimistic lock conflict: memory was modified by another process".to_string()
226| 0| }
227| |
228| 0| pub fn entity_limit_exceeded(max: usize) -> String {
229| 0| format!("entities exceed limit of {max}")
230| 0| }
231| |
232| 0| pub fn relationship_limit_exceeded(max: usize) -> String {
233| 0| format!("relationships exceed limit of {max}")
234| 0| }
235| |}
236| |
237| |/// Localized validation messages for memory fields.
238| |pub mod validation {
239| | use super::current;
240| | use crate::i18n::Language;
241| |
242| 1| pub fn name_length(max: usize) -> String {
243| 1| match current() {
244| 0| Language::English => format!("name must be 1-{max} chars"),
245| 1| Language::Portuguese => format!("nome deve ter entre 1 e {max} caracteres"),
246| | }
247| 1| }
248| |
249| 1| pub fn reserved_name() -> String {
250| 1| match current() {
251| | Language::English => {
252| 0| "names and namespaces starting with __ are reserved for internal use".to_string()
253| | }
254| | Language::Portuguese => {
255| 1| "nomes e namespaces iniciados com __ são reservados para uso interno".to_string()
256| | }
257| | }
258| 1| }
259| |
260| 0| pub fn name_kebab(nome: &str) -> String {
261| 0| match current() {
262| 0| Language::English => format!(
263| 0| "name must be kebab-case slug (lowercase letters, digits, hyphens): '{nome}'"
264| | ),
265| | Language::Portuguese => {
266| 0| format!("nome deve estar em kebab-case (minúsculas, dígitos, hífens): '{nome}'")
267| | }
268| | }
269| 0| }
270| |
271| 0| pub fn description_exceeds(max: usize) -> String {
272| 0| match current() {
273| 0| Language::English => format!("description must be <= {max} chars"),
274| 0| Language::Portuguese => format!("descrição deve ter no máximo {max} caracteres"),
275| | }
276| 0| }
277| |
278| 0| pub fn body_exceeds(max: usize) -> String {
279| 0| match current() {
280| 0| Language::English => format!("body exceeds {max} bytes"),
281| 0| Language::Portuguese => format!("corpo excede {max} bytes"),
282| | }
283| 0| }
284| |
285| 0| pub fn new_name_length(max: usize) -> String {
286| 0| match current() {
287| 0| Language::English => format!("new-name must be 1-{max} chars"),
288| 0| Language::Portuguese => format!("novo nome deve ter entre 1 e {max} caracteres"),
289| | }
290| 0| }
291| |
292| 0| pub fn new_name_kebab(nome: &str) -> String {
293| 0| match current() {
294| 0| Language::English => format!(
295| 0| "new-name must be kebab-case slug (lowercase letters, digits, hyphens): '{nome}'"
296| | ),
297| 0| Language::Portuguese => format!(
298| 0| "novo nome deve estar em kebab-case (minúsculas, dígitos, hífens): '{nome}'"
299| | ),
300| | }
301| 0| }
302| |
303| 0| pub fn namespace_length() -> String {
304| 0| match current() {
305| 0| Language::English => "namespace must be 1-80 chars".to_string(),
306| 0| Language::Portuguese => "namespace deve ter entre 1 e 80 caracteres".to_string(),
307| | }
308| 0| }
309| |
310| 0| pub fn namespace_format() -> String {
311| 0| match current() {
312| 0| Language::English => "namespace must be alphanumeric + hyphens/underscores".to_string(),
313| | Language::Portuguese => {
314| 0| "namespace deve ser alfanumérico com hífens/sublinhados".to_string()
315| | }
316| | }
317| 0| }
318| |
319| 1| pub fn path_traversal(p: &str) -> String {
320| 1| match current() {
321| 0| Language::English => format!("path traversal rejected: {p}"),
322| 1| Language::Portuguese => format!("traversal de caminho rejeitado: {p}"),
323| | }
324| 1| }
325| |
326| 1| pub fn invalid_tz(v: &str) -> String {
327| 1| match current() {
328| 0| Language::English => format!(
329| 0| "SQLITE_GRAPHRAG_DISPLAY_TZ invalid: '{v}'; use an IANA name like 'America/Sao_Paulo'"
330| | ),
331| 1| Language::Portuguese => format!(
332| 1| "SQLITE_GRAPHRAG_DISPLAY_TZ inválido: '{v}'; use um nome IANA como 'America/Sao_Paulo'"
333| | ),
334| | }
335| 1| }
336| |
337| 0| pub fn empty_query() -> String {
338| 0| match current() {
339| 0| Language::English => "query cannot be empty".to_string(),
340| 0| Language::Portuguese => "a consulta não pode estar vazia".to_string(),
341| | }
342| 0| }
343| |
344| 0| pub fn empty_body() -> String {
345| 0| match current() {
346| 0| Language::English => "body cannot be empty: provide --body, --body-file, or --body-stdin with content, or supply a graph via --entities-file/--graph-stdin".to_string(),
347| 0| Language::Portuguese => "o corpo não pode estar vazio: forneça --body, --body-file ou --body-stdin com conteúdo, ou um grafo via --entities-file/--graph-stdin".to_string(),
348| | }
349| 0| }
350| |
351| 0| pub fn invalid_namespace_config(path: &str, err: &str) -> String {
352| 0| match current() {
353| | Language::English => {
354| 0| format!("invalid project namespace config '{path}': {err}")
355| | }
356| | Language::Portuguese => {
357| 0| format!("configuração de namespace de projeto inválida '{path}': {err}")
358| | }
359| | }
360| 0| }
361| |
362| 0| pub fn invalid_projects_mapping(path: &str, err: &str) -> String {
363| 0| match current() {
364| 0| Language::English => format!("invalid projects mapping '{path}': {err}"),
365| 0| Language::Portuguese => format!("mapeamento de projetos inválido '{path}': {err}"),
366| | }
367| 0| }
368| |
369| 0| pub fn self_referential_link() -> String {
370| 0| match current() {
371| 0| Language::English => "--from and --to must be different entities — self-referential relationships are not supported".to_string(),
372| 0| Language::Portuguese => "--from e --to devem ser entidades diferentes — relacionamentos auto-referenciais não são suportados".to_string(),
373| | }
374| 0| }
375| |
376| 0| pub fn invalid_link_weight(weight: f64) -> String {
377| 0| match current() {
378| | Language::English => {
379| 0| format!("--weight: must be between 0.0 and 1.0 (actual: {weight})")
380| | }
381| | Language::Portuguese => {
382| 0| format!("--weight: deve estar entre 0.0 e 1.0 (atual: {weight})")
383| | }
384| | }
385| 0| }
386| |
387| 0| pub fn sync_destination_equals_source() -> String {
388| 0| match current() {
389| | Language::English => {
390| 0| "destination path must differ from the source database path".to_string()
391| | }
392| | Language::Portuguese => {
393| 0| "caminho de destino deve ser diferente do caminho do banco de dados fonte"
394| 0| .to_string()
395| | }
396| | }
397| 0| }
398| |
399| | /// Portuguese translations for `AppError` Display messages.
400| | ///
401| | /// Each helper mirrors a single `AppError` variant's `#[error(...)]` text in
402| | /// Portuguese, keeping the language barrier confined to this module. The
403| | /// English source of truth lives in `src/errors.rs` via `thiserror`.
404| | pub mod app_error_pt {
405| 2| pub fn validation(msg: &str) -> String {
406| 2| format!("erro de validação: {msg}")
407| 2| }
408| |
409| 3| pub fn duplicate(msg: &str) -> String {
410| 3| let translated = msg
411| 3| .replace("already exists in namespace", "já existe no namespace")
412| 3| .replace(
413| 3| "exists but is soft-deleted in namespace",
414| 3| "existe mas está excluída temporariamente no namespace",
415| 3| )
416| 3| .replace(
417| 3| "Use --force-merge to update.",
418| 3| "Use --force-merge para atualizar.",
419| 3| )
420| 3| .replace(
421| 3| "use --force-merge to restore and update, or `restore` to revive it",
422| 3| "use --force-merge para restaurar e atualizar, ou `restore` para revivê-la",
423| 3| )
424| 3| .replace("memory", "memória");
425| 3| format!("duplicata detectada: {translated}")
426| 3| }
427| |
428| 2| pub fn conflict(msg: &str) -> String {
429| 2| let translated = msg
430| 2| .replace("optimistic lock conflict", "conflito de lock otimista")
431| 2| .replace("but current is", "mas atual é")
432| 2| .replace(
433| 2| "was modified by another process",
434| 2| "foi modificada por outro processo",
435| 2| );
436| 2| format!("conflito: {translated}")
437| 2| }
438| |
439| 4| pub fn not_found(msg: &str) -> String {
440| 4| let translated = msg
441| 4| .replace("not found in namespace", "não encontrada no namespace")
442| 4| .replace("not found for memory", "não encontrada para memória")
443| 4| .replace("does not exist in namespace", "não existe no namespace")
444| 4| .replace("memory or entity", "memória ou entidade")
445| 4| .replace("memory", "memória")
446| 4| .replace("entity", "entidade")
447| 4| .replace("version", "versão")
448| 4| .replace("soft-deleted", "excluída temporariamente");
449| 4| format!("não encontrado: {translated}")
450| 4| }
451| |
452| 2| pub fn namespace_error(msg: &str) -> String {
453| 2| format!("namespace não resolvido: {msg}")
454| 2| }
455| |
456| 2| pub fn limit_exceeded(msg: &str) -> String {
457| 2| let translated = msg
458| 2| .replace("exceeds limit of", "excede limite de")
459| 2| .replace("body exceeds", "corpo excede")
460| 2| .replace("entities exceed limit", "entidades excedem limite")
461| 2| .replace(
462| 2| "relationships exceed limit",
463| 2| "relacionamentos excedem limite",
464| 2| );
465| 2| format!("limite excedido: {translated}")
466| 2| }
467| |
468| 0| pub fn database(err: &str) -> String {
469| 0| format!("erro de banco de dados: {err}")
470| 0| }
471| |
472| 2| pub fn embedding(msg: &str) -> String {
473| 2| format!("erro de embedding: {msg}")
474| 2| }
475| |
476| 2| pub fn vec_extension(msg: &str) -> String {
477| 2| format!("extensão sqlite-vec falhou: {msg}")
478| 2| }
479| |
480| 2| pub fn db_busy(msg: &str) -> String {
481| 2| format!("banco ocupado: {msg}")
482| 2| }
483| |
484| 2| pub fn batch_partial_failure(total: usize, failed: usize) -> String {
485| 2| format!("falha parcial em batch: {failed} de {total} itens falharam")
486| 2| }
487| |
488| 0| pub fn io(err: &str) -> String {
489| 0| format!("erro de I/O: {err}")
490| 0| }
491| |
492| 0| pub fn internal(err: &str) -> String {
493| 0| format!("erro interno: {err}")
494| 0| }
495| |
496| 0| pub fn json(err: &str) -> String {
497| 0| format!("erro de JSON: {err}")
498| 0| }
499| |
500| 2| pub fn lock_busy(msg: &str) -> String {
501| 2| format!("lock ocupado: {msg}")
502| 2| }
503| |
504| 2| pub fn all_slots_full(max: usize, waited_secs: u64) -> String {
505| 2| format!(
506| 2| "todos os {max} slots de concorrência ocupados após aguardar {waited_secs}s \
507| 2| (exit 75); use --max-concurrency ou aguarde outras invocações terminarem"
508| | )
509| 2| }
510| |
511| 0| pub fn job_singleton_locked(job_type: &str, namespace: &str) -> String {
512| 0| format!(
513| 0| "job {job_type} para o namespace '{namespace}' já está em execução (exit 75); \
514| 0| aguarde a conclusão ou passe --wait-job-singleton <SEGUNDOS>"
515| | )
516| 0| }
517| |
518| 2| pub fn low_memory(available_mb: u64, required_mb: u64) -> String {
519| 2| format!(
520| 2| "memória disponível ({available_mb}MB) abaixo do mínimo requerido ({required_mb}MB) \
521| 2| para carregar o modelo; aborte outras cargas ou use --skip-memory-guard (exit 77)"
522| | )
523| 2| }
524| |
525| 2| pub fn binary_not_found(name: &str) -> String {
526| 2| format!("binário não encontrado: {name} — instale e adicione ao PATH")
527| 2| }
528| |
529| 2| pub fn rate_limited(detail: &str) -> String {
530| 2| format!("taxa de requisição excedida: {detail}")
531| 2| }
532| |
533| 2| pub fn timeout(operation: &str, secs: u64) -> String {
534| 2| format!("timeout após {secs}s: {operation}")
535| 2| }
536| | }
537| |
538| | /// Portuguese translations for runtime startup messages emitted from `main.rs`.
539| | ///
540| | /// These mirror the English text supplied alongside each call to
541| | /// `output::emit_progress_i18n` / `output::emit_error_i18n`, keeping the PT
542| | /// strings confined to this module per the language policy.
543| | pub mod runtime_pt {
544| 0| pub fn embedding_heavy_must_measure_ram() -> String {
545| 0| "comando intensivo em embedding precisa medir RAM disponível".to_string()
546| 0| }
547| |
548| 0| pub fn heavy_command_detected(available_mb: u64, safe_concurrency: usize) -> String {
549| 0| format!(
550| 0| "Comando pesado detectado; memória disponível: {available_mb} MB; \
551| 0| concorrência segura: {safe_concurrency}"
552| | )
553| 0| }
554| |
555| 0| pub fn reducing_concurrency(
556| 0| requested_concurrency: usize,
557| 0| effective_concurrency: usize,
558| 0| ) -> String {
559| 0| format!(
560| 0| "Reduzindo a concorrência solicitada de {requested_concurrency} para \
561| 0| {effective_concurrency} para evitar oversubscription de memória"
562| | )
563| 0| }
564| |
565| 0| pub fn initializing_embedding_model() -> &'static str {
566| 0| "Inicializando modelo de embedding (pode baixar na primeira execução)..."
567| 0| }
568| |
569| 0| pub fn embedding_chunks_serially(count: usize) -> String {
570| 0| format!("Embedando {count} chunks serialmente para manter memória limitada...")
571| 0| }
572| |
573| 0| pub fn remember_step_input_validated(available_mb: u64) -> String {
574| 0| format!("Etapa remember: entrada validada; memória disponível {available_mb} MB")
575| 0| }
576| |
577| 0| pub fn remember_step_chunking_completed(
578| 0| total_passage_tokens: usize,
579| 0| model_max_length: usize,
580| 0| chunks_count: usize,
581| 0| rss_mb: u64,
582| 0| ) -> String {
583| 0| format!(
584| 0| "Etapa remember: tokenizer contou {total_passage_tokens} tokens de passagem \
585| 0| (máximo do modelo {model_max_length}); chunking gerou {chunks_count} chunks; \
586| 0| RSS do processo {rss_mb} MB"
587| | )
588| 0| }
589| |
590| 0| pub fn remember_step_embeddings_completed(rss_mb: u64) -> String {
591| 0| format!("Etapa remember: embeddings dos chunks concluídos; RSS do processo {rss_mb} MB")
592| 0| }
593| |
594| 0| pub fn restore_recomputing_embedding() -> &'static str {
595| 0| "Recalculando embedding da memória restaurada..."
596| 0| }
597| |
598| 0| pub fn edit_recomputing_embedding() -> &'static str {
599| 0| "Recalculando embedding da memória editada..."
600| 0| }
601| | }
602| |}
603| |
604| |#[cfg(test)]
605| |mod tests {
606| | use super::*;
607| | use serial_test::serial;
608| |
609| | #[test]
610| | #[serial]
611| 1| fn fallback_english_when_env_absent() {
612| 1| std::env::remove_var("SQLITE_GRAPHRAG_LANG");
613| 1| std::env::set_var("LC_ALL", "C");
614| 1| std::env::set_var("LANG", "C");
615| 1| assert_eq!(Language::from_env_or_locale(), Language::English);
616| 1| std::env::remove_var("LC_ALL");
617| 1| std::env::remove_var("LANG");
618| | }
619| |
620| | #[test]
621| | #[serial]
622| 1| fn env_pt_selects_portuguese() {
623| 1| std::env::remove_var("LC_ALL");
624| 1| std::env::remove_var("LANG");
625| 1| std::env::set_var("SQLITE_GRAPHRAG_LANG", "pt");
626| 1| assert_eq!(Language::from_env_or_locale(), Language::Portuguese);
627| 1| std::env::remove_var("SQLITE_GRAPHRAG_LANG");
628| | }
629| |
630| | #[test]
631| | #[serial]
632| 1| fn env_pt_br_selects_portuguese() {
633| 1| std::env::remove_var("LC_ALL");
634| 1| std::env::remove_var("LANG");
635| 1| std::env::set_var("SQLITE_GRAPHRAG_LANG", "pt-BR");
636| 1| assert_eq!(Language::from_env_or_locale(), Language::Portuguese);
637| 1| std::env::remove_var("SQLITE_GRAPHRAG_LANG");
638| | }
639| |
640| | #[test]
641| | #[serial]
642| 1| fn locale_ptbr_utf8_selects_portuguese() {
643| 1| std::env::remove_var("SQLITE_GRAPHRAG_LANG");
644| 1| std::env::set_var("LC_ALL", "pt_BR.UTF-8");
645| 1| assert_eq!(Language::from_env_or_locale(), Language::Portuguese);
646| 1| std::env::remove_var("LC_ALL");
647| | }
648| |
649| | #[test]
650| | #[serial]
651| 1| fn posix_precedence_lc_all_overrides_lang() {
652| 1| std::env::remove_var("SQLITE_GRAPHRAG_LANG");
653| 1| std::env::remove_var("LC_MESSAGES");
654| 1| std::env::set_var("LC_ALL", "en_US.UTF-8");
655| 1| std::env::set_var("LANG", "pt_BR.UTF-8");
656| 1| assert_eq!(
657| 1| Language::from_env_or_locale(),
658| | Language::English,
659| 0| "LC_ALL=en_US must override LANG=pt_BR per POSIX"
660| | );
661| 1| std::env::remove_var("LC_ALL");
662| 1| std::env::remove_var("LANG");
663| | }
664| |
665| | #[test]
666| | #[serial]
667| 1| fn posix_precedence_lc_all_unrecognized_stops_iteration() {
668| 1| std::env::remove_var("SQLITE_GRAPHRAG_LANG");
669| 1| std::env::remove_var("LC_MESSAGES");
670| 1| std::env::set_var("LC_ALL", "ja_JP.UTF-8");
671| 1| std::env::set_var("LANG", "pt_BR.UTF-8");
672| 1| assert_eq!(
673| 1| Language::from_env_or_locale(),
674| | Language::English,
675| 0| "LC_ALL=ja_JP set must stop iteration; falls back to English default"
676| | );
677| 1| std::env::remove_var("LC_ALL");
678| 1| std::env::remove_var("LANG");
679| | }
680| |
681| | #[test]
682| | #[serial]
683| 1| fn lang_pt_selects_portuguese_when_lc_all_unset() {
684| 1| std::env::remove_var("SQLITE_GRAPHRAG_LANG");
685| 1| std::env::remove_var("LC_ALL");
686| 1| std::env::remove_var("LC_MESSAGES");
687| 1| std::env::set_var("LANG", "pt_BR.UTF-8");
688| 1| assert_eq!(Language::from_env_or_locale(), Language::Portuguese);
689| 1| std::env::remove_var("LANG");
690| | }
691| |
692| | mod validation_tests {
693| | use super::*;
694| |
695| | #[test]
696| 1| fn name_length_en() {
697| 1| let msg = match Language::English {
698| 1| Language::English => format!("name must be 1-{} chars", 80),
699| 0| Language::Portuguese => format!("nome deve ter entre 1 e {} caracteres", 80),
700| | };
701| 1| assert!(msg.contains("name must be 1-80 chars"), "obtido: {msg}");
^0
702| 1| }
703| |
704| | #[test]
705| 1| fn name_length_pt() {
706| 1| let msg = match Language::Portuguese {
707| 0| Language::English => format!("name must be 1-{} chars", 80),
708| 1| Language::Portuguese => format!("nome deve ter entre 1 e {} caracteres", 80),
709| | };
710| 1| assert!(
711| 1| msg.contains("nome deve ter entre 1 e 80 caracteres"),
712| 0| "obtido: {msg}"
713| | );
714| 1| }
715| |
716| | #[test]
717| 1| fn name_kebab_en() {
718| 1| let nome = "Invalid_Name";
719| 1| let msg = match Language::English {
720| 1| Language::English => format!(
721| 1| "name must be kebab-case slug (lowercase letters, digits, hyphens): '{nome}'"
722| | ),
723| | Language::Portuguese => {
724| 0| format!("nome deve estar em kebab-case (minúsculas, dígitos, hífens): '{nome}'")
725| | }
726| | };
727| 1| assert!(msg.contains("kebab-case slug"), "obtido: {msg}");
^0
728| 1| assert!(msg.contains("Invalid_Name"), "obtido: {msg}");
^0
729| 1| }
730| |
731| | #[test]
732| 1| fn name_kebab_pt() {
733| 1| let nome = "Invalid_Name";
734| 1| let msg = match Language::Portuguese {
735| 0| Language::English => format!(
736| 0| "name must be kebab-case slug (lowercase letters, digits, hyphens): '{nome}'"
737| | ),
738| | Language::Portuguese => {
739| 1| format!("nome deve estar em kebab-case (minúsculas, dígitos, hífens): '{nome}'")
740| | }
741| | };
742| 1| assert!(msg.contains("kebab-case"), "obtido: {msg}");
^0
743| 1| assert!(msg.contains("minúsculas"), "obtido: {msg}");
^0
744| 1| assert!(msg.contains("Invalid_Name"), "obtido: {msg}");
^0
745| 1| }
746| |
747| | #[test]
748| 1| fn description_exceeds_en() {
749| 1| let msg = match Language::English {
750| 1| Language::English => format!("description must be <= {} chars", 500),
751| 0| Language::Portuguese => format!("descrição deve ter no máximo {} caracteres", 500),
752| | };
753| 1| assert!(msg.contains("description must be <= 500"), "obtido: {msg}");
^0
754| 1| }
755| |
756| | #[test]
757| 1| fn description_exceeds_pt() {
758| 1| let msg = match Language::Portuguese {
759| 0| Language::English => format!("description must be <= {} chars", 500),
760| 1| Language::Portuguese => format!("descrição deve ter no máximo {} caracteres", 500),
761| | };
762| 1| assert!(
763| 1| msg.contains("descrição deve ter no máximo 500"),
764| 0| "obtido: {msg}"
765| | );
766| 1| }
767| |
768| | #[test]
769| 1| fn body_exceeds_en() {
770| 1| let limite = crate::constants::MAX_MEMORY_BODY_LEN;
771| 1| let msg = match Language::English {
772| 1| Language::English => format!("body exceeds {limite} bytes"),
773| 0| Language::Portuguese => format!("corpo excede {limite} bytes"),
774| | };
775| 1| assert!(msg.contains("body exceeds 512000"), "obtido: {msg}");
^0
776| 1| }
777| |
778| | #[test]
779| 1| fn body_exceeds_pt() {
780| 1| let limite = crate::constants::MAX_MEMORY_BODY_LEN;
781| 1| let msg = match Language::Portuguese {
782| 0| Language::English => format!("body exceeds {limite} bytes"),
783| 1| Language::Portuguese => format!("corpo excede {limite} bytes"),
784| | };
785| 1| assert!(msg.contains("corpo excede 512000"), "obtido: {msg}");
^0
786| 1| }
787| |
788| | #[test]
789| 1| fn new_name_length_en() {
790| 1| let msg = match Language::English {
791| 1| Language::English => format!("new-name must be 1-{} chars", 80),
792| 0| Language::Portuguese => format!("novo nome deve ter entre 1 e {} caracteres", 80),
793| | };
794| 1| assert!(msg.contains("new-name must be 1-80"), "obtido: {msg}");
^0
795| 1| }
796| |
797| | #[test]
798| 1| fn new_name_length_pt() {
799| 1| let msg = match Language::Portuguese {
800| 0| Language::English => format!("new-name must be 1-{} chars", 80),
801| 1| Language::Portuguese => format!("novo nome deve ter entre 1 e {} caracteres", 80),
802| | };
803| 1| assert!(
804| 1| msg.contains("novo nome deve ter entre 1 e 80"),
805| 0| "obtido: {msg}"
806| | );
807| 1| }
808| |
809| | #[test]
810| 1| fn new_name_kebab_en() {
811| 1| let nome = "Bad Name";
812| 1| let msg = match Language::English {
813| 1| Language::English => format!(
814| 1| "new-name must be kebab-case slug (lowercase letters, digits, hyphens): '{nome}'"
815| | ),
816| 0| Language::Portuguese => format!(
817| 0| "novo nome deve estar em kebab-case (minúsculas, dígitos, hífens): '{nome}'"
818| | ),
819| | };
820| 1| assert!(msg.contains("new-name must be kebab-case"), "obtido: {msg}");
^0
821| 1| }
822| |
823| | #[test]
824| 1| fn new_name_kebab_pt() {
825| 1| let nome = "Bad Name";
826| 1| let msg = match Language::Portuguese {
827| 0| Language::English => format!(
828| 0| "new-name must be kebab-case slug (lowercase letters, digits, hyphens): '{nome}'"
829| | ),
830| 1| Language::Portuguese => format!(
831| 1| "novo nome deve estar em kebab-case (minúsculas, dígitos, hífens): '{nome}'"
832| | ),
833| | };
834| 1| assert!(
835| 1| msg.contains("novo nome deve estar em kebab-case"),
836| 0| "obtido: {msg}"
837| | );
838| 1| }
839| |
840| | #[test]
841| 1| fn reserved_name_en() {
842| 1| let msg = match Language::English {
843| | Language::English => {
844| 1| "names and namespaces starting with __ are reserved for internal use"
845| 1| .to_string()
846| | }
847| | Language::Portuguese => {
848| 0| "nomes e namespaces iniciados com __ são reservados para uso interno"
849| 0| .to_string()
850| | }
851| | };
852| 1| assert!(msg.contains("reserved for internal use"), "obtido: {msg}");
^0
853| 1| }
854| |
855| | #[test]
856| 1| fn reserved_name_pt() {
857| 1| let msg = match Language::Portuguese {
858| | Language::English => {
859| 0| "names and namespaces starting with __ are reserved for internal use"
860| 0| .to_string()
861| | }
862| | Language::Portuguese => {
863| 1| "nomes e namespaces iniciados com __ são reservados para uso interno"
864| 1| .to_string()
865| | }
866| | };
867| 1| assert!(msg.contains("reservados para uso interno"), "obtido: {msg}");
^0
868| 1| }
869| | }
870| |
871| | mod app_error_pt_translation_tests {
872| | use crate::errors::AppError;
873| |
874| | #[test]
875| 1| fn localized_message_pt_not_found_fully_translated() {
876| 1| let err =
877| 1| AppError::NotFound("memory 'test-mem' not found in namespace 'global'".into());
878| 1| let pt = err.localized_message_for(crate::i18n::Language::Portuguese);
879| 1| assert!(
880| 1| pt.contains("memória"),
881| 0| "PT must translate 'memory' to 'memória': {pt}"
882| | );
883| 1| assert!(
884| 1| pt.contains("não encontrada no namespace"),
885| 0| "PT must translate full phrase: {pt}"
886| | );
887| 1| assert!(
888| 1| !pt.contains("not found in namespace"),
889| 0| "PT must not contain English phrase: {pt}"
890| | );
891| 1| }
892| |
893| | #[test]
894| 1| fn localized_message_pt_duplicate_fully_translated() {
895| 1| let err = AppError::Duplicate(
896| 1| "memory 'x' already exists in namespace 'global'. Use --force-merge to update."
897| 1| .into(),
898| 1| );
899| 1| let pt = err.localized_message_for(crate::i18n::Language::Portuguese);
900| 1| assert!(pt.contains("memória"), "PT must translate 'memory': {pt}");
^0
901| 1| assert!(
902| 1| pt.contains("já existe no namespace"),
903| 0| "PT must translate 'already exists': {pt}"
904| | );
905| 1| }
906| | }
907| |}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/lib.rs:
1| |//! # sqlite-graphrag
2| |//!
3| |//! Local GraphRAG memory for LLMs in a single SQLite file — zero external
4| |//! services required.
5| |//!
6| |//! `sqlite-graphrag` is a CLI-first library that persists memories, entities and
7| |//! typed relationships inside a single SQLite database. It combines FTS5
8| |//! full-text search with `sqlite-vec` KNN over locally-generated embeddings to
9| |//! expose a hybrid retrieval ranker tailored for LLM agents.
10| |//!
11| |//! ## CLI usage
12| |//!
13| |//! Install and initialize once, then save and recall memories:
14| |//!
15| |//! ```bash
16| |//! cargo install sqlite-graphrag
17| |//! sqlite-graphrag init
18| |//! sqlite-graphrag remember \
19| |//! --name onboarding-note \
20| |//! --type user \
21| |//! --description "first memory" \
22| |//! --body "hello graphrag"
23| |//! sqlite-graphrag recall "graphrag" --k 5
24| |//! ```
25| |//!
26| |//! ## Crate layout
27| |//!
28| |//! The public modules group the CLI, the SQLite storage layer and the
29| |//! supporting primitives (embedder, chunking, graph, namespace detection,
30| |//! output, paths and pragmas). The CLI binary wires them together through the
31| |//! commands in [`commands`].
32| |//!
33| |//! ## Exit codes
34| |//!
35| |//! Errors returned from [`errors::AppError`] map to deterministic exit codes
36| |//! suitable for orchestration by shell scripts and LLM agents. Consult the
37| |//! README for the full contract.
38| |
39| |use std::sync::atomic::{AtomicBool, AtomicU8, Ordering};
40| |use std::sync::OnceLock;
41| |use tokio_util::sync::CancellationToken;
42| |
43| |/// Signals that a shutdown signal (SIGINT / SIGTERM / SIGHUP) has been received.
44| |///
45| |/// Set in `main` via `ctrlc::set_handler`. Long-running subcommands can
46| |/// poll [`shutdown_requested`] to shut down gracefully before timeout.
47| |/// Async code should prefer [`cancel_token`] with `tokio::select!`.
48| |pub static SHUTDOWN: AtomicBool = AtomicBool::new(false);
49| |
50| |/// Counter of shutdown signals received. 0=none, 1=graceful, 2+=forced exit.
51| |pub static SIGNAL_COUNT: AtomicU8 = AtomicU8::new(0);
52| |
53| |/// Signal number that triggered shutdown (2=SIGINT, 15=SIGTERM). 0=none.
54| |static SIGNAL_NUMBER: AtomicU8 = AtomicU8::new(0);
55| |
56| |static CANCEL: OnceLock<CancellationToken> = OnceLock::new();
57| |
58| |/// Returns the process-wide cancellation token for async graceful shutdown.
59| |///
60| |/// The token is cancelled by the signal handler alongside [`SHUTDOWN`].
61| |/// Async loops should use `token.cancelled().await` inside `tokio::select!`
62| |/// for instant wake-up instead of polling [`shutdown_requested`].
63| 0|pub fn cancel_token() -> &'static CancellationToken {
64| 0| CANCEL.get_or_init(CancellationToken::new)
65| 0|}
66| |
67| |/// Returns `true` if a shutdown signal has been received since the process started.
68| |///
69| |/// The value reflects the state of [`SHUTDOWN`]. Without a `ctrlc::set_handler` call,
70| |/// the initial state is always `false`.
71| |///
72| |/// # Examples
73| |///
74| |/// ```
75| |/// use sqlite_graphrag::shutdown_requested;
76| |///
77| |/// // Under normal startup conditions the signal has not been received.
78| |/// assert!(!shutdown_requested());
79| |/// ```
80| |///
81| |/// ```
82| |/// use std::sync::atomic::Ordering;
83| |/// use sqlite_graphrag::{SHUTDOWN, shutdown_requested};
84| |///
85| |/// // Simulate receiving a signal and verify that the function reflects the state.
86| |/// SHUTDOWN.store(true, Ordering::Release);
87| |/// assert!(shutdown_requested());
88| |/// // Restore to avoid contaminating other tests.
89| |/// SHUTDOWN.store(false, Ordering::Release);
90| |/// ```
91| 0|pub fn shutdown_requested() -> bool {
92| | // ORDERING: Acquire pairs with the Release store in the signal handler (main.rs).
93| 0| SHUTDOWN.load(Ordering::Acquire)
94| 0|}
95| |
96| |/// Returns the signal number that triggered shutdown (0 if none received).
97| |///
98| |/// Typically 2 (SIGINT) for Ctrl+C. Used to compute Unix-conventional exit
99| |/// code 128+N in the main function.
100| 0|pub fn shutdown_signal() -> u8 {
101| 0| SIGNAL_NUMBER.load(Ordering::Acquire)
102| 0|}
103| |
104| |/// Token-aware chunking utilities for bodies that exceed the embedding window.
105| |pub mod chunking;
106| |
107| |/// Hybrid entity extraction: regex pre-filter + GLiNER zero-shot NER (graceful degradation).
108| |pub mod extraction;
109| |
110| |/// `clap` definitions for the top-level `sqlite-graphrag` binary.
111| |pub mod cli;
112| |
113| |/// Subcommand handlers wired into the `clap` tree from [`cli`].
114| |pub mod commands;
115| |
116| |/// Compile-time constants: embedding dimensions, limits and thresholds.
117| |pub mod constants;
118| |
119| |/// Daemon IPC for persistent embedding model reuse across CLI invocations.
120| |pub mod daemon;
121| |
122| |/// Local embedding generation backed by `fastembed`.
123| |pub mod embedder;
124| |
125| |/// Canonical entity type taxonomy: 13 variants, ValueEnum + serde + rusqlite impls.
126| |pub mod entity_type;
127| |
128| |/// Library-wide error type and the mapping to process exit codes (see [`errors::AppError`]).
129| |pub mod errors;
130| |
131| |/// Graph traversal helpers over the entities and relationships tables.
132| |pub mod graph;
133| |
134| |/// Type aliases for AHash-backed collections in hot paths.
135| |pub mod hash;
136| |
137| |/// Bilingual message layer for human-facing stderr progress (`--lang en|pt`, `SQLITE_GRAPHRAG_LANG`).
138| |pub mod i18n;
139| |
140| |/// Counting semaphore via lock files to limit parallel invocations.
141| |/// Provides `acquire_cli_slot` (counting semaphore) and the G28-B
142| |/// per-namespace heavy-job singleton `acquire_job_singleton` for
143| |/// `enrich`, `ingest --mode claude-code`, `ingest --mode codex`.
144| |pub mod lock;
145| |
146| |/// Memory guard: checks RAM availability before loading the ONNX model.
147| |pub mod memory_guard;
148| |
149| |/// Type-safe enumeration of the five `memories.source` CHECK constraint values.
150| |/// Replaces the footgun `pub source: String` to prevent G29-style regressions.
151| |#[allow(rustdoc::broken_intra_doc_links)]
152| |pub mod memory_source;
153| |
154| |/// Namespace resolution with precedence between flag, environment and markers.
155| |pub mod namespace;
156| |
157| |/// Centralized stdout/stderr emitters for CLI output formatting.
158| |pub mod output;
159| |
160| |/// Dual-format argument parser: accepts Unix epoch and RFC 3339.
161| |pub mod parsers;
162| |
163| |/// G29 Passo 4: preservation checks (Jaccard trigram) for LLM-enriched bodies.
164| |pub mod preservation;
165| |
166| |/// Filesystem paths for the project-local database and app support directories.
167| |pub mod paths;
168| |
169| |/// SQLite pragma helpers applied on every connection.
170| |pub mod pragmas;
171| |
172| |/// Cross-platform signal handling: SIGINT, SIGTERM, SIGHUP.
173| |pub mod signals;
174| |
175| |/// Centralized retry infrastructure with exponential backoff and half-jitter.
176| |pub mod retry;
177| |
178| |/// G28: orphan-process reaper that runs at CLI startup.
179| |#[allow(rustdoc::broken_intra_doc_links)]
180| |pub mod reaper;
181| |
182| |/// G28-D: system load average observation (pre-spawn saturation check).
183| |pub mod system_load;
184| |
185| |/// Persistence layer: memories, entities, chunks and version history.
186| |pub mod storage;
187| |
188| |/// Centralized tracing subscriber initialization with panic hook and log bridge.
189| |pub mod telemetry;
190| |
191| |/// Cross-platform terminal initialization: UTF-8 console, ANSI colors, NO_COLOR.
192| |pub mod terminal;
193| |
194| |/// Display time zone for `*_iso` fields (flag `--tz`, env `SQLITE_GRAPHRAG_DISPLAY_TZ`, fallback UTC).
195| |pub mod tz;
196| |
197| |/// Stdin reader with configurable timeout to prevent indefinite blocking.
198| |pub mod stdin_helper;
199| |
200| |/// Real tokenizer of the embedding model for accurate token counting and chunking.
201| |pub mod tokenizer;
202| |
// Private wrapper around the items generated by refinery's `embed_migrations!`
// macro, invoked here with the `migrations` directory. Presumably the macro
// embeds the SQL files found there at compile time; confirm against the
// refinery documentation. The generated `migrations` item is re-exported from
// the crate root right below this module.
mod embedded_migrations {
    use refinery::embed_migrations;
    embed_migrations!("migrations");
}
207| |
208| |pub use embedded_migrations::migrations;
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/lock.rs:
1| |//! Counting semaphore via lock files to limit parallel CLI invocations.
2| |//!
3| |//! `acquire_cli_slot` tries to acquire one of `N` available slots by opening the file
4| |//! `cli-slot-{N}.lock` in the OS cache directory and obtaining an exclusive `flock`.
5| |//! The returned [`std::fs::File`] MUST be kept alive for the entire duration of `main`;
6| |//! dropping it releases the slot automatically for the next invocation.
7| |//!
8| |//! When `wait_seconds` is `Some(n) > 0`, the function polls every
9| |//! [`crate::constants::CLI_LOCK_POLL_INTERVAL_MS`] milliseconds until the deadline. When it
10| |//! is `None` or `Some(0)`, a single attempt is made and `Err(AppError::AllSlotsFull)` is
11| |//! returned immediately if all slots are occupied.
12| |//!
13| |//! ## Job-type singleton (G28-B, v1.0.68)
14| |//!
15| |//! Heavy long-running jobs (`enrich`, `ingest --mode claude-code`,
16| |//! `ingest --mode codex`) also acquire a *singleton* lock per `(job_type,
17| |//! namespace)` via `acquire_job_singleton`. This guarantees at most one
18| |//! heavy job per namespace runs at any time, which was the root cause
19| |//! of the 2026-06-03 process-proliferation incident (4 parallel `enrich`
20| |//! instances × N workers × 10 MCP servers = ~192 spawned processes).
21| |// Workload: I/O-bound (flock polling with exponential backoff sleep)
22| |
23| |use std::fs::{File, OpenOptions};
24| |use std::path::{Path, PathBuf};
25| |use std::thread;
26| |use std::time::{Duration, Instant};
27| |
28| |use directories::ProjectDirs;
29| |use fs4::fs_std::FileExt;
30| |
31| |use crate::constants::{
32| | CLI_LOCK_POLL_INTERVAL_MS, JOB_SINGLETON_POLL_INTERVAL_MS, MAX_CONCURRENT_CLI_INSTANCES,
33| |};
34| |use crate::errors::AppError;
35| |
/// Identifies which heavy job is requesting a singleton lock through
/// `acquire_job_singleton`.
///
/// There is deliberately no `Light` variant: lightweight commands
/// (`recall`, `stats`, `read`, `list`) go through the counting semaphore in
/// [`acquire_cli_slot`] and never take a singleton.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum JobType {
    /// The `enrich` command (LLM-driven entity/relation/body enrichment).
    Enrich,
    /// `ingest --mode claude-code` (LLM-curated ingestion).
    IngestClaudeCode,
    /// `ingest --mode codex` (OpenAI Codex CLI ingestion).
    IngestCodex,
}

impl JobType {
    /// Kebab-case tag embedded in the singleton lock file name.
    fn tag(self) -> &'static str {
        match self {
            Self::Enrich => "enrich",
            Self::IngestClaudeCode => "ingest-claude-code",
            Self::IngestCodex => "ingest-codex",
        }
    }
}
61| |
62| |/// Returns the lock file path for the given slot.
63| |///
64| |/// Honours `SQLITE_GRAPHRAG_CACHE_DIR` when set (useful for tests, containers,
65| |/// and NFS caches), falling back to the OS default cache directory via
66| |/// `directories::ProjectDirs`. The slot must be 1-based.
67| 0|fn slot_path(slot: usize) -> Result<PathBuf, AppError> {
68| 0| let cache = cache_dir()?;
69| 0| std::fs::create_dir_all(&cache)?;
70| 0| Ok(cache.join(format!("cli-slot-{slot}.lock")))
71| 0|}
72| |
73| |/// Resolves the lock-file directory honouring `SQLITE_GRAPHRAG_CACHE_DIR`.
74| 7|fn cache_dir() -> Result<PathBuf, AppError> {
75| 7| if let Some(override_dir) = std::env::var_os("SQLITE_GRAPHRAG_CACHE_DIR") {
^0
76| 0| Ok(PathBuf::from(override_dir))
77| | } else {
78| 7| let dirs = ProjectDirs::from("", "", "sqlite-graphrag").ok_or_else(|| {
^0
79| 0| AppError::Io(std::io::Error::new(
80| 0| std::io::ErrorKind::NotFound,
81| 0| "could not determine cache directory for sqlite-graphrag lock files",
82| 0| ))
83| 0| })?;
84| 7| Ok(dirs.cache_dir().to_path_buf())
85| | }
86| 7|}
87| |
88| |/// Computes a short, filesystem-safe hash of the database path so two distinct
89| |/// databases (e.g. `/tmp/a.sqlite` and `/tmp/b.sqlite`) get distinct lock
90| |/// files in the shared cache directory. First 12 hex chars of BLAKE3 are
91| |/// sufficient for collision avoidance across the local filesystem.
92| 10|pub fn db_path_hash(db_path: &Path) -> String {
93| 10| let canonical = db_path
94| 10| .canonicalize()
95| 10| .unwrap_or_else(|_| db_path.to_path_buf());
96| 10| let hash = blake3::hash(canonical.to_string_lossy().as_bytes());
97| 10| hash.to_hex().to_string()[..12].to_string()
98| 10|}
99| |
100| |/// Returns the singleton lock file path for a given (job_type, namespace, db_hash).
101| |///
102| |/// Layout: `job-singleton-{tag}-{namespace_slug}-{db_hash}.lock` in the same
103| |/// cache dir as the CLI slots. The namespace is sanitised to a filesystem-safe
104| |/// slug (lowercase, hyphens, alphanumeric) and defaults to `default` when
105| |/// empty. The `db_hash` is the BLAKE3 prefix returned by [`db_path_hash`].
106| |///
107| |/// G30 (v1.0.69): the previous implementation ignored the database path
108| |/// entirely, so two concurrent `enrich` invocations against different
109| |/// `graphrag.sqlite` files (production vs. test) collided on the same
110| |/// cache-dir lock. The db_hash scope makes the singleton per-database while
111| |/// still sharing the same cache dir.
112| 7|pub fn job_singleton_path(
113| 7| job_type: JobType,
114| 7| namespace: &str,
115| 7| db_hash: &str,
116| 7|) -> Result<PathBuf, AppError> {
117| 7| let cache = cache_dir()?;
^0
118| 7| std::fs::create_dir_all(&cache)?;
^0
119| 7| let slug = if namespace.is_empty() {
120| 0| "default".to_string()
121| | } else {
122| 7| namespace
123| 7| .chars()
124| 83| .map(|c| {
^7
125| 83| if c.is_ascii_alphanumeric() || c == '-' || c == '_' {
^14 ^2
126| 81| c.to_ascii_lowercase()
127| | } else {
128| 2| '-'
129| | }
130| 83| })
131| 7| .collect::<String>()
132| | };
133| 7| let safe_hash: String = db_hash
134| 7| .chars()
135| 84| .filter(|c| c.is_ascii_alphanumeric())
^7
136| 7| .take(16)
137| 7| .collect();
138| 7| Ok(cache.join(format!(
139| 7| "job-singleton-{}-{slug}-{safe_hash}.lock",
140| 7| job_type.tag()
141| 7| )))
142| 7|}
143| |
144| |/// Tries to open and exclusively lock the lock file for the given slot.
145| |///
146| |/// Returns `Ok(file)` if the slot is free, or `Err(io::Error)` if it is
147| |/// held by another instance (non-blocking).
148| 0|fn try_acquire_slot(slot: usize) -> Result<File, AppError> {
149| 0| let path = slot_path(slot)?;
150| 0| let file = OpenOptions::new()
151| 0| .read(true)
152| 0| .write(true)
153| 0| .create(true)
154| 0| .truncate(false)
155| 0| .open(&path)?;
156| 0| file.try_lock_exclusive().map_err(AppError::Io)?;
157| 0| Ok(file)
158| 0|}
159| |
160| |/// Acquires a concurrency slot from the `max_concurrency`-position semaphore.
161| |///
162| |/// Iterates slots `1..=max_concurrency` attempting `try_lock_exclusive` on each
163| |/// `cli-slot-N.lock` file. When a free slot is found, returns `(File, slot_number)`.
164| |/// If all slots are occupied:
165| |///
166| |/// - If `wait_seconds` is `None` or `Some(0)`, returns immediately with
167| |/// `AppError::AllSlotsFull { max, waited_secs: 0 }`.
168| |/// - If `wait_seconds` is `Some(n) > 0`, enters a polling loop every
169| |/// [`crate::constants::CLI_LOCK_POLL_INTERVAL_MS`] ms until the deadline expires, returning
170| |/// `AppError::AllSlotsFull { max, waited_secs: n }` if no slot opens.
171| |///
172| |/// The returned `File` MUST be kept alive until the process exits; dropping it
173| |/// releases the slot automatically via the implicit `flock` on close.
174| 0|pub fn acquire_cli_slot(
175| 0| max_concurrency: usize,
176| 0| wait_seconds: Option<u64>,
177| 0|) -> Result<(File, usize), AppError> {
178| | // G18: use env override or 2*cpus as ceiling instead of hardcoded 4
179| 0| let ncpus = std::thread::available_parallelism()
180| 0| .map(|n| n.get())
181| 0| .unwrap_or(4);
182| 0| let ceiling = std::env::var("SQLITE_GRAPHRAG_MAX_CLI_INSTANCES")
183| 0| .ok()
184| 0| .and_then(|v| v.parse::<usize>().ok())
185| 0| .unwrap_or_else(|| (2 * ncpus).max(MAX_CONCURRENT_CLI_INSTANCES));
186| 0| let max = max_concurrency.clamp(1, ceiling);
187| 0| let wait_secs = wait_seconds.unwrap_or(0);
188| |
189| | // Tentativa inicial sem espera.
190| 0| if let Some((file, slot)) = try_any_slot(max)? {
191| 0| return Ok((file, slot));
192| 0| }
193| |
194| 0| if wait_secs == 0 {
195| 0| return Err(AppError::AllSlotsFull {
196| 0| max,
197| 0| waited_secs: 0,
198| 0| });
199| 0| }
200| |
201| | // Polling loop with progressive backoff until the deadline.
202| 0| let deadline = Instant::now() + Duration::from_secs(wait_secs);
203| 0| let mut polls: u64 = 0;
204| | loop {
205| 0| let poll_delay = CLI_LOCK_POLL_INTERVAL_MS
206| 0| .saturating_mul(1 + polls / 4)
207| 0| .min(CLI_LOCK_POLL_INTERVAL_MS * 4);
208| 0| thread::sleep(Duration::from_millis(poll_delay));
209| 0| polls += 1;
210| 0| if let Some((file, slot)) = try_any_slot(max)? {
211| 0| return Ok((file, slot));
212| 0| }
213| 0| if Instant::now() >= deadline {
214| 0| return Err(AppError::AllSlotsFull {
215| 0| max,
216| 0| waited_secs: wait_secs,
217| 0| });
218| 0| }
219| | }
220| 0|}
221| |
222| |/// Acquires a process-wide singleton lock for a heavy job type and namespace.
223| |///
224| |/// G28-B (v1.0.68): ensures at most one `enrich`, `ingest --mode
225| |/// claude-code`, or `ingest --mode codex` runs at a time per namespace.
226| |/// A second invocation in the same namespace either:
227| |///
228| |/// - Returns immediately with `AppError::JobSingletonLocked { job_type,
229| |/// namespace }` when `wait_seconds` is `None` or `Some(0)`.
230| |/// - Polls every [`JOB_SINGLETON_POLL_INTERVAL_MS`] ms until the lock
231| |/// drops or the deadline expires, returning the same error on timeout.
232| |///
233| |/// The returned `File` MUST be kept alive until the process exits;
234| |/// dropping it releases the singleton for the next invocation.
235| 6|pub fn acquire_job_singleton(
236| 6| job_type: JobType,
237| 6| namespace: &str,
238| 6| db_path: &Path,
239| 6| wait_seconds: Option<u64>,
240| 6| force: bool,
241| 6|) -> Result<File, AppError> {
242| 6| let db_hash = db_path_hash(db_path);
243| 6| let path = job_singleton_path(job_type, namespace, &db_hash)?;
^0
244| |
245| | // G30+G09: when --force is set, attempt to break a stale lock by
246| | // detecting and removing a pre-existing lock file. This is a last
247| | // resort: only enabled by an explicit operator flag. A real orphan
248| | // lock from a previous crash leaves a 0-byte file behind, which the
249| | // next non-forced caller would still try to lock.
250| 6| if force && path.exists() {
^0
251| 0| tracing::warn!(target: "lock",
252| 0| path = %path.display(),
253| 0| "force=true; removing pre-existing singleton lock file"
254| | );
255| 0| let _ = std::fs::remove_file(&path);
256| 6| }
257| |
258| 6| let file = OpenOptions::new()
259| 6| .read(true)
260| 6| .write(true)
261| 6| .create(true)
262| 6| .truncate(false)
263| 6| .open(&path)?;
^0
264| 6| if let Err(e) = file.try_lock_exclusive() {
^1
265| 1| if !is_lock_contended(&e) {
266| 0| return Err(AppError::Io(e));
267| 1| }
268| | // Already held by another instance.
269| 1| let wait_secs = wait_seconds.unwrap_or(0);
270| 1| if wait_secs == 0 {
271| 1| return Err(AppError::JobSingletonLocked {
272| 1| job_type: job_type.tag().to_string(),
273| 1| namespace: namespace.to_string(),
274| 1| });
275| 0| }
276| 0| let deadline = Instant::now() + Duration::from_secs(wait_secs);
277| | // Drop the failed handle before polling; flock is per-process so we
278| | // re-open each attempt to refresh contention state.
279| 0| drop(file);
280| | loop {
281| 0| thread::sleep(Duration::from_millis(JOB_SINGLETON_POLL_INTERVAL_MS));
282| 0| let file = OpenOptions::new()
283| 0| .read(true)
284| 0| .write(true)
285| 0| .create(true)
286| 0| .truncate(false)
287| 0| .open(&path)?;
288| 0| if file.try_lock_exclusive().is_ok() {
289| 0| return Ok(file);
290| 0| }
291| 0| if Instant::now() >= deadline {
292| 0| return Err(AppError::JobSingletonLocked {
293| 0| job_type: job_type.tag().to_string(),
294| 0| namespace: namespace.to_string(),
295| 0| });
296| 0| }
297| | }
298| 5| }
299| 5| Ok(file)
300| 6|}
301| |
302| |/// Tries to acquire any free slot in `1..=max`, returning the first available one.
303| |///
304| |/// Returns `Ok(Some((file, slot)))` if a slot was obtained, `Ok(None)` if all are
305| |/// occupied (`EWOULDBLOCK`). Propagates I/O errors other than "lock contended".
306| 0|fn try_any_slot(max: usize) -> Result<Option<(File, usize)>, AppError> {
307| 0| for slot in 1..=max {
308| 0| match try_acquire_slot(slot) {
309| 0| Ok(file) => return Ok(Some((file, slot))),
310| 0| Err(AppError::Io(e)) if is_lock_contended(&e) => continue,
311| 0| Err(e) => return Err(e),
312| | }
313| | }
314| 0| Ok(None)
315| 0|}
316| |
/// Tells whether a failed lock attempt means "held by someone else" rather
/// than a genuine I/O failure.
///
/// `WouldBlock` counts as contention on every platform. On Windows, raw OS
/// errors 32 and 33 (`ERROR_SHARING_VIOLATION` and `ERROR_LOCK_VIOLATION`) are
/// accepted as well; on other platforms those codes are not considered.
fn is_lock_contended(error: &std::io::Error) -> bool {
    let would_block = matches!(error.kind(), std::io::ErrorKind::WouldBlock);
    // `cfg!(windows)` is a compile-time constant, so the raw-code check is
    // effectively compiled out elsewhere.
    let windows_violation = cfg!(windows) && matches!(error.raw_os_error(), Some(32 | 33));
    would_block || windows_violation
}
332| |
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::atomic::{AtomicUsize, Ordering};

    /// Counter that keeps generated names distinct within this test process.
    static SEQ: AtomicUsize = AtomicUsize::new(0);

    /// Produces a name unique to this process and call.
    fn unique_ns() -> String {
        let pid = std::process::id();
        let n = SEQ.fetch_add(1, Ordering::SeqCst);
        format!("test-{pid}-{n}")
    }

    /// Builds a unique database path under the temp directory (never created).
    fn temp_db(prefix: &str) -> PathBuf {
        std::env::temp_dir().join(format!("{prefix}-{}.sqlite", unique_ns()))
    }

    #[test]
    fn job_singleton_path_sanitises_namespace() {
        let path = job_singleton_path(JobType::Enrich, "Foo Bar/Baz", "abc123def456")
            .expect("path should resolve");
        let name = path.file_name().unwrap().to_string_lossy().to_string();
        assert!(name.contains("enrich"), "got {name}");
        assert!(name.contains("foo-bar-baz"), "got {name}");
        assert!(
            name.contains("abc123def456"),
            "must embed db_hash: got {name}"
        );
    }

    #[test]
    fn job_singleton_blocks_second_invocation_same_namespace() {
        let ns = unique_ns();
        let db = temp_db("test");
        let holder = acquire_job_singleton(JobType::Enrich, &ns, &db, Some(0), false)
            .expect("first acquire should succeed");
        let second = acquire_job_singleton(JobType::Enrich, &ns, &db, Some(0), false);
        assert!(
            matches!(second, Err(AppError::JobSingletonLocked { .. })),
            "expected JobSingletonLocked, got {second:?}"
        );
        drop(holder);
    }

    #[test]
    fn job_singleton_allows_different_namespaces() {
        let (ns_a, ns_b) = (unique_ns(), unique_ns());
        let (db_a, db_b) = (temp_db("test-a"), temp_db("test-b"));
        let first = acquire_job_singleton(JobType::IngestClaudeCode, &ns_a, &db_a, Some(0), false)
            .expect("ns_a should acquire");
        let second = acquire_job_singleton(JobType::IngestClaudeCode, &ns_b, &db_b, Some(0), false)
            .expect("ns_b should acquire in parallel");
        drop(first);
        drop(second);
    }

    #[test]
    fn job_singleton_scoped_by_db_hash() {
        // G30: same namespace, two different database paths. Both locks must
        // succeed because the path-derived db_hash differs.
        let ns = unique_ns();
        let (db_a, db_b) = (temp_db("test-x"), temp_db("test-y"));
        let first = acquire_job_singleton(JobType::Enrich, &ns, &db_a, Some(0), false)
            .expect("db_a should acquire");
        let second = acquire_job_singleton(JobType::Enrich, &ns, &db_b, Some(0), false)
            .expect("db_b should acquire independently (G30 fix)");
        drop(first);
        drop(second);
    }

    #[test]
    fn db_path_hash_is_stable_for_same_path() {
        let path = std::env::temp_dir().join("hashing-test.sqlite");
        let (h1, h2) = (db_path_hash(&path), db_path_hash(&path));
        assert_eq!(h1, h2, "same path must produce same hash");
        assert_eq!(h1.len(), 12, "BLAKE3 prefix must be 12 hex chars");
    }

    #[test]
    fn db_path_hash_differs_for_different_paths() {
        let tmp = std::env::temp_dir();
        let hash_a = db_path_hash(&tmp.join("hash-a.sqlite"));
        let hash_b = db_path_hash(&tmp.join("hash-b.sqlite"));
        assert_ne!(hash_a, hash_b);
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/memory_guard.rs:
1| |//! Memory guard: checks RAM availability before loading the ONNX model.
2| |//!
3| |//! Loading the model via `fastembed` consumes approximately
4| |//! [`crate::constants::EMBEDDING_LOAD_EXPECTED_RSS_MB`] MiB of resident memory.
5| |//! Without this guard, multiple parallel invocations can exhaust RAM and trigger
6| |//! OOM (Out-Of-Memory), stalling the system.
7| |//!
8| |//! This guard queries the OS via `sysinfo` before any heavy initialisation,
9| |//! aborting with [`crate::errors::AppError::LowMemory`] (exit 77) when the
10| |//! configured floor is not met.
11| |
12| |use sysinfo::{
13| | get_current_pid, MemoryRefreshKind, ProcessRefreshKind, ProcessesToUpdate, RefreshKind, System,
14| | UpdateKind,
15| |};
16| |
17| |use crate::errors::AppError;
18| |
19| |/// Returns the current available memory in MiB.
20| 3|pub fn available_memory_mb() -> u64 {
21| 3| let sys =
22| 3| System::new_with_specifics(RefreshKind::new().with_memory(MemoryRefreshKind::everything()));
23| 3| let available_bytes = sys.available_memory();
24| 3| available_bytes / (1024 * 1024)
25| 3|}
26| |
27| |/// Returns the current process RSS in MiB when available.
28| 1|pub fn current_process_memory_mb() -> Option<u64> {
29| 1| let pid = get_current_pid().ok()?;
^0
30| 1| let mut sys =
31| 1| System::new_with_specifics(RefreshKind::new().with_memory(MemoryRefreshKind::everything()));
32| 1| sys.refresh_processes_specifics(
33| 1| ProcessesToUpdate::Some(&[pid]),
34| | true,
35| 1| ProcessRefreshKind::new()
36| 1| .with_memory()
37| 1| .with_exe(UpdateKind::OnlyIfNotSet),
38| | );
39| 1| sys.process(pid).map(|p| p.memory() / (1024 * 1024))
40| 1|}
41| |
/// Calculates the safe concurrency ceiling for heavy embedding workloads.
///
/// Formula:
/// `permits = min(cpus, available_memory_mb / ram_per_task_mb, max_concurrency)`
///
/// No extra safety margin is applied (the former `* 0.5` factor was removed in
/// G18). Zero inputs for `cpu_count`, `ram_per_task_mb` or `max_concurrency`
/// are treated as `1`, and the result is always in `1..=max_concurrency`.
///
/// # Parameters
///
/// - `available_mb`: memory currently available, in MiB.
/// - `cpu_count`: number of CPUs to consider.
/// - `ram_per_task_mb`: expected memory cost of one task, in MiB.
/// - `max_concurrency`: hard upper bound requested by the caller.
pub fn calculate_safe_concurrency(
    available_mb: u64,
    cpu_count: usize,
    ram_per_task_mb: u64,
    max_concurrency: usize,
) -> usize {
    let cpu_count = cpu_count.max(1);
    let max_concurrency = max_concurrency.max(1);
    let ram_per_task_mb = ram_per_task_mb.max(1);

    // Saturate instead of truncating with `as`: on 32-bit targets a large
    // quotient would otherwise wrap around to a small (or zero) bound.
    let memory_bound = usize::try_from(available_mb / ram_per_task_mb).unwrap_or(usize::MAX);
    let resource_bound = cpu_count.min(memory_bound).max(1);
    // G18: removed unconditional /2 margin — callers should pass lower ram_per_task_mb
    // when daemon is active (model shared) instead of halving the result
    resource_bound.min(max_concurrency)
}
64| |
65| |/// Checks whether sufficient memory is available to start loading the model.
66| |///
67| |/// # Parameters
68| |/// - `min_mb`: minimum floor in MiB of available memory (typically
69| |/// [`crate::constants::MIN_AVAILABLE_MEMORY_MB`]).
70| |///
71| |/// # Errors
72| |/// Returns [`AppError::LowMemory`] when `available_mb < min_mb`.
73| |///
74| |/// # Returns
75| |/// Returns `Ok(available_mb)` with the actual available memory in MiB.
76| 3|pub fn check_available_memory(min_mb: u64) -> Result<u64, AppError> {
77| 3| let available_mb = available_memory_mb();
78| |
79| 3| if available_mb < min_mb {
80| 2| return Err(AppError::LowMemory {
81| 2| available_mb,
82| 2| required_mb: min_mb,
83| 2| });
84| 1| }
85| |
86| 1| Ok(available_mb)
87| 3|}
88| |
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn check_available_memory_with_zero_always_passes() {
        let result = check_available_memory(0);
        assert!(result.is_ok(), "min_mb=0 must always pass, got: {result:?}");
        let mb = result.unwrap();
        assert!(mb > 0, "system must report positive memory");
    }

    #[test]
    fn check_available_memory_with_huge_value_fails() {
        let result = check_available_memory(u64::MAX);
        assert!(
            matches!(result, Err(AppError::LowMemory { .. })),
            "u64::MAX MiB must fail with LowMemory, got: {result:?}"
        );
    }

    #[test]
    fn low_memory_error_contains_correct_values() {
        match check_available_memory(u64::MAX) {
            Err(AppError::LowMemory {
                available_mb,
                required_mb,
            }) => {
                assert_eq!(required_mb, u64::MAX);
                assert!(available_mb < u64::MAX);
            }
            other => unreachable!("expected LowMemory, got: {other:?}"),
        }
    }

    /// The former `* 0.5` margin was removed (G18). The second case is the one
    /// that would fail if halving were reintroduced.
    #[test]
    fn calculate_safe_concurrency_applies_no_half_margin() {
        assert_eq!(calculate_safe_concurrency(8_000, 8, 1_000, 4), 4);
        assert_eq!(calculate_safe_concurrency(8_000, 8, 1_000, 16), 8);
    }

    #[test]
    fn calculate_safe_concurrency_is_bounded_by_memory() {
        assert_eq!(calculate_safe_concurrency(3_000, 8, 1_000, 16), 3);
    }

    #[test]
    fn calculate_safe_concurrency_never_returns_zero() {
        let permits = calculate_safe_concurrency(100, 1, 10_000, 4);
        assert_eq!(permits, 1);
        assert_eq!(calculate_safe_concurrency(0, 0, 0, 0), 1);
    }

    #[test]
    fn calculate_safe_concurrency_respects_max_ceiling() {
        let permits = calculate_safe_concurrency(128_000, 64, 500, 4);
        assert_eq!(permits, 4);
    }

    #[test]
    fn current_process_memory_mb_returns_some_value() {
        let rss = current_process_memory_mb();
        assert!(rss.is_some());
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/memory_source.rs:
1| |//! Type-safe enumeration of the `memories.source` column domain.
2| |//!
3| |//! The CHECK constraint on the `memories` table accepts exactly five values:
4| |//! `agent`, `user`, `system`, `import`, `sync`. Any other literal is rejected
5| |//! at runtime by SQLite with `SQLITE_CONSTRAINT_CHECK`.
6| |//!
7| |//! This enum eliminates the silent footgun of `pub source: String` by forcing
8| |//! every call-site to pick a typed variant that maps deterministically to one
9| |//! of the five allowed CHECK values via [`MemorySource::as_str`].
10| |//!
11| |//! # Examples
12| |//!
13| |//! ```
14| |//! use sqlite_graphrag::memory_source::MemorySource;
15| |//!
16| |//! let src = MemorySource::Agent;
17| |//! assert_eq!(src.as_str(), "agent");
18| |//!
19| |//! let parsed = MemorySource::try_from("user").expect("user is valid");
20| |//! assert_eq!(parsed, MemorySource::User);
21| |//!
22| |//! let err = MemorySource::try_from("enrich").unwrap_err();
23| |//! assert!(format!("{err}").contains("invalid memory source"));
24| |//! ```
25| |
26| |use crate::errors::AppError;
27| |use serde::{Deserialize, Serialize};
28| |
/// Enumerates the five values accepted by the `memories.source` CHECK constraint.
///
/// The serde representation uses `snake_case` variant names, so a variant
/// serialises to the same text that `as_str` returns (e.g. `Import` is
/// written as `"import"`).
///
/// Adding a new variant requires:
///
/// 1. Updating the DDL CHECK constraint in `migrations/V001__init.sql`.
/// 2. Running a migration that backfills any pre-existing values
///    (`UPDATE memories SET source='agent' WHERE source NOT IN (...)`).
/// 3. Bumping [`crate::constants::CURRENT_SCHEMA_VERSION`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum MemorySource {
    /// Mutated by an LLM agent (remember, edit, rename, body-enrich).
    Agent,
    /// Mutated by a human operator.
    User,
    /// Mutated by an internal migration or system job.
    System,
    /// Inserted by bulk import (ingest, ingest --mode claude-code, ingest --mode codex).
    Import,
    /// Inserted by an external sync job.
    Sync,
}
51| |
52| |impl MemorySource {
53| | /// Returns the canonical snake_case string stored in the SQLite column.
54| | ///
55| | /// The returned slice has `'static` lifetime because all five values are
56| | /// ASCII literals known at compile time.
57| 25| pub const fn as_str(self) -> &'static str {
58| 25| match self {
59| 5| Self::Agent => "agent",
60| 5| Self::User => "user",
61| 5| Self::System => "system",
62| 5| Self::Import => "import",
63| 5| Self::Sync => "sync",
64| | }
65| 25| }
66| |
67| | /// Returns every variant as a static slice, useful for error messages and docs.
68| | pub const ALL: &'static [MemorySource] = &[
69| | Self::Agent,
70| | Self::User,
71| | Self::System,
72| | Self::Import,
73| | Self::Sync,
74| | ];
75| |}
76| |
77| |impl std::fmt::Display for MemorySource {
78| 5| fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
79| 5| f.write_str(self.as_str())
80| 5| }
81| |}
82| |
83| |/// Parses a stored `memories.source` string back into a typed variant.
84| |///
85| |/// # Errors
86| |///
87| |/// Returns [`AppError::Validation`] when the input is not one of the five
88| |/// canonical values. The error message lists every accepted value so the
89| |/// caller can self-correct without consulting the schema.
90| |impl TryFrom<&str> for MemorySource {
91| | type Error = AppError;
92| |
93| 8| fn try_from(value: &str) -> Result<Self, Self::Error> {
94| 8| match value {
95| 8| "agent" => Ok(Self::Agent),
^2
96| 6| "user" => Ok(Self::User),
^1
97| 5| "system" => Ok(Self::System),
^1
98| 4| "import" => Ok(Self::Import),
^1
99| 3| "sync" => Ok(Self::Sync),
^1
100| 2| other => Err(AppError::Validation(format!(
101| 2| "invalid memory source: {other:?}; expected one of {}",
102| 2| Self::ALL
103| 2| .iter()
104| 10| .map(|v| v.as_str())
^2
105| 2| .collect::<Vec<_>>()
106| 2| .join(", ")
107| | ))),
108| | }
109| 8| }
110| |}
111| |
112| |impl TryFrom<String> for MemorySource {
113| | type Error = AppError;
114| |
115| 1| fn try_from(value: String) -> Result<Self, Self::Error> {
116| 1| Self::try_from(value.as_str())
117| 1| }
118| |}
119| |
120| |/// Validates a raw `memories.source` string against the CHECK constraint domain.
121| |///
122| |/// This is the runtime guard for callers that still take `&str` (legacy
123| |/// call-sites, FTS rows already in the database, deserialised JSON). The
124| |/// function returns the canonical slice on success and an [`AppError::Validation`]
125| |/// on failure, with an actionable message listing every accepted value.
126| |///
127| |/// Use this at every boundary that touches the `source` column:
128| |/// `memories::insert`, `memories::update`, and any new code path that
129| |/// builds a `NewMemory` from operator-supplied input. It is the safety
130| |/// net that prevented the original G29 bug from regressing in v1.0.69
131| |/// when the typed [`MemorySource`] enum was still being rolled out.
132| 43|pub fn validate_source(raw: &str) -> Result<&'static str, AppError> {
133| 43| match raw {
134| 43| "agent" => Ok("agent"),
135| 0| "user" => Ok("user"),
136| 0| "system" => Ok("system"),
137| 0| "import" => Ok("import"),
138| 0| "sync" => Ok("sync"),
139| 0| other => Err(AppError::Validation(format!(
140| 0| "invalid memory source: {other:?}; expected one of {}",
141| 0| MemorySource::ALL
142| 0| .iter()
143| 0| .map(|v| v.as_str())
144| 0| .collect::<Vec<_>>()
145| 0| .join(", ")
146| | ))),
147| | }
148| 43|}
149| |
#[cfg(test)]
mod tests {
    use super::*;

    /// Every variant paired with its canonical column value.
    const CANONICAL: [(MemorySource, &str); 5] = [
        (MemorySource::Agent, "agent"),
        (MemorySource::User, "user"),
        (MemorySource::System, "system"),
        (MemorySource::Import, "import"),
        (MemorySource::Sync, "sync"),
    ];

    #[test]
    fn as_str_returns_canonical_lowercase() {
        for &(variant, text) in &CANONICAL {
            assert_eq!(variant.as_str(), text);
        }
    }

    #[test]
    fn try_from_valid_strings_succeeds() {
        for &(variant, text) in &CANONICAL {
            assert_eq!(MemorySource::try_from(text).unwrap(), variant);
        }
    }

    #[test]
    fn try_from_invalid_string_returns_err() {
        // G29 reproducer: "enrich" is the historical bug.
        let msg = MemorySource::try_from("enrich").unwrap_err().to_string();
        assert!(msg.contains("invalid memory source"), "got: {msg}");
        assert!(msg.contains("\"enrich\""), "got: {msg}");
        assert!(msg.contains("agent"), "must list agent as valid: {msg}");
    }

    #[test]
    fn try_from_empty_string_returns_err() {
        let outcome = MemorySource::try_from("");
        assert!(outcome.is_err());
    }

    #[test]
    fn try_from_string_owned_works() {
        let owned = String::from("agent");
        let src = MemorySource::try_from(owned).unwrap();
        assert_eq!(src, MemorySource::Agent);
    }

    #[test]
    fn display_matches_as_str() {
        for variant in MemorySource::ALL.iter() {
            assert_eq!(variant.to_string(), variant.as_str());
        }
    }

    #[test]
    fn serialize_round_trip_preserves_variant() {
        let original = MemorySource::Import;
        let json = serde_json::to_string(&original).unwrap();
        assert_eq!(json, "\"import\"");
        let restored: MemorySource = serde_json::from_str(&json).unwrap();
        assert_eq!(restored, original);
    }

    #[test]
    fn all_slice_has_exactly_five_variants() {
        let count = MemorySource::ALL.len();
        assert_eq!(count, 5);
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/namespace.rs:
1| |//! Namespace resolution layer (flag > env > "global" fallback).
2| |//!
3| |//! Validates and resolves the active namespace used to scope all SQLite
4| |//! operations, enforcing safe characters and traversal-free names.
5| |
6| |use crate::errors::AppError;
7| |use crate::i18n::validation;
8| |use serde::Serialize;
9| |use std::path::Path;
10| |
/// Records which input supplied the resolved namespace.
///
/// Serialised with `snake_case` variant names.
#[derive(Debug, Clone, Copy, Serialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum NamespaceSource {
    /// The namespace was passed explicitly by the caller.
    ExplicitFlag,
    /// Presumably the namespace came from the `SQLITE_GRAPHRAG_NAMESPACE`
    /// environment variable — TODO confirm against `detect_namespace`.
    Environment,
    /// Nothing was supplied explicitly; the `"global"` fallback was used.
    Default,
}
18| |
/// Outcome of namespace resolution, as returned by [`detect_namespace`].
#[derive(Debug, Clone, Serialize)]
pub struct NamespaceResolution {
    /// The resolved namespace name.
    pub namespace: String,
    /// Which input supplied `namespace`.
    pub source: NamespaceSource,
    /// Current directory reported alongside the resolution.
    /// NOTE(review): how this string is obtained and formatted is not visible
    /// here — confirm against `detect_namespace`.
    pub cwd: String,
}
25| |
26| |/// Resolves the active namespace, returning only the final name.
27| |///
28| |/// Shortcut over [`detect_namespace`] when the source does not matter.
29| |/// With a valid explicit flag, the returned namespace is exactly the passed value.
30| |/// Without a flag, the final fallback is `"global"`.
31| |///
32| |/// # Errors
33| |///
34| |/// Returns [`AppError::Validation`] if `explicit` contains invalid characters
35| |/// or exceeds 80 characters.
36| |///
37| |/// # Examples
38| |///
39| |/// ```
40| |/// use sqlite_graphrag::namespace::resolve_namespace;
41| |///
42| |/// // A valid explicit flag is accepted and reflected in the result.
43| |/// let ns = resolve_namespace(Some("meu-projeto")).unwrap();
44| |/// assert_eq!(ns, "meu-projeto");
45| |/// ```
46| |///
47| |/// ```
48| |/// use sqlite_graphrag::namespace::resolve_namespace;
49| |/// use sqlite_graphrag::errors::AppError;
50| |///
51| |/// // Namespace with invalid characters causes a validation error (exit 1).
52| |/// let err = resolve_namespace(Some("ns with space")).unwrap_err();
53| |/// assert_eq!(err.exit_code(), 1);
54| |/// ```
55| 1|pub fn resolve_namespace(explicit: Option<&str>) -> Result<String, AppError> {
56| 1| Ok(detect_namespace(explicit)?.namespace)
^0
57| 1|}
58| |
59| |/// Resolves the active namespace, returning a struct with the source and current directory.
60| |///
61| |/// Precedence: explicit flag > `SQLITE_GRAPHRAG_NAMESPACE` > fallback `"global"`.
62| |///
63| |/// # Errors
64| |///
65| |/// Returns [`AppError::Validation`] if the resolved namespace contains invalid characters.
66| |///
67| |/// # Examples
68| |///
69| |/// ```
70| |/// use sqlite_graphrag::namespace::{detect_namespace, NamespaceSource};
71| |///
72| |/// // With an explicit flag, the source is `ExplicitFlag`.
73| |/// let res = detect_namespace(Some("producao")).unwrap();
74| |/// assert_eq!(res.namespace, "producao");
75| |/// assert_eq!(res.source, NamespaceSource::ExplicitFlag);
76| |/// ```
77| |///
78| |/// ```
79| |/// use sqlite_graphrag::namespace::{detect_namespace, NamespaceSource};
80| |///
81| |/// // Without any explicit configuration, fallback is "global".
82| |/// // Removes env var to guarantee deterministic behaviour.
83| |/// std::env::remove_var("SQLITE_GRAPHRAG_NAMESPACE");
84| |/// let res = detect_namespace(None).unwrap();
85| |/// assert_eq!(res.namespace, "global");
86| |/// assert_eq!(res.source, NamespaceSource::Default);
87| |/// ```
88| 4|pub fn detect_namespace(explicit: Option<&str>) -> Result<NamespaceResolution, AppError> {
89| 4| let cwd = std::env::current_dir().map_err(AppError::Io)?;
^0
90| 4| let cwd_display = normalize_path(&cwd);
91| |
92| 4| if let Some(ns) = explicit {
^1
93| 1| validate_namespace(ns)?;
^0
94| 1| return Ok(NamespaceResolution {
95| 1| namespace: ns.to_owned(),
96| 1| source: NamespaceSource::ExplicitFlag,
97| 1| cwd: cwd_display,
98| 1| });
99| 3| }
100| |
101| 3| if let Ok(ns) = std::env::var("SQLITE_GRAPHRAG_NAMESPACE") {
^1
102| 1| if !ns.is_empty() {
103| 1| validate_namespace(&ns)?;
^0
104| 1| return Ok(NamespaceResolution {
105| 1| namespace: ns,
106| 1| source: NamespaceSource::Environment,
107| 1| cwd: cwd_display,
108| 1| });
109| 0| }
110| 2| }
111| |
112| 2| Ok(NamespaceResolution {
113| 2| namespace: "global".to_owned(),
114| 2| source: NamespaceSource::Default,
115| 2| cwd: cwd_display,
116| 2| })
117| 4|}
118| |
119| 2|fn validate_namespace(ns: &str) -> Result<(), AppError> {
120| 2| if ns.is_empty() || ns.len() > 80 {
121| 0| return Err(AppError::Validation(validation::namespace_length()));
122| 2| }
123| 2| if !ns
124| 2| .chars()
125| 30| .all(|c| c.is_alphanumeric() || c == '-' || c == '_')
^2 ^3 ^0
126| | {
127| 0| return Err(AppError::Validation(validation::namespace_format()));
128| 2| }
129| 2| Ok(())
130| 2|}
131| |
132| 4|fn normalize_path(path: &Path) -> String {
133| 4| path.canonicalize()
134| 4| .unwrap_or_else(|_| path.to_path_buf())
^0 ^0
135| 4| .display()
136| 4| .to_string()
137| 4|}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/output.rs:
1| |//! Single point of terminal I/O for the CLI (stdout JSON, stderr human).
2| |//!
3| |//! All user-visible output must go through this module; direct `println!` in
4| |//! other modules is forbidden.
5| |
6| |use crate::errors::AppError;
7| |use serde::Serialize;
8| |
9| |/// Output format variants accepted by `--format` CLI flags.
10| |#[derive(Debug, Clone, Copy, clap::ValueEnum, Default)]
11| |pub enum OutputFormat {
12| | #[default]
13| | Json,
14| | Text,
15| | Markdown,
16| |}
17| |
18| |/// Restricted JSON-only format for commands that always emit JSON.
19| |#[derive(Debug, Clone, Copy, clap::ValueEnum, Default)]
20| |pub enum JsonOutputFormat {
21| | #[default]
22| | Json,
23| |}
24| |
25| |/// Serializes `value` as pretty-printed JSON and writes it to stdout with a trailing newline.
26| |///
27| |/// Flushes stdout after writing. A `BrokenPipe` error is silenced so that
28| |/// piping to consumers that close early (e.g. `head`) does not surface an error.
29| |///
30| |/// # Errors
31| |/// Returns `Err` when serialization fails or when a non-`BrokenPipe` I/O error occurs.
32| |#[inline]
33| 3|pub fn emit_json<T: Serialize>(value: &T) -> Result<(), AppError> {
34| 3| let json = serde_json::to_string_pretty(value)?;
^2 ^1
35| 2| let mut out = std::io::stdout().lock();
36| 2| if let Err(e) = std::io::Write::write_all(&mut out, json.as_bytes())
^0
37| 2| .and_then(|()| std::io::Write::write_all(&mut out, b"\n"))
38| 2| .and_then(|()| std::io::Write::flush(&mut out))
39| | {
40| 0| if e.kind() == std::io::ErrorKind::BrokenPipe {
41| 0| return Ok(());
42| 0| }
43| 0| return Err(AppError::Io(e));
44| 2| }
45| 2| Ok(())
46| 3|}
47| |
48| |/// Serializes `value` as compact (single-line) JSON and writes it to stdout with a trailing newline.
49| |///
50| |/// Flushes stdout after writing. A `BrokenPipe` error is silenced.
51| |///
52| |/// # Errors
53| |/// Returns `Err` when serialization fails or when a non-`BrokenPipe` I/O error occurs.
54| |#[inline]
55| 2|pub fn emit_json_compact<T: Serialize>(value: &T) -> Result<(), AppError> {
56| 2| let json = serde_json::to_string(value)?;
^1 ^1
57| 1| let mut out = std::io::stdout().lock();
58| 1| if let Err(e) = std::io::Write::write_all(&mut out, json.as_bytes())
^0
59| 1| .and_then(|()| std::io::Write::write_all(&mut out, b"\n"))
60| 1| .and_then(|()| std::io::Write::flush(&mut out))
61| | {
62| 0| if e.kind() == std::io::ErrorKind::BrokenPipe {
63| 0| return Ok(());
64| 0| }
65| 0| return Err(AppError::Io(e));
66| 1| }
67| 1| Ok(())
68| 2|}
69| |
70| |/// Writes compact JSON to stdout, silently ignoring serialization and I/O errors.
71| |/// Designed for NDJSON streaming where partial output is acceptable.
72| |#[inline]
73| 0|pub fn emit_json_line<T: Serialize>(value: &T) {
74| 0| if let Ok(json) = serde_json::to_string(value) {
75| 0| let mut out = std::io::stdout().lock();
76| 0| let _ = std::io::Write::write_all(&mut out, json.as_bytes());
77| 0| let _ = std::io::Write::write_all(&mut out, b"\n");
78| 0| let _ = std::io::Write::flush(&mut out);
79| 0| }
80| 0|}
81| |
82| |/// Writes `msg` followed by a newline to stdout and flushes.
83| |///
84| |/// A `BrokenPipe` error is silenced gracefully.
85| |#[inline]
86| 1|pub fn emit_text(msg: &str) {
87| 1| let mut out = std::io::stdout().lock();
88| 1| let _ = std::io::Write::write_all(&mut out, msg.as_bytes())
89| 1| .and_then(|()| std::io::Write::write_all(&mut out, b"\n"))
90| 1| .and_then(|()| std::io::Write::flush(&mut out));
91| 1|}
92| |
93| |/// Logs `msg` as a structured `tracing::info!` event (does not write to stdout).
94| |#[inline]
95| 1|pub fn emit_progress(msg: &str) {
96| 1| tracing::info!(target: "output", message = msg);
97| 1|}
98| |
99| |/// Emits a bilingual progress message honouring `--lang` or `SQLITE_GRAPHRAG_LANG`.
100| |/// Usage: `output::emit_progress_i18n("Computing embedding...", "Calculando embedding...")`.
101| 0|pub fn emit_progress_i18n(en: &str, pt: &str) {
102| | use crate::i18n::{current, Language};
103| 0| match current() {
104| 0| Language::English => tracing::info!(target: "output", message = en),
105| 0| Language::Portuguese => tracing::info!(target: "output", message = pt),
106| | }
107| 0|}
108| |
109| |/// Emits a JSON error envelope to stdout for machine consumers.
110| |///
111| |/// Ensures the stdout JSON contract is honoured even on error paths:
112| |/// `{"error": true, "code": <exit_code>, "message": "<localized_msg>"}`.
113| |/// A `BrokenPipe` error is silenced so piping to early-closing consumers
114| |/// does not surface a secondary error.
115| |#[cold]
116| |#[inline(never)]
117| 0|pub fn emit_error_json(code: i32, message: &str) {
118| | #[derive(serde::Serialize)]
119| | struct ErrorEnvelope<'a> {
120| | error: bool,
121| | code: i32,
122| | message: &'a str,
123| | }
124| 0| let envelope = ErrorEnvelope {
125| 0| error: true,
126| 0| code,
127| 0| message,
128| 0| };
129| 0| if emit_json(&envelope).is_err() {
130| | use std::io::Write;
131| 0| let escaped = message.replace('\\', "\\\\").replace('"', "\\\"");
132| 0| let _ = writeln!(
133| 0| std::io::stdout().lock(),
134| 0| r#"{{"error":true,"code":{code},"message":"{escaped}"}}"#
135| | );
136| 0| }
137| 0|}
138| |
139| |/// Emits a localised error message to stderr with the `Error:`/`Erro:` prefix.
140| |///
141| |/// Centralises human-readable error output following Pattern 5 (`output.rs` is the
142| |/// SOLE I/O point of the CLI). Does not log via `tracing` — call `tracing::error!`
143| |/// explicitly before this function when structured observability is desired.
144| |#[cold]
145| |#[inline(never)]
146| 0|pub fn emit_error(localized_msg: &str) {
147| 0| tracing::error!(target: "output", message = localized_msg);
148| 0| eprintln!("{}: {}", crate::i18n::error_prefix(), localized_msg);
149| 0|}
150| |
151| |/// Emits a bilingual error to stderr honouring `--lang` or `SQLITE_GRAPHRAG_LANG`.
152| |/// Usage: `output::emit_error_i18n("invariant violated", "invariante violado")`.
153| |#[cold]
154| |#[inline(never)]
155| 0|pub fn emit_error_i18n(en: &str, pt: &str) {
156| | use crate::i18n::{current, Language};
157| 0| let msg = match current() {
158| 0| Language::English => en,
159| 0| Language::Portuguese => pt,
160| | };
161| 0| emit_error(msg);
162| 0|}
163| |
164| |/// JSON payload emitted by the `remember` subcommand.
165| |///
166| |/// All fields are required by the JSON contract (see `docs/schemas/remember.schema.json`).
167| |/// `operation` is an alias of `action` for compatibility with clients using the old field name.
168| |///
169| |/// # Examples
170| |///
171| |/// ```
172| |/// use sqlite_graphrag::output::RememberResponse;
173| |///
174| |/// let resp = RememberResponse {
175| |/// memory_id: 1,
176| |/// name: "nota-inicial".into(),
177| |/// namespace: "global".into(),
178| |/// action: "created".into(),
179| |/// operation: "created".into(),
180| |/// version: 1,
181| |/// entities_persisted: 0,
182| |/// relationships_persisted: 0,
183| |/// relationships_truncated: false,
184| |/// chunks_created: 1,
185| |/// chunks_persisted: 0,
186| |/// urls_persisted: 0,
187| |/// extraction_method: None,
188| |/// merged_into_memory_id: None,
189| |/// warnings: vec![],
190| |/// created_at: 1_700_000_000,
191| |/// created_at_iso: "2023-11-14T22:13:20Z".into(),
192| |/// elapsed_ms: 42,
193| |/// name_was_normalized: false,
194| |/// original_name: None,
195| |/// };
196| |///
197| |/// let json = serde_json::to_string(&resp).unwrap();
198| |/// assert!(json.contains("\"memory_id\":1"));
199| |/// assert!(json.contains("\"elapsed_ms\":42"));
200| |/// assert!(json.contains("\"merged_into_memory_id\":null"));
201| |/// assert!(json.contains("\"urls_persisted\":0"));
202| |/// assert!(json.contains("\"relationships_truncated\":false"));
203| |/// ```
204| |#[derive(Serialize)]
205| |pub struct RememberResponse {
206| | pub memory_id: i64,
207| | pub name: String,
208| | pub namespace: String,
209| | pub action: String,
210| | /// Semantic alias of `action` for compatibility with the contract documented in SKILL.md.
211| | pub operation: String,
212| | pub version: i64,
213| | pub entities_persisted: usize,
214| | pub relationships_persisted: usize,
215| | /// True when the relationship builder hit the cap before covering all entity pairs.
216| | /// Callers can use this to decide whether to increase GRAPHRAG_MAX_RELATIONSHIPS_PER_MEMORY.
217| | pub relationships_truncated: bool,
218| | /// Total number of chunks the body was split into BEFORE dedup.
219| | ///
220| | /// For single-chunk bodies this equals 1 even though no row is added to
221| | /// the `memory_chunks` table — the memory row itself acts as the chunk.
222| | /// Use `chunks_persisted` to know how many rows were actually written.
223| | pub chunks_created: usize,
224| | /// Number of chunks actually written to chunks/embeddings tables. Always <= chunks_created.
225| | ///
226| | /// Equal when no chunk had identical normalized text already in DB; less when dedup skipped
227| | /// some. Equals zero for single-chunk bodies (the memory row is the chunk) and equals
228| | /// `chunks_created` for multi-chunk bodies. Added in v1.0.23 to disambiguate from
229| | /// `chunks_created` and reflect database state precisely.
230| | pub chunks_persisted: usize,
231| | /// Number of unique URLs inserted into `memory_urls` for this memory.
232| | /// Added in v1.0.24 — split URLs out of the entity graph (P0-2 fix).
233| | #[serde(default)]
234| | pub urls_persisted: usize,
235| | /// Extraction method used: "gliner-{variant}+regex" or "regex-only". None when NER is not enabled.
236| | #[serde(skip_serializing_if = "Option::is_none")]
237| | pub extraction_method: Option<String>,
238| | pub merged_into_memory_id: Option<i64>,
239| | pub warnings: Vec<String>,
240| | /// Timestamp Unix epoch seconds.
241| | pub created_at: i64,
242| | /// RFC 3339 UTC timestamp string parallel to `created_at` for ISO 8601 parsers.
243| | pub created_at_iso: String,
244| | /// Total execution time in milliseconds from handler start to serialisation.
245| | pub elapsed_ms: u64,
246| | /// True when the user-supplied `--name` differed from the persisted slug
247| | /// (i.e. kebab-case normalization changed the value). Added in v1.0.32 so
248| | /// callers can detect normalization without parsing stderr WARN logs.
249| | #[serde(default)]
250| | pub name_was_normalized: bool,
251| | /// Original user-supplied `--name` value before normalization.
252| | /// Present only when `name_was_normalized == true`; omitted otherwise to
253| | /// keep the common (already-kebab) payload small.
254| | #[serde(skip_serializing_if = "Option::is_none")]
255| | pub original_name: Option<String>,
256| |}
257| |
258| |/// Individual item returned by the `recall` query.
259| |///
260| |/// The `memory_type` field is serialised as `"type"` in JSON to maintain
261| |/// compatibility with external clients — the Rust name uses `memory_type`
262| |/// to avoid conflict with the reserved keyword.
263| |///
264| |/// # Examples
265| |///
266| |/// ```
267| |/// use sqlite_graphrag::output::RecallItem;
268| |///
269| |/// let item = RecallItem {
270| |/// memory_id: 7,
271| |/// name: "nota-rust".into(),
272| |/// namespace: "global".into(),
273| |/// memory_type: "user".into(),
274| |/// description: "aprendizado de Rust".into(),
275| |/// snippet: "ownership e borrowing".into(),
276| |/// distance: 0.12,
277| |/// score: 0.88,
278| |/// source: "direct".into(),
279| |/// graph_depth: None,
280| |/// };
281| |///
282| |/// let json = serde_json::to_string(&item).unwrap();
283| |/// // Rust field `memory_type` appears as `"type"` in JSON.
284| |/// assert!(json.contains("\"type\":\"user\""));
285| |/// assert!(!json.contains("memory_type"));
286| |/// assert!(json.contains("\"distance\":0.12"));
287| |/// ```
288| |#[derive(Serialize, Clone)]
289| |pub struct RecallItem {
290| | pub memory_id: i64,
291| | pub name: String,
292| | pub namespace: String,
293| | #[serde(rename = "type")]
294| | pub memory_type: String,
295| | pub description: String,
296| | pub snippet: String,
297| | pub distance: f32,
298| | /// Cosine similarity in `[0.0, 1.0]` derived as `1.0 - distance` and clamped
299| | /// to that interval. Always populated to satisfy the documented contract
300| | /// (M-A5 in v1.0.40); higher means more similar. For graph hits the value
301| | /// reflects the hop-derived distance proxy and should be interpreted
302| | /// alongside `graph_depth` rather than as a true cosine score.
303| | pub score: f32,
304| | pub source: String,
305| | /// Number of graph hops between this match and the seed memories.
306| | ///
307| | /// Set to `None` for direct vector matches (where `distance` is meaningful)
308| | /// and to `Some(N)` for traversal results, with `N=0` when the depth could
309| | /// not be tracked precisely. Added in v1.0.23 to disambiguate graph results
310| | /// from the `distance: 0.0` placeholder previously used for graph entries.
311| | /// Field is omitted from JSON output when `None`.
312| | #[serde(skip_serializing_if = "Option::is_none")]
313| | pub graph_depth: Option<u32>,
314| |}
315| |
316| |impl RecallItem {
317| | /// Computes the similarity score from a vector distance, clamped to
318| | /// `[0.0, 1.0]`. Cosine distance returned by sqlite-vec lives in `[0, 2]`
319| | /// in theory but the embedder produces unit-norm vectors so the practical
320| | /// range is `[0, 1]`. Centralized so every constructor keeps the contract.
321| | #[inline]
322| 11| pub fn score_from_distance(distance: f32) -> f32 {
323| 11| let raw = 1.0 - distance;
324| 11| if raw.is_nan() {
325| 1| 0.0
326| | } else {
327| 10| raw.clamp(0.0, 1.0)
328| | }
329| 11| }
330| |}
331| |
332| |/// Full response envelope returned by the `recall` subcommand.
333| |///
334| |/// Contains both direct vector matches and graph-traversal matches, plus the
335| |/// aggregated `results` list that merges both for callers that do not need
336| |/// to distinguish the source.
337| |#[derive(Serialize)]
338| |pub struct RecallResponse {
339| | pub query: String,
340| | pub k: usize,
341| | pub direct_matches: Vec<RecallItem>,
342| | pub graph_matches: Vec<RecallItem>,
343| | /// Aggregated alias of `direct_matches` + `graph_matches` for the contract documented in SKILL.md.
344| | pub results: Vec<RecallItem>,
345| | /// Total execution time in milliseconds from handler start to serialisation.
346| | pub elapsed_ms: u64,
347| |}
348| |
349| |#[cfg(test)]
350| |mod tests {
351| | use super::*;
352| | use serde::Serialize;
353| |
354| | #[derive(Serialize)]
355| | struct Dummy {
356| | val: u32,
357| | }
358| |
359| | // Non-serializable type to force a JSON serialization error
360| | struct NotSerializable;
361| | impl Serialize for NotSerializable {
362| 2| fn serialize<S: serde::Serializer>(&self, _: S) -> Result<S::Ok, S::Error> {
363| 2| Err(serde::ser::Error::custom(
364| 2| "intentional serialization failure",
365| 2| ))
366| 2| }
367| | }
368| |
369| | #[test]
370| 1| fn emit_json_returns_ok_for_valid_value() {
371| 1| let v = Dummy { val: 42 };
372| 1| assert!(emit_json(&v).is_ok());
373| 1| }
374| |
375| | #[test]
376| 1| fn emit_json_returns_err_for_non_serializable_value() {
377| 1| let v = NotSerializable;
378| 1| assert!(emit_json(&v).is_err());
379| 1| }
380| |
381| | #[test]
382| 1| fn emit_json_compact_returns_ok_for_valid_value() {
383| 1| let v = Dummy { val: 7 };
384| 1| assert!(emit_json_compact(&v).is_ok());
385| 1| }
386| |
387| | #[test]
388| 1| fn emit_json_compact_returns_err_for_non_serializable_value() {
389| 1| let v = NotSerializable;
390| 1| assert!(emit_json_compact(&v).is_err());
391| 1| }
392| |
393| | #[test]
394| 1| fn emit_text_does_not_panic() {
395| 1| emit_text("mensagem de teste");
396| 1| }
397| |
398| | #[test]
399| 1| fn emit_progress_does_not_panic() {
400| 1| emit_progress("progresso de teste");
401| 1| }
402| |
403| | #[test]
404| 1| fn remember_response_serializes_correctly() {
405| 1| let r = RememberResponse {
406| 1| memory_id: 1,
407| 1| name: "teste".to_string(),
408| 1| namespace: "ns".to_string(),
409| 1| action: "created".to_string(),
410| 1| operation: "created".to_string(),
411| 1| version: 1,
412| 1| entities_persisted: 2,
413| 1| relationships_persisted: 3,
414| 1| relationships_truncated: false,
415| 1| chunks_created: 4,
416| 1| chunks_persisted: 4,
417| 1| urls_persisted: 2,
418| 1| extraction_method: None,
419| 1| merged_into_memory_id: None,
420| 1| warnings: vec!["aviso".to_string()],
421| 1| created_at: 1776569715,
422| 1| created_at_iso: "2026-04-19T03:34:15Z".to_string(),
423| 1| elapsed_ms: 123,
424| 1| name_was_normalized: false,
425| 1| original_name: None,
426| 1| };
427| 1| let json = serde_json::to_string(&r).unwrap();
428| 1| assert!(json.contains("memory_id"));
429| 1| assert!(json.contains("aviso"));
430| 1| assert!(json.contains("\"namespace\""));
431| 1| assert!(json.contains("\"merged_into_memory_id\""));
432| 1| assert!(json.contains("\"operation\""));
433| 1| assert!(json.contains("\"created_at\""));
434| 1| assert!(json.contains("\"created_at_iso\""));
435| 1| assert!(json.contains("\"elapsed_ms\""));
436| 1| assert!(json.contains("\"urls_persisted\""));
437| 1| assert!(json.contains("\"relationships_truncated\":false"));
438| 1| }
439| |
440| | #[test]
441| 1| fn recall_item_serializes_renamed_type_field() {
442| 1| let item = RecallItem {
443| 1| memory_id: 10,
444| 1| name: "entidade".to_string(),
445| 1| namespace: "ns".to_string(),
446| 1| memory_type: "entity".to_string(),
447| 1| description: "desc".to_string(),
448| 1| snippet: "trecho".to_string(),
449| 1| distance: 0.5,
450| 1| score: RecallItem::score_from_distance(0.5),
451| 1| source: "db".to_string(),
452| 1| graph_depth: None,
453| 1| };
454| 1| let json = serde_json::to_string(&item).unwrap();
455| 1| assert!(json.contains("\"type\""));
456| 1| assert!(!json.contains("memory_type"));
457| | // Field is omitted from JSON when None.
458| 1| assert!(!json.contains("graph_depth"));
459| 1| assert!(json.contains("\"score\":0.5"));
460| 1| }
461| |
462| | #[test]
463| 1| fn recall_response_serializes_with_lists() {
464| 1| let resp = RecallResponse {
465| 1| query: "busca".to_string(),
466| 1| k: 10,
467| 1| direct_matches: vec![],
468| 1| graph_matches: vec![],
469| 1| results: vec![],
470| 1| elapsed_ms: 42,
471| 1| };
472| 1| let json = serde_json::to_string(&resp).unwrap();
473| 1| assert!(json.contains("direct_matches"));
474| 1| assert!(json.contains("graph_matches"));
475| 1| assert!(json.contains("\"k\":"));
476| 1| assert!(json.contains("\"results\""));
477| 1| assert!(json.contains("\"elapsed_ms\""));
478| 1| }
479| |
480| | #[test]
481| 1| fn error_envelope_serializes_correctly() {
482| | #[derive(serde::Serialize)]
483| | struct ErrorEnvelope<'a> {
484| | error: bool,
485| | code: i32,
486| | message: &'a str,
487| | }
488| 1| let envelope = ErrorEnvelope {
489| 1| error: true,
490| 1| code: 10,
491| 1| message: "database disk image is malformed",
492| 1| };
493| 1| let json = serde_json::to_value(&envelope).unwrap();
494| 1| assert_eq!(json["error"], true);
495| 1| assert_eq!(json["code"], 10);
496| 1| assert_eq!(json["message"], "database disk image is malformed");
497| 1| }
498| |
499| | #[test]
500| 1| fn output_format_default_is_json() {
501| 1| let fmt = OutputFormat::default();
502| 1| assert!(matches!(fmt, OutputFormat::Json));
^0
503| 1| }
504| |
505| | #[test]
506| 1| fn output_format_variants_exist() {
507| 1| let _text = OutputFormat::Text;
508| 1| let _md = OutputFormat::Markdown;
509| 1| let _json = OutputFormat::Json;
510| 1| }
511| |
512| | #[test]
513| 1| fn recall_item_clone_produces_equal_value() {
514| 1| let item = RecallItem {
515| 1| memory_id: 99,
516| 1| name: "clone".to_string(),
517| 1| namespace: "ns".to_string(),
518| 1| memory_type: "relation".to_string(),
519| 1| description: "d".to_string(),
520| 1| snippet: "s".to_string(),
521| 1| distance: 0.1,
522| 1| score: RecallItem::score_from_distance(0.1),
523| 1| source: "src".to_string(),
524| 1| graph_depth: Some(2),
525| 1| };
526| 1| let cloned = item.clone();
527| 1| assert_eq!(cloned.memory_id, item.memory_id);
528| 1| assert_eq!(cloned.name, item.name);
529| 1| assert_eq!(cloned.graph_depth, Some(2));
530| 1| }
531| |}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/parsers/mod.rs:
1| |//! Input format parsers (timestamp, range validators).
2| |
3| |use chrono::DateTime;
4| |use unicode_normalization::UnicodeNormalization;
5| |
6| |/// Accepts a Unix epoch (integer >= 0) or RFC 3339 timestamp and returns the Unix epoch.
7| 7|pub fn parse_expected_updated_at(s: &str) -> Result<i64, String> {
8| 7| if let Ok(secs) = s.parse::<i64>() {
^3
9| 3| if secs >= 0 {
10| 2| return Ok(secs);
11| 1| }
12| 4| }
13| 5| DateTime::parse_from_rfc3339(s)
14| 5| .map(|dt| dt.timestamp())
^2 ^2
15| 5| .map_err(|e| {
^3
16| 3| format!(
17| 3| "value must be a Unix epoch (integer >= 0) or RFC 3339 (e.g. 2026-04-19T12:00:00Z): {e}"
18| | )
19| 3| })
20| 7|}
21| |
22| |/// Validates `-k`/`--k` for `recall` and `hybrid-search` to the inclusive range `1..=4096`.
23| |///
24| |/// The upper bound matches the `sqlite-vec` knn limit; values above it would surface a leaky
25| |/// engine error such as `k value in knn query too large, provided 10000 and the limit is 4096`.
26| |/// Validating at parse time turns the failure into a clean Clap error before any database work.
27| 11|pub fn parse_k_range(s: &str) -> Result<usize, String> {
28| 11| let value: usize = s
^9 ^9
29| 11| .parse()
30| 11| .map_err(|_| format!("'{s}' is not a valid non-negative integer"))?;
^2 ^2
31| 9| if !(1..=4096).contains(&value) {
32| 2| return Err(format!(
33| 2| "k must be between 1 and 4096 (inclusive); got {value}"
34| 2| ));
35| 7| }
36| 7| Ok(value)
37| 11|}
38| |
39| |/// Flexible boolean parser for Clap env var integration.
40| |///
41| |/// Accepts common truthy/falsy conventions used in shell environments:
42| |/// truthy: `1`, `true`, `yes`, `on` (case-insensitive)
43| |/// falsy: `0`, `false`, `no`, `off`, empty string (case-insensitive)
44| 24|pub fn parse_bool_flexible(s: &str) -> Result<bool, String> {
45| 24| match s.to_lowercase().as_str() {
46| 24| "1" | "true" | "yes" | "on" => Ok(true),
^23 ^20 ^18 ^8
47| 16| "0" | "false" | "no" | "off" | "" => Ok(false),
^15 ^8 ^6 ^4 ^13
48| 3| _ => Err(format!(
49| 3| "invalid boolean value '{s}': expected true/false/1/0/yes/no/on/off"
50| 3| )),
51| | }
52| 24|}
53| |
54| |#[cfg(test)]
55| |mod tests {
56| | use super::*;
57| |
58| | #[test]
59| 1| fn accepts_unix_epoch() {
60| 1| assert_eq!(parse_expected_updated_at("1700000000").unwrap(), 1700000000);
61| 1| }
62| |
63| | #[test]
64| 1| fn accepts_zero() {
65| 1| assert_eq!(parse_expected_updated_at("0").unwrap(), 0);
66| 1| }
67| |
68| | #[test]
69| 1| fn accepts_rfc_3339_utc() {
70| 1| let result = parse_expected_updated_at("2020-01-01T00:00:00Z");
71| 1| assert!(result.is_ok());
72| 1| assert_eq!(result.unwrap(), 1577836800);
73| 1| }
74| |
75| | #[test]
76| 1| fn accepts_rfc_3339_with_offset() {
77| 1| let result = parse_expected_updated_at("2026-04-19T12:00:00+00:00");
78| 1| assert!(result.is_ok());
79| 1| }
80| |
81| | #[test]
82| 1| fn rejects_invalid_string() {
83| 1| assert!(parse_expected_updated_at("bananas").is_err());
84| 1| }
85| |
86| | #[test]
87| 1| fn rejects_negative() {
88| 1| let err = parse_expected_updated_at("-1");
89| 1| assert!(err.is_err());
90| 1| }
91| |
92| | #[test]
93| 1| fn error_message_mentions_format() {
94| 1| let msg = parse_expected_updated_at("invalid").unwrap_err();
95| 1| assert!(msg.contains("RFC 3339") || msg.contains("Unix epoch"));
^0
96| 1| }
97| |
98| | #[test]
99| 1| fn k_accepts_valid_range_endpoints() {
100| 1| assert_eq!(parse_k_range("1").unwrap(), 1);
101| 1| assert_eq!(parse_k_range("4096").unwrap(), 4096);
102| 1| assert_eq!(parse_k_range("10").unwrap(), 10);
103| 1| }
104| |
105| | #[test]
106| 1| fn k_rejects_zero() {
107| 1| let msg = parse_k_range("0").unwrap_err();
108| 1| assert!(msg.contains("between 1 and 4096"));
109| 1| }
110| |
111| | #[test]
112| 1| fn k_rejects_above_limit() {
113| 1| let msg = parse_k_range("10000").unwrap_err();
114| 1| assert!(msg.contains("between 1 and 4096"));
115| 1| }
116| |
117| | #[test]
118| 1| fn k_rejects_non_integer() {
119| 1| let msg = parse_k_range("abc").unwrap_err();
120| 1| assert!(msg.contains("not a valid"));
121| 1| }
122| |
123| | #[test]
124| 1| fn k_rejects_negative() {
125| | // usize parser fails on negatives before range check
126| 1| assert!(parse_k_range("-5").is_err());
127| 1| }
128| |
129| | #[test]
130| 1| fn bool_flexible_truthy() {
131| 9| for v in &["1", "true", "True", "TRUE", "yes", "Yes", "on", "ON"] {
^8
132| 8| assert!(parse_bool_flexible(v).unwrap(), "should be true: {v}");
^0
133| | }
134| 1| }
135| |
136| | #[test]
137| 1| fn bool_flexible_falsy() {
138| 10| for v in &["0", "false", "False", "FALSE", "no", "No", "off", "OFF", ""] {
^9
139| 9| assert!(!parse_bool_flexible(v).unwrap(), "should be false: {v}");
^0
140| | }
141| 1| }
142| |
143| | #[test]
144| 1| fn bool_flexible_rejects_invalid() {
145| 1| assert!(parse_bool_flexible("banana").is_err());
146| 1| assert!(parse_bool_flexible("2").is_err());
147| 1| assert!(parse_bool_flexible("nope").is_err());
148| 1| }
149| |}
150| |
151| |/// The 12 well-known relation types from v1.0.0.
152| |///
153| |/// Non-canonical relations are accepted but emit a `tracing::warn!`.
154| |pub const CANONICAL_RELATIONS: &[&str] = &[
155| | "applies_to",
156| | "uses",
157| | "depends_on",
158| | "causes",
159| | "fixes",
160| | "contradicts",
161| | "supports",
162| | "follows",
163| | "related",
164| | "mentions",
165| | "replaces",
166| | "tracked_in",
167| |];
168| |
169| |/// Returns `true` when the relation is one of the 12 canonical types.
170| 6|pub fn is_canonical_relation(s: &str) -> bool {
171| 6| CANONICAL_RELATIONS.contains(&s)
172| 6|}
173| |
174| |/// Normalizes a relation string: lowercase + hyphens to underscores.
175| 6|pub fn normalize_relation(s: &str) -> String {
176| 6| s.to_lowercase().replace('-', "_")
177| 6|}
178| |
179| |/// Normalizes an entity name to kebab-case ASCII.
180| |///
181| |/// Applies NFKD decomposition, filters to ASCII (transliterating by dropping
182| |/// diacritical combining marks), lowercases, converts spaces and underscores
183| |/// to hyphens, collapses consecutive hyphens, and trims leading/trailing hyphens.
184| |///
185| |/// # Examples
186| |///
187| |/// ```
188| |/// use sqlite_graphrag::parsers::normalize_entity_name;
189| |///
190| |/// assert_eq!(normalize_entity_name("Danilo Aguiar"), "danilo-aguiar");
191| |/// assert_eq!(normalize_entity_name("CANONICAL_RELATIONS"), "canonical-relations");
192| |/// assert_eq!(normalize_entity_name(" hello world "), "hello-world");
193| |/// assert_eq!(normalize_entity_name("danilo-aguiar"), "danilo-aguiar"); // idempotent
194| |/// ```
195| 77|pub fn normalize_entity_name(s: &str) -> String {
196| | // NFKD: decompose precomposed characters into base + combining marks.
197| | // Then keep only ASCII characters, effectively stripping diacritics.
198| 648| let ascii: String = s.nfkd().filter(|c| c.is_ascii()).collect();
^77 ^77 ^77^77 ^77 ^77
199| | // Lowercase, then replace spaces and underscores with hyphens.
200| 77| let hyphenated: String = ascii
201| 77| .to_lowercase()
202| 77| .chars()
203| 645| .map(|c| if c.is_ascii_alphanumeric() { c } else { '-' })
^77 ^545 ^100
204| 77| .collect();
205| | // Collapse consecutive hyphens and trim from both ends.
206| 77| let mut result = String::with_capacity(hyphenated.len());
207| 77| let mut prev_was_hyphen = false;
208| 645| for ch in hyphenated.chars() {
^77 ^77
209| 645| if ch == '-' {
210| 100| if !prev_was_hyphen {
211| 90| result.push('-');
212| 90| }
^10
213| 100| prev_was_hyphen = true;
214| 545| } else {
215| 545| result.push(ch);
216| 545| prev_was_hyphen = false;
217| 545| }
218| | }
219| 77| result.trim_matches('-').to_string()
220| 77|}
221| |
222| |/// Validates that a normalized relation matches `^[a-z][a-z0-9_]*$`.
223| 26|pub fn validate_relation_format(s: &str) -> Result<(), String> {
224| 26| if s.is_empty() {
225| 3| return Err("relation must not be empty".to_string());
226| 23| }
227| 23| if !s.as_bytes()[0].is_ascii_lowercase() {
228| 1| return Err(format!(
229| 1| "relation must start with a lowercase letter, got '{s}'"
230| 1| ));
231| 22| }
232| 22| if !s
233| 22| .bytes()
234| 173| .all(|b| b.is_ascii_lowercase() || b.is_ascii_digit() || b == b'_')
^22 ^9^9 ^9
235| | {
236| 1| return Err(format!(
237| 1| "relation must contain only lowercase letters, digits and underscores, got '{s}'"
238| 1| ));
239| 21| }
240| 21| Ok(())
241| 26|}
242| |
243| |/// Emits a `tracing::warn!` when the relation is not in [`CANONICAL_RELATIONS`].
244| 0|pub fn warn_if_non_canonical(relation: &str) {
245| 0| if !is_canonical_relation(relation) {
246| 0| tracing::warn!(target: "parsers",
247| | relation,
248| 0| "non-canonical relation accepted; consider using a well-known value"
249| | );
250| 0| }
251| 0|}
252| |
253| |/// Clap `value_parser` for `--relation`: normalizes and validates format.
254| |///
255| |/// Accepts any kebab-case or snake_case string. Non-canonical values are
256| |/// accepted at parse time; the warning is emitted at command execution.
257| 3|pub fn parse_relation(s: &str) -> Result<String, String> {
258| 3| let normalized = normalize_relation(s);
259| 3| validate_relation_format(&normalized)?;
^1
260| 2| Ok(normalized)
261| 3|}
262| |
263| |#[cfg(test)]
264| |mod relation_tests {
265| | use super::*;
266| |
267| | #[test]
268| 1| fn canonical_relations_all_valid() {
269| 13| for r in CANONICAL_RELATIONS {
^12
270| 12| assert!(
271| 12| validate_relation_format(r).is_ok(),
272| 0| "canonical relation '{r}' should be valid"
273| | );
274| | }
275| 1| }
276| |
277| | #[test]
278| 1| fn normalize_converts_hyphens_and_uppercase() {
279| 1| assert_eq!(normalize_relation("Depends-On"), "depends_on");
280| 1| assert_eq!(normalize_relation("TESTED-BY"), "tested_by");
281| 1| assert_eq!(normalize_relation("uses"), "uses");
282| 1| }
283| |
284| | #[test]
285| 1| fn validate_rejects_empty() {
286| 1| assert!(validate_relation_format("").is_err());
287| 1| }
288| |
289| | #[test]
290| 1| fn validate_rejects_digit_start() {
291| 1| assert!(validate_relation_format("123abc").is_err());
292| 1| }
293| |
294| | #[test]
295| 1| fn validate_rejects_spaces() {
296| 1| assert!(validate_relation_format("has spaces").is_err());
297| 1| }
298| |
299| | #[test]
300| 1| fn validate_accepts_custom_relations() {
301| 1| assert!(validate_relation_format("implements").is_ok());
302| 1| assert!(validate_relation_format("tested_by").is_ok());
303| 1| assert!(validate_relation_format("part_of").is_ok());
304| 1| assert!(validate_relation_format("blocks").is_ok());
305| 1| }
306| |
307| | #[test]
308| 1| fn parse_relation_normalizes_and_validates() {
309| 1| assert_eq!(parse_relation("Tested-By").unwrap(), "tested_by");
310| 1| assert_eq!(parse_relation("uses").unwrap(), "uses");
311| 1| assert!(parse_relation("").is_err());
312| 1| }
313| |
314| | #[test]
315| 1| fn is_canonical_detects_known() {
316| 1| assert!(is_canonical_relation("uses"));
317| 1| assert!(is_canonical_relation("applies_to"));
318| 1| assert!(!is_canonical_relation("implements"));
319| 1| assert!(!is_canonical_relation("blocks"));
320| 1| }
321| |}
322| |
323| |#[cfg(test)]
324| |mod entity_name_tests {
325| | use super::*;
326| |
327| | #[test]
328| 1| fn strips_diacritics_from_accented_name() {
329| 1| assert_eq!(normalize_entity_name("Danilo Aguiar"), "danilo-aguiar");
330| 1| }
331| |
332| | #[test]
333| 1| fn strips_diacritics_unicode_accents() {
334| | // ã → a, Ü → u, ë → e
335| 1| assert_eq!(normalize_entity_name("São Paulo"), "sao-paulo");
336| 1| assert_eq!(normalize_entity_name("Ünit Tëst"), "unit-test");
337| 1| }
338| |
339| | #[test]
340| 1| fn converts_spaces_to_hyphens() {
341| 1| assert_eq!(normalize_entity_name("hello world"), "hello-world");
342| 1| assert_eq!(normalize_entity_name(" hello world "), "hello-world");
343| 1| }
344| |
345| | #[test]
346| 1| fn converts_underscores_to_hyphens() {
347| 1| assert_eq!(normalize_entity_name("hello_world"), "hello-world");
348| 1| assert_eq!(
349| 1| normalize_entity_name("CANONICAL_RELATIONS"),
350| | "canonical-relations"
351| | );
352| 1| }
353| |
354| | #[test]
355| 1| fn all_caps_becomes_lowercase_kebab() {
356| 1| assert_eq!(
357| 1| normalize_entity_name("CANONICAL_RELATIONS"),
358| | "canonical-relations"
359| | );
360| 1| assert_eq!(normalize_entity_name("MY_ENTITY_NAME"), "my-entity-name");
361| 1| }
362| |
363| | #[test]
364| 1| fn idempotent_on_already_normalized() {
365| 1| let name = "danilo-aguiar";
366| 1| assert_eq!(normalize_entity_name(name), name);
367| 1| let name2 = "canonical-relations";
368| 1| assert_eq!(normalize_entity_name(name2), name2);
369| 1| }
370| |
371| | #[test]
372| 1| fn collapses_consecutive_hyphens() {
373| 1| assert_eq!(normalize_entity_name("foo--bar"), "foo-bar");
374| 1| assert_eq!(normalize_entity_name("foo - bar"), "foo-bar");
375| 1| }
376| |
377| | #[test]
378| 1| fn trims_leading_trailing_hyphens() {
379| 1| assert_eq!(normalize_entity_name("-foo-"), "foo");
380| 1| assert_eq!(normalize_entity_name("--hello--"), "hello");
381| 1| }
382| |
383| | #[test]
384| 1| fn empty_or_only_separators_returns_empty() {
385| 1| assert_eq!(normalize_entity_name(""), "");
386| 1| assert_eq!(normalize_entity_name("---"), "");
387| 1| }
388| |
389| | #[test]
390| 1| fn normalizes_dots_slashes_and_punctuation() {
391| 1| assert_eq!(normalize_entity_name("lei-14.478/2022"), "lei-14-478-2022");
392| 1| assert_eq!(normalize_entity_name("src/main.rs"), "src-main-rs");
393| 1| assert_eq!(normalize_entity_name("user@domain.com"), "user-domain-com");
394| 1| assert_eq!(normalize_entity_name("v1.0.66"), "v1-0-66");
395| 1| assert_eq!(normalize_entity_name("key:value"), "key-value");
396| 1| }
397| |}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/paths.rs:
1| |//! XDG/cwd path resolution and traversal-safe overrides.
2| |//!
3| |//! Resolves data directories via [`directories::ProjectDirs`] and validates
4| |//! that user-supplied paths cannot escape the project root.
5| |
6| |use crate::errors::AppError;
7| |use crate::i18n::validation;
8| |use directories::ProjectDirs;
9| |use std::path::{Component, Path, PathBuf};
10| |
11| |/// Resolved filesystem paths used by the CLI at runtime.
12| |///
13| |/// Constructed via [`AppPaths::resolve`], which applies the four-step precedence:
14| |/// CLI flag → `SQLITE_GRAPHRAG_DB_PATH` env var → `SQLITE_GRAPHRAG_HOME` env var → cwd.
15| |#[derive(Debug, Clone)]
16| |pub struct AppPaths {
17| | /// Absolute path to the SQLite database file.
18| | pub db: PathBuf,
19| | /// Directory where embedding model files are cached.
20| | pub models: PathBuf,
21| |}
22| |
23| |impl AppPaths {
24| 7| pub fn resolve(db_override: Option<&str>) -> Result<Self, AppError> {
25| 7| let proj = ProjectDirs::from("", "", "sqlite-graphrag").ok_or_else(|| {
^0
26| 0| AppError::Io(std::io::Error::other("could not determine home directory"))
27| 0| })?;
28| |
29| 7| let cache_root = if let Some(override_dir) = std::env::var_os("SQLITE_GRAPHRAG_CACHE_DIR") {
^0
30| 0| PathBuf::from(override_dir)
31| | } else {
32| 7| proj.cache_dir().to_path_buf()
33| | };
34| |
35| 7| let db = if let Some(p) = db_override {
^6 ^2
36| 2| validate_path(p)?;
^0
37| 2| PathBuf::from(p)
38| 5| } else if let Ok(env_path) = std::env::var("SQLITE_GRAPHRAG_DB_PATH") {
^2
39| 2| validate_path(&env_path)?;
^0
40| 2| PathBuf::from(env_path)
41| 3| } else if let Some(home_dir) = home_env_dir()? {
^1 ^1
42| 1| home_dir.join("graphrag.sqlite")
43| | } else {
44| 1| std::env::current_dir()
45| 1| .map_err(AppError::Io)?
^0
46| 1| .join("graphrag.sqlite")
47| | };
48| |
49| 6| Ok(Self {
50| 6| db,
51| 6| models: cache_root.join("models"),
52| 6| })
53| 7| }
54| |
55| 1| pub fn ensure_dirs(&self) -> Result<(), AppError> {
56| 2| for dir in [parent_or_err(&self.db)?, self.models.as_path()] {
^1 ^1 ^0 ^1
57| 2| std::fs::create_dir_all(dir)?;
^0
58| | }
59| 1| Ok(())
60| 1| }
61| |}
62| |
63| 6|fn validate_path(p: &str) -> Result<(), AppError> {
64| 44| if Path::new(p).components().any(|c| c == Component::ParentDir) {
^6 ^6
65| 1| return Err(AppError::Validation(validation::path_traversal(p)));
66| 5| }
67| 5| Ok(())
68| 6|}
69| |
70| |/// Resolves `SQLITE_GRAPHRAG_HOME` as the root directory for the default database.
71| |///
72| |/// Returns `Ok(Some(dir))` when the env var is set and valid,
73| |/// `Ok(None)` when absent or empty (falls back to `current_dir`),
74| |/// and `Err(...)` when the value contains traversal components.
75| 3|fn home_env_dir() -> Result<Option<PathBuf>, AppError> {
76| 3| let raw = match std::env::var("SQLITE_GRAPHRAG_HOME") {
77| 3| Ok(v) => v,
78| 0| Err(_) => return Ok(None),
79| | };
80| 3| if raw.is_empty() {
81| 1| return Ok(None);
82| 2| }
83| 2| validate_path(&raw)?;
^1
84| 1| Ok(Some(PathBuf::from(raw)))
85| 3|}
86| |
87| 17|pub(crate) fn parent_or_err(path: &Path) -> Result<&Path, AppError> {
88| 17| path.parent().ok_or_else(|| {
^2
89| 2| AppError::Validation(format!(
90| 2| "path '{}' has no valid parent component",
91| 2| path.display()
92| 2| ))
93| 2| })
94| 17|}
95| |
96| |#[cfg(test)]
97| |mod tests {
98| | use super::*;
99| | use serial_test::serial;
100| | use tempfile::TempDir;
101| |
102| | /// Clears all variables that affect `AppPaths::resolve` to isolate the
103| | /// test from the developer/CI environment.
104| 10| fn clean_env_paths() {
105| | // SAFETY: tests are annotated with #[serial], guaranteeing single-threaded execution.
106| 10| unsafe {
107| 10| std::env::remove_var("SQLITE_GRAPHRAG_HOME");
108| 10| std::env::remove_var("SQLITE_GRAPHRAG_DB_PATH");
109| 10| std::env::remove_var("SQLITE_GRAPHRAG_CACHE_DIR");
110| 10| }
111| 10| }
112| |
113| | #[test]
114| | #[serial]
115| 1| fn home_env_resolves_db_in_subdir() {
116| 1| clean_env_paths();
117| 1| let tmp = TempDir::new().expect("tempdir");
118| | // SAFETY: tests are annotated with #[serial], guaranteeing single-threaded execution.
119| 1| unsafe {
120| 1| std::env::set_var("SQLITE_GRAPHRAG_HOME", tmp.path());
121| 1| }
122| |
123| 1| let paths = AppPaths::resolve(None).expect("resolve with valid HOME");
124| 1| assert_eq!(paths.db, tmp.path().join("graphrag.sqlite"));
125| |
126| 1| clean_env_paths();
127| | }
128| |
129| | #[test]
130| | #[serial]
131| 1| fn home_env_traversal_rejected() {
132| 1| clean_env_paths();
133| | // SAFETY: tests are annotated with #[serial], guaranteeing single-threaded execution.
134| 1| unsafe {
135| 1| std::env::set_var("SQLITE_GRAPHRAG_HOME", "/tmp/../etc");
136| 1| }
137| |
138| 1| let result = AppPaths::resolve(None);
139| 1| assert!(
140| 1| matches!(result, Err(AppError::Validation(_))),
^0
141| 0| "traversal in SQLITE_GRAPHRAG_HOME must fail as Validation, got {result:?}"
142| | );
143| |
144| 1| clean_env_paths();
145| | }
146| |
147| | #[test]
148| | #[serial]
149| 1| fn db_path_overrides_home() {
150| 1| clean_env_paths();
151| 1| let tmp_home = TempDir::new().expect("tempdir home");
152| 1| let tmp_db = TempDir::new().expect("tempdir db");
153| 1| let explicit_db = tmp_db.path().join("explicit.sqlite");
154| | // SAFETY: tests are annotated with #[serial], guaranteeing single-threaded execution.
155| 1| unsafe {
156| 1| std::env::set_var("SQLITE_GRAPHRAG_HOME", tmp_home.path());
157| 1| std::env::set_var("SQLITE_GRAPHRAG_DB_PATH", &explicit_db);
158| 1| }
159| |
160| 1| let paths = AppPaths::resolve(None).expect("resolve with DB_PATH and HOME");
161| 1| assert_eq!(paths.db, explicit_db);
162| |
163| 1| clean_env_paths();
164| | }
165| |
166| | #[test]
167| | #[serial]
168| 1| fn flag_overrides_home() {
169| 1| clean_env_paths();
170| 1| let tmp_home = TempDir::new().expect("tempdir home");
171| 1| let tmp_flag = TempDir::new().expect("tempdir flag");
172| 1| let db_flag = tmp_flag.path().join("via-flag.sqlite");
173| | // SAFETY: tests are annotated with #[serial], guaranteeing single-threaded execution.
174| 1| unsafe {
175| 1| std::env::set_var("SQLITE_GRAPHRAG_HOME", tmp_home.path());
176| 1| }
177| |
178| 1| let paths = AppPaths::resolve(Some(db_flag.to_str().expect("utf8")))
179| 1| .expect("resolve with flag and HOME");
180| 1| assert_eq!(paths.db, db_flag);
181| |
182| 1| clean_env_paths();
183| | }
184| |
185| | #[test]
186| | #[serial]
187| 1| fn home_env_empty_falls_back_to_cwd() {
188| 1| clean_env_paths();
189| | // SAFETY: tests are annotated with #[serial], guaranteeing single-threaded execution.
190| 1| unsafe {
191| 1| std::env::set_var("SQLITE_GRAPHRAG_HOME", "");
192| 1| }
193| |
194| 1| let paths = AppPaths::resolve(None).expect("resolve with empty HOME");
195| 1| let expected = std::env::current_dir()
196| 1| .expect("cwd")
197| 1| .join("graphrag.sqlite");
198| 1| assert_eq!(paths.db, expected);
199| |
200| 1| clean_env_paths();
201| | }
202| |
203| | #[test]
204| 1| fn parent_or_err_accepts_normal_path() {
205| 1| let p = PathBuf::from("/home/user/db.sqlite");
206| 1| let parent = parent_or_err(&p).expect("valid parent");
207| 1| assert_eq!(parent, Path::new("/home/user"));
208| 1| }
209| |
210| | #[test]
211| 1| fn parent_or_err_accepts_relative_path() {
212| 1| let p = PathBuf::from("subdir/file.sqlite");
213| 1| let parent = parent_or_err(&p).expect("relative parent");
214| 1| assert_eq!(parent, Path::new("subdir"));
215| 1| }
216| |
217| | #[test]
218| 1| fn parent_or_err_rejects_unix_root() {
219| 1| let p = PathBuf::from("/");
220| 1| let result = parent_or_err(&p);
221| 1| assert!(matches!(result, Err(AppError::Validation(_))));
^0
222| 1| }
223| |
224| | #[test]
225| 1| fn parent_or_err_rejects_empty_path() {
226| 1| let p = PathBuf::from("");
227| 1| let result = parent_or_err(&p);
228| 1| assert!(matches!(result, Err(AppError::Validation(_))));
^0
229| 1| }
230| |}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/pragmas.rs:
1| |//! SQLite PRAGMA helpers applied at connection open and on each transaction.
2| |
3| |use crate::errors::AppError;
4| |use rusqlite::Connection;
5| |
6| |/// Applies one-time PRAGMAs on a freshly opened connection (e.g. `auto_vacuum`).
7| |///
8| |/// Calls [`apply_connection_pragmas`] internally and then sets `wal_autocheckpoint`.
9| |/// Must be called once per database file, not once per connection.
10| |///
11| |/// # Errors
12| |/// Returns `Err` when any PRAGMA execution fails.
13| 1|pub fn apply_init_pragmas(conn: &Connection) -> Result<(), AppError> {
14| 1| conn.execute_batch("PRAGMA auto_vacuum = INCREMENTAL;")?;
^0
15| 1| apply_connection_pragmas(conn)?;
^0
16| 1| conn.execute_batch(&format!(
17| 1| "PRAGMA wal_autocheckpoint = {};",
18| 1| crate::constants::WAL_AUTOCHECKPOINT_PAGES
19| 1| ))?;
^0
20| 1| Ok(())
21| 1|}
22| |
23| |/// Re-asserts `PRAGMA journal_mode = WAL` after operations that may revert it
24| |/// (notably refinery-driven migrations, which can open internal handles that
25| |/// reset the journal mode in some scenarios). Idempotent and cheap; emits
26| |/// `tracing::warn!` if WAL fails to engage so degraded behaviour is observable.
27| 1|pub fn ensure_wal_mode(conn: &Connection) -> Result<(), AppError> {
28| 1| let mode: String = conn.query_row("PRAGMA journal_mode = WAL;", [], |r| r.get(0))?;
^0
29| 1| if mode != "wal" {
30| 0| tracing::warn!(target: "pragmas", mode = %mode, "journal_mode did not switch to WAL after re-assertion");
31| 1| }
32| 1| Ok(())
33| 1|}
34| |
35| |/// Applies per-connection PRAGMAs: synchronous, foreign keys, busy timeout, cache size, temp store, mmap size, and WAL journal mode.
36| |///
37| |/// Safe to call on every new connection; all settings are idempotent.
38| |///
39| |/// # Errors
40| |/// Returns `Err` when any PRAGMA execution fails.
41| 3|pub fn apply_connection_pragmas(conn: &Connection) -> Result<(), AppError> {
42| 3| conn.execute_batch(&format!(
43| 3| "PRAGMA synchronous = NORMAL;
44| 3| PRAGMA foreign_keys = ON;
45| 3| PRAGMA busy_timeout = {busy};
46| 3| PRAGMA cache_size = {cache};
47| 3| PRAGMA temp_store = MEMORY;
48| 3| PRAGMA mmap_size = {mmap};",
49| 3| busy = crate::constants::BUSY_TIMEOUT_MILLIS,
50| 3| cache = crate::constants::CACHE_SIZE_KB,
51| 3| mmap = crate::constants::MMAP_SIZE_BYTES,
52| 3| ))?;
^0
53| 3| let mode: String = conn.query_row("PRAGMA journal_mode = WAL;", [], |r| r.get(0))?;
^0
54| 3| if mode != "wal" {
55| 0| tracing::warn!(target: "pragmas", mode = %mode, "journal_mode did not switch to WAL");
56| 3| }
57| 3| Ok(())
58| 3|}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/preservation.rs:
1| |//! Preservation checks for LLM-enriched memory bodies (G29 Passo 4).
2| |//!
3| |//! When a language model rewrites a memory body, the operator must be
4| |//! protected against silent hallucination: the LLM may invent facts, drop
5| |//! key terms, or drift semantically far from the source. This module
6| |//! provides a lightweight, deterministic similarity metric that runs
7| |//! locally without any model call, so the gate can be enforced before the
8| |//! enriched body touches persistent storage.
9| |//!
10| |//! The default metric is a trigram-Jaccard similarity: the number of trigrams
11| |//! shared by both inputs divided by the number in their union. The score is in `[0.0, 1.0]`,
12| |//! where `1.0` means the two inputs share every trigram and `0.0` means
13| |//! they share none. The threshold default of `0.7` follows the gap G29
14| |//! specification, with `--preserve-threshold <F>` letting operators tune
15| |//! it per workload.
16| |//!
17| |//! # Examples
18| |//!
19| |//! ```
20| |//! use sqlite_graphrag::preservation::{jaccard_similarity, PreservationVerdict};
21| |//!
22| |//! let score = jaccard_similarity("the quick brown fox", "the quick red fox");
23| |//! assert!(score > 0.5);
24| |//!
25| |//! let verdict = PreservationVerdict::evaluate("orig body", "rewritten body", 0.7);
26| |//! assert!(matches!(verdict, PreservationVerdict::Preserved { .. }));
27| |//! ```
28| |
29| |use serde::{Deserialize, Serialize};
30| |use std::collections::HashSet;
31| |
32| |/// Computes the trigram-Jaccard similarity between two strings.
33| |///
34| |/// The score is `|A ∩ B| / |A ∪ B|` where `A` and `B` are the sets of
35| |/// character-trigrams extracted from each input. The trigrams are taken
36| |/// over Unicode scalar values via `char_indices`, so the function is
37| |/// safe to call on multi-byte UTF-8 inputs without byte-boundary errors.
38| |///
39| |/// # Edge cases
40| |///
41| |/// - Both inputs empty: returns `1.0` (the empty trigram set is trivially
42| |/// contained in itself).
43| |/// - One input empty, the other non-empty: returns `0.0` (no overlap).
44| |/// - Identical inputs: returns `1.0`.
45| |///
46| |/// The function is pure: no I/O, no allocation beyond the two trigram sets
47| |/// (and the temporary `Vec<char>` used to build each), deterministic for a
48| |/// given pair of inputs. It is safe to call in hot paths.
49| 11|pub fn jaccard_similarity(a: &str, b: &str) -> f64 {
50| 11| let set_a = trigrams(a);
51| 11| let set_b = trigrams(b);
52| 11| if set_a.is_empty() && set_b.is_empty() {
^2 ^2
53| 1| return 1.0;
54| 10| }
55| 10| let intersection = set_a.intersection(&set_b).count() as f64;
56| 10| let union = set_a.union(&set_b).count() as f64;
57| 10| if union == 0.0 {
58| 0| 0.0
59| | } else {
60| 10| intersection / union
61| | }
62| 11|}
63| |
64| |/// Extracts the set of character-trigrams from a string.
65| |///
66| |/// One trigram `[prev, c, next]` is emitted per character, with `'\0'` standing
67| |/// in for the missing neighbour at either end of the input (a lone character
68| |/// `c` yields `['\0', c, '\0']`). Identical inputs of any length therefore
69| |/// produce the same trigram set and score `1.0`.
70| 22|fn trigrams(input: &str) -> HashSet<[char; 3]> {
71| 22| let chars: Vec<char> = input.chars().collect();
72| 22| if chars.is_empty() {
73| 4| return HashSet::new();
74| 18| }
75| 18| let mut out: HashSet<[char; 3]> = HashSet::with_capacity(chars.len().saturating_add(2));
76| 18| let mut window: [char; 3] = ['\0', '\0', '\0'];
77| 339| for (i, ch) in chars.iter().enumerate() {
^18 ^18
78| 339| window[0] = if i >= 1 { chars[i - 1] } else { '\0' };
^321 ^18
79| 339| window[1] = *ch;
80| 339| window[2] = if i + 1 < chars.len() {
81| 321| chars[i + 1]
82| | } else {
83| 18| '\0'
84| | };
85| 339| out.insert(window);
86| | }
87| 18| out
88| 22|}
89| |
90| |/// Outcome of a preservation evaluation against a configurable threshold.
91| |///
92| |/// `PreservationVerdict` is the wire type the enrich pipeline emits in its
93| |/// NDJSON stream: every body-enrich attempt ends in one of the three
94| |/// variants so callers can route the result without re-running the
95| |/// similarity computation.
96| |#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
97| |#[serde(tag = "verdict", rename_all = "snake_case")]
98| |pub enum PreservationVerdict {
99| | /// The rewritten body is at least `threshold`-similar to the original.
100| | Preserved { score: f64, threshold: f64 },
101| | /// The rewritten body diverges too much from the original and was
102| | /// rejected by the gate.
103| | Rejected { score: f64, threshold: f64 },
104| | /// The original and rewritten bodies are byte-equal (no rewrite was
105| | /// needed); preserved by definition.
106| | Unchanged { byte_len: usize },
107| |}
108| |
109| |impl PreservationVerdict {
110| | /// Evaluates the gate against `threshold` and returns the matching
111| | /// variant. The threshold is clamped to `[0.0, 1.0]` defensively; an
112| | /// out-of-range value does not panic the caller.
113| 6| pub fn evaluate(original: &str, rewritten: &str, threshold: f64) -> Self {
114| 6| let threshold = threshold.clamp(0.0, 1.0);
115| 6| if original == rewritten {
116| 2| return Self::Unchanged {
117| 2| byte_len: original.len(),
118| 2| };
119| 4| }
120| 4| let score = jaccard_similarity(original, rewritten);
121| 4| if score >= threshold {
122| 3| Self::Preserved { score, threshold }
123| | } else {
124| 1| Self::Rejected { score, threshold }
125| | }
126| 6| }
127| |
128| | /// Returns `true` when the gate accepted the rewrite.
129| 6| pub fn is_accepted(&self) -> bool {
130| 6| matches!(self, Self::Preserved { .. } | Self::Unchanged { .. })
^1
131| 6| }
132| |}
133| |
134| |#[cfg(test)]
135| |mod tests {
136| | use super::*;
137| |
138| | #[test]
139| 1| fn identical_strings_score_one() {
140| 1| let s = "the quick brown fox jumps over the lazy dog";
141| 1| assert!((jaccard_similarity(s, s) - 1.0).abs() < f64::EPSILON);
142| 1| }
143| |
144| | #[test]
145| 1| fn completely_different_strings_score_zero_or_near_zero() {
146| 1| let a = "aaaaaaaaaa";
147| 1| let b = "zzzzzzzzzz";
148| 1| assert!(jaccard_similarity(a, b) < 0.05);
149| 1| }
150| |
151| | #[test]
152| 1| fn partial_overlap_scores_between_zero_and_one() {
153| 1| let a = "the quick brown fox jumps";
154| 1| let b = "the slow brown cat sleeps";
155| 1| let score = jaccard_similarity(a, b);
156| 1| assert!(score > 0.0 && score < 1.0, "got {score}");
^0
157| 1| }
158| |
159| | #[test]
160| 1| fn both_empty_score_one() {
161| 1| assert!((jaccard_similarity("", "") - 1.0).abs() < f64::EPSILON);
162| 1| }
163| |
164| | #[test]
165| 1| fn one_empty_scores_zero() {
166| 1| assert!(jaccard_similarity("hello", "").abs() < f64::EPSILON);
167| 1| assert!(jaccard_similarity("", "hello").abs() < f64::EPSILON);
168| 1| }
169| |
170| | #[test]
171| 1| fn unicode_strings_do_not_panic() {
172| | // Multi-byte UTF-8: 1 char each, very short.
173| 1| let a = "ç日本語";
174| 1| let b = "ç中文";
175| 1| let _ = jaccard_similarity(a, b);
176| 1| }
177| |
178| | #[test]
179| 1| fn verdict_preserved_when_above_threshold() {
180| 1| let v = PreservationVerdict::evaluate("hello world", "hello world!", 0.5);
181| 1| assert!(v.is_accepted());
182| 1| assert!(matches!(v, PreservationVerdict::Preserved { .. }));
^0
183| 1| }
184| |
185| | #[test]
186| 1| fn verdict_unchanged_for_identical() {
187| 1| let v = PreservationVerdict::evaluate("same", "same", 0.9);
188| 1| assert!(v.is_accepted());
189| 1| assert!(matches!(v, PreservationVerdict::Unchanged { byte_len: 4 }));
^0
190| 1| }
191| |
192| | #[test]
193| 1| fn threshold_clamped_out_of_range() {
194| | // Threshold above 1.0 is clamped to 1.0: identical bodies match
195| | // by the `Unchanged` short-circuit, accepted.
196| 1| let v = PreservationVerdict::evaluate("abc", "abc", 99.0);
197| 1| assert!(v.is_accepted());
198| | // Threshold below 0.0 is clamped to 0.0: every non-empty rewrite
199| | // meets a 0.0 floor and is accepted. This is the documented
200| | // behaviour of `clamp(0.0, 1.0)` and is the only sane reading
201| | // once a negative threshold is no longer in scope.
202| 1| let v = PreservationVerdict::evaluate("abc", "xyz", -5.0);
203| 1| assert!(v.is_accepted());
204| | // Threshold of exactly 0.0 accepts every rewrite, because any score
205| | // is >= 0.0; a single-character append therefore passes the gate.
206| 1| let v = PreservationVerdict::evaluate("abc", "abcd", 0.0);
207| 1| assert!(
208| 1| v.is_accepted(),
209| 0| "single-char append is mostly the same body"
210| | );
211| 1| }
212| |
213| | #[test]
214| 1| fn g29_repro_evaluates_rejected_when_diverges() {
215| | // G29 reproducer: LLM rewrites a body and drifts far from source.
216| 1| let original = "JWT token rotation strategy with 15-min expiry and refresh flow";
217| 1| let drifted = "The weather in Tokyo is sunny today with mild temperatures expected";
218| 1| let v = PreservationVerdict::evaluate(original, drifted, 0.7);
219| 1| assert!(!v.is_accepted(), "should reject hallucinated rewrite");
^0
220| 1| }
221| |}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/reaper.rs:
1| |//! G28: Reaper for orphan external processes.
2| |//!
3| |//! When the CLI crashes or is killed (SIGKILL, OOM, machine reset), child
4| |//! processes spawned by `claude -p` or `codex exec` may be left running.
5| |//! Without cleanup they accumulate as orphans that keep consuming CPU and RAM
6| |//! and hold on to MCP-spawned subprocess trees (the 2026-06-03 incident:
7| |//! 1,877 processes total, load average 276 on a 10-CPU host).
8| |//!
9| |//! [`scan_and_kill_orphans`] walks the process table at startup and
10| |//! terminates any `claude` or `codex` invocation whose `PPID` is `1`
11| |//! (reparented to `init`/`launchd` after the parent died) and that is
12| |//! older than the `ORPHAN_MIN_AGE_SECS` constant. The scan is conservative: it only
13| |//! kills processes that (a) match a known LLM CLI name, AND (b) are
14| |//! orphaned, AND (c) are older than the threshold. A short-lived CLI
15| |//! that is just starting up is left alone.
16| |
17| |use std::time::Duration;
18| |
19| |const ORPHAN_MIN_AGE_SECS: u64 = 60;
20| |const ORPHAN_SCAN_TARGETS: &[&str] = &["claude", "codex"];
21| |
22| |#[derive(Debug, Clone, Copy, PartialEq, Eq)]
23| |pub struct ReaperReport {
24| | /// Number of orphan processes detected.
25| | pub found: usize,
26| | /// Number of orphan processes successfully terminated.
27| | pub killed: usize,
28| | /// Number that we could not terminate (permission, ESRCH, etc).
29| | pub failed: usize,
30| | /// Elapsed wall time of the scan.
31| | pub elapsed_ms: u64,
32| |}
33| |
34| |/// Walks the process table and kills orphan LLM invocations.
35| |///
36| |/// The scan is best-effort and never panics: on any unexpected error it
37| |/// logs the failure and returns a report with `killed = 0`.
38| 1|pub fn scan_and_kill_orphans() -> ReaperReport {
39| 1| let start = std::time::Instant::now();
40| 1| let mut report = ReaperReport {
41| 1| found: 0,
42| 1| killed: 0,
43| 1| failed: 0,
44| 1| elapsed_ms: 0,
45| 1| };
46| |
47| | #[cfg(unix)]
48| | {
49| 1| if let Err(e) = scan_unix(&mut report) {
50| 1| tracing::warn!(target: "reaper", error = %e, "orphan scan failed");
^0
51| 0| }
52| | }
53| |
54| | #[cfg(not(unix))]
55| | {
56| | tracing::debug!(target: "reaper", "orphan scan is a no-op on non-Unix platforms");
57| | }
58| |
59| 1| report.elapsed_ms = start.elapsed().as_millis() as u64;
60| 1| if report.killed > 0 {
61| 0| tracing::warn!(
62| | target: "reaper",
63| | found = report.found,
64| | killed = report.killed,
65| | failed = report.failed,
66| 0| "reaped orphan LLM subprocesses"
67| | );
68| | } else {
69| 1| tracing::info!(target: "reaper", found = report.found, "no orphan LLM subprocesses detected");
^0
70| | }
71| 1| report
72| 1|}
73| |
74| |#[cfg(unix)]
75| 1|fn scan_unix(report: &mut ReaperReport) -> std::io::Result<()> {
76| | use std::fs;
77| | use std::path::Path;
78| |
79| 1| let proc = Path::new("/proc");
80| 1| let entries = fs::read_dir(proc)?;
^0
81| 0| for entry in entries.flatten() {
82| 0| let name = entry.file_name();
83| 0| let Some(name_str) = name.to_str() else {
84| 0| continue;
85| | };
86| 0| if !name_str.chars().all(|c| c.is_ascii_digit()) {
87| 0| continue;
88| 0| }
89| 0| let pid: i32 = match name_str.parse() {
90| 0| Ok(p) => p,
91| 0| Err(_) => continue,
92| | };
93| 0| if pid == std::process::id() as i32 {
94| 0| continue;
95| 0| }
96| |
97| 0| let stat_path = entry.path().join("stat");
98| 0| let stat = match fs::read_to_string(&stat_path) {
99| 0| Ok(s) => s,
100| 0| Err(_) => continue,
101| | };
102| |
103| | // /proc/[pid]/stat has the form: `pid (comm) state ppid ...`
104| | // The comm field can contain spaces and parens; the last `)`
105| | // separates the comm from the rest.
106| 0| let Some(close_paren) = stat.rfind(')') else {
107| 0| continue;
108| | };
109| 0| let after = &stat[close_paren + 1..];
110| 0| let mut parts = after.split_whitespace();
111| | // parts[0] = state (e.g. "R"), parts[1] = ppid, parts[2] = pgrp, ...
112| 0| let state = parts.next().unwrap_or("");
113| 0| let ppid: i32 = parts.next().and_then(|p| p.parse().ok()).unwrap_or(-1);
114| |
115| | // Only target processes reparented to init (PPID 1), which is what
116| | // happens on Linux/Unix once the original parent is gone.
117| 0| if ppid != 1 {
118| 0| continue;
119| 0| }
120| |
121| | // Skip zombies (state Z) — they need no kill.
122| 0| if state.starts_with('Z') {
123| 0| continue;
124| 0| }
125| |
126| | // Resolve the comm field. proc/[pid]/comm is the short program
127| | // name (no path); we use it instead of parsing the bracketed
128| | // comm from stat to avoid encoding edge cases.
129| 0| let comm_path = entry.path().join("comm");
130| 0| let comm = match fs::read_to_string(&comm_path) {
131| 0| Ok(s) => s.trim().to_string(),
132| 0| Err(_) => continue,
133| | };
134| |
135| 0| if !ORPHAN_SCAN_TARGETS.iter().any(|t| comm == *t) {
136| 0| continue;
137| 0| }
138| |
139| | // Age check: skip processes that just spawned (under 60s old) so
140| | // we never race with a concurrent CLI invocation.
141| 0| let age_ok = check_process_age(pid, ORPHAN_MIN_AGE_SECS);
142| 0| if !age_ok {
143| 0| continue;
144| 0| }
145| |
146| 0| report.found += 1;
147| 0| match terminate_pid(pid) {
148| | Ok(()) => {
149| 0| report.killed += 1;
150| 0| tracing::info!(target: "reaper", pid, comm = %comm, "killed orphan LLM subprocess");
151| | }
152| 0| Err(e) => {
153| 0| report.failed += 1;
154| 0| tracing::warn!(target: "reaper", pid, comm = %comm, error = %e, "failed to kill orphan");
155| | }
156| | }
157| | }
158| 0| Ok(())
159| 1|}
160| |
161| |#[cfg(unix)]
162| 0|fn check_process_age(pid: i32, min_age_secs: u64) -> bool {
163| | use std::fs;
164| | // /proc/[pid]/stat field 22 is start_time in clock ticks since boot.
165| | // We instead use the simpler heuristic: stat file mtime.
166| 0| let stat_path = std::path::Path::new("/proc")
167| 0| .join(pid.to_string())
168| 0| .join("stat");
169| 0| let Ok(meta) = fs::metadata(&stat_path) else {
170| 0| return false;
171| | };
172| 0| let Ok(modified) = meta.modified() else {
173| 0| return false;
174| | };
175| 0| let Ok(elapsed) = std::time::SystemTime::now().duration_since(modified) else {
176| 0| return false;
177| | };
178| 0| elapsed >= Duration::from_secs(min_age_secs)
179| 0|}
180| |
181| |#[cfg(unix)]
182| 0|fn terminate_pid(pid: i32) -> std::io::Result<()> {
183| | // SIGTERM first; if the process ignores it for >2s, the caller can
184| | // escalate to SIGKILL. For the reaper we send TERM and return; a
185| | // follow-up sweep can send KILL if needed.
186| 0| let rc = unsafe { libc::kill(pid, libc::SIGTERM) };
187| 0| if rc == 0 {
188| 0| Ok(())
189| | } else {
190| 0| Err(std::io::Error::last_os_error())
191| | }
192| 0|}
193| |
#[cfg(test)]
mod tests {
    use super::*;

    /// A report built with zeroed counters reads back as all zeros.
    #[test]
    fn reaper_report_starts_zeroed() {
        let report = ReaperReport {
            found: 0,
            killed: 0,
            failed: 0,
            elapsed_ms: 0,
        };
        assert_eq!((report.found, report.killed, report.failed), (0, 0, 0));
    }

    /// G28: the 60s threshold is the safety margin that keeps one CLI
    /// invocation from killing a concurrent peer that started seconds ago.
    #[test]
    fn orphan_min_age_is_one_minute() {
        let one_minute_secs = 60;
        assert_eq!(ORPHAN_MIN_AGE_SECS, one_minute_secs);
    }

    /// Both LLM CLI process names must be present in the scan list.
    #[test]
    fn orphan_targets_include_claude_and_codex() {
        for expected in ["claude", "codex"] {
            assert!(ORPHAN_SCAN_TARGETS.contains(&expected));
        }
    }

    /// Smoke test: the scan returns a report on the test host. In containers
    /// where the test runs as PID 1 the report simply has `found == 0`.
    #[test]
    fn scan_completes_without_panic_on_linux() {
        let report = scan_and_kill_orphans();
        assert!(report.elapsed_ms < 30_000, "scan must finish in <30s");
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/retry.rs:
1| |//! Centralized retry infrastructure with exponential backoff and half-jitter.
2| |//!
3| |//! Provides [`RetryConfig`](crate::retry::RetryConfig) with named constructors for each failure domain
4| |//! (SQLite BUSY, LLM rate-limit, cold-start) and a [`compute_delay`](crate::retry::compute_delay) function
5| |//! that applies the configured jitter strategy.
6| |
7| |use std::time::{Duration, Instant};
8| |
9| |/// Configures retry behavior for a specific failure domain.
10| |///
11| |/// Use the named constructors ([`Self::sqlite_busy`], [`Self::llm_rate_limit`],
12| |/// [`Self::cold_start`]) for pre-tuned policies. All timing values are in
13| |/// milliseconds except `max_elapsed_secs` which is in seconds.
14| |#[derive(Debug, Clone)]
15| |pub struct RetryConfig {
16| | /// Base delay for the first retry attempt (ms).
17| | pub initial_delay_ms: u64,
18| | /// Upper bound on any single delay (ms).
19| | pub max_delay_ms: u64,
20| | /// Multiplicative factor applied per attempt.
21| | pub multiplier: u64,
22| | /// Hard cap on total attempts (0 = unlimited, use deadline).
23| | pub max_attempts: u32,
24| | /// Total elapsed wall-clock time before giving up (seconds).
25| | pub max_elapsed_secs: u64,
26| | /// Jitter strategy applied to computed delays.
27| | pub jitter: JitterKind,
28| |}
29| |
/// Randomization applied to a computed retry delay.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum JitterKind {
    /// Deterministic: the base delay is used unchanged.
    None,
    /// Half-jitter: the delay falls in `[base/2, base)`, so a minimum wait
    /// is always guaranteed.
    Half,
    /// Full-jitter: the delay falls in `[0, base)`, giving maximum spread.
    Full,
}
40| |
41| |impl RetryConfig {
42| | /// SQLite BUSY retry: 5 attempts, 300ms base, half-jitter, 30s deadline.
43| 1| pub fn sqlite_busy() -> Self {
44| 1| Self {
45| 1| initial_delay_ms: 300,
46| 1| max_delay_ms: 4800,
47| 1| multiplier: 2,
48| 1| max_attempts: 5,
49| 1| max_elapsed_secs: 30,
50| 1| jitter: JitterKind::Half,
51| 1| }
52| 1| }
53| |
54| | /// LLM rate-limit retry: 60s base, 900s cap, half-jitter, 1h deadline.
55| 2| pub fn llm_rate_limit() -> Self {
56| 2| Self {
57| 2| initial_delay_ms: 60_000,
58| 2| max_delay_ms: 900_000,
59| 2| multiplier: 2,
60| 2| max_attempts: 20,
61| 2| max_elapsed_secs: 3600,
62| 2| jitter: JitterKind::Half,
63| 2| }
64| 2| }
65| |
66| | /// Cold-start retry: 2s base, 2 attempts, no jitter, 30s deadline.
67| 1| pub fn cold_start() -> Self {
68| 1| Self {
69| 1| initial_delay_ms: 2000,
70| 1| max_delay_ms: 4000,
71| 1| multiplier: 2,
72| 1| max_attempts: 2,
73| 1| max_elapsed_secs: 30,
74| 1| jitter: JitterKind::None,
75| 1| }
76| 1| }
77| |}
78| |
79| |/// Computes the delay for a given attempt using the config's jitter strategy.
80| |///
81| |/// # Formula
82| |///
83| |/// ```text
84| |/// base = min(initial_delay_ms * multiplier^attempt, max_delay_ms)
85| |/// delay = apply_jitter(base, jitter_kind)
86| |/// ```
87| 902|pub fn compute_delay(config: &RetryConfig, attempt: u32) -> Duration {
88| 902| let base = config
89| 902| .initial_delay_ms
90| 902| .saturating_mul(config.multiplier.saturating_pow(attempt))
91| 902| .min(config.max_delay_ms);
92| |
93| 902| let delay_ms = match config.jitter {
94| 2| JitterKind::None => base,
95| | JitterKind::Half => {
96| 500| let half = base / 2;
97| 500| if half == 0 {
98| 0| base
99| | } else {
100| 500| half + fastrand::u64(0..half)
101| | }
102| | }
103| | JitterKind::Full => {
104| 400| if base == 0 {
105| 0| 0
106| | } else {
107| 400| fastrand::u64(0..base)
108| | }
109| | }
110| | };
111| |
112| 902| Duration::from_millis(delay_ms)
113| 902|}
114| |
/// Reports whether the retry kill switch is engaged, i.e. whether the
/// environment variable `SQLITE_GRAPHRAG_DISABLE_RETRY` is exactly `1`.
///
/// An unset variable, a non-UTF-8 value, or any other value leaves the
/// switch off. While it is on, retry loops are expected to propagate errors
/// immediately without sleeping; use it during incidents to prevent retry
/// storms.
pub fn is_kill_switch_active() -> bool {
    matches!(
        std::env::var("SQLITE_GRAPHRAG_DISABLE_RETRY").as_deref(),
        Ok("1")
    )
}
122| |
#[cfg(test)]
mod tests {
    use super::*;

    /// Un-jittered delay for `attempt`, recomputed independently of
    /// `compute_delay` so the tests have a reference bound.
    fn capped_base_ms(cfg: &RetryConfig, attempt: u32) -> u64 {
        let growth = cfg.multiplier.saturating_pow(attempt);
        cfg.initial_delay_ms
            .saturating_mul(growth)
            .min(cfg.max_delay_ms)
    }

    #[test]
    fn compute_delay_half_jitter_in_bounds() {
        let cfg = RetryConfig::llm_rate_limit();
        for attempt in 0..5 {
            let base = capped_base_ms(&cfg, attempt);
            let floor = base / 2;
            for _ in 0..100 {
                let millis = compute_delay(&cfg, attempt).as_millis();
                assert!(millis >= floor as u128);
                assert!(millis < base as u128);
            }
        }
    }

    #[test]
    fn compute_delay_no_jitter_is_deterministic() {
        let cfg = RetryConfig::cold_start();
        let first = compute_delay(&cfg, 0);
        let second = compute_delay(&cfg, 0);
        assert_eq!(first, second);
        assert_eq!(first, Duration::from_millis(2000));
    }

    #[test]
    fn kill_switch_inactive_by_default() {
        std::env::remove_var("SQLITE_GRAPHRAG_DISABLE_RETRY");
        assert!(!is_kill_switch_active());
    }

    #[test]
    fn sqlite_busy_config_matches_constants() {
        let RetryConfig {
            initial_delay_ms,
            max_attempts,
            max_elapsed_secs,
            ..
        } = RetryConfig::sqlite_busy();
        assert_eq!(initial_delay_ms, 300);
        assert_eq!(max_attempts, 5);
        assert_eq!(max_elapsed_secs, 30);
    }

    #[test]
    fn llm_rate_limit_has_deadline() {
        let cfg = RetryConfig::llm_rate_limit();
        assert_eq!(cfg.max_elapsed_secs, 3600);
        assert_eq!(cfg.max_delay_ms, 900_000);
    }

    #[test]
    fn full_jitter_stays_below_base() {
        let cfg = RetryConfig {
            initial_delay_ms: 1000,
            max_delay_ms: 10_000,
            multiplier: 2,
            max_attempts: 5,
            max_elapsed_secs: 60,
            jitter: JitterKind::Full,
        };
        for attempt in 0..4 {
            let base = capped_base_ms(&cfg, attempt);
            for _ in 0..100 {
                let millis = compute_delay(&cfg, attempt).as_millis();
                assert!(millis < base as u128);
            }
        }
    }
}
196| |
197| |// ---------------------------------------------------------------------------
198| |// Circuit Breaker (G28-D, v1.0.68)
199| |// ---------------------------------------------------------------------------
200| |
/// Classification of one retry attempt, fed into a [`CircuitBreaker`].
///
/// The set is kept deliberately small: rate-limit and timeout style errors
/// are TRANSIENT and must NOT count toward the breaker, while every other
/// error is a HARD failure that moves the breaker toward opening.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AttemptOutcome {
    /// Transient error: treated like a successful iteration and never trips
    /// the breaker. Examples: `AppError::RateLimited`, `AppError::Timeout`,
    /// `AppError::DbBusy`.
    Transient,
    /// Hard failure: counted against the breaker's failure threshold.
    /// Examples: `AppError::Validation`, `AppError::Conflict`,
    /// `AppError::Embedding`, `AppError::Internal`.
    HardFailure,
    /// Successful iteration: clears the consecutive-failure counter.
    Success,
}
218| |
/// Tracks consecutive hard failures and trips open once a threshold is hit.
///
/// G28-D (v1.0.68): bounds the `enrich --retry-failed` and
/// `ingest --retry-failed` loops so that a persistent failure (e.g. an LLM
/// provider answering with the same 4xx for hours) cannot run forever. Once
/// `threshold` consecutive [`AttemptOutcome::HardFailure`] outcomes have
/// been recorded, `record` returns `true` and the caller is expected to
/// abort with `AppError::CircuitBreakerOpen`.
///
/// Rate-limited / transient errors are explicitly NOT counted, so a
/// provider that throttles but eventually recovers does not trip the
/// breaker.
#[derive(Debug, Clone)]
pub struct CircuitBreaker {
    /// Number of consecutive hard failures that opens the breaker.
    threshold: u32,
    /// How long the breaker stays open after it trips.
    cooldown: Duration,
    /// Hard failures seen since the last success, transient, or reset.
    consecutive_failures: u32,
    /// Instant at which an open breaker counts as closed again; `None`
    /// while the breaker has never tripped or has been reset.
    open_until: Option<Instant>,
}
237| |
238| |impl CircuitBreaker {
239| | /// Creates a breaker that opens after `threshold` consecutive hard
240| | /// failures and stays open for `cooldown` after the last failure.
241| 3| pub fn new(threshold: u32, cooldown: Duration) -> Self {
242| 3| Self {
243| 3| threshold,
244| 3| cooldown,
245| 3| consecutive_failures: 0,
246| 3| open_until: None,
247| 3| }
248| 3| }
249| |
250| | /// Records one attempt outcome.
251| | ///
252| | /// Returns `true` when the breaker is now open and the caller must
253| | /// abort the job. Returns `false` when the attempt should continue.
254| 17| pub fn record(&mut self, outcome: AttemptOutcome) -> bool {
255| 17| match outcome {
256| | AttemptOutcome::Success | AttemptOutcome::Transient => {
257| 11| self.consecutive_failures = 0;
258| 11| false
259| | }
260| | AttemptOutcome::HardFailure => {
261| 6| self.consecutive_failures = self.consecutive_failures.saturating_add(1);
262| 6| if self.consecutive_failures >= self.threshold.max(1) {
263| 1| self.open_until = Some(Instant::now() + self.cooldown);
264| 1| tracing::error!(
265| | target: "circuit_breaker",
266| | consecutive_failures = self.consecutive_failures,
267| | threshold = self.threshold,
268| 0| cooldown_secs = self.cooldown.as_secs(),
269| 0| "circuit breaker opened — aborting job"
270| | );
271| 1| true
272| | } else {
273| 5| false
274| | }
275| | }
276| | }
277| 17| }
278| |
279| | /// `true` when the breaker is currently open (and not yet cooled down).
280| 3| pub fn is_open(&self) -> bool {
281| 3| self.open_until
282| 3| .map(|deadline| Instant::now() < deadline)
^1 ^1
283| 3| .unwrap_or(false)
284| 3| }
285| |
286| | /// Resets the breaker to closed state.
287| 0| pub fn reset(&mut self) {
288| 0| self.consecutive_failures = 0;
289| 0| self.open_until = None;
290| 0| }
291| |
292| | /// Returns the number of consecutive HardFailure outcomes observed
293| | /// since the last success or reset. Public so callers can include
294| | /// the value in their abort log line.
295| 0| pub fn consecutive_failures(&self) -> u32 {
296| 0| self.consecutive_failures
297| 0| }
298| |}
299| |
#[cfg(test)]
mod circuit_breaker_tests {
    use super::*;

    /// Shorthand for a breaker with a 60s cooldown.
    fn breaker(threshold: u32) -> CircuitBreaker {
        CircuitBreaker::new(threshold, Duration::from_secs(60))
    }

    #[test]
    fn opens_after_threshold_consecutive_hard_failures() {
        let mut cb = breaker(3);
        let first = cb.record(AttemptOutcome::HardFailure);
        let second = cb.record(AttemptOutcome::HardFailure);
        let third = cb.record(AttemptOutcome::HardFailure);
        assert!(!first);
        assert!(!second);
        assert!(third);
        assert!(cb.is_open());
    }

    #[test]
    fn ignores_transient_errors() {
        let mut cb = breaker(2);
        // Ten transients in a row must never open the breaker.
        for _ in 0..10 {
            let opened = cb.record(AttemptOutcome::Transient);
            assert!(!opened);
        }
        assert!(!cb.is_open());
    }

    #[test]
    fn success_resets_consecutive_failures() {
        let mut cb = breaker(3);
        for outcome in [
            AttemptOutcome::HardFailure,
            AttemptOutcome::HardFailure,
            AttemptOutcome::Success,
        ] {
            cb.record(outcome);
        }
        // The counter restarted, so one more hard failure is below threshold.
        assert!(!cb.record(AttemptOutcome::HardFailure));
        assert!(!cb.is_open());
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/signals.rs:
1| |//! Cross-platform signal handling: SIGINT, SIGTERM, SIGHUP.
2| |
3| |use std::sync::atomic::Ordering;
4| |
5| |/// Registers the global shutdown handler for Ctrl+C / SIGTERM / SIGHUP.
6| |///
7| |/// First signal: sets [`SHUTDOWN`](crate::SHUTDOWN) flag, cancels the global
8| |/// cancellation token, logs graceful shutdown intent.
9| |///
10| |/// Second signal: calls [`std::process::exit(130)`] for immediate termination
11| |/// following Unix convention (128 + SIGINT=2).
12| 0|pub fn register_shutdown_handler() {
13| 0| if let Err(e) = ctrlc::set_handler(move || {
14| 0| let prev = crate::SIGNAL_COUNT.fetch_add(1, Ordering::AcqRel);
15| 0| if prev == 0 {
16| 0| crate::SHUTDOWN.store(true, Ordering::Release);
17| 0| crate::SIGNAL_NUMBER.store(2, Ordering::Release);
18| 0| crate::cancel_token().cancel();
19| 0| tracing::warn!(
20| | target: "signals",
21| 0| "shutdown signal received; finishing current operation gracefully"
22| | );
23| | } else {
24| 0| eprintln!("\nForced shutdown (second signal received). Exiting immediately.");
25| 0| std::process::exit(130);
26| | }
27| 0| }) {
28| 0| tracing::warn!(target: "signals", error = %e, "signal handler registration failed");
29| 0| }
30| 0|}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/stdin_helper.rs:
1| |//! Stdin reader with timeout to prevent indefinite blocking when the
2| |//! upstream pipe is held open without sending data.
3| |//!
4| |//! Used by `remember --body-stdin` and `edit` body input to enforce a
5| |//! deadline (default 60s). When the timeout fires, the spawned reader
6| |//! thread is leaked because `std::io::stdin()` cannot be cancelled
7| |//! from outside; this is acceptable in error scenarios because the
8| |//! process is about to exit anyway.
9| |//!
10| |//! When stdin is attached to a terminal (interactive TTY), the function
11| |//! returns an `AppError::Internal` immediately with an actionable message
12| |//! instead of blocking for up to `secs` seconds waiting for EOF.
13| |
14| |use crate::errors::AppError;
15| |use std::io::{IsTerminal, Read};
16| |use std::sync::mpsc;
17| |use std::thread;
18| |use std::time::Duration;
19| |
20| |/// Reads stdin to a `String` with a hard deadline.
21| |///
22| |/// Returns `AppError::Internal` immediately when stdin is attached to a
23| |/// terminal (TTY) — the caller must redirect data via a pipe or file.
24| |///
25| |/// # Errors
26| |/// Returns `AppError::Internal` when stdin is a TTY, when the read does
27| |/// not finish within `secs` seconds, or `AppError::Io` when the
28| |/// underlying read fails.
29| 1|pub fn read_stdin_with_timeout(secs: u64) -> Result<String, AppError> {
30| 1| if std::io::stdin().is_terminal() {
31| 0| return Err(AppError::Internal(anyhow::anyhow!(
32| 0| "stdin is attached to a terminal; pipe data via stdin \
33| 0| (e.g. `echo ... | sqlite-graphrag ...` or `... < file`) \
34| 0| or use --body instead of --body-stdin"
35| 0| )));
36| 1| }
37| 1| let (tx, rx) = mpsc::channel::<std::io::Result<String>>();
38| 1| thread::spawn(move || {
39| 1| let mut buf = String::new();
40| 1| let result = std::io::stdin().read_to_string(&mut buf).map(|_| buf);
41| 1| let _ = tx.send(result);
42| 1| });
43| 1| match rx.recv_timeout(Duration::from_secs(secs)) {
44| 1| Ok(Ok(buf)) => Ok(buf),
45| 0| Ok(Err(e)) => Err(AppError::Io(e)),
46| 0| Err(mpsc::RecvTimeoutError::Timeout) => Err(AppError::Internal(anyhow::anyhow!(
47| 0| "stdin read timed out after {secs}s; pipe must close within timeout window"
48| 0| ))),
49| 0| Err(mpsc::RecvTimeoutError::Disconnected) => Err(AppError::Internal(anyhow::anyhow!(
50| 0| "stdin reader thread disconnected unexpectedly"
51| 0| ))),
52| | }
53| 1|}
54| |
#[cfg(test)]
mod tests {
    use super::*;
    use std::time::Instant;

    // The success path is hard to pin down because tests inherit stdin from
    // the test runner, so this test accepts every outcome the runner's stdin
    // can legitimately produce and only rejects unexpected error variants.
    #[test]
    fn read_stdin_with_timeout_returns_internal_error_on_timeout() {
        // 1s is enough: the runner's stdin is typically a TTY or a pipe that
        // carries no input.
        let started = Instant::now();
        let outcome = read_stdin_with_timeout(1);
        let took = started.elapsed();

        match outcome {
            // EOF (or a read error) before the deadline is acceptable in CI.
            Ok(_) | Err(AppError::Io(_)) => {}
            Err(AppError::Internal(e)) => {
                let msg = e.to_string();
                // Either the TTY-detected error or the timeout error.
                let recognised = msg.contains("timed out") || msg.contains("terminal");
                assert!(recognised, "unexpected internal error: {msg}");
                // The TTY path returns at once; the timeout path takes ~1s.
                assert!(took.as_secs_f64() < 2.5);
            }
            Err(other) => unreachable!("stdin test: expected Internal/Io, got {other:?}"),
        }
    }

    // The TTY branch is not simulated here; validate it empirically:
    //   cargo run --release -- remember --body-stdin --name h1-test
    // Expected: exits in <2s with "stdin is attached to a terminal" message.
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/storage/backend.rs:
1| |//! Storage backend abstraction layer (G14 — phase 1).
2| |//!
3| |//! Defines a trait that abstracts the database connection, enabling future
4| |//! migration from rusqlite to libSQL embedded replicas or other backends.
5| |//!
6| |//! Phase 1 scope: trait definition + SqliteBackend wrapper only.
7| |//! Phase 2 (v1.0.69+): migrate remaining 43 command handlers to use the trait.
8| |
9| |use rusqlite::Connection;
10| |
11| |/// Backend-agnostic storage abstraction.
12| |///
13| |/// Phase 1: wraps `rusqlite::Connection` without functional change.
14| |/// Phase 2: will be implemented for `libsql::Connection` with embedded replicas.
15| |pub trait StorageBackend {
16| | /// Execute a SQL statement and return the number of affected rows.
17| | fn execute_sql(
18| | &self,
19| | sql: &str,
20| | params: &[&dyn rusqlite::types::ToSql],
21| | ) -> Result<usize, crate::errors::AppError>;
22| |
23| | /// Query a single row and map it with the provided closure.
24| | fn query_one<T, F>(
25| | &self,
26| | sql: &str,
27| | params: &[&dyn rusqlite::types::ToSql],
28| | f: F,
29| | ) -> Result<Option<T>, crate::errors::AppError>
30| | where
31| | F: FnOnce(&rusqlite::Row<'_>) -> Result<T, rusqlite::Error>;
32| |
33| | /// Returns a reference to the underlying rusqlite Connection.
34| | /// Phase 1 escape hatch — will be removed when full migration is complete.
35| | fn as_connection(&self) -> &Connection;
36| |}
37| |
38| |/// Default implementation wrapping a rusqlite Connection.
39| |pub struct SqliteBackend {
40| | conn: Connection,
41| |}
42| |
43| |impl SqliteBackend {
44| 1| pub fn new(conn: Connection) -> Self {
45| 1| Self { conn }
46| 1| }
47| |
48| 0| pub fn into_inner(self) -> Connection {
49| 0| self.conn
50| 0| }
51| |}
52| |
53| |impl StorageBackend for SqliteBackend {
54| 1| fn execute_sql(
55| 1| &self,
56| 1| sql: &str,
57| 1| params: &[&dyn rusqlite::types::ToSql],
58| 1| ) -> Result<usize, crate::errors::AppError> {
59| 1| self.conn
60| 1| .execute(sql, params)
61| 1| .map_err(crate::errors::AppError::Database)
62| 1| }
63| |
64| 1| fn query_one<T, F>(
65| 1| &self,
66| 1| sql: &str,
67| 1| params: &[&dyn rusqlite::types::ToSql],
68| 1| f: F,
69| 1| ) -> Result<Option<T>, crate::errors::AppError>
70| 1| where
71| 1| F: FnOnce(&rusqlite::Row<'_>) -> Result<T, rusqlite::Error>,
72| | {
73| 1| match self.conn.query_row(sql, params, f) {
74| 1| Ok(val) => Ok(Some(val)),
75| 0| Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
76| 0| Err(e) => Err(crate::errors::AppError::Database(e)),
77| | }
78| 1| }
79| |
80| 0| fn as_connection(&self) -> &Connection {
81| 0| &self.conn
82| 0| }
83| |}
84| |
#[cfg(test)]
mod tests {
    use super::*;

    /// Round-trips one row through the trait methods on an in-memory database.
    #[test]
    fn sqlite_backend_wraps_connection() {
        let backend = {
            let conn = Connection::open_in_memory().unwrap();
            conn.execute_batch("CREATE TABLE test (id INTEGER PRIMARY KEY, val TEXT)")
                .unwrap();
            SqliteBackend::new(conn)
        };

        let insert_params: [&dyn rusqlite::types::ToSql; 1] = [&"hello"];
        let inserted = backend
            .execute_sql("INSERT INTO test (val) VALUES (?1)", &insert_params)
            .unwrap();
        assert_eq!(inserted, 1);

        let fetched: Option<String> = backend
            .query_one("SELECT val FROM test WHERE id = 1", &[], |row| row.get(0))
            .unwrap();
        assert_eq!(fetched, Some("hello".to_string()));
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/storage/chunks.rs:
1| |//! Chunk storage CRUD for multi-chunk memories.
2| |//!
3| |//! Manages the `memory_chunks` table: insert embeddings for bodies that
4| |//! exceed the 512-token E5 limit and query chunks for vector search.
5| |
6| |// src/storage/chunks.rs
7| |// Chunk storage for bodies exceeding 512 tokens E5 limit
8| |
9| |use crate::embedder::f32_to_bytes;
10| |use crate::errors::AppError;
11| |use rusqlite::{params, Connection};
12| |
/// One row of the `memory_chunks` table, as written by `insert_chunks` and
/// read back by `get_chunks_by_memory`.
#[derive(Debug, Clone)]
pub struct Chunk {
    /// Id of the memory this chunk belongs to.
    pub memory_id: i64,
    /// Position of the chunk within its memory; reads are ordered by it.
    pub chunk_idx: i32,
    /// Text content of the chunk.
    pub chunk_text: String,
    /// Start offset of the chunk within the memory body.
    pub start_offset: i32,
    /// End offset of the chunk within the memory body.
    pub end_offset: i32,
    /// Token count stored for the chunk.
    pub token_count: i32,
}
22| |
23| 6|pub fn insert_chunks(conn: &Connection, chunks: &[Chunk]) -> Result<(), AppError> {
24| 14| for chunk in chunks {
^9
25| 9| conn.execute(
26| 9| "INSERT INTO memory_chunks (memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count)
27| 9| VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
28| 9| params![
29| | chunk.memory_id,
30| | chunk.chunk_idx,
31| | chunk.chunk_text,
32| | chunk.start_offset,
33| | chunk.end_offset,
34| | chunk.token_count,
35| | ],
36| 1| )?;
37| | }
38| 5| Ok(())
39| 6|}
40| |
41| 0|pub fn insert_chunk_slices(
42| 0| conn: &Connection,
43| 0| memory_id: i64,
44| 0| body: &str,
45| 0| chunks: &[crate::chunking::Chunk],
46| 0|) -> Result<(), AppError> {
47| 0| for (chunk_idx, chunk) in chunks.iter().enumerate() {
48| 0| conn.execute(
49| 0| "INSERT INTO memory_chunks (memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count)
50| 0| VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
51| 0| params![
52| | memory_id,
53| 0| chunk_idx as i32,
54| 0| crate::chunking::chunk_text(body, chunk),
55| 0| chunk.start_offset as i32,
56| 0| chunk.end_offset as i32,
57| 0| chunk.token_count_approx as i32,
58| | ],
59| 0| )?;
60| | }
61| 0| Ok(())
62| 0|}
63| |
64| 1|pub fn upsert_chunk_vec(
65| 1| conn: &Connection,
66| 1| _rowid: i64,
67| 1| memory_id: i64,
68| 1| chunk_idx: i32,
69| 1| embedding: &[f32],
70| 1|) -> Result<(), AppError> {
71| 1| conn.execute(
72| 1| "INSERT OR REPLACE INTO vec_chunks(rowid, memory_id, chunk_idx, embedding)
73| 1| VALUES (
74| 1| (SELECT id FROM memory_chunks WHERE memory_id = ?1 AND chunk_idx = ?2),
75| 1| ?1, ?2, ?3
76| 1| )",
77| 1| params![memory_id, chunk_idx, f32_to_bytes(embedding)],
78| 0| )?;
79| 1| Ok(())
80| 1|}
81| |
82| 2|pub fn delete_chunks(conn: &Connection, memory_id: i64) -> Result<(), AppError> {
83| 2| conn.execute(
84| 2| "DELETE FROM memory_chunks WHERE memory_id = ?1",
85| 2| params![memory_id],
86| 0| )?;
87| 2| Ok(())
88| 2|}
89| |
90| 2|pub fn knn_search_chunks(
91| 2| conn: &Connection,
92| 2| embedding: &[f32],
93| 2| k: usize,
94| 2|) -> Result<Vec<(i64, i32, f32)>, AppError> {
95| 2| let bytes = f32_to_bytes(embedding);
96| 2| let mut stmt = conn.prepare_cached(
97| 2| "SELECT memory_id, chunk_idx, distance FROM vec_chunks
98| 2| WHERE embedding MATCH ?1
99| 2| ORDER BY distance LIMIT ?2",
100| 0| )?;
101| 2| let rows = stmt
102| 2| .query_map(params![bytes, k as i64], |r| {
^1
103| | Ok((
104| 1| r.get::<_, i64>(0)?,
^0
105| 1| r.get::<_, i32>(1)?,
^0
106| 1| r.get::<_, f32>(2)?,
^0
107| | ))
108| 1| })?
^0
109| 2| .collect::<Result<Vec<_>, _>>()?;
^0
110| 2| Ok(rows)
111| 2|}
112| |
113| 4|pub fn get_chunks_by_memory(conn: &Connection, memory_id: i64) -> Result<Vec<Chunk>, AppError> {
114| 4| let mut stmt = conn.prepare_cached(
115| 4| "SELECT memory_id, chunk_idx, chunk_text, start_offset, end_offset, token_count
116| 4| FROM memory_chunks WHERE memory_id = ?1 ORDER BY chunk_idx",
117| 0| )?;
118| 4| let rows = stmt
119| 5| .query_map(params![memory_id], |r| {
^4 ^4
120| | Ok(Chunk {
121| 5| memory_id: r.get(0)?,
^0
122| 5| chunk_idx: r.get(1)?,
^0
123| 5| chunk_text: r.get(2)?,
^0
124| 5| start_offset: r.get(3)?,
^0
125| 5| end_offset: r.get(4)?,
^0
126| 5| token_count: r.get(5)?,
^0
127| | })
128| 5| })?
^0
129| 4| .collect::<Result<Vec<_>, _>>()?;
^0
130| 4| Ok(rows)
131| 4|}
132| |
#[cfg(test)]
mod tests {
    use super::*;
    use crate::constants::EMBEDDING_DIM;
    use crate::storage::connection::register_vec_extension;
    use rusqlite::Connection;
    use tempfile::TempDir;

    /// Opens a fresh, fully migrated database inside a temporary directory.
    /// The `TempDir` is returned so the file outlives the connection.
    fn setup_db() -> (TempDir, Connection) {
        register_vec_extension();
        let tmp = TempDir::new().unwrap();
        let mut conn = Connection::open(tmp.path().join("test.db")).unwrap();
        crate::migrations::runner().run(&mut conn).unwrap();
        (tmp, conn)
    }

    /// Inserts a minimal parent memory and returns its rowid.
    fn insert_memory(conn: &Connection) -> i64 {
        conn.execute(
            "INSERT INTO memories (namespace, name, type, description, body, body_hash)
             VALUES ('global', 'test-mem', 'user', 'desc', 'body', 'hash1')",
            [],
        )
        .unwrap();
        conn.last_insert_rowid()
    }

    /// Compact constructor for chunk fixtures.
    fn chunk(
        memory_id: i64,
        chunk_idx: i32,
        text: &str,
        start_offset: i32,
        end_offset: i32,
        token_count: i32,
    ) -> Chunk {
        Chunk {
            memory_id,
            chunk_idx,
            chunk_text: text.to_string(),
            start_offset,
            end_offset,
            token_count,
        }
    }

    #[test]
    fn test_insert_chunks_empty_ok() {
        let (_tmp, conn) = setup_db();
        let outcome = insert_chunks(&conn, &[]);
        assert!(outcome.is_ok());
    }

    #[test]
    fn test_insert_chunks_and_get_by_memory() {
        let (_tmp, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        let fixtures = vec![
            chunk(memory_id, 0, "primeiro chunk", 0, 14, 3),
            chunk(memory_id, 1, "segundo chunk", 15, 28, 3),
        ];
        insert_chunks(&conn, &fixtures).unwrap();

        let fetched = get_chunks_by_memory(&conn, memory_id).unwrap();
        assert_eq!(fetched.len(), 2);

        let first = &fetched[0];
        assert_eq!(first.chunk_idx, 0);
        assert_eq!(first.chunk_text, "primeiro chunk");
        assert_eq!(first.start_offset, 0);
        assert_eq!(first.end_offset, 14);
        assert_eq!(first.token_count, 3);

        let second = &fetched[1];
        assert_eq!(second.chunk_idx, 1);
        assert_eq!(second.chunk_text, "segundo chunk");
    }

    #[test]
    fn test_get_chunks_missing_memory_returns_empty() {
        let (_tmp, conn) = setup_db();
        let fetched = get_chunks_by_memory(&conn, 9999).unwrap();
        assert!(fetched.is_empty());
    }

    #[test]
    fn test_delete_chunks_removes_all() {
        let (_tmp, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        let fixtures = vec![
            chunk(memory_id, 0, "chunk a", 0, 7, 2),
            chunk(memory_id, 1, "chunk b", 8, 15, 2),
        ];
        insert_chunks(&conn, &fixtures).unwrap();

        delete_chunks(&conn, memory_id).unwrap();

        let remaining = get_chunks_by_memory(&conn, memory_id).unwrap();
        assert!(remaining.is_empty());
    }

    #[test]
    fn test_delete_chunks_memory_without_chunks_ok() {
        let (_tmp, conn) = setup_db();
        let outcome = delete_chunks(&conn, 9999);
        assert!(outcome.is_ok());
    }

    #[test]
    fn test_get_chunks_ordered_by_chunk_idx() {
        let (_tmp, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        // Inserted out of order on purpose: 2, 0, 1.
        let fixtures = vec![
            chunk(memory_id, 2, "terceiro", 20, 28, 1),
            chunk(memory_id, 0, "primeiro", 0, 8, 1),
            chunk(memory_id, 1, "segundo", 9, 16, 1),
        ];
        insert_chunks(&conn, &fixtures).unwrap();

        let fetched = get_chunks_by_memory(&conn, memory_id).unwrap();
        assert_eq!(fetched.len(), 3);
        assert_eq!(fetched[0].chunk_idx, 0);
        assert_eq!(fetched[1].chunk_idx, 1);
        assert_eq!(fetched[2].chunk_idx, 2);
    }

    #[test]
    fn test_upsert_chunk_vec_and_knn_search() {
        let (_tmp, conn) = setup_db();
        let memory_id = insert_memory(&conn);

        insert_chunks(&conn, &[chunk(memory_id, 0, "embedding test", 0, 14, 2)]).unwrap();

        let mut embedding = vec![0.0f32; EMBEDDING_DIM];
        embedding[0] = 1.0;

        let chunk_id: i64 = conn
            .query_row(
                "SELECT id FROM memory_chunks WHERE memory_id = ?1 AND chunk_idx = 0",
                params![memory_id],
                |row| row.get(0),
            )
            .unwrap();

        upsert_chunk_vec(&conn, chunk_id, memory_id, 0, &embedding).unwrap();

        let hits = knn_search_chunks(&conn, &embedding, 1).unwrap();
        assert_eq!(hits.len(), 1);
        let (hit_memory_id, hit_chunk_idx, _distance) = hits[0];
        assert_eq!(hit_memory_id, memory_id);
        assert_eq!(hit_chunk_idx, 0);
    }

    #[test]
    fn test_knn_search_chunks_without_data_returns_empty() {
        let (_tmp, conn) = setup_db();
        let embedding = vec![0.0f32; EMBEDDING_DIM];
        let hits = knn_search_chunks(&conn, &embedding, 5).unwrap();
        assert!(hits.is_empty());
    }

    #[test]
    fn test_insert_chunks_invalid_fk_fails() {
        let (_tmp, conn) = setup_db();
        // No memory with this id exists, so the insert must be rejected.
        let orphan = chunk(99999, 0, "sem pai", 0, 7, 1);
        let outcome = insert_chunks(&conn, &[orphan]);
        assert!(outcome.is_err());
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/storage/connection.rs:
1| |//! SQLite connection setup with PRAGMAs and 0600 permissions.
2| |//!
3| |//! Opens (or creates) the database file, loads the `sqlite-vec` extension,
4| |//! applies WAL/journal PRAGMAs, and enforces 0600 file permissions on Unix.
5| |
6| |use crate::errors::AppError;
7| |use crate::paths::AppPaths;
8| |use crate::pragmas::{apply_connection_pragmas, apply_init_pragmas, ensure_wal_mode};
9| |use rusqlite::Connection;
10| |use sqlite_vec::sqlite3_vec_init;
11| |use std::path::Path;
12| |use std::sync::OnceLock;
13| |
14| |static VEC_EXTENSION_REGISTERED: OnceLock<()> = OnceLock::new();
15| |
16| |/// Register sqlite-vec GLOBALLY before any connection is opened.
17| |///
18| |/// Idempotent: subsequent calls are no-ops thanks to `OnceLock`. Safe to invoke from
19| |/// both the binary entry point (`main.rs`) and library helpers like `ensure_db_ready`
20| |/// so unit tests that exercise CRUD handlers do not need to pre-register the extension.
21| 86|pub fn register_vec_extension() {
22| 86| VEC_EXTENSION_REGISTERED.get_or_init(|| {
^1
23| | // SAFETY: sqlite3_auto_extension is a C FFI function that registers a callback
24| | // invoked when SQLite opens any new connection. Soundness assumptions:
25| | // 1. `sqlite3_vec_init` has the exact ABI signature `extern "C" fn(...) -> i32`
26| | // expected by SQLite's auto-extension API (verified by sqlite-vec crate).
27| | // 2. The transmute from `*const ()` to the expected fn pointer is valid because
28| | // both have identical layout on supported platforms (Linux, macOS, Windows).
29| | // 3. `OnceLock::get_or_init` guarantees this closure runs at most once across
30| | // all threads; the auto-extension list is mutated exactly one time.
31| | #[allow(clippy::missing_transmute_annotations)]
32| 1| unsafe {
33| 1| rusqlite::ffi::sqlite3_auto_extension(Some(std::mem::transmute(
34| 1| sqlite3_vec_init as *const (),
35| 1| )));
36| 1| }
37| 1| });
38| 86|}
39| |
40| 2|pub fn open_rw(path: &Path) -> Result<Connection, AppError> {
41| 2| let conn = Connection::open(path)?;
^0
42| 2| apply_connection_pragmas(&conn)?;
^0
43| 2| apply_secure_permissions(path);
44| 2| Ok(conn)
45| 2|}
46| |
47| 0|pub fn ensure_schema(conn: &mut Connection) -> Result<(), AppError> {
48| 0| crate::migrations::runner()
49| 0| .run(conn)
50| 0| .map_err(|e| AppError::Internal(anyhow::anyhow!("migration failed: {e}")))?;
51| 0| conn.execute_batch(&format!(
52| 0| "PRAGMA user_version = {};",
53| 0| crate::constants::SCHEMA_USER_VERSION
54| 0| ))?;
55| 0| Ok(())
56| 0|}
57| |
58| |/// Ensures the database file exists and the schema is at the current version.
59| |///
60| |/// Behavior:
61| |/// - DB does not exist: creates the file, applies init PRAGMAs, runs all migrations,
62| |/// sets `PRAGMA user_version`, and populates `schema_meta` with default values.
63| |/// Emits `tracing::info!` on creation.
64| |/// - DB exists with `user_version` below `SCHEMA_USER_VERSION`: runs the remaining
65| |/// migrations and updates `user_version`. Emits `tracing::warn!` on auto-migration.
66| |/// - DB exists with `user_version` equal to `SCHEMA_USER_VERSION`: no-op.
67| |///
68| |/// This helper unifies the auto-init contract across CRUD handlers so users can run
69| |/// any subcommand on a fresh directory without invoking `init` first. Idempotent
70| |/// and safe to call before every handler that needs a ready database.
71| 1|pub fn ensure_db_ready(paths: &AppPaths) -> Result<(), AppError> {
72| 1| register_vec_extension();
73| 1| paths.ensure_dirs()?;
^0
74| |
75| 1| let db_existed = paths.db.exists();
76| |
77| 1| if !db_existed {
78| 1| tracing::info!(target: "storage",
79| 0| path = %paths.db.display(),
80| | schema_version = crate::constants::CURRENT_SCHEMA_VERSION,
81| 0| "creating database (auto-init)"
82| | );
83| 0| }
84| |
85| 1| let mut conn = open_rw(&paths.db)?;
^0
86| |
87| 1| if !db_existed {
88| 1| apply_init_pragmas(&conn)?;
^0
89| 0| }
90| |
91| 1| let current_user_version: i64 = conn
92| 1| .query_row("PRAGMA user_version", [], |row| row.get(0))
93| 1| .unwrap_or(0);
94| 1| let target_user_version = crate::constants::SCHEMA_USER_VERSION;
95| |
96| 1| if current_user_version < target_user_version {
97| 1| if db_existed {
98| 0| tracing::warn!(target: "storage",
99| | from = current_user_version,
100| | to = target_user_version,
101| 0| path = %paths.db.display(),
102| 0| "auto-migrating database schema"
103| | );
104| 1| }
105| 1| crate::migrations::runner()
106| 1| .run(&mut conn)
107| 1| .map_err(|e| AppError::Internal(anyhow::anyhow!("auto-migration failed: {e}")))?;
^0 ^0 ^0
108| 1| conn.execute_batch(&format!("PRAGMA user_version = {target_user_version};"))?;
^0
109| |
110| 1| if !db_existed {
111| 1| insert_default_schema_meta(&conn)?;
^0
112| 0| }
113| |
114| | // Defensive re-assertion: refinery's migration runner may open internal
115| | // handles that revert journal_mode to delete on some platforms. Re-apply
116| | // WAL after migrations to guarantee the documented contract holds for
117| | // every command that goes through the auto-init path.
118| 1| ensure_wal_mode(&conn)?;
^0
119| 0| }
120| |
121| 1| Ok(())
122| 1|}
123| |
124| 1|fn insert_default_schema_meta(conn: &Connection) -> Result<(), AppError> {
125| 1| conn.execute(
126| 1| "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('schema_version', ?1)",
127| 1| rusqlite::params![crate::constants::CURRENT_SCHEMA_VERSION.to_string()],
128| 0| )?;
129| 1| conn.execute(
130| 1| "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('model', 'multilingual-e5-small')",
131| 1| [],
132| 0| )?;
133| 1| conn.execute(
134| 1| "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('dim', '384')",
135| 1| [],
136| 0| )?;
137| 1| conn.execute(
138| 1| "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('created_at', CAST(unixepoch() AS TEXT))",
139| 1| [],
140| 0| )?;
141| 1| conn.execute(
142| 1| "INSERT OR REPLACE INTO schema_meta (key, value) VALUES ('sqlite-graphrag_version', ?1)",
143| 1| rusqlite::params![crate::constants::SQLITE_GRAPHRAG_VERSION],
144| 0| )?;
145| 1| Ok(())
146| 1|}
147| |
148| |/// Applies 600 permissions (owner read/write only) to the SQLite file and its WAL/SHM
149| |/// companion files on Unix to prevent leaking private memories in shared directories
150| |/// (e.g. multi-user /tmp, Dropbox, NFS). On Windows, NTFS DACL default is private-to-user
151| |/// so explicit permission setting is unnecessary; a debug log records the skip. Failures
152| |/// are silent to avoid blocking the operation when the process does not own the file
153| |/// (e.g. read-only mount).
154| |#[allow(unused_variables)]
155| 2|fn apply_secure_permissions(path: &Path) {
156| | #[cfg(unix)]
157| | {
158| | use std::os::unix::fs::PermissionsExt;
159| 2| let candidates = [
160| 2| path.to_path_buf(),
161| 2| path.with_extension(format!(
162| 2| "{}-wal",
163| 2| path.extension()
164| 2| .and_then(|e| e.to_str())
165| 2| .unwrap_or("sqlite")
166| | )),
167| 2| path.with_extension(format!(
168| 2| "{}-shm",
169| 2| path.extension()
170| 2| .and_then(|e| e.to_str())
171| 2| .unwrap_or("sqlite")
172| | )),
173| | ];
174| 6| for file in candidates.iter() {
^2 ^2
175| 6| if file.exists() {
176| 4| if let Ok(meta) = std::fs::metadata(file) {
177| 4| let mut perms = meta.permissions();
178| 4| perms.set_mode(0o600);
179| 4| let _ = std::fs::set_permissions(file, perms);
180| 4| }
^0
181| 2| }
182| | }
183| | }
184| | #[cfg(windows)]
185| | {
186| | tracing::debug!(target: "storage",
187| | path = %path.display(),
188| | "skipping Unix mode 0o600 on Windows; NTFS DACL default is private-to-user"
189| | );
190| | }
191| 2|}
192| |
193| 0|pub fn open_ro(path: &Path) -> Result<Connection, AppError> {
194| 0| let conn = Connection::open_with_flags(
195| 0| path,
196| 0| rusqlite::OpenFlags::SQLITE_OPEN_READ_ONLY | rusqlite::OpenFlags::SQLITE_OPEN_URI,
197| 0| )?;
198| 0| conn.execute_batch("PRAGMA foreign_keys = ON;")?;
199| 0| Ok(conn)
200| 0|}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/storage/entities.rs:
1| |//! Persistence layer for entities, relationships and their junction tables.
2| |//!
3| |//! The entity graph mirrors the conceptual content of memories: `entities`
4| |//! holds nodes, `relationships` holds typed edges and `memory_entities` and
5| |//! `memory_relationships` connect each memory to the graph slice it emitted.
6| |
7| |use crate::embedder::f32_to_bytes;
8| |use crate::entity_type::EntityType;
9| |use crate::errors::AppError;
10| |use crate::parsers::normalize_entity_name;
11| |use crate::storage::utils::with_busy_retry;
12| |use rusqlite::{params, Connection};
13| |use serde::{Deserialize, Serialize};
14| |
15| |/// Input payload used to upsert a single entity.
16| |///
17| |/// `name` is normalized to kebab-case by the caller. `description` is
18| |/// optional and preserved across upserts when the new value is `None`.
19| |#[derive(Debug, Serialize, Deserialize, Clone)]
20| |#[serde(deny_unknown_fields)]
21| |pub struct NewEntity {
22| | pub name: String,
23| | #[serde(alias = "type")]
24| | pub entity_type: EntityType,
25| | pub description: Option<String>,
26| |}
27| |
28| |/// Input payload used to upsert a typed relationship between entities.
29| |///
30| |/// `strength` must lie within `[0.0, 1.0]` and is mapped to the `weight`
31| |/// column of the `relationships` table.
32| |#[derive(Debug, Serialize, Deserialize, Clone)]
33| |#[serde(deny_unknown_fields)]
34| |pub struct NewRelationship {
35| | #[serde(alias = "from")]
36| | pub source: String,
37| | #[serde(alias = "to")]
38| | pub target: String,
39| | #[serde(alias = "type")]
40| | pub relation: String,
41| | #[serde(alias = "weight")]
42| | pub strength: f64,
43| | pub description: Option<String>,
44| |}
45| |
46| |/// Validates entity name against quality rules.
47| |///
48| |/// Rejects names with newlines, names shorter than 2 characters, and
49| |/// ALL_CAPS abbreviations of 4 characters or fewer (common NER noise).
50| |///
51| |/// # Errors
52| |///
53| |/// Returns `Err(AppError::Validation)` when the name violates any rule.
54| 62|pub fn validate_entity_name(name: &str) -> Result<(), AppError> {
55| 62| if name.len() < 2 {
56| 2| return Err(AppError::Validation(format!(
57| 2| "entity name '{name}' must be at least 2 characters"
58| 2| )));
59| 60| }
60| 60| if name.contains('\n') || name.contains('\r') {
^59 ^59
61| 2| return Err(AppError::Validation(
62| 2| "entity name must not contain newline characters".to_string(),
63| 2| ));
64| 58| }
65| 58| if name.len() <= 4
66| 19| && name
67| 19| .chars()
68| 29| .all(|c| c.is_ascii_uppercase() || c == '_' || c == '-')
^19 ^15 ^15
69| | {
70| 4| return Err(AppError::Validation(format!(
71| 4| "entity name '{name}' rejected: short ALL_CAPS names are typically NER noise"
72| 4| )));
73| 54| }
74| 54| Ok(())
75| 62|}
76| |
77| |/// Upserts an entity and returns its primary key.
78| |///
79| |/// Uses `ON CONFLICT(namespace, name)` to keep one row per entity within a
80| |/// namespace, refreshing `type` and `description` opportunistically.
81| |///
82| |/// # Errors
83| |///
84| |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
85| 48|pub fn upsert_entity(conn: &Connection, namespace: &str, e: &NewEntity) -> Result<i64, AppError> {
86| | // Step 1: validate the original name — catches ALL_CAPS short noise (NER artefacts),
87| | // newlines, and names shorter than 2 characters before any transformation.
88| 48| validate_entity_name(&e.name)?;
^0
89| | // Step 2: normalize to kebab-case ASCII (NFKD, lowercase, spaces/underscores → hyphens).
90| 48| let normalized_name = normalize_entity_name(&e.name);
91| | // Step 3: guard post-normalization length — a valid original could collapse to < 2 chars
92| | // (e.g. a single accented character that strips entirely).
93| 48| if normalized_name.chars().count() < 2 {
94| 0| return Err(AppError::Validation(format!(
95| 0| "entity name '{}' normalizes to '{}' which is too short (minimum 2 characters)",
96| 0| e.name, normalized_name
97| 0| )));
98| 48| }
99| 48| conn.execute(
100| 48| "INSERT INTO entities (namespace, name, type, description)
101| 48| VALUES (?1, ?2, ?3, ?4)
102| 48| ON CONFLICT(namespace, name) DO UPDATE SET
103| 48| type = excluded.type,
104| 48| description = COALESCE(excluded.description, entities.description),
105| 48| updated_at = unixepoch()",
106| 48| params![namespace, normalized_name, e.entity_type, e.description],
107| 0| )?;
108| 48| let id: i64 = conn.query_row(
109| 48| "SELECT id FROM entities WHERE namespace = ?1 AND name = ?2",
110| 48| params![namespace, normalized_name],
111| 48| |r| r.get(0),
112| 0| )?;
113| 48| Ok(id)
114| 48|}
115| |
116| |/// Replaces the vector row for an entity in `vec_entities`.
117| |///
118| |/// vec0 virtual tables do not honour `INSERT OR REPLACE` when the primary key
119| |/// already exists — they raise a UNIQUE constraint error instead of silently
120| |/// replacing the row. The workaround is an explicit DELETE before INSERT so
121| |/// that the insert never conflicts. `embedding` must have length
122| |/// [`crate::constants::EMBEDDING_DIM`].
123| |///
124| |/// # Errors
125| |///
126| |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
127| 7|pub fn upsert_entity_vec(
128| 7| conn: &Connection,
129| 7| entity_id: i64,
130| 7| namespace: &str,
131| 7| entity_type: EntityType,
132| 7| embedding: &[f32],
133| 7| name: &str,
134| 7|) -> Result<(), AppError> {
135| | // Both statements wrapped in with_busy_retry: WAL concurrency can cause
136| | // SQLITE_BUSY on vec0 virtual table writes when multiple CLI instances run.
137| 7| let embedding_bytes = f32_to_bytes(embedding);
138| 7| with_busy_retry(|| {
139| 7| conn.execute(
140| 7| "DELETE FROM vec_entities WHERE entity_id = ?1",
141| 7| params![entity_id],
142| 0| )?;
143| 7| conn.execute(
144| 7| "INSERT INTO vec_entities(entity_id, namespace, type, embedding, name)
145| 7| VALUES (?1, ?2, ?3, ?4, ?5)",
146| 7| params![entity_id, namespace, entity_type, &embedding_bytes, name],
147| 0| )?;
148| 7| Ok(())
149| 7| })
150| 7|}
151| |
152| |/// Upserts a typed relationship between two entity ids.
153| |///
154| |/// Conflicts on `(source_id, target_id, relation)` refresh `weight` and
155| |/// preserve a non-null `description`. Returns the `rowid` of the stored row.
156| |///
157| |/// # Errors
158| |///
159| |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
160| 9|pub fn upsert_relationship(
161| 9| conn: &Connection,
162| 9| namespace: &str,
163| 9| source_id: i64,
164| 9| target_id: i64,
165| 9| rel: &NewRelationship,
166| 9|) -> Result<i64, AppError> {
167| 9| conn.execute(
168| 9| "INSERT INTO relationships (namespace, source_id, target_id, relation, weight, description)
169| 9| VALUES (?1, ?2, ?3, ?4, ?5, ?6)
170| 9| ON CONFLICT(source_id, target_id, relation) DO UPDATE SET
171| 9| weight = excluded.weight,
172| 9| description = COALESCE(excluded.description, relationships.description)",
173| 9| params![
174| | namespace,
175| | source_id,
176| | target_id,
177| | rel.relation,
178| | rel.strength,
179| | rel.description
180| | ],
181| 0| )?;
182| 9| let id: i64 = conn.query_row(
183| 9| "SELECT id FROM relationships WHERE source_id=?1 AND target_id=?2 AND relation=?3",
184| 9| params![source_id, target_id, rel.relation],
185| 9| |r| r.get(0),
186| 0| )?;
187| 9| Ok(id)
188| 9|}
189| |
190| |/// Links a memory to an entity in the `memory_entities` join table.
191| |///
192| |/// # Errors
193| |///
194| |/// Returns [`AppError::Database`] when the underlying SQLite operation fails.
195| 3|pub fn link_memory_entity(
196| 3| conn: &Connection,
197| 3| memory_id: i64,
198| 3| entity_id: i64,
199| 3|) -> Result<(), AppError> {
200| 3| conn.execute(
201| 3| "INSERT OR IGNORE INTO memory_entities (memory_id, entity_id) VALUES (?1, ?2)",
202| 3| params![memory_id, entity_id],
203| 0| )?;
204| 3| Ok(())
205| 3|}
206| |
207| |/// Links a memory to a relationship in the `memory_relationships` join table.
208| |///
209| |/// # Errors
210| |///
211| |/// Returns [`AppError::Database`] when the underlying SQLite operation fails.
212| 2|pub fn link_memory_relationship(
213| 2| conn: &Connection,
214| 2| memory_id: i64,
215| 2| rel_id: i64,
216| 2|) -> Result<(), AppError> {
217| 2| conn.execute(
218| 2| "INSERT OR IGNORE INTO memory_relationships (memory_id, relationship_id) VALUES (?1, ?2)",
219| 2| params![memory_id, rel_id],
220| 0| )?;
221| 2| Ok(())
222| 2|}
223| |
224| |/// Increments the `degree` counter of an entity by one.
225| |///
226| |/// # Errors
227| |///
228| |/// Returns [`AppError::Database`] when the underlying SQLite operation fails.
229| 2|pub fn increment_degree(conn: &Connection, entity_id: i64) -> Result<(), AppError> {
230| 2| conn.execute(
231| 2| "UPDATE entities SET degree = degree + 1 WHERE id = ?1",
232| 2| params![entity_id],
233| 0| )?;
234| 2| Ok(())
235| 2|}
236| |
237| |/// Looks up the entity by name and namespace. Returns the id when it exists.
238| |///
239| |/// # Errors
240| |///
241| |/// Returns [`AppError::Database`] when the underlying SQLite operation fails.
242| 6|pub fn find_entity_id(
243| 6| conn: &Connection,
244| 6| namespace: &str,
245| 6| name: &str,
246| 6|) -> Result<Option<i64>, AppError> {
247| | // Normalize the lookup name so it matches the normalized names written by
248| | // `upsert_entity`. Without this, an entity written through normalization
249| | // (e.g. "Foo Bar" -> "foo-bar") would be unreachable by its original
250| | // spelling, breaking delete-entity, reclassify, merge-entities, rename and
251| | // memory-entities lookups.
252| 6| let name = normalize_entity_name(name);
253| 6| let mut stmt =
254| 6| conn.prepare_cached("SELECT id FROM entities WHERE namespace = ?1 AND name = ?2")?;
^0
255| 6| match stmt.query_row(params![namespace, &name], |r| r.get::<_, i64>(0)) {
^2^2
256| 2| Ok(id) => Ok(Some(id)),
257| 4| Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
258| 0| Err(e) => Err(AppError::Database(e)),
259| | }
260| 6|}
261| |
262| |/// Structure representing an existing relation.
263| |#[derive(Debug, Serialize)]
264| |pub struct RelationshipRow {
265| | pub id: i64,
266| | pub namespace: String,
267| | pub source_id: i64,
268| | pub target_id: i64,
269| | pub relation: String,
270| | pub weight: f64,
271| | pub description: Option<String>,
272| |}
273| |
274| |/// Looks up a specific relation by (source_id, target_id, relation).
275| |///
276| |/// # Errors
277| |///
278| |/// Returns [`AppError::Database`] when the underlying SQLite operation fails.
279| 6|pub fn find_relationship(
280| 6| conn: &Connection,
281| 6| source_id: i64,
282| 6| target_id: i64,
283| 6| relation: &str,
284| 6|) -> Result<Option<RelationshipRow>, AppError> {
285| 6| let mut stmt = conn.prepare_cached(
286| 6| "SELECT id, namespace, source_id, target_id, relation, weight, description
287| 6| FROM relationships
288| 6| WHERE source_id = ?1 AND target_id = ?2 AND relation = ?3",
289| 0| )?;
290| 6| match stmt.query_row(params![source_id, target_id, relation], |r| {
^2
291| | Ok(RelationshipRow {
292| 2| id: r.get(0)?,
^0
293| 2| namespace: r.get(1)?,
^0
294| 2| source_id: r.get(2)?,
^0
295| 2| target_id: r.get(3)?,
^0
296| 2| relation: r.get(4)?,
^0
297| 2| weight: r.get(5)?,
^0
298| 2| description: r.get(6)?,
^0
299| | })
300| 2| }) {
301| 2| Ok(row) => Ok(Some(row)),
302| 4| Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
303| 0| Err(e) => Err(AppError::Database(e)),
304| | }
305| 6|}
306| |
307| |/// Creates a relation if it does not exist (returns action="created")
308| |/// or returns the existing relation (action="already_exists") with updated weight.
309| |///
310| |/// # Errors
311| |///
312| |/// - [`AppError::Database`] — SQLite query or constraint failure.
313| |/// - [`AppError::Validation`] — self-link attempt (source equals target).
314| 3|pub fn create_or_fetch_relationship(
315| 3| conn: &Connection,
316| 3| namespace: &str,
317| 3| source_id: i64,
318| 3| target_id: i64,
319| 3| relation: &str,
320| 3| weight: f64,
321| 3| description: Option<&str>,
322| 3|) -> Result<(i64, bool), AppError> {
323| | // Check if it exists first; update weight if different.
324| 3| let existing = find_relationship(conn, source_id, target_id, relation)?;
^0
325| 3| if let Some(row) = existing {
^1
326| 1| if (row.weight - weight).abs() > f64::EPSILON {
327| 0| conn.execute(
328| 0| "UPDATE relationships SET weight = ?1 WHERE id = ?2",
329| 0| params![weight, row.id],
330| 0| )?;
331| 1| }
332| 1| return Ok((row.id, false));
333| 2| }
334| 2| conn.execute(
335| 2| "INSERT INTO relationships (namespace, source_id, target_id, relation, weight, description)
336| 2| VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
337| 2| params![
338| | namespace,
339| | source_id,
340| | target_id,
341| | relation,
342| | weight,
343| | description
344| | ],
345| 0| )?;
346| 2| let id: i64 = conn.query_row(
347| 2| "SELECT id FROM relationships WHERE source_id = ?1 AND target_id = ?2 AND relation = ?3",
348| 2| params![source_id, target_id, relation],
349| 2| |r| r.get(0),
350| 0| )?;
351| 2| Ok((id, true))
352| 3|}
353| |
354| |/// Removes a relation by id and cleans up memory_relationships.
355| |///
356| |/// # Errors
357| |///
358| |/// Returns [`AppError::Database`] when the underlying SQLite operation fails.
359| 1|pub fn delete_relationship_by_id(conn: &Connection, relationship_id: i64) -> Result<(), AppError> {
360| 1| conn.execute(
361| 1| "DELETE FROM memory_relationships WHERE relationship_id = ?1",
362| 1| params![relationship_id],
363| 0| )?;
364| 1| conn.execute(
365| 1| "DELETE FROM relationships WHERE id = ?1",
366| 1| params![relationship_id],
367| 0| )?;
368| 1| Ok(())
369| 1|}
370| |
371| |/// Recalculates the `degree` field of an entity.
372| |///
373| |/// # Errors
374| |///
375| |/// Returns [`AppError::Database`] when the underlying SQLite operation fails.
376| 1|pub fn recalculate_degree(conn: &Connection, entity_id: i64) -> Result<(), AppError> {
377| 1| conn.execute(
378| 1| "UPDATE entities
379| 1| SET degree = (SELECT COUNT(*) FROM relationships
380| 1| WHERE source_id = entities.id OR target_id = entities.id)
381| 1| WHERE id = ?1",
382| 1| params![entity_id],
383| 0| )?;
384| 1| Ok(())
385| 1|}
386| |
387| |/// Entity row with enough data for graph export/query.
388| |#[derive(Debug, Serialize, Clone)]
389| |pub struct EntityNode {
390| | pub id: i64,
391| | pub name: String,
392| | pub namespace: String,
393| | pub kind: String,
394| |}
395| |
396| |/// Lists entities, filtering by namespace if provided.
397| |///
398| |/// # Errors
399| |///
400| |/// Returns [`AppError::Database`] when the underlying SQLite operation fails.
401| 2|pub fn list_entities(
402| 2| conn: &Connection,
403| 2| namespace: Option<&str>,
404| 2|) -> Result<Vec<EntityNode>, AppError> {
405| 2| if let Some(ns) = namespace {
^1
406| 1| let mut stmt = conn.prepare_cached(
407| 1| "SELECT id, name, namespace, type FROM entities WHERE namespace = ?1 ORDER BY id",
408| 0| )?;
409| 1| let rows = stmt
410| 2| .query_map(params![ns], |r| {
^1 ^1
411| | Ok(EntityNode {
412| 2| id: r.get(0)?,
^0
413| 2| name: r.get(1)?,
^0
414| 2| namespace: r.get(2)?,
^0
415| 2| kind: r.get(3)?,
^0
416| | })
417| 2| })?
^0
418| 1| .collect::<Result<Vec<_>, _>>()?;
^0
419| 1| Ok(rows)
420| | } else {
421| 1| let mut stmt = conn.prepare_cached(
422| 1| "SELECT id, name, namespace, type FROM entities ORDER BY namespace, id",
423| 0| )?;
424| 1| let rows = stmt
425| 2| .query_map([], |r| {
^1 ^1
426| | Ok(EntityNode {
427| 2| id: r.get(0)?,
^0
428| 2| name: r.get(1)?,
^0
429| 2| namespace: r.get(2)?,
^0
430| 2| kind: r.get(3)?,
^0
431| | })
432| 2| })?
^0
433| 1| .collect::<Result<Vec<_>, _>>()?;
^0
434| 1| Ok(rows)
435| | }
436| 2|}
437| |
438| |/// Lists relations filtered by namespace (of source/target entities).
439| |///
440| |/// # Errors
441| |///
442| |/// Returns [`AppError::Database`] when the underlying SQLite operation fails.
443| 1|pub fn list_relationships_by_namespace(
444| 1| conn: &Connection,
445| 1| namespace: Option<&str>,
446| 1|) -> Result<Vec<RelationshipRow>, AppError> {
447| 1| if let Some(ns) = namespace {
448| 1| let mut stmt = conn.prepare_cached(
449| 1| "SELECT r.id, r.namespace, r.source_id, r.target_id, r.relation, r.weight, r.description
450| 1| FROM relationships r
451| 1| JOIN entities se ON se.id = r.source_id AND se.namespace = ?1
452| 1| JOIN entities te ON te.id = r.target_id AND te.namespace = ?1
453| 1| ORDER BY r.id",
454| 0| )?;
455| 1| let rows = stmt
456| 1| .query_map(params![ns], |r| {
457| | Ok(RelationshipRow {
458| 1| id: r.get(0)?,
^0
459| 1| namespace: r.get(1)?,
^0
460| 1| source_id: r.get(2)?,
^0
461| 1| target_id: r.get(3)?,
^0
462| 1| relation: r.get(4)?,
^0
463| 1| weight: r.get(5)?,
^0
464| 1| description: r.get(6)?,
^0
465| | })
466| 1| })?
^0
467| 1| .collect::<Result<Vec<_>, _>>()?;
^0
468| 1| Ok(rows)
469| | } else {
470| 0| let mut stmt = conn.prepare_cached(
471| 0| "SELECT id, namespace, source_id, target_id, relation, weight, description
472| 0| FROM relationships ORDER BY id",
473| 0| )?;
474| 0| let rows = stmt
475| 0| .query_map([], |r| {
476| | Ok(RelationshipRow {
477| 0| id: r.get(0)?,
478| 0| namespace: r.get(1)?,
479| 0| source_id: r.get(2)?,
480| 0| target_id: r.get(3)?,
481| 0| relation: r.get(4)?,
482| 0| weight: r.get(5)?,
483| 0| description: r.get(6)?,
484| | })
485| 0| })?
486| 0| .collect::<Result<Vec<_>, _>>()?;
487| 0| Ok(rows)
488| | }
489| 1|}
490| |
491| |/// Locates orphan entities: no link in memory_entities and no relations.
492| |///
493| |/// # Errors
494| |///
495| |/// Returns [`AppError::Database`] when the underlying SQLite operation fails.
496| 3|pub fn find_orphan_entity_ids(
497| 3| conn: &Connection,
498| 3| namespace: Option<&str>,
499| 3|) -> Result<Vec<i64>, AppError> {
500| 3| if let Some(ns) = namespace {
^2
501| 2| let mut stmt = conn.prepare_cached(
502| 2| "SELECT e.id FROM entities e
503| 2| WHERE e.namespace = ?1
504| 2| AND NOT EXISTS (SELECT 1 FROM memory_entities me WHERE me.entity_id = e.id)
505| 2| AND NOT EXISTS (
506| 2| SELECT 1 FROM relationships r
507| 2| WHERE r.source_id = e.id OR r.target_id = e.id
508| 2| )",
509| 0| )?;
510| 2| let ids = stmt
511| 2| .query_map(params![ns], |r| r.get::<_, i64>(0))?
^1^1 ^0
512| 2| .collect::<Result<Vec<_>, _>>()?;
^0
513| 2| Ok(ids)
514| | } else {
515| 1| let mut stmt = conn.prepare_cached(
516| 1| "SELECT e.id FROM entities e
517| 1| WHERE NOT EXISTS (SELECT 1 FROM memory_entities me WHERE me.entity_id = e.id)
518| 1| AND NOT EXISTS (
519| 1| SELECT 1 FROM relationships r
520| 1| WHERE r.source_id = e.id OR r.target_id = e.id
521| 1| )",
522| 0| )?;
523| 1| let ids = stmt
524| 2| .query_map([], |r| r.get::<_, i64>(0))?
^1 ^1 ^0
525| 1| .collect::<Result<Vec<_>, _>>()?;
^0
526| 1| Ok(ids)
527| | }
528| 3|}
529| |
530| |/// Deletes entities and their associated vectors. Returns the number of entities removed.
531| |///
532| |/// # Errors
533| |///
534| |/// Returns [`AppError::Database`] when the underlying SQLite operation fails.
535| 5|pub fn delete_entities_by_ids(conn: &Connection, entity_ids: &[i64]) -> Result<usize, AppError> {
536| 5| if entity_ids.is_empty() {
537| 1| return Ok(0);
538| 4| }
539| 4| let mut removed = 0usize;
540| 9| for id in entity_ids {
^5
541| | // vec0 lacks FK CASCADE — clean vec_entities explicitly.
542| 5| let _ = conn.execute("DELETE FROM vec_entities WHERE entity_id = ?1", params![id]);
543| 5| let affected = conn.execute("DELETE FROM entities WHERE id = ?1", params![id])?;
^0
544| 5| removed += affected;
545| | }
546| 4| Ok(removed)
547| 5|}
548| |
549| |/// Counts relationships matching the given relation type within a namespace.
550| |///
551| |/// Used by `prune-relations --dry-run` to preview the number of relationships
552| |/// that would be deleted without actually modifying the database.
553| |///
554| |/// # Errors
555| |///
556| |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
557| 0|pub fn count_relationships_by_relation(
558| 0| conn: &Connection,
559| 0| namespace: &str,
560| 0| relation: &str,
561| 0|) -> Result<usize, AppError> {
562| 0| let count: i64 = conn.query_row(
563| 0| "SELECT COUNT(*) FROM relationships WHERE namespace = ?1 AND relation = ?2",
564| 0| params![namespace, relation],
565| 0| |r| r.get(0),
566| 0| )?;
567| 0| Ok(count as usize)
568| 0|}
569| |
570| |/// Returns unique entity names involved in relationships of the given type.
571| |///
572| |/// Queries both source and target sides of every matching relationship row,
573| |/// deduplicates via `DISTINCT`, and returns the names in alphabetical order.
574| |///
575| |/// # Errors
576| |///
577| |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
578| 0|pub fn list_entity_names_by_relation(
579| 0| conn: &Connection,
580| 0| namespace: &str,
581| 0| relation: &str,
582| 0|) -> Result<Vec<String>, AppError> {
583| 0| let mut stmt = conn.prepare_cached(
584| 0| "SELECT DISTINCT e.name FROM entities e
585| 0| INNER JOIN relationships r ON (e.id = r.source_id OR e.id = r.target_id)
586| 0| WHERE r.namespace = ?1 AND r.relation = ?2
587| 0| ORDER BY e.name",
588| 0| )?;
589| 0| let names: Vec<String> = stmt
590| 0| .query_map(params![namespace, relation], |row| row.get(0))?
591| 0| .collect::<Result<Vec<_>, _>>()?;
592| 0| Ok(names)
593| 0|}
594| |
595| |/// Deletes all relationships matching a relation type within a namespace.
596| |///
597| |/// Operates in chunks of 1000 to avoid holding long write locks and blocking
598| |/// WAL readers. After deletion, recalculates degree for every affected entity.
599| |///
600| |/// Returns `(count_deleted, affected_entity_ids)`.
601| |///
602| |/// # Errors
603| |///
604| |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
605| 0|pub fn delete_relationships_by_relation(
606| 0| conn: &Connection,
607| 0| namespace: &str,
608| 0| relation: &str,
609| 0|) -> Result<(usize, Vec<i64>), AppError> {
610| | // Step 1: collect all affected entity IDs before deletion.
611| 0| let mut stmt = conn.prepare_cached(
612| 0| "SELECT DISTINCT source_id FROM relationships WHERE namespace = ?1 AND relation = ?2
613| 0| UNION
614| 0| SELECT DISTINCT target_id FROM relationships WHERE namespace = ?1 AND relation = ?2",
615| 0| )?;
616| 0| let entity_ids: Vec<i64> = stmt
617| 0| .query_map(params![namespace, relation], |r| r.get::<_, i64>(0))?
618| 0| .collect::<Result<Vec<_>, _>>()?;
619| |
620| | // Step 2: collect relationship IDs to delete.
621| 0| let mut id_stmt =
622| 0| conn.prepare_cached("SELECT id FROM relationships WHERE namespace = ?1 AND relation = ?2")?;
623| 0| let rel_ids: Vec<i64> = id_stmt
624| 0| .query_map(params![namespace, relation], |r| r.get::<_, i64>(0))?
625| 0| .collect::<Result<Vec<_>, _>>()?;
626| |
627| | // Step 3: delete in chunks of 1000 (memory_relationships + relationships).
628| 0| let mut total_deleted: usize = 0;
629| 0| for chunk in rel_ids.chunks(1000) {
630| 0| for &rel_id in chunk {
631| 0| conn.execute(
632| 0| "DELETE FROM memory_relationships WHERE relationship_id = ?1",
633| 0| params![rel_id],
634| 0| )?;
635| 0| let affected =
636| 0| conn.execute("DELETE FROM relationships WHERE id = ?1", params![rel_id])?;
637| 0| total_deleted += affected;
638| | }
639| | }
640| |
641| | // Step 4: recalculate degree for all affected entities.
642| 0| for &eid in &entity_ids {
643| 0| recalculate_degree(conn, eid)?;
644| | }
645| |
646| 0| Ok((total_deleted, entity_ids))
647| 0|}
648| |
649| |/// Searches the `vec_entities` virtual table for the k nearest neighbours.
650| |///
651| |/// # Errors
652| |///
653| |/// - [`AppError::Database`] — SQLite or sqlite-vec query failure.
654| |/// - [`AppError::Embedding`] — invalid or mismatched embedding dimension.
655| 0|pub fn knn_search(
656| 0| conn: &Connection,
657| 0| embedding: &[f32],
658| 0| namespace: &str,
659| 0| k: usize,
660| 0|) -> Result<Vec<(i64, f32)>, AppError> {
661| 0| let bytes = f32_to_bytes(embedding);
662| 0| let mut stmt = conn.prepare_cached(
663| 0| "SELECT entity_id, distance FROM vec_entities
664| 0| WHERE embedding MATCH ?1 AND namespace = ?2
665| 0| ORDER BY distance LIMIT ?3",
666| 0| )?;
667| 0| let rows = stmt
668| 0| .query_map(params![bytes, namespace, k as i64], |r| {
669| 0| Ok((r.get::<_, i64>(0)?, r.get::<_, f32>(1)?))
670| 0| })?
671| 0| .collect::<Result<Vec<_>, _>>()?;
672| 0| Ok(rows)
673| 0|}
674| |
675| |#[cfg(test)]
676| |mod tests {
677| | use super::*;
678| | use crate::constants::EMBEDDING_DIM;
679| | use crate::entity_type::EntityType;
680| | use crate::storage::connection::register_vec_extension;
681| | use rusqlite::Connection;
682| | use tempfile::TempDir;
683| |
684| | type TestResult = Result<(), Box<dyn std::error::Error>>;
685| |
686| 31| fn setup_db() -> Result<(TempDir, Connection), Box<dyn std::error::Error>> {
687| 31| register_vec_extension();
688| 31| let tmp = TempDir::new()?;
^0
689| 31| let db_path = tmp.path().join("test.db");
690| 31| let mut conn = Connection::open(&db_path)?;
^0
691| 31| crate::migrations::runner().run(&mut conn)?;
^0
692| 31| Ok((tmp, conn))
693| 31| }
694| |
695| 3| fn insert_memory(conn: &Connection) -> Result<i64, Box<dyn std::error::Error>> {
696| 3| conn.execute(
697| 3| "INSERT INTO memories (namespace, name, type, description, body, body_hash)
698| 3| VALUES ('global', 'test-mem', 'user', 'desc', 'body', 'hash1')",
699| 3| [],
700| 0| )?;
701| 3| Ok(conn.last_insert_rowid())
702| 3| }
703| |
704| 45| fn new_entity_helper(name: &str) -> NewEntity {
705| 45| NewEntity {
706| 45| name: name.to_string(),
707| 45| entity_type: EntityType::Project,
708| 45| description: None,
709| 45| }
710| 45| }
711| |
712| 4| fn embedding_zero() -> Vec<f32> {
713| 4| vec![0.0f32; EMBEDDING_DIM]
714| 4| }
715| |
716| | // ------------------------------------------------------------------ //
717| | // upsert_entity
718| | // ------------------------------------------------------------------ //
719| |
720| | #[test]
721| 1| fn test_upsert_entity_creates_new() -> TestResult {
722| 1| let (_tmp, conn) = setup_db()?;
^0
723| 1| let e = new_entity_helper("projeto-alpha");
724| 1| let id = upsert_entity(&conn, "global", &e)?;
^0
725| 1| assert!(id > 0);
726| 1| Ok(())
727| 1| }
728| |
729| | #[test]
730| 1| fn test_upsert_entity_idempotent_returns_same_id() -> TestResult {
731| 1| let (_tmp, conn) = setup_db()?;
^0
732| 1| let e = new_entity_helper("projeto-beta");
733| 1| let id1 = upsert_entity(&conn, "global", &e)?;
^0
734| 1| let id2 = upsert_entity(&conn, "global", &e)?;
^0
735| 1| assert_eq!(id1, id2);
736| 1| Ok(())
737| 1| }
738| |
739| | #[test]
740| 1| fn test_upsert_entity_updates_description() -> TestResult {
741| 1| let (_tmp, conn) = setup_db()?;
^0
742| 1| let e1 = new_entity_helper("projeto-gamma");
743| 1| let id1 = upsert_entity(&conn, "global", &e1)?;
^0
744| |
745| 1| let e2 = NewEntity {
746| 1| name: "projeto-gamma".to_string(),
747| 1| entity_type: EntityType::Tool,
748| 1| description: Some("nova desc".to_string()),
749| 1| };
750| 1| let id2 = upsert_entity(&conn, "global", &e2)?;
^0
751| 1| assert_eq!(id1, id2);
752| |
753| 1| let desc: Option<String> = conn.query_row(
754| 1| "SELECT description FROM entities WHERE id = ?1",
755| 1| params![id1],
756| 1| |r| r.get(0),
757| 0| )?;
758| 1| assert_eq!(desc.as_deref(), Some("nova desc"));
759| 1| Ok(())
760| 1| }
761| |
762| | #[test]
763| 1| fn test_upsert_entity_different_namespaces_create_distinct_records() -> TestResult {
764| 1| let (_tmp, conn) = setup_db()?;
^0
765| 1| let e = new_entity_helper("compartilhada");
766| 1| let id1 = upsert_entity(&conn, "ns1", &e)?;
^0
767| 1| let id2 = upsert_entity(&conn, "ns2", &e)?;
^0
768| 1| assert_ne!(id1, id2);
769| 1| Ok(())
770| 1| }
771| |
772| | // ------------------------------------------------------------------ //
773| | // upsert_entity_vec — covers DELETE+INSERT (new branch after the OOM fix)
774| | // ------------------------------------------------------------------ //
775| |
776| | #[test]
777| 1| fn test_upsert_entity_vec_first_time_without_conflict() -> TestResult {
778| 1| let (_tmp, conn) = setup_db()?;
^0
779| 1| let e = new_entity_helper("vec-nova");
780| 1| let entity_id = upsert_entity(&conn, "global", &e)?;
^0
781| 1| let emb = embedding_zero();
782| |
783| 1| let result = upsert_entity_vec(
784| 1| &conn,
785| 1| entity_id,
786| 1| "global",
787| 1| EntityType::Project,
788| 1| &emb,
789| 1| "vec-nova",
790| | );
791| 1| assert!(result.is_ok(), "first insertion must succeed");
^0
792| |
793| 1| let count: i64 = conn.query_row(
794| 1| "SELECT COUNT(*) FROM vec_entities WHERE entity_id = ?1",
795| 1| params![entity_id],
796| 1| |r| r.get(0),
797| 0| )?;
798| 1| assert_eq!(count, 1, "must have exactly one row after insertion");
^0
799| 1| Ok(())
800| 1| }
801| |
802| | #[test]
803| 1| fn test_upsert_entity_vec_second_time_replaces_without_error() -> TestResult {
804| | // Covers the branch where DELETE removes the existing row before INSERT.
805| 1| let (_tmp, conn) = setup_db()?;
^0
806| 1| let e = new_entity_helper("vec-existente");
807| 1| let entity_id = upsert_entity(&conn, "global", &e)?;
^0
808| 1| let emb = embedding_zero();
809| |
810| 1| upsert_entity_vec(
811| 1| &conn,
812| 1| entity_id,
813| 1| "global",
814| 1| EntityType::Project,
815| 1| &emb,
816| 1| "vec-existente",
817| 0| )?;
818| |
819| | // Second call: DELETE returns 1 removed row, INSERT must succeed.
820| 1| let result = upsert_entity_vec(
821| 1| &conn,
822| 1| entity_id,
823| 1| "global",
824| 1| EntityType::Tool,
825| 1| &emb,
826| 1| "vec-existente",
827| | );
828| 1| assert!(
829| 1| result.is_ok(),
830| 0| "second insertion (replace) must succeed: {result:?}"
831| | );
832| |
833| 1| let count: i64 = conn.query_row(
834| 1| "SELECT COUNT(*) FROM vec_entities WHERE entity_id = ?1",
835| 1| params![entity_id],
836| 1| |r| r.get(0),
837| 0| )?;
838| 1| assert_eq!(count, 1, "must have exactly one row after replacement");
^0
839| 1| Ok(())
840| 1| }
841| |
842| | #[test]
843| 1| fn test_upsert_entity_vec_multiple_independent_entities() -> TestResult {
844| 1| let (_tmp, conn) = setup_db()?;
^0
845| 1| let emb = embedding_zero();
846| |
847| 4| for i in 0..3i64 {
^3
848| 3| let nome = format!("ent-{i}");
849| 3| let e = new_entity_helper(&nome);
850| 3| let entity_id = upsert_entity(&conn, "global", &e)?;
^0
851| 3| upsert_entity_vec(&conn, entity_id, "global", EntityType::Project, &emb, &nome)?;
^0
852| | }
853| |
854| 1| let count: i64 = conn.query_row("SELECT COUNT(*) FROM vec_entities", [], |r| r.get(0))?;
^0
855| 1| assert_eq!(count, 3, "must have three distinct rows in vec_entities");
^0
856| 1| Ok(())
857| 1| }
858| |
859| | // ------------------------------------------------------------------ //
860| | // find_entity_id
861| | // ------------------------------------------------------------------ //
862| |
863| | #[test]
864| 1| fn test_find_entity_id_existing_returns_some() -> TestResult {
865| 1| let (_tmp, conn) = setup_db()?;
^0
866| 1| let e = new_entity_helper("entidade-busca");
867| 1| let id_inserido = upsert_entity(&conn, "global", &e)?;
^0
868| 1| let id_encontrado = find_entity_id(&conn, "global", "entidade-busca")?;
^0
869| 1| assert_eq!(id_encontrado, Some(id_inserido));
870| 1| Ok(())
871| 1| }
872| |
873| | #[test]
874| 1| fn test_find_entity_id_missing_returns_none() -> TestResult {
875| 1| let (_tmp, conn) = setup_db()?;
^0
876| 1| let id = find_entity_id(&conn, "global", "nao-existe")?;
^0
877| 1| assert_eq!(id, None);
878| 1| Ok(())
879| 1| }
880| |
881| | // ------------------------------------------------------------------ //
882| | // delete_entities_by_ids
883| | // ------------------------------------------------------------------ //
884| |
885| | #[test]
886| 1| fn test_delete_entities_by_ids_empty_list_returns_zero() -> TestResult {
887| 1| let (_tmp, conn) = setup_db()?;
^0
888| 1| let removed = delete_entities_by_ids(&conn, &[])?;
^0
889| 1| assert_eq!(removed, 0);
890| 1| Ok(())
891| 1| }
892| |
893| | #[test]
894| 1| fn test_delete_entities_by_ids_removes_valid_entity() -> TestResult {
895| 1| let (_tmp, conn) = setup_db()?;
^0
896| 1| let e = new_entity_helper("to-delete");
897| 1| let entity_id = upsert_entity(&conn, "global", &e)?;
^0
898| |
899| 1| let removed = delete_entities_by_ids(&conn, &[entity_id])?;
^0
900| 1| assert_eq!(removed, 1);
901| |
902| 1| let id = find_entity_id(&conn, "global", "to-delete")?;
^0
903| 1| assert_eq!(id, None, "entity must have been removed");
^0
904| 1| Ok(())
905| 1| }
906| |
907| | #[test]
908| 1| fn test_delete_entities_by_ids_missing_id_returns_zero() -> TestResult {
909| 1| let (_tmp, conn) = setup_db()?;
^0
910| 1| let removed = delete_entities_by_ids(&conn, &[9999])?;
^0
911| 1| assert_eq!(removed, 0);
912| 1| Ok(())
913| 1| }
914| |
915| | #[test]
916| 1| fn test_delete_entities_by_ids_removes_multiple() -> TestResult {
917| 1| let (_tmp, conn) = setup_db()?;
^0
918| 1| let id1 = upsert_entity(&conn, "global", &new_entity_helper("del-a"))?;
^0
919| 1| let id2 = upsert_entity(&conn, "global", &new_entity_helper("del-b"))?;
^0
920| 1| let id3 = upsert_entity(&conn, "global", &new_entity_helper("del-c"))?;
^0
921| |
922| 1| let removed = delete_entities_by_ids(&conn, &[id1, id2])?;
^0
923| 1| assert_eq!(removed, 2);
924| |
925| 1| assert!(find_entity_id(&conn, "global", "del-a")?.is_none());
^0
926| 1| assert!(find_entity_id(&conn, "global", "del-b")?.is_none());
^0
927| 1| assert!(find_entity_id(&conn, "global", "del-c")?.is_some());
^0
928| 1| let _ = id3;
929| 1| Ok(())
930| 1| }
931| |
932| | #[test]
933| 1| fn test_delete_entities_by_ids_also_removes_vec() -> TestResult {
934| 1| let (_tmp, conn) = setup_db()?;
^0
935| 1| let e = new_entity_helper("del-com-vec");
936| 1| let entity_id = upsert_entity(&conn, "global", &e)?;
^0
937| 1| let emb = embedding_zero();
938| 1| upsert_entity_vec(
939| 1| &conn,
940| 1| entity_id,
941| 1| "global",
942| 1| EntityType::Project,
943| 1| &emb,
944| 1| "del-com-vec",
945| 0| )?;
946| |
947| 1| let count_antes: i64 = conn.query_row(
948| 1| "SELECT COUNT(*) FROM vec_entities WHERE entity_id = ?1",
949| 1| params![entity_id],
950| 1| |r| r.get(0),
951| 0| )?;
952| 1| assert_eq!(count_antes, 1);
953| |
954| 1| delete_entities_by_ids(&conn, &[entity_id])?;
^0
955| |
956| 1| let count_depois: i64 = conn.query_row(
957| 1| "SELECT COUNT(*) FROM vec_entities WHERE entity_id = ?1",
958| 1| params![entity_id],
959| 1| |r| r.get(0),
960| 0| )?;
961| 1| assert_eq!(
962| | count_depois, 0,
963| 0| "vec_entities deve ser limpo junto com entities"
964| | );
965| 1| Ok(())
966| 1| }
967| |
968| | // ------------------------------------------------------------------ //
969| | // upsert_relationship / find_relationship
970| | // ------------------------------------------------------------------ //
971| |
972| | #[test]
973| 1| fn test_upsert_relationship_creates_new() -> TestResult {
974| 1| let (_tmp, conn) = setup_db()?;
^0
975| 1| let id_a = upsert_entity(&conn, "global", &new_entity_helper("rel-a"))?;
^0
976| 1| let id_b = upsert_entity(&conn, "global", &new_entity_helper("rel-b"))?;
^0
977| |
978| 1| let rel = NewRelationship {
979| 1| source: "rel-a".to_string(),
980| 1| target: "rel-b".to_string(),
981| 1| relation: "uses".to_string(),
982| 1| strength: 0.8,
983| 1| description: None,
984| 1| };
985| 1| let rel_id = upsert_relationship(&conn, "global", id_a, id_b, &rel)?;
^0
986| 1| assert!(rel_id > 0);
987| 1| Ok(())
988| 1| }
989| |
990| | #[test]
991| 1| fn test_upsert_relationship_idempotent() -> TestResult {
992| 1| let (_tmp, conn) = setup_db()?;
^0
993| 1| let id_a = upsert_entity(&conn, "global", &new_entity_helper("idem-a"))?;
^0
994| 1| let id_b = upsert_entity(&conn, "global", &new_entity_helper("idem-b"))?;
^0
995| |
996| 1| let rel = NewRelationship {
997| 1| source: "idem-a".to_string(),
998| 1| target: "idem-b".to_string(),
999| 1| relation: "uses".to_string(),
1000| 1| strength: 0.5,
1001| 1| description: None,
1002| 1| };
1003| 1| let id1 = upsert_relationship(&conn, "global", id_a, id_b, &rel)?;
^0
1004| 1| let id2 = upsert_relationship(&conn, "global", id_a, id_b, &rel)?;
^0
1005| 1| assert_eq!(id1, id2);
1006| 1| Ok(())
1007| 1| }
1008| |
1009| | #[test]
1010| 1| fn test_find_relationship_existing() -> TestResult {
1011| 1| let (_tmp, conn) = setup_db()?;
^0
1012| 1| let id_a = upsert_entity(&conn, "global", &new_entity_helper("fr-a"))?;
^0
1013| 1| let id_b = upsert_entity(&conn, "global", &new_entity_helper("fr-b"))?;
^0
1014| |
1015| 1| let rel = NewRelationship {
1016| 1| source: "fr-a".to_string(),
1017| 1| target: "fr-b".to_string(),
1018| 1| relation: "depends_on".to_string(),
1019| 1| strength: 0.7,
1020| 1| description: None,
1021| 1| };
1022| 1| upsert_relationship(&conn, "global", id_a, id_b, &rel)?;
^0
1023| |
1024| 1| let encontrada = find_relationship(&conn, id_a, id_b, "depends_on")?;
^0
1025| 1| let row = encontrada.ok_or("relationship should exist")?;
^0
1026| 1| assert_eq!(row.source_id, id_a);
1027| 1| assert_eq!(row.target_id, id_b);
1028| 1| assert!((row.weight - 0.7).abs() < 1e-9);
1029| 1| Ok(())
1030| 1| }
1031| |
1032| | #[test]
1033| 1| fn test_find_relationship_missing_returns_none() -> TestResult {
1034| 1| let (_tmp, conn) = setup_db()?;
^0
1035| 1| let resultado = find_relationship(&conn, 9999, 8888, "uses")?;
^0
1036| 1| assert!(resultado.is_none());
1037| 1| Ok(())
1038| 1| }
1039| |
1040| | // ------------------------------------------------------------------ //
1041| | // link_memory_entity / link_memory_relationship
1042| | // ------------------------------------------------------------------ //
1043| |
1044| | #[test]
1045| 1| fn test_link_memory_entity_idempotent() -> TestResult {
1046| 1| let (_tmp, conn) = setup_db()?;
^0
1047| 1| let memory_id = insert_memory(&conn)?;
^0
1048| 1| let entity_id = upsert_entity(&conn, "global", &new_entity_helper("me-ent"))?;
^0
1049| |
1050| 1| link_memory_entity(&conn, memory_id, entity_id)?;
^0
1051| 1| let resultado = link_memory_entity(&conn, memory_id, entity_id);
1052| 1| assert!(
1053| 1| resultado.is_ok(),
1054| 0| "INSERT OR IGNORE must not fail on duplicate"
1055| | );
1056| 1| Ok(())
1057| 1| }
1058| |
1059| | #[test]
1060| 1| fn test_link_memory_relationship_idempotent() -> TestResult {
1061| 1| let (_tmp, conn) = setup_db()?;
^0
1062| 1| let memory_id = insert_memory(&conn)?;
^0
1063| 1| let id_a = upsert_entity(&conn, "global", &new_entity_helper("mr-a"))?;
^0
1064| 1| let id_b = upsert_entity(&conn, "global", &new_entity_helper("mr-b"))?;
^0
1065| |
1066| 1| let rel = NewRelationship {
1067| 1| source: "mr-a".to_string(),
1068| 1| target: "mr-b".to_string(),
1069| 1| relation: "uses".to_string(),
1070| 1| strength: 0.5,
1071| 1| description: None,
1072| 1| };
1073| 1| let rel_id = upsert_relationship(&conn, "global", id_a, id_b, &rel)?;
^0
1074| |
1075| 1| link_memory_relationship(&conn, memory_id, rel_id)?;
^0
1076| 1| let resultado = link_memory_relationship(&conn, memory_id, rel_id);
1077| 1| assert!(
1078| 1| resultado.is_ok(),
1079| 0| "INSERT OR IGNORE must not fail on duplicate"
1080| | );
1081| 1| Ok(())
1082| 1| }
1083| |
1084| | // ------------------------------------------------------------------ //
1085| | // increment_degree / recalculate_degree
1086| | // ------------------------------------------------------------------ //
1087| |
1088| | #[test]
1089| 1| fn test_increment_degree_increases_counter() -> TestResult {
1090| 1| let (_tmp, conn) = setup_db()?;
^0
1091| 1| let entity_id = upsert_entity(&conn, "global", &new_entity_helper("grau-ent"))?;
^0
1092| |
1093| 1| increment_degree(&conn, entity_id)?;
^0
1094| 1| increment_degree(&conn, entity_id)?;
^0
1095| |
1096| 1| let degree: i64 = conn.query_row(
1097| 1| "SELECT degree FROM entities WHERE id = ?1",
1098| 1| params![entity_id],
1099| 1| |r| r.get(0),
1100| 0| )?;
1101| 1| assert_eq!(degree, 2);
1102| 1| Ok(())
1103| 1| }
1104| |
1105| | #[test]
1106| 1| fn test_recalculate_degree_reflects_actual_relations() -> TestResult {
1107| 1| let (_tmp, conn) = setup_db()?;
^0
1108| 1| let id_a = upsert_entity(&conn, "global", &new_entity_helper("rc-a"))?;
^0
1109| 1| let id_b = upsert_entity(&conn, "global", &new_entity_helper("rc-b"))?;
^0
1110| 1| let id_c = upsert_entity(&conn, "global", &new_entity_helper("rc-c"))?;
^0
1111| |
1112| 1| let rel1 = NewRelationship {
1113| 1| source: "rc-a".to_string(),
1114| 1| target: "rc-b".to_string(),
1115| 1| relation: "uses".to_string(),
1116| 1| strength: 0.5,
1117| 1| description: None,
1118| 1| };
1119| 1| let rel2 = NewRelationship {
1120| 1| source: "rc-c".to_string(),
1121| 1| target: "rc-a".to_string(),
1122| 1| relation: "depends_on".to_string(),
1123| 1| strength: 0.5,
1124| 1| description: None,
1125| 1| };
1126| 1| upsert_relationship(&conn, "global", id_a, id_b, &rel1)?;
^0
1127| 1| upsert_relationship(&conn, "global", id_c, id_a, &rel2)?;
^0
1128| |
1129| 1| recalculate_degree(&conn, id_a)?;
^0
1130| |
1131| 1| let degree: i64 = conn.query_row(
1132| 1| "SELECT degree FROM entities WHERE id = ?1",
1133| 1| params![id_a],
1134| 1| |r| r.get(0),
1135| 0| )?;
1136| 1| assert_eq!(
1137| | degree, 2,
1138| 0| "rc-a appears in two relationships (source+target)"
1139| | );
1140| 1| Ok(())
1141| 1| }
1142| |
1143| | // ------------------------------------------------------------------ //
1144| | // find_orphan_entity_ids
1145| | // ------------------------------------------------------------------ //
1146| |
1147| | #[test]
1148| 1| fn test_find_orphan_entity_ids_without_orphans() -> TestResult {
1149| 1| let (_tmp, conn) = setup_db()?;
^0
1150| 1| let memory_id = insert_memory(&conn)?;
^0
1151| 1| let entity_id = upsert_entity(&conn, "global", &new_entity_helper("nao-orfa"))?;
^0
1152| 1| link_memory_entity(&conn, memory_id, entity_id)?;
^0
1153| |
1154| 1| let orfas = find_orphan_entity_ids(&conn, Some("global"))?;
^0
1155| 1| assert!(!orfas.contains(&entity_id));
1156| 1| Ok(())
1157| 1| }
1158| |
1159| | #[test]
1160| 1| fn test_find_orphan_entity_ids_detects_orphans() -> TestResult {
1161| 1| let (_tmp, conn) = setup_db()?;
^0
1162| 1| let entity_id = upsert_entity(&conn, "global", &new_entity_helper("sim-orfa"))?;
^0
1163| |
1164| 1| let orfas = find_orphan_entity_ids(&conn, Some("global"))?;
^0
1165| 1| assert!(orfas.contains(&entity_id));
1166| 1| Ok(())
1167| 1| }
1168| |
1169| | #[test]
1170| 1| fn test_find_orphan_entity_ids_without_namespace_returns_all() -> TestResult {
1171| 1| let (_tmp, conn) = setup_db()?;
^0
1172| 1| let id1 = upsert_entity(&conn, "ns-a", &new_entity_helper("orfa-a"))?;
^0
1173| 1| let id2 = upsert_entity(&conn, "ns-b", &new_entity_helper("orfa-b"))?;
^0
1174| |
1175| 1| let orfas = find_orphan_entity_ids(&conn, None)?;
^0
1176| 1| assert!(orfas.contains(&id1));
1177| 1| assert!(orfas.contains(&id2));
1178| 1| Ok(())
1179| 1| }
1180| |
1181| | // ------------------------------------------------------------------ //
1182| | // list_entities / list_relationships_by_namespace
1183| | // ------------------------------------------------------------------ //
1184| |
1185| | #[test]
1186| 1| fn test_list_entities_with_namespace() -> TestResult {
1187| 1| let (_tmp, conn) = setup_db()?;
^0
1188| 1| upsert_entity(&conn, "le-ns", &new_entity_helper("le-ent-1"))?;
^0
1189| 1| upsert_entity(&conn, "le-ns", &new_entity_helper("le-ent-2"))?;
^0
1190| 1| upsert_entity(&conn, "outro-ns", &new_entity_helper("le-ent-3"))?;
^0
1191| |
1192| 1| let lista = list_entities(&conn, Some("le-ns"))?;
^0
1193| 1| assert_eq!(lista.len(), 2);
1194| 2| assert!(lista.iter().all(|e| e.namespace == "le-ns"));
^1 ^1 ^1
1195| 1| Ok(())
1196| 1| }
1197| |
1198| | #[test]
1199| 1| fn test_list_entities_without_namespace_returns_all() -> TestResult {
1200| 1| let (_tmp, conn) = setup_db()?;
^0
1201| 1| upsert_entity(&conn, "ns1", &new_entity_helper("all-ent-1"))?;
^0
1202| 1| upsert_entity(&conn, "ns2", &new_entity_helper("all-ent-2"))?;
^0
1203| |
1204| 1| let lista = list_entities(&conn, None)?;
^0
1205| 1| assert!(lista.len() >= 2);
1206| 1| Ok(())
1207| 1| }
1208| |
1209| | #[test]
1210| 1| fn test_list_relationships_by_namespace_filters_correctly() -> TestResult {
1211| 1| let (_tmp, conn) = setup_db()?;
^0
1212| 1| let id_a = upsert_entity(&conn, "rel-ns", &new_entity_helper("lr-a"))?;
^0
1213| 1| let id_b = upsert_entity(&conn, "rel-ns", &new_entity_helper("lr-b"))?;
^0
1214| |
1215| 1| let rel = NewRelationship {
1216| 1| source: "lr-a".to_string(),
1217| 1| target: "lr-b".to_string(),
1218| 1| relation: "uses".to_string(),
1219| 1| strength: 0.5,
1220| 1| description: None,
1221| 1| };
1222| 1| upsert_relationship(&conn, "rel-ns", id_a, id_b, &rel)?;
^0
1223| |
1224| 1| let lista = list_relationships_by_namespace(&conn, Some("rel-ns"))?;
^0
1225| 1| assert!(!lista.is_empty());
1226| 1| assert!(lista.iter().all(|r| r.namespace == "rel-ns"));
1227| 1| Ok(())
1228| 1| }
1229| |
1230| | // ------------------------------------------------------------------ //
1231| | // delete_relationship_by_id / create_or_fetch_relationship
1232| | // ------------------------------------------------------------------ //
1233| |
1234| | #[test]
1235| 1| fn test_delete_relationship_by_id_removes_relation() -> TestResult {
1236| 1| let (_tmp, conn) = setup_db()?;
^0
1237| 1| let id_a = upsert_entity(&conn, "global", &new_entity_helper("dr-a"))?;
^0
1238| 1| let id_b = upsert_entity(&conn, "global", &new_entity_helper("dr-b"))?;
^0
1239| |
1240| 1| let rel = NewRelationship {
1241| 1| source: "dr-a".to_string(),
1242| 1| target: "dr-b".to_string(),
1243| 1| relation: "uses".to_string(),
1244| 1| strength: 0.5,
1245| 1| description: None,
1246| 1| };
1247| 1| let rel_id = upsert_relationship(&conn, "global", id_a, id_b, &rel)?;
^0
1248| |
1249| 1| delete_relationship_by_id(&conn, rel_id)?;
^0
1250| |
1251| 1| let encontrada = find_relationship(&conn, id_a, id_b, "uses")?;
^0
1252| 1| assert!(encontrada.is_none(), "relationship must have been removed");
^0
1253| 1| Ok(())
1254| 1| }
1255| |
1256| | #[test]
1257| 1| fn test_create_or_fetch_relationship_creates_new() -> TestResult {
1258| 1| let (_tmp, conn) = setup_db()?;
^0
1259| 1| let id_a = upsert_entity(&conn, "global", &new_entity_helper("cf-a"))?;
^0
1260| 1| let id_b = upsert_entity(&conn, "global", &new_entity_helper("cf-b"))?;
^0
1261| |
1262| 1| let (rel_id, created) =
1263| 1| create_or_fetch_relationship(&conn, "global", id_a, id_b, "uses", 0.5, None)?;
^0
1264| 1| assert!(rel_id > 0);
1265| 1| assert!(created);
1266| 1| Ok(())
1267| 1| }
1268| |
1269| | #[test]
1270| 1| fn test_create_or_fetch_relationship_returns_existing() -> TestResult {
1271| 1| let (_tmp, conn) = setup_db()?;
^0
1272| 1| let id_a = upsert_entity(&conn, "global", &new_entity_helper("cf2-a"))?;
^0
1273| 1| let id_b = upsert_entity(&conn, "global", &new_entity_helper("cf2-b"))?;
^0
1274| |
1275| 1| create_or_fetch_relationship(&conn, "global", id_a, id_b, "uses", 0.5, None)?;
^0
1276| 1| let (_, created) =
1277| 1| create_or_fetch_relationship(&conn, "global", id_a, id_b, "uses", 0.5, None)?;
^0
1278| 1| assert!(
1279| 1| !created,
1280| 0| "second call must return the existing relationship"
1281| | );
1282| 1| Ok(())
1283| 1| }
1284| |
1285| | // ------------------------------------------------------------------ //
1286| | // serde alias: field "type" accepted as a synonym for "entity_type"
1287| | // ------------------------------------------------------------------ //
1288| |
1289| | #[test]
1290| 1| fn accepts_type_field_as_alias() -> TestResult {
1291| 1| let json = r#"{"name": "X", "type": "concept"}"#;
1292| 1| let ent: NewEntity = serde_json::from_str(json)?;
^0
1293| 1| assert_eq!(ent.entity_type, EntityType::Concept);
1294| 1| Ok(())
1295| 1| }
1296| |
1297| | #[test]
1298| 1| fn accepts_canonical_entity_type_field() -> TestResult {
1299| 1| let json = r#"{"name": "X", "entity_type": "concept"}"#;
1300| 1| let ent: NewEntity = serde_json::from_str(json)?;
^0
1301| 1| assert_eq!(ent.entity_type, EntityType::Concept);
1302| 1| Ok(())
1303| 1| }
1304| |
1305| | #[test]
1306| 1| fn both_fields_present_yields_duplicate_error() {
1307| | // having both entity_type and type in the same JSON is a duplicate and must fail
1308| 1| let json = r#"{"name": "X", "entity_type": "concept", "type": "person"}"#;
1309| 1| let resultado: Result<NewEntity, _> = serde_json::from_str(json);
1310| 1| assert!(
1311| 1| resultado.is_err(),
1312| 0| "both fields in the same JSON are a duplicate"
1313| | );
1314| 1| }
1315| |
1316| | #[test]
1317| 1| fn validate_entity_name_accepts_valid() {
1318| 1| assert!(validate_entity_name("rust-lang").is_ok());
1319| 1| assert!(validate_entity_name("sqlite-graphrag").is_ok());
1320| 1| assert!(validate_entity_name("ab").is_ok());
1321| 1| }
1322| |
1323| | #[test]
1324| 1| fn validate_entity_name_rejects_short() {
1325| 1| assert!(validate_entity_name("a").is_err());
1326| 1| assert!(validate_entity_name("").is_err());
1327| 1| }
1328| |
1329| | #[test]
1330| 1| fn validate_entity_name_rejects_newlines() {
1331| 1| assert!(validate_entity_name("foo\nbar").is_err());
1332| 1| assert!(validate_entity_name("foo\rbar").is_err());
1333| 1| }
1334| |
1335| | #[test]
1336| 1| fn validate_entity_name_rejects_short_allcaps() {
1337| 1| assert!(validate_entity_name("RAM").is_err());
1338| 1| assert!(validate_entity_name("NAO").is_err());
1339| 1| assert!(validate_entity_name("OK").is_err());
1340| 1| }
1341| |
1342| | #[test]
1343| 1| fn validate_entity_name_accepts_long_allcaps() {
1344| 1| assert!(validate_entity_name("SQLITE").is_ok());
1345| 1| assert!(validate_entity_name("GRAPHRAG").is_ok());
1346| 1| }
1347| |
1348| | #[test]
1349| 1| fn validate_entity_name_accepts_mixed_case() {
1350| 1| assert!(validate_entity_name("FTS5").is_ok()); // 4 chars but has digit
1351| 1| assert!(validate_entity_name("WAL").is_err()); // 3 chars ALL_CAPS
1352| 1| }
1353| |}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/storage/fusion.rs:
1| |//! RRF (Reciprocal Rank Fusion) utilities shared between `hybrid-search` and
2| |//! `deep-research`.
3| |//!
4| |//! The formula used is the canonical RRF score:
5| |//!
6| |//! ```text
7| |//! score(d) = sum_over_lists { weight * 1 / (rrf_k + rank(d)) }
8| |//! ```
9| |//!
10| |//! where `rank` is 1-indexed position in each ordered list. The map returned
11| |//! by [`rrf_fuse`] contains un-normalised scores; callers that need a `[0,1]`
12| |//! range should divide by the theoretical maximum:
13| |//!
14| |//! ```text
15| |//! max_possible = sum_over_lists { weight * 1 / (rrf_k + 1) }
16| |//! ```
17| |
18| |use std::collections::HashMap;
19| |
20| |/// Fuse multiple ranked lists of integer IDs via Reciprocal Rank Fusion.
21| |///
22| |/// Each element of `lists` is `(weight, ranked_ids)` where `ranked_ids` is
23| |/// ordered best-first (index 0 = rank 1).
24| |///
25| |/// Returns a `HashMap<id, combined_score>` using un-normalised RRF scores.
26| |/// Higher score means higher relevance.
27| |///
28| |/// # Examples
29| |///
30| |/// ```
31| |/// use sqlite_graphrag::storage::fusion::rrf_fuse;
32| |///
33| |/// // Two lists with equal weight — item 1 appears in both at rank 1 and 2
34| |/// // so it accumulates more score than item 2 (rank 2) or item 3 (rank 1 only).
35| |/// let knn: Vec<i64> = vec![1, 2];
36| |/// let fts: Vec<i64> = vec![1, 3];
37| |/// let scores = rrf_fuse(&[(1.0, &knn), (1.0, &fts)], 60.0);
38| |/// assert!(scores[&1] > scores[&2]);
39| |/// assert!(scores[&1] > scores[&3]);
40| |/// ```
41| 9|pub fn rrf_fuse(lists: &[(f64, &Vec<i64>)], rrf_k: f64) -> HashMap<i64, f64> {
42| 14| let total_ids: usize = lists.iter().map(|(_, ids)| ids.len()).sum();
^9 ^9 ^9 ^9 ^9 ^9
43| 9| let mut combined: HashMap<i64, f64> = HashMap::with_capacity(total_ids);
44| 23| for (weight, ids) in lists {
^14 ^14
45| 31| for (rank, &id) in ids.iter().enumerate() {
^14 ^14
46| 31| // rank is 0-indexed here; formula uses 1-indexed, so we add 1.
47| 31| let contribution = weight * (1.0 / (rrf_k + rank as f64 + 1.0));
48| 31| *combined.entry(id).or_insert(0.0) += contribution;
49| 31| }
50| | }
51| 9| combined
52| 9|}
53| |
54| |/// Compute the theoretical maximum RRF score for a given set of weights and
55| |/// `rrf_k`.
56| |///
57| |/// Useful for normalising `rrf_fuse` scores to `[0, 1]`:
58| |///
59| |/// ```
60| |/// use sqlite_graphrag::storage::fusion::{rrf_fuse, rrf_max_possible};
61| |///
62| |/// let weights = vec![1.0_f64, 1.0_f64];
63| |/// let max = rrf_max_possible(&weights, 60.0);
64| |/// assert!(max > 0.0);
65| |/// ```
66| 2|pub fn rrf_max_possible(weights: &[f64], rrf_k: f64) -> f64 {
67| 3| weights.iter().map(|w| w * (1.0 / (rrf_k + 1.0))).sum()
^2 ^2 ^2 ^2
68| 2|}
69| |
70| |#[cfg(test)]
71| |mod tests {
72| | use super::*;
73| |
74| | #[test]
75| 1| fn rrf_fuse_single_list_rank_order_preserved() {
76| | // Items at lower rank index get higher scores.
77| 1| let list = vec![10i64, 20, 30];
78| 1| let scores = rrf_fuse(&[(1.0, &list)], 60.0);
79| 1| assert!(scores[&10] > scores[&20]);
80| 1| assert!(scores[&20] > scores[&30]);
81| 1| }
82| |
83| | #[test]
84| 1| fn rrf_fuse_two_lists_overlap_accumulates() {
85| | // Item 1 appears first in both lists — must beat item 2 (rank 1 in one list only).
86| 1| let knn = vec![1i64, 2];
87| 1| let fts = vec![1i64, 3];
88| 1| let scores = rrf_fuse(&[(1.0, &knn), (1.0, &fts)], 60.0);
89| 1| assert!(scores[&1] > scores[&2], "overlap item must score higher");
^0
90| 1| assert!(scores[&1] > scores[&3], "overlap item must score higher");
^0
91| 1| }
92| |
93| | #[test]
94| 1| fn rrf_fuse_empty_lists_returns_empty() {
95| 1| let empty: Vec<i64> = vec![];
96| 1| let scores = rrf_fuse(&[(1.0, &empty)], 60.0);
97| 1| assert!(scores.is_empty());
98| 1| }
99| |
100| | #[test]
101| 1| fn rrf_fuse_zero_weight_list_has_no_effect() {
102| 1| let list_a = vec![1i64, 2];
103| 1| let list_b = vec![3i64, 4];
104| 1| let scores_with = rrf_fuse(&[(1.0, &list_a), (0.0, &list_b)], 60.0);
105| | // Items 3 and 4 should have score 0.0 (or not present).
106| 1| assert_eq!(scores_with.get(&3).copied().unwrap_or(0.0), 0.0);
107| 1| assert_eq!(scores_with.get(&4).copied().unwrap_or(0.0), 0.0);
108| 1| }
109| |
110| | #[test]
111| 1| fn rrf_fuse_weights_scale_contribution() {
112| | // Higher weight means higher score for same rank.
113| 1| let list = vec![1i64];
114| 1| let low = rrf_fuse(&[(0.5, &list)], 60.0);
115| 1| let high = rrf_fuse(&[(2.0, &list)], 60.0);
116| 1| assert!(high[&1] > low[&1]);
117| 1| }
118| |
119| | #[test]
120| 1| fn rrf_max_possible_sums_weights() {
121| | // With rrf_k=60, max for one list of weight 1.0 is 1/(60+1) ≈ 0.01639.
122| 1| let max = rrf_max_possible(&[1.0], 60.0);
123| 1| let expected = 1.0 / 61.0;
124| 1| assert!((max - expected).abs() < 1e-9);
125| |
126| | // Two equal-weight lists: sum of both.
127| 1| let max2 = rrf_max_possible(&[1.0, 1.0], 60.0);
128| 1| assert!((max2 - 2.0 / 61.0).abs() < 1e-9);
129| 1| }
130| |
131| | #[test]
132| 1| fn rrf_fuse_deterministic_for_same_input() {
133| 1| let list_a = vec![1i64, 2, 3];
134| 1| let list_b = vec![2i64, 1, 4];
135| 1| let scores_1 = rrf_fuse(&[(1.0, &list_a), (1.0, &list_b)], 60.0);
136| 1| let scores_2 = rrf_fuse(&[(1.0, &list_a), (1.0, &list_b)], 60.0);
137| 5| for id in [1i64, 2, 3, 4] {
^4
138| 4| assert_eq!(
139| 4| scores_1.get(&id).copied().unwrap_or(0.0),
140| 4| scores_2.get(&id).copied().unwrap_or(0.0)
141| | );
142| | }
143| 1| }
144| |}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/storage/memories.rs:
1| |//! Persistence layer for the `memories` table and its vector companion.
2| |//!
3| |//! Functions here encapsulate every SQL statement touching `memories`,
4| |//! `vec_memories` and the FTS5 `fts_memories` shadow table. Callers receive
5| |//! typed [`MemoryRow`] or [`NewMemory`] values and never build SQL strings.
6| |
7| |use crate::embedder::f32_to_bytes;
8| |use crate::errors::AppError;
9| |use crate::storage::utils::with_busy_retry;
10| |use rusqlite::{params, Connection};
11| |use serde::{Deserialize, Serialize};
12| |
13| |/// Input payload for inserting or updating a memory.
14| |///
15| |/// `body_hash` must be the BLAKE3 digest of `body`. The `metadata` field is
16| |/// stored as a TEXT column containing JSON.
17| |#[derive(Debug, Serialize, Deserialize)]
18| |pub struct NewMemory {
19| | pub namespace: String,
20| | pub name: String,
21| | pub memory_type: String,
22| | pub description: String,
23| | pub body: String,
24| | pub body_hash: String,
25| | pub session_id: Option<String>,
26| | pub source: String,
27| | pub metadata: serde_json::Value,
28| |}
29| |
30| |/// Fully materialized row from the `memories` table.
31| |///
32| |/// Returned by [`read_by_name`], [`read_full`], [`list`] and [`fts_search`].
33| |/// The `metadata` field is kept as a JSON string to avoid double parsing.
34| |#[derive(Debug, Serialize)]
35| |pub struct MemoryRow {
36| | pub id: i64,
37| | pub namespace: String,
38| | pub name: String,
39| | pub memory_type: String,
40| | pub description: String,
41| | pub body: String,
42| | pub body_hash: String,
43| | pub session_id: Option<String>,
44| | pub source: String,
45| | pub metadata: String,
46| | pub created_at: i64,
47| | pub updated_at: i64,
48| | /// Unix epoch when the memory was soft-deleted, or `None` for active memories.
49| | /// Surfaced in `list --include-deleted --json` so LLM consumers can distinguish
50| | /// active from soft-deleted rows without a second SQL query (v1.0.37 H7+M9 fix).
51| | #[serde(skip_serializing_if = "Option::is_none")]
52| | pub deleted_at: Option<i64>,
53| |}
54| |
55| |/// Finds a live memory by `(namespace, name)` and returns key metadata.
56| |///
57| |/// # Arguments
58| |///
59| |/// - `conn` — open SQLite connection configured with the project pragmas.
60| |/// - `namespace` — resolved namespace for the lookup.
61| |/// - `name` — kebab-case memory name.
62| |///
63| |/// # Returns
64| |///
65| |/// `Ok(Some((id, updated_at, max_version)))` when the memory exists and is
66| |/// not soft-deleted, `Ok(None)` otherwise.
67| |///
68| |/// # Errors
69| |///
70| |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
71| 8|pub fn find_by_name(
72| 8| conn: &Connection,
73| 8| namespace: &str,
74| 8| name: &str,
75| 8|) -> Result<Option<(i64, i64, i64)>, AppError> {
76| 8| let mut stmt = conn.prepare_cached(
77| 8| "SELECT m.id, m.updated_at, COALESCE(MAX(v.version), 0)
78| 8| FROM memories m
79| 8| LEFT JOIN memory_versions v ON v.memory_id = m.id
80| 8| WHERE m.namespace = ?1 AND m.name = ?2 AND m.deleted_at IS NULL
81| 8| GROUP BY m.id",
82| 0| )?;
83| 8| let result = stmt.query_row(params![namespace, name], |r| {
^5
84| | Ok((
85| 5| r.get::<_, i64>(0)?,
^0
86| 5| r.get::<_, i64>(1)?,
^0
87| 5| r.get::<_, i64>(2)?,
^0
88| | ))
89| 5| });
90| 3| match result {
91| 5| Ok(row) => Ok(Some(row)),
92| 3| Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
93| 0| Err(e) => Err(AppError::Database(e)),
94| | }
95| 8|}
96| |
97| |/// Looks up a memory by `(namespace, name)` regardless of deletion state.
98| |///
99| |/// Returns `Some((id, is_deleted))` when the row exists.
100| |/// `is_deleted` is `true` when `deleted_at IS NOT NULL`.
101| |///
102| |/// # Errors
103| |///
104| |/// Propagates [`AppError::Database`] on SQLite failures.
105| 3|pub fn find_by_name_any_state(
106| 3| conn: &Connection,
107| 3| namespace: &str,
108| 3| name: &str,
109| 3|) -> Result<Option<(i64, bool)>, AppError> {
110| 3| let mut stmt = conn.prepare_cached(
111| 3| "SELECT id, (deleted_at IS NOT NULL) AS is_deleted
112| 3| FROM memories WHERE namespace = ?1 AND name = ?2",
113| 0| )?;
114| 3| let result = stmt.query_row(params![namespace, name], |r| {
^2
115| 2| Ok((r.get::<_, i64>(0)?, r.get::<_, bool>(1)?))
^0 ^0
116| 2| });
117| 1| match result {
118| 2| Ok(row) => Ok(Some(row)),
119| 1| Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
120| 0| Err(e) => Err(AppError::Database(e)),
121| | }
122| 3|}
123| |
124| |/// Clears `deleted_at` to restore a soft-deleted memory.
125| |///
126| |/// # Errors
127| |///
128| |/// Propagates [`AppError::Database`] on SQLite failures.
129| 1|pub fn clear_deleted_at(conn: &Connection, memory_id: i64) -> Result<(), AppError> {
130| 1| conn.execute(
131| 1| "UPDATE memories SET deleted_at = NULL WHERE id = ?1",
132| 1| params![memory_id],
133| 0| )?;
134| 1| Ok(())
135| 1|}
136| |
137| |/// Looks up a live memory by exact `body_hash` within a namespace.
138| |///
139| |/// Used during `remember` to short-circuit semantic duplicates before
140| |/// spending an embedding call.
141| |///
142| |/// # Returns
143| |///
144| |/// `Ok(Some(id))` when a live memory with the same hash exists,
145| |/// `Ok(None)` otherwise.
146| |///
147| |/// # Errors
148| |///
149| |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
150| 3|pub fn find_by_hash(
151| 3| conn: &Connection,
152| 3| namespace: &str,
153| 3| body_hash: &str,
154| 3|) -> Result<Option<i64>, AppError> {
155| 3| let mut stmt = conn.prepare_cached(
156| 3| "SELECT id FROM memories WHERE namespace = ?1 AND body_hash = ?2 AND deleted_at IS NULL",
157| 0| )?;
158| 3| match stmt.query_row(params![namespace, body_hash], |r| r.get(0)) {
^1^1
159| 1| Ok(id) => Ok(Some(id)),
160| 2| Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
161| 0| Err(e) => Err(AppError::Database(e)),
162| | }
163| 3|}
164| |
165| |/// Inserts a new row into the `memories` table.
166| |///
167| |/// # Arguments
168| |///
169| |/// - `conn` — active SQLite connection, typically inside a transaction.
170| |/// - `m` — validated payload including `body_hash` and serialized metadata.
171| |///
172| |/// # Returns
173| |///
174| |/// The `rowid` assigned to the newly inserted memory.
175| |///
176| |/// # Errors
177| |///
178| |/// Returns `Err(AppError::Database)` on insertion failure and
179| |/// `Err(AppError::Json)` if metadata serialization fails.
180| 39|pub fn insert(conn: &Connection, m: &NewMemory) -> Result<i64, AppError> {
181| | // G29 Passo 2 (v1.0.69): runtime guard for the CHECK constraint on
182| | // `source`. Even though `MemorySource` is the typed future, every
183| | // legacy `NewMemory { source: "..." }` literal still flows through
184| | // this function; validating here keeps the footgun from regressing
185| | // for callers that have not yet migrated to the enum.
186| 39| let validated_source = crate::memory_source::validate_source(&m.source)?;
^0
187| 39| conn.execute(
188| 39| "INSERT INTO memories (namespace, name, type, description, body, body_hash, session_id, source, metadata)
189| 39| VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)",
190| 39| params![
191| | m.namespace, m.name, m.memory_type, m.description, m.body,
192| | m.body_hash, m.session_id, validated_source,
193| 39| serde_json::to_string(&m.metadata)?
^0
194| | ],
195| 0| )?;
196| 39| Ok(conn.last_insert_rowid())
197| 39|}
198| |
199| |/// Updates an existing memory optionally guarded by optimistic concurrency.
200| |///
201| |/// When `expected_updated_at` is `Some(ts)` the row is only updated if its
202| |/// current `updated_at` equals `ts`. This protects concurrent `edit` calls
203| |/// from silently clobbering each other.
204| |///
205| |/// # Returns
206| |///
207| |/// `Ok(true)` when exactly one row was updated, `Ok(false)` when the
208| |/// optimistic check failed or the memory does not exist.
209| |///
210| |/// # Errors
211| |///
212| |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
213| 4|pub fn update(
214| 4| conn: &Connection,
215| 4| id: i64,
216| 4| m: &NewMemory,
217| 4| expected_updated_at: Option<i64>,
218| 4|) -> Result<bool, AppError> {
219| | // G29 Passo 2 (v1.0.69): runtime guard for the CHECK constraint on
220| | // `source`. Mirrors `insert` so `body-enrich` and other mutations
221| | // cannot reintroduce the historical "enrich" literal that broke
222| | // `body-enrich` in v1.0.55 - v1.0.68.
223| 4| let validated_source = crate::memory_source::validate_source(&m.source)?;
^0
224| 4| let affected = if let Some(ts) = expected_updated_at {
^2
225| 2| conn.execute(
226| 2| "UPDATE memories SET type=?2, description=?3, body=?4, body_hash=?5,
227| 2| session_id=?6, source=?7, metadata=?8
228| 2| WHERE id=?1 AND updated_at=?9 AND deleted_at IS NULL",
229| 2| params![
230| | id,
231| | m.memory_type,
232| | m.description,
233| | m.body,
234| | m.body_hash,
235| | m.session_id,
236| | validated_source,
237| 2| serde_json::to_string(&m.metadata)?,
^0
238| | ts
239| | ],
240| 0| )?
241| | } else {
242| 2| conn.execute(
243| 2| "UPDATE memories SET type=?2, description=?3, body=?4, body_hash=?5,
244| 2| session_id=?6, source=?7, metadata=?8
245| 2| WHERE id=?1 AND deleted_at IS NULL",
246| 2| params![
247| | id,
248| | m.memory_type,
249| | m.description,
250| | m.body,
251| | m.body_hash,
252| | m.session_id,
253| | validated_source,
254| 2| serde_json::to_string(&m.metadata)?
^0
255| | ],
256| 0| )?
257| | };
258| 4| Ok(affected == 1)
259| 4|}
260| |
261| |/// Replaces the vector row for a memory in `vec_memories`.
262| |///
263| |/// `sqlite-vec` virtual tables do not implement `INSERT OR REPLACE`, so the
264| |/// existing row is deleted first and a fresh vector is inserted. Callers
265| |/// must pass an `embedding` with length [`crate::constants::EMBEDDING_DIM`].
266| |///
267| |/// # Errors
268| |///
269| |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
270| 7|pub fn upsert_vec(
271| 7| conn: &Connection,
272| 7| memory_id: i64,
273| 7| namespace: &str,
274| 7| memory_type: &str,
275| 7| embedding: &[f32],
276| 7| name: &str,
277| 7| snippet: &str,
278| 7|) -> Result<(), AppError> {
279| | // sqlite-vec virtual tables do not support INSERT OR REPLACE semantics.
280| | // Must delete the existing row first, then insert. Both statements are
281| | // wrapped in `with_busy_retry` because WAL-mode concurrent writers can
282| | // cause SQLITE_BUSY on vec0 virtual table writes.
283| 7| let embedding_bytes = f32_to_bytes(embedding);
284| 7| with_busy_retry(|| {
285| 7| conn.execute(
286| 7| "DELETE FROM vec_memories WHERE memory_id = ?1",
287| 7| params![memory_id],
288| 0| )?;
289| 7| conn.execute(
290| 7| "INSERT INTO vec_memories(memory_id, namespace, type, embedding, name, snippet)
291| 7| VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
292| 7| params![
293| | memory_id,
294| | namespace,
295| | memory_type,
296| 7| &embedding_bytes,
297| | name,
298| | snippet
299| | ],
300| 0| )?;
301| 7| Ok(())
302| 7| })
303| 7|}
304| |
305| |/// Deletes the vector row for `memory_id` from `vec_memories`.
306| |///
307| |/// Called during `forget` and `purge` to keep the vector table consistent
308| |/// with the logical state of `memories`.
309| |///
310| |/// # Errors
311| |///
312| |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
313| 2|pub fn delete_vec(conn: &Connection, memory_id: i64) -> Result<(), AppError> {
314| 2| conn.execute(
315| 2| "DELETE FROM vec_memories WHERE memory_id = ?1",
316| 2| params![memory_id],
317| 0| )?;
318| 2| Ok(())
319| 2|}
320| |
321| |/// Fetches a live memory by `(namespace, name)` and returns all columns.
322| |///
323| |/// # Returns
324| |///
325| |/// `Ok(Some(row))` when found, `Ok(None)` when missing or soft-deleted.
326| |///
327| |/// # Errors
328| |///
329| |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
330| 3|pub fn read_by_name(
331| 3| conn: &Connection,
332| 3| namespace: &str,
333| 3| name: &str,
334| 3|) -> Result<Option<MemoryRow>, AppError> {
335| 3| let mut stmt = conn.prepare_cached(
336| 3| "SELECT id, namespace, name, type, description, body, body_hash,
337| 3| session_id, source, metadata, created_at, updated_at, deleted_at
338| 3| FROM memories WHERE namespace=?1 AND name=?2 AND deleted_at IS NULL",
339| 0| )?;
340| 3| match stmt.query_row(params![namespace, name], |r| {
^1
341| | Ok(MemoryRow {
342| 1| id: r.get(0)?,
^0
343| 1| namespace: r.get(1)?,
^0
344| 1| name: r.get(2)?,
^0
345| 1| memory_type: r.get(3)?,
^0
346| 1| description: r.get(4)?,
^0
347| 1| body: r.get(5)?,
^0
348| 1| body_hash: r.get(6)?,
^0
349| 1| session_id: r.get(7)?,
^0
350| 1| source: r.get(8)?,
^0
351| 1| metadata: r.get(9)?,
^0
352| 1| created_at: r.get(10)?,
^0
353| 1| updated_at: r.get(11)?,
^0
354| 1| deleted_at: r.get(12)?,
^0
355| | })
356| 1| }) {
357| 1| Ok(m) => Ok(Some(m)),
358| 2| Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
359| 0| Err(e) => Err(AppError::Database(e)),
360| | }
361| 3|}
362| |
363| |/// Soft-deletes a memory by setting `deleted_at = unixepoch()`.
364| |///
365| |/// Versions and chunks are preserved so `restore` can undo the operation
366| |/// until a subsequent `purge` reclaims the storage permanently.
367| |///
368| |/// # Returns
369| |///
370| |/// `Ok(true)` when a live memory was soft-deleted, `Ok(false)` when no
371| |/// matching live row existed.
372| |///
373| |/// # Errors
374| |///
375| |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
376| 7|pub fn soft_delete(conn: &Connection, namespace: &str, name: &str) -> Result<bool, AppError> {
377| 7| let affected = conn.execute(
378| 7| "UPDATE memories SET deleted_at = unixepoch() WHERE namespace=?1 AND name=?2 AND deleted_at IS NULL",
379| 7| params![namespace, name],
380| 0| )?;
381| 7| Ok(affected == 1)
382| 7|}
383| |
384| |/// Lists live memories in a namespace ordered by `updated_at` descending.
385| |///
386| |/// # Arguments
387| |///
388| |/// - `memory_type` — optional filter on the `type` column.
389| |/// - `limit` / `offset` — standard pagination controls in rows.
390| |///
391| |/// # Errors
392| |///
393| |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
394| 6|pub fn list(
395| 6| conn: &Connection,
396| 6| namespace: &str,
397| 6| memory_type: Option<&str>,
398| 6| limit: usize,
399| 6| offset: usize,
400| 6| include_deleted: bool,
401| 6|) -> Result<Vec<MemoryRow>, AppError> {
402| 6| if let Some(mt) = memory_type {
^2
403| 2| let sql = if include_deleted {
404| 0| "SELECT id, namespace, name, type, description, body, body_hash,
405| 0| session_id, source, metadata, created_at, updated_at, deleted_at
406| 0| FROM memories WHERE namespace=?1 AND type=?2
407| 0| ORDER BY updated_at DESC LIMIT ?3 OFFSET ?4"
408| | } else {
409| 2| "SELECT id, namespace, name, type, description, body, body_hash,
410| 2| session_id, source, metadata, created_at, updated_at, deleted_at
411| 2| FROM memories WHERE namespace=?1 AND type=?2 AND deleted_at IS NULL
412| 2| ORDER BY updated_at DESC LIMIT ?3 OFFSET ?4"
413| | };
414| 2| let mut stmt = conn.prepare_cached(sql)?;
^0
415| 2| let rows = stmt
416| 2| .query_map(params![namespace, mt, limit as i64, offset as i64], |r| {
417| | Ok(MemoryRow {
418| 2| id: r.get(0)?,
^0
419| 2| namespace: r.get(1)?,
^0
420| 2| name: r.get(2)?,
^0
421| 2| memory_type: r.get(3)?,
^0
422| 2| description: r.get(4)?,
^0
423| 2| body: r.get(5)?,
^0
424| 2| body_hash: r.get(6)?,
^0
425| 2| session_id: r.get(7)?,
^0
426| 2| source: r.get(8)?,
^0
427| 2| metadata: r.get(9)?,
^0
428| 2| created_at: r.get(10)?,
^0
429| 2| updated_at: r.get(11)?,
^0
430| 2| deleted_at: r.get(12)?,
^0
431| | })
432| 2| })?
^0
433| 2| .collect::<Result<Vec<_>, _>>()?;
^0
434| 2| Ok(rows)
435| | } else {
436| 4| let sql = if include_deleted {
437| 0| "SELECT id, namespace, name, type, description, body, body_hash,
438| 0| session_id, source, metadata, created_at, updated_at, deleted_at
439| 0| FROM memories WHERE namespace=?1
440| 0| ORDER BY updated_at DESC LIMIT ?2 OFFSET ?3"
441| | } else {
442| 4| "SELECT id, namespace, name, type, description, body, body_hash,
443| 4| session_id, source, metadata, created_at, updated_at, deleted_at
444| 4| FROM memories WHERE namespace=?1 AND deleted_at IS NULL
445| 4| ORDER BY updated_at DESC LIMIT ?2 OFFSET ?3"
446| | };
447| 4| let mut stmt = conn.prepare_cached(sql)?;
^0
448| 4| let rows = stmt
449| 6| .query_map(params![namespace, limit as i64, offset as i64], |r| {
^4 ^4 ^4 ^4
450| | Ok(MemoryRow {
451| 6| id: r.get(0)?,
^0
452| 6| namespace: r.get(1)?,
^0
453| 6| name: r.get(2)?,
^0
454| 6| memory_type: r.get(3)?,
^0
455| 6| description: r.get(4)?,
^0
456| 6| body: r.get(5)?,
^0
457| 6| body_hash: r.get(6)?,
^0
458| 6| session_id: r.get(7)?,
^0
459| 6| source: r.get(8)?,
^0
460| 6| metadata: r.get(9)?,
^0
461| 6| created_at: r.get(10)?,
^0
462| 6| updated_at: r.get(11)?,
^0
463| 6| deleted_at: r.get(12)?,
^0
464| | })
465| 6| })?
^0
466| 4| .collect::<Result<Vec<_>, _>>()?;
^0
467| 4| Ok(rows)
468| | }
469| 6|}
470| |
471| |/// Runs a KNN search over `vec_memories`, optionally restricted to namespaces.
472| |///
473| |/// # Arguments
474| |///
475| |/// - `embedding` — query vector of length [`crate::constants::EMBEDDING_DIM`].
476| |/// - `namespaces` — namespaces to search. Empty slice means "all namespaces".
477| |/// - `memory_type` — optional filter on the `type` column.
478| |/// - `k` — maximum number of hits to return.
479| |///
480| |/// # Returns
481| |///
482| |/// A vector of `(memory_id, distance)` pairs sorted by ascending distance.
483| |///
484| |/// # Errors
485| |///
486| |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
487| 3|pub fn knn_search(
488| 3| conn: &Connection,
489| 3| embedding: &[f32],
490| 3| namespaces: &[String],
491| 3| memory_type: Option<&str>,
492| 3| k: usize,
493| 3|) -> Result<Vec<(i64, f32)>, AppError> {
494| 3| let bytes = f32_to_bytes(embedding);
495| |
496| 3| match namespaces.len() {
497| | 0 => {
498| | // No namespace filter — search all namespaces.
499| 0| if let Some(mt) = memory_type {
500| 0| let mut stmt = conn.prepare_cached(
501| 0| "SELECT memory_id, distance FROM vec_memories \
502| 0| WHERE embedding MATCH ?1 AND type = ?2 \
503| 0| ORDER BY distance LIMIT ?3",
504| 0| )?;
505| 0| let rows = stmt
506| 0| .query_map(params![bytes, mt, k as i64], |r| {
507| 0| Ok((r.get::<_, i64>(0)?, r.get::<_, f32>(1)?))
508| 0| })?
509| 0| .collect::<Result<Vec<_>, _>>()?;
510| 0| Ok(rows)
511| | } else {
512| 0| let mut stmt = conn.prepare_cached(
513| 0| "SELECT memory_id, distance FROM vec_memories \
514| 0| WHERE embedding MATCH ?1 \
515| 0| ORDER BY distance LIMIT ?2",
516| 0| )?;
517| 0| let rows = stmt
518| 0| .query_map(params![bytes, k as i64], |r| {
519| 0| Ok((r.get::<_, i64>(0)?, r.get::<_, f32>(1)?))
520| 0| })?
521| 0| .collect::<Result<Vec<_>, _>>()?;
522| 0| Ok(rows)
523| | }
524| | }
525| | 1 => {
526| | // Fast single-namespace path (preserved from previous implementation).
527| 3| let ns = &namespaces[0];
528| 3| if let Some(mt) = memory_type {
^2
529| 2| let mut stmt = conn.prepare_cached(
530| 2| "SELECT memory_id, distance FROM vec_memories \
531| 2| WHERE embedding MATCH ?1 AND namespace = ?2 AND type = ?3 \
532| 2| ORDER BY distance LIMIT ?4",
533| 0| )?;
534| 2| let rows = stmt
535| 2| .query_map(params![bytes, ns, mt, k as i64], |r| {
536| 2| Ok((r.get::<_, i64>(0)?, r.get::<_, f32>(1)?))
^0 ^0
537| 2| })?
^0
538| 2| .collect::<Result<Vec<_>, _>>()?;
^0
539| 2| Ok(rows)
540| | } else {
541| 1| let mut stmt = conn.prepare_cached(
542| 1| "SELECT memory_id, distance FROM vec_memories \
543| 1| WHERE embedding MATCH ?1 AND namespace = ?2 \
544| 1| ORDER BY distance LIMIT ?3",
545| 0| )?;
546| 1| let rows = stmt
547| 2| .query_map(params![bytes, ns, k as i64], |r| {
^1 ^1 ^1
548| 2| Ok((r.get::<_, i64>(0)?, r.get::<_, f32>(1)?))
^0 ^0
549| 2| })?
^0
550| 1| .collect::<Result<Vec<_>, _>>()?;
^0
551| 1| Ok(rows)
552| | }
553| | }
554| | _ => {
555| | // Multiple explicit namespaces: build IN clause with positional placeholders.
556| | // rusqlite does not support array binding, so we generate "?,?,..." manually.
557| 0| let placeholders = (0..namespaces.len())
558| 0| .map(|_| "?")
559| 0| .collect::<Vec<_>>()
560| 0| .join(",");
561| 0| if let Some(mt) = memory_type {
562| 0| let query = format!(
563| 0| "SELECT memory_id, distance FROM vec_memories \
564| 0| WHERE embedding MATCH ? AND type = ? AND namespace IN ({placeholders}) \
565| 0| ORDER BY distance LIMIT ?"
566| | );
567| 0| let mut stmt = conn.prepare(&query)?;
568| | // Params: [bytes, mt, ns0, ns1, ..., k]
569| 0| let mut raw_params: Vec<Box<dyn rusqlite::ToSql>> =
570| 0| vec![Box::new(bytes), Box::new(mt.to_string())];
571| 0| for ns in namespaces {
572| 0| raw_params.push(Box::new(ns.clone()));
573| 0| }
574| 0| raw_params.push(Box::new(k as i64));
575| 0| let param_refs: Vec<&dyn rusqlite::ToSql> =
576| 0| raw_params.iter().map(|b| b.as_ref()).collect();
577| 0| let rows = stmt
578| 0| .query_map(param_refs.as_slice(), |r| {
579| 0| Ok((r.get::<_, i64>(0)?, r.get::<_, f32>(1)?))
580| 0| })?
581| 0| .collect::<Result<Vec<_>, _>>()?;
582| 0| Ok(rows)
583| | } else {
584| 0| let query = format!(
585| 0| "SELECT memory_id, distance FROM vec_memories \
586| 0| WHERE embedding MATCH ? AND namespace IN ({placeholders}) \
587| 0| ORDER BY distance LIMIT ?"
588| | );
589| 0| let mut stmt = conn.prepare(&query)?;
590| | // Params: [bytes, ns0, ns1, ..., k]
591| 0| let mut raw_params: Vec<Box<dyn rusqlite::ToSql>> = vec![Box::new(bytes)];
592| 0| for ns in namespaces {
593| 0| raw_params.push(Box::new(ns.clone()));
594| 0| }
595| 0| raw_params.push(Box::new(k as i64));
596| 0| let param_refs: Vec<&dyn rusqlite::ToSql> =
597| 0| raw_params.iter().map(|b| b.as_ref()).collect();
598| 0| let rows = stmt
599| 0| .query_map(param_refs.as_slice(), |r| {
600| 0| Ok((r.get::<_, i64>(0)?, r.get::<_, f32>(1)?))
601| 0| })?
602| 0| .collect::<Result<Vec<_>, _>>()?;
603| 0| Ok(rows)
604| | }
605| | }
606| | }
607| 3|}
608| |
609| |/// Fetches a live memory by primary key and returns all columns.
610| |///
611| |/// Mirrors [`read_by_name`] but keyed on `rowid` for use after a KNN search.
612| |///
613| |/// # Errors
614| |///
615| |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
616| 7|pub fn read_full(conn: &Connection, memory_id: i64) -> Result<Option<MemoryRow>, AppError> {
617| 7| let mut stmt = conn.prepare_cached(
618| 7| "SELECT id, namespace, name, type, description, body, body_hash,
619| 7| session_id, source, metadata, created_at, updated_at, deleted_at
620| 7| FROM memories WHERE id=?1 AND deleted_at IS NULL",
621| 0| )?;
622| 7| match stmt.query_row(params![memory_id], |r| {
^6
623| | Ok(MemoryRow {
624| 6| id: r.get(0)?,
^0
625| 6| namespace: r.get(1)?,
^0
626| 6| name: r.get(2)?,
^0
627| 6| memory_type: r.get(3)?,
^0
628| 6| description: r.get(4)?,
^0
629| 6| body: r.get(5)?,
^0
630| 6| body_hash: r.get(6)?,
^0
631| 6| session_id: r.get(7)?,
^0
632| 6| source: r.get(8)?,
^0
633| 6| metadata: r.get(9)?,
^0
634| 6| created_at: r.get(10)?,
^0
635| 6| updated_at: r.get(11)?,
^0
636| 6| deleted_at: r.get(12)?,
^0
637| | })
638| 6| }) {
639| 6| Ok(m) => Ok(Some(m)),
640| 1| Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
641| 0| Err(e) => Err(AppError::Database(e)),
642| | }
643| 7|}
644| |
645| |/// Fetches all memory_ids in a namespace that are soft-deleted and whose
646| |/// `deleted_at` is older than `before_ts` (unix epoch seconds).
647| |///
648| |/// Used by `purge` to collect stale rows for permanent deletion.
649| |///
650| |/// # Errors
651| |///
652| |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
653| 2|pub fn list_deleted_before(
654| 2| conn: &Connection,
655| 2| namespace: &str,
656| 2| before_ts: i64,
657| 2|) -> Result<Vec<i64>, AppError> {
658| 2| let mut stmt = conn.prepare_cached(
659| 2| "SELECT id FROM memories WHERE namespace = ?1 AND deleted_at IS NOT NULL AND deleted_at < ?2",
660| 0| )?;
661| 2| let ids = stmt
662| 2| .query_map(params![namespace, before_ts], |r| r.get::<_, i64>(0))?
^1^1 ^0
663| 2| .collect::<Result<Vec<_>, _>>()?;
^0
664| 2| Ok(ids)
665| 2|}
666| |
667| |/// Preprocesses a raw user query for FTS5 `MATCH`.
668| |///
669| |/// Technical separators (`-`, `.`, `_`, `/`) are treated as word boundaries by
670| |/// the `unicode61` tokenizer. When the query contains any of these characters
671| |/// the function builds a compound FTS5 expression:
672| |/// 1. A phrase query with the separated tokens (exact compound matching).
673| |/// 2. Individual prefix terms joined with OR (broader recall).
674| |///
675| |/// Queries without separators keep the original `term*` prefix behaviour.
676| 18|fn preprocess_fts_query(raw: &str) -> String {
677| | const SEPARATORS: &[char] = &['-', '.', '_', '/'];
678| | const FTS5_SYNTAX: &[char] = &['"', '*', '(', ')', '^', ':'];
679| | const FTS5_KEYWORDS: &[&str] = &["OR", "AND", "NOT", "NEAR"];
680| |
681| 177| let sanitized: String = raw.chars().filter(|c| !FTS5_SYNTAX.contains(c)).collect();
^18 ^18 ^18 ^18 ^18 ^18
682| 18| let trimmed = sanitized.trim();
683| 18| if trimmed.is_empty() {
684| 2| return String::new();
685| 16| }
686| |
687| 108| let is_fts_keyword = |t: &str| FTS5_KEYWORDS.iter().any(|kw| kw.eq_ignore_ascii_case(t));
^16 ^30 ^30
688| |
689| 126| if !trimmed.chars().any(|c| SEPARATORS.contains(&c)) {
^16 ^16
690| 11| return trimmed
691| 11| .split_whitespace()
692| 17| .filter(|t| !is_fts_keyword(t))
^11
693| 13| .map(|t| format!("{t}*"))
^11
694| 11| .collect::<Vec<_>>()
695| 11| .join(" ");
696| 5| }
697| 5| let tokens: Vec<&str> = trimmed
698| 81| .split(|c: char| SEPARATORS.contains(&c) || c.is_whitespace())
^5 ^73^73
699| 13| .filter(|t| !t.is_empty() && !is_fts_keyword(t))
^5
700| 5| .collect();
701| 5| if tokens.is_empty() {
702| 0| return String::new();
703| 5| }
704| 5| let phrase = format!("\"{}\"", tokens.join(" "));
705| 12| let prefix_terms: Vec<String> = tokens.iter().map(|t| format!("{t}*")).collect();
^5 ^5 ^5 ^5 ^5
706| 5| format!("{phrase} OR {}", prefix_terms.join(" OR "))
707| 18|}
708| |
709| |/// Executes an FTS5 search against `fts_memories` with query preprocessing.
710| |///
711| |/// Technical separators in the query are converted to phrase + prefix OR
712| |/// expressions so compound terms like `graphrag-precompact.sh` match correctly.
713| |///
714| |/// # Errors
715| |///
716| |/// Returns `Err(AppError::Database)` on any `rusqlite` failure.
717| 5|pub fn fts_search(
718| 5| conn: &Connection,
719| 5| query: &str,
720| 5| namespace: &str,
721| 5| memory_type: Option<&str>,
722| 5| limit: usize,
723| 5|) -> Result<Vec<MemoryRow>, AppError> {
724| 5| let fts_query = preprocess_fts_query(query);
725| 5| if let Some(mt) = memory_type {
^2
726| 2| let mut stmt = conn.prepare_cached(
727| 2| "SELECT m.id, m.namespace, m.name, m.type, m.description, m.body, m.body_hash,
728| 2| m.session_id, m.source, m.metadata, m.created_at, m.updated_at, m.deleted_at
729| 2| FROM fts_memories fts
730| 2| JOIN memories m ON m.id = fts.rowid
731| 2| WHERE fts_memories MATCH ?1 AND m.namespace = ?2 AND m.type = ?3 AND m.deleted_at IS NULL
732| 2| ORDER BY rank LIMIT ?4",
733| 0| )?;
734| 2| let rows = stmt
735| 2| .query_map(params![fts_query, namespace, mt, limit as i64], |r| {
736| | Ok(MemoryRow {
737| 2| id: r.get(0)?,
^0
738| 2| namespace: r.get(1)?,
^0
739| 2| name: r.get(2)?,
^0
740| 2| memory_type: r.get(3)?,
^0
741| 2| description: r.get(4)?,
^0
742| 2| body: r.get(5)?,
^0
743| 2| body_hash: r.get(6)?,
^0
744| 2| session_id: r.get(7)?,
^0
745| 2| source: r.get(8)?,
^0
746| 2| metadata: r.get(9)?,
^0
747| 2| created_at: r.get(10)?,
^0
748| 2| updated_at: r.get(11)?,
^0
749| 2| deleted_at: r.get(12)?,
^0
750| | })
751| 2| })?
^0
752| 2| .collect::<Result<Vec<_>, _>>()?;
^0
753| 2| Ok(rows)
754| | } else {
755| 3| let mut stmt = conn.prepare_cached(
756| 3| "SELECT m.id, m.namespace, m.name, m.type, m.description, m.body, m.body_hash,
757| 3| m.session_id, m.source, m.metadata, m.created_at, m.updated_at, m.deleted_at
758| 3| FROM fts_memories fts
759| 3| JOIN memories m ON m.id = fts.rowid
760| 3| WHERE fts_memories MATCH ?1 AND m.namespace = ?2 AND m.deleted_at IS NULL
761| 3| ORDER BY rank LIMIT ?3",
762| 0| )?;
763| 3| let rows = stmt
764| 3| .query_map(params![fts_query, namespace, limit as i64], |r| {
^2
765| | Ok(MemoryRow {
766| 2| id: r.get(0)?,
^0
767| 2| namespace: r.get(1)?,
^0
768| 2| name: r.get(2)?,
^0
769| 2| memory_type: r.get(3)?,
^0
770| 2| description: r.get(4)?,
^0
771| 2| body: r.get(5)?,
^0
772| 2| body_hash: r.get(6)?,
^0
773| 2| session_id: r.get(7)?,
^0
774| 2| source: r.get(8)?,
^0
775| 2| metadata: r.get(9)?,
^0
776| 2| created_at: r.get(10)?,
^0
777| 2| updated_at: r.get(11)?,
^0
778| 2| deleted_at: r.get(12)?,
^0
779| | })
780| 2| })?
^0
781| 3| .collect::<Result<Vec<_>, _>>()?;
^0
782| 3| Ok(rows)
783| | }
784| 5|}
785| |
786| |/// Syncs FTS5 external-content index after an UPDATE on the memories table.
787| |///
788| |/// The AFTER UPDATE trigger (`trg_fts_au`) is intentionally absent because
789| |/// sqlite-vec loaded via `sqlite3_auto_extension` conflicts with FTS5 inside
790| |/// UPDATE triggers. This function performs the equivalent sync in Rust:
791| |/// DELETE the old entry, then INSERT the new one (external-content FTS5
792| |/// tables do not support in-place UPDATE).
793| |#[allow(clippy::too_many_arguments)]
794| 0|pub fn sync_fts_after_update(
795| 0| conn: &Connection,
796| 0| memory_id: i64,
797| 0| old_name: &str,
798| 0| old_desc: &str,
799| 0| old_body: &str,
800| 0| new_name: &str,
801| 0| new_desc: &str,
802| 0| new_body: &str,
803| 0|) -> Result<(), AppError> {
804| 0| conn.execute(
805| 0| "INSERT INTO fts_memories(fts_memories, rowid, name, description, body)
806| 0| VALUES('delete', ?1, ?2, ?3, ?4)",
807| 0| params![memory_id, old_name, old_desc, old_body],
808| 0| )?;
809| 0| conn.execute(
810| 0| "INSERT INTO fts_memories(rowid, name, description, body)
811| 0| VALUES(?1, ?2, ?3, ?4)",
812| 0| params![memory_id, new_name, new_desc, new_body],
813| 0| )?;
814| 0| Ok(())
815| 0|}
816| |
817| |#[cfg(test)]
818| |mod tests {
819| | use super::*;
820| | use rusqlite::Connection;
821| |
822| | type TestResult = Result<(), Box<dyn std::error::Error>>;
823| |
824| 37| fn setup_conn() -> Result<Connection, Box<dyn std::error::Error>> {
825| 37| crate::storage::connection::register_vec_extension();
826| 37| let mut conn = Connection::open_in_memory()?;
^0
827| 37| conn.execute_batch(
828| 37| "PRAGMA foreign_keys = ON;
829| 37| PRAGMA temp_store = MEMORY;",
830| 0| )?;
831| 37| crate::migrations::runner().run(&mut conn)?;
^0
832| 37| Ok(conn)
833| 37| }
834| |
835| 42| fn new_memory(name: &str) -> NewMemory {
836| 42| NewMemory {
837| 42| namespace: "global".to_string(),
838| 42| name: name.to_string(),
839| 42| memory_type: "user".to_string(),
840| 42| description: "descricao de teste".to_string(),
841| 42| body: "test memory body".to_string(),
842| 42| body_hash: format!("hash-{name}"),
843| 42| session_id: None,
844| 42| source: "agent".to_string(),
845| 42| metadata: serde_json::json!({}),
846| 42| }
847| 42| }
848| |
849| | #[test]
850| 1| fn insert_and_find_by_name_return_id() -> TestResult {
851| 1| let conn = setup_conn()?;
^0
852| 1| let m = new_memory("mem-alpha");
853| 1| let id = insert(&conn, &m)?;
^0
854| 1| assert!(id > 0);
855| |
856| 1| let found = find_by_name(&conn, "global", "mem-alpha")?;
^0
857| 1| assert!(found.is_some());
858| 1| let (found_id, _, _) = found.ok_or("mem-alpha should exist")?;
^0
859| 1| assert_eq!(found_id, id);
860| 1| Ok(())
861| 1| }
862| |
863| | #[test]
864| 1| fn find_by_name_returns_none_when_not_found() -> TestResult {
865| 1| let conn = setup_conn()?;
^0
866| 1| let result = find_by_name(&conn, "global", "inexistente")?;
^0
867| 1| assert!(result.is_none());
868| 1| Ok(())
869| 1| }
870| |
871| | #[test]
872| 1| fn find_by_hash_returns_correct_id() -> TestResult {
873| 1| let conn = setup_conn()?;
^0
874| 1| let m = new_memory("mem-hash");
875| 1| let id = insert(&conn, &m)?;
^0
876| |
877| 1| let found = find_by_hash(&conn, "global", "hash-mem-hash")?;
^0
878| 1| assert_eq!(found, Some(id));
879| 1| Ok(())
880| 1| }
881| |
882| | #[test]
883| 1| fn find_by_hash_returns_none_when_hash_not_found() -> TestResult {
884| 1| let conn = setup_conn()?;
^0
885| 1| let result = find_by_hash(&conn, "global", "hash-inexistente")?;
^0
886| 1| assert!(result.is_none());
887| 1| Ok(())
888| 1| }
889| |
890| | #[test]
891| 1| fn find_by_hash_ignores_different_namespace() -> TestResult {
892| 1| let conn = setup_conn()?;
^0
893| 1| let m = new_memory("mem-ns");
894| 1| insert(&conn, &m)?;
^0
895| |
896| 1| let result = find_by_hash(&conn, "outro-namespace", "hash-mem-ns")?;
^0
897| 1| assert!(result.is_none());
898| 1| Ok(())
899| 1| }
900| |
901| | #[test]
902| 1| fn read_by_name_returns_full_memory() -> TestResult {
903| 1| let conn = setup_conn()?;
^0
904| 1| let m = new_memory("mem-read");
905| 1| let id = insert(&conn, &m)?;
^0
906| |
907| 1| let row = read_by_name(&conn, "global", "mem-read")?.ok_or("mem-read should exist")?;
^0 ^0
908| 1| assert_eq!(row.id, id);
909| 1| assert_eq!(row.name, "mem-read");
910| 1| assert_eq!(row.memory_type, "user");
911| 1| assert_eq!(row.body, "test memory body");
912| 1| assert_eq!(row.namespace, "global");
913| 1| Ok(())
914| 1| }
915| |
916| | #[test]
917| 1| fn read_by_name_returns_none_for_missing() -> TestResult {
918| 1| let conn = setup_conn()?;
^0
919| 1| let result = read_by_name(&conn, "global", "nao-existe")?;
^0
920| 1| assert!(result.is_none());
921| 1| Ok(())
922| 1| }
923| |
924| | #[test]
925| 1| fn read_full_by_id_returns_memory() -> TestResult {
926| 1| let conn = setup_conn()?;
^0
927| 1| let m = new_memory("mem-full");
928| 1| let id = insert(&conn, &m)?;
^0
929| |
930| 1| let row = read_full(&conn, id)?.ok_or("mem-full should exist")?;
^0 ^0
931| 1| assert_eq!(row.id, id);
932| 1| assert_eq!(row.name, "mem-full");
933| 1| Ok(())
934| 1| }
935| |
936| | #[test]
937| 1| fn read_full_returns_none_for_missing_id() -> TestResult {
938| 1| let conn = setup_conn()?;
^0
939| 1| let result = read_full(&conn, 9999)?;
^0
940| 1| assert!(result.is_none());
941| 1| Ok(())
942| 1| }
943| |
944| | #[test]
945| 1| fn update_without_optimism_modifies_fields() -> TestResult {
946| 1| let conn = setup_conn()?;
^0
947| 1| let m = new_memory("mem-upd");
948| 1| let id = insert(&conn, &m)?;
^0
949| |
950| 1| let mut m2 = new_memory("mem-upd");
951| 1| m2.body = "updated body".to_string();
952| 1| m2.body_hash = "hash-novo".to_string();
953| 1| let ok = update(&conn, id, &m2, None)?;
^0
954| 1| assert!(ok);
955| |
956| 1| let row = read_full(&conn, id)?.ok_or("mem-upd should exist")?;
^0 ^0
957| 1| assert_eq!(row.body, "updated body");
958| 1| assert_eq!(row.body_hash, "hash-novo");
959| 1| Ok(())
960| 1| }
961| |
962| | #[test]
963| 1| fn update_with_correct_expected_updated_at_succeeds() -> TestResult {
964| 1| let conn = setup_conn()?;
^0
965| 1| let m = new_memory("mem-opt");
966| 1| let id = insert(&conn, &m)?;
^0
967| |
968| 1| let (_, updated_at, _) =
969| 1| find_by_name(&conn, "global", "mem-opt")?.ok_or("mem-opt should exist")?;
^0 ^0
970| |
971| 1| let mut m2 = new_memory("mem-opt");
972| 1| m2.body = "optimistic body".to_string();
973| 1| m2.body_hash = "hash-optimistic".to_string();
974| 1| let ok = update(&conn, id, &m2, Some(updated_at))?;
^0
975| 1| assert!(ok);
976| |
977| 1| let row = read_full(&conn, id)?.ok_or("mem-opt should exist after update")?;
^0 ^0
978| 1| assert_eq!(row.body, "optimistic body");
979| 1| Ok(())
980| 1| }
981| |
982| | #[test]
983| 1| fn update_with_wrong_expected_updated_at_returns_false() -> TestResult {
984| 1| let conn = setup_conn()?;
^0
985| 1| let m = new_memory("mem-conflict");
986| 1| let id = insert(&conn, &m)?;
^0
987| |
988| 1| let mut m2 = new_memory("mem-conflict");
989| 1| m2.body = "must not appear".to_string();
990| 1| m2.body_hash = "hash-x".to_string();
991| 1| let ok = update(&conn, id, &m2, Some(0))?;
^0
992| 1| assert!(!ok);
993| |
994| 1| let row = read_full(&conn, id)?.ok_or("mem-conflict should exist")?;
^0 ^0
995| 1| assert_eq!(row.body, "test memory body");
996| 1| Ok(())
997| 1| }
998| |
999| | #[test]
1000| 1| fn update_missing_id_returns_false() -> TestResult {
1001| 1| let conn = setup_conn()?;
^0
1002| 1| let m = new_memory("fantasma");
1003| 1| let ok = update(&conn, 9999, &m, None)?;
^0
1004| 1| assert!(!ok);
1005| 1| Ok(())
1006| 1| }
1007| |
1008| | #[test]
1009| 1| fn soft_delete_marks_deleted_at() -> TestResult {
1010| 1| let conn = setup_conn()?;
^0
1011| 1| let m = new_memory("mem-del");
1012| 1| insert(&conn, &m)?;
^0
1013| |
1014| 1| let ok = soft_delete(&conn, "global", "mem-del")?;
^0
1015| 1| assert!(ok);
1016| |
1017| 1| let result = find_by_name(&conn, "global", "mem-del")?;
^0
1018| 1| assert!(result.is_none());
1019| |
1020| 1| let result_read = read_by_name(&conn, "global", "mem-del")?;
^0
1021| 1| assert!(result_read.is_none());
1022| 1| Ok(())
1023| 1| }
1024| |
1025| | #[test]
1026| 1| fn soft_delete_returns_false_when_not_found() -> TestResult {
1027| 1| let conn = setup_conn()?;
^0
1028| 1| let ok = soft_delete(&conn, "global", "nao-existe")?;
^0
1029| 1| assert!(!ok);
1030| 1| Ok(())
1031| 1| }
1032| |
1033| | #[test]
1034| 1| fn double_soft_delete_returns_false_on_second_call() -> TestResult {
1035| 1| let conn = setup_conn()?;
^0
1036| 1| let m = new_memory("mem-del2");
1037| 1| insert(&conn, &m)?;
^0
1038| |
1039| 1| soft_delete(&conn, "global", "mem-del2")?;
^0
1040| 1| let ok = soft_delete(&conn, "global", "mem-del2")?;
^0
1041| 1| assert!(!ok);
1042| 1| Ok(())
1043| 1| }
1044| |
1045| | #[test]
1046| 1| fn list_returns_memories_from_namespace() -> TestResult {
1047| 1| let conn = setup_conn()?;
^0
1048| 1| insert(&conn, &new_memory("mem-list-a"))?;
^0
1049| 1| insert(&conn, &new_memory("mem-list-b"))?;
^0
1050| |
1051| 1| let rows = list(&conn, "global", None, 10, 0, false)?;
^0
1052| 1| assert!(rows.len() >= 2);
1053| 2| let nomes: Vec<_> = rows.iter().map(|r| r.name.as_str()).collect();
^1 ^1 ^1 ^1 ^1
1054| 1| assert!(nomes.contains(&"mem-list-a"));
1055| 1| assert!(nomes.contains(&"mem-list-b"));
1056| 1| Ok(())
1057| 1| }
1058| |
1059| | #[test]
1060| 1| fn list_with_type_filter_returns_only_correct_type() -> TestResult {
1061| 1| let conn = setup_conn()?;
^0
1062| 1| insert(&conn, &new_memory("mem-user"))?;
^0
1063| |
1064| 1| let mut m2 = new_memory("mem-feedback");
1065| 1| m2.memory_type = "feedback".to_string();
1066| 1| insert(&conn, &m2)?;
^0
1067| |
1068| 1| let rows_user = list(&conn, "global", Some("user"), 10, 0, false)?;
^0
1069| 1| assert!(rows_user.iter().all(|r| r.memory_type == "user"));
1070| |
1071| 1| let rows_fb = list(&conn, "global", Some("feedback"), 10, 0, false)?;
^0
1072| 1| assert!(rows_fb.iter().all(|r| r.memory_type == "feedback"));
1073| 1| Ok(())
1074| 1| }
1075| |
1076| | #[test]
1077| 1| fn list_exclui_soft_deleted() -> TestResult {
1078| 1| let conn = setup_conn()?;
^0
1079| 1| let m = new_memory("mem-excluida");
1080| 1| insert(&conn, &m)?;
^0
1081| 1| soft_delete(&conn, "global", "mem-excluida")?;
^0
1082| |
1083| 1| let rows = list(&conn, "global", None, 10, 0, false)?;
^0
1084| 1| assert!(rows.iter().all(|r| r.name != "mem-excluida"));
^0 ^0
1085| 1| Ok(())
1086| 1| }
1087| |
1088| | #[test]
1089| 1| fn list_pagination_works() -> TestResult {
1090| 1| let conn = setup_conn()?;
^0
1091| 6| for i in 0..5 {
^5
1092| 5| insert(&conn, &new_memory(&format!("mem-pag-{i}")))?;
^0
1093| | }
1094| |
1095| 1| let pagina1 = list(&conn, "global", None, 2, 0, false)?;
^0
1096| 1| let pagina2 = list(&conn, "global", None, 2, 2, false)?;
^0
1097| 1| assert!(pagina1.len() <= 2);
1098| 1| assert!(pagina2.len() <= 2);
1099| 1| if !pagina1.is_empty() && !pagina2.is_empty() {
1100| 1| assert_ne!(pagina1[0].id, pagina2[0].id);
1101| 0| }
1102| 1| Ok(())
1103| 1| }
1104| |
1105| | #[test]
1106| 1| fn upsert_vec_and_delete_vec_work() -> TestResult {
1107| 1| let conn = setup_conn()?;
^0
1108| 1| let m = new_memory("mem-vec");
1109| 1| let id = insert(&conn, &m)?;
^0
1110| |
1111| 1| let embedding: Vec<f32> = vec![0.1; 384];
1112| 1| upsert_vec(
1113| 1| &conn, id, "global", "user", &embedding, "mem-vec", "snippet",
1114| 0| )?;
1115| |
1116| 1| let count: i64 = conn.query_row(
1117| 1| "SELECT COUNT(*) FROM vec_memories WHERE memory_id = ?1",
1118| 1| params![id],
1119| 1| |r| r.get(0),
1120| 0| )?;
1121| 1| assert_eq!(count, 1);
1122| |
1123| 1| delete_vec(&conn, id)?;
^0
1124| |
1125| 1| let count_after: i64 = conn.query_row(
1126| 1| "SELECT COUNT(*) FROM vec_memories WHERE memory_id = ?1",
1127| 1| params![id],
1128| 1| |r| r.get(0),
1129| 0| )?;
1130| 1| assert_eq!(count_after, 0);
1131| 1| Ok(())
1132| 1| }
1133| |
1134| | #[test]
1135| 1| fn upsert_vec_replaces_existing_vector() -> TestResult {
1136| 1| let conn = setup_conn()?;
^0
1137| 1| let m = new_memory("mem-vec-upsert");
1138| 1| let id = insert(&conn, &m)?;
^0
1139| |
1140| 1| let emb1: Vec<f32> = vec![0.1; 384];
1141| 1| upsert_vec(&conn, id, "global", "user", &emb1, "mem-vec-upsert", "s1")?;
^0
1142| |
1143| 1| let emb2: Vec<f32> = vec![0.9; 384];
1144| 1| upsert_vec(&conn, id, "global", "user", &emb2, "mem-vec-upsert", "s2")?;
^0
1145| |
1146| 1| let count: i64 = conn.query_row(
1147| 1| "SELECT COUNT(*) FROM vec_memories WHERE memory_id = ?1",
1148| 1| params![id],
1149| 1| |r| r.get(0),
1150| 0| )?;
1151| 1| assert_eq!(count, 1);
1152| 1| Ok(())
1153| 1| }
1154| |
1155| | #[test]
1156| 1| fn knn_search_returns_results_by_distance() -> TestResult {
1157| 1| let conn = setup_conn()?;
^0
1158| |
1159| | // emb_a: predominantemente positivo — cosseno alto com query [1.0; 384]
1160| 1| let ma = new_memory("mem-knn-a");
1161| 1| let id_a = insert(&conn, &ma)?;
^0
1162| 1| let emb_a: Vec<f32> = vec![1.0; 384];
1163| 1| upsert_vec(&conn, id_a, "global", "user", &emb_a, "mem-knn-a", "s")?;
^0
1164| |
1165| | // emb_b: predominantemente negativo — cosseno baixo com query [1.0; 384]
1166| 1| let mb = new_memory("mem-knn-b");
1167| 1| let id_b = insert(&conn, &mb)?;
^0
1168| 1| let emb_b: Vec<f32> = vec![-1.0; 384];
1169| 1| upsert_vec(&conn, id_b, "global", "user", &emb_b, "mem-knn-b", "s")?;
^0
1170| |
1171| 1| let query: Vec<f32> = vec![1.0; 384];
1172| 1| let results = knn_search(&conn, &query, &["global".to_string()], None, 2)?;
^0
1173| 1| assert!(!results.is_empty());
1174| 1| assert_eq!(results[0].0, id_a);
1175| 1| Ok(())
1176| 1| }
1177| |
1178| | #[test]
1179| 1| fn knn_search_with_type_filter_restricts_result() -> TestResult {
1180| 1| let conn = setup_conn()?;
^0
1181| |
1182| 1| let ma = new_memory("mem-knn-tipo-user");
1183| 1| let id_a = insert(&conn, &ma)?;
^0
1184| 1| let emb: Vec<f32> = vec![1.0; 384];
1185| 1| upsert_vec(
1186| 1| &conn,
1187| 1| id_a,
1188| 1| "global",
1189| 1| "user",
1190| 1| &emb,
1191| 1| "mem-knn-tipo-user",
1192| 1| "s",
1193| 0| )?;
1194| |
1195| 1| let mut mb = new_memory("mem-knn-tipo-fb");
1196| 1| mb.memory_type = "feedback".to_string();
1197| 1| let id_b = insert(&conn, &mb)?;
^0
1198| 1| upsert_vec(
1199| 1| &conn,
1200| 1| id_b,
1201| 1| "global",
1202| 1| "feedback",
1203| 1| &emb,
1204| 1| "mem-knn-tipo-fb",
1205| 1| "s",
1206| 0| )?;
1207| |
1208| 1| let query: Vec<f32> = vec![1.0; 384];
1209| 1| let results_user = knn_search(&conn, &query, &["global".to_string()], Some("user"), 5)?;
^0
1210| 1| assert!(results_user.iter().all(|(id, _)| *id == id_a));
1211| |
1212| 1| let results_fb = knn_search(&conn, &query, &["global".to_string()], Some("feedback"), 5)?;
^0
1213| 1| assert!(results_fb.iter().all(|(id, _)| *id == id_b));
1214| 1| Ok(())
1215| 1| }
1216| |
1217| | #[test]
1218| 1| fn fts_search_finds_by_prefix_in_body() -> TestResult {
1219| 1| let conn = setup_conn()?;
^0
1220| 1| let mut m = new_memory("mem-fts");
1221| 1| m.body = "linguagem de programacao rust".to_string();
1222| 1| insert(&conn, &m)?;
^0
1223| |
1224| 1| conn.execute_batch(
1225| 1| "INSERT INTO fts_memories(rowid, name, description, body)
1226| 1| SELECT id, name, description, body FROM memories WHERE deleted_at IS NULL",
1227| 0| )?;
1228| |
1229| 1| let rows = fts_search(&conn, "programacao", "global", None, 10)?;
^0
1230| 1| assert!(!rows.is_empty());
1231| 1| assert!(rows.iter().any(|r| r.name == "mem-fts"));
1232| 1| Ok(())
1233| 1| }
1234| |
1235| | #[test]
1236| 1| fn fts_search_with_type_filter() -> TestResult {
1237| 1| let conn = setup_conn()?;
^0
1238| 1| let mut m = new_memory("mem-fts-tipo");
1239| 1| m.body = "linguagem especial para filtro".to_string();
1240| 1| insert(&conn, &m)?;
^0
1241| |
1242| 1| let mut m2 = new_memory("mem-fts-feedback");
1243| 1| m2.memory_type = "feedback".to_string();
1244| 1| m2.body = "linguagem especial para filtro".to_string();
1245| 1| insert(&conn, &m2)?;
^0
1246| |
1247| 1| conn.execute_batch(
1248| 1| "INSERT INTO fts_memories(rowid, name, description, body)
1249| 1| SELECT id, name, description, body FROM memories WHERE deleted_at IS NULL",
1250| 0| )?;
1251| |
1252| 1| let rows_user = fts_search(&conn, "especial", "global", Some("user"), 10)?;
^0
1253| 1| assert!(rows_user.iter().all(|r| r.memory_type == "user"));
1254| |
1255| 1| let rows_fb = fts_search(&conn, "especial", "global", Some("feedback"), 10)?;
^0
1256| 1| assert!(rows_fb.iter().all(|r| r.memory_type == "feedback"));
1257| 1| Ok(())
1258| 1| }
1259| |
1260| | #[test]
1261| 1| fn fts_search_excludes_deleted() -> TestResult {
1262| 1| let conn = setup_conn()?;
^0
1263| 1| let mut m = new_memory("mem-fts-del");
1264| 1| m.body = "deleted fts content".to_string();
1265| 1| insert(&conn, &m)?;
^0
1266| |
1267| 1| conn.execute_batch(
1268| 1| "INSERT INTO fts_memories(rowid, name, description, body)
1269| 1| SELECT id, name, description, body FROM memories WHERE deleted_at IS NULL",
1270| 0| )?;
1271| |
1272| 1| soft_delete(&conn, "global", "mem-fts-del")?;
^0
1273| |
1274| 1| let rows = fts_search(&conn, "deleted", "global", None, 10)?;
^0
1275| 1| assert!(rows.iter().all(|r| r.name != "mem-fts-del"));
^0 ^0
1276| 1| Ok(())
1277| 1| }
1278| |
1279| | #[test]
1280| 1| fn list_deleted_before_returns_correct_ids() -> TestResult {
1281| 1| let conn = setup_conn()?;
^0
1282| 1| let m = new_memory("mem-purge");
1283| 1| insert(&conn, &m)?;
^0
1284| 1| soft_delete(&conn, "global", "mem-purge")?;
^0
1285| |
1286| 1| let ids = list_deleted_before(&conn, "global", i64::MAX)?;
^0
1287| 1| assert!(!ids.is_empty());
1288| |
1289| 1| let ids_antes = list_deleted_before(&conn, "global", 0)?;
^0
1290| 1| assert!(ids_antes.is_empty());
1291| 1| Ok(())
1292| 1| }
1293| |
1294| | #[test]
1295| 1| fn find_by_name_returns_correct_max_version() -> TestResult {
1296| 1| let conn = setup_conn()?;
^0
1297| 1| let m = new_memory("mem-ver");
1298| 1| let id = insert(&conn, &m)?;
^0
1299| |
1300| 1| let (_, _, v0) = find_by_name(&conn, "global", "mem-ver")?.ok_or("mem-ver should exist")?;
^0 ^0
1301| 1| assert_eq!(v0, 0);
1302| |
1303| 1| conn.execute(
1304| 1| "INSERT INTO memory_versions (memory_id, version, name, type, description, body, metadata, change_reason)
1305| 1| VALUES (?1, 1, 'mem-ver', 'user', 'desc', 'body', '{}', 'create')",
1306| 1| params![id],
1307| 0| )?;
1308| |
1309| 1| let (_, _, v1) =
1310| 1| find_by_name(&conn, "global", "mem-ver")?.ok_or("mem-ver should exist after insert")?;
^0 ^0
1311| 1| assert_eq!(v1, 1);
1312| 1| Ok(())
1313| 1| }
1314| |
1315| | #[test]
1316| 1| fn insert_com_metadata_json() -> TestResult {
1317| 1| let conn = setup_conn()?;
^0
1318| 1| let mut m = new_memory("mem-meta");
1319| 1| m.metadata = serde_json::json!({"chave": "valor", "numero": 42});
1320| 1| let id = insert(&conn, &m)?;
^0
1321| |
1322| 1| let row = read_full(&conn, id)?.ok_or("mem-meta should exist")?;
^0 ^0
1323| 1| let meta: serde_json::Value = serde_json::from_str(&row.metadata)?;
^0
1324| 1| assert_eq!(meta["chave"], "valor");
1325| 1| assert_eq!(meta["numero"], 42);
1326| 1| Ok(())
1327| 1| }
1328| |
1329| | #[test]
1330| 1| fn insert_com_session_id() -> TestResult {
1331| 1| let conn = setup_conn()?;
^0
1332| 1| let mut m = new_memory("mem-session");
1333| 1| m.session_id = Some("sessao-xyz".to_string());
1334| 1| let id = insert(&conn, &m)?;
^0
1335| |
1336| 1| let row = read_full(&conn, id)?.ok_or("mem-session should exist")?;
^0 ^0
1337| 1| assert_eq!(row.session_id, Some("sessao-xyz".to_string()));
1338| 1| Ok(())
1339| 1| }
1340| |
1341| | #[test]
1342| 1| fn delete_vec_for_nonexistent_id_does_not_fail() -> TestResult {
1343| 1| let conn = setup_conn()?;
^0
1344| 1| let result = delete_vec(&conn, 99999);
1345| 1| assert!(result.is_ok());
1346| 1| Ok(())
1347| 1| }
1348| |
1349| | #[test]
1350| 1| fn preprocess_fts_query_no_separators() {
1351| 1| assert_eq!(preprocess_fts_query("hello"), "hello*");
1352| 1| assert_eq!(preprocess_fts_query("hello world"), "hello* world*");
1353| 1| }
1354| |
1355| | #[test]
1356| 1| fn preprocess_fts_query_with_hyphens() {
1357| 1| let result = preprocess_fts_query("graphrag-precompact");
1358| 1| assert!(result.contains("\"graphrag precompact\""));
1359| 1| assert!(result.contains("graphrag*"));
1360| 1| assert!(result.contains("precompact*"));
1361| 1| }
1362| |
1363| | #[test]
1364| 1| fn preprocess_fts_query_with_dots() {
1365| 1| let result = preprocess_fts_query("v1.0.44");
1366| 1| assert!(result.contains("\"v1 0 44\""));
1367| 1| assert!(result.contains("v1*"));
1368| 1| assert!(result.contains("44*"));
1369| 1| }
1370| |
1371| | #[test]
1372| 1| fn preprocess_fts_query_with_mixed_separators() {
1373| 1| let result = preprocess_fts_query("graphrag-precompact.sh");
1374| 1| assert!(result.contains("\"graphrag precompact sh\""));
1375| 1| assert!(result.contains("graphrag*"));
1376| 1| }
1377| |
1378| | #[test]
1379| 1| fn preprocess_fts_query_empty_and_whitespace() {
1380| 1| assert_eq!(preprocess_fts_query(""), "");
1381| 1| assert_eq!(preprocess_fts_query(" "), "");
1382| 1| }
1383| |
1384| | #[test]
1385| 1| fn preprocess_fts_query_strips_quotes() {
1386| 1| let result = preprocess_fts_query(r#"hello "world"#);
1387| 1| assert!(result.contains("hello*"));
1388| 1| assert!(result.contains("world*"));
1389| 1| }
1390| |
1391| | #[test]
1392| 1| fn preprocess_fts_query_strips_asterisks() {
1393| 1| assert_eq!(preprocess_fts_query("test*"), "test*");
1394| 1| }
1395| |
1396| | #[test]
1397| 1| fn preprocess_fts_query_strips_parens() {
1398| 1| let result = preprocess_fts_query("(hello)");
1399| 1| assert!(result.contains("hello*"));
1400| 1| assert!(!result.contains('('));
1401| 1| }
1402| |
1403| | #[test]
1404| 1| fn preprocess_fts_query_filters_fts_keywords() {
1405| 1| let result = preprocess_fts_query("foo OR bar");
1406| 1| assert!(result.contains("foo*"));
1407| 1| assert!(result.contains("bar*"));
1408| 1| assert!(!result.contains("OR*"));
1409| 1| }
1410| |
1411| | #[test]
1412| 1| fn preprocess_fts_query_only_fts_keywords() {
1413| 1| assert_eq!(preprocess_fts_query("OR AND NOT"), "");
1414| 1| }
1415| |
1416| | #[test]
1417| 1| fn preprocess_fts_query_keywords_with_separators() {
1418| 1| let result = preprocess_fts_query("hello-OR-world");
1419| 1| assert!(result.contains("hello*"));
1420| 1| assert!(result.contains("world*"));
1421| 1| assert!(!result.contains("OR*"));
1422| 1| }
1423| |
1424| | #[test]
1425| 1| fn fts_search_finds_compound_term_with_hyphen() -> TestResult {
1426| 1| let conn = setup_conn()?;
^0
1427| 1| let mut m = new_memory("mem-compound");
1428| 1| m.body = "the graphrag-precompact script runs daily".to_string();
1429| 1| insert(&conn, &m)?;
^0
1430| 1| conn.execute_batch(
1431| 1| "INSERT INTO fts_memories(rowid, name, description, body)
1432| 1| SELECT id, name, description, body FROM memories WHERE deleted_at IS NULL",
1433| 0| )?;
1434| 1| let rows = fts_search(&conn, "graphrag-precompact", "global", None, 10)?;
^0
1435| 1| assert!(!rows.is_empty(), "should find compound hyphenated term");
^0
1436| 1| Ok(())
1437| 1| }
1438| |
1439| | #[test]
1440| 1| fn find_by_name_any_state_returns_deleted_flag() -> TestResult {
1441| 1| let conn = setup_conn()?;
^0
1442| 1| let m = new_memory("mem-soft-del");
1443| 1| let id = insert(&conn, &m)?;
^0
1444| 1| conn.execute(
1445| 1| "UPDATE memories SET deleted_at = unixepoch() WHERE id = ?1",
1446| 1| rusqlite::params![id],
1447| 0| )?;
1448| 1| let result = find_by_name_any_state(&conn, "global", "mem-soft-del")?;
^0
1449| 1| assert_eq!(result, Some((id, true)));
1450| 1| Ok(())
1451| 1| }
1452| |
1453| | #[test]
1454| 1| fn find_by_name_any_state_returns_not_deleted() -> TestResult {
1455| 1| let conn = setup_conn()?;
^0
1456| 1| let m = new_memory("mem-active");
1457| 1| let id = insert(&conn, &m)?;
^0
1458| 1| let result = find_by_name_any_state(&conn, "global", "mem-active")?;
^0
1459| 1| assert_eq!(result, Some((id, false)));
1460| 1| Ok(())
1461| 1| }
1462| |
1463| | #[test]
1464| 1| fn find_by_name_any_state_returns_none_when_absent() -> TestResult {
1465| 1| let conn = setup_conn()?;
^0
1466| 1| let result = find_by_name_any_state(&conn, "global", "does-not-exist")?;
^0
1467| 1| assert!(result.is_none());
1468| 1| Ok(())
1469| 1| }
1470| |
1471| | #[test]
1472| 1| fn clear_deleted_at_restores_memory() -> TestResult {
1473| 1| let conn = setup_conn()?;
^0
1474| 1| let m = new_memory("mem-restore");
1475| 1| let id = insert(&conn, &m)?;
^0
1476| 1| conn.execute(
1477| 1| "UPDATE memories SET deleted_at = unixepoch() WHERE id = ?1",
1478| 1| rusqlite::params![id],
1479| 0| )?;
1480| | // Soft-deleted: find_by_name should return None.
1481| 1| assert!(find_by_name(&conn, "global", "mem-restore")?.is_none());
^0
1482| 1| clear_deleted_at(&conn, id)?;
^0
1483| | // Restored: find_by_name should return Some again.
1484| 1| let found = find_by_name(&conn, "global", "mem-restore")?;
^0
1485| 1| assert!(found.is_some());
1486| 1| assert_eq!(found.unwrap().0, id);
1487| 1| Ok(())
1488| 1| }
1489| |}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/storage/urls.rs:
1| |//! Persistence for URLs extracted from memory bodies.
2| |//!
3| |//! Manages the `memory_urls` table: insert, deduplicate, and query URLs
4| |//! linked to a specific memory record.
5| |
6| |use crate::errors::AppError;
7| |use rusqlite::Connection;
8| |
9| |/// URL extracted from a memory body.
10| |pub struct MemoryUrl {
11| | pub url: String,
12| | pub offset: Option<i64>,
13| |}
14| |
15| |/// Insere uma URL na tabela `memory_urls`. Ignora duplicatas silenciosamente.
16| 7|pub fn insert_url(conn: &Connection, memory_id: i64, entry: &MemoryUrl) -> Result<(), AppError> {
17| 7| conn.execute(
18| 7| "INSERT OR IGNORE INTO memory_urls (memory_id, url, url_offset) VALUES (?1, ?2, ?3)",
19| 7| rusqlite::params![memory_id, entry.url, entry.offset],
20| 0| )?;
21| 7| Ok(())
22| 7|}
23| |
24| |/// Inserts multiple URLs for a memory. Returns the count inserted (duplicates ignored).
25| |/// Individual errors are logged as warn and not propagated — non-critical path.
26| 1|pub fn insert_urls(conn: &Connection, memory_id: i64, urls: &[MemoryUrl]) -> usize {
27| 1| let mut inserted = 0usize;
28| 4| for entry in urls {
^3
29| 3| match insert_url(conn, memory_id, entry) {
30| | Ok(()) => {
31| 3| let changed = conn.changes();
32| 3| if changed > 0 {
33| 2| inserted += 1;
34| 2| }
^1
35| | }
36| 0| Err(e) => {
37| 0| tracing::warn!(target: "storage", url = %entry.url, error = %e, "url persistence failed");
38| | }
39| | }
40| | }
41| 1| inserted
42| 1|}
43| |
44| |/// Lists all URLs associated with a memory.
45| 4|pub fn list_by_memory(conn: &Connection, memory_id: i64) -> Result<Vec<MemoryUrl>, AppError> {
46| 4| let mut stmt = conn.prepare_cached(
47| 4| "SELECT url, url_offset FROM memory_urls WHERE memory_id = ?1 ORDER BY id",
48| 0| )?;
49| 4| let rows = stmt.query_map(rusqlite::params![memory_id], |row| {
^3
50| | Ok(MemoryUrl {
51| 3| url: row.get(0)?,
^0
52| 3| offset: row.get(1)?,
^0
53| | })
54| 3| })?;
^0
55| 4| let mut result = Vec::with_capacity(8);
56| 7| for row in rows {
^3
57| 3| result.push(row?);
^0
58| | }
59| 4| Ok(result)
60| 4|}
61| |
62| |/// Removes all URLs for a memory.
63| 1|pub fn delete_by_memory(conn: &Connection, memory_id: i64) -> Result<(), AppError> {
64| 1| conn.execute(
65| 1| "DELETE FROM memory_urls WHERE memory_id = ?1",
66| 1| rusqlite::params![memory_id],
67| 0| )?;
68| 1| Ok(())
69| 1|}
70| |
71| |#[cfg(test)]
72| |mod tests {
73| | use super::*;
74| | use rusqlite::Connection;
75| | use tempfile::TempDir;
76| |
77| | type TestResult = Result<(), Box<dyn std::error::Error>>;
78| |
79| 4| fn setup_db() -> Result<(TempDir, Connection), Box<dyn std::error::Error>> {
80| 4| crate::storage::connection::register_vec_extension();
81| 4| let tmp = TempDir::new()?;
^0
82| 4| let db_path = tmp.path().join("test.db");
83| 4| let mut conn = Connection::open(&db_path)?;
^0
84| 4| crate::migrations::runner().run(&mut conn)?;
^0
85| 4| Ok((tmp, conn))
86| 4| }
87| |
88| 4| fn insert_test_memory(conn: &Connection) -> Result<i64, Box<dyn std::error::Error>> {
89| 4| conn.execute(
90| 4| "INSERT INTO memories (name, type, description, body, body_hash) VALUES ('mem', 'user', 'desc', 'body', 'hash')",
91| 4| [],
92| 0| )?;
93| 4| Ok(conn.last_insert_rowid())
94| 4| }
95| |
96| | #[test]
97| 1| fn insert_url_persists_and_list_returns() -> TestResult {
98| 1| let (_tmp, conn) = setup_db()?;
^0
99| 1| let mem_id = insert_test_memory(&conn)?;
^0
100| |
101| 1| insert_url(
102| 1| &conn,
103| 1| mem_id,
104| 1| &MemoryUrl {
105| 1| url: "https://example.com/page".to_string(),
106| 1| offset: Some(5),
107| 1| },
108| 0| )?;
109| |
110| 1| let urls = list_by_memory(&conn, mem_id)?;
^0
111| 1| assert_eq!(urls.len(), 1);
112| 1| assert_eq!(urls[0].url, "https://example.com/page");
113| 1| assert_eq!(urls[0].offset, Some(5));
114| 1| Ok(())
115| 1| }
116| |
117| | #[test]
118| 1| fn insert_url_duplicate_ignored() -> TestResult {
119| 1| let (_tmp, conn) = setup_db()?;
^0
120| 1| let mem_id = insert_test_memory(&conn)?;
^0
121| |
122| 1| let entry = MemoryUrl {
123| 1| url: "https://example.com/dup".to_string(),
124| 1| offset: None,
125| 1| };
126| 1| insert_url(&conn, mem_id, &entry)?;
^0
127| 1| insert_url(&conn, mem_id, &entry)?;
^0
128| |
129| 1| let urls = list_by_memory(&conn, mem_id)?;
^0
130| 1| assert_eq!(urls.len(), 1, "duplicata deve ser ignorada");
^0
131| 1| Ok(())
132| 1| }
133| |
134| | #[test]
135| 1| fn insert_urls_returns_inserted_count() -> TestResult {
136| 1| let (_tmp, conn) = setup_db()?;
^0
137| 1| let mem_id = insert_test_memory(&conn)?;
^0
138| |
139| 1| let batch = vec![
140| 1| MemoryUrl {
141| 1| url: "https://alpha.example.com".to_string(),
142| 1| offset: Some(0),
143| 1| },
144| 1| MemoryUrl {
145| 1| url: "https://beta.example.com".to_string(),
146| 1| offset: Some(10),
147| 1| },
148| 1| MemoryUrl {
149| 1| url: "https://alpha.example.com".to_string(),
150| 1| offset: Some(0),
151| 1| },
152| | ];
153| 1| let count = insert_urls(&conn, mem_id, &batch);
154| 1| assert_eq!(count, 2, "only 2 unique entries must be inserted");
^0
155| 1| Ok(())
156| 1| }
157| |
158| | #[test]
159| 1| fn delete_by_memory_removes_all_urls() -> TestResult {
160| 1| let (_tmp, conn) = setup_db()?;
^0
161| 1| let mem_id = insert_test_memory(&conn)?;
^0
162| |
163| 1| insert_url(
164| 1| &conn,
165| 1| mem_id,
166| 1| &MemoryUrl {
167| 1| url: "https://to-delete.example.com".to_string(),
168| 1| offset: None,
169| 1| },
170| 0| )?;
171| 1| assert_eq!(list_by_memory(&conn, mem_id)?.len(), 1);
^0
172| |
173| 1| delete_by_memory(&conn, mem_id)?;
^0
174| 1| assert_eq!(list_by_memory(&conn, mem_id)?.len(), 0);
^0
175| 1| Ok(())
176| 1| }
177| |}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/storage/utils.rs:
1| |//! Storage utility helpers shared across the storage sub-modules.
2| |
3| |use crate::constants::{MAX_SQLITE_BUSY_RETRIES, SQLITE_BUSY_BASE_DELAY_MS};
4| |use crate::errors::AppError;
5| |use rusqlite::ErrorCode;
6| |use std::thread;
7| |use std::time::Duration;
8| |
9| |/// Returns `true` when `err` wraps an `SQLITE_BUSY` (or `SQLITE_LOCKED`)
10| |/// condition reported by rusqlite.
11| |///
12| |/// Both `SQLITE_BUSY` (`ErrorCode::DatabaseBusy`) and `SQLITE_LOCKED`
13| |/// (`ErrorCode::DatabaseLocked`) indicate that the write cannot proceed
14| |/// immediately due to WAL concurrency. We treat both as transient and
15| |/// eligible for retry.
16| 11|pub fn is_sqlite_busy(err: &AppError) -> bool {
17| 9| match err {
18| 9| AppError::Database(rusqlite::Error::SqliteFailure(e, _)) => {
19| 9| e.code == ErrorCode::DatabaseBusy || e.code == ErrorCode::DatabaseLocked
^1
20| | }
21| 2| _ => false,
22| | }
23| 11|}
24| |
25| |/// Executes `op` up to `MAX_SQLITE_BUSY_RETRIES` times with exponential
26| |/// backoff whenever the operation fails with `SQLITE_BUSY` / `SQLITE_LOCKED`.
27| |///
28| |/// Delay schedule (base = `SQLITE_BUSY_BASE_DELAY_MS`):
29| |/// - attempt 1 → `base` ms
30| |/// - attempt 2 → `base * 2` ms
31| |/// - attempt 3 → `base * 4` ms
32| |/// - attempt 4 → `base * 8` ms
33| |/// - attempt 5 → `base * 16` ms
34| |///
35| |/// After all retries are exhausted the last `SQLITE_BUSY` error is converted
36| |/// to [`AppError::DbBusy`] so callers can route on exit-code `15`.
37| 17|pub fn with_busy_retry<F>(op: F) -> Result<(), AppError>
38| 17|where
39| 17| F: Fn() -> Result<(), AppError>,
40| |{
41| 24| for attempt in 0..MAX_SQLITE_BUSY_RETRIES {
^23
42| 23| match op() {
43| 15| Ok(()) => return Ok(()),
44| 8| Err(e) if is_sqlite_busy(&e) => {
^7 ^7
45| 7| if crate::retry::is_kill_switch_active() {
46| 0| tracing::warn!(target: "storage", "SQLITE_GRAPHRAG_DISABLE_RETRY=1, propagating SQLITE_BUSY immediately");
47| 0| return Err(e);
48| 7| }
49| 7| let base_ms = SQLITE_BUSY_BASE_DELAY_MS * (1u64 << attempt);
50| 7| let half = base_ms / 2;
51| 7| let jitter = if half == 0 { 0 } else { fastrand::u64(0..half) };
^0
52| 7| let delay_ms = half + jitter;
53| 7| tracing::debug!(
54| | target: "storage",
55| 0| attempt = attempt + 1,
56| | attempt_max = MAX_SQLITE_BUSY_RETRIES,
57| | delay_ms,
58| 0| "SQLITE_BUSY retry with half-jitter"
59| | );
60| 7| thread::sleep(Duration::from_millis(delay_ms));
61| | }
62| 1| Err(other) => return Err(other),
63| | }
64| | }
65| |
66| 1| tracing::error!(
67| | target: "storage",
68| | retries = MAX_SQLITE_BUSY_RETRIES,
69| 0| "SQLITE_BUSY exhausted all retries"
70| | );
71| 1| Err(AppError::DbBusy(format!(
72| 1| "SQLITE_BUSY after {MAX_SQLITE_BUSY_RETRIES} retries"
73| 1| )))
74| 17|}
75| |
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::atomic::{AtomicU32, Ordering};
    use std::sync::Arc;

    /// Builds an `AppError::Database` carrying a `SqliteFailure` whose code is
    /// `ErrorCode::DatabaseBusy`, so the retry helpers can be exercised
    /// without opening a SQLite connection.
    fn make_busy_error() -> AppError {
        AppError::Database(rusqlite::Error::SqliteFailure(
            rusqlite::ffi::Error {
                code: ErrorCode::DatabaseBusy,
                extended_code: 5,
            },
            None,
        ))
    }

    /// Same as [`make_busy_error`] but with `ErrorCode::DatabaseLocked`.
    fn make_locked_error() -> AppError {
        AppError::Database(rusqlite::Error::SqliteFailure(
            rusqlite::ffi::Error {
                code: ErrorCode::DatabaseLocked,
                extended_code: 6,
            },
            None,
        ))
    }

    #[test]
    fn is_sqlite_busy_detects_database_busy() {
        let busy = make_busy_error();
        assert!(is_sqlite_busy(&busy));
    }

    #[test]
    fn is_sqlite_busy_detects_database_locked() {
        let locked = make_locked_error();
        assert!(is_sqlite_busy(&locked));
    }

    #[test]
    fn is_sqlite_busy_rejects_other_errors() {
        let unrelated = AppError::Validation("invalid field".into());
        assert!(!is_sqlite_busy(&unrelated));
    }

    #[test]
    fn with_busy_retry_propagates_non_busy_error() {
        let attempts = Arc::new(AtomicU32::new(0));
        let counter = Arc::clone(&attempts);

        let outcome = with_busy_retry(|| {
            counter.fetch_add(1, Ordering::SeqCst);
            Err(AppError::Validation("campo x".into()))
        });

        // A non-busy error must surface after a single invocation.
        assert_eq!(attempts.load(Ordering::SeqCst), 1);
        assert!(matches!(outcome, Err(AppError::Validation(_))));
    }

    #[test]
    fn with_busy_retry_succeeds_on_third_attempt() {
        let attempts = Arc::new(AtomicU32::new(0));
        let counter = Arc::clone(&attempts);

        // The first two invocations report SQLITE_BUSY; the third succeeds.
        let outcome = with_busy_retry(|| {
            let seen_before = counter.fetch_add(1, Ordering::SeqCst);
            if seen_before < 2 {
                Err(make_busy_error())
            } else {
                Ok(())
            }
        });

        assert_eq!(attempts.load(Ordering::SeqCst), 3);
        assert!(outcome.is_ok(), "expected Ok after 3rd attempt");
    }

    #[test]
    fn busy_retry_jitter_in_range() {
        // This test does not go through `with_busy_retry`; it re-derives the
        // half-jitter formula for attempt=2 and checks the invariant that the
        // delay stays within [base/2, base).
        // attempt=2 → base_ms = SQLITE_BUSY_BASE_DELAY_MS * 4; half = base_ms / 2.
        let base_ms = SQLITE_BUSY_BASE_DELAY_MS * (1u64 << 2);
        let half = base_ms / 2;
        for _ in 0..100 {
            let delay_ms = half + fastrand::u64(0..half);
            assert!(
                delay_ms >= half && delay_ms < base_ms,
                "delay_ms {delay_ms} out of [{half}, {base_ms})"
            );
        }
    }

    #[test]
    fn with_busy_retry_returns_db_busy_after_all_retries() {
        let attempts = Arc::new(AtomicU32::new(0));
        let counter = Arc::clone(&attempts);

        // Every invocation reports SQLITE_BUSY, so the retry budget runs out.
        let outcome = with_busy_retry(|| {
            counter.fetch_add(1, Ordering::SeqCst);
            Err(make_busy_error())
        });

        assert_eq!(
            attempts.load(Ordering::SeqCst),
            MAX_SQLITE_BUSY_RETRIES,
            "must attempt exactly MAX_SQLITE_BUSY_RETRIES times"
        );
        assert!(
            matches!(outcome, Err(AppError::DbBusy(_))),
            "must convert to DbBusy after exhausting retries"
        );
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/storage/versions.rs:
1| |//! Version history storage for memory records.
2| |//!
3| |//! Manages the `memory_versions` table: inserts a new version snapshot on
4| |//! every update so the `restore` command can roll back to any prior body.
5| |
6| |use crate::errors::AppError;
7| |use rusqlite::{params, Connection};
8| |
9| |#[allow(clippy::too_many_arguments)]
10| 0|pub fn insert_version(
11| 0| conn: &Connection,
12| 0| memory_id: i64,
13| 0| version: i64,
14| 0| name: &str,
15| 0| memory_type: &str,
16| 0| description: &str,
17| 0| body: &str,
18| 0| metadata: &str,
19| 0| changed_by: Option<&str>,
20| 0| change_reason: &str,
21| 0|) -> Result<(), AppError> {
22| 0| conn.execute(
23| 0| "INSERT INTO memory_versions
24| 0| (memory_id, version, name, type, description, body, metadata, changed_by, change_reason)
25| 0| VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)",
26| 0| params![
27| | memory_id,
28| | version,
29| | name,
30| | memory_type,
31| | description,
32| | body,
33| | metadata,
34| | changed_by,
35| | change_reason
36| | ],
37| 0| )?;
38| 0| Ok(())
39| 0|}
40| |
41| 0|pub fn next_version(conn: &Connection, memory_id: i64) -> Result<i64, AppError> {
42| 0| let v: i64 = conn.query_row(
43| 0| "SELECT COALESCE(MAX(version), 0) + 1 FROM memory_versions WHERE memory_id = ?1",
44| 0| params![memory_id],
45| 0| |r| r.get(0),
46| 0| )?;
47| 0| Ok(v)
48| 0|}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/system_load.rs:
1| |//! G28-D: system load average observation before spawning LLM subprocesses.
2| |//!
3| |//! The 2026-06-03 incident saturated a 10-CPU host with load 276 because
4| |//! parallel `enrich` workers kept spawning `claude -p` / `codex exec`
5| |//! children even when the system was already at saturation. This module
6| |//! exposes a single helper that returns `true` when the 1-minute load
7| |//! average is above `2 × ncpus` (the conservative threshold the G28-D
8| |//! original discussion recommended).
9| |//!
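//! Uses `sysinfo::System::load_average()` which is already a transitive
//! dependency of the project. The read is cheap (single syscall on
//! Linux). A Mutex-guarded timestamp records when the current one-second
//! refresh window started; note that the read itself is currently issued
//! on every call, because the timestamp result is not consulted.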
13| |
14| |use std::sync::Mutex;
15| |use std::time::{Duration, Instant};
16| |
/// Timestamp of the most recent refresh window start, or `None` before the
/// first call to `ensure_fresh`. Guarded by a mutex so all callers share it.
static LAST_REFRESH: Mutex<Option<Instant>> = Mutex::new(None);
18| |
19| |/// Returns the 1-minute load average as reported by the OS.
20| |///
21| |/// On platforms where `sysinfo` cannot read load average (very old Linux
22| |/// without /proc/loadavg), returns `0.0` so callers default to "no
23| |/// saturation detected".
24| 2|pub fn load_average_one() -> f64 {
25| 2| let _ = ensure_fresh();
26| 2| sysinfo::System::load_average().one
27| 2|}
28| |
/// Reports how many logical CPUs the runtime can detect.
///
/// Falls back to `4` when `std::thread::available_parallelism` fails.
/// Used together with [`load_average_one`] to apply a saturation check.
pub fn ncpus() -> usize {
    match std::thread::available_parallelism() {
        Ok(count) => count.get(),
        Err(_) => 4,
    }
}
37| |
38| |/// G28-D: returns `true` when the 1-minute load average exceeds
39| |/// `2 × ncpus` (the conservative threshold originally proposed in the
40| |/// G28 audit). The default threshold can be overridden by the
41| |/// `SQLITE_GRAPHRAG_MAX_LOAD_PER_NCPU` env var.
42| 1|pub fn is_system_saturated() -> bool {
43| 1| let load = load_average_one();
44| 1| let n = ncpus() as f64;
45| 1| let multiplier: f64 = std::env::var("SQLITE_GRAPHRAG_MAX_LOAD_PER_NCPU")
46| 1| .ok()
47| 1| .and_then(|v| v.parse().ok())
^0 ^0
48| 1| .unwrap_or(2.0);
49| 1| load > n * multiplier
50| 1|}
51| |
52| |/// Throttles the cached refresh timestamp so we read /proc/loadavg at
53| |/// most once per second across all callers. The function returns the
54| |/// previous timestamp (or None on first call) so the caller can decide
55| |/// whether to actually invoke the syscall.
56| 4|fn ensure_fresh() -> Option<Instant> {
57| 4| let mut guard = LAST_REFRESH.lock().expect("loadavg mutex poisoned");
58| 4| let now = Instant::now();
59| 4| let should_refresh = guard
60| 4| .as_ref()
61| 4| .is_none_or(|last| now.duration_since(*last) > Duration::from_secs(1));
^3 ^3
62| 4| let prev = guard.as_ref().copied();
63| 4| if should_refresh {
64| 1| *guard = Some(now);
65| 3| }
66| 4| prev
67| 4|}
68| |
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn ncpus_is_at_least_one() {
        let detected = ncpus();
        assert!(detected >= 1);
    }

    #[test]
    fn load_average_is_non_negative() {
        let load = load_average_one();
        assert!(load >= 0.0);
    }

    #[test]
    fn saturation_default_threshold_is_two() {
        // G28-D default: 2 × ncpus. Operators can lower it via env var
        // when running on contended CI runners.
        let raw = std::env::var("SQLITE_GRAPHRAG_MAX_LOAD_PER_NCPU").ok();
        let env_default = raw
            .and_then(|value| value.parse::<f64>().ok())
            .unwrap_or(2.0);
        assert!(env_default >= 1.0);
    }

    #[test]
    fn saturation_check_does_not_panic() {
        // Only the absence of a panic is checked; the answer is ignored.
        let _ = is_system_saturated();
    }

    #[test]
    fn ensure_fresh_returns_previous_then_sets_new() {
        // Only the first call in the process observes `None`; once it has
        // stored a timestamp, the next call must hand it back as `Some`.
        if ensure_fresh().is_none() {
            assert!(ensure_fresh().is_some());
        }
    }
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/telemetry.rs:
1| |//! Centralized tracing subscriber initialization.
2| |//!
3| |//! Configures the global subscriber with JSON or pretty format,
4| |//! installs the panic hook and the log-to-tracing bridge.
5| |
6| |use tracing_subscriber::EnvFilter;
7| |
8| |/// Initializes the global tracing subscriber, panic hook, and log bridge.
9| |///
10| |/// Must be called exactly once, before any tracing events are emitted.
11| |/// After this call, panics on any thread produce `tracing::error!` events,
12| |/// and `log` crate events from dependencies (refinery, ureq, ort) are
13| |/// forwarded to the tracing subscriber.
14| 0|pub fn init_tracing(log_level: &str, log_format: &str) {
15| | // TR02: the log→tracing bridge is activated automatically by
16| | // tracing-subscriber's built-in `tracing-log` feature (default).
17| | // Calling LogTracer::init() separately would conflict with the
18| | // global logger that tracing-subscriber installs via .init().
19| 0| let use_ansi = crate::terminal::should_use_ansi();
20| |
21| 0| if log_format == "json" {
22| 0| tracing_subscriber::fmt()
23| 0| .json()
24| 0| .with_ansi(false)
25| 0| .with_thread_ids(true)
26| 0| .with_thread_names(true)
27| 0| .with_env_filter(EnvFilter::new(log_level))
28| 0| .with_writer(std::io::stderr)
29| 0| .init();
30| 0| } else {
31| 0| tracing_subscriber::fmt()
32| 0| .with_ansi(use_ansi)
33| 0| .with_env_filter(EnvFilter::new(log_level))
34| 0| .with_writer(std::io::stderr)
35| 0| .init();
36| 0| }
37| |
38| | // TR05: confirm effective filter after init
39| 0| tracing::debug!(
40| | target: "telemetry",
41| | filter = %log_level,
42| | format = %log_format,
43| | ansi = use_ansi,
44| 0| "tracing subscriber initialized"
45| | );
46| |
47| | // TR01: panic hook emitting structured tracing::error!
48| 0| let prev_hook = std::panic::take_hook();
49| 0| std::panic::set_hook(Box::new(move |info| {
50| 0| let payload = info
51| 0| .payload()
52| 0| .downcast_ref::<&str>()
53| 0| .copied()
54| 0| .or_else(|| info.payload().downcast_ref::<String>().map(|s| s.as_str()))
55| 0| .unwrap_or("<non-string panic>");
56| 0| let location = info
57| 0| .location()
58| 0| .map(|l| format!("{}:{}:{}", l.file(), l.line(), l.column()));
59| 0| tracing::error!(
60| | target: "panic",
61| | message = %payload,
62| 0| location = location.as_deref().unwrap_or("unknown"),
63| 0| "thread panicked"
64| | );
65| 0| prev_hook(info);
66| 0| }));
67| 0|}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/terminal.rs:
1| |//! Cross-platform terminal initialization: UTF-8, ANSI colors, NO_COLOR.
2| |
/// Prepares the console for UTF-8 output and ANSI escape sequences.
///
/// Delegates to `init_windows_console` on Windows. On every other platform
/// the body compiles to nothing, because modern Unix terminals handle both
/// natively.
pub fn init_console() {
    #[cfg(windows)]
    {
        init_windows_console();
    }
}
10| |
/// Windows-only console setup: switches the input and output code pages to
/// UTF-8 and tries to enable virtual-terminal (ANSI) processing on the
/// stdout and stderr handles.
///
/// All failures are tolerated: return values of the code-page calls are not
/// inspected, handles that are null or invalid are skipped, and a failed
/// `SetConsoleMode` is ignored.
#[cfg(windows)]
fn init_windows_console() {
    use windows_sys::Win32::Foundation::{HANDLE, INVALID_HANDLE_VALUE};
    use windows_sys::Win32::System::Console::{
        GetConsoleMode, GetStdHandle, SetConsoleCP, SetConsoleMode, SetConsoleOutputCP,
        ENABLE_VIRTUAL_TERMINAL_PROCESSING, STD_ERROR_HANDLE, STD_OUTPUT_HANDLE,
    };
    // Windows code page identifier for UTF-8.
    const CP_UTF8: u32 = 65001;

    // SAFETY: Win32 console functions are safe to call from a single-threaded
    // context before any output occurs. GetStdHandle returns
    // INVALID_HANDLE_VALUE on failure (checked below); SetConsoleMode failure
    // is silently tolerated so the CLI degrades to plain text.
    // G29 (v1.0.68): HANDLE was `isize` in windows-sys <= 0.52 and became
    // `*mut c_void` in >= 0.59; the previous comparison `handle != 0 &&
    // handle as isize != -1` only worked for the old type and now fails
    // compilation. Replaced with the type-safe idiom `!handle.is_null() &&
    // handle != INVALID_HANDLE_VALUE`, which works for both type eras and
    // also catches the distinct INVALID_HANDLE_VALUE sentinel ((HANDLE)-1).
    unsafe {
        SetConsoleOutputCP(CP_UTF8);
        SetConsoleCP(CP_UTF8);

        for handle_id in [STD_OUTPUT_HANDLE, STD_ERROR_HANDLE] {
            let handle: HANDLE = GetStdHandle(handle_id);
            if !handle.is_null() && handle != INVALID_HANDLE_VALUE {
                let mut mode: u32 = 0;
                // Only touch the mode when the handle is a real console
                // (GetConsoleMode returns non-zero on success).
                if GetConsoleMode(handle, &mut mode) != 0 {
                    let _ = SetConsoleMode(handle, mode | ENABLE_VIRTUAL_TERMINAL_PROCESSING);
                }
            }
        }
    }
}
45| |
/// Returns whether ANSI escape codes should be emitted to stderr.
///
/// Precedence:
/// 1. `NO_COLOR` set to a non-empty value → false (<https://no-color.org>
///    standard; an empty `NO_COLOR` is treated as unset, as the standard
///    requires)
/// 2. `CLICOLOR_FORCE=1` → true (force colors even without TTY)
/// 3. stderr is a terminal → true
/// 4. fallback → false
pub fn should_use_ansi() -> bool {
    // `var_os` is used so a non-Unicode value still counts as "set".
    let no_color_requested = std::env::var_os("NO_COLOR").is_some_and(|value| !value.is_empty());
    if no_color_requested {
        return false;
    }
    if std::env::var("CLICOLOR_FORCE").ok().as_deref() == Some("1") {
        return true;
    }
    std::io::IsTerminal::is_terminal(&std::io::stderr())
}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/tokenizer.rs:
1| |//! Token-count utilities for embedding input sizing.
2| |//!
3| |//! Provides fast approximate token counting used to decide whether a body
4| |//! fits in a single chunk or requires the multi-chunk splitter.
5| |
6| |use crate::constants::PASSAGE_PREFIX;
7| |use crate::errors::AppError;
8| |use fastembed::{EmbeddingModel, TextEmbedding};
9| |use huggingface_hub::api::sync::ApiBuilder;
10| |use std::path::{Path, PathBuf};
11| |use std::sync::OnceLock;
12| |use tokenizers::Tokenizer;
13| |
/// Tokenizer plus the length limit read alongside it; built once by
/// `load_runtime` and then shared for the rest of the process.
struct TokenizerRuntime {
    /// Tokenizer deserialized from the model's `tokenizer.json`.
    tokenizer: Tokenizer,
    /// Value of `model_max_length` from `tokenizer_config.json`.
    model_max_length: usize,
}

/// Process-wide slot for the lazily loaded [`TokenizerRuntime`].
static TOKENIZER_RUNTIME: OnceLock<TokenizerRuntime> = OnceLock::new();
20| |
21| |/// Returns the process-wide [`Tokenizer`] singleton, initializing it on first call.
22| |///
23| |/// # Errors
24| |/// Returns `Err` when the tokenizer files cannot be loaded from `models_dir`.
25| 0|pub fn get_tokenizer(models_dir: &Path) -> Result<&'static Tokenizer, AppError> {
26| 0| Ok(&get_runtime(models_dir)?.tokenizer)
27| 0|}
28| |
29| |/// Returns the model's `model_max_length` from `tokenizer_config.json`.
30| |///
31| |/// # Errors
32| |/// Returns `Err` when the tokenizer files cannot be loaded or the field is missing.
33| 0|pub fn get_model_max_length(models_dir: &Path) -> Result<usize, AppError> {
34| 0| Ok(get_runtime(models_dir)?.model_max_length)
35| 0|}
36| |
37| |/// Counts the tokens produced by encoding `text` with the passage prefix.
38| |///
39| |/// Prepends `PASSAGE_PREFIX` before tokenizing so the count reflects the actual
40| |/// number of tokens consumed by the embedding model.
41| |///
42| |/// # Errors
43| |/// Returns `Err` when the tokenizer fails to encode the input.
44| 0|pub fn count_passage_tokens(tokenizer: &Tokenizer, text: &str) -> Result<usize, AppError> {
45| 0| let prefixed = format!("{PASSAGE_PREFIX}{text}");
46| 0| count_tokens(tokenizer, &prefixed)
47| 0|}
48| |
49| |/// Returns the byte-offset pairs `(start, end)` for each token in `text`.
50| |///
51| |/// The passage prefix is prepended before tokenizing; offsets in the returned
52| |/// vector are adjusted back to be relative to the original `text` slice.
53| |///
54| |/// # Errors
55| |/// Returns `Err` when the tokenizer fails to encode the input.
56| 0|pub fn passage_token_offsets(
57| 0| tokenizer: &Tokenizer,
58| 0| text: &str,
59| 0|) -> Result<Vec<(usize, usize)>, AppError> {
60| 0| let prefixed = format!("{PASSAGE_PREFIX}{text}");
61| 0| let prefix_len = PASSAGE_PREFIX.len();
62| 0| let encoding = tokenizer
63| 0| .encode(prefixed, true)
64| 0| .map_err(|e| AppError::Embedding(e.to_string()))?;
65| |
66| 0| let mut offsets = Vec::with_capacity(encoding.get_offsets().len());
67| 0| for &(start, end) in encoding.get_offsets() {
68| 0| if end <= start || end <= prefix_len {
69| 0| continue;
70| 0| }
71| |
72| 0| let adjusted_start = start.saturating_sub(prefix_len).min(text.len());
73| 0| let adjusted_end = end.saturating_sub(prefix_len).min(text.len());
74| |
75| 0| if adjusted_end > adjusted_start
76| 0| && text.is_char_boundary(adjusted_start)
77| 0| && text.is_char_boundary(adjusted_end)
78| 0| {
79| 0| offsets.push((adjusted_start, adjusted_end));
80| 0| }
81| | }
82| |
83| 0| if offsets.is_empty() && !text.is_empty() {
84| 0| offsets.push((0, text.len()));
85| 0| }
86| |
87| 0| Ok(offsets)
88| 0|}
89| |
90| 0|fn count_tokens(tokenizer: &Tokenizer, text: &str) -> Result<usize, AppError> {
91| 0| let encoding = tokenizer
92| 0| .encode(text, true)
93| 0| .map_err(|e| AppError::Embedding(e.to_string()))?;
94| 0| Ok(encoding.len())
95| 0|}
96| |
97| 0|fn get_runtime(models_dir: &Path) -> Result<&'static TokenizerRuntime, AppError> {
98| 0| if let Some(runtime) = TOKENIZER_RUNTIME.get() {
99| 0| return Ok(runtime);
100| 0| }
101| |
102| 0| let runtime = load_runtime(models_dir)?;
103| 0| let _ = TOKENIZER_RUNTIME.set(runtime);
104| 0| Ok(TOKENIZER_RUNTIME
105| 0| .get()
106| 0| .expect("OnceLock::set succeeded above; get cannot fail in this single-init path"))
107| 0|}
108| |
109| 0|fn load_runtime(models_dir: &Path) -> Result<TokenizerRuntime, AppError> {
110| 0| let model_info = TextEmbedding::get_model_info(&EmbeddingModel::MultilingualE5Small)
111| 0| .map_err(|e| AppError::Embedding(e.to_string()))?;
112| |
113| 0| let cache_dir = std::env::var("HF_HOME")
114| 0| .map(PathBuf::from)
115| 0| .unwrap_or_else(|_| models_dir.to_path_buf());
116| 0| let endpoint =
117| 0| std::env::var("HF_ENDPOINT").unwrap_or_else(|_| "https://huggingface.co".to_string());
118| |
119| 0| let api = ApiBuilder::new()
120| 0| .with_cache_dir(cache_dir)
121| 0| .with_endpoint(endpoint)
122| 0| .with_progress(false)
123| 0| .build()
124| 0| .map_err(|e| AppError::Embedding(e.to_string()))?;
125| 0| let repo = api.model(model_info.model_code.clone());
126| |
127| 0| let tokenizer_bytes =
128| 0| std::fs::read(repo.get("tokenizer.json").map_err(map_hf_err)?).map_err(AppError::Io)?;
129| 0| let tokenizer_config_bytes =
130| 0| std::fs::read(repo.get("tokenizer_config.json").map_err(map_hf_err)?)
131| 0| .map_err(AppError::Io)?;
132| |
133| 0| let tokenizer =
134| 0| Tokenizer::from_bytes(tokenizer_bytes).map_err(|e| AppError::Embedding(e.to_string()))?;
135| 0| let tokenizer_config: serde_json::Value =
136| 0| serde_json::from_slice(&tokenizer_config_bytes).map_err(AppError::Json)?;
137| 0| let model_max_length = tokenizer_config["model_max_length"]
138| 0| .as_u64()
139| 0| .map(|n| n as usize)
140| 0| .or_else(|| {
141| 0| tokenizer_config["model_max_length"]
142| 0| .as_f64()
143| 0| .map(|n| n as usize)
144| 0| })
145| 0| .ok_or_else(|| {
146| 0| AppError::Embedding("tokenizer_config.json missing model_max_length field".into())
147| 0| })?;
148| |
149| 0| Ok(TokenizerRuntime {
150| 0| tokenizer,
151| 0| model_max_length,
152| 0| })
153| 0|}
154| |
155| 0|fn map_hf_err(err: huggingface_hub::api::sync::ApiError) -> AppError {
156| 0| AppError::Embedding(err.to_string())
157| 0|}
/Users/daniloteixeira/Dropbox/ai/dev/rust/linux/cli_sqlite-graphrag/src/tz.rs:
1| |//! Display timezone for `*_iso` fields in JSON output.
2| |//!
3| |//! Precedence (highest to lowest priority):
4| |//! 1. `--tz <IANA>` flag passed on the CLI
5| |//! 2. Env var `SQLITE_GRAPHRAG_DISPLAY_TZ`
6| |//! 3. Fallback UTC
7| |//!
8| |//! The timezone is initialized once via [`init`][crate::tz::init] and stored in
9| |//! `GLOBAL_TZ` (OnceLock). After initialization, [`format_iso`][crate::tz::format_iso] and
10| |//! [`epoch_to_iso`][crate::tz::epoch_to_iso] convert timestamps applying the chosen timezone.
11| |
12| |use crate::errors::AppError;
13| |use crate::i18n::validation;
14| |use chrono::{DateTime, TimeZone, Utc};
15| |use chrono_tz::Tz;
16| |use std::sync::OnceLock;
17| |
/// Display timezone chosen for the process; written at most once, by `init`
/// or lazily by `current_tz`.
static GLOBAL_TZ: OnceLock<Tz> = OnceLock::new();
19| |
20| |/// Resolves the timezone from the `SQLITE_GRAPHRAG_DISPLAY_TZ` env var.
21| |///
22| |/// Returns `Tz::UTC` if the variable is absent or empty.
23| |/// Returns a validation error if the value is an invalid IANA name.
24| 4|fn resolve_tz_from_env() -> Result<Tz, AppError> {
25| 4| match std::env::var("SQLITE_GRAPHRAG_DISPLAY_TZ") {
26| 3| Ok(v) if !v.trim().is_empty() => v
27| 3| .trim()
28| 3| .parse::<Tz>()
29| 3| .map_err(|_| AppError::Validation(validation::invalid_tz(v.trim()))),
^1 ^1
30| 1| _ => Ok(Tz::UTC),
31| | }
32| 4|}
33| |
34| |/// Initializes the global timezone.
35| |///
36| |/// `explicit` — value from the `--tz` CLI flag (already parsed).
37| |/// If `explicit` is `None`, tries `SQLITE_GRAPHRAG_DISPLAY_TZ`, then UTC.
38| |///
39| |/// Subsequent calls are silently ignored (OnceLock semantics).
40| |/// Returns an error only if `explicit` is `None` and the env var is invalid.
41| 0|pub fn init(explicit: Option<Tz>) -> Result<(), AppError> {
42| 0| let fuso = match explicit {
43| 0| Some(tz) => tz,
44| 0| None => resolve_tz_from_env()?,
45| | };
46| 0| let _ = GLOBAL_TZ.set(fuso);
47| 0| Ok(())
48| 0|}
49| |
50| |/// Returns the active timezone.
51| |///
52| |/// If [`init`] was never called, tries to read the env var; fallback UTC.
53| 7|pub fn current_tz() -> Tz {
54| 7| *GLOBAL_TZ.get_or_init(|| resolve_tz_from_env().unwrap_or(Tz::UTC))
^1 ^1 ^1
55| 7|}
56| |
57| |/// Formats a `DateTime<Utc>` using the global timezone.
58| |///
59| |/// Format: `%Y-%m-%dT%H:%M:%S%:z` (e.g. `2026-04-19T10:00:00+00:00` for UTC,
60| |/// `2026-04-19T07:00:00-03:00` for `America/Sao_Paulo`).
61| 7|pub fn format_iso(ts: DateTime<Utc>) -> String {
62| 7| let fuso = current_tz();
63| 7| ts.with_timezone(&fuso)
64| 7| .format("%Y-%m-%dT%H:%M:%S%:z")
65| 7| .to_string()
66| 7|}
67| |
68| |/// Converts a Unix epoch (seconds) to an ISO 8601 string with the global timezone.
69| |///
70| |/// Values outside the representable range return the fallback
71| |/// `"1970-01-01T00:00:00+00:00"`.
72| 9|pub fn epoch_to_iso(epoch: i64) -> String {
73| 9| Utc.timestamp_opt(epoch, 0)
74| 9| .single()
75| 9| .map(format_iso)
76| 9| .unwrap_or_else(|| "1970-01-01T00:00:00+00:00".to_string())
^2 ^2
77| 9|}
78| |
#[cfg(test)]
mod tests {
    use super::*;
    use serial_test::serial;

    // Tests that mutate `SQLITE_GRAPHRAG_DISPLAY_TZ` are marked `#[serial]`
    // so they never overlap each other. Tests without that marker must not
    // touch the variable, because they can run concurrently with these.

    #[test]
    #[serial]
    fn utc_default_when_env_missing() {
        // Remove variable to ensure UTC fallback
        std::env::remove_var("SQLITE_GRAPHRAG_DISPLAY_TZ");
        let result = resolve_tz_from_env().expect("must not fail with env absent");
        assert_eq!(result, Tz::UTC);
    }

    #[test]
    #[serial]
    fn env_valid_applies_timezone() {
        std::env::set_var("SQLITE_GRAPHRAG_DISPLAY_TZ", "America/Sao_Paulo");
        let result = resolve_tz_from_env().expect("America/Sao_Paulo is valid");
        assert_eq!(result.name(), "America/Sao_Paulo");
        std::env::remove_var("SQLITE_GRAPHRAG_DISPLAY_TZ");
    }

    #[test]
    #[serial]
    fn env_invalid_returns_validation_error() {
        std::env::set_var("SQLITE_GRAPHRAG_DISPLAY_TZ", "Invalid/Nonexistent");
        let result = resolve_tz_from_env();
        assert!(result.is_err(), "invalid timezone must return Err");
        match result {
            Err(AppError::Validation(msg)) => {
                assert!(
                    msg.contains("SQLITE_GRAPHRAG_DISPLAY_TZ"),
                    "message must cite the env var"
                );
                assert!(
                    msg.contains("Invalid/Nonexistent"),
                    "message must cite the invalid value"
                );
            }
            other => unreachable!("expected AppError::Validation, got: {other:?}"),
        }
        std::env::remove_var("SQLITE_GRAPHRAG_DISPLAY_TZ");
    }

    #[test]
    fn epoch_zero_yields_utc_iso() {
        // Replicates the `epoch_to_iso` formatting with an explicit UTC zone,
        // so neither GLOBAL_TZ nor the env var is read. The env var is
        // deliberately left alone: this test is not `#[serial]`, and removing
        // the variable here could race with the serial tests that set it.
        let result = {
            let tz = Tz::UTC;
            Utc.timestamp_opt(0, 0)
                .single()
                .map(|dt| {
                    dt.with_timezone(&tz)
                        .format("%Y-%m-%dT%H:%M:%S%:z")
                        .to_string()
                })
                .unwrap_or_else(|| "1970-01-01T00:00:00+00:00".to_string())
        };
        assert_eq!(result, "1970-01-01T00:00:00+00:00");
    }

    #[test]
    fn format_iso_utc_preserves_zero_offset() {
        let ts = Utc.timestamp_opt(1_705_320_000, 0).single().unwrap();
        // Applies UTC directly
        let result = ts
            .with_timezone(&Tz::UTC)
            .format("%Y-%m-%dT%H:%M:%S%:z")
            .to_string();
        assert_eq!(result, "2024-01-15T12:00:00+00:00");
    }

    #[test]
    fn format_iso_sao_paulo_applies_offset() {
        let ts = Utc.timestamp_opt(1_705_320_000, 0).single().unwrap();
        let sao_paulo: Tz = "America/Sao_Paulo".parse().unwrap();
        let result = ts
            .with_timezone(&sao_paulo)
            .format("%Y-%m-%dT%H:%M:%S%:z")
            .to_string();
        // America/Sao_Paulo in January is UTC-3
        assert!(
            result.contains("-03:00"),
            "expected offset -03:00, got: {result}"
        );
    }
}