1use crate::chunking;
35use crate::cli::MemoryType;
36use crate::entity_type::EntityType;
37use crate::errors::AppError;
38use crate::i18n::errors_msg;
39use crate::output::{self, JsonOutputFormat};
40use crate::paths::AppPaths;
41use crate::storage::chunks as storage_chunks;
42use crate::storage::connection::{ensure_db_ready, open_rw};
43use crate::storage::entities::{NewEntity, NewRelationship};
44use crate::storage::memories::NewMemory;
45use crate::storage::{entities, memories, urls as storage_urls, versions};
46use rayon::prelude::*;
47use rusqlite::Connection;
48use serde::Serialize;
49use std::collections::BTreeSet;
50use std::path::{Path, PathBuf};
51use std::sync::mpsc;
52use unicode_normalization::UnicodeNormalization;
53
54use crate::constants::DERIVED_NAME_MAX_LEN;
55
// NOTE(review): presumably the highest numeric suffix tried when a derived memory
// name collides with one that is already taken; the usage lies outside this
// excerpt — confirm against the name-derivation code.
const MAX_NAME_COLLISION_SUFFIX: usize = 1000;
59
// Command-line arguments of the `ingest` subcommand (clap derive).
//
// Plain `//` comments are used throughout this struct on purpose: clap's derive
// turns `///` doc comments into `about`/`help` text, so adding them would change
// the `--help` output.
#[derive(clap::Args)]
#[command(after_long_help = "EXAMPLES:\n \
 # Ingest every Markdown file under ./docs as `document` memories\n \
 sqlite-graphrag ingest ./docs --type document\n\n \
 # Ingest .txt files recursively under ./notes\n \
 sqlite-graphrag ingest ./notes --type note --pattern '*.txt' --recursive\n\n \
 # Enable automatic URL extraction (URL-regex only since v1.0.79)\n \
 sqlite-graphrag ingest ./big-corpus --type reference --enable-ner\n\n \
 # Preview file-to-name mapping without ingesting\n \
 sqlite-graphrag ingest ./docs --dry-run\n\n \
 # LLM-curated extraction via Claude Code CLI\n \
 sqlite-graphrag ingest ./docs --mode claude-code --recursive --json\n\n \
 # Resume interrupted claude-code ingest\n \
 sqlite-graphrag ingest ./docs --mode claude-code --resume --json\n\n \
 # Claude Code with budget cap and custom timeout\n \
 sqlite-graphrag ingest ./docs --mode claude-code --max-cost-usd 5.00 --claude-timeout 600 --json\n\n \
AUTHENTICATION:\n \
 --mode claude-code: Uses existing Claude Code authentication.\n \
 OAuth (Pro/Max/Team): works automatically from ~/.claude/.credentials.json\n \
 API key: set ANTHROPIC_API_KEY for faster startup (optional)\n\n \
 --mode codex: Uses existing Codex CLI authentication.\n \
 Device auth: run `codex auth login` first\n \
 API key: set OPENAI_API_KEY (optional)\n\n \
NOTES:\n \
 Each file becomes a separate memory. Names derive from file basenames\n \
 (kebab-case, lowercase, ASCII). Output is NDJSON: one JSON object per file,\n \
 followed by a final summary line with counts. Per-file errors are reported\n \
 inline and processing continues unless --fail-fast is set.")]
pub struct IngestArgs {
    // Positional argument (no `long`): the directory to scan. `run` rejects a
    // path that does not exist or is not a directory.
    #[arg(
        value_name = "DIR",
        help = "Directory to ingest recursively (each matching file becomes a memory)"
    )]
    pub dir: PathBuf,

    // `--type`: memory type recorded for every ingested file; `run` stores its
    // `as_str()` form.
    #[arg(long, value_enum, default_value_t = MemoryType::Document)]
    pub r#type: MemoryType,

    // `--pattern`: file-name pattern handed to `collect_files`.
    #[arg(long, default_value = "*.md")]
    pub pattern: String,

    // `--recursive`: forwarded to `collect_files`; off by default.
    #[arg(long, default_value_t = false)]
    pub recursive: bool,

    // `--enable-ner[=BOOL]`: a bare `--enable-ner` means `true`
    // (`default_missing_value`); the value may also come from the
    // SQLITE_GRAPHRAG_ENABLE_NER environment variable.
    #[arg(
        long,
        env = "SQLITE_GRAPHRAG_ENABLE_NER",
        value_parser = crate::parsers::parse_bool_flexible,
        action = clap::ArgAction::Set,
        num_args = 0..=1,
        default_missing_value = "true",
        default_value = "false",
        help = "Enable automatic URL-regex extraction (the GLiNER NER pipeline was removed in v1.0.79)"
    )]
    pub enable_ner: bool,

    // `--auto-describe` defaults to true and is paired with `--no-auto-describe`
    // below. NOTE(review): how the two flags are combined is decided by the
    // caller, outside this excerpt.
    #[arg(
        long,
        default_value_t = true,
        overrides_with = "no_auto_describe",
        help = "Derive memory description from the first meaningful body line instead of the legacy `ingested from <path>` placeholder."
    )]
    pub auto_describe: bool,
    #[arg(
        long = "no-auto-describe",
        default_value_t = false,
        help = "Disable `--auto-describe` and fall back to the legacy `ingested from <path>` description placeholder."
    )]
    pub no_auto_describe: bool,
    // Deprecated knob kept only so existing invocations keep parsing (see help).
    #[arg(
        long,
        env = "SQLITE_GRAPHRAG_GLINER_VARIANT",
        default_value = "fp32",
        help = "DEPRECATED: no effect since v1.0.79 (the GLiNER pipeline was removed); accepted for compatibility only"
    )]
    pub gliner_variant: String,

    // Hidden flag. NOTE(review): its effect is not visible in this excerpt.
    #[arg(long, default_value_t = false, hide = true)]
    pub skip_extraction: bool,

    // `--fail-fast`: per the NOTES help section, per-file errors do not stop
    // processing unless this flag is set.
    #[arg(long, default_value_t = false)]
    pub fail_fast: bool,

    // `--dry-run`: per the EXAMPLES help section, previews the file-to-name
    // mapping without ingesting.
    #[arg(long, default_value_t = false)]
    pub dry_run: bool,

    // `--max-files`: `run` fails with a validation error when more files match
    // than this cap.
    #[arg(long, default_value_t = 10_000)]
    pub max_files: usize,

    // `--namespace`: optional override passed to `namespace::resolve_namespace`.
    #[arg(long)]
    pub namespace: Option<String>,

    // `--db` / SQLITE_GRAPHRAG_DB_PATH: optional database location passed to
    // `AppPaths::resolve`.
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,

    // `--format`: output format selector, `Json` by default.
    #[arg(long, value_enum, default_value_t = JsonOutputFormat::Json)]
    pub format: JsonOutputFormat,

    // Hidden compatibility flag; a no-op per its help text.
    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
    pub json: bool,

    // `--ingest-parallelism`: explicit worker count; `None` lets
    // `resolve_parallelism` derive it from the CPU count.
    #[arg(
        long,
        help = "Number of files to extract+embed in parallel; default = max(1, cpus/2).min(4)"
    )]
    pub ingest_parallelism: Option<usize>,

    // `--low-memory`: forces a parallelism of 1 in `resolve_parallelism`, which
    // also honors the SQLITE_GRAPHRAG_LOW_MEMORY environment variable.
    #[arg(
        long,
        default_value_t = false,
        help = "Forces single-threaded ingest (--ingest-parallelism 1) to reduce RSS pressure. \
                Recommended for environments with <4 GB available RAM or container/cgroup \
                constraints. Trade-off: 3-4x longer wall time. Also honored via \
                SQLITE_GRAPHRAG_LOW_MEMORY=1 env var."
    )]
    pub low_memory: bool,

    // `--max-rss-mb`: RSS ceiling; `stage_file` checks it before embedding a
    // multi-chunk document.
    #[arg(long, default_value_t = crate::constants::DEFAULT_MAX_RSS_MB,
        help = "Maximum process RSS in MiB; abort if exceeded during embedding (default: 8192)")]
    pub max_rss_mb: u64,

    // `--llm-parallelism`: clap rejects values outside 1..=32.
    #[arg(long, default_value_t = 2, value_name = "N",
        value_parser = clap::value_parser!(u64).range(1..=32),
        help = "Maximum simultaneous LLM embedding subprocesses per file (default: 2, clamp [1,32])")]
    pub llm_parallelism: u64,

    // `--max-name-length`: upper bound for names derived from file names.
    #[arg(long, default_value_t = crate::constants::DERIVED_NAME_MAX_LEN,
        help = "Maximum length for derived memory names (default: 60)")]
    pub max_name_length: usize,

    // `--mode`: selects the ingest pipeline; `run` dispatches on it.
    #[arg(long, value_enum, default_value_t = IngestMode::None)]
    pub mode: IngestMode,

    // --- Claude Code options. `validate_mode_conditional_flags_ingest` rejects
    // --- them when another mode is selected.
    #[arg(long, env = "SQLITE_GRAPHRAG_CLAUDE_BINARY")]
    pub claude_binary: Option<std::path::PathBuf>,

    #[arg(long)]
    pub claude_model: Option<String>,

    // --- Queue and cost options. Flag validation rejects these in the local
    // --- modes; `--resume`, `--retry-failed`, `--keep-queue` and
    // --- `--max-cost-usd` are also rejected for codex/opencode.
    #[arg(long, default_value_t = false)]
    pub resume: bool,

    #[arg(long, default_value_t = false)]
    pub retry_failed: bool,

    #[arg(long, default_value_t = false)]
    pub keep_queue: bool,

    // NOTE(review): presumably the path of the resumable work queue; its use
    // is outside this excerpt.
    #[arg(long, default_value = ".ingest-queue.sqlite")]
    pub queue_db: String,

    // The default of 60 is mirrored by DEFAULT_RATE_LIMIT_WAIT in flag
    // validation. NOTE(review): unit is presumably seconds — confirm.
    #[arg(long, default_value_t = 60)]
    pub rate_limit_wait: u64,

    #[arg(long)]
    pub max_cost_usd: Option<f64>,

    // The default of 300 is mirrored by DEFAULT_TIMEOUT in flag validation.
    #[arg(
        long,
        default_value_t = 300,
        help = "Timeout in seconds for each claude -p invocation (default: 300)"
    )]
    pub claude_timeout: u64,

    // --- Codex options.
    #[arg(
        long,
        env = "SQLITE_GRAPHRAG_CODEX_BINARY",
        help = "Explicit path to the Codex CLI binary (only with --mode codex)"
    )]
    pub codex_binary: Option<PathBuf>,

    #[arg(
        long,
        help = "Model override for Codex extraction (e.g. o4-mini, gpt-5.1-codex)"
    )]
    pub codex_model: Option<String>,

    #[arg(
        long,
        default_value_t = 300,
        help = "Timeout in seconds for each codex exec invocation (default: 300)"
    )]
    pub codex_timeout: u64,

    // --- OpenCode options; each can also be supplied via its environment variable.
    #[arg(long, value_name = "PATH", env = "SQLITE_GRAPHRAG_OPENCODE_BINARY")]
    pub opencode_binary: Option<PathBuf>,

    #[arg(
        long,
        value_name = "MODEL",
        env = "SQLITE_GRAPHRAG_OPENCODE_MODEL",
        help = "Model override for OpenCode extraction"
    )]
    pub opencode_model: Option<String>,

    #[arg(
        long,
        value_name = "SECONDS",
        env = "SQLITE_GRAPHRAG_OPENCODE_TIMEOUT",
        default_value_t = 300,
        help = "Timeout in seconds for each opencode run invocation (default: 300)"
    )]
    pub opencode_timeout: u64,

    // NOTE(review): the job-singleton flags are consumed outside this excerpt;
    // their exact semantics cannot be confirmed from here.
    #[arg(long, value_name = "SECONDS")]
    pub wait_job_singleton: Option<u64>,

    #[arg(long, default_value_t = false)]
    pub force_job_singleton: bool,

    #[arg(
        long,
        default_value_t = false,
        help = "Run enrich --operation memory-bindings after all files are ingested"
    )]
    pub enrich_after: bool,
}
330
// Pipeline selected with `--mode`.
//
// Plain `//` comments are used on purpose: clap's `ValueEnum` derive turns `///`
// doc comments on variants into help text for the possible values.
#[derive(Clone, Debug, PartialEq, Eq, clap::ValueEnum)]
pub enum IngestMode {
    // Default of `IngestArgs::mode`; `run` handles it with the local pipeline.
    None,
    // Grouped with `None` as a "local" mode by flag validation; `run` does not
    // dispatch it to an external backend.
    Gliner,
    // `run` delegates to `ingest_claude::run_claude_ingest`.
    ClaudeCode,
    // `run` delegates to `ingest_codex::run_codex_ingest`.
    Codex,
    // `run` delegates to `ingest_opencode::run_opencode_ingest`; the CLI value
    // is spelled `opencode`.
    #[value(name = "opencode")]
    Opencode,
}
346
347fn env_low_memory_enabled() -> bool {
352 match std::env::var("SQLITE_GRAPHRAG_LOW_MEMORY") {
353 Ok(v) if v.is_empty() => false,
354 Ok(v) => match v.to_lowercase().as_str() {
355 "1" | "true" | "yes" | "on" => true,
356 "0" | "false" | "no" | "off" => false,
357 other => {
358 tracing::warn!(
359 target: "ingest",
360 value = %other,
361 "SQLITE_GRAPHRAG_LOW_MEMORY value not recognized; treating as disabled"
362 );
363 false
364 }
365 },
366 Err(_) => false,
367 }
368}
369
370fn resolve_parallelism(low_memory_flag: bool, ingest_parallelism: Option<usize>) -> usize {
382 let env_flag = env_low_memory_enabled();
383 let low_memory = low_memory_flag || env_flag;
384
385 if low_memory {
386 if let Some(n) = ingest_parallelism {
387 if n > 1 {
388 tracing::warn!(
389 target: "ingest",
390 requested = n,
391 "--ingest-parallelism overridden by --low-memory; using 1"
392 );
393 }
394 }
395 if low_memory_flag {
396 tracing::info!(
397 target: "ingest",
398 source = "flag",
399 "low-memory mode enabled: forcing --ingest-parallelism 1"
400 );
401 } else {
402 tracing::info!(
403 target: "ingest",
404 source = "env",
405 "low-memory mode enabled via SQLITE_GRAPHRAG_LOW_MEMORY: forcing --ingest-parallelism 1"
406 );
407 }
408 return 1;
409 }
410
411 ingest_parallelism
412 .unwrap_or_else(|| {
413 std::thread::available_parallelism()
414 .map(|v| v.get() / 2)
415 .unwrap_or(1)
416 .clamp(1, 4)
417 })
418 .max(1)
419}
420
/// Serializable per-file record. Fields are emitted in declaration order, and
/// every `Option` field is left out of the output entirely when it is `None`.
///
/// NOTE(review): presumably this is the "one JSON object per file" NDJSON line
/// described in the command help; the code that builds it is outside this excerpt.
#[derive(Serialize)]
struct IngestFileEvent<'a> {
    /// Borrowed file path string.
    file: &'a str,
    /// Borrowed memory name.
    name: &'a str,
    /// Borrowed status label; the set of values is defined by the producer.
    status: &'a str,
    /// NOTE(review): presumably true when the derived name was shortened — confirm.
    truncated: bool,
    #[serde(skip_serializing_if = "Option::is_none")]
    original_name: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    original_filename: Option<&'a str>,
    /// Error text, omitted when absent.
    #[serde(skip_serializing_if = "Option::is_none")]
    error: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    memory_id: Option<i64>,
    #[serde(skip_serializing_if = "Option::is_none")]
    action: Option<String>,
    /// Always serialized, unlike the optional fields around it.
    body_length: usize,
    #[serde(skip_serializing_if = "Option::is_none")]
    backend_invoked: Option<&'a str>,
}
449
/// Serializable run summary: the scan parameters plus per-outcome file counters
/// and the elapsed time in milliseconds.
///
/// NOTE(review): presumably the "final summary line with counts" mentioned in the
/// command help; the code that fills it in is outside this excerpt.
#[derive(Serialize)]
struct IngestSummary {
    /// NOTE(review): presumably a constant marker that distinguishes this record
    /// from per-file records — confirm.
    summary: bool,
    dir: String,
    pattern: String,
    recursive: bool,
    files_total: usize,
    files_succeeded: usize,
    files_failed: usize,
    files_skipped: usize,
    elapsed_ms: u64,
}
462
/// Outcome of successfully persisting one file, as returned by `persist_staged`.
struct FileSuccess {
    /// Row id returned by `memories::insert`.
    memory_id: i64,
    /// Action label; `persist_staged` always sets it to `"created"`.
    action: String,
    /// Length in bytes of the stored body (`String::len`).
    body_length: usize,
    /// Backend label carried over from the staged file, if any was recorded.
    backend_invoked: Option<&'static str>,
}
470
/// Serializable progress record carrying an event label, a path, a duration in
/// milliseconds and entity/relationship counts.
///
/// NOTE(review): the producer of this event is outside this excerpt; the exact
/// meaning of `event` and of the counters should be confirmed there.
#[derive(Serialize)]
struct StageProgressEvent<'a> {
    schema_version: u8,
    event: &'a str,
    path: &'a str,
    ms: u64,
    entities: usize,
    relationships: usize,
}
482
/// Everything `stage_file` computes for one file so that `persist_staged` can
/// write it to the database.
struct StagedFile {
    /// Full file contents as read from disk.
    body: String,
    /// Hex-encoded BLAKE3 hash of `body`.
    body_hash: String,
    /// First 200 characters of `body`.
    snippet: String,
    /// Memory name (already validated by `stage_file`).
    name: String,
    /// Heuristic description or the `ingested from <path>` placeholder.
    description: String,
    /// Memory-level embedding; `None` when embedding failed and the
    /// skip-on-failure setting allowed the file to proceed anyway.
    embedding: Option<Vec<f32>>,
    /// Per-chunk embeddings; only `Some` when a multi-chunk document was
    /// embedded successfully.
    chunk_embeddings: Option<Vec<Vec<f32>>>,
    /// Chunk boundaries produced by `chunking::split_into_chunks_hierarchical`.
    chunks_info: Vec<crate::chunking::Chunk>,
    /// Entities extracted from the body (empty when extraction is disabled or failed).
    entities: Vec<NewEntity>,
    /// Relationships to persist; `stage_file` currently never fills this list.
    relationships: Vec<NewRelationship>,
    /// Embeddings for `entities`, matched by index; `None` when entity
    /// embedding failed and skip-on-failure was active.
    entity_embeddings: Option<Vec<Vec<f32>>>,
    /// URLs found in the body by the extraction step.
    urls: Vec<crate::extraction::ExtractedUrl>,
    /// Backend label reported by the embedder; only set on the single-chunk path.
    backend_invoked: Option<&'static str>,
}
504
/// Reads one file and prepares everything needed to persist it as a memory:
/// validation, description, optional extraction, hashing, chunking and embeddings.
///
/// This function takes no database connection; the result is handed to
/// `persist_staged` for writing.
///
/// # Parameters
/// - `_idx`: unused here (underscore-prefixed).
/// - `path`: file to read.
/// - `name`: memory name to validate and store.
/// - `paths`: application paths; `paths.models` is passed to the embedder.
/// - `enable_ner`: when true, runs `extraction::extract_graph_auto` on the body.
/// - `gliner_variant`: forwarded to the extraction call.
/// - `max_rss_mb`: RSS ceiling checked before embedding a multi-chunk document.
/// - `llm_parallelism`: parallelism forwarded to the chunk and entity embedders.
/// - `llm_backend`, `embedding_backend`: backend choices forwarded to the embedder.
/// - `auto_describe`: derive the description from the body instead of using
///   the `ingested from <path>` placeholder.
///
/// # Errors
/// - `AppError::LimitExceeded`: name too long, file/body larger than
///   `MAX_MEMORY_BODY_LEN`, or more chunks than `REMEMBER_MAX_SAFE_MULTI_CHUNKS`.
/// - `AppError::Validation`: reserved (`__` prefix) or non-slug name, blank body,
///   over-long description, invalid relationship, or a validation error from the embedder.
/// - `AppError::Io`: the file's metadata or contents could not be read.
/// - `AppError::LowMemory`: process RSS exceeds `max_rss_mb` before chunk embedding.
/// - Any other embedder error, unless
///   `embedder::should_skip_embedding_on_failure()` returns true, in which case
///   the file is staged without the corresponding embedding.
#[allow(clippy::too_many_arguments)]
fn stage_file(
    _idx: usize,
    path: &Path,
    name: &str,
    paths: &AppPaths,
    enable_ner: bool,
    gliner_variant: crate::extraction::GlinerVariant,
    max_rss_mb: u64,
    llm_parallelism: usize,
    llm_backend: crate::cli::LlmBackendChoice,
    embedding_backend: crate::cli::EmbeddingBackendChoice,
    auto_describe: bool,
) -> Result<StagedFile, AppError> {
    use crate::constants::*;

    // --- Name validation: length, reserved prefix, then slug shape. ---
    if name.len() > MAX_MEMORY_NAME_LEN {
        return Err(AppError::LimitExceeded(
            crate::i18n::validation::name_length(MAX_MEMORY_NAME_LEN),
        ));
    }
    if name.starts_with("__") {
        return Err(AppError::Validation(
            crate::i18n::validation::reserved_name(),
        ));
    }
    {
        let slug_re = crate::constants::name_slug_regex();
        if !slug_re.is_match(name) {
            return Err(AppError::Validation(crate::i18n::validation::name_kebab(
                name,
            )));
        }
    }

    // --- Body loading. The size is checked from metadata first so an oversized
    // file is rejected before it is read into memory, then re-checked on the
    // bytes actually read. ---
    let file_size = std::fs::metadata(path).map_err(AppError::Io)?.len();
    if file_size > MAX_MEMORY_BODY_LEN as u64 {
        return Err(AppError::LimitExceeded(
            crate::i18n::validation::body_exceeds(MAX_MEMORY_BODY_LEN),
        ));
    }
    let raw_body = std::fs::read_to_string(path).map_err(AppError::Io)?;
    if raw_body.len() > MAX_MEMORY_BODY_LEN {
        return Err(AppError::LimitExceeded(
            crate::i18n::validation::body_exceeds(MAX_MEMORY_BODY_LEN),
        ));
    }
    if raw_body.trim().is_empty() {
        return Err(AppError::Validation(crate::i18n::validation::empty_body()));
    }

    // --- Description: heuristic from the body, or the legacy placeholder. ---
    let description = if auto_describe {
        crate::commands::ingest_heuristics::extract_heuristic_description(
            &raw_body,
            Some(&path.display().to_string()),
        )
    } else {
        format!("ingested from {}", path.display())
    };
    if description.len() > MAX_MEMORY_DESCRIPTION_LEN {
        return Err(AppError::Validation(
            crate::i18n::validation::description_exceeds(MAX_MEMORY_DESCRIPTION_LEN),
        ));
    }

    // --- Optional extraction. A failure here is logged and the file is staged
    // without entities or URLs instead of being rejected. ---
    let mut extracted_entities: Vec<NewEntity> = Vec::with_capacity(30);
    let mut extracted_relationships: Vec<NewRelationship> = Vec::with_capacity(50);
    let mut extracted_urls: Vec<crate::extraction::ExtractedUrl> = Vec::with_capacity(4);
    if enable_ner {
        match crate::extraction::extract_graph_auto(&raw_body, paths, gliner_variant) {
            Ok(extracted) => {
                extracted_urls = extracted.urls;
                // Every extracted entity is stored as a `Concept` without a description.
                extracted_entities = extracted
                    .entities
                    .into_iter()
                    .map(|e| NewEntity {
                        name: e.name,
                        entity_type: crate::entity_type::EntityType::Concept,
                        description: None,
                    })
                    .collect();
                // No relationships are taken from the extraction result.
                extracted_relationships.clear();

                if extracted_entities.len() > max_entities_per_memory() {
                    extracted_entities.truncate(max_entities_per_memory());
                }
                if extracted_relationships.len() > max_relationships_per_memory() {
                    extracted_relationships.truncate(max_relationships_per_memory());
                }
            }
            Err(e) => {
                tracing::warn!(
                    target: "ingest",
                    file = %path.display(),
                    "auto-extraction failed (graceful degradation): {e:#}"
                );
            }
        }
    }

    // --- Relationship normalization and validation. The list is never filled
    // by the code above, so this loop currently has nothing to iterate over. ---
    for rel in &mut extracted_relationships {
        rel.relation = crate::parsers::normalize_relation(&rel.relation);
        if let Err(e) = crate::parsers::validate_relation_format(&rel.relation) {
            return Err(AppError::Validation(format!(
                "{e} for relationship '{}' -> '{}'",
                rel.source, rel.target
            )));
        }
        crate::parsers::warn_if_non_canonical(&rel.relation);
        if !(0.0..=1.0).contains(&rel.strength) {
            return Err(AppError::Validation(format!(
                "invalid strength {} for relationship '{}' -> '{}'; expected value in [0.0, 1.0]",
                rel.strength, rel.source, rel.target
            )));
        }
    }

    // --- Content hash and snippet (first 200 characters, not bytes). ---
    let body_hash = blake3::hash(raw_body.as_bytes()).to_hex().to_string();
    let snippet: String = raw_body.chars().take(200).collect();

    // --- Chunking, with a hard cap on the number of chunks. ---
    let chunks_info = chunking::split_into_chunks_hierarchical(&raw_body);
    if chunks_info.len() > REMEMBER_MAX_SAFE_MULTI_CHUNKS {
        return Err(AppError::LimitExceeded(format!(
            "document produces {} chunks; current safe operational limit is {} chunks; split the document before using remember",
            chunks_info.len(),
            REMEMBER_MAX_SAFE_MULTI_CHUNKS
        )));
    }

    // --- Memory embedding. A single-chunk body is embedded directly; otherwise
    // each chunk is embedded and the results are aggregated. Validation errors
    // from the embedder always propagate, even when skip-on-failure is active. ---
    let mut chunk_embeddings_opt: Option<Vec<Vec<f32>>> = None;
    let skip_embed = crate::embedder::should_skip_embedding_on_failure();
    let (embedding, backend_invoked): (Option<Vec<f32>>, Option<&'static str>) = if chunks_info
        .len()
        == 1
    {
        match crate::embedder::embed_passage_with_embedding_choice(
            &paths.models,
            &raw_body,
            embedding_backend,
            llm_backend,
        ) {
            Ok((v, k)) => (Some(v), Some(k.as_str())),
            Err(AppError::Validation(msg)) => return Err(AppError::Validation(msg)),
            Err(e) if skip_embed => {
                tracing::warn!(error = %e, file = %path.display(), "ingest: embedding failed; --skip-embedding-on-failure active, persisting without embedding");
                (None, None)
            }
            Err(e) => return Err(e),
        }
    } else {
        let chunk_texts: Vec<String> = chunks_info
            .iter()
            .map(|c| chunking::chunk_text(&raw_body, c).to_string())
            .collect();
        // The RSS guard runs only on this path; if the current RSS cannot be
        // determined the check is skipped.
        if let Some(rss) = crate::memory_guard::current_process_memory_mb() {
            if rss > max_rss_mb {
                tracing::error!(
                    target: "ingest",
                    rss_mb = rss,
                    max_rss_mb = max_rss_mb,
                    file = %path.display(),
                    "RSS exceeded --max-rss-mb threshold; aborting to prevent system instability"
                );
                return Err(AppError::LowMemory {
                    available_mb: crate::memory_guard::available_memory_mb(),
                    required_mb: max_rss_mb,
                });
            }
        }
        match crate::embedder::embed_passages_parallel_with_embedding_choice(
            &paths.models,
            &chunk_texts,
            llm_parallelism,
            crate::embedder::chunk_embed_batch_size(),
            embedding_backend,
            llm_backend,
        ) {
            Ok(chunk_embeddings) => {
                let aggregated = chunking::aggregate_embeddings(&chunk_embeddings);
                chunk_embeddings_opt = Some(chunk_embeddings);
                // No backend label is recorded on the multi-chunk path.
                (Some(aggregated), None)
            }
            Err(AppError::Validation(msg)) => return Err(AppError::Validation(msg)),
            Err(e) if skip_embed => {
                tracing::warn!(error = %e, file = %path.display(), "ingest: chunk embedding failed; --skip-embedding-on-failure active, persisting without embedding");
                (None, None)
            }
            Err(e) => return Err(e),
        }
    };

    // --- Entity embeddings: the text is "<name> <description>" when a
    // description exists, otherwise just the name. ---
    let entity_texts: Vec<String> = extracted_entities
        .iter()
        .map(|entity| match &entity.description {
            Some(desc) => format!("{} {}", entity.name, desc),
            None => entity.name.clone(),
        })
        .collect();
    // NOTE(review): unlike the two embedder calls above, this match has no
    // dedicated `AppError::Validation` arm, so with skip-on-failure active a
    // validation error is downgraded to a warning here — confirm that is intended.
    let entity_embeddings_opt = match crate::embedder::embed_entity_texts_cached(
        &paths.models,
        &entity_texts,
        llm_parallelism,
    ) {
        Ok((entity_embeddings, embed_cache_stats)) => {
            if embed_cache_stats.hits > 0 {
                tracing::debug!(
                    hits = embed_cache_stats.hits,
                    misses = embed_cache_stats.misses,
                    requested = embed_cache_stats.requested,
                    "G56: entity embed cache hit (ingest)"
                );
            }
            Some(entity_embeddings)
        }
        Err(e) if skip_embed => {
            tracing::warn!(error = %e, file = %path.display(), "ingest: entity embedding failed; --skip-embedding-on-failure active");
            None
        }
        Err(e) => return Err(e),
    };

    Ok(StagedFile {
        body: raw_body,
        body_hash,
        snippet,
        name: name.to_string(),
        description,
        embedding,
        chunk_embeddings: chunk_embeddings_opt,
        chunks_info,
        entities: extracted_entities,
        relationships: extracted_relationships,
        entity_embeddings: entity_embeddings_opt,
        urls: extracted_urls,
        backend_invoked,
    })
}
766
/// Writes one staged file to the database as a new memory.
///
/// The namespace-limit and duplicate-name checks run first, outside the
/// transaction. The memory row, its first version, vectors, chunks, entities and
/// relationships are then written inside a single `IMMEDIATE` transaction. URLs
/// are inserted after the commit and their result is not inspected.
///
/// # Parameters
/// - `conn`: read-write connection; borrowed mutably to open the transaction.
/// - `namespace`: namespace the memory is created in.
/// - `memory_type`: memory type string stored with the memory and its version.
/// - `staged`: output of `stage_file`; consumed.
///
/// # Errors
/// - `AppError::NamespaceError`: `namespace` has no active memory yet and the
///   number of active namespaces already reached `MAX_NAMESPACES_ACTIVE`.
/// - `AppError::Duplicate`: a memory with the same name already exists in `namespace`.
/// - Any error propagated by the storage helpers, JSON serialization or the commit.
fn persist_staged(
    conn: &mut Connection,
    namespace: &str,
    memory_type: &str,
    staged: StagedFile,
) -> Result<FileSuccess, AppError> {
    // --- Namespace cap: only enforced when this insert would introduce a
    // namespace that has no non-deleted memory yet. ---
    {
        let active_count: u32 = conn.query_row(
            "SELECT COUNT(DISTINCT namespace) FROM memories WHERE deleted_at IS NULL",
            [],
            |r| r.get::<_, i64>(0).map(|v| v as u32),
        )?;
        let ns_exists: bool = conn.query_row(
            "SELECT EXISTS(SELECT 1 FROM memories WHERE namespace = ?1 AND deleted_at IS NULL)",
            rusqlite::params![namespace],
            |r| r.get::<_, i64>(0).map(|v| v > 0),
        )?;
        if !ns_exists && active_count >= crate::constants::MAX_NAMESPACES_ACTIVE {
            return Err(AppError::NamespaceError(format!(
                "active namespace limit of {} exceeded while creating '{namespace}'",
                crate::constants::MAX_NAMESPACES_ACTIVE
            )));
        }
    }

    // --- A name clash is a hard error; an identical body hash is only logged. ---
    let existing_memory = memories::find_by_name(conn, namespace, &staged.name)?;
    if existing_memory.is_some() {
        return Err(AppError::Duplicate(errors_msg::duplicate_memory(
            &staged.name,
            namespace,
        )));
    }
    let duplicate_hash_id = memories::find_by_hash(conn, namespace, &staged.body_hash)?;

    // `body` and `body_hash` are moved out of `staged` here; later code reads
    // the body through `new_memory.body`.
    let new_memory = NewMemory {
        namespace: namespace.to_string(),
        name: staged.name.clone(),
        memory_type: memory_type.to_string(),
        description: staged.description.clone(),
        body: staged.body,
        body_hash: staged.body_hash,
        session_id: None,
        source: "agent".to_string(),
        metadata: serde_json::json!({}),
    };

    if let Some(hash_id) = duplicate_hash_id {
        tracing::debug!(
            target: "ingest",
            duplicate_memory_id = hash_id,
            "identical body already exists; persisting a new memory anyway"
        );
    }

    // --- Everything from here to `commit` succeeds or fails as one unit. ---
    let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;

    let memory_id = memories::insert(&tx, &new_memory)?;
    // Record the initial version (number 1, action "create").
    versions::insert_version(
        &tx,
        memory_id,
        1,
        &staged.name,
        memory_type,
        &staged.description,
        &new_memory.body,
        &serde_json::to_string(&new_memory.metadata)?,
        None,
        "create",
    )?;
    // The memory vector is written only when an embedding was produced.
    if let Some(ref emb) = staged.embedding {
        memories::upsert_vec(
            &tx,
            memory_id,
            namespace,
            memory_type,
            emb,
            &staged.name,
            &staged.snippet,
        )?;
    }

    // Chunk rows are stored only for multi-chunk documents.
    if staged.chunks_info.len() > 1 {
        storage_chunks::insert_chunk_slices(&tx, memory_id, &new_memory.body, &staged.chunks_info)?;
        if let Some(ref chunk_embeddings) = staged.chunk_embeddings {
            for (i, emb) in chunk_embeddings.iter().enumerate() {
                // NOTE(review): the first argument is the chunk's position in
                // this document, not an id returned by `insert_chunk_slices`;
                // confirm that this is what `upsert_chunk_vec` expects.
                storage_chunks::upsert_chunk_vec(&tx, i as i64, memory_id, i as i32, emb)?;
            }
        }
    }

    if !staged.entities.is_empty() || !staged.relationships.is_empty() {
        // Upsert each entity, store its vector when one exists at the same
        // index, and link it to the memory.
        for (idx, entity) in staged.entities.iter().enumerate() {
            let entity_id = entities::upsert_entity(&tx, namespace, entity)?;
            if let Some(ref entity_embeddings) = staged.entity_embeddings {
                if let Some(entity_embedding) = entity_embeddings.get(idx) {
                    entities::upsert_entity_vec(
                        &tx,
                        entity_id,
                        namespace,
                        entity.entity_type,
                        entity_embedding,
                        &entity.name,
                    )?;
                }
            }
            entities::link_memory_entity(&tx, memory_id, entity_id)?;
        }
        // Name -> type lookup used to type relationship endpoints below.
        let entity_types: std::collections::HashMap<&str, EntityType> = staged
            .entities
            .iter()
            .map(|entity| (entity.name.as_str(), entity.entity_type))
            .collect();

        // Ids of every entity touched by this memory; their degree is
        // recalculated once at the end.
        let mut affected_entity_ids: std::collections::HashSet<i64> =
            std::collections::HashSet::new();
        for entity in &staged.entities {
            if let Some(eid) = entities::find_entity_id(&tx, namespace, &entity.name)? {
                affected_entity_ids.insert(eid);
            }
        }

        for rel in &staged.relationships {
            // Endpoints that were not among the staged entities default to `Concept`.
            let source_entity = NewEntity {
                name: rel.source.clone(),
                entity_type: entity_types
                    .get(rel.source.as_str())
                    .copied()
                    .unwrap_or(EntityType::Concept),
                description: None,
            };
            let target_entity = NewEntity {
                name: rel.target.clone(),
                entity_type: entity_types
                    .get(rel.target.as_str())
                    .copied()
                    .unwrap_or(EntityType::Concept),
                description: None,
            };
            let source_id = entities::upsert_entity(&tx, namespace, &source_entity)?;
            let target_id = entities::upsert_entity(&tx, namespace, &target_entity)?;
            let rel_id = entities::upsert_relationship(&tx, namespace, source_id, target_id, rel)?;
            entities::link_memory_relationship(&tx, memory_id, rel_id)?;
            affected_entity_ids.insert(source_id);
            affected_entity_ids.insert(target_id);
        }

        for &eid in &affected_entity_ids {
            entities::recalculate_degree(&tx, eid)?;
        }
    }

    tx.commit()?;

    // URLs are written after the commit, on the plain connection.
    if !staged.urls.is_empty() {
        let url_entries: Vec<storage_urls::MemoryUrl> = staged
            .urls
            .into_iter()
            .map(|u| storage_urls::MemoryUrl {
                url: u.url,
                offset: Some(u.start as i64),
            })
            .collect();
        // NOTE(review): the result is deliberately discarded, so a failure
        // here does not fail the file; confirm `insert_urls` reports problems
        // on its own.
        let _ = storage_urls::insert_urls(conn, memory_id, &url_entries);
    }

    Ok(FileSuccess {
        memory_id,
        action: "created".to_string(),
        body_length: new_memory.body.len(),
        backend_invoked: staged.backend_invoked,
    })
}
940
/// Returns `true` when `value` compares equal to `default`.
///
/// Used by flag validation to tell whether a defaulted CLI argument still holds
/// its default value.
fn is_at_default<T: PartialEq>(value: T, default: T) -> bool {
    PartialEq::eq(&value, &default)
}
951
952fn validate_mode_conditional_flags_ingest(args: &IngestArgs) -> Result<(), AppError> {
967 const DEFAULT_TIMEOUT: u64 = 300;
968 const DEFAULT_RATE_LIMIT_WAIT: u64 = 60;
969
970 let mut conflicts: Vec<String> = Vec::new();
971
972 let is_local_mode = args.mode == IngestMode::None || args.mode == IngestMode::Gliner;
973
974 if is_local_mode {
975 if args.claude_binary.is_some() {
976 conflicts.push("--claude-binary is ignored when --mode is none or gliner".to_string());
977 }
978 if args.claude_model.is_some() {
979 conflicts.push("--claude-model is ignored when --mode is none or gliner".to_string());
980 }
981 if !is_at_default(args.claude_timeout, DEFAULT_TIMEOUT) {
982 conflicts.push(format!(
983 "--claude-timeout={} is ignored when --mode is none or gliner (remove the flag to use the default 300s)",
984 args.claude_timeout
985 ));
986 }
987 if args.codex_binary.is_some() {
988 conflicts.push("--codex-binary is ignored when --mode is none or gliner".to_string());
989 }
990 if args.codex_model.is_some() {
991 conflicts.push("--codex-model is ignored when --mode is none or gliner".to_string());
992 }
993 if !is_at_default(args.codex_timeout, DEFAULT_TIMEOUT) {
994 conflicts.push(format!(
995 "--codex-timeout={} is ignored when --mode is none or gliner (remove the flag to use the default 300s)",
996 args.codex_timeout
997 ));
998 }
999 if args.opencode_binary.is_some() {
1000 conflicts
1001 .push("--opencode-binary is ignored when --mode is none or gliner".to_string());
1002 }
1003 if args.opencode_model.is_some() {
1004 conflicts.push("--opencode-model is ignored when --mode is none or gliner".to_string());
1005 }
1006 if !is_at_default(args.opencode_timeout, DEFAULT_TIMEOUT) {
1007 conflicts.push(format!(
1008 "--opencode-timeout={} is ignored when --mode is none or gliner (remove the flag to use the default 300s)",
1009 args.opencode_timeout
1010 ));
1011 }
1012 if args.max_cost_usd.is_some() {
1013 conflicts.push("--max-cost-usd is ignored when --mode is none or gliner (cost is only tracked for LLM-backed modes)".to_string());
1014 }
1015 if args.resume {
1016 conflicts.push("--resume is ignored when --mode is none or gliner (the queue DB is only used by LLM-backed modes)".to_string());
1017 }
1018 if args.retry_failed {
1019 conflicts.push("--retry-failed is ignored when --mode is none or gliner".to_string());
1020 }
1021 if args.keep_queue {
1022 conflicts.push("--keep-queue is ignored when --mode is none or gliner".to_string());
1023 }
1024 if !is_at_default(args.rate_limit_wait, DEFAULT_RATE_LIMIT_WAIT) {
1025 conflicts.push(format!(
1026 "--rate-limit-wait={} is ignored when --mode is none or gliner",
1027 args.rate_limit_wait
1028 ));
1029 }
1030 }
1031
1032 match args.mode {
1033 IngestMode::ClaudeCode => {
1034 if args.codex_binary.is_some() {
1035 conflicts.push("--codex-binary is ignored when --mode=claude-code".to_string());
1036 }
1037 if args.codex_model.is_some() {
1038 conflicts.push("--codex-model is ignored when --mode=claude-code".to_string());
1039 }
1040 if !is_at_default(args.codex_timeout, DEFAULT_TIMEOUT) {
1041 conflicts.push(format!(
1042 "--codex-timeout={} is ignored when --mode=claude-code (remove the flag to use the default 300s)",
1043 args.codex_timeout
1044 ));
1045 }
1046 if args.opencode_binary.is_some() {
1047 conflicts.push("--opencode-binary is ignored when --mode=claude-code".to_string());
1048 }
1049 if args.opencode_model.is_some() {
1050 conflicts.push("--opencode-model is ignored when --mode=claude-code".to_string());
1051 }
1052 if !is_at_default(args.opencode_timeout, DEFAULT_TIMEOUT) {
1053 conflicts.push(format!(
1054 "--opencode-timeout={} is ignored when --mode=claude-code (remove the flag to use the default 300s)",
1055 args.opencode_timeout
1056 ));
1057 }
1058 }
1059 IngestMode::Codex => {
1060 if args.claude_binary.is_some() {
1061 conflicts.push("--claude-binary is ignored when --mode=codex".to_string());
1062 }
1063 if args.claude_model.is_some() {
1064 conflicts.push("--claude-model is ignored when --mode=codex".to_string());
1065 }
1066 if !is_at_default(args.claude_timeout, DEFAULT_TIMEOUT) {
1067 conflicts.push(format!(
1068 "--claude-timeout={} is ignored when --mode=codex (remove the flag to use the default 300s)",
1069 args.claude_timeout
1070 ));
1071 }
1072 if args.max_cost_usd.is_some() {
1073 conflicts.push(
1074 "--max-cost-usd is ignored when --mode=codex (OAuth-first; cost is metered by your subscription)"
1075 .to_string(),
1076 );
1077 }
1078 if args.resume {
1079 conflicts.push("--resume is only valid for --mode=claude-code".to_string());
1080 }
1081 if args.retry_failed {
1082 conflicts.push("--retry-failed is only valid for --mode=claude-code".to_string());
1083 }
1084 if args.keep_queue {
1085 conflicts.push("--keep-queue is only valid for --mode=claude-code".to_string());
1086 }
1087 if args.opencode_binary.is_some() {
1088 conflicts.push("--opencode-binary is ignored when --mode=codex".to_string());
1089 }
1090 if args.opencode_model.is_some() {
1091 conflicts.push("--opencode-model is ignored when --mode=codex".to_string());
1092 }
1093 if !is_at_default(args.opencode_timeout, DEFAULT_TIMEOUT) {
1094 conflicts.push(format!(
1095 "--opencode-timeout={} is ignored when --mode=codex (remove the flag to use the default 300s)",
1096 args.opencode_timeout
1097 ));
1098 }
1099 }
1100 IngestMode::Opencode => {
1101 if args.claude_binary.is_some() {
1102 conflicts.push("--claude-binary is ignored when --mode=opencode".to_string());
1103 }
1104 if args.claude_model.is_some() {
1105 conflicts.push("--claude-model is ignored when --mode=opencode".to_string());
1106 }
1107 if !is_at_default(args.claude_timeout, DEFAULT_TIMEOUT) {
1108 conflicts.push(format!(
1109 "--claude-timeout={} is ignored when --mode=opencode (remove the flag to use the default 300s)",
1110 args.claude_timeout
1111 ));
1112 }
1113 if args.codex_binary.is_some() {
1114 conflicts.push("--codex-binary is ignored when --mode=opencode".to_string());
1115 }
1116 if args.codex_model.is_some() {
1117 conflicts.push("--codex-model is ignored when --mode=opencode".to_string());
1118 }
1119 if !is_at_default(args.codex_timeout, DEFAULT_TIMEOUT) {
1120 conflicts.push(format!(
1121 "--codex-timeout={} is ignored when --mode=opencode (remove the flag to use the default 300s)",
1122 args.codex_timeout
1123 ));
1124 }
1125 if args.max_cost_usd.is_some() {
1126 conflicts.push(
1127 "--max-cost-usd is ignored when --mode=opencode (OAuth-first; cost is metered by your subscription)"
1128 .to_string(),
1129 );
1130 }
1131 if args.resume {
1132 conflicts.push("--resume is only valid for --mode=claude-code".to_string());
1133 }
1134 if args.retry_failed {
1135 conflicts.push("--retry-failed is only valid for --mode=claude-code".to_string());
1136 }
1137 if args.keep_queue {
1138 conflicts.push("--keep-queue is only valid for --mode=claude-code".to_string());
1139 }
1140 }
1141 IngestMode::None | IngestMode::Gliner => {}
1142 }
1143
1144 if !conflicts.is_empty() {
1145 return Err(AppError::Validation(format!(
1146 "G20: mode-conditional flag conflicts detected for --mode={:?}:\n - {}",
1147 args.mode,
1148 conflicts.join("\n - ")
1149 )));
1150 }
1151
1152 Ok(())
1153}
1154
1155#[tracing::instrument(skip_all, level = "debug", name = "ingest")]
1158pub fn run(
1159 args: IngestArgs,
1160 llm_backend: crate::cli::LlmBackendChoice,
1161 embedding_backend: crate::cli::EmbeddingBackendChoice,
1162) -> Result<(), AppError> {
1163 validate_mode_conditional_flags_ingest(&args)?;
1166 tracing::debug!(target: "ingest", dir = %args.dir.display(), mode = ?args.mode, "starting ingest");
1167 if args.mode == IngestMode::ClaudeCode {
1168 return super::ingest_claude::run_claude_ingest(&args, embedding_backend, llm_backend);
1169 }
1170 if args.mode == IngestMode::Codex {
1171 return super::ingest_codex::run_codex_ingest(&args);
1172 }
1173 if args.mode == IngestMode::Opencode {
1174 return super::ingest_opencode::run_opencode_ingest(&args);
1175 }
1176
1177 let started = std::time::Instant::now();
1178
1179 if !args.dir.exists() {
1180 return Err(AppError::Validation(format!(
1181 "directory not found: {}",
1182 args.dir.display()
1183 )));
1184 }
1185 if !args.dir.is_dir() {
1186 return Err(AppError::Validation(format!(
1187 "path is not a directory: {}",
1188 args.dir.display()
1189 )));
1190 }
1191
1192 let mut files: Vec<PathBuf> = Vec::with_capacity(128);
1193 collect_files(&args.dir, &args.pattern, args.recursive, &mut files)?;
1194 files.sort_unstable();
1195
1196 if files.len() > args.max_files {
1197 return Err(AppError::Validation(format!(
1198 "found {} files matching pattern, exceeds --max-files cap of {} (raise the cap or narrow the pattern)",
1199 files.len(),
1200 args.max_files
1201 )));
1202 }
1203
1204 let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
1205 let memory_type_str = args.r#type.as_str().to_string();
1206
1207 let paths = AppPaths::resolve(args.db.as_deref())?;
1208 let mut conn_or_err = match init_storage(&paths) {
1209 Ok(c) => Ok(c),
1210 Err(e) => Err(format!("{e}")),
1211 };
1212
1213 let mut succeeded: usize = 0;
1214 let mut failed: usize = 0;
1215 let mut skipped: usize = 0;
1216 let total = files.len();
1217
1218 let mut taken_names: BTreeSet<String> = BTreeSet::new();
1221
1222 enum SlotMeta {
1228 Skip {
1229 file_str: String,
1230 derived_base: String,
1231 name_truncated: bool,
1232 original_name: Option<String>,
1233 original_filename: Option<String>,
1234 reason: String,
1235 },
1236 Process {
1237 file_str: String,
1238 derived_name: String,
1239 name_truncated: bool,
1240 original_name: Option<String>,
1241 original_filename: Option<String>,
1242 },
1243 }
1244
1245 struct ProcessItem {
1246 idx: usize,
1247 path: PathBuf,
1248 file_str: String,
1249 derived_name: String,
1250 }
1251
1252 let files_cap = files.len();
1253 let mut slots_meta: Vec<SlotMeta> = Vec::new();
1254 slots_meta.try_reserve(files_cap).map_err(|_| {
1255 AppError::LimitExceeded(format!(
1256 "allocation of {files_cap} slot metadata entries would exceed available memory"
1257 ))
1258 })?;
1259 let mut process_items: Vec<ProcessItem> = Vec::new();
1260 process_items.try_reserve(files_cap).map_err(|_| {
1261 AppError::LimitExceeded(format!(
1262 "allocation of {files_cap} process items would exceed available memory"
1263 ))
1264 })?;
1265 let mut truncations: Vec<(String, String)> = Vec::new();
1266 truncations.try_reserve(files_cap).map_err(|_| {
1267 AppError::LimitExceeded(format!(
1268 "allocation of {files_cap} truncation entries would exceed available memory"
1269 ))
1270 })?;
1271
1272 let max_name_length = args.max_name_length;
1273 for path in &files {
1274 let file_str = path.to_string_lossy().into_owned();
1275 let (derived_base, name_truncated, original_name) =
1276 derive_kebab_name(path, max_name_length);
1277 let original_basename = path.file_stem().and_then(|s| s.to_str()).unwrap_or("");
1278
1279 if name_truncated {
1280 if let Some(ref orig) = original_name {
1281 truncations.push((orig.clone(), derived_base.clone()));
1282 }
1283 }
1284
1285 if derived_base.is_empty() {
1286 let orig_filename = if !original_basename.is_empty() {
1288 Some(original_basename.to_string())
1289 } else {
1290 None
1291 };
1292 slots_meta.push(SlotMeta::Skip {
1293 file_str,
1294 derived_base: String::new(),
1295 name_truncated: false,
1296 original_name: None,
1297 original_filename: orig_filename,
1298 reason: "could not derive a non-empty kebab-case name from filename".to_string(),
1299 });
1300 continue;
1301 }
1302
1303 match unique_name(&derived_base, &taken_names) {
1304 Ok(derived_name) => {
1305 taken_names.insert(derived_name.clone());
1306 let idx = slots_meta.len();
1307 let orig_filename = if original_basename != derived_name {
1309 Some(original_basename.to_string())
1310 } else {
1311 None
1312 };
1313 process_items.push(ProcessItem {
1314 idx,
1315 path: path.clone(),
1316 file_str: file_str.clone(),
1317 derived_name: derived_name.clone(),
1318 });
1319 slots_meta.push(SlotMeta::Process {
1320 file_str,
1321 derived_name,
1322 name_truncated,
1323 original_name,
1324 original_filename: orig_filename,
1325 });
1326 }
1327 Err(e) => {
1328 let orig_filename = if original_basename != derived_base {
1329 Some(original_basename.to_string())
1330 } else {
1331 None
1332 };
1333 slots_meta.push(SlotMeta::Skip {
1334 file_str,
1335 derived_base,
1336 name_truncated,
1337 original_name,
1338 original_filename: orig_filename,
1339 reason: e.to_string(),
1340 });
1341 }
1342 }
1343 }
1344
1345 if !truncations.is_empty() {
1346 tracing::info!(
1347 target: "ingest",
1348 count = truncations.len(),
1349 max_name_length = max_name_length,
1350 max_len = DERIVED_NAME_MAX_LEN,
1351 "derived names truncated; pass -vv (debug) for per-file detail"
1352 );
1353 }
1354
1355 if args.dry_run {
1357 for meta in &slots_meta {
1358 match meta {
1359 SlotMeta::Skip {
1360 file_str,
1361 derived_base,
1362 name_truncated,
1363 original_name,
1364 original_filename,
1365 reason,
1366 } => {
1367 output::emit_json_compact(&IngestFileEvent {
1368 file: file_str,
1369 name: derived_base,
1370 status: "skip",
1371 truncated: *name_truncated,
1372 original_name: original_name.clone(),
1373 original_filename: original_filename.as_deref(),
1374 error: Some(reason.clone()),
1375 memory_id: None,
1376 action: None,
1377 body_length: 0,
1378 backend_invoked: None,
1379 })?;
1380 }
1381 SlotMeta::Process {
1382 file_str,
1383 derived_name,
1384 name_truncated,
1385 original_name,
1386 original_filename,
1387 } => {
1388 output::emit_json_compact(&IngestFileEvent {
1389 file: file_str,
1390 name: derived_name,
1391 status: "preview",
1392 truncated: *name_truncated,
1393 original_name: original_name.clone(),
1394 original_filename: original_filename.as_deref(),
1395 error: None,
1396 memory_id: None,
1397 action: None,
1398 body_length: 0,
1399 backend_invoked: None,
1400 })?;
1401 }
1402 }
1403 }
1404 output::emit_json_compact(&IngestSummary {
1405 summary: true,
1406 dir: args.dir.to_string_lossy().into_owned(),
1407 pattern: args.pattern.clone(),
1408 recursive: args.recursive,
1409 files_total: total,
1410 files_succeeded: 0,
1411 files_failed: 0,
1412 files_skipped: 0,
1413 elapsed_ms: started.elapsed().as_millis() as u64,
1414 })?;
1415 return Ok(());
1416 }
1417
1418 if args.low_memory {
1420 if let Some(n) = args.ingest_parallelism {
1421 if n > 1 {
1422 return Err(AppError::Validation(
1423 "--ingest-parallelism N>1 conflicts with --low-memory; use one or the other"
1424 .to_string(),
1425 ));
1426 }
1427 }
1428 }
1429
1430 let parallelism = resolve_parallelism(args.low_memory, args.ingest_parallelism);
1433
1434 let pool = rayon::ThreadPoolBuilder::new()
1435 .num_threads(parallelism)
1436 .build()
1437 .map_err(|e| AppError::Internal(anyhow::anyhow!("rayon pool: {e}")))?;
1438
1439 if args.enable_ner && args.skip_extraction {
1440 return Err(AppError::Validation(
1441 "--enable-ner and --skip-extraction are mutually exclusive; remove one".to_string(),
1442 ));
1443 }
1444 if args.skip_extraction && !args.enable_ner {
1445 tracing::warn!(
1452 "--skip-extraction is deprecated since v1.0.45 and has no effect (NER is disabled by default); remove this flag to silence the warning"
1453 );
1454 }
1455 let enable_ner = args.enable_ner;
1456 let auto_describe = args.auto_describe && !args.no_auto_describe;
1457 let max_rss_mb = args.max_rss_mb;
1458 let llm_parallelism = args.llm_parallelism as usize;
1459 if args.mode == IngestMode::Gliner {
1463 tracing::warn!(
1464 "--mode gliner is deprecated since v1.0.79 (the GLiNER pipeline was removed); it now performs URL-regex extraction only — use --mode claude-code or --mode codex for LLM-curated extraction"
1465 );
1466 }
1467 if args.gliner_variant != "fp32" {
1468 tracing::warn!(
1469 "--gliner-variant is deprecated and has no effect since v1.0.79 (the GLiNER pipeline was removed)"
1470 );
1471 }
1472 let gliner_variant: crate::extraction::GlinerVariant = match args.gliner_variant.as_str() {
1473 "int8" => crate::extraction::GlinerVariant::Int8,
1474 _ => crate::extraction::GlinerVariant::Fp32,
1475 };
1476
1477 let total_to_process = process_items.len();
1478 tracing::info!(
1479 target: "ingest",
1480 phase = "pipeline_start",
1481 files = total_to_process,
1482 ingest_parallelism = parallelism,
1483 "incremental pipeline starting: Phase A (rayon) → channel → Phase B (main thread)",
1484 );
1485
1486 let channel_bound = (parallelism * 2).max(1);
1490 let (tx, rx) = mpsc::sync_channel::<(usize, Result<StagedFile, AppError>)>(channel_bound);
1491
1492 let paths_owned = paths.clone();
1497 let llm_backend_owned = llm_backend;
1498 let embedding_backend_owned = embedding_backend;
1499 let producer_handle = std::thread::spawn(move || {
1500 pool.install(|| {
1501 process_items.into_par_iter().for_each(|item| {
1502 if crate::shutdown_requested() {
1503 return;
1504 }
1505 let t0 = std::time::Instant::now();
1506 let result = stage_file(
1507 item.idx,
1508 &item.path,
1509 &item.derived_name,
1510 &paths_owned,
1511 enable_ner,
1512 gliner_variant,
1513 max_rss_mb,
1514 llm_parallelism,
1515 llm_backend_owned,
1516 embedding_backend_owned,
1517 auto_describe,
1518 );
1519 let elapsed_ms = t0.elapsed().as_millis() as u64;
1520
1521 let (n_entities, n_relationships) = match &result {
1524 Ok(sf) => (sf.entities.len(), sf.relationships.len()),
1525 Err(_) => (0, 0),
1526 };
1527 let progress = StageProgressEvent {
1528 schema_version: 1,
1529 event: "file_extracted",
1530 path: &item.file_str,
1531 ms: elapsed_ms,
1532 entities: n_entities,
1533 relationships: n_relationships,
1534 };
1535 if let Ok(line) = serde_json::to_string(&progress) {
1536 tracing::info!(target: "ingest_progress", "{}", line);
1537 }
1538
1539 let _ = tx.send((item.idx, result));
1543 });
1544 drop(tx);
1546 });
1547 });
1548
1549 let fail_fast = args.fail_fast;
1561
1562 for meta in &slots_meta {
1564 if let SlotMeta::Skip {
1565 file_str,
1566 derived_base,
1567 name_truncated,
1568 original_name,
1569 original_filename,
1570 reason,
1571 } = meta
1572 {
1573 output::emit_json_compact(&IngestFileEvent {
1574 file: file_str,
1575 name: derived_base,
1576 status: "skipped",
1577 truncated: *name_truncated,
1578 original_name: original_name.clone(),
1579 original_filename: original_filename.as_deref(),
1580 error: Some(reason.clone()),
1581 memory_id: None,
1582 action: None,
1583 body_length: 0,
1584 backend_invoked: None,
1585 })?;
1586 skipped += 1;
1587 }
1588 }
1589
1590 let meta_index: std::collections::HashMap<usize, &SlotMeta> = slots_meta
1593 .iter()
1594 .enumerate()
1595 .filter(|(_, m)| matches!(m, SlotMeta::Process { .. }))
1596 .collect();
1597
1598 tracing::info!(
1599 target: "ingest",
1600 phase = "persist_start",
1601 files = total_to_process,
1602 "phase B starting: persisting files incrementally as Phase A completes each one",
1603 );
1604
1605 for (idx, stage_result) in rx {
1609 if crate::shutdown_requested() {
1610 tracing::info!(target: "ingest", "shutdown requested, stopping persistence loop");
1611 break;
1612 }
1613 let meta = meta_index.get(&idx).ok_or_else(|| {
1614 AppError::Internal(anyhow::anyhow!(
1615 "channel idx {idx} has no corresponding Process slot"
1616 ))
1617 })?;
1618 let (file_str, derived_name, name_truncated, original_name, original_filename) = match meta
1619 {
1620 SlotMeta::Process {
1621 file_str,
1622 derived_name,
1623 name_truncated,
1624 original_name,
1625 original_filename,
1626 } => (
1627 file_str,
1628 derived_name,
1629 name_truncated,
1630 original_name,
1631 original_filename,
1632 ),
1633 SlotMeta::Skip { .. } => unreachable!("channel only carries Process results"),
1634 };
1635
1636 let conn = match conn_or_err.as_mut() {
1638 Ok(c) => c,
1639 Err(err_msg) => {
1640 let err_clone = err_msg.clone();
1641 output::emit_json_compact(&IngestFileEvent {
1642 file: file_str,
1643 name: derived_name,
1644 status: "failed",
1645 truncated: *name_truncated,
1646 original_name: original_name.clone(),
1647 original_filename: original_filename.as_deref(),
1648 error: Some(err_clone.clone()),
1649 memory_id: None,
1650 action: None,
1651 body_length: 0,
1652 backend_invoked: None,
1653 })?;
1654 failed += 1;
1655 if fail_fast {
1656 output::emit_json_compact(&IngestSummary {
1657 summary: true,
1658 dir: args.dir.display().to_string(),
1659 pattern: args.pattern.clone(),
1660 recursive: args.recursive,
1661 files_total: total,
1662 files_succeeded: succeeded,
1663 files_failed: failed,
1664 files_skipped: skipped,
1665 elapsed_ms: started.elapsed().as_millis() as u64,
1666 })?;
1667 return Err(AppError::Validation(format!(
1668 "ingest aborted on first failure: {err_clone}"
1669 )));
1670 }
1671 continue;
1672 }
1673 };
1674
1675 let outcome =
1676 stage_result.and_then(|sf| persist_staged(conn, &namespace, &memory_type_str, sf));
1677
1678 match outcome {
1679 Ok(FileSuccess {
1680 memory_id,
1681 action,
1682 body_length,
1683 backend_invoked: file_backend_invoked,
1684 }) => {
1685 output::emit_json_compact(&IngestFileEvent {
1686 file: file_str,
1687 name: derived_name,
1688 status: "indexed",
1689 truncated: *name_truncated,
1690 original_name: original_name.clone(),
1691 original_filename: original_filename.as_deref(),
1692 error: None,
1693 memory_id: Some(memory_id),
1694 action: Some(action),
1695 body_length,
1696 backend_invoked: file_backend_invoked,
1697 })?;
1698 succeeded += 1;
1699 }
1700 Err(ref e) if matches!(e, AppError::Duplicate(_)) => {
1701 output::emit_json_compact(&IngestFileEvent {
1702 file: file_str,
1703 name: derived_name,
1704 status: "skipped",
1705 truncated: *name_truncated,
1706 original_name: original_name.clone(),
1707 original_filename: original_filename.as_deref(),
1708 error: Some(format!("{e}")),
1709 memory_id: None,
1710 action: Some("duplicate".to_string()),
1711 body_length: 0,
1712 backend_invoked: None,
1713 })?;
1714 skipped += 1;
1715 }
1716 Err(e) => {
1717 let err_msg = format!("{e}");
1718 output::emit_json_compact(&IngestFileEvent {
1719 file: file_str,
1720 name: derived_name,
1721 status: "failed",
1722 truncated: *name_truncated,
1723 original_name: original_name.clone(),
1724 original_filename: original_filename.as_deref(),
1725 error: Some(err_msg.clone()),
1726 memory_id: None,
1727 action: None,
1728 body_length: 0,
1729 backend_invoked: None,
1730 })?;
1731 failed += 1;
1732 if fail_fast {
1733 output::emit_json_compact(&IngestSummary {
1734 summary: true,
1735 dir: args.dir.display().to_string(),
1736 pattern: args.pattern.clone(),
1737 recursive: args.recursive,
1738 files_total: total,
1739 files_succeeded: succeeded,
1740 files_failed: failed,
1741 files_skipped: skipped,
1742 elapsed_ms: started.elapsed().as_millis() as u64,
1743 })?;
1744 return Err(AppError::Validation(format!(
1745 "ingest aborted on first failure: {err_msg}"
1746 )));
1747 }
1748 }
1749 }
1750 }
1751
1752 producer_handle
1754 .join()
1755 .map_err(|_| AppError::Internal(anyhow::anyhow!("ingest producer thread panicked")))?;
1756
1757 if let Ok(ref conn) = conn_or_err {
1758 if succeeded > 0 {
1759 let _ = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);");
1760 }
1761 }
1762
1763 output::emit_json_compact(&IngestSummary {
1764 summary: true,
1765 dir: args.dir.display().to_string(),
1766 pattern: args.pattern.clone(),
1767 recursive: args.recursive,
1768 files_total: total,
1769 files_succeeded: succeeded,
1770 files_failed: failed,
1771 files_skipped: skipped,
1772 elapsed_ms: started.elapsed().as_millis() as u64,
1773 })?;
1774
1775 if args.enrich_after && succeeded > 0 {
1776 output::emit_json_compact(&serde_json::json!({
1777 "event": "enrich_phase_started",
1778 "operation": "memory-bindings"
1779 }))?;
1780 let enrich_args = super::enrich::EnrichArgs {
1781 operation: super::enrich::EnrichOperation::MemoryBindings,
1782 mode: super::enrich::EnrichMode::ClaudeCode,
1783 limit: None,
1784 dry_run: false,
1785 namespace: args.namespace.clone(),
1786 claude_binary: args.claude_binary.clone(),
1787 claude_model: args.claude_model.clone(),
1788 claude_timeout: args.claude_timeout,
1789 codex_binary: args.codex_binary.clone(),
1790 codex_model: args.codex_model.clone(),
1791 codex_timeout: args.codex_timeout,
1792 opencode_binary: args.opencode_binary.clone(),
1793 opencode_model: args.opencode_model.clone(),
1794 opencode_timeout: args.opencode_timeout,
1795 db: args.db.clone(),
1796 json: false,
1797 resume: false,
1798 retry_failed: false,
1799 max_cost_usd: args.max_cost_usd,
1800 llm_parallelism: args.llm_parallelism as u32,
1801 wait_job_singleton: args.wait_job_singleton,
1802 force_job_singleton: args.force_job_singleton,
1803 names: Vec::new(),
1804 names_file: None,
1805 preflight_check: false,
1806 fallback_mode: None,
1807 rate_limit_buffer: 300,
1808 max_load_check: true,
1809 circuit_breaker_threshold: 5,
1810 preserve_threshold: 0.7,
1811 codex_model_validate: true,
1812 codex_model_fallback: None,
1813 min_output_chars: 500,
1814 max_output_chars: 2000,
1815 preserve_check: true,
1816 prompt_template: None,
1817 };
1818 match super::enrich::run(&enrich_args, llm_backend, embedding_backend) {
1819 Ok(()) => {
1820 output::emit_json_compact(&serde_json::json!({
1821 "event": "enrich_phase_completed"
1822 }))?;
1823 }
1824 Err(e) => {
1825 tracing::warn!(error = %e, "enrich --operation memory-bindings failed after ingest");
1826 output::emit_json_compact(&serde_json::json!({
1827 "event": "enrich_phase_failed",
1828 "error": e.to_string()
1829 }))?;
1830 }
1831 }
1832 }
1833
1834 Ok(())
1835}
1836
1837fn init_storage(paths: &AppPaths) -> Result<Connection, AppError> {
1843 ensure_db_ready(paths)?;
1844 let conn = open_rw(&paths.db)?;
1845 Ok(conn)
1846}
1847
1848pub(crate) fn collect_files(
1849 dir: &Path,
1850 pattern: &str,
1851 recursive: bool,
1852 out: &mut Vec<PathBuf>,
1853) -> Result<(), AppError> {
1854 let entries = std::fs::read_dir(dir).map_err(AppError::Io)?;
1855 for entry in entries {
1856 let entry = entry.map_err(AppError::Io)?;
1857 let path = entry.path();
1858 let file_type = entry.file_type().map_err(AppError::Io)?;
1859 if file_type.is_file() {
1860 let name = entry.file_name();
1861 let name_str = name.to_string_lossy();
1862 if matches_pattern(&name_str, pattern) {
1863 out.push(path);
1864 }
1865 } else if file_type.is_dir() && recursive {
1866 collect_files(&path, pattern, recursive, out)?;
1867 }
1868 }
1869 Ok(())
1870}
1871
/// Tests a file name against a minimal glob in which `*` matches any run of
/// characters (including none) and every other character matches itself.
///
/// Matching is case-sensitive and anchored at both ends:
///
/// * `*.md` matches names ending in `.md`; `README*` matches names starting
///   with `README`; `*` matches everything.
/// * `notes-*.md`, `*draft*` and other patterns with interior or multiple
///   wildcards are supported.
/// * A pattern without `*` must equal the name exactly.
fn matches_pattern(name: &str, pattern: &str) -> bool {
    // No wildcard at all: exact comparison.
    let (head, rest) = match pattern.split_once('*') {
        Some(parts) => parts,
        None => return name == pattern,
    };
    // Everything after the last `*` must be a literal suffix; what lies between
    // the first and last `*` is a sequence of literals that must occur in order.
    let (middle, tail) = match rest.rsplit_once('*') {
        Some(parts) => parts,
        None => ("", rest),
    };

    // Peel the literal prefix and suffix off without letting them overlap.
    let mut remaining = match name
        .strip_prefix(head)
        .and_then(|after_head| after_head.strip_suffix(tail))
    {
        Some(body) => body,
        None => return false,
    };

    // Leftmost matching of each literal is sufficient for `*`-only globs.
    for literal in middle.split('*').filter(|literal| !literal.is_empty()) {
        match remaining.find(literal) {
            Some(at) => remaining = &remaining[at + literal.len()..],
            None => return false,
        }
    }
    true
}
1881
1882pub(crate) fn derive_kebab_name(path: &Path, max_len: usize) -> (String, bool, Option<String>) {
1893 let stem = path.file_stem().and_then(|s| s.to_str()).unwrap_or("");
1894 let lowered: String = stem
1895 .nfd()
1896 .filter(|c| !unicode_normalization::char::is_combining_mark(*c))
1897 .map(|c| {
1898 if c == '_' || c.is_whitespace() {
1899 '-'
1900 } else {
1901 c
1902 }
1903 })
1904 .map(|c| c.to_ascii_lowercase())
1905 .filter(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || *c == '-')
1906 .collect();
1907 let collapsed = collapse_dashes(&lowered);
1908 let trimmed_raw = collapsed.trim_matches('-').to_string();
1909 let trimmed = if trimmed_raw.starts_with(|c: char| c.is_ascii_digit()) {
1911 format!("doc-{trimmed_raw}")
1912 } else {
1913 trimmed_raw
1914 };
1915 if trimmed.len() > max_len {
1916 let truncated = trimmed[..max_len].trim_matches('-').to_string();
1917 tracing::debug!(
1918 target: "ingest",
1919 original = %trimmed,
1920 truncated_to = %truncated,
1921 max_len = max_len,
1922 "derived memory name truncated to fit length cap; collisions will be resolved with numeric suffixes"
1923 );
1924 (truncated, true, Some(trimmed))
1925 } else {
1926 (trimmed, false, None)
1927 }
1928}
1929
1930fn unique_name(base: &str, taken: &BTreeSet<String>) -> Result<String, AppError> {
1943 if !taken.contains(base) {
1944 return Ok(base.to_string());
1945 }
1946 for suffix in 1..=MAX_NAME_COLLISION_SUFFIX {
1947 let candidate = format!("{base}-{suffix}");
1948 if !taken.contains(&candidate) {
1949 tracing::warn!(
1950 target: "ingest",
1951 base = %base,
1952 resolved = %candidate,
1953 suffix,
1954 "memory name collision resolved with numeric suffix"
1955 );
1956 return Ok(candidate);
1957 }
1958 }
1959 Err(AppError::Validation(format!(
1960 "too many name collisions for base '{base}' (>{MAX_NAME_COLLISION_SUFFIX}); rename source files to disambiguate"
1961 )))
1962}
1963
/// Returns `s` with every run of consecutive `-` reduced to a single `-`.
/// All other characters are copied through unchanged.
fn collapse_dashes(s: &str) -> String {
    s.chars()
        .fold(String::with_capacity(s.len()), |mut acc, ch| {
            // A dash is dropped only when the output already ends with one.
            if ch != '-' || !acc.ends_with('-') {
                acc.push(ch);
            }
            acc
        })
}
1980
/// Unit tests for the file-collection, name-derivation and parallelism helpers
/// of the ingest command.
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    // --- matches_pattern ---------------------------------------------------

    #[test]
    fn matches_pattern_suffix() {
        assert!(matches_pattern("foo.md", "*.md"));
        assert!(!matches_pattern("foo.txt", "*.md"));
        assert!(matches_pattern("foo.md", "*"));
    }

    #[test]
    fn matches_pattern_prefix() {
        assert!(matches_pattern("README.md", "README*"));
        assert!(!matches_pattern("CHANGELOG.md", "README*"));
    }

    #[test]
    fn matches_pattern_exact() {
        assert!(matches_pattern("README.md", "README.md"));
        assert!(!matches_pattern("readme.md", "README.md"));
    }

    // --- derive_kebab_name / collapse_dashes -------------------------------

    #[test]
    fn derive_kebab_underscore_to_dash() {
        let p = PathBuf::from("/tmp/claude_code_headless.md");
        let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert_eq!(name, "claude-code-headless");
        assert!(!truncated);
        assert!(original.is_none());
    }

    #[test]
    fn derive_kebab_uppercase_lowered() {
        let p = PathBuf::from("/tmp/README.md");
        let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert_eq!(name, "readme");
        assert!(!truncated);
        assert!(original.is_none());
    }

    #[test]
    fn derive_kebab_strips_non_kebab_chars() {
        let p = PathBuf::from("/tmp/some@weird#name!.md");
        let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert_eq!(name, "someweirdname");
        assert!(!truncated);
        assert!(original.is_none());
    }

    #[test]
    fn derive_kebab_folds_accented_letters_to_ascii() {
        let p = PathBuf::from("/tmp/açaí.md");
        let (name, _, _) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert_eq!(name, "acai", "got '{name}'");
    }

    #[test]
    fn derive_kebab_handles_naive_with_diaeresis() {
        let p = PathBuf::from("/tmp/naïve-test.md");
        let (name, _, _) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert_eq!(name, "naive-test", "got '{name}'");
    }

    #[test]
    fn derive_kebab_drops_emoji_keeps_word() {
        let p = PathBuf::from("/tmp/🚀-rocket.md");
        let (name, _, _) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert_eq!(name, "rocket", "got '{name}'");
    }

    #[test]
    fn derive_kebab_mixed_unicode_emoji_keeps_letters() {
        let p = PathBuf::from("/tmp/açaí🦜.md");
        let (name, _, _) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert_eq!(name, "acai", "got '{name}'");
    }

    #[test]
    fn derive_kebab_pure_emoji_yields_empty() {
        let p = PathBuf::from("/tmp/🦜🚀🌟.md");
        let (name, _, _) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert!(name.is_empty(), "got '{name}'");
    }

    #[test]
    fn derive_kebab_collapses_consecutive_dashes() {
        let p = PathBuf::from("/tmp/a__b___c.md");
        let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert_eq!(name, "a-b-c");
        assert!(!truncated);
        assert!(original.is_none());
    }

    /// A stem that starts with a digit gets the `doc-` prefix.
    #[test]
    fn derive_kebab_prefixes_leading_digit_with_doc() {
        let p = PathBuf::from("/tmp/2024-notes.md");
        let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert_eq!(name, "doc-2024-notes");
        assert!(!truncated);
        assert!(original.is_none());
    }

    #[test]
    fn derive_kebab_truncates_to_60_chars() {
        let p = PathBuf::from(format!("/tmp/{}.md", "a".repeat(80)));
        let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert!(name.len() <= 60, "got len {}", name.len());
        assert!(truncated);
        assert!(original.is_some());
        assert!(original.unwrap().len() > 60);
    }

    #[test]
    fn derive_kebab_long_basename_truncated_within_cap() {
        let p = PathBuf::from(format!("/tmp/{}.md", "a".repeat(120)));
        let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert!(
            name.len() <= DERIVED_NAME_MAX_LEN,
            "truncated name must respect cap; got {} chars",
            name.len()
        );
        assert!(!name.is_empty());
        assert!(truncated);
        assert!(original.is_some());
    }

    /// Dash runs shrink to one dash; edge dashes are kept by this helper.
    #[test]
    fn collapse_dashes_merges_runs() {
        assert_eq!(collapse_dashes(""), "");
        assert_eq!(collapse_dashes("a--b---c"), "a-b-c");
        assert_eq!(collapse_dashes("--a--"), "-a-");
    }

    // --- collect_files -----------------------------------------------------

    #[test]
    fn collect_files_finds_md_files() {
        let tmp = tempfile::tempdir().expect("tempdir");
        std::fs::write(tmp.path().join("a.md"), "x").unwrap();
        std::fs::write(tmp.path().join("b.md"), "y").unwrap();
        std::fs::write(tmp.path().join("c.txt"), "z").unwrap();
        let mut out = Vec::new();
        collect_files(tmp.path(), "*.md", false, &mut out).expect("collect");
        assert_eq!(out.len(), 2, "should find 2 .md files, got {out:?}");
    }

    #[test]
    fn collect_files_recursive_descends_subdirs() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let sub = tmp.path().join("sub");
        std::fs::create_dir(&sub).unwrap();
        std::fs::write(tmp.path().join("a.md"), "x").unwrap();
        std::fs::write(sub.join("b.md"), "y").unwrap();
        let mut out = Vec::new();
        collect_files(tmp.path(), "*.md", true, &mut out).expect("collect");
        assert_eq!(out.len(), 2);
    }

    #[test]
    fn collect_files_non_recursive_skips_subdirs() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let sub = tmp.path().join("sub");
        std::fs::create_dir(&sub).unwrap();
        std::fs::write(tmp.path().join("a.md"), "x").unwrap();
        std::fs::write(sub.join("b.md"), "y").unwrap();
        let mut out = Vec::new();
        collect_files(tmp.path(), "*.md", false, &mut out).expect("collect");
        assert_eq!(out.len(), 1);
    }

    // --- unique_name -------------------------------------------------------

    #[test]
    fn unique_name_returns_base_when_free() {
        let taken: BTreeSet<String> = BTreeSet::new();
        let resolved = unique_name("note", &taken).expect("must resolve");
        assert_eq!(resolved, "note");
    }

    #[test]
    fn unique_name_appends_first_free_suffix_on_collision() {
        let mut taken: BTreeSet<String> = BTreeSet::new();
        taken.insert("note".to_string());
        taken.insert("note-1".to_string());
        let resolved = unique_name("note", &taken).expect("must resolve");
        assert_eq!(resolved, "note-2");
    }

    #[test]
    fn unique_name_errors_after_collision_cap() {
        let mut taken: BTreeSet<String> = BTreeSet::new();
        taken.insert("note".to_string());
        for i in 1..=MAX_NAME_COLLISION_SUFFIX {
            taken.insert(format!("note-{i}"));
        }
        let err = unique_name("note", &taken).expect_err("must surface error");
        assert!(matches!(err, AppError::Validation(_)));
    }

    // --- relation parsing --------------------------------------------------

    #[test]
    fn validate_relation_format_accepts_valid_relations() {
        use crate::parsers::{is_canonical_relation, validate_relation_format};
        assert!(validate_relation_format("applies_to").is_ok());
        assert!(validate_relation_format("depends_on").is_ok());
        assert!(validate_relation_format("implements").is_ok());
        assert!(validate_relation_format("").is_err());
        assert!(is_canonical_relation("applies_to"));
        assert!(!is_canonical_relation("implements"));
    }

    // --- low-memory environment handling -----------------------------------

    use serial_test::serial;

    /// Runs `f` with `SQLITE_GRAPHRAG_LOW_MEMORY` set to `value` (or unset for
    /// `None`) and restores the previous state afterwards.
    ///
    /// Restoration happens in a drop guard so it also runs when `f` panics;
    /// otherwise a single failing assertion would leak the variable into the
    /// tests that run after it. Callers must be `#[serial]` because the
    /// process environment is shared between test threads.
    fn with_env_var<F: FnOnce()>(value: Option<&str>, f: F) {
        const KEY: &str = "SQLITE_GRAPHRAG_LOW_MEMORY";

        /// Puts the variable back to its captured value when dropped.
        struct Restore(Option<String>);

        impl Drop for Restore {
            fn drop(&mut self) {
                match self.0.take() {
                    Some(previous) => std::env::set_var(KEY, previous),
                    None => std::env::remove_var(KEY),
                }
            }
        }

        let _restore = Restore(std::env::var(KEY).ok());
        match value {
            Some(v) => std::env::set_var(KEY, v),
            None => std::env::remove_var(KEY),
        }
        f();
    }

    #[test]
    #[serial]
    fn env_low_memory_enabled_unset_returns_false() {
        with_env_var(None, || assert!(!env_low_memory_enabled()));
    }

    #[test]
    #[serial]
    fn env_low_memory_enabled_empty_returns_false() {
        with_env_var(Some(""), || assert!(!env_low_memory_enabled()));
    }

    #[test]
    #[serial]
    fn env_low_memory_enabled_truthy_values_return_true() {
        for v in ["1", "true", "TRUE", "yes", "YES", "on", "On"] {
            with_env_var(Some(v), || {
                assert!(env_low_memory_enabled(), "value {v:?} should be truthy")
            });
        }
    }

    #[test]
    #[serial]
    fn env_low_memory_enabled_falsy_values_return_false() {
        for v in ["0", "false", "FALSE", "no", "off"] {
            with_env_var(Some(v), || {
                assert!(!env_low_memory_enabled(), "value {v:?} should be falsy")
            });
        }
    }

    #[test]
    #[serial]
    fn env_low_memory_enabled_unrecognized_value_returns_false() {
        with_env_var(Some("maybe"), || assert!(!env_low_memory_enabled()));
    }

    // --- resolve_parallelism -----------------------------------------------

    #[test]
    #[serial]
    fn resolve_parallelism_flag_forces_one_overriding_explicit_value() {
        with_env_var(None, || {
            assert_eq!(resolve_parallelism(true, Some(4)), 1);
            assert_eq!(resolve_parallelism(true, Some(8)), 1);
            assert_eq!(resolve_parallelism(true, None), 1);
        });
    }

    #[test]
    #[serial]
    fn resolve_parallelism_env_forces_one_when_flag_off() {
        with_env_var(Some("1"), || {
            assert_eq!(resolve_parallelism(false, Some(4)), 1);
            assert_eq!(resolve_parallelism(false, None), 1);
        });
    }

    #[test]
    #[serial]
    fn resolve_parallelism_falsy_env_does_not_override() {
        with_env_var(Some("0"), || {
            assert_eq!(resolve_parallelism(false, Some(4)), 4);
        });
    }

    #[test]
    #[serial]
    fn resolve_parallelism_explicit_value_when_low_memory_off() {
        with_env_var(None, || {
            assert_eq!(resolve_parallelism(false, Some(3)), 3);
            assert_eq!(resolve_parallelism(false, Some(1)), 1);
        });
    }

    #[test]
    #[serial]
    fn resolve_parallelism_default_when_unset() {
        with_env_var(None, || {
            let p = resolve_parallelism(false, None);
            assert!((1..=4).contains(&p), "default must be in [1, 4]; got {p}");
        });
    }

    // --- CLI parsing -------------------------------------------------------

    #[test]
    fn ingest_args_parses_low_memory_flag_via_clap() {
        use clap::Parser;
        let cli = crate::cli::Cli::try_parse_from([
            "sqlite-graphrag",
            "ingest",
            "/tmp/dummy",
            "--type",
            "document",
            "--low-memory",
        ])
        .expect("parse must succeed");
        match cli.command {
            Some(crate::cli::Commands::Ingest(args)) => {
                assert!(args.low_memory, "--low-memory must set field to true");
            }
            _ => panic!("expected Ingest subcommand"),
        }
    }

    #[test]
    fn ingest_args_low_memory_defaults_false() {
        use clap::Parser;
        let cli = crate::cli::Cli::try_parse_from([
            "sqlite-graphrag",
            "ingest",
            "/tmp/dummy",
            "--type",
            "document",
        ])
        .expect("parse must succeed");
        match cli.command {
            Some(crate::cli::Commands::Ingest(args)) => {
                assert!(!args.low_memory, "default must be false");
            }
            _ => panic!("expected Ingest subcommand"),
        }
    }
}