1use crate::chunking;
35use crate::cli::MemoryType;
36use crate::entity_type::EntityType;
37use crate::errors::AppError;
38use crate::i18n::errors_msg;
39use crate::output::{self, JsonOutputFormat};
40use crate::paths::AppPaths;
41use crate::storage::chunks as storage_chunks;
42use crate::storage::connection::{ensure_db_ready, open_rw};
43use crate::storage::entities::{NewEntity, NewRelationship};
44use crate::storage::memories::NewMemory;
45use crate::storage::{entities, memories, urls as storage_urls, versions};
46use rayon::prelude::*;
47use rusqlite::Connection;
48use serde::Serialize;
49use std::collections::BTreeSet;
50use std::path::{Path, PathBuf};
51use std::sync::mpsc;
52use unicode_normalization::UnicodeNormalization;
53
54use crate::constants::DERIVED_NAME_MAX_LEN;
55
// NOTE(review): the code that reads this constant is outside the part of the
// file visible here. Judging by the name it presumably caps the numeric suffix
// tried when two files derive the same memory name — confirm at the use site.
const MAX_NAME_COLLISION_SUFFIX: usize = 1000;
59
// Command-line arguments accepted by the `ingest` subcommand.
//
// NOTE: the notes on the fields below are plain `//` comments on purpose.
// With clap's derive macros a `///` doc comment is picked up as help text, so
// using doc comments here could alter the generated `--help` output.
#[derive(clap::Args)]
#[command(after_long_help = "EXAMPLES:\n \
 # Ingest every Markdown file under ./docs as `document` memories\n \
 sqlite-graphrag ingest ./docs --type document\n\n \
 # Ingest .txt files recursively under ./notes\n \
 sqlite-graphrag ingest ./notes --type note --pattern '*.txt' --recursive\n\n \
 # Enable automatic URL extraction (URL-regex only since v1.0.79)\n \
 sqlite-graphrag ingest ./big-corpus --type reference --enable-ner\n\n \
 # Preview file-to-name mapping without ingesting\n \
 sqlite-graphrag ingest ./docs --dry-run\n\n \
 # LLM-curated extraction via Claude Code CLI\n \
 sqlite-graphrag ingest ./docs --mode claude-code --recursive --json\n\n \
 # Resume interrupted claude-code ingest\n \
 sqlite-graphrag ingest ./docs --mode claude-code --resume --json\n\n \
 # Claude Code with budget cap and custom timeout\n \
 sqlite-graphrag ingest ./docs --mode claude-code --max-cost-usd 5.00 --claude-timeout 600 --json\n\n \
AUTHENTICATION:\n \
 --mode claude-code: Uses existing Claude Code authentication.\n \
 OAuth (Pro/Max/Team): works automatically from ~/.claude/.credentials.json\n \
 API key: set ANTHROPIC_API_KEY for faster startup (optional)\n\n \
 --mode codex: Uses existing Codex CLI authentication.\n \
 Device auth: run `codex auth login` first\n \
 API key: set OPENAI_API_KEY (optional)\n\n \
NOTES:\n \
 Each file becomes a separate memory. Names derive from file basenames\n \
 (kebab-case, lowercase, ASCII). Output is NDJSON: one JSON object per file,\n \
 followed by a final summary line with counts. Per-file errors are reported\n \
 inline and processing continues unless --fail-fast is set.")]
pub struct IngestArgs {
    // Positional argument (no `long`/`short`): the directory to scan.
    // NOTE(review): the help text says "recursively", but `--recursive`
    // below defaults to false — confirm which wording is intended.
    #[arg(
        value_name = "DIR",
        help = "Directory to ingest recursively (each matching file becomes a memory)"
    )]
    pub dir: PathBuf,

    // `--type`: memory type stored for every ingested file; `run` passes its
    // `as_str()` form to the persistence layer.
    #[arg(long, value_enum, default_value_t = MemoryType::Document)]
    pub r#type: MemoryType,

    // `--pattern`: file-name pattern handed to `collect_files` by `run`.
    #[arg(long, default_value = "*.md")]
    pub pattern: String,

    // `--recursive`: forwarded to `collect_files` by `run`.
    #[arg(long, default_value_t = false)]
    pub recursive: bool,

    // `--enable-ner[=BOOL]`: may be given bare (`default_missing_value`
    // makes that mean "true"), with an explicit value, or via the env var.
    #[arg(
        long,
        env = "SQLITE_GRAPHRAG_ENABLE_NER",
        value_parser = crate::parsers::parse_bool_flexible,
        action = clap::ArgAction::Set,
        num_args = 0..=1,
        default_missing_value = "true",
        default_value = "false",
        help = "Enable automatic URL-regex extraction (the GLiNER NER pipeline was removed in v1.0.79)"
    )]
    pub enable_ner: bool,

    // `--auto-describe` / `--no-auto-describe` form an on/off pair; the
    // positive flag declares `overrides_with` the negative one.
    #[arg(
        long,
        default_value_t = true,
        overrides_with = "no_auto_describe",
        help = "Derive memory description from the first meaningful body line instead of the legacy `ingested from <path>` placeholder."
    )]
    pub auto_describe: bool,
    #[arg(
        long = "no-auto-describe",
        default_value_t = false,
        help = "Disable `--auto-describe` and fall back to the legacy `ingested from <path>` description placeholder."
    )]
    pub no_auto_describe: bool,
    #[arg(
        long,
        env = "SQLITE_GRAPHRAG_GLINER_VARIANT",
        default_value = "fp32",
        help = "DEPRECATED: no effect since v1.0.79 (the GLiNER pipeline was removed); accepted for compatibility only"
    )]
    pub gliner_variant: String,

    // Hidden from `--help`. NOTE(review): its consumer is not visible in this
    // part of the file.
    #[arg(long, default_value_t = false, hide = true)]
    pub skip_extraction: bool,

    // `--fail-fast`: per the long help above, per-file errors otherwise do
    // not stop processing.
    #[arg(long, default_value_t = false)]
    pub fail_fast: bool,

    // `--dry-run`: per the long help above, previews the file-to-name mapping
    // without ingesting.
    #[arg(long, default_value_t = false)]
    pub dry_run: bool,

    // `--max-files`: `run` rejects the whole invocation with a validation
    // error when more matching files than this are found.
    #[arg(long, default_value_t = 10_000)]
    pub max_files: usize,

    // `--namespace`: resolved through `crate::namespace::resolve_namespace`.
    #[arg(long)]
    pub namespace: Option<String>,

    // `--db`: database location override, resolved via `AppPaths::resolve`.
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,

    #[arg(long, value_enum, default_value_t = JsonOutputFormat::Json)]
    pub format: JsonOutputFormat,

    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
    pub json: bool,

    #[arg(
        long,
        help = "Number of files to extract+embed in parallel; default = max(1, cpus/2).min(4)"
    )]
    pub ingest_parallelism: Option<usize>,

    #[arg(
        long,
        default_value_t = false,
        help = "Forces single-threaded ingest (--ingest-parallelism 1) to reduce RSS pressure. \
 Recommended for environments with <4 GB available RAM or container/cgroup \
 constraints. Trade-off: 3-4x longer wall time. Also honored via \
 SQLITE_GRAPHRAG_LOW_MEMORY=1 env var."
    )]
    pub low_memory: bool,

    #[arg(long, default_value_t = crate::constants::DEFAULT_MAX_RSS_MB,
        help = "Maximum process RSS in MiB; abort if exceeded during embedding (default: 8192)")]
    pub max_rss_mb: u64,

    // clap itself rejects values outside 1..=32 through the ranged parser.
    #[arg(long, default_value_t = 2, value_name = "N",
        value_parser = clap::value_parser!(u64).range(1..=32),
        help = "Maximum simultaneous LLM embedding subprocesses per file (default: 2, clamp [1,32])")]
    pub llm_parallelism: u64,

    #[arg(long, default_value_t = crate::constants::DERIVED_NAME_MAX_LEN,
        help = "Maximum length for derived memory names (default: 60)")]
    pub max_name_length: usize,

    // `--mode`: selects the ingest pipeline; `run` dispatches on it.
    #[arg(long, value_enum, default_value_t = IngestMode::None)]
    pub mode: IngestMode,

    // The flags from here down to `opencode_timeout` are mode-specific.
    // `validate_mode_conditional_flags_ingest` rejects them when they are set
    // (or moved off their defaults) under a mode that ignores them.
    #[arg(long, env = "SQLITE_GRAPHRAG_CLAUDE_BINARY")]
    pub claude_binary: Option<std::path::PathBuf>,

    #[arg(long)]
    pub claude_model: Option<String>,

    #[arg(long, default_value_t = false)]
    pub resume: bool,

    #[arg(long, default_value_t = false)]
    pub retry_failed: bool,

    #[arg(long, default_value_t = false)]
    pub keep_queue: bool,

    #[arg(long, default_value = ".ingest-queue.sqlite")]
    pub queue_db: String,

    // Default 60 must stay in sync with `DEFAULT_RATE_LIMIT_WAIT` in
    // `validate_mode_conditional_flags_ingest`.
    #[arg(long, default_value_t = 60)]
    pub rate_limit_wait: u64,

    #[arg(long)]
    pub max_cost_usd: Option<f64>,

    // The three `*_timeout` defaults (300) must stay in sync with
    // `DEFAULT_TIMEOUT` in `validate_mode_conditional_flags_ingest`.
    #[arg(
        long,
        default_value_t = 300,
        help = "Timeout in seconds for each claude -p invocation (default: 300)"
    )]
    pub claude_timeout: u64,

    #[arg(
        long,
        env = "SQLITE_GRAPHRAG_CODEX_BINARY",
        help = "Explicit path to the Codex CLI binary (only with --mode codex)"
    )]
    pub codex_binary: Option<PathBuf>,

    #[arg(
        long,
        help = "Model override for Codex extraction (e.g. o4-mini, gpt-5.1-codex)"
    )]
    pub codex_model: Option<String>,

    #[arg(
        long,
        default_value_t = 300,
        help = "Timeout in seconds for each codex exec invocation (default: 300)"
    )]
    pub codex_timeout: u64,

    #[arg(long, value_name = "PATH", env = "SQLITE_GRAPHRAG_OPENCODE_BINARY")]
    pub opencode_binary: Option<PathBuf>,

    #[arg(
        long,
        value_name = "MODEL",
        env = "SQLITE_GRAPHRAG_OPENCODE_MODEL",
        help = "Model override for OpenCode extraction"
    )]
    pub opencode_model: Option<String>,

    #[arg(
        long,
        value_name = "SECONDS",
        env = "SQLITE_GRAPHRAG_OPENCODE_TIMEOUT",
        default_value_t = 300,
        help = "Timeout in seconds for each opencode run invocation (default: 300)"
    )]
    pub opencode_timeout: u64,

    // NOTE(review): the consumers of the two `*_job_singleton` flags are not
    // visible in this part of the file.
    #[arg(long, value_name = "SECONDS")]
    pub wait_job_singleton: Option<u64>,

    #[arg(long, default_value_t = false)]
    pub force_job_singleton: bool,

    #[arg(
        long,
        default_value_t = false,
        help = "Run enrich --operation memory-bindings after all files are ingested"
    )]
    pub enrich_after: bool,
}
330
// Pipeline selected with `--mode`.
//
// Plain `//` comments are used because clap's `ValueEnum` derive can surface
// `///` doc comments on variants as help text for the possible values.
#[derive(Clone, Debug, PartialEq, Eq, clap::ValueEnum)]
pub enum IngestMode {
    // Default value of `IngestArgs::mode`; handled by the local pipeline in `run`.
    None,
    // Treated exactly like `None` by the flag validation and by the dispatch in `run`.
    Gliner,
    // `run` delegates to `super::ingest_claude::run_claude_ingest`.
    ClaudeCode,
    // `run` delegates to `super::ingest_codex::run_codex_ingest`.
    Codex,
    // `run` delegates to `super::ingest_opencode::run_opencode_ingest`.
    // The CLI spelling is pinned to `opencode` explicitly.
    #[value(name = "opencode")]
    Opencode,
}
346
347fn env_low_memory_enabled() -> bool {
352 match std::env::var("SQLITE_GRAPHRAG_LOW_MEMORY") {
353 Ok(v) if v.is_empty() => false,
354 Ok(v) => match v.to_lowercase().as_str() {
355 "1" | "true" | "yes" | "on" => true,
356 "0" | "false" | "no" | "off" => false,
357 other => {
358 tracing::warn!(
359 target: "ingest",
360 value = %other,
361 "SQLITE_GRAPHRAG_LOW_MEMORY value not recognized; treating as disabled"
362 );
363 false
364 }
365 },
366 Err(_) => false,
367 }
368}
369
370fn resolve_parallelism(low_memory_flag: bool, ingest_parallelism: Option<usize>) -> usize {
382 let env_flag = env_low_memory_enabled();
383 let low_memory = low_memory_flag || env_flag;
384
385 if low_memory {
386 if let Some(n) = ingest_parallelism {
387 if n > 1 {
388 tracing::warn!(
389 target: "ingest",
390 requested = n,
391 "--ingest-parallelism overridden by --low-memory; using 1"
392 );
393 }
394 }
395 if low_memory_flag {
396 tracing::info!(
397 target: "ingest",
398 source = "flag",
399 "low-memory mode enabled: forcing --ingest-parallelism 1"
400 );
401 } else {
402 tracing::info!(
403 target: "ingest",
404 source = "env",
405 "low-memory mode enabled via SQLITE_GRAPHRAG_LOW_MEMORY: forcing --ingest-parallelism 1"
406 );
407 }
408 return 1;
409 }
410
411 ingest_parallelism
412 .unwrap_or_else(|| {
413 std::thread::available_parallelism()
414 .map(|v| v.get() / 2)
415 .unwrap_or(1)
416 .clamp(1, 4)
417 })
418 .max(1)
419}
420
/// Serializable record describing the outcome for one input file.
///
/// NOTE(review): the emission site is outside the visible part of the file;
/// presumably this is the per-file NDJSON object mentioned in the command's
/// long help — confirm. Every `Option` field is omitted from the serialized
/// output when it is `None`.
#[derive(Serialize)]
struct IngestFileEvent<'a> {
    file: &'a str,
    name: &'a str,
    status: &'a str,
    truncated: bool,
    #[serde(skip_serializing_if = "Option::is_none")]
    original_name: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    original_filename: Option<&'a str>,
    #[serde(skip_serializing_if = "Option::is_none")]
    error: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    memory_id: Option<i64>,
    #[serde(skip_serializing_if = "Option::is_none")]
    action: Option<String>,
    // Always serialized (no `skip_serializing_if`), unlike the optional fields.
    body_length: usize,
    #[serde(skip_serializing_if = "Option::is_none")]
    backend_invoked: Option<&'a str>,
}
449
/// Serializable end-of-run summary with the scan parameters and file counters.
///
/// NOTE(review): presumably the "final summary line with counts" described in
/// the command's long help; the code that fills and emits it is not visible
/// in this part of the file.
#[derive(Serialize)]
struct IngestSummary {
    summary: bool,
    dir: String,
    pattern: String,
    recursive: bool,
    files_total: usize,
    files_succeeded: usize,
    files_failed: usize,
    files_skipped: usize,
    elapsed_ms: u64,
}
462
/// Result returned by `persist_staged` after a file has been stored.
struct FileSuccess {
    /// Id returned by `memories::insert` for the new memory.
    memory_id: i64,
    /// `persist_staged` always sets this to `"created"`.
    action: String,
    /// Byte length of the stored body (`String::len`).
    body_length: usize,
    /// Copied from `StagedFile::backend_invoked`.
    backend_invoked: Option<&'static str>,
}
470
/// Serializable progress record carrying an event name, a path, a duration in
/// milliseconds and entity/relationship counts.
///
/// NOTE(review): no code in the visible part of the file constructs this
/// struct, so the exact meaning of `event` and `ms` should be confirmed at
/// the emission site.
#[derive(Serialize)]
struct StageProgressEvent<'a> {
    schema_version: u8,
    event: &'a str,
    path: &'a str,
    ms: u64,
    entities: usize,
    relationships: usize,
}
482
/// Everything `stage_file` prepares for one file so that `persist_staged`
/// can write it to the database.
struct StagedFile {
    /// Full file contents as read from disk.
    body: String,
    /// Hex-encoded BLAKE3 hash of `body`.
    body_hash: String,
    /// First 200 characters of `body`.
    snippet: String,
    /// Memory name, already validated by `stage_file`.
    name: String,
    /// Heuristic description or the `ingested from <path>` placeholder.
    description: String,
    /// Whole-memory embedding; `None` when embedding failed and the
    /// skip-on-failure setting was active.
    embedding: Option<Vec<f32>>,
    /// Per-chunk embeddings; only set when the body produced a chunk count
    /// other than one and embedding succeeded.
    chunk_embeddings: Option<Vec<Vec<f32>>>,
    /// Chunk boundaries produced by `chunking::split_into_chunks_hierarchical`.
    chunks_info: Vec<crate::chunking::Chunk>,
    entities: Vec<NewEntity>,
    relationships: Vec<NewRelationship>,
    /// Embeddings for `entities`; `persist_staged` pairs them by index.
    entity_embeddings: Option<Vec<Vec<f32>>>,
    urls: Vec<crate::extraction::ExtractedUrl>,
    /// Backend label reported by the single-passage embedding call; `None`
    /// on the multi-chunk path and when embedding was skipped.
    backend_invoked: Option<&'static str>,
}
504
/// Validates one file and prepares everything needed to persist it as a memory.
///
/// The function takes no database connection: it validates `name`, reads the
/// file, derives the description, optionally runs extraction, hashes and
/// chunks the body, and computes the embeddings. The result is handed to
/// `persist_staged`.
///
/// # Errors
///
/// * `AppError::LimitExceeded` — name or body too long, or too many chunks.
/// * `AppError::Validation` — reserved or non-slug name, empty body, description
///   too long, invalid relationship, or a validation error from the embedder.
/// * `AppError::Io` — the file could not be inspected or read.
/// * `AppError::LowMemory` — process RSS above `max_rss_mb` before chunk embedding.
/// * Any other embedding error, unless
///   `crate::embedder::should_skip_embedding_on_failure()` is true, in which
///   case the file is staged without the corresponding embeddings.
#[allow(clippy::too_many_arguments)]
fn stage_file(
    _idx: usize,
    path: &Path,
    name: &str,
    paths: &AppPaths,
    enable_ner: bool,
    gliner_variant: crate::extraction::GlinerVariant,
    max_rss_mb: u64,
    llm_parallelism: usize,
    llm_backend: crate::cli::LlmBackendChoice,
    embedding_backend: crate::cli::EmbeddingBackendChoice,
    auto_describe: bool,
) -> Result<StagedFile, AppError> {
    // The limit constants and helper functions used unqualified below are
    // expected to come from this glob import.
    use crate::constants::*;

    // --- Name validation: length, reserved `__` prefix, slug shape. ---
    if name.len() > MAX_MEMORY_NAME_LEN {
        return Err(AppError::LimitExceeded(
            crate::i18n::validation::name_length(MAX_MEMORY_NAME_LEN),
        ));
    }
    if name.starts_with("__") {
        return Err(AppError::Validation(
            crate::i18n::validation::reserved_name(),
        ));
    }
    {
        let slug_re = crate::constants::name_slug_regex();
        if !slug_re.is_match(name) {
            return Err(AppError::Validation(crate::i18n::validation::name_kebab(
                name,
            )));
        }
    }

    // --- Body: check the on-disk size first so an oversized file is rejected
    // before it is read, then re-check the length of what was actually read. ---
    let file_size = std::fs::metadata(path).map_err(AppError::Io)?.len();
    if file_size > MAX_MEMORY_BODY_LEN as u64 {
        return Err(AppError::LimitExceeded(
            crate::i18n::validation::body_exceeds(MAX_MEMORY_BODY_LEN),
        ));
    }
    let raw_body = std::fs::read_to_string(path).map_err(AppError::Io)?;
    if raw_body.len() > MAX_MEMORY_BODY_LEN {
        return Err(AppError::LimitExceeded(
            crate::i18n::validation::body_exceeds(MAX_MEMORY_BODY_LEN),
        ));
    }
    if raw_body.trim().is_empty() {
        return Err(AppError::Validation(crate::i18n::validation::empty_body()));
    }

    // --- Description: heuristic from the body, or the legacy placeholder. ---
    let description = if auto_describe {
        crate::commands::ingest_heuristics::extract_heuristic_description(
            &raw_body,
            Some(&path.display().to_string()),
        )
    } else {
        format!("ingested from {}", path.display())
    };
    if description.len() > MAX_MEMORY_DESCRIPTION_LEN {
        return Err(AppError::Validation(
            crate::i18n::validation::description_exceeds(MAX_MEMORY_DESCRIPTION_LEN),
        ));
    }

    // --- Optional extraction. A failure here is logged and ignored, leaving
    // the three collections empty. ---
    let mut extracted_entities: Vec<NewEntity> = Vec::with_capacity(30);
    let mut extracted_relationships: Vec<NewRelationship> = Vec::with_capacity(50);
    let mut extracted_urls: Vec<crate::extraction::ExtractedUrl> = Vec::with_capacity(4);
    if enable_ner {
        match crate::extraction::extract_graph_auto(&raw_body, paths, gliner_variant) {
            Ok(extracted) => {
                extracted_urls = extracted.urls;
                // Every extracted entity is stored as a `Concept` without a
                // description; only its name is carried over.
                extracted_entities = extracted
                    .entities
                    .into_iter()
                    .map(|e| NewEntity {
                        name: e.name,
                        entity_type: crate::entity_type::EntityType::Concept,
                        description: None,
                    })
                    .collect();
                // No relationships are taken from the extraction result.
                extracted_relationships.clear();

                if extracted_entities.len() > max_entities_per_memory() {
                    extracted_entities.truncate(max_entities_per_memory());
                }
                if extracted_relationships.len() > max_relationships_per_memory() {
                    extracted_relationships.truncate(max_relationships_per_memory());
                }
            }
            Err(e) => {
                tracing::warn!(
                    target: "ingest",
                    file = %path.display(),
                    "auto-extraction failed (graceful degradation): {e:#}"
                );
            }
        }
    }

    // --- Relationship normalization and validation.
    // NOTE(review): in the code visible here nothing ever pushes into
    // `extracted_relationships` (the only branch touching it clears it), so
    // this loop currently iterates over an empty list. ---
    for rel in &mut extracted_relationships {
        rel.relation = crate::parsers::normalize_relation(&rel.relation);
        if let Err(e) = crate::parsers::validate_relation_format(&rel.relation) {
            return Err(AppError::Validation(format!(
                "{e} for relationship '{}' -> '{}'",
                rel.source, rel.target
            )));
        }
        crate::parsers::warn_if_non_canonical(&rel.relation);
        if !(0.0..=1.0).contains(&rel.strength) {
            return Err(AppError::Validation(format!(
                "invalid strength {} for relationship '{}' -> '{}'; expected value in [0.0, 1.0]",
                rel.strength, rel.source, rel.target
            )));
        }
    }

    // --- Hash, snippet (first 200 chars, not bytes) and chunking. ---
    let body_hash = blake3::hash(raw_body.as_bytes()).to_hex().to_string();
    let snippet: String = raw_body.chars().take(200).collect();

    let chunks_info = chunking::split_into_chunks_hierarchical(&raw_body);
    if chunks_info.len() > REMEMBER_MAX_SAFE_MULTI_CHUNKS {
        return Err(AppError::LimitExceeded(format!(
            "document produces {} chunks; current safe operational limit is {} chunks; split the document before using remember",
            chunks_info.len(),
            REMEMBER_MAX_SAFE_MULTI_CHUNKS
        )));
    }

    // --- Memory embedding. In both branches a `Validation` error from the
    // embedder always propagates; other errors propagate only when
    // skip-on-failure is off. ---
    let mut chunk_embeddings_opt: Option<Vec<Vec<f32>>> = None;
    let skip_embed = crate::embedder::should_skip_embedding_on_failure();
    let (embedding, backend_invoked): (Option<Vec<f32>>, Option<&'static str>) = if chunks_info
        .len()
        == 1
    {
        // Exactly one chunk: embed the whole body in one call and remember
        // which backend produced the vector.
        match crate::embedder::embed_passage_with_embedding_choice(
            &paths.models,
            &raw_body,
            embedding_backend,
            llm_backend,
        ) {
            Ok((v, k)) => (Some(v), Some(k.as_str())),
            Err(AppError::Validation(msg)) => return Err(AppError::Validation(msg)),
            Err(e) if skip_embed => {
                tracing::warn!(error = %e, file = %path.display(), "ingest: embedding failed; --skip-embedding-on-failure active, persisting without embedding");
                (None, None)
            }
            Err(e) => return Err(e),
        }
    } else {
        // Any other chunk count: embed each chunk and aggregate the vectors
        // into the memory-level embedding.
        let chunk_texts: Vec<String> = chunks_info
            .iter()
            .map(|c| chunking::chunk_text(&raw_body, c).to_string())
            .collect();
        // RSS guard: only enforced when the current RSS can be measured.
        if let Some(rss) = crate::memory_guard::current_process_memory_mb() {
            if rss > max_rss_mb {
                tracing::error!(
                    target: "ingest",
                    rss_mb = rss,
                    max_rss_mb = max_rss_mb,
                    file = %path.display(),
                    "RSS exceeded --max-rss-mb threshold; aborting to prevent system instability"
                );
                return Err(AppError::LowMemory {
                    available_mb: crate::memory_guard::available_memory_mb(),
                    required_mb: max_rss_mb,
                });
            }
        }
        match crate::embedder::embed_passages_parallel_with_embedding_choice(
            &paths.models,
            &chunk_texts,
            llm_parallelism,
            crate::embedder::chunk_embed_batch_size(),
            embedding_backend,
            llm_backend,
        ) {
            Ok(chunk_embeddings) => {
                let aggregated = chunking::aggregate_embeddings(&chunk_embeddings);
                chunk_embeddings_opt = Some(chunk_embeddings);
                // This path does not report a backend label.
                (Some(aggregated), None)
            }
            Err(AppError::Validation(msg)) => return Err(AppError::Validation(msg)),
            Err(e) if skip_embed => {
                tracing::warn!(error = %e, file = %path.display(), "ingest: chunk embedding failed; --skip-embedding-on-failure active, persisting without embedding");
                (None, None)
            }
            Err(e) => return Err(e),
        }
    };

    // --- Entity embeddings: the embedded text is "name description" when a
    // description exists, otherwise just the name. The embedder is called
    // even when the entity list is empty. ---
    let entity_texts: Vec<String> = extracted_entities
        .iter()
        .map(|entity| match &entity.description {
            Some(desc) => format!("{} {}", entity.name, desc),
            None => entity.name.clone(),
        })
        .collect();
    let entity_embeddings_opt = match crate::embedder::embed_entity_texts_cached(
        &paths.models,
        &entity_texts,
        llm_parallelism,
        embedding_backend,
        llm_backend,
    ) {
        Ok((entity_embeddings, embed_cache_stats)) => {
            if embed_cache_stats.hits > 0 {
                tracing::debug!(
                    hits = embed_cache_stats.hits,
                    misses = embed_cache_stats.misses,
                    requested = embed_cache_stats.requested,
                    "G56: entity embed cache hit (ingest)"
                );
            }
            Some(entity_embeddings)
        }
        Err(e) if skip_embed => {
            tracing::warn!(error = %e, file = %path.display(), "ingest: entity embedding failed; --skip-embedding-on-failure active");
            None
        }
        Err(e) => return Err(e),
    };

    Ok(StagedFile {
        body: raw_body,
        body_hash,
        snippet,
        name: name.to_string(),
        description,
        embedding,
        chunk_embeddings: chunk_embeddings_opt,
        chunks_info,
        entities: extracted_entities,
        relationships: extracted_relationships,
        entity_embeddings: entity_embeddings_opt,
        urls: extracted_urls,
        backend_invoked,
    })
}
768
/// Writes one staged file to the database as a brand-new memory.
///
/// The pre-checks (namespace limit, duplicate name, duplicate body hash) run
/// before the transaction is opened. The memory row, its first version, the
/// vectors, chunks, entities and relationships are then written inside a
/// single IMMEDIATE transaction. URLs are inserted after the commit, outside
/// the transaction.
///
/// # Errors
///
/// * `AppError::NamespaceError` — `namespace` has no active memories yet and
///   the number of active namespaces already reached `MAX_NAMESPACES_ACTIVE`.
/// * `AppError::Duplicate` — a memory with the same name exists in `namespace`.
/// * Any error returned by the storage calls or by the transaction itself.
fn persist_staged(
    conn: &mut Connection,
    namespace: &str,
    memory_type: &str,
    staged: StagedFile,
) -> Result<FileSuccess, AppError> {
    // Namespace limit: only namespaces with at least one non-deleted memory
    // count as active, and the limit applies only when this write would
    // create a new one.
    {
        let active_count: u32 = conn.query_row(
            "SELECT COUNT(DISTINCT namespace) FROM memories WHERE deleted_at IS NULL",
            [],
            |r| r.get::<_, i64>(0).map(|v| v as u32),
        )?;
        let ns_exists: bool = conn.query_row(
            "SELECT EXISTS(SELECT 1 FROM memories WHERE namespace = ?1 AND deleted_at IS NULL)",
            rusqlite::params![namespace],
            |r| r.get::<_, i64>(0).map(|v| v > 0),
        )?;
        if !ns_exists && active_count >= crate::constants::MAX_NAMESPACES_ACTIVE {
            return Err(AppError::NamespaceError(format!(
                "active namespace limit of {} exceeded while creating '{namespace}'",
                crate::constants::MAX_NAMESPACES_ACTIVE
            )));
        }
    }

    // A name clash is a hard error; an identical body is only logged (below).
    let existing_memory = memories::find_by_name(conn, namespace, &staged.name)?;
    if existing_memory.is_some() {
        return Err(AppError::Duplicate(errors_msg::duplicate_memory(
            &staged.name,
            namespace,
        )));
    }
    let duplicate_hash_id = memories::find_by_hash(conn, namespace, &staged.body_hash)?;

    // `body` and `body_hash` are moved out of `staged` here; the remaining
    // fields of `staged` are still used further down.
    let new_memory = NewMemory {
        namespace: namespace.to_string(),
        name: staged.name.clone(),
        memory_type: memory_type.to_string(),
        description: staged.description.clone(),
        body: staged.body,
        body_hash: staged.body_hash,
        session_id: None,
        source: "agent".to_string(),
        metadata: serde_json::json!({}),
    };

    if let Some(hash_id) = duplicate_hash_id {
        tracing::debug!(
            target: "ingest",
            duplicate_memory_id = hash_id,
            "identical body already exists; persisting a new memory anyway"
        );
    }

    // Everything from here to `tx.commit()` is atomic: an early return via
    // `?` drops `tx` without committing.
    let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;

    let memory_id = memories::insert(&tx, &new_memory)?;
    // Record the initial version (number 1, operation "create").
    versions::insert_version(
        &tx,
        memory_id,
        1,
        &staged.name,
        memory_type,
        &staged.description,
        &new_memory.body,
        &serde_json::to_string(&new_memory.metadata)?,
        None,
        "create",
    )?;
    // The memory-level vector is optional: staging may have skipped embedding.
    if let Some(ref emb) = staged.embedding {
        memories::upsert_vec(
            &tx,
            memory_id,
            namespace,
            memory_type,
            emb,
            &staged.name,
            &staged.snippet,
        )?;
    }

    // Chunk rows are only stored for bodies split into more than one chunk.
    if staged.chunks_info.len() > 1 {
        storage_chunks::insert_chunk_slices(&tx, memory_id, &new_memory.body, &staged.chunks_info)?;
        if let Some(ref chunk_embeddings) = staged.chunk_embeddings {
            for (i, emb) in chunk_embeddings.iter().enumerate() {
                // NOTE(review): the chunk index is passed both as the first
                // argument and as the chunk position; the meaning of the
                // first parameter is defined in `storage::chunks` — confirm.
                storage_chunks::upsert_chunk_vec(&tx, i as i64, memory_id, i as i32, emb)?;
            }
        }
    }

    if !staged.entities.is_empty() || !staged.relationships.is_empty() {
        // Upsert each entity, attach its embedding when one exists at the
        // same index, and link it to the new memory.
        for (idx, entity) in staged.entities.iter().enumerate() {
            let entity_id = entities::upsert_entity(&tx, namespace, entity)?;
            if let Some(ref entity_embeddings) = staged.entity_embeddings {
                if let Some(entity_embedding) = entity_embeddings.get(idx) {
                    entities::upsert_entity_vec(
                        &tx,
                        entity_id,
                        namespace,
                        entity.entity_type,
                        entity_embedding,
                        &entity.name,
                    )?;
                }
            }
            entities::link_memory_entity(&tx, memory_id, entity_id)?;
        }
        // Name -> type lookup so relationship endpoints that match a staged
        // entity keep that entity's type.
        let entity_types: std::collections::HashMap<&str, EntityType> = staged
            .entities
            .iter()
            .map(|entity| (entity.name.as_str(), entity.entity_type))
            .collect();

        // Ids of every entity touched by this write; their degree is
        // recalculated once at the end.
        let mut affected_entity_ids: std::collections::HashSet<i64> =
            std::collections::HashSet::new();
        for entity in &staged.entities {
            if let Some(eid) = entities::find_entity_id(&tx, namespace, &entity.name)? {
                affected_entity_ids.insert(eid);
            }
        }

        for rel in &staged.relationships {
            // Endpoints not present among the staged entities default to `Concept`.
            let source_entity = NewEntity {
                name: rel.source.clone(),
                entity_type: entity_types
                    .get(rel.source.as_str())
                    .copied()
                    .unwrap_or(EntityType::Concept),
                description: None,
            };
            let target_entity = NewEntity {
                name: rel.target.clone(),
                entity_type: entity_types
                    .get(rel.target.as_str())
                    .copied()
                    .unwrap_or(EntityType::Concept),
                description: None,
            };
            let source_id = entities::upsert_entity(&tx, namespace, &source_entity)?;
            let target_id = entities::upsert_entity(&tx, namespace, &target_entity)?;
            let rel_id = entities::upsert_relationship(&tx, namespace, source_id, target_id, rel)?;
            entities::link_memory_relationship(&tx, memory_id, rel_id)?;
            affected_entity_ids.insert(source_id);
            affected_entity_ids.insert(target_id);
        }

        for &eid in &affected_entity_ids {
            entities::recalculate_degree(&tx, eid)?;
        }
    }

    tx.commit()?;

    // URLs are written after the commit and the call's return value is
    // deliberately discarded, so this step cannot fail the ingest of the file.
    if !staged.urls.is_empty() {
        let url_entries: Vec<storage_urls::MemoryUrl> = staged
            .urls
            .into_iter()
            .map(|u| storage_urls::MemoryUrl {
                url: u.url,
                offset: Some(u.start as i64),
            })
            .collect();
        let _ = storage_urls::insert_urls(conn, memory_id, &url_entries);
    }

    Ok(FileSuccess {
        memory_id,
        action: "created".to_string(),
        body_length: new_memory.body.len(),
        backend_invoked: staged.backend_invoked,
    })
}
942
/// Returns `true` when `value` equals `default`.
///
/// Used by the flag validation to tell "flag left untouched" apart from
/// "flag explicitly changed" for options that always carry a value.
fn is_at_default<T>(value: T, default: T) -> bool
where
    T: PartialEq,
{
    PartialEq::eq(&value, &default)
}
953
954fn validate_mode_conditional_flags_ingest(args: &IngestArgs) -> Result<(), AppError> {
969 const DEFAULT_TIMEOUT: u64 = 300;
970 const DEFAULT_RATE_LIMIT_WAIT: u64 = 60;
971
972 let mut conflicts: Vec<String> = Vec::new();
973
974 let is_local_mode = args.mode == IngestMode::None || args.mode == IngestMode::Gliner;
975
976 if is_local_mode {
977 if args.claude_binary.is_some() {
978 conflicts.push("--claude-binary is ignored when --mode is none or gliner".to_string());
979 }
980 if args.claude_model.is_some() {
981 conflicts.push("--claude-model is ignored when --mode is none or gliner".to_string());
982 }
983 if !is_at_default(args.claude_timeout, DEFAULT_TIMEOUT) {
984 conflicts.push(format!(
985 "--claude-timeout={} is ignored when --mode is none or gliner (remove the flag to use the default 300s)",
986 args.claude_timeout
987 ));
988 }
989 if args.codex_binary.is_some() {
990 conflicts.push("--codex-binary is ignored when --mode is none or gliner".to_string());
991 }
992 if args.codex_model.is_some() {
993 conflicts.push("--codex-model is ignored when --mode is none or gliner".to_string());
994 }
995 if !is_at_default(args.codex_timeout, DEFAULT_TIMEOUT) {
996 conflicts.push(format!(
997 "--codex-timeout={} is ignored when --mode is none or gliner (remove the flag to use the default 300s)",
998 args.codex_timeout
999 ));
1000 }
1001 if args.opencode_binary.is_some() {
1002 conflicts
1003 .push("--opencode-binary is ignored when --mode is none or gliner".to_string());
1004 }
1005 if args.opencode_model.is_some() {
1006 conflicts.push("--opencode-model is ignored when --mode is none or gliner".to_string());
1007 }
1008 if !is_at_default(args.opencode_timeout, DEFAULT_TIMEOUT) {
1009 conflicts.push(format!(
1010 "--opencode-timeout={} is ignored when --mode is none or gliner (remove the flag to use the default 300s)",
1011 args.opencode_timeout
1012 ));
1013 }
1014 if args.max_cost_usd.is_some() {
1015 conflicts.push("--max-cost-usd is ignored when --mode is none or gliner (cost is only tracked for LLM-backed modes)".to_string());
1016 }
1017 if args.resume {
1018 conflicts.push("--resume is ignored when --mode is none or gliner (the queue DB is only used by LLM-backed modes)".to_string());
1019 }
1020 if args.retry_failed {
1021 conflicts.push("--retry-failed is ignored when --mode is none or gliner".to_string());
1022 }
1023 if args.keep_queue {
1024 conflicts.push("--keep-queue is ignored when --mode is none or gliner".to_string());
1025 }
1026 if !is_at_default(args.rate_limit_wait, DEFAULT_RATE_LIMIT_WAIT) {
1027 conflicts.push(format!(
1028 "--rate-limit-wait={} is ignored when --mode is none or gliner",
1029 args.rate_limit_wait
1030 ));
1031 }
1032 }
1033
1034 match args.mode {
1035 IngestMode::ClaudeCode => {
1036 if args.codex_binary.is_some() {
1037 conflicts.push("--codex-binary is ignored when --mode=claude-code".to_string());
1038 }
1039 if args.codex_model.is_some() {
1040 conflicts.push("--codex-model is ignored when --mode=claude-code".to_string());
1041 }
1042 if !is_at_default(args.codex_timeout, DEFAULT_TIMEOUT) {
1043 conflicts.push(format!(
1044 "--codex-timeout={} is ignored when --mode=claude-code (remove the flag to use the default 300s)",
1045 args.codex_timeout
1046 ));
1047 }
1048 if args.opencode_binary.is_some() {
1049 conflicts.push("--opencode-binary is ignored when --mode=claude-code".to_string());
1050 }
1051 if args.opencode_model.is_some() {
1052 conflicts.push("--opencode-model is ignored when --mode=claude-code".to_string());
1053 }
1054 if !is_at_default(args.opencode_timeout, DEFAULT_TIMEOUT) {
1055 conflicts.push(format!(
1056 "--opencode-timeout={} is ignored when --mode=claude-code (remove the flag to use the default 300s)",
1057 args.opencode_timeout
1058 ));
1059 }
1060 }
1061 IngestMode::Codex => {
1062 if args.claude_binary.is_some() {
1063 conflicts.push("--claude-binary is ignored when --mode=codex".to_string());
1064 }
1065 if args.claude_model.is_some() {
1066 conflicts.push("--claude-model is ignored when --mode=codex".to_string());
1067 }
1068 if !is_at_default(args.claude_timeout, DEFAULT_TIMEOUT) {
1069 conflicts.push(format!(
1070 "--claude-timeout={} is ignored when --mode=codex (remove the flag to use the default 300s)",
1071 args.claude_timeout
1072 ));
1073 }
1074 if args.max_cost_usd.is_some() {
1075 conflicts.push(
1076 "--max-cost-usd is ignored when --mode=codex (OAuth-first; cost is metered by your subscription)"
1077 .to_string(),
1078 );
1079 }
1080 if args.resume {
1081 conflicts.push("--resume is only valid for --mode=claude-code".to_string());
1082 }
1083 if args.retry_failed {
1084 conflicts.push("--retry-failed is only valid for --mode=claude-code".to_string());
1085 }
1086 if args.keep_queue {
1087 conflicts.push("--keep-queue is only valid for --mode=claude-code".to_string());
1088 }
1089 if args.opencode_binary.is_some() {
1090 conflicts.push("--opencode-binary is ignored when --mode=codex".to_string());
1091 }
1092 if args.opencode_model.is_some() {
1093 conflicts.push("--opencode-model is ignored when --mode=codex".to_string());
1094 }
1095 if !is_at_default(args.opencode_timeout, DEFAULT_TIMEOUT) {
1096 conflicts.push(format!(
1097 "--opencode-timeout={} is ignored when --mode=codex (remove the flag to use the default 300s)",
1098 args.opencode_timeout
1099 ));
1100 }
1101 }
1102 IngestMode::Opencode => {
1103 if args.claude_binary.is_some() {
1104 conflicts.push("--claude-binary is ignored when --mode=opencode".to_string());
1105 }
1106 if args.claude_model.is_some() {
1107 conflicts.push("--claude-model is ignored when --mode=opencode".to_string());
1108 }
1109 if !is_at_default(args.claude_timeout, DEFAULT_TIMEOUT) {
1110 conflicts.push(format!(
1111 "--claude-timeout={} is ignored when --mode=opencode (remove the flag to use the default 300s)",
1112 args.claude_timeout
1113 ));
1114 }
1115 if args.codex_binary.is_some() {
1116 conflicts.push("--codex-binary is ignored when --mode=opencode".to_string());
1117 }
1118 if args.codex_model.is_some() {
1119 conflicts.push("--codex-model is ignored when --mode=opencode".to_string());
1120 }
1121 if !is_at_default(args.codex_timeout, DEFAULT_TIMEOUT) {
1122 conflicts.push(format!(
1123 "--codex-timeout={} is ignored when --mode=opencode (remove the flag to use the default 300s)",
1124 args.codex_timeout
1125 ));
1126 }
1127 if args.max_cost_usd.is_some() {
1128 conflicts.push(
1129 "--max-cost-usd is ignored when --mode=opencode (OAuth-first; cost is metered by your subscription)"
1130 .to_string(),
1131 );
1132 }
1133 if args.resume {
1134 conflicts.push("--resume is only valid for --mode=claude-code".to_string());
1135 }
1136 if args.retry_failed {
1137 conflicts.push("--retry-failed is only valid for --mode=claude-code".to_string());
1138 }
1139 if args.keep_queue {
1140 conflicts.push("--keep-queue is only valid for --mode=claude-code".to_string());
1141 }
1142 }
1143 IngestMode::None | IngestMode::Gliner => {}
1144 }
1145
1146 if !conflicts.is_empty() {
1147 return Err(AppError::Validation(format!(
1148 "G20: mode-conditional flag conflicts detected for --mode={:?}:\n - {}",
1149 args.mode,
1150 conflicts.join("\n - ")
1151 )));
1152 }
1153
1154 Ok(())
1155}
1156
/// Entry point of the `ingest` command.
///
/// Flow, as implemented below:
/// 1. Validate mode-conditional flags, then hand off to the dedicated runner
///    when `--mode` is `ClaudeCode`, `Codex` or `Opencode`.
/// 2. Otherwise collect the files under `args.dir` that match `args.pattern`,
///    sort them, and enforce the `--max-files` cap.
/// 3. Derive a unique kebab-case name for every file; files for which no name
///    can be derived become `Skip` slots, the rest become `Process` slots.
/// 4. With `--dry-run`, emit one JSON event per slot plus a summary and return.
/// 5. Otherwise stage files on a rayon pool driven by a producer thread and
///    persist each staged result on the calling thread as it arrives over a
///    bounded channel, emitting one JSON event per file.
/// 6. Emit the final summary and, when `--enrich-after` is set and at least
///    one file succeeded, run the `memory-bindings` enrich operation.
///
/// # Errors
/// Returns `AppError::Validation` for a missing or non-directory path, when the
/// file count exceeds `--max-files`, for conflicting flags, and when
/// `--fail-fast` aborts on the first failed file. Returns `AppError::Internal`
/// when the rayon pool cannot be built, when a channel index has no matching
/// `Process` slot, or when the producer thread panicked. Errors from the
/// called helpers (`collect_files`, namespace/path resolution, JSON emission)
/// are propagated with `?`.
#[tracing::instrument(skip_all, level = "debug", name = "ingest")]
pub fn run(
    args: IngestArgs,
    llm_backend: crate::cli::LlmBackendChoice,
    embedding_backend: crate::cli::EmbeddingBackendChoice,
) -> Result<(), AppError> {
    validate_mode_conditional_flags_ingest(&args)?;
    tracing::debug!(target: "ingest", dir = %args.dir.display(), mode = ?args.mode, "starting ingest");
    // The LLM-backed modes have their own runners; everything below this
    // dispatch only runs for the remaining modes.
    if args.mode == IngestMode::ClaudeCode {
        return super::ingest_claude::run_claude_ingest(&args, embedding_backend, llm_backend);
    }
    if args.mode == IngestMode::Codex {
        return super::ingest_codex::run_codex_ingest(&args);
    }
    if args.mode == IngestMode::Opencode {
        return super::ingest_opencode::run_opencode_ingest(&args);
    }

    // Reference point for the `elapsed_ms` field of every summary emitted below.
    let started = std::time::Instant::now();

    if !args.dir.exists() {
        return Err(AppError::Validation(format!(
            "directory not found: {}",
            args.dir.display()
        )));
    }
    if !args.dir.is_dir() {
        return Err(AppError::Validation(format!(
            "path is not a directory: {}",
            args.dir.display()
        )));
    }

    let mut files: Vec<PathBuf> = Vec::with_capacity(128);
    collect_files(&args.dir, &args.pattern, args.recursive, &mut files)?;
    // Sorting makes the processing order (and therefore collision-suffix
    // assignment below) independent of directory iteration order.
    files.sort_unstable();

    if files.len() > args.max_files {
        return Err(AppError::Validation(format!(
            "found {} files matching pattern, exceeds --max-files cap of {} (raise the cap or narrow the pattern)",
            files.len(),
            args.max_files
        )));
    }

    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
    let memory_type_str = args.r#type.as_str().to_string();

    let paths = AppPaths::resolve(args.db.as_deref())?;
    // A storage initialisation failure is kept as a message instead of being
    // returned: the persistence loop reports it as a per-file "failed" event.
    let mut conn_or_err = match init_storage(&paths) {
        Ok(c) => Ok(c),
        Err(e) => Err(format!("{e}")),
    };

    let mut succeeded: usize = 0;
    let mut failed: usize = 0;
    let mut skipped: usize = 0;
    let total = files.len();

    // Names already assigned in this run; consulted by `unique_name`.
    let mut taken_names: BTreeSet<String> = BTreeSet::new();

    // Per-file metadata, one entry per collected file, in `files` order.
    enum SlotMeta {
        // The file will not be staged; `reason` explains why.
        Skip {
            file_str: String,
            derived_base: String,
            name_truncated: bool,
            original_name: Option<String>,
            original_filename: Option<String>,
            reason: String,
        },
        // The file received a unique name and will be staged and persisted.
        Process {
            file_str: String,
            derived_name: String,
            name_truncated: bool,
            original_name: Option<String>,
            original_filename: Option<String>,
        },
    }

    // Work item handed to the staging pool; `idx` is the position of the
    // matching `SlotMeta::Process` entry in `slots_meta`.
    struct ProcessItem {
        idx: usize,
        path: PathBuf,
        file_str: String,
        derived_name: String,
    }

    // Reserve up front with `try_reserve` so an allocation failure surfaces
    // as `LimitExceeded` instead of aborting the process.
    let files_cap = files.len();
    let mut slots_meta: Vec<SlotMeta> = Vec::new();
    slots_meta.try_reserve(files_cap).map_err(|_| {
        AppError::LimitExceeded(format!(
            "allocation of {files_cap} slot metadata entries would exceed available memory"
        ))
    })?;
    let mut process_items: Vec<ProcessItem> = Vec::new();
    process_items.try_reserve(files_cap).map_err(|_| {
        AppError::LimitExceeded(format!(
            "allocation of {files_cap} process items would exceed available memory"
        ))
    })?;
    let mut truncations: Vec<(String, String)> = Vec::new();
    truncations.try_reserve(files_cap).map_err(|_| {
        AppError::LimitExceeded(format!(
            "allocation of {files_cap} truncation entries would exceed available memory"
        ))
    })?;

    // Naming pass: derive a base name per file, record truncations, and
    // classify each file as Skip or Process.
    let max_name_length = args.max_name_length;
    for path in &files {
        let file_str = path.to_string_lossy().into_owned();
        let (derived_base, name_truncated, original_name) =
            derive_kebab_name(path, max_name_length);
        let original_basename = path.file_stem().and_then(|s| s.to_str()).unwrap_or("");

        if name_truncated {
            if let Some(ref orig) = original_name {
                truncations.push((orig.clone(), derived_base.clone()));
            }
        }

        // Nothing usable survived the kebab-case filter: skip the file.
        if derived_base.is_empty() {
            let orig_filename = if !original_basename.is_empty() {
                Some(original_basename.to_string())
            } else {
                None
            };
            slots_meta.push(SlotMeta::Skip {
                file_str,
                derived_base: String::new(),
                name_truncated: false,
                original_name: None,
                original_filename: orig_filename,
                reason: "could not derive a non-empty kebab-case name from filename".to_string(),
            });
            continue;
        }

        match unique_name(&derived_base, &taken_names) {
            Ok(derived_name) => {
                taken_names.insert(derived_name.clone());
                // Captured before the push so it equals this slot's index.
                let idx = slots_meta.len();
                // Only report the original filename when it differs from the
                // name that will actually be used.
                let orig_filename = if original_basename != derived_name {
                    Some(original_basename.to_string())
                } else {
                    None
                };
                process_items.push(ProcessItem {
                    idx,
                    path: path.clone(),
                    file_str: file_str.clone(),
                    derived_name: derived_name.clone(),
                });
                slots_meta.push(SlotMeta::Process {
                    file_str,
                    derived_name,
                    name_truncated,
                    original_name,
                    original_filename: orig_filename,
                });
            }
            Err(e) => {
                // Collision suffixes exhausted: the file is skipped with the
                // error text as the reason.
                let orig_filename = if original_basename != derived_base {
                    Some(original_basename.to_string())
                } else {
                    None
                };
                slots_meta.push(SlotMeta::Skip {
                    file_str,
                    derived_base,
                    name_truncated,
                    original_name,
                    original_filename: orig_filename,
                    reason: e.to_string(),
                });
            }
        }
    }

    if !truncations.is_empty() {
        tracing::info!(
            target: "ingest",
            count = truncations.len(),
            max_name_length = max_name_length,
            max_len = DERIVED_NAME_MAX_LEN,
            "derived names truncated; pass -vv (debug) for per-file detail"
        );
    }

    // Dry run: report the file-to-name mapping and return before any thread
    // pool is built or any file is staged.
    if args.dry_run {
        for meta in &slots_meta {
            match meta {
                SlotMeta::Skip {
                    file_str,
                    derived_base,
                    name_truncated,
                    original_name,
                    original_filename,
                    reason,
                } => {
                    output::emit_json_compact(&IngestFileEvent {
                        file: file_str,
                        name: derived_base,
                        status: "skip",
                        truncated: *name_truncated,
                        original_name: original_name.clone(),
                        original_filename: original_filename.as_deref(),
                        error: Some(reason.clone()),
                        memory_id: None,
                        action: None,
                        body_length: 0,
                        backend_invoked: None,
                    })?;
                }
                SlotMeta::Process {
                    file_str,
                    derived_name,
                    name_truncated,
                    original_name,
                    original_filename,
                } => {
                    output::emit_json_compact(&IngestFileEvent {
                        file: file_str,
                        name: derived_name,
                        status: "preview",
                        truncated: *name_truncated,
                        original_name: original_name.clone(),
                        original_filename: original_filename.as_deref(),
                        error: None,
                        memory_id: None,
                        action: None,
                        body_length: 0,
                        backend_invoked: None,
                    })?;
                }
            }
        }
        // NOTE(review): the dry-run summary reports all three counters as 0,
        // including `files_skipped`, even when "skip" events were emitted
        // above — confirm this is the intended contract.
        output::emit_json_compact(&IngestSummary {
            summary: true,
            dir: args.dir.to_string_lossy().into_owned(),
            pattern: args.pattern.clone(),
            recursive: args.recursive,
            files_total: total,
            files_succeeded: 0,
            files_failed: 0,
            files_skipped: 0,
            elapsed_ms: started.elapsed().as_millis() as u64,
        })?;
        return Ok(());
    }

    // `--low-memory` and an explicit parallelism above 1 contradict each other.
    if args.low_memory {
        if let Some(n) = args.ingest_parallelism {
            if n > 1 {
                return Err(AppError::Validation(
                    "--ingest-parallelism N>1 conflicts with --low-memory; use one or the other"
                        .to_string(),
                ));
            }
        }
    }

    let parallelism = resolve_parallelism(args.low_memory, args.ingest_parallelism);

    // Dedicated pool so staging uses exactly `parallelism` worker threads.
    let pool = rayon::ThreadPoolBuilder::new()
        .num_threads(parallelism)
        .build()
        .map_err(|e| AppError::Internal(anyhow::anyhow!("rayon pool: {e}")))?;

    if args.enable_ner && args.skip_extraction {
        return Err(AppError::Validation(
            "--enable-ner and --skip-extraction are mutually exclusive; remove one".to_string(),
        ));
    }
    if args.skip_extraction && !args.enable_ner {
        tracing::warn!(
            "--skip-extraction is deprecated since v1.0.45 and has no effect (NER is disabled by default); remove this flag to silence the warning"
        );
    }
    // Copy the settings needed by the producer closure into locals so the
    // closure does not have to capture `args`.
    let enable_ner = args.enable_ner;
    let auto_describe = args.auto_describe && !args.no_auto_describe;
    let max_rss_mb = args.max_rss_mb;
    let llm_parallelism = args.llm_parallelism as usize;
    if args.mode == IngestMode::Gliner {
        tracing::warn!(
            "--mode gliner is deprecated since v1.0.79 (the GLiNER pipeline was removed); it now performs URL-regex extraction only — use --mode claude-code or --mode codex for LLM-curated extraction"
        );
    }
    if args.gliner_variant != "fp32" {
        tracing::warn!(
            "--gliner-variant is deprecated and has no effect since v1.0.79 (the GLiNER pipeline was removed)"
        );
    }
    // Still parsed and forwarded to `stage_file`; any value other than
    // "int8" maps to `Fp32`.
    let gliner_variant: crate::extraction::GlinerVariant = match args.gliner_variant.as_str() {
        "int8" => crate::extraction::GlinerVariant::Int8,
        _ => crate::extraction::GlinerVariant::Fp32,
    };

    let total_to_process = process_items.len();
    tracing::info!(
        target: "ingest",
        phase = "pipeline_start",
        files = total_to_process,
        ingest_parallelism = parallelism,
        "incremental pipeline starting: Phase A (rayon) → channel → Phase B (main thread)",
    );

    // Bounded channel: a full channel blocks the staging workers, so at most
    // `channel_bound` staged results wait for persistence at any time.
    let channel_bound = (parallelism * 2).max(1);
    let (tx, rx) = mpsc::sync_channel::<(usize, Result<StagedFile, AppError>)>(channel_bound);

    let paths_owned = paths.clone();
    let llm_backend_owned = llm_backend;
    let embedding_backend_owned = embedding_backend;
    // Phase A: a producer thread drives the rayon pool; each worker stages
    // one file and sends `(slot index, result)` to the channel.
    let producer_handle = std::thread::spawn(move || {
        pool.install(|| {
            process_items.into_par_iter().for_each(|item| {
                // Stop picking up new files once shutdown was requested.
                if crate::shutdown_requested() {
                    return;
                }
                let t0 = std::time::Instant::now();
                let result = stage_file(
                    item.idx,
                    &item.path,
                    &item.derived_name,
                    &paths_owned,
                    enable_ner,
                    gliner_variant,
                    max_rss_mb,
                    llm_parallelism,
                    llm_backend_owned,
                    embedding_backend_owned,
                    auto_describe,
                );
                let elapsed_ms = t0.elapsed().as_millis() as u64;

                // Failed stagings report zero entities and relationships.
                let (n_entities, n_relationships) = match &result {
                    Ok(sf) => (sf.entities.len(), sf.relationships.len()),
                    Err(_) => (0, 0),
                };
                let progress = StageProgressEvent {
                    schema_version: 1,
                    event: "file_extracted",
                    path: &item.file_str,
                    ms: elapsed_ms,
                    entities: n_entities,
                    relationships: n_relationships,
                };
                if let Ok(line) = serde_json::to_string(&progress) {
                    tracing::info!(target: "ingest_progress", "{}", line);
                }

                // A send error means the receiver is gone (the consumer loop
                // stopped early); the result is deliberately discarded.
                let _ = tx.send((item.idx, result));
            });
            // Dropping the sender closes the channel so the consumer loop
            // below terminates once every result has been received.
            drop(tx);
        });
    });

    let fail_fast = args.fail_fast;

    // Skip slots never reach the channel, so report them up front.
    for meta in &slots_meta {
        if let SlotMeta::Skip {
            file_str,
            derived_base,
            name_truncated,
            original_name,
            original_filename,
            reason,
        } = meta
        {
            output::emit_json_compact(&IngestFileEvent {
                file: file_str,
                name: derived_base,
                status: "skipped",
                truncated: *name_truncated,
                original_name: original_name.clone(),
                original_filename: original_filename.as_deref(),
                error: Some(reason.clone()),
                memory_id: None,
                action: None,
                body_length: 0,
                backend_invoked: None,
            })?;
            skipped += 1;
        }
    }

    // Slot index -> metadata lookup for Process slots, because results
    // arrive in completion order rather than in file order.
    let meta_index: std::collections::HashMap<usize, &SlotMeta> = slots_meta
        .iter()
        .enumerate()
        .filter(|(_, m)| matches!(m, SlotMeta::Process { .. }))
        .collect();

    tracing::info!(
        target: "ingest",
        phase = "persist_start",
        files = total_to_process,
        "phase B starting: persisting files incrementally as Phase A completes each one",
    );

    // Phase B: persist on this thread. The loop ends when the channel is
    // closed (all senders dropped) or when shutdown is requested.
    for (idx, stage_result) in rx {
        if crate::shutdown_requested() {
            tracing::info!(target: "ingest", "shutdown requested, stopping persistence loop");
            break;
        }
        let meta = meta_index.get(&idx).ok_or_else(|| {
            AppError::Internal(anyhow::anyhow!(
                "channel idx {idx} has no corresponding Process slot"
            ))
        })?;
        let (file_str, derived_name, name_truncated, original_name, original_filename) = match meta
        {
            SlotMeta::Process {
                file_str,
                derived_name,
                name_truncated,
                original_name,
                original_filename,
            } => (
                file_str,
                derived_name,
                name_truncated,
                original_name,
                original_filename,
            ),
            // `meta_index` was filtered to Process entries only.
            SlotMeta::Skip { .. } => unreachable!("channel only carries Process results"),
        };

        // Without a usable connection every file is reported as failed with
        // the stored initialisation error.
        let conn = match conn_or_err.as_mut() {
            Ok(c) => c,
            Err(err_msg) => {
                let err_clone = err_msg.clone();
                output::emit_json_compact(&IngestFileEvent {
                    file: file_str,
                    name: derived_name,
                    status: "failed",
                    truncated: *name_truncated,
                    original_name: original_name.clone(),
                    original_filename: original_filename.as_deref(),
                    error: Some(err_clone.clone()),
                    memory_id: None,
                    action: None,
                    body_length: 0,
                    backend_invoked: None,
                })?;
                failed += 1;
                if fail_fast {
                    // Emit the summary before aborting so the counters seen
                    // so far are still reported.
                    output::emit_json_compact(&IngestSummary {
                        summary: true,
                        dir: args.dir.display().to_string(),
                        pattern: args.pattern.clone(),
                        recursive: args.recursive,
                        files_total: total,
                        files_succeeded: succeeded,
                        files_failed: failed,
                        files_skipped: skipped,
                        elapsed_ms: started.elapsed().as_millis() as u64,
                    })?;
                    return Err(AppError::Validation(format!(
                        "ingest aborted on first failure: {err_clone}"
                    )));
                }
                continue;
            }
        };

        // A staging error short-circuits; otherwise the staged file is persisted.
        let outcome =
            stage_result.and_then(|sf| persist_staged(conn, &namespace, &memory_type_str, sf));

        match outcome {
            Ok(FileSuccess {
                memory_id,
                action,
                body_length,
                backend_invoked: file_backend_invoked,
            }) => {
                output::emit_json_compact(&IngestFileEvent {
                    file: file_str,
                    name: derived_name,
                    status: "indexed",
                    truncated: *name_truncated,
                    original_name: original_name.clone(),
                    original_filename: original_filename.as_deref(),
                    error: None,
                    memory_id: Some(memory_id),
                    action: Some(action),
                    body_length,
                    backend_invoked: file_backend_invoked,
                })?;
                succeeded += 1;
            }
            // Duplicates count as skipped, not failed, and never trigger fail-fast.
            Err(ref e) if matches!(e, AppError::Duplicate(_)) => {
                output::emit_json_compact(&IngestFileEvent {
                    file: file_str,
                    name: derived_name,
                    status: "skipped",
                    truncated: *name_truncated,
                    original_name: original_name.clone(),
                    original_filename: original_filename.as_deref(),
                    error: Some(format!("{e}")),
                    memory_id: None,
                    action: Some("duplicate".to_string()),
                    body_length: 0,
                    backend_invoked: None,
                })?;
                skipped += 1;
            }
            Err(e) => {
                let err_msg = format!("{e}");
                output::emit_json_compact(&IngestFileEvent {
                    file: file_str,
                    name: derived_name,
                    status: "failed",
                    truncated: *name_truncated,
                    original_name: original_name.clone(),
                    original_filename: original_filename.as_deref(),
                    error: Some(err_msg.clone()),
                    memory_id: None,
                    action: None,
                    body_length: 0,
                    backend_invoked: None,
                })?;
                failed += 1;
                if fail_fast {
                    output::emit_json_compact(&IngestSummary {
                        summary: true,
                        dir: args.dir.display().to_string(),
                        pattern: args.pattern.clone(),
                        recursive: args.recursive,
                        files_total: total,
                        files_succeeded: succeeded,
                        files_failed: failed,
                        files_skipped: skipped,
                        elapsed_ms: started.elapsed().as_millis() as u64,
                    })?;
                    return Err(AppError::Validation(format!(
                        "ingest aborted on first failure: {err_msg}"
                    )));
                }
            }
        }
    }

    // Wait for the producer; a join error means the producer thread panicked.
    producer_handle
        .join()
        .map_err(|_| AppError::Internal(anyhow::anyhow!("ingest producer thread panicked")))?;

    // Best-effort WAL checkpoint after successful writes; its result is ignored.
    if let Ok(ref conn) = conn_or_err {
        if succeeded > 0 {
            let _ = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);");
        }
    }

    output::emit_json_compact(&IngestSummary {
        summary: true,
        dir: args.dir.display().to_string(),
        pattern: args.pattern.clone(),
        recursive: args.recursive,
        files_total: total,
        files_succeeded: succeeded,
        files_failed: failed,
        files_skipped: skipped,
        elapsed_ms: started.elapsed().as_millis() as u64,
    })?;

    // Optional follow-up: run the `memory-bindings` enrich operation. A failure
    // here is logged and emitted as an event but does not fail the ingest.
    if args.enrich_after && succeeded > 0 {
        output::emit_json_compact(&serde_json::json!({
            "event": "enrich_phase_started",
            "operation": "memory-bindings"
        }))?;
        let enrich_args = super::enrich::EnrichArgs {
            operation: super::enrich::EnrichOperation::MemoryBindings,
            mode: super::enrich::EnrichMode::ClaudeCode,
            limit: None,
            dry_run: false,
            namespace: args.namespace.clone(),
            claude_binary: args.claude_binary.clone(),
            claude_model: args.claude_model.clone(),
            claude_timeout: args.claude_timeout,
            codex_binary: args.codex_binary.clone(),
            codex_model: args.codex_model.clone(),
            codex_timeout: args.codex_timeout,
            opencode_binary: args.opencode_binary.clone(),
            opencode_model: args.opencode_model.clone(),
            opencode_timeout: args.opencode_timeout,
            db: args.db.clone(),
            json: false,
            resume: false,
            retry_failed: false,
            max_cost_usd: args.max_cost_usd,
            llm_parallelism: args.llm_parallelism as u32,
            wait_job_singleton: args.wait_job_singleton,
            force_job_singleton: args.force_job_singleton,
            names: Vec::new(),
            names_file: None,
            preflight_check: false,
            fallback_mode: None,
            rate_limit_buffer: 300,
            max_load_check: true,
            circuit_breaker_threshold: 5,
            preserve_threshold: 0.7,
            codex_model_validate: true,
            codex_model_fallback: None,
            min_output_chars: 500,
            max_output_chars: 2000,
            preserve_check: true,
            prompt_template: None,
        };
        match super::enrich::run(&enrich_args, llm_backend, embedding_backend) {
            Ok(()) => {
                output::emit_json_compact(&serde_json::json!({
                    "event": "enrich_phase_completed"
                }))?;
            }
            Err(e) => {
                tracing::warn!(error = %e, "enrich --operation memory-bindings failed after ingest");
                output::emit_json_compact(&serde_json::json!({
                    "event": "enrich_phase_failed",
                    "error": e.to_string()
                }))?;
            }
        }
    }

    Ok(())
}
1838
1839fn init_storage(paths: &AppPaths) -> Result<Connection, AppError> {
1845 ensure_db_ready(paths)?;
1846 let conn = open_rw(&paths.db)?;
1847 Ok(conn)
1848}
1849
1850pub(crate) fn collect_files(
1851 dir: &Path,
1852 pattern: &str,
1853 recursive: bool,
1854 out: &mut Vec<PathBuf>,
1855) -> Result<(), AppError> {
1856 let entries = std::fs::read_dir(dir).map_err(AppError::Io)?;
1857 for entry in entries {
1858 let entry = entry.map_err(AppError::Io)?;
1859 let path = entry.path();
1860 let file_type = entry.file_type().map_err(AppError::Io)?;
1861 if file_type.is_file() {
1862 let name = entry.file_name();
1863 let name_str = name.to_string_lossy();
1864 if matches_pattern(&name_str, pattern) {
1865 out.push(path);
1866 }
1867 } else if file_type.is_dir() && recursive {
1868 collect_files(&path, pattern, recursive, out)?;
1869 }
1870 }
1871 Ok(())
1872}
1873
/// Returns `true` when `name` matches the glob-like `pattern`.
///
/// `*` matches any run of characters (including none); every other character
/// must match literally and case-sensitively. A pattern without `*` must equal
/// `name` exactly. Any number of wildcards is supported, so `*.md`, `README*`,
/// `report*.md` and `*notes*` all behave as expected.
///
/// Earlier versions honoured only a single leading or trailing `*`; those
/// forms still match exactly what they matched before.
fn matches_pattern(name: &str, pattern: &str) -> bool {
    let mut segments = pattern.split('*');
    // `split` always yields at least one segment: the text before the first `*`.
    let prefix = segments.next().unwrap_or("");
    let suffix = match segments.next_back() {
        Some(last) => last,
        // No `*` in the pattern: plain equality.
        None => return name == pattern,
    };

    // The literal prefix and suffix must both be present and must not overlap;
    // stripping them one after the other enforces that.
    let mut remaining = match name
        .strip_prefix(prefix)
        .and_then(|rest| rest.strip_suffix(suffix))
    {
        Some(middle) => middle,
        None => return false,
    };

    // The segments between wildcards must occur in order. Taking the leftmost
    // occurrence each time leaves the most room for the segments that follow.
    for segment in segments {
        match remaining.find(segment) {
            Some(pos) => remaining = &remaining[pos + segment.len()..],
            None => return false,
        }
    }
    true
}
1883
1884pub(crate) fn derive_kebab_name(path: &Path, max_len: usize) -> (String, bool, Option<String>) {
1895 let stem = path.file_stem().and_then(|s| s.to_str()).unwrap_or("");
1896 let lowered: String = stem
1897 .nfd()
1898 .filter(|c| !unicode_normalization::char::is_combining_mark(*c))
1899 .map(|c| {
1900 if c == '_' || c.is_whitespace() {
1901 '-'
1902 } else {
1903 c
1904 }
1905 })
1906 .map(|c| c.to_ascii_lowercase())
1907 .filter(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || *c == '-')
1908 .collect();
1909 let collapsed = collapse_dashes(&lowered);
1910 let trimmed_raw = collapsed.trim_matches('-').to_string();
1911 let trimmed = if trimmed_raw.starts_with(|c: char| c.is_ascii_digit()) {
1913 format!("doc-{trimmed_raw}")
1914 } else {
1915 trimmed_raw
1916 };
1917 if trimmed.len() > max_len {
1918 let truncated = trimmed[..max_len].trim_matches('-').to_string();
1919 tracing::debug!(
1920 target: "ingest",
1921 original = %trimmed,
1922 truncated_to = %truncated,
1923 max_len = max_len,
1924 "derived memory name truncated to fit length cap; collisions will be resolved with numeric suffixes"
1925 );
1926 (truncated, true, Some(trimmed))
1927 } else {
1928 (trimmed, false, None)
1929 }
1930}
1931
1932fn unique_name(base: &str, taken: &BTreeSet<String>) -> Result<String, AppError> {
1945 if !taken.contains(base) {
1946 return Ok(base.to_string());
1947 }
1948 for suffix in 1..=MAX_NAME_COLLISION_SUFFIX {
1949 let candidate = format!("{base}-{suffix}");
1950 if !taken.contains(&candidate) {
1951 tracing::warn!(
1952 target: "ingest",
1953 base = %base,
1954 resolved = %candidate,
1955 suffix,
1956 "memory name collision resolved with numeric suffix"
1957 );
1958 return Ok(candidate);
1959 }
1960 }
1961 Err(AppError::Validation(format!(
1962 "too many name collisions for base '{base}' (>{MAX_NAME_COLLISION_SUFFIX}); rename source files to disambiguate"
1963 )))
1964}
1965
/// Returns `s` with every run of consecutive `-` characters reduced to a
/// single `-`. All other characters are copied unchanged.
fn collapse_dashes(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    for ch in s.chars() {
        // A dash is redundant exactly when the output already ends in one.
        if ch == '-' && result.ends_with('-') {
            continue;
        }
        result.push(ch);
    }
    result
}
1982
// Unit tests for pattern matching, name derivation, collision handling,
// file collection, low-memory parallelism resolution and CLI flag parsing.
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    #[test]
    fn matches_pattern_suffix() {
        assert!(matches_pattern("foo.md", "*.md"));
        assert!(!matches_pattern("foo.txt", "*.md"));
        assert!(matches_pattern("foo.md", "*"));
    }

    #[test]
    fn matches_pattern_prefix() {
        assert!(matches_pattern("README.md", "README*"));
        assert!(!matches_pattern("CHANGELOG.md", "README*"));
    }

    #[test]
    fn matches_pattern_exact() {
        assert!(matches_pattern("README.md", "README.md"));
        assert!(!matches_pattern("readme.md", "README.md"));
    }

    #[test]
    fn derive_kebab_underscore_to_dash() {
        let p = PathBuf::from("/tmp/claude_code_headless.md");
        let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert_eq!(name, "claude-code-headless");
        assert!(!truncated);
        assert!(original.is_none());
    }

    #[test]
    fn derive_kebab_uppercase_lowered() {
        let p = PathBuf::from("/tmp/README.md");
        let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert_eq!(name, "readme");
        assert!(!truncated);
        assert!(original.is_none());
    }

    #[test]
    fn derive_kebab_strips_non_kebab_chars() {
        let p = PathBuf::from("/tmp/some@weird#name!.md");
        let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert_eq!(name, "someweirdname");
        assert!(!truncated);
        assert!(original.is_none());
    }

    #[test]
    fn derive_kebab_folds_accented_letters_to_ascii() {
        let p = PathBuf::from("/tmp/açaí.md");
        let (name, _, _) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert_eq!(name, "acai", "got '{name}'");
    }

    #[test]
    fn derive_kebab_handles_naive_with_diaeresis() {
        let p = PathBuf::from("/tmp/naïve-test.md");
        let (name, _, _) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert_eq!(name, "naive-test", "got '{name}'");
    }

    #[test]
    fn derive_kebab_drops_emoji_keeps_word() {
        let p = PathBuf::from("/tmp/🚀-rocket.md");
        let (name, _, _) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert_eq!(name, "rocket", "got '{name}'");
    }

    #[test]
    fn derive_kebab_mixed_unicode_emoji_keeps_letters() {
        let p = PathBuf::from("/tmp/açaí🦜.md");
        let (name, _, _) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert_eq!(name, "acai", "got '{name}'");
    }

    #[test]
    fn derive_kebab_pure_emoji_yields_empty() {
        let p = PathBuf::from("/tmp/🦜🚀🌟.md");
        let (name, _, _) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert!(name.is_empty(), "got '{name}'");
    }

    #[test]
    fn derive_kebab_collapses_consecutive_dashes() {
        let p = PathBuf::from("/tmp/a__b___c.md");
        let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert_eq!(name, "a-b-c");
        assert!(!truncated);
        assert!(original.is_none());
    }

    #[test]
    fn derive_kebab_truncates_to_60_chars() {
        let p = PathBuf::from(format!("/tmp/{}.md", "a".repeat(80)));
        let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert!(name.len() <= 60, "got len {}", name.len());
        assert!(truncated);
        assert!(original.is_some());
        assert!(original.unwrap().len() > 60);
    }

    #[test]
    fn collect_files_finds_md_files() {
        let tmp = tempfile::tempdir().expect("tempdir");
        std::fs::write(tmp.path().join("a.md"), "x").unwrap();
        std::fs::write(tmp.path().join("b.md"), "y").unwrap();
        std::fs::write(tmp.path().join("c.txt"), "z").unwrap();
        let mut out = Vec::new();
        collect_files(tmp.path(), "*.md", false, &mut out).expect("collect");
        assert_eq!(out.len(), 2, "should find 2 .md files, got {out:?}");
    }

    #[test]
    fn collect_files_recursive_descends_subdirs() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let sub = tmp.path().join("sub");
        std::fs::create_dir(&sub).unwrap();
        std::fs::write(tmp.path().join("a.md"), "x").unwrap();
        std::fs::write(sub.join("b.md"), "y").unwrap();
        let mut out = Vec::new();
        collect_files(tmp.path(), "*.md", true, &mut out).expect("collect");
        assert_eq!(out.len(), 2);
    }

    #[test]
    fn collect_files_non_recursive_skips_subdirs() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let sub = tmp.path().join("sub");
        std::fs::create_dir(&sub).unwrap();
        std::fs::write(tmp.path().join("a.md"), "x").unwrap();
        std::fs::write(sub.join("b.md"), "y").unwrap();
        let mut out = Vec::new();
        collect_files(tmp.path(), "*.md", false, &mut out).expect("collect");
        assert_eq!(out.len(), 1);
    }

    #[test]
    fn derive_kebab_long_basename_truncated_within_cap() {
        let p = PathBuf::from(format!("/tmp/{}.md", "a".repeat(120)));
        let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert!(
            name.len() <= DERIVED_NAME_MAX_LEN,
            "truncated name must respect cap; got {} chars",
            name.len()
        );
        assert!(!name.is_empty());
        assert!(truncated);
        assert!(original.is_some());
    }

    #[test]
    fn unique_name_returns_base_when_free() {
        let taken: BTreeSet<String> = BTreeSet::new();
        let resolved = unique_name("note", &taken).expect("must resolve");
        assert_eq!(resolved, "note");
    }

    #[test]
    fn unique_name_appends_first_free_suffix_on_collision() {
        let mut taken: BTreeSet<String> = BTreeSet::new();
        taken.insert("note".to_string());
        taken.insert("note-1".to_string());
        let resolved = unique_name("note", &taken).expect("must resolve");
        assert_eq!(resolved, "note-2");
    }

    #[test]
    fn unique_name_errors_after_collision_cap() {
        let mut taken: BTreeSet<String> = BTreeSet::new();
        taken.insert("note".to_string());
        for i in 1..=MAX_NAME_COLLISION_SUFFIX {
            taken.insert(format!("note-{i}"));
        }
        let err = unique_name("note", &taken).expect_err("must surface error");
        assert!(matches!(err, AppError::Validation(_)));
    }

    #[test]
    fn validate_relation_format_accepts_valid_relations() {
        use crate::parsers::{is_canonical_relation, validate_relation_format};
        assert!(validate_relation_format("applies_to").is_ok());
        assert!(validate_relation_format("depends_on").is_ok());
        assert!(validate_relation_format("implements").is_ok());
        assert!(validate_relation_format("").is_err());
        assert!(is_canonical_relation("applies_to"));
        assert!(!is_canonical_relation("implements"));
    }

    use serial_test::serial;

    /// Runs `f` with `SQLITE_GRAPHRAG_LOW_MEMORY` set to `value` (or removed
    /// when `value` is `None`) and restores the previous state afterwards.
    ///
    /// The restore happens in a drop guard, so it also runs when `f` panics
    /// on a failed assertion and the modified variable cannot leak into
    /// later tests. Callers must be `#[serial]` because the process
    /// environment is global state.
    fn with_env_var<F: FnOnce()>(value: Option<&str>, f: F) {
        const KEY: &str = "SQLITE_GRAPHRAG_LOW_MEMORY";

        // Holds the value observed before the override; `drop` puts it back.
        struct Restore(Option<String>);

        impl Drop for Restore {
            fn drop(&mut self) {
                match self.0.take() {
                    Some(p) => std::env::set_var(KEY, p),
                    None => std::env::remove_var(KEY),
                }
            }
        }

        let _restore = Restore(std::env::var(KEY).ok());
        match value {
            Some(v) => std::env::set_var(KEY, v),
            None => std::env::remove_var(KEY),
        }
        f();
    }

    #[test]
    #[serial]
    fn env_low_memory_enabled_unset_returns_false() {
        with_env_var(None, || assert!(!env_low_memory_enabled()));
    }

    #[test]
    #[serial]
    fn env_low_memory_enabled_empty_returns_false() {
        with_env_var(Some(""), || assert!(!env_low_memory_enabled()));
    }

    #[test]
    #[serial]
    fn env_low_memory_enabled_truthy_values_return_true() {
        for v in ["1", "true", "TRUE", "yes", "YES", "on", "On"] {
            with_env_var(Some(v), || {
                assert!(env_low_memory_enabled(), "value {v:?} should be truthy")
            });
        }
    }

    #[test]
    #[serial]
    fn env_low_memory_enabled_falsy_values_return_false() {
        for v in ["0", "false", "FALSE", "no", "off"] {
            with_env_var(Some(v), || {
                assert!(!env_low_memory_enabled(), "value {v:?} should be falsy")
            });
        }
    }

    #[test]
    #[serial]
    fn env_low_memory_enabled_unrecognized_value_returns_false() {
        with_env_var(Some("maybe"), || assert!(!env_low_memory_enabled()));
    }

    #[test]
    #[serial]
    fn resolve_parallelism_flag_forces_one_overriding_explicit_value() {
        with_env_var(None, || {
            assert_eq!(resolve_parallelism(true, Some(4)), 1);
            assert_eq!(resolve_parallelism(true, Some(8)), 1);
            assert_eq!(resolve_parallelism(true, None), 1);
        });
    }

    #[test]
    #[serial]
    fn resolve_parallelism_env_forces_one_when_flag_off() {
        with_env_var(Some("1"), || {
            assert_eq!(resolve_parallelism(false, Some(4)), 1);
            assert_eq!(resolve_parallelism(false, None), 1);
        });
    }

    #[test]
    #[serial]
    fn resolve_parallelism_falsy_env_does_not_override() {
        with_env_var(Some("0"), || {
            assert_eq!(resolve_parallelism(false, Some(4)), 4);
        });
    }

    #[test]
    #[serial]
    fn resolve_parallelism_explicit_value_when_low_memory_off() {
        with_env_var(None, || {
            assert_eq!(resolve_parallelism(false, Some(3)), 3);
            assert_eq!(resolve_parallelism(false, Some(1)), 1);
        });
    }

    #[test]
    #[serial]
    fn resolve_parallelism_default_when_unset() {
        with_env_var(None, || {
            let p = resolve_parallelism(false, None);
            assert!((1..=4).contains(&p), "default must be in [1, 4]; got {p}");
        });
    }

    #[test]
    fn ingest_args_parses_low_memory_flag_via_clap() {
        use clap::Parser;
        let cli = crate::cli::Cli::try_parse_from([
            "sqlite-graphrag",
            "ingest",
            "/tmp/dummy",
            "--type",
            "document",
            "--low-memory",
        ])
        .expect("parse must succeed");
        match cli.command {
            Some(crate::cli::Commands::Ingest(args)) => {
                assert!(args.low_memory, "--low-memory must set field to true");
            }
            _ => panic!("expected Ingest subcommand"),
        }
    }

    #[test]
    fn ingest_args_low_memory_defaults_false() {
        use clap::Parser;
        let cli = crate::cli::Cli::try_parse_from([
            "sqlite-graphrag",
            "ingest",
            "/tmp/dummy",
            "--type",
            "document",
        ])
        .expect("parse must succeed");
        match cli.command {
            Some(crate::cli::Commands::Ingest(args)) => {
                assert!(!args.low_memory, "default must be false");
            }
            _ => panic!("expected Ingest subcommand"),
        }
    }
}