1use crate::chunking;
35use crate::cli::MemoryType;
36use crate::entity_type::EntityType;
37use crate::errors::AppError;
38use crate::i18n::errors_msg;
39use crate::output::{self, JsonOutputFormat};
40use crate::paths::AppPaths;
41use crate::storage::chunks as storage_chunks;
42use crate::storage::connection::{ensure_db_ready, open_rw};
43use crate::storage::entities::{NewEntity, NewRelationship};
44use crate::storage::memories::NewMemory;
45use crate::storage::{entities, memories, urls as storage_urls, versions};
46use rayon::prelude::*;
47use rusqlite::Connection;
48use serde::Serialize;
49use std::collections::BTreeSet;
50use std::path::{Path, PathBuf};
51use std::sync::mpsc;
52use unicode_normalization::UnicodeNormalization;
53
54use crate::constants::DERIVED_NAME_MAX_LEN;
55
// Upper bound related to name-collision suffixes.
// NOTE(review): the use site is outside this view — presumably the largest numeric
// suffix tried when two files derive the same memory name; confirm against the caller.
const MAX_NAME_COLLISION_SUFFIX: usize = 1000;
59
60#[derive(clap::Args)]
61#[command(after_long_help = "EXAMPLES:\n \
62 # Ingest every Markdown file under ./docs as `document` memories\n \
63 sqlite-graphrag ingest ./docs --type document\n\n \
64 # Ingest .txt files recursively under ./notes\n \
65 sqlite-graphrag ingest ./notes --type note --pattern '*.txt' --recursive\n\n \
66 # Enable automatic URL extraction (URL-regex only since v1.0.79)\n \
67 sqlite-graphrag ingest ./big-corpus --type reference --enable-ner\n\n \
68 # Preview file-to-name mapping without ingesting\n \
69 sqlite-graphrag ingest ./docs --dry-run\n\n \
70 # LLM-curated extraction via Claude Code CLI\n \
71 sqlite-graphrag ingest ./docs --mode claude-code --recursive --json\n\n \
72 # Resume interrupted claude-code ingest\n \
73 sqlite-graphrag ingest ./docs --mode claude-code --resume --json\n\n \
74 # Claude Code with budget cap and custom timeout\n \
75 sqlite-graphrag ingest ./docs --mode claude-code --max-cost-usd 5.00 --claude-timeout 600 --json\n\n \
76AUTHENTICATION:\n \
77 --mode claude-code: Uses existing Claude Code authentication.\n \
78 OAuth (Pro/Max/Team): works automatically from ~/.claude/.credentials.json\n \
79 API key: set ANTHROPIC_API_KEY for faster startup (optional)\n\n \
80 --mode codex: Uses existing Codex CLI authentication.\n \
81 Device auth: run `codex auth login` first\n \
82 API key: set OPENAI_API_KEY (optional)\n\n \
83NOTES:\n \
84 Each file becomes a separate memory. Names derive from file basenames\n \
85 (kebab-case, lowercase, ASCII). Output is NDJSON: one JSON object per file,\n \
86 followed by a final summary line with counts. Per-file errors are reported\n \
87 inline and processing continues unless --fail-fast is set.")]
88pub struct IngestArgs {
89 #[arg(
91 value_name = "DIR",
92 help = "Directory to ingest recursively (each matching file becomes a memory)"
93 )]
94 pub dir: PathBuf,
95
96 #[arg(long, value_enum, default_value_t = MemoryType::Document)]
98 pub r#type: MemoryType,
99
100 #[arg(long, default_value = "*.md")]
103 pub pattern: String,
104
105 #[arg(long, default_value_t = false)]
107 pub recursive: bool,
108
109 #[arg(
110 long,
111 env = "SQLITE_GRAPHRAG_ENABLE_NER",
112 value_parser = crate::parsers::parse_bool_flexible,
113 action = clap::ArgAction::Set,
114 num_args = 0..=1,
115 default_missing_value = "true",
116 default_value = "false",
117 help = "Enable automatic URL-regex extraction (the GLiNER NER pipeline was removed in v1.0.79)"
118 )]
119 pub enable_ner: bool,
120
121 #[arg(
125 long,
126 default_value_t = true,
127 overrides_with = "no_auto_describe",
128 help = "Derive memory description from the first meaningful body line instead of the legacy `ingested from <path>` placeholder."
129 )]
130 pub auto_describe: bool,
131 #[arg(
132 long = "no-auto-describe",
133 default_value_t = false,
134 help = "Disable `--auto-describe` and fall back to the legacy `ingested from <path>` description placeholder."
135 )]
136 pub no_auto_describe: bool,
137 #[arg(
138 long,
139 env = "SQLITE_GRAPHRAG_GLINER_VARIANT",
140 default_value = "fp32",
141 help = "DEPRECATED: no effect since v1.0.79 (the GLiNER pipeline was removed); accepted for compatibility only"
142 )]
143 pub gliner_variant: String,
144
145 #[arg(long, default_value_t = false, hide = true)]
147 pub skip_extraction: bool,
148
149 #[arg(long, default_value_t = false)]
151 pub fail_fast: bool,
152
153 #[arg(long, default_value_t = false)]
155 pub dry_run: bool,
156
157 #[arg(long, default_value_t = 10_000)]
159 pub max_files: usize,
160
161 #[arg(long)]
163 pub namespace: Option<String>,
164
165 #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
167 pub db: Option<String>,
168
169 #[arg(long, value_enum, default_value_t = JsonOutputFormat::Json)]
170 pub format: JsonOutputFormat,
171
172 #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
173 pub json: bool,
174
175 #[arg(
177 long,
178 help = "Number of files to extract+embed in parallel; default = max(1, cpus/2).min(4)"
179 )]
180 pub ingest_parallelism: Option<usize>,
181
182 #[arg(
190 long,
191 default_value_t = false,
192 help = "Forces single-threaded ingest (--ingest-parallelism 1) to reduce RSS pressure. \
193 Recommended for environments with <4 GB available RAM or container/cgroup \
194 constraints. Trade-off: 3-4x longer wall time. Also honored via \
195 SQLITE_GRAPHRAG_LOW_MEMORY=1 env var."
196 )]
197 pub low_memory: bool,
198
199 #[arg(long, default_value_t = crate::constants::DEFAULT_MAX_RSS_MB,
201 help = "Maximum process RSS in MiB; abort if exceeded during embedding (default: 8192)")]
202 pub max_rss_mb: u64,
203
204 #[arg(long, default_value_t = 2, value_name = "N",
209 value_parser = clap::value_parser!(u64).range(1..=32),
210 help = "Maximum simultaneous LLM embedding subprocesses per file (default: 2, clamp [1,32])")]
211 pub llm_parallelism: u64,
212
213 #[arg(long, default_value_t = crate::constants::DERIVED_NAME_MAX_LEN,
218 help = "Maximum length for derived memory names (default: 60)")]
219 pub max_name_length: usize,
220
221 #[arg(long, value_enum, default_value_t = IngestMode::None)]
223 pub mode: IngestMode,
224
225 #[arg(long, env = "SQLITE_GRAPHRAG_CLAUDE_BINARY")]
227 pub claude_binary: Option<std::path::PathBuf>,
228
229 #[arg(long)]
231 pub claude_model: Option<String>,
232
233 #[arg(long, default_value_t = false)]
235 pub resume: bool,
236
237 #[arg(long, default_value_t = false)]
239 pub retry_failed: bool,
240
241 #[arg(long, default_value_t = false)]
243 pub keep_queue: bool,
244
245 #[arg(long, default_value = ".ingest-queue.sqlite")]
247 pub queue_db: String,
248
249 #[arg(long, default_value_t = 60)]
251 pub rate_limit_wait: u64,
252
253 #[arg(long)]
255 pub max_cost_usd: Option<f64>,
256
257 #[arg(
259 long,
260 default_value_t = 300,
261 help = "Timeout in seconds for each claude -p invocation (default: 300)"
262 )]
263 pub claude_timeout: u64,
264
265 #[arg(
267 long,
268 env = "SQLITE_GRAPHRAG_CODEX_BINARY",
269 help = "Explicit path to the Codex CLI binary (only with --mode codex)"
270 )]
271 pub codex_binary: Option<PathBuf>,
272
273 #[arg(
275 long,
276 help = "Model override for Codex extraction (e.g. o4-mini, gpt-5.1-codex)"
277 )]
278 pub codex_model: Option<String>,
279
280 #[arg(
282 long,
283 default_value_t = 300,
284 help = "Timeout in seconds for each codex exec invocation (default: 300)"
285 )]
286 pub codex_timeout: u64,
287
288 #[arg(long, value_name = "PATH", env = "SQLITE_GRAPHRAG_OPENCODE_BINARY")]
290 pub opencode_binary: Option<PathBuf>,
291
292 #[arg(
294 long,
295 value_name = "MODEL",
296 env = "SQLITE_GRAPHRAG_OPENCODE_MODEL",
297 help = "Model override for OpenCode extraction"
298 )]
299 pub opencode_model: Option<String>,
300
301 #[arg(
303 long,
304 value_name = "SECONDS",
305 env = "SQLITE_GRAPHRAG_OPENCODE_TIMEOUT",
306 default_value_t = 300,
307 help = "Timeout in seconds for each opencode run invocation (default: 300)"
308 )]
309 pub opencode_timeout: u64,
310
311 #[arg(long, value_name = "SECONDS")]
314 pub wait_job_singleton: Option<u64>,
315
316 #[arg(long, default_value_t = false)]
319 pub force_job_singleton: bool,
320
321 #[arg(
324 long,
325 default_value_t = false,
326 help = "Run enrich --operation memory-bindings after all files are ingested"
327 )]
328 pub enrich_after: bool,
329}
330
// Extraction backend selected with `--mode`.
//
// Plain `//` comments are used on purpose: clap's `ValueEnum` derive surfaces
// `///` doc comments as help text for the possible values.
#[derive(Clone, Debug, PartialEq, Eq, clap::ValueEnum)]
pub enum IngestMode {
    // Default value of `IngestArgs::mode`; handled by the in-process path of `run`.
    None,
    // Grouped with `None` as a "local mode" by the flag validator and not
    // dispatched to any external backend by `run`.
    Gliner,
    // `run` delegates to `super::ingest_claude::run_claude_ingest`.
    ClaudeCode,
    // `run` delegates to `super::ingest_codex::run_codex_ingest`.
    Codex,
    // `run` delegates to `super::ingest_opencode::run_opencode_ingest`.
    // The CLI spelling is pinned explicitly to `opencode`.
    #[value(name = "opencode")]
    Opencode,
}
346
347fn env_low_memory_enabled() -> bool {
352 match std::env::var("SQLITE_GRAPHRAG_LOW_MEMORY") {
353 Ok(v) if v.is_empty() => false,
354 Ok(v) => match v.to_lowercase().as_str() {
355 "1" | "true" | "yes" | "on" => true,
356 "0" | "false" | "no" | "off" => false,
357 other => {
358 tracing::warn!(
359 target: "ingest",
360 value = %other,
361 "SQLITE_GRAPHRAG_LOW_MEMORY value not recognized; treating as disabled"
362 );
363 false
364 }
365 },
366 Err(_) => false,
367 }
368}
369
370fn resolve_parallelism(low_memory_flag: bool, ingest_parallelism: Option<usize>) -> usize {
382 let env_flag = env_low_memory_enabled();
383 let low_memory = low_memory_flag || env_flag;
384
385 if low_memory {
386 if let Some(n) = ingest_parallelism {
387 if n > 1 {
388 tracing::warn!(
389 target: "ingest",
390 requested = n,
391 "--ingest-parallelism overridden by --low-memory; using 1"
392 );
393 }
394 }
395 if low_memory_flag {
396 tracing::info!(
397 target: "ingest",
398 source = "flag",
399 "low-memory mode enabled: forcing --ingest-parallelism 1"
400 );
401 } else {
402 tracing::info!(
403 target: "ingest",
404 source = "env",
405 "low-memory mode enabled via SQLITE_GRAPHRAG_LOW_MEMORY: forcing --ingest-parallelism 1"
406 );
407 }
408 return 1;
409 }
410
411 ingest_parallelism
412 .unwrap_or_else(|| {
413 std::thread::available_parallelism()
414 .map(|v| v.get() / 2)
415 .unwrap_or(1)
416 .clamp(1, 4)
417 })
418 .max(1)
419}
420
/// Serializable per-file record.
///
/// NOTE(review): the emit site is outside this view — presumably one NDJSON
/// line per processed file, as described in the command's help text.
/// Every `Option` field is omitted from the output when it is `None`.
#[derive(Serialize)]
struct IngestFileEvent<'a> {
    /// File the event refers to (presumably its path as displayed — confirm).
    file: &'a str,
    /// Memory name associated with the file.
    name: &'a str,
    /// Outcome label; the set of values is defined by the emit site.
    status: &'a str,
    /// Presumably whether the derived name was shortened — confirm at the emit site.
    truncated: bool,
    /// Omitted when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    original_name: Option<String>,
    /// Omitted when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    original_filename: Option<&'a str>,
    /// Error text for a failed file; omitted when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    error: Option<String>,
    /// Same-named counterpart of `FileSuccess::memory_id`; omitted when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    memory_id: Option<i64>,
    /// Same-named counterpart of `FileSuccess::action`; omitted when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    action: Option<String>,
    /// Always serialized, unlike the optional fields around it.
    body_length: usize,
    /// Same-named counterpart of `FileSuccess::backend_invoked`; omitted when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    backend_invoked: Option<&'a str>,
}
449
/// Serializable run summary.
///
/// NOTE(review): the emit site is outside this view — presumably the final
/// NDJSON line with counts mentioned in the command's help text.
#[derive(Serialize)]
struct IngestSummary {
    /// Marker field; presumably always `true` so consumers can tell the summary
    /// line from per-file lines — confirm at the emit site.
    summary: bool,
    /// Directory that was ingested.
    dir: String,
    /// File-name pattern that was applied.
    pattern: String,
    /// Whether subdirectories were scanned.
    recursive: bool,
    /// Number of files considered.
    files_total: usize,
    /// Number of files ingested successfully.
    files_succeeded: usize,
    /// Number of files that failed.
    files_failed: usize,
    /// Number of files skipped.
    files_skipped: usize,
    /// Wall-clock duration in milliseconds.
    elapsed_ms: u64,
}
462
/// Result of successfully persisting one staged file (returned by `persist_staged`).
struct FileSuccess {
    /// Row id returned by `memories::insert`.
    memory_id: i64,
    /// What happened to the memory; `persist_staged` always reports `"created"`.
    action: String,
    /// Byte length of the stored body.
    body_length: usize,
    /// Copied from `StagedFile::backend_invoked`.
    backend_invoked: Option<&'static str>,
}
470
/// Serializable progress record for a processing stage.
///
/// NOTE(review): the emit site is outside this view; the field meanings below
/// are inferred from names and types only — confirm before relying on them.
#[derive(Serialize)]
struct StageProgressEvent<'a> {
    /// Version tag of this event's schema.
    schema_version: u8,
    /// Event name.
    event: &'a str,
    /// Presumably the file the event refers to.
    path: &'a str,
    /// Presumably a duration in milliseconds.
    ms: u64,
    /// Presumably the number of entities produced for the file.
    entities: usize,
    /// Presumably the number of relationships produced for the file.
    relationships: usize,
}
482
/// Everything prepared for one file by `stage_file` and consumed by `persist_staged`.
struct StagedFile {
    /// File contents exactly as read from disk.
    body: String,
    /// Hex-encoded BLAKE3 hash of `body`.
    body_hash: String,
    /// First 200 characters of `body`.
    snippet: String,
    /// Memory name the file is stored under.
    name: String,
    /// Heuristic description, or the `ingested from <path>` placeholder.
    description: String,
    /// Memory-level embedding: the body embedding for a single chunk, otherwise
    /// the aggregate of the chunk embeddings. `None` when embedding failed and
    /// skip-on-failure is active.
    embedding: Option<Vec<f32>>,
    /// Per-chunk embeddings; only set on the multi-chunk path when embedding succeeded.
    chunk_embeddings: Option<Vec<Vec<f32>>>,
    /// Chunk boundaries produced by `chunking::split_into_chunks_hierarchical`.
    chunks_info: Vec<crate::chunking::Chunk>,
    /// Extracted entities (all typed `Concept` by `stage_file`), capped per memory.
    entities: Vec<NewEntity>,
    /// Relationships to persist; `stage_file` currently always leaves this empty.
    relationships: Vec<NewRelationship>,
    /// Embeddings looked up by entity index in `persist_staged`; `None` when
    /// entity embedding failed and skip-on-failure is active.
    entity_embeddings: Option<Vec<Vec<f32>>>,
    /// URLs found in the body when extraction is enabled.
    urls: Vec<crate::extraction::ExtractedUrl>,
    /// Backend label reported by the single-chunk embedding call; `None` on the
    /// multi-chunk path and when embedding was skipped.
    backend_invoked: Option<&'static str>,
}
504
505#[allow(clippy::too_many_arguments)]
511fn stage_file(
512 _idx: usize,
513 path: &Path,
514 name: &str,
515 paths: &AppPaths,
516 enable_ner: bool,
517 gliner_variant: crate::extraction::GlinerVariant,
518 max_rss_mb: u64,
519 llm_parallelism: usize,
520 llm_backend: crate::cli::LlmBackendChoice,
521 embedding_backend: crate::cli::EmbeddingBackendChoice,
522 auto_describe: bool,
523) -> Result<StagedFile, AppError> {
524 use crate::constants::*;
525
526 if name.len() > MAX_MEMORY_NAME_LEN {
527 return Err(AppError::LimitExceeded(
528 crate::i18n::validation::name_length(MAX_MEMORY_NAME_LEN),
529 ));
530 }
531 if name.starts_with("__") {
532 return Err(AppError::Validation(
533 crate::i18n::validation::reserved_name(),
534 ));
535 }
536 {
537 let slug_re = crate::constants::name_slug_regex();
538 if !slug_re.is_match(name) {
539 return Err(AppError::Validation(crate::i18n::validation::name_kebab(
540 name,
541 )));
542 }
543 }
544
545 let file_size = std::fs::metadata(path).map_err(AppError::Io)?.len();
546 if file_size > MAX_MEMORY_BODY_LEN as u64 {
547 return Err(AppError::LimitExceeded(
548 crate::i18n::validation::body_exceeds(MAX_MEMORY_BODY_LEN),
549 ));
550 }
551 let raw_body = std::fs::read_to_string(path).map_err(AppError::Io)?;
552 if raw_body.len() > MAX_MEMORY_BODY_LEN {
553 return Err(AppError::LimitExceeded(
554 crate::i18n::validation::body_exceeds(MAX_MEMORY_BODY_LEN),
555 ));
556 }
557 if raw_body.trim().is_empty() {
558 return Err(AppError::Validation(crate::i18n::validation::empty_body()));
559 }
560
561 let description = if auto_describe {
562 crate::commands::ingest_heuristics::extract_heuristic_description(
563 &raw_body,
564 Some(&path.display().to_string()),
565 )
566 } else {
567 format!("ingested from {}", path.display())
568 };
569 if description.len() > MAX_MEMORY_DESCRIPTION_LEN {
570 return Err(AppError::Validation(
571 crate::i18n::validation::description_exceeds(MAX_MEMORY_DESCRIPTION_LEN),
572 ));
573 }
574
575 let mut extracted_entities: Vec<NewEntity> = Vec::with_capacity(30);
576 let mut extracted_relationships: Vec<NewRelationship> = Vec::with_capacity(50);
577 let mut extracted_urls: Vec<crate::extraction::ExtractedUrl> = Vec::with_capacity(4);
578 if enable_ner {
579 match crate::extraction::extract_graph_auto(&raw_body, paths, gliner_variant) {
580 Ok(extracted) => {
581 extracted_urls = extracted.urls;
582 extracted_entities = extracted
587 .entities
588 .into_iter()
589 .map(|e| NewEntity {
590 name: e.name,
591 entity_type: crate::entity_type::EntityType::Concept,
592 description: None,
593 })
594 .collect();
595 extracted_relationships.clear();
600
601 if extracted_entities.len() > max_entities_per_memory() {
602 extracted_entities.truncate(max_entities_per_memory());
603 }
604 if extracted_relationships.len() > max_relationships_per_memory() {
605 extracted_relationships.truncate(max_relationships_per_memory());
606 }
607 }
608 Err(e) => {
609 tracing::warn!(
610 target: "ingest",
611 file = %path.display(),
612 "auto-extraction failed (graceful degradation): {e:#}"
613 );
614 }
615 }
616 }
617
618 for rel in &mut extracted_relationships {
619 rel.relation = crate::parsers::normalize_relation(&rel.relation);
620 if let Err(e) = crate::parsers::validate_relation_format(&rel.relation) {
621 return Err(AppError::Validation(format!(
622 "{e} for relationship '{}' -> '{}'",
623 rel.source, rel.target
624 )));
625 }
626 crate::parsers::warn_if_non_canonical(&rel.relation);
627 if !(0.0..=1.0).contains(&rel.strength) {
628 return Err(AppError::Validation(format!(
629 "invalid strength {} for relationship '{}' -> '{}'; expected value in [0.0, 1.0]",
630 rel.strength, rel.source, rel.target
631 )));
632 }
633 }
634
635 let body_hash = blake3::hash(raw_body.as_bytes()).to_hex().to_string();
636 let snippet: String = raw_body.chars().take(200).collect();
637
638 let chunks_info = chunking::split_into_chunks_hierarchical(&raw_body);
639 if chunks_info.len() > REMEMBER_MAX_SAFE_MULTI_CHUNKS {
640 return Err(AppError::LimitExceeded(format!(
641 "document produces {} chunks; current safe operational limit is {} chunks; split the document before using remember",
642 chunks_info.len(),
643 REMEMBER_MAX_SAFE_MULTI_CHUNKS
644 )));
645 }
646
647 let mut chunk_embeddings_opt: Option<Vec<Vec<f32>>> = None;
648 let skip_embed = crate::embedder::should_skip_embedding_on_failure();
649 let (embedding, backend_invoked): (Option<Vec<f32>>, Option<&'static str>) = if chunks_info
653 .len()
654 == 1
655 {
656 match crate::embedder::embed_passage_with_embedding_choice(
657 &paths.models,
658 &raw_body,
659 embedding_backend,
660 llm_backend,
661 ) {
662 Ok((v, k)) => (Some(v), Some(k.as_str())),
663 Err(AppError::Validation(msg)) => return Err(AppError::Validation(msg)),
664 Err(e) if skip_embed => {
665 tracing::warn!(error = %e, file = %path.display(), "ingest: embedding failed; --skip-embedding-on-failure active, persisting without embedding");
666 (None, None)
667 }
668 Err(e) => return Err(e),
669 }
670 } else {
671 let chunk_texts: Vec<String> = chunks_info
674 .iter()
675 .map(|c| chunking::chunk_text(&raw_body, c).to_string())
676 .collect();
677 if let Some(rss) = crate::memory_guard::current_process_memory_mb() {
678 if rss > max_rss_mb {
679 tracing::error!(
680 target: "ingest",
681 rss_mb = rss,
682 max_rss_mb = max_rss_mb,
683 file = %path.display(),
684 "RSS exceeded --max-rss-mb threshold; aborting to prevent system instability"
685 );
686 return Err(AppError::LowMemory {
687 available_mb: crate::memory_guard::available_memory_mb(),
688 required_mb: max_rss_mb,
689 });
690 }
691 }
692 match crate::embedder::embed_passages_parallel_with_embedding_choice(
693 &paths.models,
694 &chunk_texts,
695 llm_parallelism,
696 crate::embedder::chunk_embed_batch_size(),
697 embedding_backend,
698 llm_backend,
699 ) {
700 Ok(chunk_embeddings) => {
701 let aggregated = chunking::aggregate_embeddings(&chunk_embeddings);
702 chunk_embeddings_opt = Some(chunk_embeddings);
703 (Some(aggregated), None)
706 }
707 Err(AppError::Validation(msg)) => return Err(AppError::Validation(msg)),
708 Err(e) if skip_embed => {
709 tracing::warn!(error = %e, file = %path.display(), "ingest: chunk embedding failed; --skip-embedding-on-failure active, persisting without embedding");
710 (None, None)
711 }
712 Err(e) => return Err(e),
713 }
714 };
715
716 let entity_texts: Vec<String> = extracted_entities
718 .iter()
719 .map(|entity| match &entity.description {
720 Some(desc) => format!("{} {}", entity.name, desc),
721 None => entity.name.clone(),
722 })
723 .collect();
724 let entity_embeddings_opt = match crate::embedder::embed_entity_texts_cached(
728 &paths.models,
729 &entity_texts,
730 llm_parallelism,
731 embedding_backend,
732 llm_backend,
733 ) {
734 Ok((entity_embeddings, embed_cache_stats)) => {
735 if embed_cache_stats.hits > 0 {
736 tracing::debug!(
737 hits = embed_cache_stats.hits,
738 misses = embed_cache_stats.misses,
739 requested = embed_cache_stats.requested,
740 "G56: entity embed cache hit (ingest)"
741 );
742 }
743 Some(entity_embeddings)
744 }
745 Err(e) if skip_embed => {
746 tracing::warn!(error = %e, file = %path.display(), "ingest: entity embedding failed; --skip-embedding-on-failure active");
747 None
748 }
749 Err(e) => return Err(e),
750 };
751
752 Ok(StagedFile {
753 body: raw_body,
754 body_hash,
755 snippet,
756 name: name.to_string(),
757 description,
758 embedding,
759 chunk_embeddings: chunk_embeddings_opt,
760 chunks_info,
761 entities: extracted_entities,
762 relationships: extracted_relationships,
763 entity_embeddings: entity_embeddings_opt,
764 urls: extracted_urls,
765 backend_invoked,
766 })
767}
768
769fn persist_staged(
771 conn: &mut Connection,
772 namespace: &str,
773 memory_type: &str,
774 staged: StagedFile,
775) -> Result<FileSuccess, AppError> {
776 {
777 let active_count: u32 = conn.query_row(
778 "SELECT COUNT(DISTINCT namespace) FROM memories WHERE deleted_at IS NULL",
779 [],
780 |r| r.get::<_, i64>(0).map(|v| v as u32),
781 )?;
782 let ns_exists: bool = conn.query_row(
783 "SELECT EXISTS(SELECT 1 FROM memories WHERE namespace = ?1 AND deleted_at IS NULL)",
784 rusqlite::params![namespace],
785 |r| r.get::<_, i64>(0).map(|v| v > 0),
786 )?;
787 if !ns_exists && active_count >= crate::constants::MAX_NAMESPACES_ACTIVE {
788 return Err(AppError::NamespaceError(format!(
789 "active namespace limit of {} exceeded while creating '{namespace}'",
790 crate::constants::MAX_NAMESPACES_ACTIVE
791 )));
792 }
793 }
794
795 let existing_memory = memories::find_by_name(conn, namespace, &staged.name)?;
796 if existing_memory.is_some() {
797 return Err(AppError::Duplicate(errors_msg::duplicate_memory(
798 &staged.name,
799 namespace,
800 )));
801 }
802 let duplicate_hash_id = memories::find_by_hash(conn, namespace, &staged.body_hash)?;
803
804 let new_memory = NewMemory {
805 namespace: namespace.to_string(),
806 name: staged.name.clone(),
807 memory_type: memory_type.to_string(),
808 description: staged.description.clone(),
809 body: staged.body,
810 body_hash: staged.body_hash,
811 session_id: None,
812 source: "agent".to_string(),
813 metadata: serde_json::json!({}),
814 };
815
816 if let Some(hash_id) = duplicate_hash_id {
817 tracing::debug!(
818 target: "ingest",
819 duplicate_memory_id = hash_id,
820 "identical body already exists; persisting a new memory anyway"
821 );
822 }
823
824 let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
825
826 let memory_id = memories::insert(&tx, &new_memory)?;
827 versions::insert_version(
828 &tx,
829 memory_id,
830 1,
831 &staged.name,
832 memory_type,
833 &staged.description,
834 &new_memory.body,
835 &serde_json::to_string(&new_memory.metadata)?,
836 None,
837 "create",
838 )?;
839 if let Some(ref emb) = staged.embedding {
840 memories::upsert_vec(
841 &tx,
842 memory_id,
843 namespace,
844 memory_type,
845 emb,
846 &staged.name,
847 &staged.snippet,
848 )?;
849 }
850
851 if staged.chunks_info.len() > 1 {
852 storage_chunks::insert_chunk_slices(&tx, memory_id, &new_memory.body, &staged.chunks_info)?;
853 if let Some(ref chunk_embeddings) = staged.chunk_embeddings {
854 for (i, emb) in chunk_embeddings.iter().enumerate() {
855 storage_chunks::upsert_chunk_vec(&tx, i as i64, memory_id, i as i32, emb)?;
856 }
857 }
858 }
859
860 if !staged.entities.is_empty() || !staged.relationships.is_empty() {
861 for (idx, entity) in staged.entities.iter().enumerate() {
862 let entity_id = entities::upsert_entity(&tx, namespace, entity)?;
863 if let Some(ref entity_embeddings) = staged.entity_embeddings {
864 if let Some(entity_embedding) = entity_embeddings.get(idx) {
865 entities::upsert_entity_vec(
866 &tx,
867 entity_id,
868 namespace,
869 entity.entity_type,
870 entity_embedding,
871 &entity.name,
872 )?;
873 }
874 }
875 entities::link_memory_entity(&tx, memory_id, entity_id)?;
876 }
877 let entity_types: std::collections::HashMap<&str, EntityType> = staged
878 .entities
879 .iter()
880 .map(|entity| (entity.name.as_str(), entity.entity_type))
881 .collect();
882
883 let mut affected_entity_ids: std::collections::HashSet<i64> =
884 std::collections::HashSet::new();
885 for entity in &staged.entities {
886 if let Some(eid) = entities::find_entity_id(&tx, namespace, &entity.name)? {
887 affected_entity_ids.insert(eid);
888 }
889 }
890
891 for rel in &staged.relationships {
892 let source_entity = NewEntity {
893 name: rel.source.clone(),
894 entity_type: entity_types
895 .get(rel.source.as_str())
896 .copied()
897 .unwrap_or(EntityType::Concept),
898 description: None,
899 };
900 let target_entity = NewEntity {
901 name: rel.target.clone(),
902 entity_type: entity_types
903 .get(rel.target.as_str())
904 .copied()
905 .unwrap_or(EntityType::Concept),
906 description: None,
907 };
908 let source_id = entities::upsert_entity(&tx, namespace, &source_entity)?;
909 let target_id = entities::upsert_entity(&tx, namespace, &target_entity)?;
910 let rel_id = entities::upsert_relationship(&tx, namespace, source_id, target_id, rel)?;
911 entities::link_memory_relationship(&tx, memory_id, rel_id)?;
912 affected_entity_ids.insert(source_id);
913 affected_entity_ids.insert(target_id);
914 }
915
916 for &eid in &affected_entity_ids {
917 entities::recalculate_degree(&tx, eid)?;
918 }
919 }
920
921 tx.commit()?;
922
923 if !staged.urls.is_empty() {
924 let url_entries: Vec<storage_urls::MemoryUrl> = staged
925 .urls
926 .into_iter()
927 .map(|u| storage_urls::MemoryUrl {
928 url: u.url,
929 offset: Some(u.start as i64),
930 })
931 .collect();
932 let _ = storage_urls::insert_urls(conn, memory_id, &url_entries);
933 }
934
935 Ok(FileSuccess {
936 memory_id,
937 action: "created".to_string(),
938 body_length: new_memory.body.len(),
939 backend_invoked: staged.backend_invoked,
940 })
941}
942
/// Returns `true` when `value` compares equal to `default`.
///
/// Used by the flag validator to detect whether a defaulted option was changed.
fn is_at_default<T>(value: T, default: T) -> bool
where
    T: PartialEq,
{
    PartialEq::eq(&value, &default)
}
953
954fn validate_mode_conditional_flags_ingest(args: &IngestArgs) -> Result<(), AppError> {
969 const DEFAULT_TIMEOUT: u64 = 300;
970 const DEFAULT_RATE_LIMIT_WAIT: u64 = 60;
971
972 let mut conflicts: Vec<String> = Vec::new();
973
974 let is_local_mode = args.mode == IngestMode::None || args.mode == IngestMode::Gliner;
975
976 if is_local_mode {
977 if args.claude_binary.is_some() {
978 conflicts.push("--claude-binary is ignored when --mode is none or gliner".to_string());
979 }
980 if args.claude_model.is_some() {
981 conflicts.push("--claude-model is ignored when --mode is none or gliner".to_string());
982 }
983 if !is_at_default(args.claude_timeout, DEFAULT_TIMEOUT) {
984 conflicts.push(format!(
985 "--claude-timeout={} is ignored when --mode is none or gliner (remove the flag to use the default 300s)",
986 args.claude_timeout
987 ));
988 }
989 if args.codex_binary.is_some() {
990 conflicts.push("--codex-binary is ignored when --mode is none or gliner".to_string());
991 }
992 if args.codex_model.is_some() {
993 conflicts.push("--codex-model is ignored when --mode is none or gliner".to_string());
994 }
995 if !is_at_default(args.codex_timeout, DEFAULT_TIMEOUT) {
996 conflicts.push(format!(
997 "--codex-timeout={} is ignored when --mode is none or gliner (remove the flag to use the default 300s)",
998 args.codex_timeout
999 ));
1000 }
1001 if args.opencode_binary.is_some() {
1002 conflicts
1003 .push("--opencode-binary is ignored when --mode is none or gliner".to_string());
1004 }
1005 if args.opencode_model.is_some() {
1006 conflicts.push("--opencode-model is ignored when --mode is none or gliner".to_string());
1007 }
1008 if !is_at_default(args.opencode_timeout, DEFAULT_TIMEOUT) {
1009 conflicts.push(format!(
1010 "--opencode-timeout={} is ignored when --mode is none or gliner (remove the flag to use the default 300s)",
1011 args.opencode_timeout
1012 ));
1013 }
1014 if args.max_cost_usd.is_some() {
1015 conflicts.push("--max-cost-usd is ignored when --mode is none or gliner (cost is only tracked for LLM-backed modes)".to_string());
1016 }
1017 if args.resume {
1018 conflicts.push("--resume is ignored when --mode is none or gliner (the queue DB is only used by LLM-backed modes)".to_string());
1019 }
1020 if args.retry_failed {
1021 conflicts.push("--retry-failed is ignored when --mode is none or gliner".to_string());
1022 }
1023 if args.keep_queue {
1024 conflicts.push("--keep-queue is ignored when --mode is none or gliner".to_string());
1025 }
1026 if !is_at_default(args.rate_limit_wait, DEFAULT_RATE_LIMIT_WAIT) {
1027 conflicts.push(format!(
1028 "--rate-limit-wait={} is ignored when --mode is none or gliner",
1029 args.rate_limit_wait
1030 ));
1031 }
1032 }
1033
1034 match args.mode {
1035 IngestMode::ClaudeCode => {
1036 if args.codex_binary.is_some() {
1037 conflicts.push("--codex-binary is ignored when --mode=claude-code".to_string());
1038 }
1039 if args.codex_model.is_some() {
1040 conflicts.push("--codex-model is ignored when --mode=claude-code".to_string());
1041 }
1042 if !is_at_default(args.codex_timeout, DEFAULT_TIMEOUT) {
1043 conflicts.push(format!(
1044 "--codex-timeout={} is ignored when --mode=claude-code (remove the flag to use the default 300s)",
1045 args.codex_timeout
1046 ));
1047 }
1048 if args.opencode_binary.is_some() {
1049 conflicts.push("--opencode-binary is ignored when --mode=claude-code".to_string());
1050 }
1051 if args.opencode_model.is_some() {
1052 conflicts.push("--opencode-model is ignored when --mode=claude-code".to_string());
1053 }
1054 if !is_at_default(args.opencode_timeout, DEFAULT_TIMEOUT) {
1055 conflicts.push(format!(
1056 "--opencode-timeout={} is ignored when --mode=claude-code (remove the flag to use the default 300s)",
1057 args.opencode_timeout
1058 ));
1059 }
1060 }
1061 IngestMode::Codex => {
1062 if args.claude_binary.is_some() {
1063 conflicts.push("--claude-binary is ignored when --mode=codex".to_string());
1064 }
1065 if args.claude_model.is_some() {
1066 conflicts.push("--claude-model is ignored when --mode=codex".to_string());
1067 }
1068 if !is_at_default(args.claude_timeout, DEFAULT_TIMEOUT) {
1069 conflicts.push(format!(
1070 "--claude-timeout={} is ignored when --mode=codex (remove the flag to use the default 300s)",
1071 args.claude_timeout
1072 ));
1073 }
1074 if args.max_cost_usd.is_some() {
1075 conflicts.push(
1076 "--max-cost-usd is ignored when --mode=codex (OAuth-first; cost is metered by your subscription)"
1077 .to_string(),
1078 );
1079 }
1080 if args.resume {
1081 conflicts.push("--resume is only valid for --mode=claude-code".to_string());
1082 }
1083 if args.retry_failed {
1084 conflicts.push("--retry-failed is only valid for --mode=claude-code".to_string());
1085 }
1086 if args.keep_queue {
1087 conflicts.push("--keep-queue is only valid for --mode=claude-code".to_string());
1088 }
1089 if args.opencode_binary.is_some() {
1090 conflicts.push("--opencode-binary is ignored when --mode=codex".to_string());
1091 }
1092 if args.opencode_model.is_some() {
1093 conflicts.push("--opencode-model is ignored when --mode=codex".to_string());
1094 }
1095 if !is_at_default(args.opencode_timeout, DEFAULT_TIMEOUT) {
1096 conflicts.push(format!(
1097 "--opencode-timeout={} is ignored when --mode=codex (remove the flag to use the default 300s)",
1098 args.opencode_timeout
1099 ));
1100 }
1101 }
1102 IngestMode::Opencode => {
1103 if args.claude_binary.is_some() {
1104 conflicts.push("--claude-binary is ignored when --mode=opencode".to_string());
1105 }
1106 if args.claude_model.is_some() {
1107 conflicts.push("--claude-model is ignored when --mode=opencode".to_string());
1108 }
1109 if !is_at_default(args.claude_timeout, DEFAULT_TIMEOUT) {
1110 conflicts.push(format!(
1111 "--claude-timeout={} is ignored when --mode=opencode (remove the flag to use the default 300s)",
1112 args.claude_timeout
1113 ));
1114 }
1115 if args.codex_binary.is_some() {
1116 conflicts.push("--codex-binary is ignored when --mode=opencode".to_string());
1117 }
1118 if args.codex_model.is_some() {
1119 conflicts.push("--codex-model is ignored when --mode=opencode".to_string());
1120 }
1121 if !is_at_default(args.codex_timeout, DEFAULT_TIMEOUT) {
1122 conflicts.push(format!(
1123 "--codex-timeout={} is ignored when --mode=opencode (remove the flag to use the default 300s)",
1124 args.codex_timeout
1125 ));
1126 }
1127 if args.max_cost_usd.is_some() {
1128 conflicts.push(
1129 "--max-cost-usd is ignored when --mode=opencode (OAuth-first; cost is metered by your subscription)"
1130 .to_string(),
1131 );
1132 }
1133 if args.resume {
1134 conflicts.push("--resume is only valid for --mode=claude-code".to_string());
1135 }
1136 if args.retry_failed {
1137 conflicts.push("--retry-failed is only valid for --mode=claude-code".to_string());
1138 }
1139 if args.keep_queue {
1140 conflicts.push("--keep-queue is only valid for --mode=claude-code".to_string());
1141 }
1142 }
1143 IngestMode::None | IngestMode::Gliner => {}
1144 }
1145
1146 if !conflicts.is_empty() {
1147 return Err(AppError::Validation(format!(
1148 "G20: mode-conditional flag conflicts detected for --mode={:?}:\n - {}",
1149 args.mode,
1150 conflicts.join("\n - ")
1151 )));
1152 }
1153
1154 Ok(())
1155}
1156
1157#[tracing::instrument(skip_all, level = "debug", name = "ingest")]
1160pub fn run(
1161 args: IngestArgs,
1162 llm_backend: crate::cli::LlmBackendChoice,
1163 embedding_backend: crate::cli::EmbeddingBackendChoice,
1164) -> Result<(), AppError> {
1165 validate_mode_conditional_flags_ingest(&args)?;
1168 tracing::debug!(target: "ingest", dir = %args.dir.display(), mode = ?args.mode, "starting ingest");
1169 if args.mode == IngestMode::ClaudeCode {
1170 return super::ingest_claude::run_claude_ingest(&args, embedding_backend, llm_backend);
1171 }
1172 if args.mode == IngestMode::Codex {
1173 return super::ingest_codex::run_codex_ingest(&args);
1174 }
1175 if args.mode == IngestMode::Opencode {
1176 return super::ingest_opencode::run_opencode_ingest(&args);
1177 }
1178
1179 let started = std::time::Instant::now();
1180
1181 if !args.dir.exists() {
1182 return Err(AppError::Validation(format!(
1183 "directory not found: {}",
1184 args.dir.display()
1185 )));
1186 }
1187 if !args.dir.is_dir() {
1188 return Err(AppError::Validation(format!(
1189 "path is not a directory: {}",
1190 args.dir.display()
1191 )));
1192 }
1193
1194 let mut files: Vec<PathBuf> = Vec::with_capacity(128);
1195 collect_files(&args.dir, &args.pattern, args.recursive, &mut files)?;
1196 files.sort_unstable();
1197
1198 if files.len() > args.max_files {
1199 return Err(AppError::Validation(format!(
1200 "found {} files matching pattern, exceeds --max-files cap of {} (raise the cap or narrow the pattern)",
1201 files.len(),
1202 args.max_files
1203 )));
1204 }
1205
1206 let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
1207 let memory_type_str = args.r#type.as_str().to_string();
1208
1209 let paths = AppPaths::resolve(args.db.as_deref())?;
1210 let mut conn_or_err = match init_storage(&paths) {
1211 Ok(c) => Ok(c),
1212 Err(e) => Err(format!("{e}")),
1213 };
1214
1215 let mut succeeded: usize = 0;
1216 let mut failed: usize = 0;
1217 let mut skipped: usize = 0;
1218 let total = files.len();
1219
1220 let mut taken_names: BTreeSet<String> = BTreeSet::new();
1223
1224 enum SlotMeta {
1230 Skip {
1231 file_str: String,
1232 derived_base: String,
1233 name_truncated: bool,
1234 original_name: Option<String>,
1235 original_filename: Option<String>,
1236 reason: String,
1237 },
1238 Process {
1239 file_str: String,
1240 derived_name: String,
1241 name_truncated: bool,
1242 original_name: Option<String>,
1243 original_filename: Option<String>,
1244 },
1245 }
1246
1247 struct ProcessItem {
1248 idx: usize,
1249 path: PathBuf,
1250 file_str: String,
1251 derived_name: String,
1252 }
1253
1254 let files_cap = files.len();
1255 let mut slots_meta: Vec<SlotMeta> = Vec::new();
1256 slots_meta.try_reserve(files_cap).map_err(|_| {
1257 AppError::LimitExceeded(format!(
1258 "allocation of {files_cap} slot metadata entries would exceed available memory"
1259 ))
1260 })?;
1261 let mut process_items: Vec<ProcessItem> = Vec::new();
1262 process_items.try_reserve(files_cap).map_err(|_| {
1263 AppError::LimitExceeded(format!(
1264 "allocation of {files_cap} process items would exceed available memory"
1265 ))
1266 })?;
1267 let mut truncations: Vec<(String, String)> = Vec::new();
1268 truncations.try_reserve(files_cap).map_err(|_| {
1269 AppError::LimitExceeded(format!(
1270 "allocation of {files_cap} truncation entries would exceed available memory"
1271 ))
1272 })?;
1273
1274 let max_name_length = args.max_name_length;
1275 for path in &files {
1276 let file_str = path.to_string_lossy().into_owned();
1277 let (derived_base, name_truncated, original_name) =
1278 derive_kebab_name(path, max_name_length);
1279 let original_basename = path.file_stem().and_then(|s| s.to_str()).unwrap_or("");
1280
1281 if name_truncated {
1282 if let Some(ref orig) = original_name {
1283 truncations.push((orig.clone(), derived_base.clone()));
1284 }
1285 }
1286
1287 if derived_base.is_empty() {
1288 let orig_filename = if !original_basename.is_empty() {
1290 Some(original_basename.to_string())
1291 } else {
1292 None
1293 };
1294 slots_meta.push(SlotMeta::Skip {
1295 file_str,
1296 derived_base: String::new(),
1297 name_truncated: false,
1298 original_name: None,
1299 original_filename: orig_filename,
1300 reason: "could not derive a non-empty kebab-case name from filename".to_string(),
1301 });
1302 continue;
1303 }
1304
1305 match unique_name(&derived_base, &taken_names) {
1306 Ok(derived_name) => {
1307 taken_names.insert(derived_name.clone());
1308 let idx = slots_meta.len();
1309 let orig_filename = if original_basename != derived_name {
1311 Some(original_basename.to_string())
1312 } else {
1313 None
1314 };
1315 process_items.push(ProcessItem {
1316 idx,
1317 path: path.clone(),
1318 file_str: file_str.clone(),
1319 derived_name: derived_name.clone(),
1320 });
1321 slots_meta.push(SlotMeta::Process {
1322 file_str,
1323 derived_name,
1324 name_truncated,
1325 original_name,
1326 original_filename: orig_filename,
1327 });
1328 }
1329 Err(e) => {
1330 let orig_filename = if original_basename != derived_base {
1331 Some(original_basename.to_string())
1332 } else {
1333 None
1334 };
1335 slots_meta.push(SlotMeta::Skip {
1336 file_str,
1337 derived_base,
1338 name_truncated,
1339 original_name,
1340 original_filename: orig_filename,
1341 reason: e.to_string(),
1342 });
1343 }
1344 }
1345 }
1346
1347 if !truncations.is_empty() {
1348 tracing::info!(
1349 target: "ingest",
1350 count = truncations.len(),
1351 max_name_length = max_name_length,
1352 max_len = DERIVED_NAME_MAX_LEN,
1353 "derived names truncated; pass -vv (debug) for per-file detail"
1354 );
1355 }
1356
1357 if args.dry_run {
1359 for meta in &slots_meta {
1360 match meta {
1361 SlotMeta::Skip {
1362 file_str,
1363 derived_base,
1364 name_truncated,
1365 original_name,
1366 original_filename,
1367 reason,
1368 } => {
1369 output::emit_json_compact(&IngestFileEvent {
1370 file: file_str,
1371 name: derived_base,
1372 status: "skip",
1373 truncated: *name_truncated,
1374 original_name: original_name.clone(),
1375 original_filename: original_filename.as_deref(),
1376 error: Some(reason.clone()),
1377 memory_id: None,
1378 action: None,
1379 body_length: 0,
1380 backend_invoked: None,
1381 })?;
1382 }
1383 SlotMeta::Process {
1384 file_str,
1385 derived_name,
1386 name_truncated,
1387 original_name,
1388 original_filename,
1389 } => {
1390 output::emit_json_compact(&IngestFileEvent {
1391 file: file_str,
1392 name: derived_name,
1393 status: "preview",
1394 truncated: *name_truncated,
1395 original_name: original_name.clone(),
1396 original_filename: original_filename.as_deref(),
1397 error: None,
1398 memory_id: None,
1399 action: None,
1400 body_length: 0,
1401 backend_invoked: None,
1402 })?;
1403 }
1404 }
1405 }
1406 output::emit_json_compact(&IngestSummary {
1407 summary: true,
1408 dir: args.dir.to_string_lossy().into_owned(),
1409 pattern: args.pattern.clone(),
1410 recursive: args.recursive,
1411 files_total: total,
1412 files_succeeded: 0,
1413 files_failed: 0,
1414 files_skipped: 0,
1415 elapsed_ms: started.elapsed().as_millis() as u64,
1416 })?;
1417 return Ok(());
1418 }
1419
1420 if args.low_memory {
1422 if let Some(n) = args.ingest_parallelism {
1423 if n > 1 {
1424 return Err(AppError::Validation(
1425 "--ingest-parallelism N>1 conflicts with --low-memory; use one or the other"
1426 .to_string(),
1427 ));
1428 }
1429 }
1430 }
1431
1432 let parallelism = resolve_parallelism(args.low_memory, args.ingest_parallelism);
1435
1436 let pool = rayon::ThreadPoolBuilder::new()
1437 .num_threads(parallelism)
1438 .build()
1439 .map_err(|e| AppError::Internal(anyhow::anyhow!("rayon pool: {e}")))?;
1440
1441 if args.enable_ner && args.skip_extraction {
1442 return Err(AppError::Validation(
1443 "--enable-ner and --skip-extraction are mutually exclusive; remove one".to_string(),
1444 ));
1445 }
1446 if args.skip_extraction && !args.enable_ner {
1447 tracing::warn!(
1454 "--skip-extraction is deprecated since v1.0.45 and has no effect (NER is disabled by default); remove this flag to silence the warning"
1455 );
1456 }
1457 let enable_ner = args.enable_ner;
1458 let auto_describe = args.auto_describe && !args.no_auto_describe;
1459 let max_rss_mb = args.max_rss_mb;
1460 let llm_parallelism = args.llm_parallelism as usize;
1461 if args.mode == IngestMode::Gliner {
1465 tracing::warn!(
1466 "--mode gliner is deprecated since v1.0.79 (the GLiNER pipeline was removed); it now performs URL-regex extraction only — use --mode claude-code or --mode codex for LLM-curated extraction"
1467 );
1468 }
1469 if args.gliner_variant != "fp32" {
1470 tracing::warn!(
1471 "--gliner-variant is deprecated and has no effect since v1.0.79 (the GLiNER pipeline was removed)"
1472 );
1473 }
1474 let gliner_variant: crate::extraction::GlinerVariant = match args.gliner_variant.as_str() {
1475 "int8" => crate::extraction::GlinerVariant::Int8,
1476 _ => crate::extraction::GlinerVariant::Fp32,
1477 };
1478
1479 let total_to_process = process_items.len();
1480 tracing::info!(
1481 target: "ingest",
1482 phase = "pipeline_start",
1483 files = total_to_process,
1484 ingest_parallelism = parallelism,
1485 "incremental pipeline starting: Phase A (rayon) → channel → Phase B (main thread)",
1486 );
1487
1488 let channel_bound = (parallelism * 2).max(1);
1492 let (tx, rx) = mpsc::sync_channel::<(usize, Result<StagedFile, AppError>)>(channel_bound);
1493
1494 let paths_owned = paths.clone();
1499 let llm_backend_owned = llm_backend;
1500 let embedding_backend_owned = embedding_backend;
1501 let producer_handle = std::thread::spawn(move || {
1502 pool.install(|| {
1503 process_items.into_par_iter().for_each(|item| {
1504 if crate::shutdown_requested() {
1505 return;
1506 }
1507 let t0 = std::time::Instant::now();
1508 let result = stage_file(
1509 item.idx,
1510 &item.path,
1511 &item.derived_name,
1512 &paths_owned,
1513 enable_ner,
1514 gliner_variant,
1515 max_rss_mb,
1516 llm_parallelism,
1517 llm_backend_owned,
1518 embedding_backend_owned,
1519 auto_describe,
1520 );
1521 let elapsed_ms = t0.elapsed().as_millis() as u64;
1522
1523 let (n_entities, n_relationships) = match &result {
1526 Ok(sf) => (sf.entities.len(), sf.relationships.len()),
1527 Err(_) => (0, 0),
1528 };
1529 let progress = StageProgressEvent {
1530 schema_version: 1,
1531 event: "file_extracted",
1532 path: &item.file_str,
1533 ms: elapsed_ms,
1534 entities: n_entities,
1535 relationships: n_relationships,
1536 };
1537 if let Ok(line) = serde_json::to_string(&progress) {
1538 tracing::info!(target: "ingest_progress", "{}", line);
1539 }
1540
1541 let _ = tx.send((item.idx, result));
1545 });
1546 drop(tx);
1548 });
1549 });
1550
1551 let fail_fast = args.fail_fast;
1563
1564 for meta in &slots_meta {
1566 if let SlotMeta::Skip {
1567 file_str,
1568 derived_base,
1569 name_truncated,
1570 original_name,
1571 original_filename,
1572 reason,
1573 } = meta
1574 {
1575 output::emit_json_compact(&IngestFileEvent {
1576 file: file_str,
1577 name: derived_base,
1578 status: "skipped",
1579 truncated: *name_truncated,
1580 original_name: original_name.clone(),
1581 original_filename: original_filename.as_deref(),
1582 error: Some(reason.clone()),
1583 memory_id: None,
1584 action: None,
1585 body_length: 0,
1586 backend_invoked: None,
1587 })?;
1588 skipped += 1;
1589 }
1590 }
1591
1592 let meta_index: std::collections::HashMap<usize, &SlotMeta> = slots_meta
1595 .iter()
1596 .enumerate()
1597 .filter(|(_, m)| matches!(m, SlotMeta::Process { .. }))
1598 .collect();
1599
1600 tracing::info!(
1601 target: "ingest",
1602 phase = "persist_start",
1603 files = total_to_process,
1604 "phase B starting: persisting files incrementally as Phase A completes each one",
1605 );
1606
1607 for (idx, stage_result) in rx {
1611 if crate::shutdown_requested() {
1612 tracing::info!(target: "ingest", "shutdown requested, stopping persistence loop");
1613 break;
1614 }
1615 let meta = meta_index.get(&idx).ok_or_else(|| {
1616 AppError::Internal(anyhow::anyhow!(
1617 "channel idx {idx} has no corresponding Process slot"
1618 ))
1619 })?;
1620 let (file_str, derived_name, name_truncated, original_name, original_filename) = match meta
1621 {
1622 SlotMeta::Process {
1623 file_str,
1624 derived_name,
1625 name_truncated,
1626 original_name,
1627 original_filename,
1628 } => (
1629 file_str,
1630 derived_name,
1631 name_truncated,
1632 original_name,
1633 original_filename,
1634 ),
1635 SlotMeta::Skip { .. } => unreachable!("channel only carries Process results"),
1636 };
1637
1638 let conn = match conn_or_err.as_mut() {
1640 Ok(c) => c,
1641 Err(err_msg) => {
1642 let err_clone = err_msg.clone();
1643 output::emit_json_compact(&IngestFileEvent {
1644 file: file_str,
1645 name: derived_name,
1646 status: "failed",
1647 truncated: *name_truncated,
1648 original_name: original_name.clone(),
1649 original_filename: original_filename.as_deref(),
1650 error: Some(err_clone.clone()),
1651 memory_id: None,
1652 action: None,
1653 body_length: 0,
1654 backend_invoked: None,
1655 })?;
1656 failed += 1;
1657 if fail_fast {
1658 output::emit_json_compact(&IngestSummary {
1659 summary: true,
1660 dir: args.dir.display().to_string(),
1661 pattern: args.pattern.clone(),
1662 recursive: args.recursive,
1663 files_total: total,
1664 files_succeeded: succeeded,
1665 files_failed: failed,
1666 files_skipped: skipped,
1667 elapsed_ms: started.elapsed().as_millis() as u64,
1668 })?;
1669 return Err(AppError::Validation(format!(
1670 "ingest aborted on first failure: {err_clone}"
1671 )));
1672 }
1673 continue;
1674 }
1675 };
1676
1677 let outcome =
1678 stage_result.and_then(|sf| persist_staged(conn, &namespace, &memory_type_str, sf));
1679
1680 match outcome {
1681 Ok(FileSuccess {
1682 memory_id,
1683 action,
1684 body_length,
1685 backend_invoked: file_backend_invoked,
1686 }) => {
1687 output::emit_json_compact(&IngestFileEvent {
1688 file: file_str,
1689 name: derived_name,
1690 status: "indexed",
1691 truncated: *name_truncated,
1692 original_name: original_name.clone(),
1693 original_filename: original_filename.as_deref(),
1694 error: None,
1695 memory_id: Some(memory_id),
1696 action: Some(action),
1697 body_length,
1698 backend_invoked: file_backend_invoked,
1699 })?;
1700 succeeded += 1;
1701 }
1702 Err(ref e) if matches!(e, AppError::Duplicate(_)) => {
1703 output::emit_json_compact(&IngestFileEvent {
1704 file: file_str,
1705 name: derived_name,
1706 status: "skipped",
1707 truncated: *name_truncated,
1708 original_name: original_name.clone(),
1709 original_filename: original_filename.as_deref(),
1710 error: Some(format!("{e}")),
1711 memory_id: None,
1712 action: Some("duplicate".to_string()),
1713 body_length: 0,
1714 backend_invoked: None,
1715 })?;
1716 skipped += 1;
1717 }
1718 Err(e) => {
1719 let err_msg = format!("{e}");
1720 output::emit_json_compact(&IngestFileEvent {
1721 file: file_str,
1722 name: derived_name,
1723 status: "failed",
1724 truncated: *name_truncated,
1725 original_name: original_name.clone(),
1726 original_filename: original_filename.as_deref(),
1727 error: Some(err_msg.clone()),
1728 memory_id: None,
1729 action: None,
1730 body_length: 0,
1731 backend_invoked: None,
1732 })?;
1733 failed += 1;
1734 if fail_fast {
1735 output::emit_json_compact(&IngestSummary {
1736 summary: true,
1737 dir: args.dir.display().to_string(),
1738 pattern: args.pattern.clone(),
1739 recursive: args.recursive,
1740 files_total: total,
1741 files_succeeded: succeeded,
1742 files_failed: failed,
1743 files_skipped: skipped,
1744 elapsed_ms: started.elapsed().as_millis() as u64,
1745 })?;
1746 return Err(AppError::Validation(format!(
1747 "ingest aborted on first failure: {err_msg}"
1748 )));
1749 }
1750 }
1751 }
1752 }
1753
1754 producer_handle
1756 .join()
1757 .map_err(|_| AppError::Internal(anyhow::anyhow!("ingest producer thread panicked")))?;
1758
1759 if let Ok(ref conn) = conn_or_err {
1760 if succeeded > 0 {
1761 let _ = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);");
1762 }
1763 }
1764
1765 output::emit_json_compact(&IngestSummary {
1766 summary: true,
1767 dir: args.dir.display().to_string(),
1768 pattern: args.pattern.clone(),
1769 recursive: args.recursive,
1770 files_total: total,
1771 files_succeeded: succeeded,
1772 files_failed: failed,
1773 files_skipped: skipped,
1774 elapsed_ms: started.elapsed().as_millis() as u64,
1775 })?;
1776
1777 if args.enrich_after && succeeded > 0 {
1778 output::emit_json_compact(&serde_json::json!({
1779 "event": "enrich_phase_started",
1780 "operation": "memory-bindings"
1781 }))?;
1782 let enrich_args = super::enrich::EnrichArgs {
1783 operation: super::enrich::EnrichOperation::MemoryBindings,
1784 mode: super::enrich::EnrichMode::ClaudeCode,
1785 limit: None,
1786 dry_run: false,
1787 namespace: args.namespace.clone(),
1788 claude_binary: args.claude_binary.clone(),
1789 claude_model: args.claude_model.clone(),
1790 claude_timeout: args.claude_timeout,
1791 codex_binary: args.codex_binary.clone(),
1792 codex_model: args.codex_model.clone(),
1793 codex_timeout: args.codex_timeout,
1794 opencode_binary: args.opencode_binary.clone(),
1795 opencode_model: args.opencode_model.clone(),
1796 opencode_timeout: args.opencode_timeout,
1797 openrouter_model: None,
1798 openrouter_api_key: None,
1799 openrouter_timeout: 300,
1800 openrouter_base_url: None,
1801 db: args.db.clone(),
1802 json: false,
1803 resume: false,
1804 retry_failed: false,
1805 max_cost_usd: args.max_cost_usd,
1806 llm_parallelism: args.llm_parallelism as u32,
1807 wait_job_singleton: args.wait_job_singleton,
1808 force_job_singleton: args.force_job_singleton,
1809 names: Vec::new(),
1810 names_file: None,
1811 preflight_check: false,
1812 fallback_mode: None,
1813 rate_limit_buffer: 300,
1814 max_load_check: true,
1815 circuit_breaker_threshold: 5,
1816 preserve_threshold: 0.7,
1817 codex_model_validate: true,
1818 codex_model_fallback: None,
1819 min_output_chars: 500,
1820 max_output_chars: 2000,
1821 preserve_check: true,
1822 prompt_template: None,
1823 until_empty: false,
1824 max_runtime: None,
1825 max_attempts: 5,
1826 status: false,
1827 rest_concurrency: None,
1828 };
1829 match super::enrich::run(&enrich_args, llm_backend, embedding_backend) {
1830 Ok(()) => {
1831 output::emit_json_compact(&serde_json::json!({
1832 "event": "enrich_phase_completed"
1833 }))?;
1834 }
1835 Err(e) => {
1836 tracing::warn!(error = %e, "enrich --operation memory-bindings failed after ingest");
1837 output::emit_json_compact(&serde_json::json!({
1838 "event": "enrich_phase_failed",
1839 "error": e.to_string()
1840 }))?;
1841 }
1842 }
1843 }
1844
1845 Ok(())
1846}
1847
1848fn init_storage(paths: &AppPaths) -> Result<Connection, AppError> {
1854 ensure_db_ready(paths)?;
1855 let conn = open_rw(&paths.db)?;
1856 Ok(conn)
1857}
1858
1859pub(crate) fn collect_files(
1860 dir: &Path,
1861 pattern: &str,
1862 recursive: bool,
1863 out: &mut Vec<PathBuf>,
1864) -> Result<(), AppError> {
1865 let entries = std::fs::read_dir(dir).map_err(AppError::Io)?;
1866 for entry in entries {
1867 let entry = entry.map_err(AppError::Io)?;
1868 let path = entry.path();
1869 let file_type = entry.file_type().map_err(AppError::Io)?;
1870 if file_type.is_file() {
1871 let name = entry.file_name();
1872 let name_str = name.to_string_lossy();
1873 if matches_pattern(&name_str, pattern) {
1874 out.push(path);
1875 }
1876 } else if file_type.is_dir() && recursive {
1877 collect_files(&path, pattern, recursive, out)?;
1878 }
1879 }
1880 Ok(())
1881}
1882
/// Returns whether the file name `name` matches the glob-like `pattern`.
///
/// `*` matches any run of characters (including none) and may appear anywhere in the pattern,
/// any number of times; every other character must match literally and case-sensitively.
/// A pattern without `*` must equal `name` exactly.
///
/// The historical forms keep their meaning: `*.md` (suffix), `README*` (prefix), `*`
/// (everything) and `README.md` (exact). Patterns such as `chapter-*.md` or `*notes*`, which
/// previously were compared literally and therefore never matched, now work as expected.
fn matches_pattern(name: &str, pattern: &str) -> bool {
    // No wildcard at all: plain equality.
    let (first, after_first) = match pattern.split_once('*') {
        Some(parts) => parts,
        None => return name == pattern,
    };

    // Text before the first `*` is anchored at the start of the name.
    let mut rest = match name.strip_prefix(first) {
        Some(remaining) => remaining,
        None => return false,
    };

    // Text after the last `*` is anchored at the end; whatever lies between the first and the
    // last `*` is a sequence of segments that must occur in order.
    let (middle, last) = match after_first.rsplit_once('*') {
        Some(parts) => parts,
        None => ("", after_first),
    };

    // Taking the leftmost occurrence of each segment leaves the most room for the rest.
    for segment in middle.split('*') {
        match rest.find(segment) {
            Some(at) => rest = &rest[at + segment.len()..],
            None => return false,
        }
    }

    rest.ends_with(last)
}
1892
1893pub(crate) fn derive_kebab_name(path: &Path, max_len: usize) -> (String, bool, Option<String>) {
1904 let stem = path.file_stem().and_then(|s| s.to_str()).unwrap_or("");
1905 let lowered: String = stem
1906 .nfd()
1907 .filter(|c| !unicode_normalization::char::is_combining_mark(*c))
1908 .map(|c| {
1909 if c == '_' || c.is_whitespace() {
1910 '-'
1911 } else {
1912 c
1913 }
1914 })
1915 .map(|c| c.to_ascii_lowercase())
1916 .filter(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || *c == '-')
1917 .collect();
1918 let collapsed = collapse_dashes(&lowered);
1919 let trimmed_raw = collapsed.trim_matches('-').to_string();
1920 let trimmed = if trimmed_raw.starts_with(|c: char| c.is_ascii_digit()) {
1922 format!("doc-{trimmed_raw}")
1923 } else {
1924 trimmed_raw
1925 };
1926 if trimmed.len() > max_len {
1927 let truncated = trimmed[..max_len].trim_matches('-').to_string();
1928 tracing::debug!(
1929 target: "ingest",
1930 original = %trimmed,
1931 truncated_to = %truncated,
1932 max_len = max_len,
1933 "derived memory name truncated to fit length cap; collisions will be resolved with numeric suffixes"
1934 );
1935 (truncated, true, Some(trimmed))
1936 } else {
1937 (trimmed, false, None)
1938 }
1939}
1940
1941fn unique_name(base: &str, taken: &BTreeSet<String>) -> Result<String, AppError> {
1954 if !taken.contains(base) {
1955 return Ok(base.to_string());
1956 }
1957 for suffix in 1..=MAX_NAME_COLLISION_SUFFIX {
1958 let candidate = format!("{base}-{suffix}");
1959 if !taken.contains(&candidate) {
1960 tracing::warn!(
1961 target: "ingest",
1962 base = %base,
1963 resolved = %candidate,
1964 suffix,
1965 "memory name collision resolved with numeric suffix"
1966 );
1967 return Ok(candidate);
1968 }
1969 }
1970 Err(AppError::Validation(format!(
1971 "too many name collisions for base '{base}' (>{MAX_NAME_COLLISION_SUFFIX}); rename source files to disambiguate"
1972 )))
1973}
1974
/// Returns `s` with every run of consecutive `-` reduced to a single `-`.
///
/// All other characters are copied unchanged; leading and trailing dashes are kept (as a
/// single dash each).
fn collapse_dashes(s: &str) -> String {
    let mut collapsed = String::with_capacity(s.len());
    for ch in s.chars() {
        // A dash is redundant exactly when the output already ends with one.
        if ch == '-' && collapsed.ends_with('-') {
            continue;
        }
        collapsed.push(ch);
    }
    collapsed
}
1991
/// Unit tests for file collection, name derivation, collision handling and the
/// low-memory / parallelism resolution of the `ingest` command.
#[cfg(test)]
mod tests {
    use super::*;
    use serial_test::serial;
    use std::path::PathBuf;

    // ---- matches_pattern -------------------------------------------------------------

    #[test]
    fn matches_pattern_suffix() {
        assert!(matches_pattern("foo.md", "*.md"));
        assert!(!matches_pattern("foo.txt", "*.md"));
        assert!(matches_pattern("foo.md", "*"));
    }

    #[test]
    fn matches_pattern_prefix() {
        assert!(matches_pattern("README.md", "README*"));
        assert!(!matches_pattern("CHANGELOG.md", "README*"));
    }

    #[test]
    fn matches_pattern_exact() {
        assert!(matches_pattern("README.md", "README.md"));
        assert!(!matches_pattern("readme.md", "README.md"));
    }

    // ---- derive_kebab_name -----------------------------------------------------------

    #[test]
    fn derive_kebab_underscore_to_dash() {
        let p = PathBuf::from("/tmp/claude_code_headless.md");
        let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert_eq!(name, "claude-code-headless");
        assert!(!truncated);
        assert!(original.is_none());
    }

    #[test]
    fn derive_kebab_uppercase_lowered() {
        let p = PathBuf::from("/tmp/README.md");
        let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert_eq!(name, "readme");
        assert!(!truncated);
        assert!(original.is_none());
    }

    #[test]
    fn derive_kebab_strips_non_kebab_chars() {
        let p = PathBuf::from("/tmp/some@weird#name!.md");
        let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert_eq!(name, "someweirdname");
        assert!(!truncated);
        assert!(original.is_none());
    }

    // Accented letters fold to their ASCII base letter instead of being dropped.
    #[test]
    fn derive_kebab_folds_accented_letters_to_ascii() {
        let p = PathBuf::from("/tmp/açaí.md");
        let (name, _, _) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert_eq!(name, "acai", "got '{name}'");
    }

    #[test]
    fn derive_kebab_handles_naive_with_diaeresis() {
        let p = PathBuf::from("/tmp/naïve-test.md");
        let (name, _, _) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert_eq!(name, "naive-test", "got '{name}'");
    }

    #[test]
    fn derive_kebab_drops_emoji_keeps_word() {
        let p = PathBuf::from("/tmp/🚀-rocket.md");
        let (name, _, _) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert_eq!(name, "rocket", "got '{name}'");
    }

    #[test]
    fn derive_kebab_mixed_unicode_emoji_keeps_letters() {
        let p = PathBuf::from("/tmp/açaí🦜.md");
        let (name, _, _) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert_eq!(name, "acai", "got '{name}'");
    }

    #[test]
    fn derive_kebab_pure_emoji_yields_empty() {
        let p = PathBuf::from("/tmp/🦜🚀🌟.md");
        let (name, _, _) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert!(name.is_empty(), "got '{name}'");
    }

    #[test]
    fn derive_kebab_collapses_consecutive_dashes() {
        let p = PathBuf::from("/tmp/a__b___c.md");
        let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert_eq!(name, "a-b-c");
        assert!(!truncated);
        assert!(original.is_none());
    }

    #[test]
    fn derive_kebab_truncates_to_60_chars() {
        let p = PathBuf::from(format!("/tmp/{}.md", "a".repeat(80)));
        let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert!(name.len() <= 60, "got len {}", name.len());
        assert!(truncated);
        assert!(original.is_some());
        assert!(original.unwrap().len() > 60);
    }

    // ---- collect_files ---------------------------------------------------------------

    #[test]
    fn collect_files_finds_md_files() {
        let tmp = tempfile::tempdir().expect("tempdir");
        std::fs::write(tmp.path().join("a.md"), "x").unwrap();
        std::fs::write(tmp.path().join("b.md"), "y").unwrap();
        std::fs::write(tmp.path().join("c.txt"), "z").unwrap();
        let mut out = Vec::new();
        collect_files(tmp.path(), "*.md", false, &mut out).expect("collect");
        assert_eq!(out.len(), 2, "should find 2 .md files, got {out:?}");
    }

    #[test]
    fn collect_files_recursive_descends_subdirs() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let sub = tmp.path().join("sub");
        std::fs::create_dir(&sub).unwrap();
        std::fs::write(tmp.path().join("a.md"), "x").unwrap();
        std::fs::write(sub.join("b.md"), "y").unwrap();
        let mut out = Vec::new();
        collect_files(tmp.path(), "*.md", true, &mut out).expect("collect");
        assert_eq!(out.len(), 2);
    }

    #[test]
    fn collect_files_non_recursive_skips_subdirs() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let sub = tmp.path().join("sub");
        std::fs::create_dir(&sub).unwrap();
        std::fs::write(tmp.path().join("a.md"), "x").unwrap();
        std::fs::write(sub.join("b.md"), "y").unwrap();
        let mut out = Vec::new();
        collect_files(tmp.path(), "*.md", false, &mut out).expect("collect");
        assert_eq!(out.len(), 1);
    }

    // A very long basename must be cut to the cap without becoming empty.
    #[test]
    fn derive_kebab_long_basename_truncated_within_cap() {
        let p = PathBuf::from(format!("/tmp/{}.md", "a".repeat(120)));
        let (name, truncated, original) = derive_kebab_name(&p, DERIVED_NAME_MAX_LEN);
        assert!(
            name.len() <= DERIVED_NAME_MAX_LEN,
            "truncated name must respect cap; got {} chars",
            name.len()
        );
        assert!(!name.is_empty());
        assert!(truncated);
        assert!(original.is_some());
    }

    // ---- unique_name -----------------------------------------------------------------

    #[test]
    fn unique_name_returns_base_when_free() {
        let taken: BTreeSet<String> = BTreeSet::new();
        let resolved = unique_name("note", &taken).expect("must resolve");
        assert_eq!(resolved, "note");
    }

    #[test]
    fn unique_name_appends_first_free_suffix_on_collision() {
        let mut taken: BTreeSet<String> = BTreeSet::new();
        taken.insert("note".to_string());
        taken.insert("note-1".to_string());
        let resolved = unique_name("note", &taken).expect("must resolve");
        assert_eq!(resolved, "note-2");
    }

    #[test]
    fn unique_name_errors_after_collision_cap() {
        let mut taken: BTreeSet<String> = BTreeSet::new();
        taken.insert("note".to_string());
        for i in 1..=MAX_NAME_COLLISION_SUFFIX {
            taken.insert(format!("note-{i}"));
        }
        let err = unique_name("note", &taken).expect_err("must surface error");
        assert!(matches!(err, AppError::Validation(_)));
    }

    // ---- relation parsing ------------------------------------------------------------

    // Format validation and canonical-relation membership are separate checks.
    #[test]
    fn validate_relation_format_accepts_valid_relations() {
        use crate::parsers::{is_canonical_relation, validate_relation_format};
        assert!(validate_relation_format("applies_to").is_ok());
        assert!(validate_relation_format("depends_on").is_ok());
        assert!(validate_relation_format("implements").is_ok());
        assert!(validate_relation_format("").is_err());
        assert!(is_canonical_relation("applies_to"));
        assert!(!is_canonical_relation("implements"));
    }

    // ---- low-memory / parallelism ----------------------------------------------------

    /// Runs `f` with `SQLITE_GRAPHRAG_LOW_MEMORY` set to `value` (or unset for `None`).
    ///
    /// The previous value is restored by a drop guard, so it is put back even when an
    /// assertion inside `f` panics; otherwise a failing test would leak its environment
    /// into the tests that run after it. Callers must be `#[serial]` because the process
    /// environment is global.
    fn with_env_var<F: FnOnce()>(value: Option<&str>, f: F) {
        const KEY: &str = "SQLITE_GRAPHRAG_LOW_MEMORY";

        /// Puts the captured value back (or removes the variable) when dropped.
        struct Restore(Option<String>);

        impl Drop for Restore {
            fn drop(&mut self) {
                match self.0.take() {
                    Some(p) => std::env::set_var(KEY, p),
                    None => std::env::remove_var(KEY),
                }
            }
        }

        let _restore = Restore(std::env::var(KEY).ok());
        match value {
            Some(v) => std::env::set_var(KEY, v),
            None => std::env::remove_var(KEY),
        }
        f();
    }

    #[test]
    #[serial]
    fn env_low_memory_enabled_unset_returns_false() {
        with_env_var(None, || assert!(!env_low_memory_enabled()));
    }

    #[test]
    #[serial]
    fn env_low_memory_enabled_empty_returns_false() {
        with_env_var(Some(""), || assert!(!env_low_memory_enabled()));
    }

    #[test]
    #[serial]
    fn env_low_memory_enabled_truthy_values_return_true() {
        for v in ["1", "true", "TRUE", "yes", "YES", "on", "On"] {
            with_env_var(Some(v), || {
                assert!(env_low_memory_enabled(), "value {v:?} should be truthy")
            });
        }
    }

    #[test]
    #[serial]
    fn env_low_memory_enabled_falsy_values_return_false() {
        for v in ["0", "false", "FALSE", "no", "off"] {
            with_env_var(Some(v), || {
                assert!(!env_low_memory_enabled(), "value {v:?} should be falsy")
            });
        }
    }

    #[test]
    #[serial]
    fn env_low_memory_enabled_unrecognized_value_returns_false() {
        with_env_var(Some("maybe"), || assert!(!env_low_memory_enabled()));
    }

    #[test]
    #[serial]
    fn resolve_parallelism_flag_forces_one_overriding_explicit_value() {
        with_env_var(None, || {
            assert_eq!(resolve_parallelism(true, Some(4)), 1);
            assert_eq!(resolve_parallelism(true, Some(8)), 1);
            assert_eq!(resolve_parallelism(true, None), 1);
        });
    }

    #[test]
    #[serial]
    fn resolve_parallelism_env_forces_one_when_flag_off() {
        with_env_var(Some("1"), || {
            assert_eq!(resolve_parallelism(false, Some(4)), 1);
            assert_eq!(resolve_parallelism(false, None), 1);
        });
    }

    #[test]
    #[serial]
    fn resolve_parallelism_falsy_env_does_not_override() {
        with_env_var(Some("0"), || {
            assert_eq!(resolve_parallelism(false, Some(4)), 4);
        });
    }

    #[test]
    #[serial]
    fn resolve_parallelism_explicit_value_when_low_memory_off() {
        with_env_var(None, || {
            assert_eq!(resolve_parallelism(false, Some(3)), 3);
            assert_eq!(resolve_parallelism(false, Some(1)), 1);
        });
    }

    #[test]
    #[serial]
    fn resolve_parallelism_default_when_unset() {
        with_env_var(None, || {
            let p = resolve_parallelism(false, None);
            assert!((1..=4).contains(&p), "default must be in [1, 4]; got {p}");
        });
    }

    // ---- clap wiring -----------------------------------------------------------------

    #[test]
    fn ingest_args_parses_low_memory_flag_via_clap() {
        use clap::Parser;
        let cli = crate::cli::Cli::try_parse_from([
            "sqlite-graphrag",
            "ingest",
            "/tmp/dummy",
            "--type",
            "document",
            "--low-memory",
        ])
        .expect("parse must succeed");
        match cli.command {
            Some(crate::cli::Commands::Ingest(args)) => {
                assert!(args.low_memory, "--low-memory must set field to true");
            }
            _ => panic!("expected Ingest subcommand"),
        }
    }

    #[test]
    fn ingest_args_low_memory_defaults_false() {
        use clap::Parser;
        let cli = crate::cli::Cli::try_parse_from([
            "sqlite-graphrag",
            "ingest",
            "/tmp/dummy",
            "--type",
            "document",
        ])
        .expect("parse must succeed");
        match cli.command {
            Some(crate::cli::Commands::Ingest(args)) => {
                assert!(!args.low_memory, "default must be false");
            }
            _ => panic!("expected Ingest subcommand"),
        }
    }
}