1use crate::chunking;
28use crate::cli::MemoryType;
29use crate::errors::AppError;
30use crate::i18n::errors_msg;
31use crate::output::{self, JsonOutputFormat};
32use crate::paths::AppPaths;
33use crate::storage::chunks as storage_chunks;
34use crate::storage::connection::{ensure_db_ready, open_rw};
35use crate::storage::entities::{NewEntity, NewRelationship};
36use crate::storage::memories::NewMemory;
37use crate::storage::{entities, memories, urls as storage_urls, versions};
38use rayon::prelude::*;
39use rusqlite::Connection;
40use serde::Serialize;
41use std::collections::BTreeSet;
42use std::path::{Path, PathBuf};
43use std::sync::Mutex;
44
/// Maximum length (bytes; names are pure ASCII) of a memory name derived
/// from a file basename; longer names are truncated (see `derive_kebab_name`).
const DERIVED_NAME_MAX_LEN: usize = 60;

/// Highest numeric suffix tried when resolving derived-name collisions
/// (see `unique_name`); past this cap the file is skipped with an error.
const MAX_NAME_COLLISION_SUFFIX: usize = 1000;
52
// NOTE: field comments below are deliberately `//`, not `///` — clap derive
// would turn `///` doc comments into --help text for fields that have no
// explicit `help` attribute, changing CLI output.
#[derive(clap::Args)]
#[command(after_long_help = "EXAMPLES:\n \
    # Ingest every Markdown file under ./docs as `document` memories\n \
    sqlite-graphrag ingest ./docs --type document\n\n \
    # Ingest .txt files recursively under ./notes\n \
    sqlite-graphrag ingest ./notes --type note --pattern '*.txt' --recursive\n\n \
    # Skip BERT NER auto-extraction for faster bulk import\n \
    sqlite-graphrag ingest ./big-corpus --type reference --skip-extraction\n\n \
NOTES:\n \
    Each file becomes a separate memory. Names derive from file basenames\n \
    (kebab-case, lowercase, ASCII). Output is NDJSON: one JSON object per file,\n \
    followed by a final summary line with counts. Per-file errors are reported\n \
    inline and processing continues unless --fail-fast is set.")]
pub struct IngestArgs {
    #[arg(
        value_name = "DIR",
        help = "Directory to ingest recursively (each matching file becomes a memory)"
    )]
    pub dir: PathBuf,

    // Memory type assigned to every ingested file.
    #[arg(long, value_enum)]
    pub r#type: MemoryType,

    // Filename pattern: `*suffix`, `prefix*`, or exact match (see matches_pattern).
    #[arg(long, default_value = "*.md")]
    pub pattern: String,

    // Descend into subdirectories when true.
    #[arg(long, default_value_t = false)]
    pub recursive: bool,

    // Skip entity/relationship/URL auto-extraction for each file.
    #[arg(long, default_value_t = false)]
    pub skip_extraction: bool,

    // Abort on the first per-file failure instead of continuing.
    #[arg(long, default_value_t = false)]
    pub fail_fast: bool,

    // Hard cap on the number of matched files; exceeding it is an error.
    #[arg(long, default_value_t = 10_000)]
    pub max_files: usize,

    // Target namespace; resolved via crate::namespace when omitted.
    #[arg(long)]
    pub namespace: Option<String>,

    // Database path override; falls back to the env var, then defaults.
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,

    #[arg(long, value_enum, default_value_t = JsonOutputFormat::Json)]
    pub format: JsonOutputFormat,

    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
    pub json: bool,

    #[arg(
        long,
        help = "Number of files to extract+embed in parallel; default = max(1, cpus/2).min(4)"
    )]
    pub ingest_parallelism: Option<usize>,
}
120
/// One NDJSON line emitted per processed file.
#[derive(Serialize)]
struct IngestFileEvent<'a> {
    /// Source file path as seen on disk.
    file: &'a str,
    /// Derived kebab-case memory name (may be empty for unnameable skips).
    name: &'a str,
    /// One of "indexed", "failed", or "skipped".
    status: &'a str,
    /// True when the derived name was truncated to the length cap.
    truncated: bool,
    /// Pre-truncation name; only present when `truncated` is true.
    #[serde(skip_serializing_if = "Option::is_none")]
    original_name: Option<String>,
    /// Failure/skip reason; only present for non-success statuses.
    #[serde(skip_serializing_if = "Option::is_none")]
    error: Option<String>,
    /// Rowid of the persisted memory; only present on success.
    #[serde(skip_serializing_if = "Option::is_none")]
    memory_id: Option<i64>,
    /// Persistence action taken; only present on success.
    #[serde(skip_serializing_if = "Option::is_none")]
    action: Option<String>,
}
138
/// Final NDJSON line emitted after all per-file events.
#[derive(Serialize)]
struct IngestSummary {
    /// Always true; lets consumers distinguish this line from file events.
    summary: bool,
    dir: String,
    pattern: String,
    recursive: bool,
    /// Count of all matched files (succeeded + failed + skipped).
    files_total: usize,
    files_succeeded: usize,
    files_failed: usize,
    files_skipped: usize,
    /// Wall-clock duration of the whole run in milliseconds.
    elapsed_ms: u64,
}
151
/// Outcome of successfully persisting one staged file.
struct FileSuccess {
    memory_id: i64,
    // Currently always "created" (see persist_staged).
    action: String,
}
157
/// Everything computed for one file before any database work: validated
/// body, hash, embeddings, chunk layout, and extracted graph data.
/// Produced in parallel by `stage_file`, consumed by `persist_staged`.
struct StagedFile {
    body: String,
    /// blake3 hex digest of `body`; used for duplicate-body detection.
    body_hash: String,
    /// First 200 chars of the body, passed to the vector-index upsert.
    snippet: String,
    name: String,
    description: String,
    /// Memory-level embedding (aggregated across chunks when multi-chunk).
    embedding: Vec<f32>,
    /// Per-chunk embeddings; `Some` only when the body split into >1 chunk.
    chunk_embeddings: Option<Vec<Vec<f32>>>,
    chunks_info: Vec<crate::chunking::Chunk>,
    entities: Vec<NewEntity>,
    /// Relations normalized to snake_case and validated.
    relationships: Vec<NewRelationship>,
    /// One embedding per entry in `entities`, paired by index.
    entity_embeddings: Vec<Vec<f32>>,
    urls: Vec<crate::extraction::ExtractedUrl>,
}
174
175fn stage_file(
178 _idx: usize,
179 path: &Path,
180 name: &str,
181 paths: &AppPaths,
182 skip_extraction: bool,
183) -> Result<StagedFile, AppError> {
184 use crate::constants::*;
185
186 if name.len() > MAX_MEMORY_NAME_LEN {
187 return Err(AppError::LimitExceeded(
188 crate::i18n::validation::name_length(MAX_MEMORY_NAME_LEN),
189 ));
190 }
191 if name.starts_with("__") {
192 return Err(AppError::Validation(
193 crate::i18n::validation::reserved_name(),
194 ));
195 }
196 {
197 let slug_re = regex::Regex::new(NAME_SLUG_REGEX)
198 .map_err(|e| AppError::Internal(anyhow::anyhow!("regex: {e}")))?;
199 if !slug_re.is_match(name) {
200 return Err(AppError::Validation(crate::i18n::validation::name_kebab(
201 name,
202 )));
203 }
204 }
205
206 let raw_body = std::fs::read_to_string(path).map_err(AppError::Io)?;
207 if raw_body.len() > MAX_MEMORY_BODY_LEN {
208 return Err(AppError::LimitExceeded(
209 crate::i18n::validation::body_exceeds(MAX_MEMORY_BODY_LEN),
210 ));
211 }
212 if raw_body.trim().is_empty() {
213 return Err(AppError::Validation(crate::i18n::validation::empty_body()));
214 }
215
216 let description = format!("ingested from {}", path.display());
217 if description.len() > MAX_MEMORY_DESCRIPTION_LEN {
218 return Err(AppError::Validation(
219 crate::i18n::validation::description_exceeds(MAX_MEMORY_DESCRIPTION_LEN),
220 ));
221 }
222
223 let mut extracted_entities: Vec<NewEntity> = Vec::new();
224 let mut extracted_relationships: Vec<NewRelationship> = Vec::new();
225 let mut extracted_urls: Vec<crate::extraction::ExtractedUrl> = Vec::new();
226 if !skip_extraction {
227 match crate::extraction::extract_graph_auto(&raw_body, paths) {
228 Ok(extracted) => {
229 extracted_urls = extracted.urls;
230 extracted_entities = extracted.entities;
231 extracted_relationships = extracted.relationships;
232
233 if extracted_entities.len() > MAX_ENTITIES_PER_MEMORY {
234 extracted_entities.truncate(MAX_ENTITIES_PER_MEMORY);
235 }
236 if extracted_relationships.len() > MAX_RELATIONSHIPS_PER_MEMORY {
237 extracted_relationships.truncate(MAX_RELATIONSHIPS_PER_MEMORY);
238 }
239 }
240 Err(e) => {
241 tracing::warn!(
242 file = %path.display(),
243 "auto-extraction failed (graceful degradation): {e:#}"
244 );
245 }
246 }
247 }
248
249 for entity in &extracted_entities {
250 if !is_valid_entity_type(&entity.entity_type) {
251 return Err(AppError::Validation(format!(
252 "invalid entity_type '{}' for entity '{}'",
253 entity.entity_type, entity.name
254 )));
255 }
256 }
257 for rel in &mut extracted_relationships {
258 rel.relation = rel.relation.replace('-', "_");
259 if !is_valid_relation(&rel.relation) {
260 return Err(AppError::Validation(format!(
261 "invalid relation '{}' for relationship '{}' -> '{}'",
262 rel.relation, rel.source, rel.target
263 )));
264 }
265 if !(0.0..=1.0).contains(&rel.strength) {
266 return Err(AppError::Validation(format!(
267 "invalid strength {} for relationship '{}' -> '{}'; expected value in [0.0, 1.0]",
268 rel.strength, rel.source, rel.target
269 )));
270 }
271 }
272
273 let body_hash = blake3::hash(raw_body.as_bytes()).to_hex().to_string();
274 let snippet: String = raw_body.chars().take(200).collect();
275
276 let tokenizer = crate::tokenizer::get_tokenizer(&paths.models)?;
277 let chunks_info = chunking::split_into_chunks_hierarchical(&raw_body, tokenizer);
278 if chunks_info.len() > REMEMBER_MAX_SAFE_MULTI_CHUNKS {
279 return Err(AppError::LimitExceeded(format!(
280 "document produces {} chunks; current safe operational limit is {} chunks; split the document before using remember",
281 chunks_info.len(),
282 REMEMBER_MAX_SAFE_MULTI_CHUNKS
283 )));
284 }
285
286 let mut chunk_embeddings_opt: Option<Vec<Vec<f32>>> = None;
287 let embedding = if chunks_info.len() == 1 {
288 crate::daemon::embed_passage_or_local(&paths.models, &raw_body)?
289 } else {
290 let chunk_texts: Vec<&str> = chunks_info
291 .iter()
292 .map(|c| chunking::chunk_text(&raw_body, c))
293 .collect();
294 let mut chunk_embeddings = Vec::with_capacity(chunk_texts.len());
295 for chunk_text in &chunk_texts {
296 chunk_embeddings.push(crate::daemon::embed_passage_or_local(
297 &paths.models,
298 chunk_text,
299 )?);
300 }
301 let aggregated = chunking::aggregate_embeddings(&chunk_embeddings);
302 chunk_embeddings_opt = Some(chunk_embeddings);
303 aggregated
304 };
305
306 let entity_embeddings = extracted_entities
307 .iter()
308 .map(|entity| {
309 let entity_text = match &entity.description {
310 Some(desc) => format!("{} {}", entity.name, desc),
311 None => entity.name.clone(),
312 };
313 crate::daemon::embed_passage_or_local(&paths.models, &entity_text)
314 })
315 .collect::<Result<Vec<_>, _>>()?;
316
317 Ok(StagedFile {
318 body: raw_body,
319 body_hash,
320 snippet,
321 name: name.to_string(),
322 description,
323 embedding,
324 chunk_embeddings: chunk_embeddings_opt,
325 chunks_info,
326 entities: extracted_entities,
327 relationships: extracted_relationships,
328 entity_embeddings,
329 urls: extracted_urls,
330 })
331}
332
/// Persist one staged file as a brand-new memory plus all derived rows
/// (version, vector entry, chunk slices, entities, relationships) inside a
/// single IMMEDIATE transaction; URLs are written best-effort after commit.
///
/// Returns the new memory id with `action` = "created".
fn persist_staged(
    conn: &mut Connection,
    namespace: &str,
    memory_type: &str,
    staged: StagedFile,
) -> Result<FileSuccess, AppError> {
    // Enforce the active-namespace cap before this insert would implicitly
    // create a new namespace.
    {
        let active_count: u32 = conn.query_row(
            "SELECT COUNT(DISTINCT namespace) FROM memories WHERE deleted_at IS NULL",
            [],
            |r| r.get::<_, i64>(0).map(|v| v as u32),
        )?;
        let ns_exists: bool = conn.query_row(
            "SELECT EXISTS(SELECT 1 FROM memories WHERE namespace = ?1 AND deleted_at IS NULL)",
            rusqlite::params![namespace],
            |r| r.get::<_, i64>(0).map(|v| v > 0),
        )?;
        if !ns_exists && active_count >= crate::constants::MAX_NAMESPACES_ACTIVE {
            return Err(AppError::NamespaceError(format!(
                "active namespace limit of {} exceeded while creating '{namespace}'",
                crate::constants::MAX_NAMESPACES_ACTIVE
            )));
        }
    }

    // A same-name memory in this namespace is a hard error...
    let existing_memory = memories::find_by_name(conn, namespace, &staged.name)?;
    if existing_memory.is_some() {
        return Err(AppError::Duplicate(errors_msg::duplicate_memory(
            &staged.name,
            namespace,
        )));
    }
    // ...while an identical body under a different name is only logged below.
    let duplicate_hash_id = memories::find_by_hash(conn, namespace, &staged.body_hash)?;

    let new_memory = NewMemory {
        namespace: namespace.to_string(),
        name: staged.name.clone(),
        memory_type: memory_type.to_string(),
        description: staged.description.clone(),
        body: staged.body,
        body_hash: staged.body_hash,
        session_id: None,
        source: "agent".to_string(),
        metadata: serde_json::json!({}),
    };

    if let Some(hash_id) = duplicate_hash_id {
        tracing::debug!(
            target: "ingest",
            duplicate_memory_id = hash_id,
            "identical body already exists; persisting a new memory anyway"
        );
    }

    // IMMEDIATE: take the write lock up front so the whole persist is atomic.
    let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;

    let memory_id = memories::insert(&tx, &new_memory)?;
    // Version 1, action "create" — this path never updates an existing memory.
    versions::insert_version(
        &tx,
        memory_id,
        1,
        &staged.name,
        memory_type,
        &staged.description,
        &new_memory.body,
        &serde_json::to_string(&new_memory.metadata)?,
        None,
        "create",
    )?;
    memories::upsert_vec(
        &tx,
        memory_id,
        namespace,
        memory_type,
        &staged.embedding,
        &staged.name,
        &staged.snippet,
    )?;

    // Chunk slices and per-chunk vectors are only stored for multi-chunk
    // bodies; single-chunk memories rely on the memory-level vector alone.
    if staged.chunks_info.len() > 1 {
        storage_chunks::insert_chunk_slices(&tx, memory_id, &new_memory.body, &staged.chunks_info)?;
        let chunk_embeddings = staged.chunk_embeddings.ok_or_else(|| {
            AppError::Internal(anyhow::anyhow!(
                "missing chunk embeddings cache on multi-chunk ingest path"
            ))
        })?;
        for (i, emb) in chunk_embeddings.iter().enumerate() {
            // NOTE(review): the chunk index is passed as both the first id
            // argument and the chunk position — confirm this matches the
            // rowid scheme storage_chunks::upsert_chunk_vec expects.
            storage_chunks::upsert_chunk_vec(&tx, i as i64, memory_id, i as i32, emb)?;
        }
    }

    if !staged.entities.is_empty() || !staged.relationships.is_empty() {
        // Entities and their vectors; entity_embeddings is index-aligned
        // with entities (built that way in stage_file).
        for (idx, entity) in staged.entities.iter().enumerate() {
            let entity_id = entities::upsert_entity(&tx, namespace, entity)?;
            let entity_embedding = &staged.entity_embeddings[idx];
            entities::upsert_entity_vec(
                &tx,
                entity_id,
                namespace,
                &entity.entity_type,
                entity_embedding,
                &entity.name,
            )?;
            entities::link_memory_entity(&tx, memory_id, entity_id)?;
            entities::increment_degree(&tx, entity_id)?;
        }
        // Relationship endpoints may name entities not in the extracted set;
        // such endpoints are upserted with entity_type "concept".
        let entity_types: std::collections::HashMap<&str, &str> = staged
            .entities
            .iter()
            .map(|entity| (entity.name.as_str(), entity.entity_type.as_str()))
            .collect();
        for rel in &staged.relationships {
            let source_entity = NewEntity {
                name: rel.source.clone(),
                entity_type: entity_types
                    .get(rel.source.as_str())
                    .copied()
                    .unwrap_or("concept")
                    .to_string(),
                description: None,
            };
            let target_entity = NewEntity {
                name: rel.target.clone(),
                entity_type: entity_types
                    .get(rel.target.as_str())
                    .copied()
                    .unwrap_or("concept")
                    .to_string(),
                description: None,
            };
            let source_id = entities::upsert_entity(&tx, namespace, &source_entity)?;
            let target_id = entities::upsert_entity(&tx, namespace, &target_entity)?;
            let rel_id = entities::upsert_relationship(&tx, namespace, source_id, target_id, rel)?;
            entities::link_memory_relationship(&tx, memory_id, rel_id)?;
        }
    }

    tx.commit()?;

    // URL rows are written after commit and best-effort: any insert error is
    // deliberately discarded so it cannot fail an already-committed memory.
    if !staged.urls.is_empty() {
        let url_entries: Vec<storage_urls::MemoryUrl> = staged
            .urls
            .into_iter()
            .map(|u| storage_urls::MemoryUrl {
                url: u.url,
                offset: Some(u.offset as i64),
            })
            .collect();
        let _ = storage_urls::insert_urls(conn, memory_id, &url_entries);
    }

    Ok(FileSuccess {
        memory_id,
        action: "created".to_string(),
    })
}
490
/// Entry point for the `ingest` subcommand.
///
/// Pipeline: collect + sort matching files, derive collision-free names
/// sequentially, stage files (read/extract/chunk/embed) in parallel with a
/// bounded rayon pool, then persist sequentially on a single connection —
/// emitting one NDJSON event per file and a final summary line.
pub fn run(args: IngestArgs) -> Result<(), AppError> {
    let started = std::time::Instant::now();

    if !args.dir.exists() {
        return Err(AppError::NotFound(format!(
            "directory not found: {}",
            args.dir.display()
        )));
    }
    if !args.dir.is_dir() {
        return Err(AppError::Validation(format!(
            "path is not a directory: {}",
            args.dir.display()
        )));
    }

    // Sort for a deterministic order regardless of filesystem enumeration.
    let mut files: Vec<PathBuf> = Vec::new();
    collect_files(&args.dir, &args.pattern, args.recursive, &mut files)?;
    files.sort();

    if files.len() > args.max_files {
        return Err(AppError::Validation(format!(
            "found {} files matching pattern, exceeds --max-files cap of {} (raise the cap or narrow the pattern)",
            files.len(),
            args.max_files
        )));
    }

    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
    let memory_type_str = args.r#type.as_str().to_string();

    // Storage-init failure is captured as a message instead of aborting, so
    // every file still gets a per-file "failed" event in the loop below.
    let paths = AppPaths::resolve(args.db.as_deref())?;
    let mut conn_or_err = match init_storage(&paths) {
        Ok(c) => Ok(c),
        Err(e) => Err(format!("{e}")),
    };

    let mut succeeded: usize = 0;
    let mut failed: usize = 0;
    let mut skipped: usize = 0;
    let total = files.len();

    // Names claimed so far in this run; drives collision suffixing.
    let mut taken_names: BTreeSet<String> = BTreeSet::new();

    // A file that passed name derivation and will be staged + persisted.
    struct FileSlot {
        path: PathBuf,
        file_str: String,
        derived_name: String,
        name_truncated: bool,
        original_name: Option<String>,
    }
    // Per-file decision made up front, before any parallel work starts.
    enum Slot {
        Skip {
            file_str: String,
            derived_base: String,
            name_truncated: bool,
            original_name: Option<String>,
            reason: String,
        },
        Process(FileSlot),
    }

    // Name resolution is sequential on purpose: the closure mutates
    // `taken_names`, so collision suffixes are assigned in sorted file order.
    let slots: Vec<Slot> = files
        .iter()
        .map(|path| {
            let file_str = path.to_string_lossy().into_owned();
            let (derived_base, name_truncated, original_name) = derive_kebab_name(path);

            if derived_base.is_empty() {
                return Slot::Skip {
                    file_str,
                    derived_base: String::new(),
                    name_truncated: false,
                    original_name: None,
                    reason: "could not derive a non-empty kebab-case name from filename"
                        .to_string(),
                };
            }

            match unique_name(&derived_base, &taken_names) {
                Ok(derived_name) => {
                    taken_names.insert(derived_name.clone());
                    Slot::Process(FileSlot {
                        path: path.clone(),
                        file_str,
                        derived_name,
                        name_truncated,
                        original_name,
                    })
                }
                Err(e) => Slot::Skip {
                    file_str,
                    derived_base,
                    name_truncated,
                    original_name,
                    reason: e.to_string(),
                },
            }
        })
        .collect();

    // Default parallelism: half the CPUs, clamped to [1, 4]; an explicit
    // --ingest-parallelism value is only floored at 1.
    let parallelism = args
        .ingest_parallelism
        .unwrap_or_else(|| {
            std::thread::available_parallelism()
                .map(|v| v.get() / 2)
                .unwrap_or(1)
                .clamp(1, 4)
        })
        .max(1);

    let pool = rayon::ThreadPoolBuilder::new()
        .num_threads(parallelism)
        .build()
        .map_err(|e| AppError::Internal(anyhow::anyhow!("rayon pool: {e}")))?;

    // One result slot per file; each slot has its own Mutex because rayon
    // workers write results concurrently.
    let staged: Vec<Mutex<Option<Result<StagedFile, AppError>>>> =
        (0..slots.len()).map(|_| Mutex::new(None)).collect();

    let skip_extraction = args.skip_extraction;
    let paths_ref = &paths;

    // Parallel staging phase: no database access happens here.
    pool.install(|| {
        slots.par_iter().enumerate().for_each(|(idx, slot)| {
            if let Slot::Process(fs) = slot {
                let result =
                    stage_file(idx, &fs.path, &fs.derived_name, paths_ref, skip_extraction);
                *staged[idx].lock().expect("staged slot poisoned") = Some(result);
            }
        });
    });

    // Sequential persistence phase, emitting events in sorted file order.
    let fail_fast = args.fail_fast;
    for (idx, slot) in slots.iter().enumerate() {
        match slot {
            Slot::Skip {
                file_str,
                derived_base,
                name_truncated,
                original_name,
                reason,
            } => {
                output::emit_json_compact(&IngestFileEvent {
                    file: file_str,
                    name: derived_base,
                    status: "skipped",
                    truncated: *name_truncated,
                    original_name: original_name.clone(),
                    error: Some(reason.clone()),
                    memory_id: None,
                    action: None,
                })?;
                skipped += 1;
            }
            Slot::Process(fs) => {
                // If storage never initialized, every Process slot fails with
                // the captured init error (and may trip fail-fast).
                let conn = match conn_or_err.as_mut() {
                    Ok(c) => c,
                    Err(err_msg) => {
                        let err_clone = err_msg.clone();
                        output::emit_json_compact(&IngestFileEvent {
                            file: &fs.file_str,
                            name: &fs.derived_name,
                            status: "failed",
                            truncated: fs.name_truncated,
                            original_name: fs.original_name.clone(),
                            error: Some(err_clone.clone()),
                            memory_id: None,
                            action: None,
                        })?;
                        failed += 1;
                        if fail_fast {
                            // Emit the summary even on abort so consumers
                            // always see a terminal line.
                            output::emit_json_compact(&IngestSummary {
                                summary: true,
                                dir: args.dir.display().to_string(),
                                pattern: args.pattern.clone(),
                                recursive: args.recursive,
                                files_total: total,
                                files_succeeded: succeeded,
                                files_failed: failed,
                                files_skipped: skipped,
                                elapsed_ms: started.elapsed().as_millis() as u64,
                            })?;
                            return Err(AppError::Validation(format!(
                                "ingest aborted on first failure: {err_clone}"
                            )));
                        }
                        continue;
                    }
                };

                // Take the staged result out of its slot; a Process slot is
                // always filled by the parallel phase above.
                let stage_result = staged[idx]
                    .lock()
                    .expect("staged slot poisoned")
                    .take()
                    .expect("staged slot empty for Process slot");

                let outcome = stage_result
                    .and_then(|sf| persist_staged(conn, &namespace, &memory_type_str, sf));

                match outcome {
                    Ok(FileSuccess { memory_id, action }) => {
                        output::emit_json_compact(&IngestFileEvent {
                            file: &fs.file_str,
                            name: &fs.derived_name,
                            status: "indexed",
                            truncated: fs.name_truncated,
                            original_name: fs.original_name.clone(),
                            error: None,
                            memory_id: Some(memory_id),
                            action: Some(action),
                        })?;
                        succeeded += 1;
                    }
                    Err(e) => {
                        let err_msg = format!("{e}");
                        output::emit_json_compact(&IngestFileEvent {
                            file: &fs.file_str,
                            name: &fs.derived_name,
                            status: "failed",
                            truncated: fs.name_truncated,
                            original_name: fs.original_name.clone(),
                            error: Some(err_msg.clone()),
                            memory_id: None,
                            action: None,
                        })?;
                        failed += 1;
                        if fail_fast {
                            output::emit_json_compact(&IngestSummary {
                                summary: true,
                                dir: args.dir.display().to_string(),
                                pattern: args.pattern.clone(),
                                recursive: args.recursive,
                                files_total: total,
                                files_succeeded: succeeded,
                                files_failed: failed,
                                files_skipped: skipped,
                                elapsed_ms: started.elapsed().as_millis() as u64,
                            })?;
                            return Err(AppError::Validation(format!(
                                "ingest aborted on first failure: {err_msg}"
                            )));
                        }
                    }
                }
            }
        }
    }

    output::emit_json_compact(&IngestSummary {
        summary: true,
        dir: args.dir.display().to_string(),
        pattern: args.pattern.clone(),
        recursive: args.recursive,
        files_total: total,
        files_succeeded: succeeded,
        files_failed: failed,
        files_skipped: skipped,
        elapsed_ms: started.elapsed().as_millis() as u64,
    })?;

    Ok(())
}
763
764fn init_storage(paths: &AppPaths) -> Result<Connection, AppError> {
770 ensure_db_ready(paths)?;
771 let conn = open_rw(&paths.db)?;
772 Ok(conn)
773}
774
/// Returns true when `entity_type` is one of the canonical entity types
/// accepted by the graph schema (exact, case-sensitive match).
fn is_valid_entity_type(entity_type: &str) -> bool {
    const VALID_ENTITY_TYPES: &[&str] = &[
        "project",
        "tool",
        "person",
        "file",
        "concept",
        "incident",
        "decision",
        "memory",
        "dashboard",
        "issue_tracker",
        "organization",
        "location",
        "date",
    ];
    VALID_ENTITY_TYPES.contains(&entity_type)
}
793
/// Returns true when `relation` is one of the canonical relationship kinds
/// (exact, case-sensitive, snake_case match).
fn is_valid_relation(relation: &str) -> bool {
    const VALID_RELATIONS: &[&str] = &[
        "applies_to",
        "uses",
        "depends_on",
        "causes",
        "fixes",
        "contradicts",
        "supports",
        "follows",
        "related",
        "mentions",
        "replaces",
        "tracked_in",
    ];
    VALID_RELATIONS.contains(&relation)
}
811
812fn collect_files(
813 dir: &Path,
814 pattern: &str,
815 recursive: bool,
816 out: &mut Vec<PathBuf>,
817) -> Result<(), AppError> {
818 let entries = std::fs::read_dir(dir).map_err(AppError::Io)?;
819 for entry in entries {
820 let entry = entry.map_err(AppError::Io)?;
821 let path = entry.path();
822 let file_type = entry.file_type().map_err(AppError::Io)?;
823 if file_type.is_file() {
824 let name = entry.file_name();
825 let name_str = name.to_string_lossy();
826 if matches_pattern(&name_str, pattern) {
827 out.push(path);
828 }
829 } else if file_type.is_dir() && recursive {
830 collect_files(&path, pattern, recursive, out)?;
831 }
832 }
833 Ok(())
834}
835
/// Minimal glob-ish matcher supporting a single leading and/or trailing `*`:
/// `*suffix` (ends-with), `prefix*` (starts-with), `*sub*` (contains), or an
/// exact match. A bare `*` matches every name. `*` elsewhere is literal.
///
/// Fix: `*sub*` previously fell into the ends-with branch and required the
/// name to literally end with `sub*`, which no sensible filename does; it is
/// now treated as a substring match. Single-star and exact patterns behave
/// exactly as before.
fn matches_pattern(name: &str, pattern: &str) -> bool {
    if let Some(inner) = pattern
        .strip_prefix('*')
        .and_then(|p| p.strip_suffix('*'))
    {
        // "*sub*" → substring match. (Bare "*" does not reach here: after
        // stripping the prefix star nothing is left to strip a suffix from.)
        name.contains(inner)
    } else if let Some(suffix) = pattern.strip_prefix('*') {
        // "*.md" and bare "*" (ends_with("") is always true).
        name.ends_with(suffix)
    } else if let Some(prefix) = pattern.strip_suffix('*') {
        // "README*"
        name.starts_with(prefix)
    } else {
        name == pattern
    }
}
845
846fn derive_kebab_name(path: &Path) -> (String, bool, Option<String>) {
850 let stem = path.file_stem().and_then(|s| s.to_str()).unwrap_or("");
851 let lowered: String = stem
852 .chars()
853 .map(|c| {
854 if c == '_' || c.is_whitespace() {
855 '-'
856 } else {
857 c
858 }
859 })
860 .map(|c| c.to_ascii_lowercase())
861 .filter(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || *c == '-')
862 .collect();
863 let collapsed = collapse_dashes(&lowered);
864 let trimmed = collapsed.trim_matches('-').to_string();
865 if trimmed.len() > DERIVED_NAME_MAX_LEN {
866 let truncated = trimmed[..DERIVED_NAME_MAX_LEN]
867 .trim_matches('-')
868 .to_string();
869 tracing::warn!(
872 target: "ingest",
873 original = %trimmed,
874 truncated_to = %truncated,
875 max_len = DERIVED_NAME_MAX_LEN,
876 "derived memory name truncated to fit length cap; collisions will be resolved with numeric suffixes"
877 );
878 (truncated, true, Some(trimmed))
879 } else {
880 (trimmed, false, None)
881 }
882}
883
884fn unique_name(base: &str, taken: &BTreeSet<String>) -> Result<String, AppError> {
897 if !taken.contains(base) {
898 return Ok(base.to_string());
899 }
900 for suffix in 1..=MAX_NAME_COLLISION_SUFFIX {
901 let candidate = format!("{base}-{suffix}");
902 if !taken.contains(&candidate) {
903 tracing::warn!(
904 target: "ingest",
905 base = %base,
906 resolved = %candidate,
907 suffix,
908 "memory name collision resolved with numeric suffix"
909 );
910 return Ok(candidate);
911 }
912 }
913 Err(AppError::Validation(format!(
914 "too many name collisions for base '{base}' (>{MAX_NAME_COLLISION_SUFFIX}); rename source files to disambiguate"
915 )))
916}
917
/// Collapse every run of consecutive dashes in `s` into a single dash,
/// leaving all other characters untouched.
fn collapse_dashes(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    let mut last_was_dash = false;
    for ch in s.chars() {
        let is_dash = ch == '-';
        // Skip a dash only when the previous emitted-or-seen char was one.
        if !(is_dash && last_was_dash) {
            result.push(ch);
        }
        last_was_dash = is_dash;
    }
    result
}
934
// Unit tests for the pure helpers (pattern matching, name derivation,
// collision resolution, directory walking, validators).
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    // --- matches_pattern ---

    #[test]
    fn matches_pattern_suffix() {
        assert!(matches_pattern("foo.md", "*.md"));
        assert!(!matches_pattern("foo.txt", "*.md"));
        assert!(matches_pattern("foo.md", "*"));
    }

    #[test]
    fn matches_pattern_prefix() {
        assert!(matches_pattern("README.md", "README*"));
        assert!(!matches_pattern("CHANGELOG.md", "README*"));
    }

    #[test]
    fn matches_pattern_exact() {
        assert!(matches_pattern("README.md", "README.md"));
        assert!(!matches_pattern("readme.md", "README.md"));
    }

    // --- derive_kebab_name ---

    #[test]
    fn derive_kebab_underscore_to_dash() {
        let p = PathBuf::from("/tmp/claude_code_headless.md");
        let (name, truncated, original) = derive_kebab_name(&p);
        assert_eq!(name, "claude-code-headless");
        assert!(!truncated);
        assert!(original.is_none());
    }

    #[test]
    fn derive_kebab_uppercase_lowered() {
        let p = PathBuf::from("/tmp/README.md");
        let (name, truncated, original) = derive_kebab_name(&p);
        assert_eq!(name, "readme");
        assert!(!truncated);
        assert!(original.is_none());
    }

    #[test]
    fn derive_kebab_strips_non_kebab_chars() {
        let p = PathBuf::from("/tmp/some@weird#name!.md");
        let (name, truncated, original) = derive_kebab_name(&p);
        assert_eq!(name, "someweirdname");
        assert!(!truncated);
        assert!(original.is_none());
    }

    #[test]
    fn derive_kebab_collapses_consecutive_dashes() {
        let p = PathBuf::from("/tmp/a__b___c.md");
        let (name, truncated, original) = derive_kebab_name(&p);
        assert_eq!(name, "a-b-c");
        assert!(!truncated);
        assert!(original.is_none());
    }

    #[test]
    fn derive_kebab_truncates_to_60_chars() {
        let p = PathBuf::from(format!("/tmp/{}.md", "a".repeat(80)));
        let (name, truncated, original) = derive_kebab_name(&p);
        assert!(name.len() <= 60, "got len {}", name.len());
        assert!(truncated);
        assert!(original.is_some());
        assert!(original.unwrap().len() > 60);
    }

    // --- collect_files ---

    #[test]
    fn collect_files_finds_md_files() {
        let tmp = tempfile::tempdir().expect("tempdir");
        std::fs::write(tmp.path().join("a.md"), "x").unwrap();
        std::fs::write(tmp.path().join("b.md"), "y").unwrap();
        std::fs::write(tmp.path().join("c.txt"), "z").unwrap();
        let mut out = Vec::new();
        collect_files(tmp.path(), "*.md", false, &mut out).expect("collect");
        assert_eq!(out.len(), 2, "should find 2 .md files, got {out:?}");
    }

    #[test]
    fn collect_files_recursive_descends_subdirs() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let sub = tmp.path().join("sub");
        std::fs::create_dir(&sub).unwrap();
        std::fs::write(tmp.path().join("a.md"), "x").unwrap();
        std::fs::write(sub.join("b.md"), "y").unwrap();
        let mut out = Vec::new();
        collect_files(tmp.path(), "*.md", true, &mut out).expect("collect");
        assert_eq!(out.len(), 2);
    }

    #[test]
    fn collect_files_non_recursive_skips_subdirs() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let sub = tmp.path().join("sub");
        std::fs::create_dir(&sub).unwrap();
        std::fs::write(tmp.path().join("a.md"), "x").unwrap();
        std::fs::write(sub.join("b.md"), "y").unwrap();
        let mut out = Vec::new();
        collect_files(tmp.path(), "*.md", false, &mut out).expect("collect");
        assert_eq!(out.len(), 1);
    }

    #[test]
    fn derive_kebab_long_basename_truncated_within_cap() {
        let p = PathBuf::from(format!("/tmp/{}.md", "a".repeat(120)));
        let (name, truncated, original) = derive_kebab_name(&p);
        assert!(
            name.len() <= DERIVED_NAME_MAX_LEN,
            "truncated name must respect cap; got {} chars",
            name.len()
        );
        assert!(!name.is_empty());
        assert!(truncated);
        assert!(original.is_some());
    }

    // --- unique_name ---

    #[test]
    fn unique_name_returns_base_when_free() {
        let taken: BTreeSet<String> = BTreeSet::new();
        let resolved = unique_name("note", &taken).expect("must resolve");
        assert_eq!(resolved, "note");
    }

    #[test]
    fn unique_name_appends_first_free_suffix_on_collision() {
        let mut taken: BTreeSet<String> = BTreeSet::new();
        taken.insert("note".to_string());
        taken.insert("note-1".to_string());
        let resolved = unique_name("note", &taken).expect("must resolve");
        assert_eq!(resolved, "note-2");
    }

    #[test]
    fn unique_name_errors_after_collision_cap() {
        let mut taken: BTreeSet<String> = BTreeSet::new();
        taken.insert("note".to_string());
        for i in 1..=MAX_NAME_COLLISION_SUFFIX {
            taken.insert(format!("note-{i}"));
        }
        let err = unique_name("note", &taken).expect_err("must surface error");
        assert!(matches!(err, AppError::Validation(_)));
    }

    // --- validators ---

    #[test]
    fn is_valid_entity_type_accepts_v008_types() {
        assert!(is_valid_entity_type("organization"));
        assert!(is_valid_entity_type("location"));
        assert!(is_valid_entity_type("date"));
        assert!(!is_valid_entity_type("unknown"));
    }

    #[test]
    fn is_valid_relation_accepts_canonical_relations() {
        assert!(is_valid_relation("applies_to"));
        assert!(is_valid_relation("depends_on"));
        assert!(!is_valid_relation("foo_bar"));
    }
}
1099}