1use crate::chunking;
18use crate::cli::MemoryType;
19use crate::errors::AppError;
20use crate::i18n::errors_msg;
21use crate::output::{self, JsonOutputFormat};
22use crate::paths::AppPaths;
23use crate::storage::chunks as storage_chunks;
24use crate::storage::connection::{ensure_db_ready, open_rw};
25use crate::storage::entities::{NewEntity, NewRelationship};
26use crate::storage::memories::NewMemory;
27use crate::storage::{entities, memories, urls as storage_urls, versions};
28use rusqlite::Connection;
29use serde::Serialize;
30use std::collections::BTreeSet;
31use std::path::{Path, PathBuf};
32
// Maximum length of a memory name derived from a file basename. Derived names
// are ASCII-only (see `derive_kebab_name`), so this is both a byte and a
// character count; longer names are truncated with a warning.
const DERIVED_NAME_MAX_LEN: usize = 60;

// Upper bound on the `-N` numeric suffixes tried by `unique_name` when two
// files derive the same base name before giving up with a validation error.
const MAX_NAME_COLLISION_SUFFIX: usize = 1000;
40
// Command-line arguments for the `ingest` subcommand.
//
// NOTE: field comments below are deliberately `//` rather than `///` — clap's
// derive turns doc comments into --help text, so `///` here would change the
// CLI's visible help output.
#[derive(clap::Args)]
#[command(after_long_help = "EXAMPLES:\n \
    # Ingest every Markdown file under ./docs as `document` memories\n \
    sqlite-graphrag ingest ./docs --type document\n\n \
    # Ingest .txt files recursively under ./notes\n \
    sqlite-graphrag ingest ./notes --type note --pattern '*.txt' --recursive\n\n \
    # Skip BERT NER auto-extraction for faster bulk import\n \
    sqlite-graphrag ingest ./big-corpus --type reference --skip-extraction\n\n \
NOTES:\n \
    Each file becomes a separate memory. Names derive from file basenames\n \
    (kebab-case, lowercase, ASCII). Output is NDJSON: one JSON object per file,\n \
    followed by a final summary line with counts. Per-file errors are reported\n \
    inline and processing continues unless --fail-fast is set.")]
pub struct IngestArgs {
    // Root directory scanned for files to ingest.
    #[arg(
        value_name = "DIR",
        help = "Directory to ingest recursively (each matching file becomes a memory)"
    )]
    pub dir: PathBuf,

    // Memory type assigned to every ingested file. Raw identifier (`r#type`)
    // because `type` is a Rust keyword.
    #[arg(long, value_enum)]
    pub r#type: MemoryType,

    // Filename filter; a minimal glob handled by `matches_pattern`.
    #[arg(long, default_value = "*.md")]
    pub pattern: String,

    // Descend into subdirectories when set.
    #[arg(long, default_value_t = false)]
    pub recursive: bool,

    // Skip automatic graph extraction for faster bulk import.
    #[arg(long, default_value_t = false)]
    pub skip_extraction: bool,

    // Abort the whole run on the first per-file failure (after emitting the
    // failure event and a summary line).
    #[arg(long, default_value_t = false)]
    pub fail_fast: bool,

    // Hard cap on matched files; exceeding it fails the run before any work.
    #[arg(long, default_value_t = 10_000)]
    pub max_files: usize,

    // Target namespace; resolution/defaulting is done by
    // `namespace::resolve_namespace`.
    #[arg(long)]
    pub namespace: Option<String>,

    // Database path override; falls back to the env var, then app defaults.
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,

    // Output format for emitted JSON lines.
    #[arg(long, value_enum, default_value_t = JsonOutputFormat::Json)]
    pub format: JsonOutputFormat,

    // Hidden compatibility flag; has no effect.
    #[arg(long, hide = true, help = "No-op; JSON is always emitted on stdout")]
    pub json: bool,
}
101
/// One NDJSON line describing the outcome of a single file.
///
/// Optional fields are omitted from the JSON entirely (not emitted as null)
/// so consumers can key off field presence.
#[derive(Serialize)]
struct IngestFileEvent<'a> {
    /// Source file path (lossy UTF-8 rendering of the OS path).
    file: &'a str,
    /// Derived memory name; empty string when no name could be derived.
    name: &'a str,
    /// One of "indexed", "failed", or "skipped" (see `run`).
    status: &'a str,
    /// True when the derived name was cut down to the length cap.
    truncated: bool,
    /// Pre-truncation name; present only when `truncated` is true.
    #[serde(skip_serializing_if = "Option::is_none")]
    original_name: Option<String>,
    /// Error message for failed/skipped files.
    #[serde(skip_serializing_if = "Option::is_none")]
    error: Option<String>,
    /// Row id of the stored memory on success.
    #[serde(skip_serializing_if = "Option::is_none")]
    memory_id: Option<i64>,
    /// Action taken on success (currently always "created").
    #[serde(skip_serializing_if = "Option::is_none")]
    action: Option<String>,
}
119
/// Final NDJSON line emitted once per run with aggregate counts.
#[derive(Serialize)]
struct IngestSummary {
    /// Always set to true; distinguishes the summary line from file events.
    summary: bool,
    /// Directory that was ingested, as displayed.
    dir: String,
    /// Filename pattern that was applied.
    pattern: String,
    /// Whether subdirectories were traversed.
    recursive: bool,
    /// Total files matched (succeeded + failed + skipped, once complete).
    files_total: usize,
    files_succeeded: usize,
    files_failed: usize,
    files_skipped: usize,
    /// Wall-clock duration of the whole run in milliseconds.
    elapsed_ms: u64,
}
132
/// Internal result of successfully ingesting one file.
struct FileSuccess {
    // Row id of the newly inserted memory.
    memory_id: i64,
    // Action label surfaced in the per-file event (currently "created").
    action: String,
}
138
/// Entry point for the `ingest` subcommand.
///
/// Walks `args.dir` for files matching `args.pattern`, derives a kebab-case
/// name per file basename, and persists each file as a memory via
/// `process_file`. Emits NDJSON: one `IngestFileEvent` per file, then a final
/// `IngestSummary`. Per-file errors are reported inline and processing
/// continues unless `--fail-fast` is set.
///
/// Returns `Err` only for up-front setup failures (bad directory, file-count
/// cap, namespace/path resolution) or a fail-fast abort; ordinary per-file
/// failures surface as "failed" events and a nonzero `files_failed` count.
pub fn run(args: IngestArgs) -> Result<(), AppError> {
    let started = std::time::Instant::now();

    // Validate the target directory before touching storage so the error is
    // precise and nothing is partially emitted.
    if !args.dir.exists() {
        return Err(AppError::NotFound(format!(
            "directory not found: {}",
            args.dir.display()
        )));
    }
    if !args.dir.is_dir() {
        return Err(AppError::Validation(format!(
            "path is not a directory: {}",
            args.dir.display()
        )));
    }

    let mut files: Vec<PathBuf> = Vec::new();
    collect_files(&args.dir, &args.pattern, args.recursive, &mut files)?;
    // Sort for deterministic processing order (and therefore deterministic
    // collision suffixes) regardless of directory iteration order.
    files.sort();

    if files.len() > args.max_files {
        return Err(AppError::Validation(format!(
            "found {} files matching pattern, exceeds --max-files cap of {} (raise the cap or narrow the pattern)",
            files.len(),
            args.max_files
        )));
    }

    let namespace = crate::namespace::resolve_namespace(args.namespace.as_deref())?;
    let memory_type_str = args.r#type.as_str().to_string();

    let paths = AppPaths::resolve(args.db.as_deref())?;
    // A storage-init failure is deliberately captured as a string instead of
    // returned immediately: each file still gets its own "failed" event (and
    // --fail-fast is honored) so the NDJSON stream stays complete.
    let mut conn_or_err = match init_storage(&paths) {
        Ok(c) => Ok(c),
        Err(e) => Err(format!("{e}")),
    };

    let mut succeeded: usize = 0;
    let mut failed: usize = 0;
    let mut skipped: usize = 0;
    let total = files.len();

    // Names claimed so far in this run; used by `unique_name` to resolve
    // basename collisions with numeric suffixes.
    let mut taken_names: BTreeSet<String> = BTreeSet::new();

    for path in &files {
        let file_str = path.to_string_lossy().into_owned();
        let (derived_base, name_truncated, original_name) = derive_kebab_name(path);

        // A basename with no usable characters (e.g. all punctuation) cannot
        // become a memory name; skip rather than fail.
        if derived_base.is_empty() {
            output::emit_json_compact(&IngestFileEvent {
                file: &file_str,
                name: "",
                status: "skipped",
                truncated: false,
                original_name: None,
                error: Some(
                    "could not derive a non-empty kebab-case name from filename".to_string(),
                ),
                memory_id: None,
                action: None,
            })?;
            skipped += 1;
            continue;
        }

        let derived_name = match unique_name(&derived_base, &taken_names) {
            Ok(n) => n,
            Err(e) => {
                // Exhausted the collision-suffix budget for this base name.
                output::emit_json_compact(&IngestFileEvent {
                    file: &file_str,
                    name: &derived_base,
                    status: "skipped",
                    truncated: name_truncated,
                    original_name: original_name.clone(),
                    error: Some(e.to_string()),
                    memory_id: None,
                    action: None,
                })?;
                skipped += 1;
                continue;
            }
        };
        taken_names.insert(derived_name.clone());

        // If storage never came up, every file reports the same init error.
        let conn = match conn_or_err.as_mut() {
            Ok(c) => c,
            Err(err_msg) => {
                let err_clone = err_msg.clone();
                output::emit_json_compact(&IngestFileEvent {
                    file: &file_str,
                    name: &derived_name,
                    status: "failed",
                    truncated: name_truncated,
                    original_name: original_name.clone(),
                    error: Some(err_clone.clone()),
                    memory_id: None,
                    action: None,
                })?;
                failed += 1;
                if args.fail_fast {
                    // Emit the summary even on abort so the stream always
                    // terminates with a summary line.
                    output::emit_json_compact(&IngestSummary {
                        summary: true,
                        dir: args.dir.display().to_string(),
                        pattern: args.pattern.clone(),
                        recursive: args.recursive,
                        files_total: total,
                        files_succeeded: succeeded,
                        files_failed: failed,
                        files_skipped: skipped,
                        elapsed_ms: started.elapsed().as_millis() as u64,
                    })?;
                    return Err(AppError::Validation(format!(
                        "ingest aborted on first failure: {err_clone}"
                    )));
                }
                continue;
            }
        };

        let outcome = process_file(
            conn,
            &paths,
            &namespace,
            &memory_type_str,
            args.skip_extraction,
            path,
            &derived_name,
        );

        match outcome {
            Ok(FileSuccess { memory_id, action }) => {
                output::emit_json_compact(&IngestFileEvent {
                    file: &file_str,
                    name: &derived_name,
                    status: "indexed",
                    truncated: name_truncated,
                    original_name: original_name.clone(),
                    error: None,
                    memory_id: Some(memory_id),
                    action: Some(action),
                })?;
                succeeded += 1;
            }
            Err(e) => {
                let err_msg = format!("{e}");
                output::emit_json_compact(&IngestFileEvent {
                    file: &file_str,
                    name: &derived_name,
                    status: "failed",
                    truncated: name_truncated,
                    original_name: original_name.clone(),
                    error: Some(err_msg.clone()),
                    memory_id: None,
                    action: None,
                })?;
                failed += 1;
                if args.fail_fast {
                    // Same abort path as the init-failure branch above.
                    output::emit_json_compact(&IngestSummary {
                        summary: true,
                        dir: args.dir.display().to_string(),
                        pattern: args.pattern.clone(),
                        recursive: args.recursive,
                        files_total: total,
                        files_succeeded: succeeded,
                        files_failed: failed,
                        files_skipped: skipped,
                        elapsed_ms: started.elapsed().as_millis() as u64,
                    })?;
                    return Err(AppError::Validation(format!(
                        "ingest aborted on first failure: {err_msg}"
                    )));
                }
            }
        }
    }

    // Final summary line for the happy path / non-fail-fast completion.
    output::emit_json_compact(&IngestSummary {
        summary: true,
        dir: args.dir.display().to_string(),
        pattern: args.pattern.clone(),
        recursive: args.recursive,
        files_total: total,
        files_succeeded: succeeded,
        files_failed: failed,
        files_skipped: skipped,
        elapsed_ms: started.elapsed().as_millis() as u64,
    })?;

    Ok(())
}
344
345fn init_storage(paths: &AppPaths) -> Result<Connection, AppError> {
351 ensure_db_ready(paths)?;
352 let conn = open_rw(&paths.db)?;
353 Ok(conn)
354}
355
/// Ingests a single file as a brand-new memory.
///
/// Pipeline: validate the derived `name` (length, reserved prefix, slug
/// shape) → read and validate the body → optionally auto-extract graph data
/// (entities / relationships / URLs, best-effort) → compute hash, chunks and
/// embeddings → enforce namespace/duplicate constraints → persist memory,
/// version, vectors, chunks and graph rows in one immediate transaction →
/// record URLs post-commit (best-effort).
///
/// Returns the new memory's row id and the action label ("created"), or the
/// first validation/storage error. Embeddings are computed *before* the
/// transaction opens so slow model calls never hold the write lock.
#[allow(clippy::too_many_arguments)]
fn process_file(
    conn: &mut Connection,
    paths: &AppPaths,
    namespace: &str,
    memory_type: &str,
    skip_extraction: bool,
    path: &Path,
    name: &str,
) -> Result<FileSuccess, AppError> {
    use crate::constants::*;

    // Name validation: length cap, reserved "__" prefix, kebab-case slug.
    if name.len() > MAX_MEMORY_NAME_LEN {
        return Err(AppError::LimitExceeded(
            crate::i18n::validation::name_length(MAX_MEMORY_NAME_LEN),
        ));
    }
    if name.starts_with("__") {
        return Err(AppError::Validation(
            crate::i18n::validation::reserved_name(),
        ));
    }
    {
        let slug_re = regex::Regex::new(NAME_SLUG_REGEX)
            .map_err(|e| AppError::Internal(anyhow::anyhow!("regex: {e}")))?;
        if !slug_re.is_match(name) {
            return Err(AppError::Validation(crate::i18n::validation::name_kebab(
                name,
            )));
        }
    }

    // Body validation: must be valid UTF-8, non-empty, and within size cap.
    let raw_body = std::fs::read_to_string(path).map_err(AppError::Io)?;
    if raw_body.len() > MAX_MEMORY_BODY_LEN {
        return Err(AppError::LimitExceeded(
            crate::i18n::validation::body_exceeds(MAX_MEMORY_BODY_LEN),
        ));
    }
    if raw_body.trim().is_empty() {
        return Err(AppError::Validation(crate::i18n::validation::empty_body()));
    }

    let description = format!("ingested from {}", path.display());
    if description.len() > MAX_MEMORY_DESCRIPTION_LEN {
        return Err(AppError::Validation(
            crate::i18n::validation::description_exceeds(MAX_MEMORY_DESCRIPTION_LEN),
        ));
    }

    // Auto-extraction is best-effort: failures are logged and ingestion
    // proceeds with empty graph data (graceful degradation).
    let mut extracted_entities: Vec<NewEntity> = Vec::new();
    let mut extracted_relationships: Vec<NewRelationship> = Vec::new();
    let mut extracted_urls: Vec<crate::extraction::ExtractedUrl> = Vec::new();
    let mut relationships_truncated = false;
    if !skip_extraction {
        match crate::extraction::extract_graph_auto(&raw_body, paths) {
            Ok(extracted) => {
                extracted_urls = extracted.urls;
                extracted_entities = extracted.entities;
                extracted_relationships = extracted.relationships;
                relationships_truncated = extracted.relationships_truncated;

                // Enforce per-memory caps locally even if the extractor
                // already applied its own limits.
                if extracted_entities.len() > MAX_ENTITIES_PER_MEMORY {
                    extracted_entities.truncate(MAX_ENTITIES_PER_MEMORY);
                }
                if extracted_relationships.len() > MAX_RELATIONSHIPS_PER_MEMORY {
                    relationships_truncated = true;
                    extracted_relationships.truncate(MAX_RELATIONSHIPS_PER_MEMORY);
                }
            }
            Err(e) => {
                tracing::warn!(
                    file = %path.display(),
                    "auto-extraction failed (graceful degradation): {e:#}"
                );
            }
        }
    }

    // Validate extracted graph data against the allowed vocabularies before
    // any storage work happens.
    for entity in &extracted_entities {
        if !is_valid_entity_type(&entity.entity_type) {
            return Err(AppError::Validation(format!(
                "invalid entity_type '{}' for entity '{}'",
                entity.entity_type, entity.name
            )));
        }
    }
    for rel in &mut extracted_relationships {
        // Normalize kebab-case relations to the snake_case canonical form.
        rel.relation = rel.relation.replace('-', "_");
        if !is_valid_relation(&rel.relation) {
            return Err(AppError::Validation(format!(
                "invalid relation '{}' for relationship '{}' -> '{}'",
                rel.relation, rel.source, rel.target
            )));
        }
        if !(0.0..=1.0).contains(&rel.strength) {
            return Err(AppError::Validation(format!(
                "invalid strength {} for relationship '{}' -> '{}'; expected value in [0.0, 1.0]",
                rel.strength, rel.source, rel.target
            )));
        }
    }

    let body_hash = blake3::hash(raw_body.as_bytes()).to_hex().to_string();
    // First 200 characters (not bytes) — safe for multi-byte UTF-8.
    let snippet: String = raw_body.chars().take(200).collect();

    let tokenizer = crate::tokenizer::get_tokenizer(&paths.models)?;
    let chunks_info = chunking::split_into_chunks_hierarchical(&raw_body, tokenizer);
    if chunks_info.len() > REMEMBER_MAX_SAFE_MULTI_CHUNKS {
        return Err(AppError::LimitExceeded(format!(
            "document produces {} chunks; current safe operational limit is {} chunks; split the document before using remember",
            chunks_info.len(),
            REMEMBER_MAX_SAFE_MULTI_CHUNKS
        )));
    }

    // Single-chunk documents embed the whole body; multi-chunk documents
    // embed each chunk, aggregate for the memory-level vector, and cache the
    // per-chunk embeddings for the chunk-vector inserts below.
    let mut chunk_embeddings_cache: Option<Vec<Vec<f32>>> = None;
    let embedding = if chunks_info.len() == 1 {
        crate::daemon::embed_passage_or_local(&paths.models, &raw_body)?
    } else {
        let chunk_texts: Vec<&str> = chunks_info
            .iter()
            .map(|c| chunking::chunk_text(&raw_body, c))
            .collect();
        let mut chunk_embeddings = Vec::with_capacity(chunk_texts.len());
        for chunk_text in &chunk_texts {
            chunk_embeddings.push(crate::daemon::embed_passage_or_local(
                &paths.models,
                chunk_text,
            )?);
        }
        let aggregated = chunking::aggregate_embeddings(&chunk_embeddings);
        chunk_embeddings_cache = Some(chunk_embeddings);
        aggregated
    };

    // Namespace cap: creating a memory in a brand-new namespace is refused
    // once the active-namespace limit is reached.
    {
        let active_count: u32 = conn.query_row(
            "SELECT COUNT(DISTINCT namespace) FROM memories WHERE deleted_at IS NULL",
            [],
            |r| r.get::<_, i64>(0).map(|v| v as u32),
        )?;
        let ns_exists: bool = conn.query_row(
            "SELECT EXISTS(SELECT 1 FROM memories WHERE namespace = ?1 AND deleted_at IS NULL)",
            rusqlite::params![namespace],
            |r| r.get::<_, i64>(0).map(|v| v > 0),
        )?;
        if !ns_exists && active_count >= MAX_NAMESPACES_ACTIVE {
            return Err(AppError::NamespaceError(format!(
                "active namespace limit of {MAX_NAMESPACES_ACTIVE} exceeded while creating '{namespace}'"
            )));
        }
    }

    // A same-name memory in the namespace is a hard error; a same-body hash
    // is allowed (only logged below).
    let existing_memory = memories::find_by_name(conn, namespace, name)?;
    if existing_memory.is_some() {
        return Err(AppError::Duplicate(errors_msg::duplicate_memory(
            name, namespace,
        )));
    }
    let duplicate_hash_id = memories::find_by_hash(conn, namespace, &body_hash)?;

    let new_memory = NewMemory {
        namespace: namespace.to_string(),
        name: name.to_string(),
        memory_type: memory_type.to_string(),
        description: description.clone(),
        body: raw_body,
        body_hash: body_hash.clone(),
        session_id: None,
        source: "agent".to_string(),
        metadata: serde_json::json!({}),
    };

    // Entity embeddings are computed up-front (before the transaction) from
    // "name description" or just the name when no description exists.
    let graph_entity_embeddings = extracted_entities
        .iter()
        .map(|entity| {
            let entity_text = match &entity.description {
                Some(desc) => format!("{} {}", entity.name, desc),
                None => entity.name.clone(),
            };
            crate::daemon::embed_passage_or_local(&paths.models, &entity_text)
        })
        .collect::<Result<Vec<_>, _>>()?;

    // NOTE(review): `relationships_truncated` is computed above but
    // deliberately discarded here — truncation is never surfaced in the
    // per-file event. Confirm whether callers should be told about it.
    let _ = relationships_truncated; let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;

    if let Some(hash_id) = duplicate_hash_id {
        tracing::debug!(
            target: "ingest",
            duplicate_memory_id = hash_id,
            "identical body already exists; persisting a new memory anyway"
        );
    }

    // All rows (memory, version 1, memory vector, chunks, graph) are written
    // inside one immediate transaction so a crash leaves no partial memory.
    let memory_id = memories::insert(&tx, &new_memory)?;
    versions::insert_version(
        &tx,
        memory_id,
        1,
        name,
        memory_type,
        &description,
        &new_memory.body,
        &serde_json::to_string(&new_memory.metadata)?,
        None,
        "create",
    )?;
    memories::upsert_vec(
        &tx,
        memory_id,
        namespace,
        memory_type,
        &embedding,
        name,
        &snippet,
    )?;

    if chunks_info.len() > 1 {
        storage_chunks::insert_chunk_slices(&tx, memory_id, &new_memory.body, &chunks_info)?;
        let chunk_embeddings = chunk_embeddings_cache.take().ok_or_else(|| {
            AppError::Internal(anyhow::anyhow!(
                "missing chunk embeddings cache on multi-chunk ingest path"
            ))
        })?;
        for (i, emb) in chunk_embeddings.iter().enumerate() {
            // NOTE(review): the loop index is reused as both the chunk-vector
            // id and the chunk index — presumably ids only need to be unique
            // per memory; verify against `upsert_chunk_vec`'s contract.
            storage_chunks::upsert_chunk_vec(&tx, i as i64, memory_id, i as i32, emb)?;
        }
    }

    if !extracted_entities.is_empty() || !extracted_relationships.is_empty() {
        for (idx, entity) in extracted_entities.iter().enumerate() {
            let entity_id = entities::upsert_entity(&tx, namespace, entity)?;
            // Indexing is safe: graph_entity_embeddings was built 1:1 from
            // extracted_entities above.
            let entity_embedding = &graph_entity_embeddings[idx];
            entities::upsert_entity_vec(
                &tx,
                entity_id,
                namespace,
                &entity.entity_type,
                entity_embedding,
                &entity.name,
            )?;
            entities::link_memory_entity(&tx, memory_id, entity_id)?;
            entities::increment_degree(&tx, entity_id)?;
        }
        // Relationship endpoints may name entities not in the extracted set;
        // those are upserted with a fallback "concept" type.
        let entity_types: std::collections::HashMap<&str, &str> = extracted_entities
            .iter()
            .map(|entity| (entity.name.as_str(), entity.entity_type.as_str()))
            .collect();
        for rel in &extracted_relationships {
            let source_entity = NewEntity {
                name: rel.source.clone(),
                entity_type: entity_types
                    .get(rel.source.as_str())
                    .copied()
                    .unwrap_or("concept")
                    .to_string(),
                description: None,
            };
            let target_entity = NewEntity {
                name: rel.target.clone(),
                entity_type: entity_types
                    .get(rel.target.as_str())
                    .copied()
                    .unwrap_or("concept")
                    .to_string(),
                description: None,
            };
            let source_id = entities::upsert_entity(&tx, namespace, &source_entity)?;
            let target_id = entities::upsert_entity(&tx, namespace, &target_entity)?;
            let rel_id = entities::upsert_relationship(&tx, namespace, source_id, target_id, rel)?;
            entities::link_memory_relationship(&tx, memory_id, rel_id)?;
        }
    }

    tx.commit()?;

    // URL bookkeeping happens after commit and is best-effort: a failure here
    // must not fail an already-persisted memory, hence the ignored result.
    if !extracted_urls.is_empty() {
        let url_entries: Vec<storage_urls::MemoryUrl> = extracted_urls
            .into_iter()
            .map(|u| storage_urls::MemoryUrl {
                url: u.url,
                offset: Some(u.offset as i64),
            })
            .collect();
        let _ = storage_urls::insert_urls(conn, memory_id, &url_entries);
    }

    Ok(FileSuccess {
        memory_id,
        action: "created".to_string(),
    })
}
671
/// Returns true when `entity_type` is one of the entity categories accepted
/// for extracted graph entities.
fn is_valid_entity_type(entity_type: &str) -> bool {
    const ALLOWED_ENTITY_TYPES: [&str; 13] = [
        "project",
        "tool",
        "person",
        "file",
        "concept",
        "incident",
        "decision",
        "memory",
        "dashboard",
        "issue_tracker",
        "organization",
        "location",
        "date",
    ];
    ALLOWED_ENTITY_TYPES.contains(&entity_type)
}
690
/// Returns true when `relation` is one of the canonical (snake_case)
/// relationship kinds accepted for extracted graph edges.
fn is_valid_relation(relation: &str) -> bool {
    const ALLOWED_RELATIONS: [&str; 12] = [
        "applies_to",
        "uses",
        "depends_on",
        "causes",
        "fixes",
        "contradicts",
        "supports",
        "follows",
        "related",
        "mentions",
        "replaces",
        "tracked_in",
    ];
    ALLOWED_RELATIONS.contains(&relation)
}
708
709fn collect_files(
710 dir: &Path,
711 pattern: &str,
712 recursive: bool,
713 out: &mut Vec<PathBuf>,
714) -> Result<(), AppError> {
715 let entries = std::fs::read_dir(dir).map_err(AppError::Io)?;
716 for entry in entries {
717 let entry = entry.map_err(AppError::Io)?;
718 let path = entry.path();
719 let file_type = entry.file_type().map_err(AppError::Io)?;
720 if file_type.is_file() {
721 let name = entry.file_name();
722 let name_str = name.to_string_lossy();
723 if matches_pattern(&name_str, pattern) {
724 out.push(path);
725 }
726 } else if file_type.is_dir() && recursive {
727 collect_files(&path, pattern, recursive, out)?;
728 }
729 }
730 Ok(())
731}
732
/// Matches a file name against a minimal glob-like pattern.
///
/// Supported forms:
/// - `*suffix` — name must end with `suffix` (e.g. `*.md`)
/// - `prefix*` — name must start with `prefix` (e.g. `README*`)
/// - `*infix*` — name must contain `infix` (fixed: previously such patterns
///   fell into the suffix branch and required a literal trailing `*`,
///   so they effectively never matched)
/// - anything else — exact, case-sensitive comparison
///
/// A bare `*` strips to an empty infix and therefore matches every name,
/// preserving the previous behavior.
fn matches_pattern(name: &str, pattern: &str) -> bool {
    if let Some(rest) = pattern.strip_prefix('*') {
        if let Some(infix) = rest.strip_suffix('*') {
            // `*infix*` (including bare `*`, whose infix is empty).
            name.contains(infix)
        } else {
            name.ends_with(rest)
        }
    } else if let Some(prefix) = pattern.strip_suffix('*') {
        name.starts_with(prefix)
    } else {
        name == pattern
    }
}
742
743fn derive_kebab_name(path: &Path) -> (String, bool, Option<String>) {
747 let stem = path.file_stem().and_then(|s| s.to_str()).unwrap_or("");
748 let lowered: String = stem
749 .chars()
750 .map(|c| {
751 if c == '_' || c.is_whitespace() {
752 '-'
753 } else {
754 c
755 }
756 })
757 .map(|c| c.to_ascii_lowercase())
758 .filter(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || *c == '-')
759 .collect();
760 let collapsed = collapse_dashes(&lowered);
761 let trimmed = collapsed.trim_matches('-').to_string();
762 if trimmed.len() > DERIVED_NAME_MAX_LEN {
763 let truncated = trimmed[..DERIVED_NAME_MAX_LEN]
764 .trim_matches('-')
765 .to_string();
766 tracing::warn!(
769 target: "ingest",
770 original = %trimmed,
771 truncated_to = %truncated,
772 max_len = DERIVED_NAME_MAX_LEN,
773 "derived memory name truncated to fit length cap; collisions will be resolved with numeric suffixes"
774 );
775 (truncated, true, Some(trimmed))
776 } else {
777 (trimmed, false, None)
778 }
779}
780
781fn unique_name(base: &str, taken: &BTreeSet<String>) -> Result<String, AppError> {
794 if !taken.contains(base) {
795 return Ok(base.to_string());
796 }
797 for suffix in 1..=MAX_NAME_COLLISION_SUFFIX {
798 let candidate = format!("{base}-{suffix}");
799 if !taken.contains(&candidate) {
800 tracing::warn!(
801 target: "ingest",
802 base = %base,
803 resolved = %candidate,
804 suffix,
805 "memory name collision resolved with numeric suffix"
806 );
807 return Ok(candidate);
808 }
809 }
810 Err(AppError::Validation(format!(
811 "too many name collisions for base '{base}' (>{MAX_NAME_COLLISION_SUFFIX}); rename source files to disambiguate"
812 )))
813}
814
/// Collapses every run of consecutive `-` characters into a single `-`,
/// leaving all other characters untouched.
fn collapse_dashes(s: &str) -> String {
    s.chars().fold(String::with_capacity(s.len()), |mut acc, c| {
        // Drop a dash only when the previously emitted character was a dash.
        if c != '-' || !acc.ends_with('-') {
            acc.push(c);
        }
        acc
    })
}
831
// Unit tests for the pure helpers: pattern matching, name derivation,
// directory collection, collision resolution, and vocabulary validation.
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    // --- matches_pattern: suffix, prefix, and exact forms -----------------

    #[test]
    fn matches_pattern_suffix() {
        assert!(matches_pattern("foo.md", "*.md"));
        assert!(!matches_pattern("foo.txt", "*.md"));
        assert!(matches_pattern("foo.md", "*"));
    }

    #[test]
    fn matches_pattern_prefix() {
        assert!(matches_pattern("README.md", "README*"));
        assert!(!matches_pattern("CHANGELOG.md", "README*"));
    }

    #[test]
    fn matches_pattern_exact() {
        // Exact matching is case-sensitive.
        assert!(matches_pattern("README.md", "README.md"));
        assert!(!matches_pattern("readme.md", "README.md"));
    }

    // --- derive_kebab_name: normalization and truncation ------------------

    #[test]
    fn derive_kebab_underscore_to_dash() {
        let p = PathBuf::from("/tmp/claude_code_headless.md");
        let (name, truncated, original) = derive_kebab_name(&p);
        assert_eq!(name, "claude-code-headless");
        assert!(!truncated);
        assert!(original.is_none());
    }

    #[test]
    fn derive_kebab_uppercase_lowered() {
        let p = PathBuf::from("/tmp/README.md");
        let (name, truncated, original) = derive_kebab_name(&p);
        assert_eq!(name, "readme");
        assert!(!truncated);
        assert!(original.is_none());
    }

    #[test]
    fn derive_kebab_strips_non_kebab_chars() {
        let p = PathBuf::from("/tmp/some@weird#name!.md");
        let (name, truncated, original) = derive_kebab_name(&p);
        assert_eq!(name, "someweirdname");
        assert!(!truncated);
        assert!(original.is_none());
    }

    #[test]
    fn derive_kebab_collapses_consecutive_dashes() {
        let p = PathBuf::from("/tmp/a__b___c.md");
        let (name, truncated, original) = derive_kebab_name(&p);
        assert_eq!(name, "a-b-c");
        assert!(!truncated);
        assert!(original.is_none());
    }

    #[test]
    fn derive_kebab_truncates_to_60_chars() {
        let p = PathBuf::from(format!("/tmp/{}.md", "a".repeat(80)));
        let (name, truncated, original) = derive_kebab_name(&p);
        assert!(name.len() <= 60, "got len {}", name.len());
        assert!(truncated);
        assert!(original.is_some());
        // The reported original keeps the full pre-truncation length.
        assert!(original.unwrap().len() > 60);
    }

    // --- collect_files: pattern filtering and recursion -------------------

    #[test]
    fn collect_files_finds_md_files() {
        let tmp = tempfile::tempdir().expect("tempdir");
        std::fs::write(tmp.path().join("a.md"), "x").unwrap();
        std::fs::write(tmp.path().join("b.md"), "y").unwrap();
        std::fs::write(tmp.path().join("c.txt"), "z").unwrap();
        let mut out = Vec::new();
        collect_files(tmp.path(), "*.md", false, &mut out).expect("collect");
        assert_eq!(out.len(), 2, "should find 2 .md files, got {out:?}");
    }

    #[test]
    fn collect_files_recursive_descends_subdirs() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let sub = tmp.path().join("sub");
        std::fs::create_dir(&sub).unwrap();
        std::fs::write(tmp.path().join("a.md"), "x").unwrap();
        std::fs::write(sub.join("b.md"), "y").unwrap();
        let mut out = Vec::new();
        collect_files(tmp.path(), "*.md", true, &mut out).expect("collect");
        assert_eq!(out.len(), 2);
    }

    #[test]
    fn collect_files_non_recursive_skips_subdirs() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let sub = tmp.path().join("sub");
        std::fs::create_dir(&sub).unwrap();
        std::fs::write(tmp.path().join("a.md"), "x").unwrap();
        std::fs::write(sub.join("b.md"), "y").unwrap();
        let mut out = Vec::new();
        collect_files(tmp.path(), "*.md", false, &mut out).expect("collect");
        assert_eq!(out.len(), 1);
    }

    #[test]
    fn derive_kebab_long_basename_truncated_within_cap() {
        let p = PathBuf::from(format!("/tmp/{}.md", "a".repeat(120)));
        let (name, truncated, original) = derive_kebab_name(&p);
        assert!(
            name.len() <= DERIVED_NAME_MAX_LEN,
            "truncated name must respect cap; got {} chars",
            name.len()
        );
        assert!(!name.is_empty());
        assert!(truncated);
        assert!(original.is_some());
    }

    // --- unique_name: collision suffixing and its cap ---------------------

    #[test]
    fn unique_name_returns_base_when_free() {
        let taken: BTreeSet<String> = BTreeSet::new();
        let resolved = unique_name("note", &taken).expect("must resolve");
        assert_eq!(resolved, "note");
    }

    #[test]
    fn unique_name_appends_first_free_suffix_on_collision() {
        let mut taken: BTreeSet<String> = BTreeSet::new();
        taken.insert("note".to_string());
        taken.insert("note-1".to_string());
        let resolved = unique_name("note", &taken).expect("must resolve");
        assert_eq!(resolved, "note-2");
    }

    #[test]
    fn unique_name_errors_after_collision_cap() {
        let mut taken: BTreeSet<String> = BTreeSet::new();
        taken.insert("note".to_string());
        for i in 1..=MAX_NAME_COLLISION_SUFFIX {
            taken.insert(format!("note-{i}"));
        }
        let err = unique_name("note", &taken).expect_err("must surface error");
        assert!(matches!(err, AppError::Validation(_)));
    }

    // --- vocabulary validators --------------------------------------------

    #[test]
    fn is_valid_entity_type_accepts_v008_types() {
        assert!(is_valid_entity_type("organization"));
        assert!(is_valid_entity_type("location"));
        assert!(is_valid_entity_type("date"));
        assert!(!is_valid_entity_type("unknown"));
    }

    #[test]
    fn is_valid_relation_accepts_canonical_relations() {
        assert!(is_valid_relation("applies_to"));
        assert!(is_valid_relation("depends_on"));
        assert!(!is_valid_relation("foo_bar"));
    }
}