1use crate::persistence::{self, GraphMetadata};
20use crate::{Edge, EdgeOrigin, GraphData, GraphExtractor, Relationship};
21use fabryk_content::markdown::extract_frontmatter;
22use fabryk_core::{Error, Result};
23use serde::{Deserialize, Serialize};
24use std::collections::HashSet;
25use std::path::{Path, PathBuf};
26
/// Policy applied by `GraphBuilder::build` when a single content file fails
/// to be read, parsed or extracted.
///
/// The enum is a plain tag, so it is `Copy` and comparable.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum ErrorHandling {
    /// Abort the whole build and return the first error encountered.
    #[default]
    FailFast,
    /// Keep going; the failing file is counted as skipped and its error is
    /// recorded in `BuildStats::errors`.
    Collect,
    /// Keep going past the failing file.
    ///
    /// NOTE(review): the builder currently treats this exactly like
    /// [`ErrorHandling::Collect`] (the error is still recorded) — confirm
    /// whether `Skip` was meant to drop the error silently.
    Skip,
}
42
/// A per-file failure recorded while building the graph.
#[derive(Clone, Debug)]
pub struct BuildError {
    /// Path of the content file that could not be processed.
    pub file: PathBuf,
    /// Human-readable rendering of the underlying error.
    pub message: String,
}
51
52#[derive(Clone, Debug, Serialize, Deserialize)]
54pub struct ManualEdge {
55 pub from: String,
57 pub to: String,
59 pub relationship: String,
61 pub weight: Option<f32>,
63}
64
65#[derive(Debug, Clone)]
67pub struct BuildStats {
68 pub nodes_created: usize,
70 pub edges_created: usize,
72 pub files_processed: usize,
74 pub files_skipped: usize,
76 pub errors: Vec<BuildError>,
78 pub manual_edges_loaded: usize,
80 pub dangling_refs: Vec<String>,
82 pub deduped_edges: usize,
84 pub from_cache: bool,
86}
87
88pub struct GraphBuilder<E: GraphExtractor> {
102 extractor: E,
103 content_path: Option<PathBuf>,
104 manual_edges_path: Option<PathBuf>,
105 error_handling: ErrorHandling,
106 cache_path: Option<PathBuf>,
107 skip_cache: bool,
108}
109
110impl<E: GraphExtractor> GraphBuilder<E> {
111 pub fn new(extractor: E) -> Self {
113 Self {
114 extractor,
115 content_path: None,
116 manual_edges_path: None,
117 error_handling: ErrorHandling::default(),
118 cache_path: None,
119 skip_cache: false,
120 }
121 }
122
123 pub fn with_content_path(mut self, path: impl Into<PathBuf>) -> Self {
125 self.content_path = Some(path.into());
126 self
127 }
128
129 pub fn with_manual_edges(mut self, path: impl Into<PathBuf>) -> Self {
131 self.manual_edges_path = Some(path.into());
132 self
133 }
134
135 pub fn with_error_handling(mut self, handling: ErrorHandling) -> Self {
137 self.error_handling = handling;
138 self
139 }
140
141 pub fn with_cache_path(mut self, path: impl Into<PathBuf>) -> Self {
148 self.cache_path = Some(path.into());
149 self
150 }
151
152 pub fn skip_cache(mut self) -> Self {
154 self.skip_cache = true;
155 self
156 }
157
158 pub async fn build(self) -> Result<(GraphData, BuildStats)> {
166 let content_path = self
167 .content_path
168 .as_ref()
169 .ok_or_else(|| Error::config("Content path not set. Use with_content_path() first."))?
170 .clone();
171
172 if let Some(ref cache_path) = self.cache_path
174 && !self.skip_cache
175 {
176 let content_hash = compute_content_hash(&content_path)?;
177 if persistence::is_cache_fresh(cache_path, &content_hash) {
178 log::info!(
179 "Graph cache is fresh, loading from {}",
180 cache_path.display()
181 );
182 let graph = persistence::load_graph(cache_path)?;
183 let stats = BuildStats {
184 nodes_created: graph.node_count(),
185 edges_created: graph.edge_count(),
186 files_processed: 0,
187 files_skipped: 0,
188 errors: Vec::new(),
189 manual_edges_loaded: 0,
190 dangling_refs: Vec::new(),
191 deduped_edges: 0,
192 from_cache: true,
193 };
194 return Ok((graph, stats));
195 }
196 }
197
198 let files = discover_files(&content_path).await?;
200
201 let mut stats = BuildStats {
202 nodes_created: 0,
203 edges_created: 0,
204 files_processed: 0,
205 files_skipped: 0,
206 errors: Vec::new(),
207 manual_edges_loaded: 0,
208 dangling_refs: Vec::new(),
209 deduped_edges: 0,
210 from_cache: false,
211 };
212
213 let mut graph = GraphData::new();
214
215 let mut pending_edges: Vec<(String, E::EdgeData)> = Vec::new();
217
218 for file_path in &files {
222 match self.process_file(&content_path, file_path) {
223 Ok((node_data, edge_data)) => {
224 let node = self.extractor.to_graph_node(&node_data);
225 graph.add_node(node.clone());
226 stats.nodes_created += 1;
227
228 if let Some(edges) = edge_data {
229 pending_edges.push((node.id.clone(), edges));
230 }
231 }
232 Err(e) => {
233 let build_error = BuildError {
234 file: file_path.clone(),
235 message: e.to_string(),
236 };
237
238 match self.error_handling {
239 ErrorHandling::FailFast => return Err(e),
240 ErrorHandling::Collect | ErrorHandling::Skip => {
241 stats.files_skipped += 1;
242 stats.errors.push(build_error);
243 }
244 }
245 }
246 }
247
248 stats.files_processed += 1;
249 }
250
251 let mut seen_edges: HashSet<(String, String, String)> = HashSet::new();
255
256 for (from_id, edge_data) in &pending_edges {
257 let edges = self.extractor.to_graph_edges(from_id, edge_data);
258 for edge in edges {
259 if !graph.contains_node(&edge.from) || !graph.contains_node(&edge.to) {
261 stats.dangling_refs.push(format!(
262 "{} -[{}]-> {}",
263 edge.from,
264 edge.relationship.name(),
265 edge.to
266 ));
267 continue;
268 }
269
270 let edge_key = (
272 edge.from.clone(),
273 edge.to.clone(),
274 edge.relationship.name().to_string(),
275 );
276 if !seen_edges.insert(edge_key) {
277 stats.deduped_edges += 1;
278 continue;
279 }
280
281 if graph.add_edge(edge).is_ok() {
282 stats.edges_created += 1;
283 }
284 }
285 }
286
287 if let Some(ref manual_path) = self.manual_edges_path {
291 stats.manual_edges_loaded =
292 load_manual_edges(manual_path, &mut graph, &mut seen_edges, &mut stats)?;
293 }
294
295 if let Some(ref cache_path) = self.cache_path {
297 let content_hash = compute_content_hash(&content_path)?;
298 let metadata = GraphMetadata {
299 content_hash: Some(content_hash),
300 source_file_count: Some(stats.files_processed),
301 ..Default::default()
302 };
303 if let Some(parent) = cache_path.parent()
305 && !parent.exists()
306 {
307 std::fs::create_dir_all(parent).map_err(|e| Error::io_with_path(e, parent))?;
308 }
309 if let Err(e) = persistence::save_graph(&graph, cache_path, Some(metadata)) {
310 log::warn!("Failed to save graph cache: {e}");
311 }
312 }
313
314 Ok((graph, stats))
315 }
316
317 fn process_file(
319 &self,
320 base_path: &Path,
321 file_path: &Path,
322 ) -> Result<(E::NodeData, Option<E::EdgeData>)> {
323 let content =
324 std::fs::read_to_string(file_path).map_err(|e| Error::io_with_path(e, file_path))?;
325
326 let fm_result = extract_frontmatter(&content)?;
327
328 let frontmatter = fm_result
329 .value()
330 .cloned()
331 .unwrap_or(yaml_serde::Value::Null);
332 let body = fm_result.body();
333
334 let node_data = self
335 .extractor
336 .extract_node(base_path, file_path, &frontmatter, body)?;
337
338 let edge_data = self.extractor.extract_edges(&frontmatter, body)?;
339
340 Ok((node_data, edge_data))
341 }
342}
343
344fn parse_relationship(s: &str) -> Relationship {
350 match s.to_lowercase().as_str() {
351 "prerequisite" | "prereq" => Relationship::Prerequisite,
352 "leads_to" | "leadsto" => Relationship::LeadsTo,
353 "relates_to" | "relatesto" | "related" => Relationship::RelatesTo,
354 "extends" => Relationship::Extends,
355 "introduces" => Relationship::Introduces,
356 "covers" => Relationship::Covers,
357 "variant_of" | "variantof" => Relationship::VariantOf,
358 "contrasts_with" | "contrastswith" => Relationship::ContrastsWith,
359 "answers_question" | "answersquestion" | "answers_questions" => {
360 Relationship::AnswersQuestion
361 }
362 other => Relationship::Custom(other.to_string()),
363 }
364}
365
366fn load_manual_edges(
368 path: &Path,
369 graph: &mut GraphData,
370 seen_edges: &mut HashSet<(String, String, String)>,
371 stats: &mut BuildStats,
372) -> Result<usize> {
373 if !path.exists() {
374 return Ok(0);
375 }
376
377 let json = std::fs::read_to_string(path).map_err(|e| Error::io_with_path(e, path))?;
378
379 let manual_edges: Vec<ManualEdge> = serde_json::from_str(&json)
380 .map_err(|e| Error::parse(format!("Failed to parse manual edges: {e}")))?;
381
382 let mut loaded = 0;
383 for manual in manual_edges {
384 if !graph.contains_node(&manual.from) || !graph.contains_node(&manual.to) {
385 stats.dangling_refs.push(format!(
386 "manual: {} -[{}]-> {}",
387 manual.from, manual.relationship, manual.to
388 ));
389 continue;
390 }
391
392 let edge_key = (
393 manual.from.clone(),
394 manual.to.clone(),
395 manual.relationship.clone(),
396 );
397 if !seen_edges.insert(edge_key) {
398 stats.deduped_edges += 1;
399 continue;
400 }
401
402 let relationship = parse_relationship(&manual.relationship);
403 let weight = manual
404 .weight
405 .unwrap_or_else(|| relationship.default_weight());
406
407 let edge = Edge {
408 from: manual.from,
409 to: manual.to,
410 relationship,
411 weight,
412 origin: EdgeOrigin::Manual,
413 };
414
415 if graph.add_edge(edge).is_ok() {
416 loaded += 1;
417 }
418 }
419
420 Ok(loaded)
421}
422
423fn compute_content_hash(dir: &Path) -> Result<String> {
428 use std::collections::hash_map::DefaultHasher;
429 use std::hash::{Hash, Hasher};
430
431 let mut hasher = DefaultHasher::new();
432 let mut file_info: Vec<(String, u64)> = Vec::new();
433
434 fn collect_files(dir: &Path, base: &Path, file_info: &mut Vec<(String, u64)>) -> Result<()> {
435 for entry in std::fs::read_dir(dir).map_err(|e| Error::io_with_path(e, dir))? {
436 let entry = entry.map_err(Error::io)?;
437 let path = entry.path();
438 if path.is_dir() {
439 collect_files(&path, base, file_info)?;
440 } else if path.extension().is_some_and(|e| e == "md") {
441 let relative = path
442 .strip_prefix(base)
443 .unwrap_or(&path)
444 .to_string_lossy()
445 .to_string();
446 let mtime = std::fs::metadata(&path)
447 .ok()
448 .and_then(|m| m.modified().ok())
449 .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
450 .map(|d| d.as_secs())
451 .unwrap_or(0);
452 file_info.push((relative, mtime));
453 }
454 }
455 Ok(())
456 }
457
458 collect_files(dir, dir, &mut file_info)?;
459 file_info.sort_by(|a, b| a.0.cmp(&b.0));
460
461 for (path, mtime) in &file_info {
462 path.hash(&mut hasher);
463 mtime.hash(&mut hasher);
464 }
465
466 Ok(format!("{:016x}", hasher.finish()))
467}
468
469async fn discover_files(base_path: &Path) -> Result<Vec<PathBuf>> {
471 use fabryk_core::util::files::{FindOptions, find_all_files};
472
473 let files = find_all_files(base_path, FindOptions::markdown()).await?;
474 let paths: Vec<PathBuf> = files.into_iter().map(|f| f.path).collect();
475
476 Ok(paths)
477}
478
#[cfg(test)]
mod tests {
    use super::*;
    use crate::Relationship;
    use crate::extractor::mock::MockExtractor;
    use tempfile::tempdir;

    /// Creates a temp dir with a `content/` folder holding two concept files,
    /// where `concept-a` lists `concept-b` as a prerequisite.
    ///
    /// The `TempDir` guard is returned so the files live as long as the test.
    async fn setup_test_files() -> (tempfile::TempDir, PathBuf) {
        let dir = tempdir().unwrap();
        let content_dir = dir.path().join("content");
        std::fs::create_dir(&content_dir).unwrap();

        let file_a = "---\ntitle: \"Concept A\"\ncategory: \"basics\"\nprerequisites:\n - concept-b\n---\n\n# Concept A\n\nContent here.\n";
        let file_b = "---\ntitle: \"Concept B\"\ncategory: \"fundamentals\"\n---\n\n# Concept B\n\nFoundation content.\n";

        std::fs::write(content_dir.join("concept-a.md"), file_a).unwrap();
        std::fs::write(content_dir.join("concept-b.md"), file_b).unwrap();

        (dir, content_dir)
    }

    #[tokio::test]
    async fn test_builder_basic() {
        let (_dir, content_dir) = setup_test_files().await;

        let (graph, stats) = GraphBuilder::new(MockExtractor)
            .with_content_path(&content_dir)
            .build()
            .await
            .unwrap();

        assert_eq!(stats.files_processed, 2);
        assert_eq!(graph.node_count(), 2);
        assert!(graph.contains_node("concept-a"));
        assert!(graph.contains_node("concept-b"));
    }

    #[tokio::test]
    async fn test_builder_extracts_edges() {
        let (_dir, content_dir) = setup_test_files().await;

        let (graph, stats) = GraphBuilder::new(MockExtractor)
            .with_content_path(&content_dir)
            .build()
            .await
            .unwrap();

        assert!(graph.edge_count() >= 1);
        assert!(stats.edges_created >= 1);
    }

    #[tokio::test]
    async fn test_builder_manual_edges() {
        let (_dir, content_dir) = setup_test_files().await;
        let manual_edges_path = content_dir.parent().unwrap().join("manual_edges.json");

        let manual_edges = r#"[
            {"from": "concept-a", "to": "concept-b", "relationship": "relates_to", "weight": 0.9}
        ]"#;
        std::fs::write(&manual_edges_path, manual_edges).unwrap();

        let (_graph, stats) = GraphBuilder::new(MockExtractor)
            .with_content_path(&content_dir)
            .with_manual_edges(&manual_edges_path)
            .build()
            .await
            .unwrap();

        assert_eq!(stats.manual_edges_loaded, 1);
    }

    #[tokio::test]
    async fn test_builder_error_handling_collect() {
        let dir = tempdir().unwrap();
        let content_dir = dir.path().join("content");
        std::fs::create_dir(&content_dir).unwrap();

        std::fs::write(
            content_dir.join("valid.md"),
            "---\ntitle: Valid\n---\nContent",
        )
        .unwrap();
        std::fs::write(content_dir.join("invalid.md"), "not yaml frontmatter").unwrap();

        let (_graph, stats) = GraphBuilder::new(MockExtractor)
            .with_content_path(&content_dir)
            .with_error_handling(ErrorHandling::Collect)
            .build()
            .await
            .unwrap();

        // Both files are visited whether or not the second one is rejected.
        assert_eq!(stats.files_processed, 2);
    }

    #[tokio::test]
    async fn test_builder_missing_content_path() {
        let result = GraphBuilder::new(MockExtractor).build().await;
        assert!(result.is_err());
    }

    #[tokio::test]
    async fn test_builder_dangling_refs() {
        let dir = tempdir().unwrap();
        let content_dir = dir.path().join("content");
        std::fs::create_dir(&content_dir).unwrap();

        let file = "---\ntitle: \"Orphan\"\nprerequisites:\n - nonexistent\n---\n\n# Orphan\n";
        std::fs::write(content_dir.join("orphan.md"), file).unwrap();

        let (_graph, stats) = GraphBuilder::new(MockExtractor)
            .with_content_path(&content_dir)
            .build()
            .await
            .unwrap();

        assert_eq!(stats.nodes_created, 1);
        assert!(!stats.dangling_refs.is_empty());
        assert!(stats.dangling_refs[0].contains("nonexistent"));
    }

    /// `a -> b` and `b -> a` are distinct edges, so neither may be dropped
    /// by deduplication.
    #[tokio::test]
    async fn test_builder_edge_dedup() {
        let dir = tempdir().unwrap();
        let content_dir = dir.path().join("content");
        std::fs::create_dir(&content_dir).unwrap();

        let file_a = "---\ntitle: \"A\"\nrelated:\n - b\n---\n\n# A\n";
        let file_b = "---\ntitle: \"B\"\nrelated:\n - a\n---\n\n# B\n";

        std::fs::write(content_dir.join("a.md"), file_a).unwrap();
        std::fs::write(content_dir.join("b.md"), file_b).unwrap();

        let (graph, stats) = GraphBuilder::new(MockExtractor)
            .with_content_path(&content_dir)
            .build()
            .await
            .unwrap();

        assert_eq!(graph.node_count(), 2);
        assert_eq!(stats.nodes_created, 2);
        assert_eq!(graph.edge_count(), 2);
        assert_eq!(stats.edges_created, 2);
    }

    #[tokio::test]
    async fn test_builder_empty_directory() {
        let dir = tempdir().unwrap();
        let content_dir = dir.path().join("empty");
        std::fs::create_dir(&content_dir).unwrap();

        let (graph, stats) = GraphBuilder::new(MockExtractor)
            .with_content_path(&content_dir)
            .build()
            .await
            .unwrap();

        assert_eq!(graph.node_count(), 0);
        assert_eq!(stats.files_processed, 0);
    }

    #[test]
    fn test_parse_relationship() {
        assert_eq!(
            parse_relationship("prerequisite"),
            Relationship::Prerequisite
        );
        assert_eq!(parse_relationship("prereq"), Relationship::Prerequisite);
        assert_eq!(parse_relationship("leads_to"), Relationship::LeadsTo);
        assert_eq!(parse_relationship("leadsto"), Relationship::LeadsTo);
        assert_eq!(parse_relationship("relates_to"), Relationship::RelatesTo);
        assert_eq!(parse_relationship("relatesto"), Relationship::RelatesTo);
        assert_eq!(parse_relationship("related"), Relationship::RelatesTo);
        assert_eq!(parse_relationship("extends"), Relationship::Extends);
        assert_eq!(parse_relationship("introduces"), Relationship::Introduces);
        assert_eq!(parse_relationship("covers"), Relationship::Covers);
        assert_eq!(parse_relationship("variant_of"), Relationship::VariantOf);
        assert_eq!(parse_relationship("variantof"), Relationship::VariantOf);
        assert_eq!(
            parse_relationship("contrasts_with"),
            Relationship::ContrastsWith
        );
        assert_eq!(
            parse_relationship("contrastswith"),
            Relationship::ContrastsWith
        );
        assert_eq!(
            parse_relationship("answers_question"),
            Relationship::AnswersQuestion
        );
        assert_eq!(
            parse_relationship("answersquestion"),
            Relationship::AnswersQuestion
        );
        assert_eq!(
            parse_relationship("answers_questions"),
            Relationship::AnswersQuestion
        );
        assert_eq!(
            parse_relationship("custom_rel"),
            Relationship::Custom("custom_rel".to_string())
        );
    }

    /// Names are matched case-insensitively and unknown names are stored in
    /// lower case.
    #[test]
    fn test_parse_relationship_case_insensitive() {
        assert_eq!(parse_relationship("PreReq"), Relationship::Prerequisite);
        assert_eq!(parse_relationship("RELATES_TO"), Relationship::RelatesTo);
        assert_eq!(
            parse_relationship("Custom_Rel"),
            Relationship::Custom("custom_rel".to_string())
        );
    }

    #[tokio::test]
    async fn test_builder_manual_edges_missing_file() {
        let (_dir, content_dir) = setup_test_files().await;
        let missing_path = content_dir.parent().unwrap().join("nonexistent.json");

        let (_graph, stats) = GraphBuilder::new(MockExtractor)
            .with_content_path(&content_dir)
            .with_manual_edges(&missing_path)
            .build()
            .await
            .unwrap();

        assert_eq!(stats.manual_edges_loaded, 0);
    }

    #[tokio::test]
    async fn test_builder_manual_edges_dangling() {
        let (_dir, content_dir) = setup_test_files().await;
        let manual_path = content_dir.parent().unwrap().join("manual.json");

        let manual = r#"[
            {"from": "concept-a", "to": "nonexistent", "relationship": "relates_to"}
        ]"#;
        std::fs::write(&manual_path, manual).unwrap();

        let (_graph, stats) = GraphBuilder::new(MockExtractor)
            .with_content_path(&content_dir)
            .with_manual_edges(&manual_path)
            .build()
            .await
            .unwrap();

        assert_eq!(stats.manual_edges_loaded, 0);
        assert!(
            stats
                .dangling_refs
                .iter()
                .any(|r| r.contains("nonexistent"))
        );
    }

    #[tokio::test]
    async fn test_builder_cache_hit() {
        let (_dir, content_dir) = setup_test_files().await;
        let cache_path = content_dir.parent().unwrap().join("graph-cache.json");

        // First build populates the cache.
        let (graph1, stats1) = GraphBuilder::new(MockExtractor)
            .with_content_path(&content_dir)
            .with_cache_path(&cache_path)
            .build()
            .await
            .unwrap();
        assert!(!stats1.from_cache);
        assert!(cache_path.exists());

        // Second build over unchanged content must be served from it.
        let (graph2, stats2) = GraphBuilder::new(MockExtractor)
            .with_content_path(&content_dir)
            .with_cache_path(&cache_path)
            .build()
            .await
            .unwrap();
        assert!(stats2.from_cache);
        assert_eq!(graph1.node_count(), graph2.node_count());
        assert_eq!(graph1.edge_count(), graph2.edge_count());
    }

    #[tokio::test]
    async fn test_builder_cache_miss_on_content_change() {
        let (_dir, content_dir) = setup_test_files().await;
        let cache_path = content_dir.parent().unwrap().join("graph-cache.json");

        let (_graph, stats1) = GraphBuilder::new(MockExtractor)
            .with_content_path(&content_dir)
            .with_cache_path(&cache_path)
            .build()
            .await
            .unwrap();
        assert!(!stats1.from_cache);

        // Adding a file changes the content fingerprint.
        let file_c = "---\ntitle: \"Concept C\"\ncategory: \"new\"\n---\n\n# Concept C\n";
        std::fs::write(content_dir.join("concept-c.md"), file_c).unwrap();

        let (graph, stats2) = GraphBuilder::new(MockExtractor)
            .with_content_path(&content_dir)
            .with_cache_path(&cache_path)
            .build()
            .await
            .unwrap();
        assert!(!stats2.from_cache);
        assert_eq!(graph.node_count(), 3);
    }

    #[tokio::test]
    async fn test_builder_skip_cache() {
        let (_dir, content_dir) = setup_test_files().await;
        let cache_path = content_dir.parent().unwrap().join("graph-cache.json");

        GraphBuilder::new(MockExtractor)
            .with_content_path(&content_dir)
            .with_cache_path(&cache_path)
            .build()
            .await
            .unwrap();

        // A fresh cache exists now, but `skip_cache` must ignore it.
        let (_graph, stats) = GraphBuilder::new(MockExtractor)
            .with_content_path(&content_dir)
            .with_cache_path(&cache_path)
            .skip_cache()
            .build()
            .await
            .unwrap();
        assert!(!stats.from_cache);
        assert_eq!(stats.files_processed, 2);
    }

    #[tokio::test]
    async fn test_builder_no_cache_path() {
        let (_dir, content_dir) = setup_test_files().await;

        let (_graph, stats) = GraphBuilder::new(MockExtractor)
            .with_content_path(&content_dir)
            .build()
            .await
            .unwrap();
        assert!(!stats.from_cache);
        assert_eq!(stats.files_processed, 2);
    }

    #[test]
    fn test_compute_content_hash_deterministic() {
        let dir = tempdir().unwrap();
        let content_dir = dir.path().join("content");
        std::fs::create_dir(&content_dir).unwrap();
        std::fs::write(content_dir.join("a.md"), "content a").unwrap();
        std::fs::write(content_dir.join("b.md"), "content b").unwrap();

        let hash1 = compute_content_hash(&content_dir).unwrap();
        let hash2 = compute_content_hash(&content_dir).unwrap();
        assert_eq!(hash1, hash2);
    }

    #[test]
    fn test_compute_content_hash_changes() {
        let dir = tempdir().unwrap();
        let content_dir = dir.path().join("content");
        std::fs::create_dir(&content_dir).unwrap();
        std::fs::write(content_dir.join("a.md"), "content a").unwrap();

        let hash1 = compute_content_hash(&content_dir).unwrap();

        std::fs::write(content_dir.join("b.md"), "content b").unwrap();

        let hash2 = compute_content_hash(&content_dir).unwrap();
        assert_ne!(hash1, hash2);
    }

    /// Only `.md` files take part in the fingerprint.
    #[test]
    fn test_compute_content_hash_ignores_non_markdown() {
        let dir = tempdir().unwrap();
        let content_dir = dir.path().join("content");
        std::fs::create_dir(&content_dir).unwrap();
        std::fs::write(content_dir.join("a.md"), "content a").unwrap();

        let hash1 = compute_content_hash(&content_dir).unwrap();

        std::fs::write(content_dir.join("notes.txt"), "not markdown").unwrap();

        let hash2 = compute_content_hash(&content_dir).unwrap();
        assert_eq!(hash1, hash2);
    }
}