use crate::persistence::{self, GraphMetadata};
use crate::{Edge, EdgeOrigin, GraphData, GraphExtractor, Relationship};
use fabryk_content::markdown::extract_frontmatter;
use fabryk_core::{Error, Result};
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use std::path::{Path, PathBuf};
/// Policy applied by [`GraphBuilder::build`] when a single content file fails to process.
///
/// The enum is fieldless, so it is `Copy` and comparable; callers and tests can
/// pass the policy around and check it with `==` without cloning.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum ErrorHandling {
    /// Abort the build and return the first file error. This is the default.
    #[default]
    FailFast,
    /// Record the failure in [`BuildStats::errors`] and continue with the next file.
    Collect,
    /// Continue with the next file after a failure.
    ///
    /// NOTE(review): `GraphBuilder::build` currently handles this exactly like
    /// [`ErrorHandling::Collect`] (the error is still recorded) — confirm whether
    /// a distinct behaviour was intended.
    Skip,
}
/// A per-file failure recorded while building the graph.
///
/// Instances are pushed onto [`BuildStats::errors`] when a file cannot be
/// processed and the build is configured to continue.
#[derive(Clone, Debug)]
pub struct BuildError {
    /// Path of the file whose processing failed.
    pub file: PathBuf,
    /// Rendered (`to_string()`) form of the underlying error.
    pub message: String,
}
/// One hand-authored edge, as stored in the manual-edges JSON file.
///
/// The file is deserialized as a JSON array of these records.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ManualEdge {
    /// Id of the source node.
    pub from: String,
    /// Id of the target node.
    pub to: String,
    /// Relationship name; interpreted by `parse_relationship` when the edge is loaded.
    pub relationship: String,
    /// Optional explicit weight; when absent the relationship's default weight is used.
    pub weight: Option<f32>,
}
/// Counters and diagnostics describing one [`GraphBuilder::build`] run.
#[derive(Clone, Debug)]
pub struct BuildStats {
    /// Nodes added to the graph (on a cache hit: the node count of the loaded graph).
    pub nodes_created: usize,
    /// Extractor-produced edges added to the graph (on a cache hit: the edge
    /// count of the loaded graph).
    pub edges_created: usize,
    /// Files visited during the build, including ones that failed; `0` on a cache hit.
    pub files_processed: usize,
    /// Files whose processing failed while the build was allowed to continue.
    pub files_skipped: usize,
    /// The failures behind `files_skipped`.
    pub errors: Vec<BuildError>,
    /// Edges successfully added from the manual-edges file.
    pub manual_edges_loaded: usize,
    /// Human-readable descriptions of edges dropped because an endpoint is not
    /// a node in the graph.
    pub dangling_refs: Vec<String>,
    /// Edges dropped because an identical (from, to, relationship) edge was already seen.
    pub deduped_edges: usize,
    /// `true` when the graph was loaded from the cache instead of being rebuilt.
    pub from_cache: bool,
}
/// Builder that turns a directory of markdown files into a [`GraphData`],
/// using a [`GraphExtractor`] to interpret each file.
///
/// Configure it with the `with_*` methods and finish with `build()`.
pub struct GraphBuilder<E>
where
    E: GraphExtractor,
{
    /// Domain-specific extractor that produces nodes and edges from file content.
    extractor: E,
    /// Root directory of the content files; required before `build()`.
    content_path: Option<PathBuf>,
    /// Optional JSON file with hand-authored edges.
    manual_edges_path: Option<PathBuf>,
    /// What to do when a single file fails to process.
    error_handling: ErrorHandling,
    /// Optional location of the persisted graph cache.
    cache_path: Option<PathBuf>,
    /// When `true`, an existing cache is never read (it may still be written).
    skip_cache: bool,
}
impl<E: GraphExtractor> GraphBuilder<E> {
pub fn new(extractor: E) -> Self {
Self {
extractor,
content_path: None,
manual_edges_path: None,
error_handling: ErrorHandling::default(),
cache_path: None,
skip_cache: false,
}
}
pub fn with_content_path(mut self, path: impl Into<PathBuf>) -> Self {
self.content_path = Some(path.into());
self
}
pub fn with_manual_edges(mut self, path: impl Into<PathBuf>) -> Self {
self.manual_edges_path = Some(path.into());
self
}
pub fn with_error_handling(mut self, handling: ErrorHandling) -> Self {
self.error_handling = handling;
self
}
pub fn with_cache_path(mut self, path: impl Into<PathBuf>) -> Self {
self.cache_path = Some(path.into());
self
}
pub fn skip_cache(mut self) -> Self {
self.skip_cache = true;
self
}
pub async fn build(self) -> Result<(GraphData, BuildStats)> {
let content_path = self
.content_path
.as_ref()
.ok_or_else(|| Error::config("Content path not set. Use with_content_path() first."))?
.clone();
if let Some(ref cache_path) = self.cache_path {
if !self.skip_cache {
let content_hash = compute_content_hash(&content_path)?;
if persistence::is_cache_fresh(cache_path, &content_hash) {
log::info!(
"Graph cache is fresh, loading from {}",
cache_path.display()
);
let graph = persistence::load_graph(cache_path)?;
let stats = BuildStats {
nodes_created: graph.node_count(),
edges_created: graph.edge_count(),
files_processed: 0,
files_skipped: 0,
errors: Vec::new(),
manual_edges_loaded: 0,
dangling_refs: Vec::new(),
deduped_edges: 0,
from_cache: true,
};
return Ok((graph, stats));
}
}
}
let files = discover_files(&content_path).await?;
let mut stats = BuildStats {
nodes_created: 0,
edges_created: 0,
files_processed: 0,
files_skipped: 0,
errors: Vec::new(),
manual_edges_loaded: 0,
dangling_refs: Vec::new(),
deduped_edges: 0,
from_cache: false,
};
let mut graph = GraphData::new();
let mut pending_edges: Vec<(String, E::EdgeData)> = Vec::new();
for file_path in &files {
match self.process_file(&content_path, file_path) {
Ok((node_data, edge_data)) => {
let node = self.extractor.to_graph_node(&node_data);
graph.add_node(node.clone());
stats.nodes_created += 1;
if let Some(edges) = edge_data {
pending_edges.push((node.id.clone(), edges));
}
}
Err(e) => {
let build_error = BuildError {
file: file_path.clone(),
message: e.to_string(),
};
match self.error_handling {
ErrorHandling::FailFast => return Err(e),
ErrorHandling::Collect | ErrorHandling::Skip => {
stats.files_skipped += 1;
stats.errors.push(build_error);
}
}
}
}
stats.files_processed += 1;
}
let mut seen_edges: HashSet<(String, String, String)> = HashSet::new();
for (from_id, edge_data) in &pending_edges {
let edges = self.extractor.to_graph_edges(from_id, edge_data);
for edge in edges {
if !graph.contains_node(&edge.from) || !graph.contains_node(&edge.to) {
stats.dangling_refs.push(format!(
"{} -[{}]-> {}",
edge.from,
edge.relationship.name(),
edge.to
));
continue;
}
let edge_key = (
edge.from.clone(),
edge.to.clone(),
edge.relationship.name().to_string(),
);
if !seen_edges.insert(edge_key) {
stats.deduped_edges += 1;
continue;
}
if graph.add_edge(edge).is_ok() {
stats.edges_created += 1;
}
}
}
if let Some(ref manual_path) = self.manual_edges_path {
stats.manual_edges_loaded =
load_manual_edges(manual_path, &mut graph, &mut seen_edges, &mut stats)?;
}
if let Some(ref cache_path) = self.cache_path {
let content_hash = compute_content_hash(&content_path)?;
let metadata = GraphMetadata {
content_hash: Some(content_hash),
source_file_count: Some(stats.files_processed),
..Default::default()
};
if let Some(parent) = cache_path.parent() {
if !parent.exists() {
std::fs::create_dir_all(parent).map_err(|e| Error::io_with_path(e, parent))?;
}
}
if let Err(e) = persistence::save_graph(&graph, cache_path, Some(metadata)) {
log::warn!("Failed to save graph cache: {e}");
}
}
Ok((graph, stats))
}
fn process_file(
&self,
base_path: &Path,
file_path: &Path,
) -> Result<(E::NodeData, Option<E::EdgeData>)> {
let content =
std::fs::read_to_string(file_path).map_err(|e| Error::io_with_path(e, file_path))?;
let fm_result = extract_frontmatter(&content)?;
let frontmatter = fm_result
.value()
.cloned()
.unwrap_or(serde_yaml::Value::Null);
let body = fm_result.body();
let node_data = self
.extractor
.extract_node(base_path, file_path, &frontmatter, body)?;
let edge_data = self.extractor.extract_edges(&frontmatter, body)?;
Ok((node_data, edge_data))
}
}
/// Maps a relationship name from the manual-edges file to a [`Relationship`].
///
/// Matching is case-insensitive and accepts a few aliases per variant. Any
/// name that is not recognised becomes `Relationship::Custom` carrying the
/// lower-cased input.
fn parse_relationship(s: &str) -> Relationship {
    let normalized = s.to_lowercase();

    let builtin = match normalized.as_str() {
        "prerequisite" | "prereq" => Some(Relationship::Prerequisite),
        "leads_to" | "leadsto" => Some(Relationship::LeadsTo),
        "relates_to" | "relatesto" | "related" => Some(Relationship::RelatesTo),
        "extends" => Some(Relationship::Extends),
        "introduces" => Some(Relationship::Introduces),
        "covers" => Some(Relationship::Covers),
        "variant_of" | "variantof" => Some(Relationship::VariantOf),
        "contrasts_with" | "contrastswith" => Some(Relationship::ContrastsWith),
        "answers_question" | "answersquestion" | "answers_questions" => {
            Some(Relationship::AnswersQuestion)
        }
        _ => None,
    };

    // Unknown names are kept (lower-cased) rather than rejected.
    builtin.unwrap_or_else(|| Relationship::Custom(normalized))
}
/// Loads hand-authored edges from the JSON file at `path` into `graph`.
///
/// The file must contain a JSON array of [`ManualEdge`] records. For each one:
/// an edge whose endpoint is not a node in `graph` is recorded in
/// `stats.dangling_refs` (prefixed with `manual:`) and dropped; an edge whose
/// (from, to, relationship) key is already in `seen_edges` bumps
/// `stats.deduped_edges` and is dropped; otherwise it is added with
/// `EdgeOrigin::Manual`, using the relationship's default weight when the
/// record has none.
///
/// Returns the number of edges actually added. A missing file is not an error
/// and yields `0`.
///
/// # Errors
///
/// Returns an error if the file exists but cannot be read or is not valid JSON
/// of the expected shape.
fn load_manual_edges(
    path: &Path,
    graph: &mut GraphData,
    seen_edges: &mut HashSet<(String, String, String)>,
    stats: &mut BuildStats,
) -> Result<usize> {
    if !path.exists() {
        return Ok(0);
    }

    let json = std::fs::read_to_string(path).map_err(|e| Error::io_with_path(e, path))?;
    let manual_edges: Vec<ManualEdge> = serde_json::from_str(&json)
        .map_err(|e| Error::parse(format!("Failed to parse manual edges: {e}")))?;

    let mut loaded = 0;
    for manual in manual_edges {
        if !graph.contains_node(&manual.from) || !graph.contains_node(&manual.to) {
            stats.dangling_refs.push(format!(
                "manual: {} -[{}]-> {}",
                manual.from, manual.relationship, manual.to
            ));
            continue;
        }

        // Key on the *parsed* relationship's name, exactly as the extractor pass
        // does. Keying on the raw string let aliases and case variants such as
        // "related" or "Relates_To" slip past de-duplication against an
        // already-present edge of the same parsed relationship.
        let relationship = parse_relationship(&manual.relationship);
        let edge_key = (
            manual.from.clone(),
            manual.to.clone(),
            relationship.name().to_string(),
        );
        if !seen_edges.insert(edge_key) {
            stats.deduped_edges += 1;
            continue;
        }

        let weight = manual
            .weight
            .unwrap_or_else(|| relationship.default_weight());
        let edge = Edge {
            from: manual.from,
            to: manual.to,
            relationship,
            weight,
            origin: EdgeOrigin::Manual,
        };
        // An edge the graph rejects is not counted as loaded.
        if graph.add_edge(edge).is_ok() {
            loaded += 1;
        }
    }
    Ok(loaded)
}
/// Computes a fingerprint of the markdown files under `dir`, used to decide
/// whether a persisted graph cache is still fresh.
///
/// Every `.md` file found by walking `dir` recursively contributes its path
/// relative to `dir`, its size in bytes and its modification time in
/// nanoseconds. Size and sub-second precision are included so that an edit
/// made within the same wall-clock second as the previous build still changes
/// the fingerprint. File contents are not read.
///
/// Returns the fingerprint as a 16-digit lower-case hex string.
///
/// NOTE: the standard library does not guarantee `DefaultHasher`'s algorithm
/// across Rust releases, so a toolchain upgrade may change the fingerprint.
/// That only causes a rebuild, never a stale cache.
///
/// # Errors
///
/// Returns an error if `dir` or one of its sub-directories cannot be listed.
/// Metadata that cannot be read for an individual file is treated as size `0`
/// and time `0` rather than failing.
fn compute_content_hash(dir: &Path) -> Result<String> {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    /// Recursively gathers `(relative path, size, mtime in ns)` for each `.md` file.
    fn collect_files(
        dir: &Path,
        base: &Path,
        file_info: &mut Vec<(String, u64, u128)>,
    ) -> Result<()> {
        for entry in std::fs::read_dir(dir).map_err(|e| Error::io_with_path(e, dir))? {
            let entry = entry.map_err(Error::io)?;
            let path = entry.path();
            if path.is_dir() {
                collect_files(&path, base, file_info)?;
            } else if path.extension().is_some_and(|ext| ext == "md") {
                let relative = path
                    .strip_prefix(base)
                    .unwrap_or(&path)
                    .to_string_lossy()
                    .into_owned();
                // Best effort: unreadable metadata degrades to zeros.
                let metadata = std::fs::metadata(&path).ok();
                let size = metadata.as_ref().map_or(0, |m| m.len());
                let mtime_nanos = metadata
                    .and_then(|m| m.modified().ok())
                    .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
                    .map_or(0, |d| d.as_nanos());
                file_info.push((relative, size, mtime_nanos));
            }
        }
        Ok(())
    }

    let mut file_info = Vec::new();
    collect_files(dir, dir, &mut file_info)?;
    // Directory iteration order is unspecified; sort so the result is stable.
    file_info.sort_unstable();

    let mut hasher = DefaultHasher::new();
    for (path, size, mtime_nanos) in &file_info {
        path.hash(&mut hasher);
        size.hash(&mut hasher);
        mtime_nanos.hash(&mut hasher);
    }
    Ok(format!("{:016x}", hasher.finish()))
}
/// Returns the paths of all files under `base_path` that the shared
/// `find_all_files` helper reports for its markdown options.
///
/// NOTE(review): this relies on `FindOptions::markdown()`, whereas
/// `compute_content_hash` walks the tree itself and matches the `md`
/// extension — confirm that both select the same set of files.
///
/// # Errors
///
/// Propagates any error returned by `find_all_files`.
async fn discover_files(base_path: &Path) -> Result<Vec<PathBuf>> {
    use fabryk_core::util::files::{FindOptions, find_all_files};

    Ok(find_all_files(base_path, FindOptions::markdown())
        .await?
        .into_iter()
        .map(|entry| entry.path)
        .collect())
}
// Tests for `GraphBuilder` and the private helpers of this module.
// They build small content trees in temporary directories and run the builder
// with `MockExtractor`.
#[cfg(test)]
mod tests {
use super::*;
use crate::Relationship;
use crate::extractor::mock::MockExtractor;
use tempfile::tempdir;
// Creates `<tmp>/content` holding `concept-a.md` (which lists `concept-b` under
// `prerequisites`) and `concept-b.md`. Returns the temp-dir guard, which must be
// kept alive for the duration of the test, together with the content directory.
// NOTE(review): declared `async` although nothing inside is awaited.
async fn setup_test_files() -> (tempfile::TempDir, PathBuf) {
let dir = tempdir().unwrap();
let content_dir = dir.path().join("content");
std::fs::create_dir(&content_dir).unwrap();
let file_a = "---\ntitle: \"Concept A\"\ncategory: \"basics\"\nprerequisites:\n - concept-b\n---\n\n# Concept A\n\nContent here.\n";
let file_b = "---\ntitle: \"Concept B\"\ncategory: \"fundamentals\"\n---\n\n# Concept B\n\nFoundation content.\n";
std::fs::write(content_dir.join("concept-a.md"), file_a).unwrap();
std::fs::write(content_dir.join("concept-b.md"), file_b).unwrap();
(dir, content_dir)
}
// Two files in, two nodes out, with ids matching the file stems.
#[tokio::test]
async fn test_builder_basic() {
let (_dir, content_dir) = setup_test_files().await;
let (graph, stats) = GraphBuilder::new(MockExtractor)
.with_content_path(&content_dir)
.build()
.await
.unwrap();
assert_eq!(stats.files_processed, 2);
assert_eq!(graph.node_count(), 2);
assert!(graph.contains_node("concept-a"));
assert!(graph.contains_node("concept-b"));
}
// The `prerequisites` entry in concept-a must yield at least one edge.
#[tokio::test]
async fn test_builder_extracts_edges() {
let (_dir, content_dir) = setup_test_files().await;
let (graph, stats) = GraphBuilder::new(MockExtractor)
.with_content_path(&content_dir)
.build()
.await
.unwrap();
assert!(graph.edge_count() >= 1);
assert!(stats.edges_created >= 1);
}
// A manual edge between two existing nodes is loaded and counted.
#[tokio::test]
async fn test_builder_manual_edges() {
let (_dir, content_dir) = setup_test_files().await;
let manual_edges_path = content_dir.parent().unwrap().join("manual_edges.json");
let manual_edges = r#"[
{"from": "concept-a", "to": "concept-b", "relationship": "relates_to", "weight": 0.9}
]"#;
std::fs::write(&manual_edges_path, manual_edges).unwrap();
let (_graph, stats) = GraphBuilder::new(MockExtractor)
.with_content_path(&content_dir)
.with_manual_edges(&manual_edges_path)
.build()
.await
.unwrap();
assert_eq!(stats.manual_edges_loaded, 1);
}
// With `ErrorHandling::Collect` the build completes even when one file has no
// front matter; both files count as processed.
#[tokio::test]
async fn test_builder_error_handling_collect() {
let dir = tempdir().unwrap();
let content_dir = dir.path().join("content");
std::fs::create_dir(&content_dir).unwrap();
std::fs::write(
content_dir.join("valid.md"),
"---\ntitle: Valid\n---\nContent",
)
.unwrap();
std::fs::write(content_dir.join("invalid.md"), "not yaml frontmatter").unwrap();
let (_graph, stats) = GraphBuilder::new(MockExtractor)
.with_content_path(&content_dir)
.with_error_handling(ErrorHandling::Collect)
.build()
.await
.unwrap();
assert_eq!(stats.files_processed, 2);
// NOTE(review): implied by the assertion above; the test never inspects
// `stats.errors` or `stats.files_skipped`.
assert!(stats.files_processed >= 1);
}
// `build()` without a content path must fail.
#[tokio::test]
async fn test_builder_missing_content_path() {
let result = GraphBuilder::new(MockExtractor).build().await;
assert!(result.is_err());
}
// A prerequisite pointing at a node that does not exist is reported as a
// dangling reference instead of becoming an edge.
#[tokio::test]
async fn test_builder_dangling_refs() {
let dir = tempdir().unwrap();
let content_dir = dir.path().join("content");
std::fs::create_dir(&content_dir).unwrap();
let file = "---\ntitle: \"Orphan\"\nprerequisites:\n - nonexistent\n---\n\n# Orphan\n";
std::fs::write(content_dir.join("orphan.md"), file).unwrap();
let (_graph, stats) = GraphBuilder::new(MockExtractor)
.with_content_path(&content_dir)
.build()
.await
.unwrap();
assert_eq!(stats.nodes_created, 1);
assert!(!stats.dangling_refs.is_empty());
assert!(stats.dangling_refs[0].contains("nonexistent"));
}
// Two files that list each other under `related`: both directions are kept,
// since the dedup key is directional (from, to, relationship).
// NOTE(review): `stats.deduped_edges` itself is not asserted here.
#[tokio::test]
async fn test_builder_edge_dedup() {
let dir = tempdir().unwrap();
let content_dir = dir.path().join("content");
std::fs::create_dir(&content_dir).unwrap();
let file_a = "---\ntitle: \"A\"\nrelated:\n - b\n---\n\n# A\n";
let file_b = "---\ntitle: \"B\"\nrelated:\n - a\n---\n\n# B\n";
std::fs::write(content_dir.join("a.md"), file_a).unwrap();
std::fs::write(content_dir.join("b.md"), file_b).unwrap();
let (graph, stats) = GraphBuilder::new(MockExtractor)
.with_content_path(&content_dir)
.build()
.await
.unwrap();
assert_eq!(graph.node_count(), 2);
assert_eq!(stats.nodes_created, 2);
assert_eq!(graph.edge_count(), 2);
assert_eq!(stats.edges_created, 2);
}
// An empty content directory produces an empty graph without error.
#[tokio::test]
async fn test_builder_empty_directory() {
let dir = tempdir().unwrap();
let content_dir = dir.path().join("empty");
std::fs::create_dir(&content_dir).unwrap();
let (graph, stats) = GraphBuilder::new(MockExtractor)
.with_content_path(&content_dir)
.build()
.await
.unwrap();
assert_eq!(graph.node_count(), 0);
assert_eq!(stats.files_processed, 0);
}
// Canonical names, a few aliases, and the fallback to `Custom`.
#[test]
fn test_parse_relationship() {
assert_eq!(
parse_relationship("prerequisite"),
Relationship::Prerequisite
);
assert_eq!(parse_relationship("prereq"), Relationship::Prerequisite);
assert_eq!(parse_relationship("leads_to"), Relationship::LeadsTo);
assert_eq!(parse_relationship("relates_to"), Relationship::RelatesTo);
assert_eq!(parse_relationship("related"), Relationship::RelatesTo);
assert_eq!(parse_relationship("extends"), Relationship::Extends);
assert_eq!(parse_relationship("introduces"), Relationship::Introduces);
assert_eq!(parse_relationship("covers"), Relationship::Covers);
assert_eq!(parse_relationship("variant_of"), Relationship::VariantOf);
assert_eq!(
parse_relationship("custom_rel"),
Relationship::Custom("custom_rel".to_string())
);
}
// A configured but non-existent manual-edges file is silently ignored.
#[tokio::test]
async fn test_builder_manual_edges_missing_file() {
let (_dir, content_dir) = setup_test_files().await;
let missing_path = content_dir.parent().unwrap().join("nonexistent.json");
let (_graph, stats) = GraphBuilder::new(MockExtractor)
.with_content_path(&content_dir)
.with_manual_edges(&missing_path)
.build()
.await
.unwrap();
assert_eq!(stats.manual_edges_loaded, 0);
}
// A manual edge to an unknown node is not loaded and is reported as dangling.
// The record omits `weight`, so parsing must accept a missing weight.
#[tokio::test]
async fn test_builder_manual_edges_dangling() {
let (_dir, content_dir) = setup_test_files().await;
let manual_path = content_dir.parent().unwrap().join("manual.json");
let manual = r#"[
{"from": "concept-a", "to": "nonexistent", "relationship": "relates_to"}
]"#;
std::fs::write(&manual_path, manual).unwrap();
let (_graph, stats) = GraphBuilder::new(MockExtractor)
.with_content_path(&content_dir)
.with_manual_edges(&manual_path)
.build()
.await
.unwrap();
assert_eq!(stats.manual_edges_loaded, 0);
assert!(
stats
.dangling_refs
.iter()
.any(|r| r.contains("nonexistent"))
);
}
// First build writes the cache; a second build over unchanged content loads
// it and yields a graph with the same node and edge counts.
#[tokio::test]
async fn test_builder_cache_hit() {
let (_dir, content_dir) = setup_test_files().await;
let cache_path = content_dir.parent().unwrap().join("graph-cache.json");
let (graph1, stats1) = GraphBuilder::new(MockExtractor)
.with_content_path(&content_dir)
.with_cache_path(&cache_path)
.build()
.await
.unwrap();
assert!(!stats1.from_cache);
assert!(cache_path.exists());
let (graph2, stats2) = GraphBuilder::new(MockExtractor)
.with_content_path(&content_dir)
.with_cache_path(&cache_path)
.build()
.await
.unwrap();
assert!(stats2.from_cache);
assert_eq!(graph1.node_count(), graph2.node_count());
assert_eq!(graph1.edge_count(), graph2.edge_count());
}
// Adding a new markdown file after the first build must invalidate the cache.
#[tokio::test]
async fn test_builder_cache_miss_on_content_change() {
let (_dir, content_dir) = setup_test_files().await;
let cache_path = content_dir.parent().unwrap().join("graph-cache.json");
let (_graph, stats1) = GraphBuilder::new(MockExtractor)
.with_content_path(&content_dir)
.with_cache_path(&cache_path)
.build()
.await
.unwrap();
assert!(!stats1.from_cache);
let file_c = "---\ntitle: \"Concept C\"\ncategory: \"new\"\n---\n\n# Concept C\n";
std::fs::write(content_dir.join("concept-c.md"), file_c).unwrap();
let (graph, stats2) = GraphBuilder::new(MockExtractor)
.with_content_path(&content_dir)
.with_cache_path(&cache_path)
.build()
.await
.unwrap();
assert!(!stats2.from_cache);
assert_eq!(graph.node_count(), 3);
}
// `skip_cache()` forces a rebuild even though a fresh cache exists.
#[tokio::test]
async fn test_builder_skip_cache() {
let (_dir, content_dir) = setup_test_files().await;
let cache_path = content_dir.parent().unwrap().join("graph-cache.json");
GraphBuilder::new(MockExtractor)
.with_content_path(&content_dir)
.with_cache_path(&cache_path)
.build()
.await
.unwrap();
let (_graph, stats) = GraphBuilder::new(MockExtractor)
.with_content_path(&content_dir)
.with_cache_path(&cache_path)
.skip_cache()
.build()
.await
.unwrap();
assert!(!stats.from_cache);
assert_eq!(stats.files_processed, 2);
}
// Without a cache path the builder always does a full build.
#[tokio::test]
async fn test_builder_no_cache_path() {
let (_dir, content_dir) = setup_test_files().await;
let (_graph, stats) = GraphBuilder::new(MockExtractor)
.with_content_path(&content_dir)
.build()
.await
.unwrap();
assert!(!stats.from_cache);
assert_eq!(stats.files_processed, 2);
}
// Hashing the same unchanged directory twice gives the same value.
#[test]
fn test_compute_content_hash_deterministic() {
let dir = tempdir().unwrap();
let content_dir = dir.path().join("content");
std::fs::create_dir(&content_dir).unwrap();
std::fs::write(content_dir.join("a.md"), "content a").unwrap();
std::fs::write(content_dir.join("b.md"), "content b").unwrap();
let hash1 = compute_content_hash(&content_dir).unwrap();
let hash2 = compute_content_hash(&content_dir).unwrap();
assert_eq!(hash1, hash2);
}
// Adding a markdown file changes the hash.
#[test]
fn test_compute_content_hash_changes() {
let dir = tempdir().unwrap();
let content_dir = dir.path().join("content");
std::fs::create_dir(&content_dir).unwrap();
std::fs::write(content_dir.join("a.md"), "content a").unwrap();
let hash1 = compute_content_hash(&content_dir).unwrap();
std::fs::write(content_dir.join("b.md"), "content b").unwrap();
let hash2 = compute_content_hash(&content_dir).unwrap();
assert_ne!(hash1, hash2);
}
}