Skip to main content

drft/
graph.rs

1use anyhow::Result;
2use std::collections::HashMap;
3use std::path::Path;
4
5use crate::config::Config;
6use crate::discovery::{discover, find_child_graphs};
7use crate::parsers;
8
9/// Check if a target string is a URI (has a scheme per RFC 3986).
10/// A scheme is `[a-zA-Z][a-zA-Z0-9+.-]*:` — e.g., `http:`, `mailto:`, `ftp:`, `tel:`.
11pub fn is_uri(target: &str) -> bool {
12    let bytes = target.as_bytes();
13    if bytes.is_empty() || !bytes[0].is_ascii_alphabetic() {
14        return false;
15    }
16    for &b in &bytes[1..] {
17        if b == b':' {
18            return true;
19        }
20        if !b.is_ascii_alphanumeric() && b != b'+' && b != b'-' && b != b'.' {
21            return false;
22        }
23    }
24    false
25}
26
27#[derive(
28    Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, serde::Serialize, serde::Deserialize,
29)]
30#[serde(rename_all = "lowercase")]
31pub enum NodeType {
32    File,
33    Directory,
34    External,
35}
36
37#[derive(Debug, Clone, serde::Serialize)]
38pub struct Node {
39    pub path: String,
40    pub node_type: NodeType,
41    pub hash: Option<String>,
42    /// Which graph this node belongs to — mirrors filesystem directory entries:
43    /// `"."` = local, `".."` = parent (escape), `"child"` = child graph, `None` = not on filesystem.
44    #[serde(skip_serializing_if = "Option::is_none")]
45    pub graph: Option<String>,
46    /// True when this Directory node has a drft.toml (is a drft graph).
47    #[serde(skip_serializing_if = "std::ops::Not::not")]
48    pub is_graph: bool,
49    /// Structured metadata from parsers, keyed by parser name.
50    #[serde(skip_serializing_if = "HashMap::is_empty")]
51    pub metadata: HashMap<String, serde_json::Value>,
52}
53
54#[derive(Debug, Clone)]
55pub struct Edge {
56    pub source: String,
57    /// Node identity — always matches a key in `graph.nodes` (or is a dangling target).
58    /// Fragment-stripped: `bar.md`, not `bar.md#heading`.
59    pub target: String,
60    /// Original link when it differs from target (e.g., `bar.md#heading`).
61    /// Absent when the link resolved to exactly the node ID.
62    pub link: Option<String>,
63    /// Which parser discovered this edge (provenance).
64    pub parser: String,
65}
66
67/// Filesystem properties of an edge target, probed during graph building.
68/// Stored per-target on the Graph, not per-edge.
69#[derive(Debug, Clone, Default)]
70pub struct TargetProperties {
71    pub is_symlink: bool,
72    pub is_directory: bool,
73    pub symlink_target: Option<String>,
74}
75
76#[derive(Debug, Default)]
77pub struct Graph {
78    pub nodes: HashMap<String, Node>,
79    pub edges: Vec<Edge>,
80    pub forward: HashMap<String, Vec<usize>>,
81    pub reverse: HashMap<String, Vec<usize>>,
82    pub child_graphs: Vec<String>,
83    /// Resolved interface nodes from config (empty = open graph).
84    pub interface: Vec<String>,
85    /// Filesystem properties of edge targets, keyed by node identity (fragment-stripped).
86    pub target_properties: HashMap<String, TargetProperties>,
87}
88
89impl Graph {
90    pub fn new() -> Self {
91        Self::default()
92    }
93
94    pub fn add_node(&mut self, node: Node) {
95        self.nodes.insert(node.path.clone(), node);
96    }
97
98    /// Returns true for File nodes (excludes External and Directory).
99    /// Used by structural analyses that operate only on declared file-backed nodes.
100    pub fn is_file_node(&self, path: &str) -> bool {
101        self.nodes
102            .get(path)
103            .is_some_and(|n| n.node_type == NodeType::File)
104    }
105
106    pub fn add_edge(&mut self, edge: Edge) {
107        let idx = self.edges.len();
108        self.forward
109            .entry(edge.source.clone())
110            .or_default()
111            .push(idx);
112        self.reverse
113            .entry(edge.target.clone())
114            .or_default()
115            .push(idx);
116        self.edges.push(edge);
117    }
118
119    /// Get filesystem properties for an edge target.
120    pub fn target_props(&self, target: &str) -> Option<&TargetProperties> {
121        self.target_properties.get(target)
122    }
123}
124
125/// Hash file contents with BLAKE3, returning `b3:<hex>`.
126pub fn hash_bytes(content: &[u8]) -> String {
127    format!("b3:{}", blake3::hash(content).to_hex())
128}
129
130/// Load a child graph's config, and if it declares an `[interface]`,
131/// add each interface file as an External node with a coupling edge
132/// to the child's Directory node.
133fn promote_interface_files(
134    root: &Path,
135    child_name: &str,
136    graph: &mut Graph,
137    implicit_edges: &mut Vec<Edge>,
138) {
139    let child_dir = root.join(child_name);
140    let config = match Config::load(&child_dir) {
141        Ok(c) => c,
142        Err(_) => return,
143    };
144
145    let (interface_files, interface_ignore) = match &config.interface {
146        Some(iface) => (&iface.files, &iface.ignore),
147        None => return,
148    };
149
150    // Resolve interface globs to actual files, honoring child excludes and interface ignores
151    let mut exclude_patterns = config.exclude.clone();
152    exclude_patterns.extend(interface_ignore.iter().cloned());
153
154    let included = match discover(&child_dir, interface_files, &exclude_patterns) {
155        Ok(files) => files,
156        Err(_) => return,
157    };
158
159    for file in included {
160        let node_path = format!("{child_name}/{file}");
161        if graph.nodes.contains_key(&node_path) {
162            continue;
163        }
164        let file_path = child_dir.join(&file);
165        let hash = std::fs::read(&file_path).ok().map(|c| hash_bytes(&c));
166        graph.add_node(Node {
167            path: node_path.clone(),
168            node_type: NodeType::External,
169            hash,
170            graph: Some(child_name.into()),
171            is_graph: false,
172            metadata: HashMap::new(),
173        });
174        implicit_edges.push(Edge {
175            source: node_path,
176            target: child_name.into(),
177            link: None,
178            parser: String::new(),
179        });
180    }
181}
182
183/// Build a graph from files in `root`.
184///
185/// 1. Discover File nodes via `include`/`exclude` — hash raw bytes for all.
186/// 2. Read text content for parser input (graceful skip for binary files).
187/// 3. Run parsers to extract edges.
188/// 4. Edge targets outside `include` become External nodes (not tracked).
189pub fn build_graph(root: &Path, config: &Config) -> Result<Graph> {
190    let included_files = discover(root, &config.include, &config.exclude)?;
191    let child_graphs = find_child_graphs(root, &config.exclude)?;
192    let mut graph = Graph::new();
193    graph.child_graphs = child_graphs;
194    let mut pending_edges = Vec::new();
195
196    // 1. Create File nodes for everything in include — hash raw bytes.
197    //    Separately read text content for files parsers will need.
198    let mut file_text: HashMap<String, String> = HashMap::new(); // path → text content
199
200    for file in &included_files {
201        let file_path = root.join(file);
202        let raw = std::fs::read(&file_path)?;
203        let hash = hash_bytes(&raw);
204
205        graph.add_node(Node {
206            path: file.clone(),
207            node_type: NodeType::File,
208            hash: Some(hash),
209            graph: Some(".".into()),
210            is_graph: false,
211            metadata: HashMap::new(),
212        });
213
214        // Try to read as text for parser input — binary files just won't have text
215        if let Ok(text) = String::from_utf8(raw) {
216            file_text.insert(file.clone(), text);
217        }
218    }
219
220    // 2. Build parser registry and determine which files each parser receives
221    let parser_list = parsers::build_parsers(&config.parsers, config.config_dir.as_deref(), root);
222    let mut parser_files: Vec<Vec<String>> = vec![Vec::new(); parser_list.len()];
223
224    for file in &included_files {
225        for (i, parser) in parser_list.iter().enumerate() {
226            if parser.matches(file) {
227                parser_files[i].push(file.clone());
228            }
229        }
230    }
231
232    // 3. Run each parser in batch mode
233    for (i, parser) in parser_list.iter().enumerate() {
234        let files: Vec<(&str, &str)> = parser_files[i]
235            .iter()
236            .filter_map(|path| {
237                file_text
238                    .get(path)
239                    .map(|content| (path.as_str(), content.as_str()))
240            })
241            .collect();
242
243        if files.is_empty() {
244            continue;
245        }
246
247        let batch_results = parser.parse_batch(&files);
248
249        for (file, result) in batch_results {
250            // Attach metadata to node if parser returned it
251            if let Some(metadata) = result.metadata
252                && let Some(node) = graph.nodes.get_mut(&file)
253            {
254                node.metadata.insert(parser.name().to_string(), metadata);
255            }
256
257            for link in result.links {
258                let normalized = match normalize_link_target(&link) {
259                    Some(n) => n,
260                    None => continue, // filtered (empty, anchor-only)
261                };
262
263                let target = if is_uri(&normalized.target) {
264                    normalized.target
265                } else {
266                    resolve_link(&file, &normalized.target)
267                };
268                // link carries the full original when it has a fragment
269                let link = normalized.fragment.map(|frag| format!("{target}{frag}"));
270                pending_edges.push(Edge {
271                    source: file.clone(),
272                    target,
273                    link,
274                    parser: parser.name().to_string(),
275                });
276            }
277        }
278    }
279
280    // 4. Edge-driven node creation.
281    //    Child graph directories are NOT pre-created — they only get nodes when
282    //    an edge references them or a file inside them. This keeps the model
283    //    uniform: edges create nodes, discovery creates files.
284    let graph_prefixes: Vec<String> = graph.child_graphs.clone();
285
286    // 5. Classify edge targets not already in the graph.
287    //    edge.target is already the node identity (fragment-stripped).
288    //    graph field uses filesystem-relative convention:
289    //      "."           — belongs to current graph
290    //      ".."          — escaped to parent graph
291    //      "research"    — belongs to child graph "research"
292    //      None          — not on the filesystem (URI)
293    let mut implicit_edges = Vec::new();
294    for edge in &pending_edges {
295        if graph.nodes.contains_key(&edge.target) {
296            continue;
297        }
298
299        // URIs → External (not on filesystem)
300        if is_uri(&edge.target) {
301            graph.add_node(Node {
302                path: edge.target.clone(),
303                node_type: NodeType::External,
304                hash: None,
305                graph: None,
306                is_graph: false,
307                metadata: HashMap::new(),
308            });
309            continue;
310        }
311
312        // Boundary escape (../ targets) → External with graph: ".."
313        if edge.target.starts_with("../") || edge.target == ".." {
314            graph.add_node(Node {
315                path: edge.target.clone(),
316                node_type: NodeType::External,
317                hash: None,
318                graph: Some("..".into()),
319                is_graph: false,
320                metadata: HashMap::new(),
321            });
322            continue;
323        }
324
325        // Target inside a child graph → External with graph: "child"
326        let in_child_graph = graph_prefixes
327            .iter()
328            .find(|s| edge.target.starts_with(&format!("{s}/")));
329        if let Some(child_name) = in_child_graph {
330            let target_path = root.join(&edge.target);
331            if target_path.is_file() {
332                let hash = std::fs::read(&target_path).ok().map(|c| hash_bytes(&c));
333                graph.add_node(Node {
334                    path: edge.target.clone(),
335                    node_type: NodeType::External,
336                    hash,
337                    graph: Some(child_name.clone()),
338                    is_graph: false,
339                    metadata: HashMap::new(),
340                });
341                // Ensure the child graph Directory node exists
342                if !graph.nodes.contains_key(child_name) {
343                    let child_dir = root.join(child_name);
344                    let config_hash = std::fs::read(child_dir.join("drft.toml"))
345                        .ok()
346                        .map(|c| hash_bytes(&c));
347                    graph.add_node(Node {
348                        path: child_name.clone(),
349                        node_type: NodeType::Directory,
350                        hash: config_hash,
351                        graph: Some(".".into()),
352                        is_graph: true,
353                        metadata: HashMap::new(),
354                    });
355                    // Promote interface files into parent graph
356                    promote_interface_files(root, child_name, &mut graph, &mut implicit_edges);
357                }
358                // Synthetic coupling edge: child-graph file → Directory node
359                implicit_edges.push(Edge {
360                    source: edge.target.clone(),
361                    target: child_name.clone(),
362                    link: None,
363                    parser: edge.parser.clone(),
364                });
365            }
366            continue;
367        }
368
369        let target_path = root.join(&edge.target);
370
371        // Directory on disk → Directory node
372        if target_path.is_dir() {
373            let has_config = target_path.join("drft.toml").exists();
374            let hash = if has_config {
375                std::fs::read(target_path.join("drft.toml"))
376                    .ok()
377                    .map(|c| hash_bytes(&c))
378            } else {
379                None
380            };
381            graph.add_node(Node {
382                path: edge.target.clone(),
383                node_type: NodeType::Directory,
384                hash,
385                graph: Some(".".into()),
386                is_graph: has_config,
387                metadata: HashMap::new(),
388            });
389            if has_config {
390                promote_interface_files(root, &edge.target, &mut graph, &mut implicit_edges);
391            }
392            continue;
393        }
394
395        // File exists on disk but not in include → External (local)
396        if target_path.is_file() {
397            let hash = std::fs::read(&target_path).ok().map(|c| hash_bytes(&c));
398            graph.add_node(Node {
399                path: edge.target.clone(),
400                node_type: NodeType::External,
401                hash,
402                graph: Some(".".into()),
403                is_graph: false,
404                metadata: HashMap::new(),
405            });
406        }
407        // If doesn't exist: no node created. dangling-edge rule handles this.
408    }
409
410    // Probe filesystem properties for non-URI edge targets (stored per-target, not per-edge)
411    pending_edges.extend(implicit_edges);
412    for edge in &pending_edges {
413        if is_uri(&edge.target) || graph.target_properties.contains_key(&edge.target) {
414            continue;
415        }
416        let target_path = root.join(&edge.target);
417        let is_symlink = target_path.is_symlink();
418        let is_directory = target_path.is_dir();
419        let symlink_target = if is_symlink {
420            std::fs::read_link(&target_path)
421                .ok()
422                .map(|p| p.to_string_lossy().to_string())
423        } else {
424            None
425        };
426        graph.target_properties.insert(
427            edge.target.clone(),
428            TargetProperties {
429                is_symlink,
430                is_directory,
431                symlink_target,
432            },
433        );
434    }
435
436    // Add all edges (explicit + implicit) to the graph
437    for edge in pending_edges {
438        graph.add_edge(edge);
439    }
440
441    // Resolve interface from config (files included, ignore excluded)
442    if let Some(ref iface) = config.interface {
443        let ignore_set = crate::config::compile_globs(&iface.ignore)?;
444
445        let mut resolved = Vec::new();
446        for pattern in &iface.files {
447            if let Ok(glob) = globset::Glob::new(pattern) {
448                let matcher = glob.compile_matcher();
449                for path in graph.nodes.keys() {
450                    if matcher.is_match(path) {
451                        resolved.push(path.clone());
452                    }
453                }
454            } else {
455                if graph.nodes.contains_key(pattern) {
456                    resolved.push(pattern.clone());
457                }
458            }
459        }
460
461        if let Some(ref ignore) = ignore_set {
462            resolved.retain(|p| !ignore.is_match(p));
463        }
464
465        resolved.sort();
466        resolved.dedup();
467        graph.interface = resolved;
468    }
469
470    Ok(graph)
471}
472
473/// Normalize a relative path by resolving `.` and `..` components using Path APIs.
474/// Does not touch the filesystem. Always returns forward-slash separated paths.
475/// Preserves leading `..` that escape above the root — these indicate graph escape.
476pub fn normalize_relative_path(path: &str) -> String {
477    let mut parts: Vec<String> = Vec::new();
478    for component in Path::new(path).components() {
479        match component {
480            std::path::Component::CurDir => {}
481            std::path::Component::ParentDir => {
482                // Only pop if there's a normal component to pop (not a leading ..)
483                if parts.last().is_some_and(|p| p != "..") {
484                    parts.pop();
485                } else {
486                    parts.push("..".to_string());
487                }
488            }
489            std::path::Component::Normal(c) => parts.push(c.to_string_lossy().to_string()),
490            _ => {}
491        }
492    }
493    parts.join("/")
494}
495
496/// Normalized link target: the node identity and optional fragment metadata.
497struct NormalizedTarget {
498    /// The target path or URI with fragment stripped (used for node identity).
499    target: String,
500    /// The fragment portion (e.g., `#heading`), if any. Preserved as edge metadata.
501    fragment: Option<String>,
502}
503
504/// Normalize a raw link target from a parser.
505/// Returns None for targets that should be filtered (empty, anchor-only with no file target).
506/// Strips fragments for node identity but preserves them as metadata.
507fn normalize_link_target(raw: &str) -> Option<NormalizedTarget> {
508    let target = raw.trim();
509    if target.is_empty() {
510        return None;
511    }
512
513    // Anchor-only links (#heading) have no file target — drop them
514    if target.starts_with('#') {
515        return None;
516    }
517
518    // Split target and fragment at the first #
519    let (base, fragment) = match target.find('#') {
520        Some(idx) => (&target[..idx], Some(target[idx..].to_string())),
521        None => (target, None),
522    };
523
524    // After stripping fragment, if nothing remains, drop
525    if base.is_empty() {
526        return None;
527    }
528
529    Some(NormalizedTarget {
530        target: base.to_string(),
531        fragment,
532    })
533}
534
535/// Resolve a link target relative to a source file, producing a path relative to the graph root.
536/// Uses Path::join for correct platform-aware path handling.
537pub fn resolve_link(source_file: &str, raw_target: &str) -> String {
538    let source_path = Path::new(source_file);
539    let source_dir = source_path.parent().unwrap_or(Path::new(""));
540    let joined = source_dir.join(raw_target);
541    normalize_relative_path(&joined.to_string_lossy())
542}
543
544#[cfg(test)]
545pub mod test_helpers {
546    use super::*;
547
548    pub fn make_node(path: &str) -> Node {
549        Node {
550            path: path.into(),
551            node_type: NodeType::File,
552            hash: None,
553            graph: Some(".".into()),
554            is_graph: false,
555            metadata: HashMap::new(),
556        }
557    }
558
559    pub fn make_edge(source: &str, target: &str) -> Edge {
560        Edge {
561            source: source.into(),
562            target: target.into(),
563            link: None,
564            parser: "markdown".into(),
565        }
566    }
567
568    pub fn make_enriched(graph: Graph) -> crate::analyses::EnrichedGraph {
569        crate::analyses::enrich_graph(
570            graph,
571            std::path::Path::new("."),
572            &crate::config::Config::defaults(),
573            None,
574        )
575    }
576
577    pub fn make_enriched_with_root(
578        graph: Graph,
579        root: &std::path::Path,
580    ) -> crate::analyses::EnrichedGraph {
581        crate::analyses::enrich_graph(graph, root, &crate::config::Config::defaults(), None)
582    }
583}
584
585#[cfg(test)]
586mod tests {
587    use super::*;
588
589    #[test]
590    fn normalize_simple() {
591        assert_eq!(normalize_relative_path("a/b/c"), "a/b/c");
592    }
593
594    #[test]
595    fn normalize_dot() {
596        assert_eq!(normalize_relative_path("./a/./b"), "a/b");
597    }
598
599    #[test]
600    fn normalize_dotdot() {
601        assert_eq!(normalize_relative_path("a/b/../c"), "a/c");
602    }
603
604    #[test]
605    fn normalize_preserves_leading_dotdot() {
606        assert_eq!(normalize_relative_path("../a"), "../a");
607    }
608
609    #[test]
610    fn normalize_deep_escape() {
611        assert_eq!(normalize_relative_path("../../a"), "../../a");
612    }
613
614    #[test]
615    fn normalize_escape_after_descent() {
616        // guides/../../README.md -> ../README.md (one level above root)
617        assert_eq!(
618            normalize_relative_path("guides/../../README.md"),
619            "../README.md"
620        );
621    }
622
623    #[test]
624    fn resolve_same_dir() {
625        assert_eq!(resolve_link("index.md", "setup.md"), "setup.md");
626    }
627
628    #[test]
629    fn resolve_subdir() {
630        assert_eq!(
631            resolve_link("guides/intro.md", "setup.md"),
632            "guides/setup.md"
633        );
634    }
635
636    #[test]
637    fn resolve_parent() {
638        assert_eq!(resolve_link("guides/intro.md", "../config.md"), "config.md");
639    }
640
641    #[test]
642    fn graph_adjacency() {
643        let mut g = Graph::new();
644        g.add_node(Node {
645            path: "a.md".into(),
646            node_type: NodeType::File,
647            hash: None,
648            graph: None,
649            is_graph: false,
650            metadata: HashMap::new(),
651        });
652        g.add_node(Node {
653            path: "b.md".into(),
654            node_type: NodeType::File,
655            hash: None,
656            graph: None,
657            is_graph: false,
658            metadata: HashMap::new(),
659        });
660        g.add_edge(Edge {
661            source: "a.md".into(),
662            target: "b.md".into(),
663            link: None,
664            parser: "markdown".into(),
665        });
666        assert_eq!(g.forward["a.md"], vec![0]);
667        assert_eq!(g.reverse["b.md"], vec![0]);
668        assert!(!g.forward.contains_key("b.md"));
669    }
670
671    #[test]
672    fn fragment_edge_resolves_to_node() {
673        let mut g = Graph::new();
674        g.add_node(test_helpers::make_node("a.md"));
675        g.add_node(test_helpers::make_node("b.md"));
676        g.add_edge(Edge {
677            source: "a.md".into(),
678            target: "b.md".into(),
679            link: Some("b.md#heading".into()),
680            parser: "markdown".into(),
681        });
682        // target is the node ID
683        assert_eq!(g.edges[0].target, "b.md");
684        // reference carries the full original
685        assert_eq!(g.edges[0].link.as_deref(), Some("b.md#heading"));
686        // reverse map works directly on target
687        assert_eq!(g.reverse["b.md"], vec![0]);
688    }
689
690    #[test]
691    fn is_uri_detects_schemes() {
692        assert!(is_uri("http://example.com"));
693        assert!(is_uri("https://example.com"));
694        assert!(is_uri("mailto:user@example.com"));
695        assert!(is_uri("ftp://files.example.com"));
696        assert!(is_uri("tel:+1234567890"));
697        assert!(is_uri("ssh://git@github.com"));
698        assert!(is_uri("custom+scheme://foo"));
699    }
700
701    #[test]
702    fn is_uri_rejects_paths() {
703        assert!(!is_uri("setup.md"));
704        assert!(!is_uri("./relative/path.md"));
705        assert!(!is_uri("../parent.md"));
706        assert!(!is_uri("#heading"));
707        assert!(!is_uri(""));
708        assert!(!is_uri("path/with:colon.md")); // colon after slash = not a scheme
709    }
710
711    #[test]
712    fn normalize_strips_fragment() {
713        let n = normalize_link_target("file.md#heading").unwrap();
714        assert_eq!(n.target, "file.md");
715        assert_eq!(n.fragment.as_deref(), Some("#heading"));
716    }
717
718    #[test]
719    fn normalize_strips_uri_fragment() {
720        let n = normalize_link_target("https://example.com/page#section").unwrap();
721        assert_eq!(n.target, "https://example.com/page");
722        assert_eq!(n.fragment.as_deref(), Some("#section"));
723    }
724
725    #[test]
726    fn normalize_drops_anchor_only() {
727        assert!(normalize_link_target("#heading").is_none());
728    }
729
730    #[test]
731    fn normalize_drops_empty() {
732        assert!(normalize_link_target("").is_none());
733        assert!(normalize_link_target("  ").is_none());
734    }
735
736    #[test]
737    fn normalize_preserves_mailto() {
738        let n = normalize_link_target("mailto:user@example.com").unwrap();
739        assert_eq!(n.target, "mailto:user@example.com");
740        assert!(n.fragment.is_none());
741    }
742}