Skip to main content

drft/
graph.rs

1use anyhow::Result;
2use std::collections::HashMap;
3use std::path::Path;
4
5use crate::config::Config;
6use crate::discovery::{discover, find_child_graphs};
7use crate::parsers;
8
9/// Check if a target string is a URI (has a scheme per RFC 3986).
10/// A scheme is `[a-zA-Z][a-zA-Z0-9+.-]*:` — e.g., `http:`, `mailto:`, `ftp:`, `tel:`.
11pub fn is_uri(target: &str) -> bool {
12    let bytes = target.as_bytes();
13    if bytes.is_empty() || !bytes[0].is_ascii_alphabetic() {
14        return false;
15    }
16    for &b in &bytes[1..] {
17        if b == b':' {
18            return true;
19        }
20        if !b.is_ascii_alphanumeric() && b != b'+' && b != b'-' && b != b'.' {
21            return false;
22        }
23    }
24    false
25}
26
27#[derive(
28    Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, serde::Serialize, serde::Deserialize,
29)]
30#[serde(rename_all = "lowercase")]
31pub enum NodeType {
32    File,
33    External,
34    Graph,
35}
36
37#[derive(Debug, Clone, serde::Serialize)]
38pub struct Node {
39    pub path: String,
40    pub node_type: NodeType,
41    pub hash: Option<String>,
42    /// If set, this node lives in a child graph (value is the child graph's path).
43    #[serde(skip_serializing_if = "Option::is_none")]
44    pub graph: Option<String>,
45    /// Structured metadata from parsers, keyed by parser name.
46    #[serde(skip_serializing_if = "HashMap::is_empty")]
47    pub metadata: HashMap<String, serde_json::Value>,
48}
49
50#[derive(Debug, Clone)]
51pub struct Edge {
52    pub source: String,
53    /// Node identity — always matches a key in `graph.nodes` (or is a dangling target).
54    /// Fragment-stripped: `bar.md`, not `bar.md#heading`.
55    pub target: String,
56    /// Original link when it differs from target (e.g., `bar.md#heading`).
57    /// Absent when the link resolved to exactly the node ID.
58    pub link: Option<String>,
59    /// Which parser discovered this edge (provenance).
60    pub parser: String,
61}
62
63/// Filesystem properties of an edge target, probed during graph building.
64/// Stored per-target on the Graph, not per-edge.
65#[derive(Debug, Clone, Default)]
66pub struct TargetProperties {
67    pub is_symlink: bool,
68    pub is_directory: bool,
69    pub symlink_target: Option<String>,
70}
71
72#[derive(Debug, Default)]
73pub struct Graph {
74    pub nodes: HashMap<String, Node>,
75    pub edges: Vec<Edge>,
76    pub forward: HashMap<String, Vec<usize>>,
77    pub reverse: HashMap<String, Vec<usize>>,
78    pub child_graphs: Vec<String>,
79    /// Resolved interface nodes from config (empty = open graph).
80    pub interface: Vec<String>,
81    /// Filesystem properties of edge targets, keyed by node identity (fragment-stripped).
82    pub target_properties: HashMap<String, TargetProperties>,
83}
84
85impl Graph {
86    pub fn new() -> Self {
87        Self::default()
88    }
89
90    /// Child graphs (nodes with type Graph).
91    #[allow(dead_code)]
92    pub fn children(&self) -> impl Iterator<Item = &Node> {
93        self.nodes
94            .values()
95            .filter(|n| n.node_type == NodeType::Graph)
96    }
97
98    /// Whether a node is part of this graph's interface.
99    #[allow(dead_code)]
100    pub fn is_interfaced(&self, path: &str) -> bool {
101        self.interface.iter().any(|e| e == path)
102    }
103
104    pub fn add_node(&mut self, node: Node) {
105        self.nodes.insert(node.path.clone(), node);
106    }
107
108    /// Returns true for File nodes (excludes External and Graph).
109    /// Used by structural analyses that operate only on declared file-backed nodes.
110    pub fn is_file_node(&self, path: &str) -> bool {
111        self.nodes
112            .get(path)
113            .is_some_and(|n| n.node_type == NodeType::File)
114    }
115
116    pub fn add_edge(&mut self, edge: Edge) {
117        let idx = self.edges.len();
118        self.forward
119            .entry(edge.source.clone())
120            .or_default()
121            .push(idx);
122        self.reverse
123            .entry(edge.target.clone())
124            .or_default()
125            .push(idx);
126        self.edges.push(edge);
127    }
128
129    /// Get filesystem properties for an edge target.
130    pub fn target_props(&self, target: &str) -> Option<&TargetProperties> {
131        self.target_properties.get(target)
132    }
133}
134
135/// Hash file contents with BLAKE3, returning `b3:<hex>`.
136pub fn hash_bytes(content: &[u8]) -> String {
137    format!("b3:{}", blake3::hash(content).to_hex())
138}
139
140/// Build a graph from files in `root`.
141///
142/// 1. Discover File nodes via `include`/`exclude` — hash raw bytes for all.
143/// 2. Read text content for parser input (graceful skip for binary files).
144/// 3. Run parsers to extract edges.
145/// 4. Edge targets outside `include` become External nodes (not tracked).
146pub fn build_graph(root: &Path, config: &Config) -> Result<Graph> {
147    let included_files = discover(root, &config.include, &config.exclude)?;
148    let child_graphs = find_child_graphs(root, &config.exclude)?;
149    let mut graph = Graph::new();
150    graph.child_graphs = child_graphs;
151    let mut pending_edges = Vec::new();
152
153    // 1. Create File nodes for everything in include — hash raw bytes.
154    //    Separately read text content for files parsers will need.
155    let mut file_text: HashMap<String, String> = HashMap::new(); // path → text content
156
157    for file in &included_files {
158        let file_path = root.join(file);
159        let raw = std::fs::read(&file_path)?;
160        let hash = hash_bytes(&raw);
161
162        graph.add_node(Node {
163            path: file.clone(),
164            node_type: NodeType::File,
165            hash: Some(hash),
166            graph: None,
167            metadata: HashMap::new(),
168        });
169
170        // Try to read as text for parser input — binary files just won't have text
171        if let Ok(text) = String::from_utf8(raw) {
172            file_text.insert(file.clone(), text);
173        }
174    }
175
176    // 2. Build parser registry and determine which files each parser receives
177    let parser_list = parsers::build_parsers(&config.parsers, config.config_dir.as_deref(), root);
178    let mut parser_files: Vec<Vec<String>> = vec![Vec::new(); parser_list.len()];
179
180    for file in &included_files {
181        for (i, parser) in parser_list.iter().enumerate() {
182            if parser.matches(file) {
183                parser_files[i].push(file.clone());
184            }
185        }
186    }
187
188    // 3. Run each parser in batch mode
189    for (i, parser) in parser_list.iter().enumerate() {
190        let files: Vec<(&str, &str)> = parser_files[i]
191            .iter()
192            .filter_map(|path| {
193                file_text
194                    .get(path)
195                    .map(|content| (path.as_str(), content.as_str()))
196            })
197            .collect();
198
199        if files.is_empty() {
200            continue;
201        }
202
203        let batch_results = parser.parse_batch(&files);
204
205        for (file, result) in batch_results {
206            // Attach metadata to node if parser returned it
207            if let Some(metadata) = result.metadata
208                && let Some(node) = graph.nodes.get_mut(&file)
209            {
210                node.metadata.insert(parser.name().to_string(), metadata);
211            }
212
213            for link in result.links {
214                let normalized = match normalize_link_target(&link) {
215                    Some(n) => n,
216                    None => continue, // filtered (empty, anchor-only)
217                };
218
219                let target = if is_uri(&normalized.target) {
220                    normalized.target
221                } else {
222                    resolve_link(&file, &normalized.target)
223                };
224                // link carries the full original when it has a fragment
225                let link = normalized.fragment.map(|frag| format!("{target}{frag}"));
226                pending_edges.push(Edge {
227                    source: file.clone(),
228                    target,
229                    link,
230                    parser: parser.name().to_string(),
231                });
232            }
233        }
234    }
235
236    // 4. Create Graph nodes for child graphs
237    let graph_prefixes: Vec<String> = graph.child_graphs.clone();
238    for graph_dir in &graph_prefixes {
239        graph.add_node(Node {
240            path: graph_dir.clone(),
241            node_type: NodeType::Graph,
242            hash: None,
243            graph: None,
244            metadata: HashMap::new(),
245        });
246    }
247
248    // 5. Classify edge targets not already in the graph.
249    //    edge.target is already the node identity (fragment-stripped).
250    //    - URIs → External
251    //    - Child graph files (exist on disk) → External with graph field
252    //    - Files on disk outside include → External
253    //    - Doesn't exist / is directory → no node (dangling-edge / directory-edge candidates)
254    let mut implicit_edges = Vec::new();
255    for edge in &pending_edges {
256        if graph.nodes.contains_key(&edge.target) {
257            continue;
258        }
259
260        // URIs → External
261        if is_uri(&edge.target) {
262            graph.add_node(Node {
263                path: edge.target.clone(),
264                node_type: NodeType::External,
265                hash: None,
266                graph: None,
267                metadata: HashMap::new(),
268            });
269            continue;
270        }
271
272        // Target inside a child graph → External with graph field (if file exists)
273        let in_child_graph = graph_prefixes
274            .iter()
275            .find(|s| edge.target.starts_with(s.as_str()));
276        if let Some(graph_prefix) = in_child_graph {
277            let target_path = root.join(&edge.target);
278            if target_path.is_file() {
279                graph.add_node(Node {
280                    path: edge.target.clone(),
281                    node_type: NodeType::External,
282                    hash: None,
283                    graph: Some(graph_prefix.clone()),
284                    metadata: HashMap::new(),
285                });
286                // Synthetic coupling edge: child-graph file → Graph node
287                implicit_edges.push(Edge {
288                    source: edge.target.clone(),
289                    target: graph_prefix.clone(),
290                    link: None,
291                    parser: edge.parser.clone(),
292                });
293            }
294            continue;
295        }
296
297        // File exists on disk but not in include → External (validated, not tracked)
298        let target_path = root.join(&edge.target);
299        if target_path.is_file() {
300            graph.add_node(Node {
301                path: edge.target.clone(),
302                node_type: NodeType::External,
303                hash: None,
304                graph: None,
305                metadata: HashMap::new(),
306            });
307        }
308        // If doesn't exist or is a directory: no node created.
309        // dangling-edge and directory-edge rules handle these cases.
310    }
311
312    // Probe filesystem properties for non-URI edge targets (stored per-target, not per-edge)
313    pending_edges.extend(implicit_edges);
314    for edge in &pending_edges {
315        if is_uri(&edge.target) || graph.target_properties.contains_key(&edge.target) {
316            continue;
317        }
318        let target_path = root.join(&edge.target);
319        let is_symlink = target_path.is_symlink();
320        let is_directory = target_path.is_dir();
321        let symlink_target = if is_symlink {
322            std::fs::read_link(&target_path)
323                .ok()
324                .map(|p| p.to_string_lossy().to_string())
325        } else {
326            None
327        };
328        graph.target_properties.insert(
329            edge.target.clone(),
330            TargetProperties {
331                is_symlink,
332                is_directory,
333                symlink_target,
334            },
335        );
336    }
337
338    // Add all edges (explicit + implicit) to the graph
339    for edge in pending_edges {
340        graph.add_edge(edge);
341    }
342
343    // Resolve interface from config
344    if let Some(ref iface) = config.interface {
345        let mut resolved = Vec::new();
346        for pattern in &iface.nodes {
347            if let Ok(glob) = globset::Glob::new(pattern) {
348                let matcher = glob.compile_matcher();
349                for path in graph.nodes.keys() {
350                    if matcher.is_match(path) {
351                        resolved.push(path.clone());
352                    }
353                }
354            } else {
355                // Treat as literal path
356                if graph.nodes.contains_key(pattern) {
357                    resolved.push(pattern.clone());
358                }
359            }
360        }
361        resolved.sort();
362        resolved.dedup();
363        graph.interface = resolved;
364    }
365
366    Ok(graph)
367}
368
369/// Normalize a relative path by resolving `.` and `..` components using Path APIs.
370/// Does not touch the filesystem. Always returns forward-slash separated paths.
371/// Preserves leading `..` that escape above the root — these indicate graph escape.
372pub fn normalize_relative_path(path: &str) -> String {
373    let mut parts: Vec<String> = Vec::new();
374    for component in Path::new(path).components() {
375        match component {
376            std::path::Component::CurDir => {}
377            std::path::Component::ParentDir => {
378                // Only pop if there's a normal component to pop (not a leading ..)
379                if parts.last().is_some_and(|p| p != "..") {
380                    parts.pop();
381                } else {
382                    parts.push("..".to_string());
383                }
384            }
385            std::path::Component::Normal(c) => parts.push(c.to_string_lossy().to_string()),
386            _ => {}
387        }
388    }
389    parts.join("/")
390}
391
392/// Normalized link target: the node identity and optional fragment metadata.
393struct NormalizedTarget {
394    /// The target path or URI with fragment stripped (used for node identity).
395    target: String,
396    /// The fragment portion (e.g., `#heading`), if any. Preserved as edge metadata.
397    fragment: Option<String>,
398}
399
400/// Normalize a raw link target from a parser.
401/// Returns None for targets that should be filtered (empty, anchor-only with no file target).
402/// Strips fragments for node identity but preserves them as metadata.
403fn normalize_link_target(raw: &str) -> Option<NormalizedTarget> {
404    let target = raw.trim();
405    if target.is_empty() {
406        return None;
407    }
408
409    // Anchor-only links (#heading) have no file target — drop them
410    if target.starts_with('#') {
411        return None;
412    }
413
414    // Split target and fragment at the first #
415    let (base, fragment) = match target.find('#') {
416        Some(idx) => (&target[..idx], Some(target[idx..].to_string())),
417        None => (target, None),
418    };
419
420    // After stripping fragment, if nothing remains, drop
421    if base.is_empty() {
422        return None;
423    }
424
425    Some(NormalizedTarget {
426        target: base.to_string(),
427        fragment,
428    })
429}
430
431/// Resolve a link target relative to a source file, producing a path relative to the graph root.
432/// Uses Path::join for correct platform-aware path handling.
433pub fn resolve_link(source_file: &str, raw_target: &str) -> String {
434    let source_path = Path::new(source_file);
435    let source_dir = source_path.parent().unwrap_or(Path::new(""));
436    let joined = source_dir.join(raw_target);
437    normalize_relative_path(&joined.to_string_lossy())
438}
439
440#[cfg(test)]
441pub mod test_helpers {
442    use super::*;
443
444    pub fn make_node(path: &str) -> Node {
445        Node {
446            path: path.into(),
447            node_type: NodeType::File,
448            hash: None,
449            graph: None,
450            metadata: HashMap::new(),
451        }
452    }
453
454    pub fn make_edge(source: &str, target: &str) -> Edge {
455        Edge {
456            source: source.into(),
457            target: target.into(),
458            link: None,
459            parser: "markdown".into(),
460        }
461    }
462}
463
464#[cfg(test)]
465mod tests {
466    use super::*;
467
468    #[test]
469    fn normalize_simple() {
470        assert_eq!(normalize_relative_path("a/b/c"), "a/b/c");
471    }
472
473    #[test]
474    fn normalize_dot() {
475        assert_eq!(normalize_relative_path("./a/./b"), "a/b");
476    }
477
478    #[test]
479    fn normalize_dotdot() {
480        assert_eq!(normalize_relative_path("a/b/../c"), "a/c");
481    }
482
483    #[test]
484    fn normalize_preserves_leading_dotdot() {
485        assert_eq!(normalize_relative_path("../a"), "../a");
486    }
487
488    #[test]
489    fn normalize_deep_escape() {
490        assert_eq!(normalize_relative_path("../../a"), "../../a");
491    }
492
493    #[test]
494    fn normalize_escape_after_descent() {
495        // guides/../../README.md -> ../README.md (one level above root)
496        assert_eq!(
497            normalize_relative_path("guides/../../README.md"),
498            "../README.md"
499        );
500    }
501
502    #[test]
503    fn resolve_same_dir() {
504        assert_eq!(resolve_link("index.md", "setup.md"), "setup.md");
505    }
506
507    #[test]
508    fn resolve_subdir() {
509        assert_eq!(
510            resolve_link("guides/intro.md", "setup.md"),
511            "guides/setup.md"
512        );
513    }
514
515    #[test]
516    fn resolve_parent() {
517        assert_eq!(resolve_link("guides/intro.md", "../config.md"), "config.md");
518    }
519
520    #[test]
521    fn graph_adjacency() {
522        let mut g = Graph::new();
523        g.add_node(Node {
524            path: "a.md".into(),
525            node_type: NodeType::File,
526            hash: None,
527            graph: None,
528            metadata: HashMap::new(),
529        });
530        g.add_node(Node {
531            path: "b.md".into(),
532            node_type: NodeType::File,
533            hash: None,
534            graph: None,
535            metadata: HashMap::new(),
536        });
537        g.add_edge(Edge {
538            source: "a.md".into(),
539            target: "b.md".into(),
540            link: None,
541            parser: "markdown".into(),
542        });
543        assert_eq!(g.forward["a.md"], vec![0]);
544        assert_eq!(g.reverse["b.md"], vec![0]);
545        assert!(!g.forward.contains_key("b.md"));
546    }
547
548    #[test]
549    fn fragment_edge_resolves_to_node() {
550        let mut g = Graph::new();
551        g.add_node(test_helpers::make_node("a.md"));
552        g.add_node(test_helpers::make_node("b.md"));
553        g.add_edge(Edge {
554            source: "a.md".into(),
555            target: "b.md".into(),
556            link: Some("b.md#heading".into()),
557            parser: "markdown".into(),
558        });
559        // target is the node ID
560        assert_eq!(g.edges[0].target, "b.md");
561        // reference carries the full original
562        assert_eq!(g.edges[0].link.as_deref(), Some("b.md#heading"));
563        // reverse map works directly on target
564        assert_eq!(g.reverse["b.md"], vec![0]);
565    }
566
567    #[test]
568    fn is_uri_detects_schemes() {
569        assert!(is_uri("http://example.com"));
570        assert!(is_uri("https://example.com"));
571        assert!(is_uri("mailto:user@example.com"));
572        assert!(is_uri("ftp://files.example.com"));
573        assert!(is_uri("tel:+1234567890"));
574        assert!(is_uri("ssh://git@github.com"));
575        assert!(is_uri("custom+scheme://foo"));
576    }
577
578    #[test]
579    fn is_uri_rejects_paths() {
580        assert!(!is_uri("setup.md"));
581        assert!(!is_uri("./relative/path.md"));
582        assert!(!is_uri("../parent.md"));
583        assert!(!is_uri("#heading"));
584        assert!(!is_uri(""));
585        assert!(!is_uri("path/with:colon.md")); // colon after slash = not a scheme
586    }
587
588    #[test]
589    fn normalize_strips_fragment() {
590        let n = normalize_link_target("file.md#heading").unwrap();
591        assert_eq!(n.target, "file.md");
592        assert_eq!(n.fragment.as_deref(), Some("#heading"));
593    }
594
595    #[test]
596    fn normalize_strips_uri_fragment() {
597        let n = normalize_link_target("https://example.com/page#section").unwrap();
598        assert_eq!(n.target, "https://example.com/page");
599        assert_eq!(n.fragment.as_deref(), Some("#section"));
600    }
601
602    #[test]
603    fn normalize_drops_anchor_only() {
604        assert!(normalize_link_target("#heading").is_none());
605    }
606
607    #[test]
608    fn normalize_drops_empty() {
609        assert!(normalize_link_target("").is_none());
610        assert!(normalize_link_target("  ").is_none());
611    }
612
613    #[test]
614    fn normalize_preserves_mailto() {
615        let n = normalize_link_target("mailto:user@example.com").unwrap();
616        assert_eq!(n.target, "mailto:user@example.com");
617        assert!(n.fragment.is_none());
618    }
619}