1use anyhow::Result;
2use std::collections::HashMap;
3use std::path::Path;
4
5use crate::config::Config;
6use crate::discovery::discover;
7use crate::parsers;
8
9pub fn is_uri(target: &str) -> bool {
16 match url::Url::parse(target) {
17 Ok(url) => {
18 if url.has_authority() {
19 return true;
20 }
21 matches!(
22 url.scheme(),
23 "mailto" | "tel" | "data" | "urn" | "javascript"
24 )
25 }
26 Err(_) => false,
27 }
28}
29
30#[derive(Debug, Clone, serde::Serialize)]
31pub struct Node {
32 pub path: String,
33 #[serde(rename = "type")]
35 pub node_type: Option<NodeType>,
36 pub included: bool,
38 #[serde(skip_serializing_if = "Option::is_none")]
39 pub hash: Option<String>,
40 #[serde(skip_serializing_if = "HashMap::is_empty")]
42 pub metadata: HashMap<String, serde_json::Value>,
43}
44
45#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
47#[serde(rename_all = "lowercase")]
48pub enum NodeType {
49 File,
50 Directory,
51 Symlink,
52 Uri,
53}
54
55#[derive(Debug, Clone)]
56pub struct Edge {
57 pub source: String,
58 pub target: String,
60 pub link: Option<String>,
62 pub parser: String,
64}
65
66#[derive(Debug, Default)]
67pub struct Graph {
68 pub nodes: HashMap<String, Node>,
69 pub edges: Vec<Edge>,
70 pub forward: HashMap<String, Vec<usize>>,
71 pub reverse: HashMap<String, Vec<usize>>,
72}
73
74impl Graph {
75 pub fn new() -> Self {
76 Self::default()
77 }
78
79 pub fn add_node(&mut self, node: Node) {
80 self.nodes.insert(node.path.clone(), node);
81 }
82
83 pub fn is_internal_edge(&self, edge: &Edge) -> bool {
85 self.nodes.get(&edge.target).is_some_and(|n| n.included)
86 }
87
88 pub fn add_edge(&mut self, edge: Edge) {
89 let idx = self.edges.len();
90 self.forward
91 .entry(edge.source.clone())
92 .or_default()
93 .push(idx);
94 self.reverse
95 .entry(edge.target.clone())
96 .or_default()
97 .push(idx);
98 self.edges.push(edge);
99 }
100
101 pub fn included_nodes(&self) -> impl Iterator<Item = (&String, &Node)> {
103 self.nodes.iter().filter(|(_, n)| n.included)
104 }
105
106 pub fn filter_by_parsers(&self, parsers: &[String]) -> Graph {
109 let mut filtered = Graph {
110 nodes: self.nodes.clone(),
111 ..Default::default()
112 };
113
114 for edge in &self.edges {
115 if parsers.iter().any(|p| p == &edge.parser) {
116 filtered.add_edge(edge.clone());
117 }
118 }
119
120 filtered
121 }
122}
123
124pub fn hash_bytes(content: &[u8]) -> String {
126 format!("b3:{}", blake3::hash(content).to_hex())
127}
128
129struct PendingEdge {
131 source: String,
132 target: String,
133 link: Option<String>,
134 parser: String,
135}
136
137pub fn build_graph(root: &Path, config: &Config) -> Result<Graph> {
144 let canonical_root = root.canonicalize()?;
145 let included_files = discover(root, &config.include, &config.exclude)?;
146 let include_globs = crate::config::compile_globs(&config.include)?;
147 let exclude_globs = crate::config::compile_globs(&config.exclude)?;
148 let mut graph = Graph::new();
149 let mut pending_edges: Vec<PendingEdge> = Vec::new();
150
151 let mut file_text: HashMap<String, String> = HashMap::new();
155
156 for file in &included_files {
157 let file_path = root.join(file);
158 let is_symlink = file_path
159 .symlink_metadata()
160 .is_ok_and(|m| m.file_type().is_symlink());
161
162 if is_symlink {
163 match file_path.canonicalize() {
164 Err(_) => {
165 eprintln!("warn: symlink '{file}' could not be resolved — skipping");
166 continue;
167 }
168 Ok(canonical) => {
169 let should_hash = canonical.starts_with(&canonical_root)
170 && canonical.strip_prefix(&canonical_root).is_ok_and(|rel| {
171 let rel_str = rel.to_string_lossy().replace('\\', "/");
172 include_globs
173 .as_ref()
174 .is_some_and(|set| set.is_match(rel_str.as_str()))
175 });
176
177 if should_hash {
178 let raw = std::fs::read(&file_path)?;
179 let hash = hash_bytes(&raw);
180 graph.add_node(Node {
181 path: file.clone(),
182 node_type: Some(NodeType::Symlink),
183 included: true,
184 hash: Some(hash),
185 metadata: HashMap::new(),
186 });
187 if let Ok(text) = String::from_utf8(raw) {
188 file_text.insert(file.clone(), text);
189 }
190 } else {
191 graph.add_node(Node {
192 path: file.clone(),
193 node_type: Some(NodeType::Symlink),
194 included: true,
195 hash: None,
196 metadata: HashMap::new(),
197 });
198 }
199 }
200 }
201 continue;
202 }
203
204 let raw = std::fs::read(&file_path)?;
205 let hash = hash_bytes(&raw);
206
207 graph.add_node(Node {
208 path: file.clone(),
209 node_type: Some(NodeType::File),
210 included: true,
211 hash: Some(hash),
212 metadata: HashMap::new(),
213 });
214
215 if let Ok(text) = String::from_utf8(raw) {
216 file_text.insert(file.clone(), text);
217 }
218 }
219
220 let parser_list = parsers::build_parsers(&config.parsers, config.config_dir.as_deref(), root);
222 let mut parser_files: Vec<Vec<String>> = vec![Vec::new(); parser_list.len()];
223
224 for file in &included_files {
225 for (i, parser) in parser_list.iter().enumerate() {
226 if parser.matches(file) {
227 parser_files[i].push(file.clone());
228 }
229 }
230 }
231
232 for (i, parser) in parser_list.iter().enumerate() {
234 let files: Vec<(&str, &str)> = parser_files[i]
235 .iter()
236 .filter_map(|path| {
237 file_text
238 .get(path)
239 .map(|content| (path.as_str(), content.as_str()))
240 })
241 .collect();
242
243 if files.is_empty() {
244 continue;
245 }
246
247 let batch_results = parser.parse_batch(&files);
248
249 for (file, result) in batch_results {
250 if let Some(metadata) = result.metadata
251 && let Some(node) = graph.nodes.get_mut(&file)
252 {
253 node.metadata.insert(parser.name().to_string(), metadata);
254 }
255
256 for link in result.links {
257 let normalized = match normalize_link_target(&link) {
258 Some(n) => n,
259 None => continue,
260 };
261
262 let target = if is_uri(&normalized.target) {
263 normalized.target
264 } else {
265 resolve_link(&file, &normalized.target)
266 };
267 let link = normalized.fragment.map(|frag| format!("{target}{frag}"));
268 pending_edges.push(PendingEdge {
269 source: file.clone(),
270 target,
271 link,
272 parser: parser.name().to_string(),
273 });
274 }
275 }
276 }
277
278 for pending in &pending_edges {
281 if graph.nodes.contains_key(&pending.target) {
282 continue;
283 }
284 if is_uri(&pending.target) {
285 graph.add_node(Node {
286 path: pending.target.clone(),
287 node_type: Some(NodeType::Uri),
288 included: false,
289 hash: None,
290 metadata: HashMap::new(),
291 });
292 continue;
293 }
294 let escapes_root = Path::new(&pending.target)
297 .components()
298 .next()
299 .is_some_and(|c| matches!(c, std::path::Component::ParentDir));
300 let is_absolute = Path::new(&pending.target).is_absolute();
301 let node_type = if is_absolute || escapes_root {
302 None
303 } else {
304 let target_path = root.join(&pending.target);
305 target_path.symlink_metadata().ok().map(|m| {
306 if m.file_type().is_symlink() {
307 NodeType::Symlink
308 } else if m.is_dir() {
309 NodeType::Directory
310 } else {
311 NodeType::File
312 }
313 })
314 };
315 let matches_include = !escapes_root
319 && !is_absolute
320 && include_globs
321 .as_ref()
322 .is_some_and(|set| set.is_match(&pending.target));
323 let matches_exclude = exclude_globs
324 .as_ref()
325 .is_some_and(|set| set.is_match(&pending.target));
326 let included = matches_include && !matches_exclude;
327 graph.add_node(Node {
328 path: pending.target.clone(),
329 node_type,
330 included,
331 hash: None,
332 metadata: HashMap::new(),
333 });
334 }
335
336 for file in &included_files {
339 let file_path = root.join(file);
340 if !file_path
341 .symlink_metadata()
342 .is_ok_and(|m| m.file_type().is_symlink())
343 {
344 continue;
345 }
346 if let Ok(link_target) = std::fs::read_link(&file_path) {
347 let resolved = if link_target.is_absolute() {
350 match link_target.canonicalize() {
351 Ok(canonical) => match canonical.strip_prefix(&canonical_root) {
352 Ok(rel) => rel.to_string_lossy().replace('\\', "/"),
353 Err(_) => continue, },
355 Err(_) => continue, }
357 } else {
358 resolve_link(file, &link_target.to_string_lossy())
359 };
360 let resolved_escapes = Path::new(&resolved)
362 .components()
363 .next()
364 .is_some_and(|c| matches!(c, std::path::Component::ParentDir));
365 if resolved_escapes {
366 continue;
367 }
368 if !graph.nodes.contains_key(&resolved) {
370 let resolved_path = root.join(&resolved);
371 let node_type = resolved_path.symlink_metadata().ok().map(|m| {
372 if m.file_type().is_symlink() {
373 NodeType::Symlink
374 } else if m.is_dir() {
375 NodeType::Directory
376 } else {
377 NodeType::File
378 }
379 });
380 graph.add_node(Node {
381 path: resolved.clone(),
382 node_type,
383 included: false,
384 hash: None,
385 metadata: HashMap::new(),
386 });
387 }
388 graph.add_edge(Edge {
389 source: file.clone(),
390 target: resolved,
391 link: None,
392 parser: "filesystem".into(),
393 });
394 }
395 }
396
397 for pending in pending_edges {
399 graph.add_edge(Edge {
400 source: pending.source,
401 target: pending.target,
402 link: pending.link,
403 parser: pending.parser,
404 });
405 }
406
407 Ok(graph)
408}
409
410pub fn normalize_relative_path(path: &str) -> String {
414 let mut parts: Vec<String> = Vec::new();
415 for component in Path::new(path).components() {
416 match component {
417 std::path::Component::CurDir => {}
418 std::path::Component::ParentDir => {
419 if parts.last().is_some_and(|p| p != "..") {
421 parts.pop();
422 } else {
423 parts.push("..".to_string());
424 }
425 }
426 std::path::Component::Normal(c) => parts.push(c.to_string_lossy().to_string()),
427 _ => {}
428 }
429 }
430 parts.join("/")
431}
432
433struct NormalizedTarget {
435 target: String,
437 fragment: Option<String>,
439}
440
441fn normalize_link_target(raw: &str) -> Option<NormalizedTarget> {
445 let target = raw.trim();
446 if target.is_empty() {
447 return None;
448 }
449
450 if target.starts_with('#') {
452 return None;
453 }
454
455 let (base, fragment) = match target.find('#') {
457 Some(idx) => (&target[..idx], Some(target[idx..].to_string())),
458 None => (target, None),
459 };
460
461 if base.is_empty() {
463 return None;
464 }
465
466 Some(NormalizedTarget {
467 target: base.to_string(),
468 fragment,
469 })
470}
471
472pub fn resolve_link(source_file: &str, raw_target: &str) -> String {
475 let source_path = Path::new(source_file);
476 let source_dir = source_path.parent().unwrap_or(Path::new(""));
477 let joined = source_dir.join(raw_target);
478 normalize_relative_path(&joined.to_string_lossy())
479}
480
481#[cfg(test)]
482pub mod test_helpers {
483 use super::*;
484
485 pub fn make_node(path: &str) -> Node {
486 Node {
487 path: path.into(),
488 node_type: Some(NodeType::File),
489 included: true,
490 hash: None,
491 metadata: HashMap::new(),
492 }
493 }
494
495 pub fn make_edge(source: &str, target: &str) -> Edge {
496 Edge {
497 source: source.into(),
498 target: target.into(),
499 link: None,
500 parser: "markdown".into(),
501 }
502 }
503
504 pub fn make_enriched(graph: Graph) -> crate::analyses::EnrichedGraph {
505 crate::analyses::enrich_graph(
506 graph,
507 std::path::Path::new("."),
508 &crate::config::Config::defaults(),
509 None,
510 )
511 }
512
513 pub fn make_enriched_with_root(
514 graph: Graph,
515 root: &std::path::Path,
516 ) -> crate::analyses::EnrichedGraph {
517 crate::analyses::enrich_graph(graph, root, &crate::config::Config::defaults(), None)
518 }
519}
520
521#[cfg(test)]
522mod tests {
523 use super::*;
524
525 #[test]
526 fn normalize_simple() {
527 assert_eq!(normalize_relative_path("a/b/c"), "a/b/c");
528 }
529
530 #[test]
531 fn normalize_dot() {
532 assert_eq!(normalize_relative_path("./a/./b"), "a/b");
533 }
534
535 #[test]
536 fn normalize_dotdot() {
537 assert_eq!(normalize_relative_path("a/b/../c"), "a/c");
538 }
539
540 #[test]
541 fn normalize_preserves_leading_dotdot() {
542 assert_eq!(normalize_relative_path("../a"), "../a");
543 }
544
545 #[test]
546 fn normalize_deep_escape() {
547 assert_eq!(normalize_relative_path("../../a"), "../../a");
548 }
549
550 #[test]
551 fn normalize_escape_after_descent() {
552 assert_eq!(
554 normalize_relative_path("guides/../../README.md"),
555 "../README.md"
556 );
557 }
558
559 #[test]
560 fn resolve_same_dir() {
561 assert_eq!(resolve_link("index.md", "setup.md"), "setup.md");
562 }
563
564 #[test]
565 fn resolve_subdir() {
566 assert_eq!(
567 resolve_link("guides/intro.md", "setup.md"),
568 "guides/setup.md"
569 );
570 }
571
572 #[test]
573 fn resolve_parent() {
574 assert_eq!(resolve_link("guides/intro.md", "../config.md"), "config.md");
575 }
576
577 #[test]
578 fn graph_adjacency() {
579 let mut g = Graph::new();
580 g.add_node(test_helpers::make_node("a.md"));
581 g.add_node(test_helpers::make_node("b.md"));
582 g.add_edge(test_helpers::make_edge("a.md", "b.md"));
583 assert_eq!(g.forward["a.md"], vec![0]);
584 assert_eq!(g.reverse["b.md"], vec![0]);
585 assert!(!g.forward.contains_key("b.md"));
586 }
587
588 #[test]
589 fn fragment_edge_resolves_to_node() {
590 let mut g = Graph::new();
591 g.add_node(test_helpers::make_node("a.md"));
592 g.add_node(test_helpers::make_node("b.md"));
593 g.add_edge(Edge {
594 source: "a.md".into(),
595 target: "b.md".into(),
596 link: Some("b.md#heading".into()),
597 parser: "markdown".into(),
598 });
599 assert_eq!(g.edges[0].target, "b.md");
601 assert_eq!(g.edges[0].link.as_deref(), Some("b.md#heading"));
603 assert_eq!(g.reverse["b.md"], vec![0]);
605 }
606
607 #[test]
608 fn is_uri_detects_schemes() {
609 assert!(is_uri("http://example.com"));
610 assert!(is_uri("https://example.com"));
611 assert!(is_uri("mailto:user@example.com"));
612 assert!(is_uri("ftp://files.example.com"));
613 assert!(is_uri("tel:+1234567890"));
614 assert!(is_uri("ssh://git@github.com"));
615 assert!(is_uri("custom+scheme://foo"));
616 }
617
618 #[test]
619 fn is_uri_rejects_paths() {
620 assert!(!is_uri("setup.md"));
621 assert!(!is_uri("./relative/path.md"));
622 assert!(!is_uri("../parent.md"));
623 assert!(!is_uri("#heading"));
624 assert!(!is_uri(""));
625 assert!(!is_uri("path/with:colon.md"));
626 }
627
628 #[test]
629 fn is_uri_rejects_bare_schemes() {
630 assert!(!is_uri("name: foo bar bazz"));
633 assert!(!is_uri("status: draft"));
634 assert!(!is_uri("title: My Document"));
635 assert!(!is_uri("name:foo"));
636 assert!(!is_uri("x:y"));
637 }
638
639 #[test]
640 fn normalize_strips_fragment() {
641 let n = normalize_link_target("file.md#heading").unwrap();
642 assert_eq!(n.target, "file.md");
643 assert_eq!(n.fragment.as_deref(), Some("#heading"));
644 }
645
646 #[test]
647 fn normalize_strips_uri_fragment() {
648 let n = normalize_link_target("https://example.com/page#section").unwrap();
649 assert_eq!(n.target, "https://example.com/page");
650 assert_eq!(n.fragment.as_deref(), Some("#section"));
651 }
652
653 #[test]
654 fn normalize_drops_anchor_only() {
655 assert!(normalize_link_target("#heading").is_none());
656 }
657
658 #[test]
659 fn normalize_drops_empty() {
660 assert!(normalize_link_target("").is_none());
661 assert!(normalize_link_target(" ").is_none());
662 }
663
664 #[test]
665 fn normalize_preserves_mailto() {
666 let n = normalize_link_target("mailto:user@example.com").unwrap();
667 assert_eq!(n.target, "mailto:user@example.com");
668 assert!(n.fragment.is_none());
669 }
670
671 fn edge_with_parser(source: &str, target: &str, parser: &str) -> Edge {
672 Edge {
673 source: source.into(),
674 target: target.into(),
675 link: None,
676 parser: parser.into(),
677 }
678 }
679
680 #[test]
681 fn filter_by_single_parser() {
682 let mut g = Graph::new();
683 g.add_node(test_helpers::make_node("a.md"));
684 g.add_node(test_helpers::make_node("b.md"));
685 g.add_node(test_helpers::make_node("c.md"));
686 g.add_edge(edge_with_parser("a.md", "b.md", "markdown"));
687 g.add_edge(edge_with_parser("a.md", "c.md", "frontmatter"));
688
689 let filtered = g.filter_by_parsers(&["frontmatter".into()]);
690 assert_eq!(filtered.edges.len(), 1);
691 assert_eq!(filtered.edges[0].target, "c.md");
692 assert_eq!(filtered.edges[0].parser, "frontmatter");
693 }
694
695 #[test]
696 fn filter_preserves_all_nodes() {
697 let mut g = Graph::new();
698 g.add_node(test_helpers::make_node("a.md"));
699 g.add_node(test_helpers::make_node("b.md"));
700 g.add_edge(edge_with_parser("a.md", "b.md", "markdown"));
701
702 let filtered = g.filter_by_parsers(&["frontmatter".into()]);
703 assert_eq!(filtered.nodes.len(), 2);
704 assert!(filtered.nodes.contains_key("a.md"));
705 assert!(filtered.nodes.contains_key("b.md"));
706 assert!(filtered.edges.is_empty());
707 }
708
709 #[test]
710 fn filter_rebuilds_adjacency_maps() {
711 let mut g = Graph::new();
712 g.add_node(test_helpers::make_node("a.md"));
713 g.add_node(test_helpers::make_node("b.md"));
714 g.add_node(test_helpers::make_node("c.md"));
715 g.add_edge(edge_with_parser("a.md", "b.md", "markdown"));
716 g.add_edge(edge_with_parser("a.md", "c.md", "frontmatter"));
717
718 let filtered = g.filter_by_parsers(&["frontmatter".into()]);
719 assert_eq!(filtered.forward["a.md"], vec![0]);
720 assert_eq!(filtered.reverse["c.md"], vec![0]);
721 assert!(!filtered.reverse.contains_key("b.md"));
722 }
723
724 #[test]
725 fn filter_by_multiple_parsers() {
726 let mut g = Graph::new();
727 g.add_node(test_helpers::make_node("a.md"));
728 g.add_node(test_helpers::make_node("b.md"));
729 g.add_node(test_helpers::make_node("c.md"));
730 g.add_edge(edge_with_parser("a.md", "b.md", "markdown"));
731 g.add_edge(edge_with_parser("a.md", "c.md", "frontmatter"));
732
733 let filtered = g.filter_by_parsers(&["markdown".into(), "frontmatter".into()]);
734 assert_eq!(filtered.edges.len(), 2);
735 }
736
737 #[test]
738 fn filter_empty_parsers_removes_all_edges() {
739 let mut g = Graph::new();
740 g.add_node(test_helpers::make_node("a.md"));
741 g.add_node(test_helpers::make_node("b.md"));
742 g.add_edge(test_helpers::make_edge("a.md", "b.md"));
743
744 let filtered = g.filter_by_parsers(&[]);
745 assert!(filtered.edges.is_empty());
746 assert_eq!(filtered.nodes.len(), 2);
747 }
748}