Skip to main content

vaultdb_core/
links.rs

1//! [`LinkGraph`] (the citation graph from a vault's `[[wikilinks]]`) plus the
2//! traversal types: [`Direction`], [`GraphScope`], [`UnresolvedLink`].
3//! Supports outgoing/incoming queries, BFS traversal, and unresolved-link
4//! discovery.
5
6use std::collections::{BTreeMap, BTreeSet};
7
8use regex::Regex;
9use std::sync::LazyLock;
10
11use crate::record::{Record, Value};
12
13/// The direction of edges to follow when querying or traversing the link
14/// graph: outgoing wikilinks (this note → others), incoming backlinks, or
15/// both at once.
16#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
17pub enum Direction {
18    Outgoing,
19    Incoming,
20    Both,
21}
22
23/// What subset of the vault to build the link graph over.
24#[derive(Debug, Clone)]
25pub enum GraphScope {
26    All,
27    Folder(String),
28    Where(crate::query::Expr),
29}
30
31/// A wikilink whose target file does not exist among the records the graph
32/// was built from.
33#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
34pub struct UnresolvedLink {
35    pub source: String,
36    pub target: String,
37}
38
39static WIKI_LINK_RE: LazyLock<Regex> =
40    LazyLock::new(|| Regex::new(r"\[\[([^\]\|#]+)(?:#[^\]\|]*)?\|?[^\]]*\]\]").unwrap());
41
42static FENCED_CODE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?s)```.*?```").unwrap());
43
44static INLINE_CODE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"`[^`]+`").unwrap());
45
46/// Markdown inline link `[label](url)`. Used by [`extract_markdown_links`].
47static MD_LINK_RE: LazyLock<Regex> =
48    LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").unwrap());
49
50/// Strip code blocks (fenced and inline) from content to avoid false link extraction.
51fn strip_code_blocks(content: &str) -> String {
52    let without_fenced = FENCED_CODE_RE.replace_all(content, "");
53    INLINE_CODE_RE.replace_all(&without_fenced, "").into_owned()
54}
55
56/// Extract all wiki-link targets from a string.
57/// Handles: [[Note]], [[Note|alias]], [[Note#section]], [[Note#section|alias]]
58fn extract_links_from_str(text: &str) -> Vec<String> {
59    WIKI_LINK_RE
60        .captures_iter(text)
61        .map(|cap| cap[1].trim().to_string())
62        .collect()
63}
64
65/// Extract all outgoing wiki-links from a record's full file content.
66/// Strips code blocks first to avoid false positives.
67pub fn extract_links(content: &str) -> BTreeSet<String> {
68    let cleaned = strip_code_blocks(content);
69    let mut links = BTreeSet::new();
70    for link in extract_links_from_str(&cleaned) {
71        links.insert(link);
72    }
73    links
74}
75
76/// Extract markdown links `[label](url)` from `content` as `(label, url)`
77/// pairs, in document order. Strips code blocks first (so links inside
78/// fenced/inline code are ignored) and skips image embeds (`![alt](url)`).
79/// Wiki-links `[[Note]]` are never matched — they have no `](`.
80pub fn extract_markdown_links(content: &str) -> Vec<(String, String)> {
81    let cleaned = strip_code_blocks(content);
82    let mut out = Vec::new();
83    for cap in MD_LINK_RE.captures_iter(&cleaned) {
84        let whole = cap.get(0).expect("capture group 0 always present");
85        // Skip the `[label](url)` that belongs to an image embed `![alt](url)`.
86        if cleaned[..whole.start()].ends_with('!') {
87            continue;
88        }
89        out.push((cap[1].trim().to_string(), cap[2].trim().to_string()));
90    }
91    out
92}
93
94/// Extract links from a record. Requires raw_content to be loaded.
95pub fn record_links(record: &Record) -> BTreeSet<String> {
96    match &record.raw_content {
97        Some(content) => extract_links(content),
98        None => BTreeSet::new(),
99    }
100}
101
102/// The citation graph extracted from a vault's `[[wikilinks]]`.
103///
104/// Maps note name → outgoing/incoming link sets, handles duplicate filenames
105/// across folders via path-based resolution, and retains a record-by-name map
106/// so `LinkPredicate::Where` can recurse the predicate into linked records.
107#[derive(Debug, Default)]
108pub struct LinkGraph {
109    /// note name -> outgoing link targets (as written in the wiki-links)
110    outgoing: BTreeMap<String, BTreeSet<String>>,
111    /// note name -> names of notes that link to it
112    incoming: BTreeMap<String, BTreeSet<String>>,
113    /// filename -> list of relative paths (for detecting duplicates)
114    name_to_paths: BTreeMap<String, Vec<String>>,
115    /// note name -> Record (for link-predicate evaluation)
116    records_by_name: BTreeMap<String, Record>,
117}
118
119impl LinkGraph {
120    /// Build the link index from a set of records.
121    /// All records must have raw_content loaded.
122    pub fn build(records: &[Record]) -> Self {
123        Self::build_with_root(records, None)
124    }
125
126    /// Build with a vault root for path resolution.
127    pub fn build_with_root(records: &[Record], vault_root: Option<&std::path::Path>) -> Self {
128        let mut index = LinkGraph::default();
129
130        // First pass: build name -> paths mapping and name -> record mapping.
131        for record in records {
132            let name = record.virtual_name();
133            let rel_path = match vault_root {
134                Some(root) => record.virtual_path(root),
135                None => record.path.to_string_lossy().into_owned(),
136            };
137            index
138                .name_to_paths
139                .entry(name.clone())
140                .or_default()
141                .push(rel_path);
142            index
143                .records_by_name
144                .entry(name)
145                .or_insert_with(|| record.clone());
146        }
147
148        // Second pass: extract links and resolve targets
149        for record in records {
150            let name = record.virtual_name();
151            let links = record_links(record);
152
153            // Resolve each link target to a note name
154            for target in &links {
155                let resolved = index.resolve_link_target(target);
156                index
157                    .incoming
158                    .entry(resolved.clone())
159                    .or_default()
160                    .insert(name.clone());
161            }
162
163            index.outgoing.insert(name, links);
164        }
165
166        index
167    }
168
169    /// Look up a record by its virtual name (filename without `.md`).
170    pub fn record_by_name(&self, name: &str) -> Option<&Record> {
171        self.records_by_name.get(name)
172    }
173
174    /// Resolve a wiki-link target to a note name.
175    /// Handles both plain names ([[Note]]) and path-qualified ([[folder/Note]]).
176    fn resolve_link_target(&self, target: &str) -> String {
177        if target.contains('/') {
178            // Path-qualified link like [[folder/Note]] — extract the filename part
179            target.rsplit('/').next().unwrap_or(target).to_string()
180        } else {
181            target.to_string()
182        }
183    }
184
185    /// Check if a filename has duplicates across folders.
186    pub fn is_ambiguous(&self, name: &str) -> bool {
187        self.name_to_paths
188            .get(name)
189            .is_some_and(|paths| paths.len() > 1)
190    }
191
192    /// Get all paths for a given filename.
193    pub fn paths_for_name(&self, name: &str) -> Vec<&str> {
194        self.name_to_paths
195            .get(name)
196            .map(|paths| paths.iter().map(|s| s.as_str()).collect())
197            .unwrap_or_default()
198    }
199
200    /// Get outgoing links for a note.
201    pub fn outgoing_links(&self, name: &str) -> Vec<&str> {
202        self.outgoing
203            .get(name)
204            .map(|s| s.iter().map(|s| s.as_str()).collect())
205            .unwrap_or_default()
206    }
207
208    /// Get incoming links (backlinks) for a note.
209    pub fn incoming_links(&self, name: &str) -> Vec<&str> {
210        self.incoming
211            .get(name)
212            .map(|s| s.iter().map(|s| s.as_str()).collect())
213            .unwrap_or_default()
214    }
215
216    /// Count of outgoing links.
217    pub fn outgoing_count(&self, name: &str) -> usize {
218        self.outgoing.get(name).map(|s| s.len()).unwrap_or(0)
219    }
220
221    /// Count of incoming links (backlinks).
222    pub fn incoming_count(&self, name: &str) -> usize {
223        self.incoming.get(name).map(|s| s.len()).unwrap_or(0)
224    }
225
226    /// BFS traversal from a starting note.
227    /// Returns (name, depth) pairs for all reachable notes within max_depth,
228    /// with the starting node included at depth 0.
229    pub fn traverse(
230        &self,
231        start: &str,
232        max_depth: usize,
233        direction: Direction,
234    ) -> Vec<(String, usize)> {
235        use std::collections::VecDeque;
236
237        let mut visited = BTreeSet::new();
238        let mut queue = VecDeque::new();
239        let mut results = Vec::new();
240
241        visited.insert(start.to_string());
242        queue.push_back((start.to_string(), 0usize));
243        results.push((start.to_string(), 0));
244
245        while let Some((current, depth)) = queue.pop_front() {
246            if depth >= max_depth {
247                continue;
248            }
249
250            let neighbors: Vec<&str> = match direction {
251                Direction::Outgoing => self.outgoing_links(&current),
252                Direction::Incoming => self.incoming_links(&current),
253                Direction::Both => {
254                    let mut all = self.outgoing_links(&current);
255                    all.extend(self.incoming_links(&current));
256                    all
257                }
258            };
259
260            for neighbor in neighbors {
261                if visited.insert(neighbor.to_string()) {
262                    let next_depth = depth + 1;
263                    results.push((neighbor.to_string(), next_depth));
264                    queue.push_back((neighbor.to_string(), next_depth));
265                }
266            }
267        }
268
269        results
270    }
271
272    /// Check if note `from` has an outgoing link to note `to`.
273    pub fn has_link_to(&self, from: &str, to: &str) -> bool {
274        self.outgoing
275            .get(from)
276            .is_some_and(|links| links.contains(to))
277    }
278
279    /// Check if note `to` has an incoming link from note `from`.
280    pub fn has_link_from(&self, to: &str, from: &str) -> bool {
281        self.incoming
282            .get(to)
283            .is_some_and(|links| links.contains(from))
284    }
285
286    /// All wikilinks pointing to non-existent records, returned as
287    /// `(source, target)` pairs. Targets are normalised via the same
288    /// folder-stripping rule used during graph construction.
289    pub fn unresolved(&self) -> Vec<UnresolvedLink> {
290        let mut out = Vec::new();
291        for (source, targets) in &self.outgoing {
292            for target in targets {
293                let resolved = self.resolve_link_target(target);
294                if !self.name_to_paths.contains_key(&resolved) {
295                    out.push(UnresolvedLink {
296                        source: source.clone(),
297                        target: target.clone(),
298                    });
299                }
300            }
301        }
302        out
303    }
304
305    /// BFS traversal returning just the reachable note names (without depth).
306    /// The starting note itself is NOT included.
307    pub fn traverse_from(&self, start: &str, depth: usize, direction: Direction) -> Vec<String> {
308        self.traverse(start, depth, direction)
309            .into_iter()
310            .filter(|(name, d)| name != start && *d > 0)
311            .map(|(name, _)| name)
312            .collect()
313    }
314
315    /// Get link data as Values for virtual fields on a record.
316    pub fn virtual_fields(&self, name: &str) -> Vec<(&'static str, Value)> {
317        let out_links = self.outgoing_links(name);
318        let in_links = self.incoming_links(name);
319
320        vec![
321            (
322                "_links",
323                Value::List(
324                    out_links
325                        .iter()
326                        .map(|s| Value::String(s.to_string()))
327                        .collect(),
328                ),
329            ),
330            ("_link_count", Value::Integer(out_links.len() as i64)),
331            (
332                "_backlinks",
333                Value::List(
334                    in_links
335                        .iter()
336                        .map(|s| Value::String(s.to_string()))
337                        .collect(),
338                ),
339            ),
340            ("_backlink_count", Value::Integer(in_links.len() as i64)),
341        ]
342    }
343}
344
345#[cfg(test)]
346mod tests {
347    use super::*;
348    use std::collections::BTreeMap;
349    use std::path::PathBuf;
350
351    #[test]
352    fn extract_simple_link() {
353        let links = extract_links("Some text with [[React]] and [[Node.js]] links.");
354        assert!(links.contains("React"));
355        assert!(links.contains("Node.js"));
356        assert_eq!(links.len(), 2);
357    }
358
359    #[test]
360    fn extract_link_with_alias() {
361        let links = extract_links("See [[React|the React framework]] for details.");
362        assert!(links.contains("React"));
363        assert_eq!(links.len(), 1);
364    }
365
366    #[test]
367    fn extract_link_with_section() {
368        let links = extract_links("Check [[React#Hooks]] and [[React#State Management|state]].");
369        assert!(links.contains("React"));
370        assert_eq!(links.len(), 1); // deduped
371    }
372
373    #[test]
374    fn extract_chinese_links() {
375        let links = extract_links("Zıt anlamlısı [[慢]] ile birlikte [[快]] kullanılır.");
376        assert!(links.contains("慢"));
377        assert!(links.contains("快"));
378    }
379
380    #[test]
381    fn extract_links_from_frontmatter_value() {
382        let content =
383            "---\nrelated-to:\n  - \"[[Watchlist]]\"\n  - \"[[2FA Setup]]\"\n---\nBody.\n";
384        let links = extract_links(content);
385        assert!(links.contains("Watchlist"));
386        assert!(links.contains("2FA Setup"));
387    }
388
389    #[test]
390    fn extract_no_links() {
391        let links = extract_links("Plain text with no links at all.");
392        assert!(links.is_empty());
393    }
394
395    #[test]
396    fn ignores_links_in_fenced_code_block() {
397        let content = "Real link [[React]].\n```\n[[FakeLink]] in code\n```\nMore text.";
398        let links = extract_links(content);
399        assert!(links.contains("React"));
400        assert!(!links.contains("FakeLink"));
401    }
402
403    #[test]
404    fn ignores_links_in_inline_code() {
405        let content = "Use `[[NotALink]]` but also see [[RealLink]].";
406        let links = extract_links(content);
407        assert!(links.contains("RealLink"));
408        assert!(!links.contains("NotALink"));
409    }
410
411    #[test]
412    fn extract_markdown_links_basic() {
413        let md = "See [Docs](https://example.com/docs) and [Home](https://example.com).";
414        assert_eq!(
415            extract_markdown_links(md),
416            vec![
417                ("Docs".to_string(), "https://example.com/docs".to_string()),
418                ("Home".to_string(), "https://example.com".to_string()),
419            ]
420        );
421    }
422
423    #[test]
424    fn extract_markdown_links_skips_images_wikilinks_and_code() {
425        let md = "![pic](https://img.test/a.png) [Real](https://real.test) [[WikiNote]] `[Code](https://code.test)`";
426        assert_eq!(
427            extract_markdown_links(md),
428            vec![("Real".to_string(), "https://real.test".to_string())]
429        );
430    }
431
432    #[test]
433    fn build_link_index() {
434        let records = vec![
435            Record {
436                path: PathBuf::from("/vault/A.md"),
437                fields: BTreeMap::new(),
438                raw_content: Some("Links to [[B]] and [[C]].".into()),
439            },
440            Record {
441                path: PathBuf::from("/vault/B.md"),
442                fields: BTreeMap::new(),
443                raw_content: Some("Links to [[C]].".into()),
444            },
445            Record {
446                path: PathBuf::from("/vault/C.md"),
447                fields: BTreeMap::new(),
448                raw_content: Some("No links here.".into()),
449            },
450        ];
451
452        let index = LinkGraph::build(&records);
453
454        // Outgoing
455        assert_eq!(index.outgoing_count("A"), 2);
456        assert_eq!(index.outgoing_count("B"), 1);
457        assert_eq!(index.outgoing_count("C"), 0);
458
459        // Incoming (backlinks)
460        assert_eq!(index.incoming_count("A"), 0); // nothing links to A
461        assert_eq!(index.incoming_count("B"), 1); // A links to B
462        assert_eq!(index.incoming_count("C"), 2); // A and B link to C
463
464        // Specific backlinks
465        let c_backlinks = index.incoming_links("C");
466        assert!(c_backlinks.contains(&"A"));
467        assert!(c_backlinks.contains(&"B"));
468    }
469
470    #[test]
471    fn virtual_fields_from_index() {
472        let records = vec![
473            Record {
474                path: PathBuf::from("/vault/A.md"),
475                fields: BTreeMap::new(),
476                raw_content: Some("Links to [[B]] and [[C]].".into()),
477            },
478            Record {
479                path: PathBuf::from("/vault/B.md"),
480                fields: BTreeMap::new(),
481                raw_content: Some("Links back to [[A]].".into()),
482            },
483        ];
484
485        let index = LinkGraph::build(&records);
486        let fields = index.virtual_fields("A");
487
488        let link_count = fields.iter().find(|(k, _)| *k == "_link_count").unwrap();
489        assert_eq!(link_count.1, Value::Integer(2));
490
491        let backlink_count = fields
492            .iter()
493            .find(|(k, _)| *k == "_backlink_count")
494            .unwrap();
495        assert_eq!(backlink_count.1, Value::Integer(1));
496    }
497
498    #[test]
499    fn unresolved_returns_dangling_targets() {
500        let records = vec![
501            Record {
502                path: PathBuf::from("/vault/a.md"),
503                fields: BTreeMap::new(),
504                raw_content: Some("Links to [[ghost]] and [[b]].".into()),
505            },
506            Record {
507                path: PathBuf::from("/vault/b.md"),
508                fields: BTreeMap::new(),
509                raw_content: Some("".into()),
510            },
511        ];
512
513        let index = LinkGraph::build(&records);
514        let unresolved = index.unresolved();
515        assert_eq!(
516            unresolved.len(),
517            1,
518            "expected one dangling link, got {:?}",
519            unresolved
520        );
521        assert_eq!(unresolved[0].source, "a");
522        assert_eq!(unresolved[0].target, "ghost");
523    }
524
525    #[test]
526    fn unresolved_empty_when_all_resolved() {
527        let records = vec![
528            Record {
529                path: PathBuf::from("/vault/a.md"),
530                fields: BTreeMap::new(),
531                raw_content: Some("Links to [[b]].".into()),
532            },
533            Record {
534                path: PathBuf::from("/vault/b.md"),
535                fields: BTreeMap::new(),
536                raw_content: Some("".into()),
537            },
538        ];
539
540        let index = LinkGraph::build(&records);
541        assert!(index.unresolved().is_empty());
542    }
543
544    #[test]
545    fn traverse_from_outgoing_skips_self() {
546        let mk = |name: &str, content: &str| Record {
547            path: PathBuf::from(format!("/vault/{}.md", name)),
548            fields: BTreeMap::new(),
549            raw_content: Some(content.into()),
550        };
551        let records = vec![mk("a", "[[b]]"), mk("b", "[[c]]"), mk("c", "")];
552        let index = LinkGraph::build(&records);
553        let names = index.traverse_from("a", 2, Direction::Outgoing);
554        assert!(names.contains(&"b".to_string()));
555        assert!(names.contains(&"c".to_string()));
556        assert!(
557            !names.contains(&"a".to_string()),
558            "starting node should not be in the result"
559        );
560    }
561
562    #[test]
563    fn traverse_from_respects_depth() {
564        let mk = |name: &str, content: &str| Record {
565            path: PathBuf::from(format!("/vault/{}.md", name)),
566            fields: BTreeMap::new(),
567            raw_content: Some(content.into()),
568        };
569        let records = vec![
570            mk("a", "[[b]]"),
571            mk("b", "[[c]]"),
572            mk("c", "[[d]]"),
573            mk("d", ""),
574        ];
575        let index = LinkGraph::build(&records);
576        let names = index.traverse_from("a", 1, Direction::Outgoing);
577        assert!(names.contains(&"b".to_string()));
578        assert!(
579            !names.contains(&"c".to_string()),
580            "depth=1 should not reach c"
581        );
582    }
583}