Skip to main content

vaultdb_core/
links.rs

1//! [`LinkGraph`] (the citation graph from a vault's `[[wikilinks]]`) plus the
2//! traversal types: [`Direction`], [`GraphScope`], [`UnresolvedLink`].
3//! Supports outgoing/incoming queries, BFS traversal, and unresolved-link
4//! discovery.
5
6use std::collections::{BTreeMap, BTreeSet};
7
8use regex::Regex;
9use std::sync::LazyLock;
10
11use crate::record::{Record, Value};
12
13/// The direction of edges to follow when querying or traversing the link
14/// graph: outgoing wikilinks (this note → others), incoming backlinks, or
15/// both at once.
16#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
17pub enum Direction {
18    Outgoing,
19    Incoming,
20    Both,
21}
22
23/// What subset of the vault to build the link graph over.
24#[derive(Debug, Clone)]
25pub enum GraphScope {
26    All,
27    Folder(String),
28    Where(crate::query::Expr),
29}
30
31/// A wikilink whose target file does not exist among the records the graph
32/// was built from.
33#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
34pub struct UnresolvedLink {
35    pub source: String,
36    pub target: String,
37}
38
39static WIKI_LINK_RE: LazyLock<Regex> =
40    LazyLock::new(|| Regex::new(r"\[\[([^\]\|#]+)(?:#[^\]\|]*)?\|?[^\]]*\]\]").unwrap());
41
42static FENCED_CODE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?s)```.*?```").unwrap());
43
44static INLINE_CODE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"`[^`]+`").unwrap());
45
46/// Strip code blocks (fenced and inline) from content to avoid false link extraction.
47fn strip_code_blocks(content: &str) -> String {
48    let without_fenced = FENCED_CODE_RE.replace_all(content, "");
49    INLINE_CODE_RE.replace_all(&without_fenced, "").into_owned()
50}
51
52/// Extract all wiki-link targets from a string.
53/// Handles: [[Note]], [[Note|alias]], [[Note#section]], [[Note#section|alias]]
54fn extract_links_from_str(text: &str) -> Vec<String> {
55    WIKI_LINK_RE
56        .captures_iter(text)
57        .map(|cap| cap[1].trim().to_string())
58        .collect()
59}
60
61/// Extract all outgoing wiki-links from a record's full file content.
62/// Strips code blocks first to avoid false positives.
63pub fn extract_links(content: &str) -> BTreeSet<String> {
64    let cleaned = strip_code_blocks(content);
65    let mut links = BTreeSet::new();
66    for link in extract_links_from_str(&cleaned) {
67        links.insert(link);
68    }
69    links
70}
71
72/// Extract links from a record. Requires raw_content to be loaded.
73pub fn record_links(record: &Record) -> BTreeSet<String> {
74    match &record.raw_content {
75        Some(content) => extract_links(content),
76        None => BTreeSet::new(),
77    }
78}
79
80/// The citation graph extracted from a vault's `[[wikilinks]]`.
81///
82/// Maps note name → outgoing/incoming link sets, handles duplicate filenames
83/// across folders via path-based resolution, and retains a record-by-name map
84/// so `LinkPredicate::Where` can recurse the predicate into linked records.
85#[derive(Debug, Default)]
86pub struct LinkGraph {
87    /// note name -> outgoing link targets (as written in the wiki-links)
88    outgoing: BTreeMap<String, BTreeSet<String>>,
89    /// note name -> names of notes that link to it
90    incoming: BTreeMap<String, BTreeSet<String>>,
91    /// filename -> list of relative paths (for detecting duplicates)
92    name_to_paths: BTreeMap<String, Vec<String>>,
93    /// note name -> Record (for link-predicate evaluation)
94    records_by_name: BTreeMap<String, Record>,
95}
96
97impl LinkGraph {
98    /// Build the link index from a set of records.
99    /// All records must have raw_content loaded.
100    pub fn build(records: &[Record]) -> Self {
101        Self::build_with_root(records, None)
102    }
103
104    /// Build with a vault root for path resolution.
105    pub fn build_with_root(records: &[Record], vault_root: Option<&std::path::Path>) -> Self {
106        let mut index = LinkGraph::default();
107
108        // First pass: build name -> paths mapping and name -> record mapping.
109        for record in records {
110            let name = record.virtual_name();
111            let rel_path = match vault_root {
112                Some(root) => record.virtual_path(root),
113                None => record.path.to_string_lossy().into_owned(),
114            };
115            index
116                .name_to_paths
117                .entry(name.clone())
118                .or_default()
119                .push(rel_path);
120            index
121                .records_by_name
122                .entry(name)
123                .or_insert_with(|| record.clone());
124        }
125
126        // Second pass: extract links and resolve targets
127        for record in records {
128            let name = record.virtual_name();
129            let links = record_links(record);
130
131            // Resolve each link target to a note name
132            for target in &links {
133                let resolved = index.resolve_link_target(target);
134                index
135                    .incoming
136                    .entry(resolved.clone())
137                    .or_default()
138                    .insert(name.clone());
139            }
140
141            index.outgoing.insert(name, links);
142        }
143
144        index
145    }
146
147    /// Look up a record by its virtual name (filename without `.md`).
148    pub fn record_by_name(&self, name: &str) -> Option<&Record> {
149        self.records_by_name.get(name)
150    }
151
152    /// Resolve a wiki-link target to a note name.
153    /// Handles both plain names ([[Note]]) and path-qualified ([[folder/Note]]).
154    fn resolve_link_target(&self, target: &str) -> String {
155        if target.contains('/') {
156            // Path-qualified link like [[folder/Note]] — extract the filename part
157            target.rsplit('/').next().unwrap_or(target).to_string()
158        } else {
159            target.to_string()
160        }
161    }
162
163    /// Check if a filename has duplicates across folders.
164    pub fn is_ambiguous(&self, name: &str) -> bool {
165        self.name_to_paths
166            .get(name)
167            .is_some_and(|paths| paths.len() > 1)
168    }
169
170    /// Get all paths for a given filename.
171    pub fn paths_for_name(&self, name: &str) -> Vec<&str> {
172        self.name_to_paths
173            .get(name)
174            .map(|paths| paths.iter().map(|s| s.as_str()).collect())
175            .unwrap_or_default()
176    }
177
178    /// Get outgoing links for a note.
179    pub fn outgoing_links(&self, name: &str) -> Vec<&str> {
180        self.outgoing
181            .get(name)
182            .map(|s| s.iter().map(|s| s.as_str()).collect())
183            .unwrap_or_default()
184    }
185
186    /// Get incoming links (backlinks) for a note.
187    pub fn incoming_links(&self, name: &str) -> Vec<&str> {
188        self.incoming
189            .get(name)
190            .map(|s| s.iter().map(|s| s.as_str()).collect())
191            .unwrap_or_default()
192    }
193
194    /// Count of outgoing links.
195    pub fn outgoing_count(&self, name: &str) -> usize {
196        self.outgoing.get(name).map(|s| s.len()).unwrap_or(0)
197    }
198
199    /// Count of incoming links (backlinks).
200    pub fn incoming_count(&self, name: &str) -> usize {
201        self.incoming.get(name).map(|s| s.len()).unwrap_or(0)
202    }
203
204    /// BFS traversal from a starting note.
205    /// Returns (name, depth) pairs for all reachable notes within max_depth,
206    /// with the starting node included at depth 0.
207    pub fn traverse(
208        &self,
209        start: &str,
210        max_depth: usize,
211        direction: Direction,
212    ) -> Vec<(String, usize)> {
213        use std::collections::VecDeque;
214
215        let mut visited = BTreeSet::new();
216        let mut queue = VecDeque::new();
217        let mut results = Vec::new();
218
219        visited.insert(start.to_string());
220        queue.push_back((start.to_string(), 0usize));
221        results.push((start.to_string(), 0));
222
223        while let Some((current, depth)) = queue.pop_front() {
224            if depth >= max_depth {
225                continue;
226            }
227
228            let neighbors: Vec<&str> = match direction {
229                Direction::Outgoing => self.outgoing_links(&current),
230                Direction::Incoming => self.incoming_links(&current),
231                Direction::Both => {
232                    let mut all = self.outgoing_links(&current);
233                    all.extend(self.incoming_links(&current));
234                    all
235                }
236            };
237
238            for neighbor in neighbors {
239                if visited.insert(neighbor.to_string()) {
240                    let next_depth = depth + 1;
241                    results.push((neighbor.to_string(), next_depth));
242                    queue.push_back((neighbor.to_string(), next_depth));
243                }
244            }
245        }
246
247        results
248    }
249
250    /// Check if note `from` has an outgoing link to note `to`.
251    pub fn has_link_to(&self, from: &str, to: &str) -> bool {
252        self.outgoing
253            .get(from)
254            .is_some_and(|links| links.contains(to))
255    }
256
257    /// Check if note `to` has an incoming link from note `from`.
258    pub fn has_link_from(&self, to: &str, from: &str) -> bool {
259        self.incoming
260            .get(to)
261            .is_some_and(|links| links.contains(from))
262    }
263
264    /// All wikilinks pointing to non-existent records, returned as
265    /// `(source, target)` pairs. Targets are normalised via the same
266    /// folder-stripping rule used during graph construction.
267    pub fn unresolved(&self) -> Vec<UnresolvedLink> {
268        let mut out = Vec::new();
269        for (source, targets) in &self.outgoing {
270            for target in targets {
271                let resolved = self.resolve_link_target(target);
272                if !self.name_to_paths.contains_key(&resolved) {
273                    out.push(UnresolvedLink {
274                        source: source.clone(),
275                        target: target.clone(),
276                    });
277                }
278            }
279        }
280        out
281    }
282
283    /// BFS traversal returning just the reachable note names (without depth).
284    /// The starting note itself is NOT included.
285    pub fn traverse_from(&self, start: &str, depth: usize, direction: Direction) -> Vec<String> {
286        self.traverse(start, depth, direction)
287            .into_iter()
288            .filter(|(name, d)| name != start && *d > 0)
289            .map(|(name, _)| name)
290            .collect()
291    }
292
293    /// Get link data as Values for virtual fields on a record.
294    pub fn virtual_fields(&self, name: &str) -> Vec<(&'static str, Value)> {
295        let out_links = self.outgoing_links(name);
296        let in_links = self.incoming_links(name);
297
298        vec![
299            (
300                "_links",
301                Value::List(
302                    out_links
303                        .iter()
304                        .map(|s| Value::String(s.to_string()))
305                        .collect(),
306                ),
307            ),
308            ("_link_count", Value::Integer(out_links.len() as i64)),
309            (
310                "_backlinks",
311                Value::List(
312                    in_links
313                        .iter()
314                        .map(|s| Value::String(s.to_string()))
315                        .collect(),
316                ),
317            ),
318            ("_backlink_count", Value::Integer(in_links.len() as i64)),
319        ]
320    }
321}
322
323#[cfg(test)]
324mod tests {
325    use super::*;
326    use std::collections::BTreeMap;
327    use std::path::PathBuf;
328
329    #[test]
330    fn extract_simple_link() {
331        let links = extract_links("Some text with [[React]] and [[Node.js]] links.");
332        assert!(links.contains("React"));
333        assert!(links.contains("Node.js"));
334        assert_eq!(links.len(), 2);
335    }
336
337    #[test]
338    fn extract_link_with_alias() {
339        let links = extract_links("See [[React|the React framework]] for details.");
340        assert!(links.contains("React"));
341        assert_eq!(links.len(), 1);
342    }
343
344    #[test]
345    fn extract_link_with_section() {
346        let links = extract_links("Check [[React#Hooks]] and [[React#State Management|state]].");
347        assert!(links.contains("React"));
348        assert_eq!(links.len(), 1); // deduped
349    }
350
351    #[test]
352    fn extract_chinese_links() {
353        let links = extract_links("Zıt anlamlısı [[慢]] ile birlikte [[快]] kullanılır.");
354        assert!(links.contains("慢"));
355        assert!(links.contains("快"));
356    }
357
358    #[test]
359    fn extract_links_from_frontmatter_value() {
360        let content =
361            "---\nrelated-to:\n  - \"[[Watchlist]]\"\n  - \"[[2FA Setup]]\"\n---\nBody.\n";
362        let links = extract_links(content);
363        assert!(links.contains("Watchlist"));
364        assert!(links.contains("2FA Setup"));
365    }
366
367    #[test]
368    fn extract_no_links() {
369        let links = extract_links("Plain text with no links at all.");
370        assert!(links.is_empty());
371    }
372
373    #[test]
374    fn ignores_links_in_fenced_code_block() {
375        let content = "Real link [[React]].\n```\n[[FakeLink]] in code\n```\nMore text.";
376        let links = extract_links(content);
377        assert!(links.contains("React"));
378        assert!(!links.contains("FakeLink"));
379    }
380
381    #[test]
382    fn ignores_links_in_inline_code() {
383        let content = "Use `[[NotALink]]` but also see [[RealLink]].";
384        let links = extract_links(content);
385        assert!(links.contains("RealLink"));
386        assert!(!links.contains("NotALink"));
387    }
388
389    #[test]
390    fn build_link_index() {
391        let records = vec![
392            Record {
393                path: PathBuf::from("/vault/A.md"),
394                fields: BTreeMap::new(),
395                raw_content: Some("Links to [[B]] and [[C]].".into()),
396            },
397            Record {
398                path: PathBuf::from("/vault/B.md"),
399                fields: BTreeMap::new(),
400                raw_content: Some("Links to [[C]].".into()),
401            },
402            Record {
403                path: PathBuf::from("/vault/C.md"),
404                fields: BTreeMap::new(),
405                raw_content: Some("No links here.".into()),
406            },
407        ];
408
409        let index = LinkGraph::build(&records);
410
411        // Outgoing
412        assert_eq!(index.outgoing_count("A"), 2);
413        assert_eq!(index.outgoing_count("B"), 1);
414        assert_eq!(index.outgoing_count("C"), 0);
415
416        // Incoming (backlinks)
417        assert_eq!(index.incoming_count("A"), 0); // nothing links to A
418        assert_eq!(index.incoming_count("B"), 1); // A links to B
419        assert_eq!(index.incoming_count("C"), 2); // A and B link to C
420
421        // Specific backlinks
422        let c_backlinks = index.incoming_links("C");
423        assert!(c_backlinks.contains(&"A"));
424        assert!(c_backlinks.contains(&"B"));
425    }
426
427    #[test]
428    fn virtual_fields_from_index() {
429        let records = vec![
430            Record {
431                path: PathBuf::from("/vault/A.md"),
432                fields: BTreeMap::new(),
433                raw_content: Some("Links to [[B]] and [[C]].".into()),
434            },
435            Record {
436                path: PathBuf::from("/vault/B.md"),
437                fields: BTreeMap::new(),
438                raw_content: Some("Links back to [[A]].".into()),
439            },
440        ];
441
442        let index = LinkGraph::build(&records);
443        let fields = index.virtual_fields("A");
444
445        let link_count = fields.iter().find(|(k, _)| *k == "_link_count").unwrap();
446        assert_eq!(link_count.1, Value::Integer(2));
447
448        let backlink_count = fields
449            .iter()
450            .find(|(k, _)| *k == "_backlink_count")
451            .unwrap();
452        assert_eq!(backlink_count.1, Value::Integer(1));
453    }
454
455    #[test]
456    fn unresolved_returns_dangling_targets() {
457        let records = vec![
458            Record {
459                path: PathBuf::from("/vault/a.md"),
460                fields: BTreeMap::new(),
461                raw_content: Some("Links to [[ghost]] and [[b]].".into()),
462            },
463            Record {
464                path: PathBuf::from("/vault/b.md"),
465                fields: BTreeMap::new(),
466                raw_content: Some("".into()),
467            },
468        ];
469
470        let index = LinkGraph::build(&records);
471        let unresolved = index.unresolved();
472        assert_eq!(
473            unresolved.len(),
474            1,
475            "expected one dangling link, got {:?}",
476            unresolved
477        );
478        assert_eq!(unresolved[0].source, "a");
479        assert_eq!(unresolved[0].target, "ghost");
480    }
481
482    #[test]
483    fn unresolved_empty_when_all_resolved() {
484        let records = vec![
485            Record {
486                path: PathBuf::from("/vault/a.md"),
487                fields: BTreeMap::new(),
488                raw_content: Some("Links to [[b]].".into()),
489            },
490            Record {
491                path: PathBuf::from("/vault/b.md"),
492                fields: BTreeMap::new(),
493                raw_content: Some("".into()),
494            },
495        ];
496
497        let index = LinkGraph::build(&records);
498        assert!(index.unresolved().is_empty());
499    }
500
501    #[test]
502    fn traverse_from_outgoing_skips_self() {
503        let mk = |name: &str, content: &str| Record {
504            path: PathBuf::from(format!("/vault/{}.md", name)),
505            fields: BTreeMap::new(),
506            raw_content: Some(content.into()),
507        };
508        let records = vec![mk("a", "[[b]]"), mk("b", "[[c]]"), mk("c", "")];
509        let index = LinkGraph::build(&records);
510        let names = index.traverse_from("a", 2, Direction::Outgoing);
511        assert!(names.contains(&"b".to_string()));
512        assert!(names.contains(&"c".to_string()));
513        assert!(
514            !names.contains(&"a".to_string()),
515            "starting node should not be in the result"
516        );
517    }
518
519    #[test]
520    fn traverse_from_respects_depth() {
521        let mk = |name: &str, content: &str| Record {
522            path: PathBuf::from(format!("/vault/{}.md", name)),
523            fields: BTreeMap::new(),
524            raw_content: Some(content.into()),
525        };
526        let records = vec![
527            mk("a", "[[b]]"),
528            mk("b", "[[c]]"),
529            mk("c", "[[d]]"),
530            mk("d", ""),
531        ];
532        let index = LinkGraph::build(&records);
533        let names = index.traverse_from("a", 1, Direction::Outgoing);
534        assert!(names.contains(&"b".to_string()));
535        assert!(
536            !names.contains(&"c".to_string()),
537            "depth=1 should not reach c"
538        );
539    }
540}