// lore_engine/engine/link_parser.rs
1//! Wiki link parsing, slug generation, and link rewriting.
2//!
3//! Handles `[[Page Name]]` and `[[Page Name|alias]]` syntax. Uses
4//! pulldown-cmark to correctly identify code blocks and inline code spans,
5//! so links inside code are reliably ignored.
6
7use pulldown_cmark::{Event, Options, Parser, Tag};
8use regex::Regex;
9use serde::Serialize;
10use std::sync::LazyLock;
11
/// Matches `[[...]]` where the body contains no square brackets; capture
/// group 1 is the link body (target text plus optional `|alias`).
/// Compiled once on first use via `LazyLock`.
static WIKILINK_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[\[([^\[\]]+)\]\]").expect("Invalid wikilink regex"));
14
/// A parsed `[[wikilink]]` extracted from markdown content.
// NOTE(review): field order matters for serde — `Serialize` emits fields in
// declaration order, so reordering would change the JSON shape consumers see.
#[derive(Debug, Serialize, Clone, PartialEq, Eq)]
pub struct WikiLink {
    /// Original target text as written (e.g. `"Page Name"`), including any
    /// `.md` extension the author typed.
    pub target: String,
    /// Alias text if present (`[[target|alias]]`).
    pub alias: Option<String>,
    /// Normalized slug derived from target (e.g. `"page-name"`); a trailing
    /// `.md` on the target is stripped before slugification.
    pub slug: String,
    /// 1-based line number where this link appears.
    pub line_number: usize,
}
27
/// Normalize a page name into a slug.
///
/// Rules:
/// - Lowercase
/// - Trim whitespace
/// - Split on `/` and normalize each segment independently
/// - Within each segment: replace spaces and underscores with hyphens,
///   strip punctuation, collapse consecutive hyphens, trim leading/trailing hyphens
/// - Preserve Unicode alphanumeric characters
/// - Drop empty segments (handles `//`, leading `/`, trailing `/`)
/// - Rejoin with `/`
pub fn slugify(name: &str) -> String {
    name.trim()
        .to_lowercase()
        .split('/')
        .map(slugify_segment)
        .filter(|seg| !seg.is_empty())
        .collect::<Vec<_>>()
        .join("/")
}

/// Normalize a single path segment (no slashes).
///
/// Single pass: alphanumerics are kept verbatim; space, underscore, and
/// hyphen become one *pending* separator that is emitted only when another
/// kept character follows — which collapses runs of separators and trims
/// leading/trailing hyphens in the same sweep. All other punctuation is
/// dropped.
fn slugify_segment(segment: &str) -> String {
    let mut slug = String::with_capacity(segment.len());
    let mut pending_separator = false;

    for ch in segment.chars() {
        if ch.is_alphanumeric() {
            // Flush the deferred hyphen only between kept characters, never
            // at the start of the segment.
            if pending_separator && !slug.is_empty() {
                slug.push('-');
            }
            pending_separator = false;
            slug.push(ch);
        } else if matches!(ch, ' ' | '_' | '-') {
            pending_separator = true;
        }
        // Any other punctuation is silently dropped.
    }

    // A pending separator at the end is simply never flushed — this is the
    // trailing-hyphen trim.
    slug
}
83
/// Convert a slug back to a title (best-effort reverse of slugify).
///
/// Replaces hyphens with spaces and capitalizes each word.
/// Handles folder prefixes by using only the last segment.
pub fn title_from_slug(slug: &str) -> String {
    // Only the leaf segment of a folder path becomes the title.
    let leaf = slug.rsplit('/').next().unwrap_or(slug);

    let mut title = String::with_capacity(leaf.len());
    for word in leaf.split('-').filter(|w| !w.is_empty()) {
        if !title.is_empty() {
            title.push(' ');
        }
        let mut chars = word.chars();
        if let Some(first) = chars.next() {
            // `to_uppercase` may yield multiple chars for some scripts,
            // hence `extend` rather than `push`.
            title.extend(first.to_uppercase());
            title.push_str(chars.as_str());
        }
    }
    title
}
105
/// Extract the H1 title from markdown content, if present.
///
/// Looks for the first `# Title` line at the top of the document
/// (skipping blank lines). Returns `None` if no H1 is found before
/// the first non-blank, non-heading line.
pub fn extract_h1(content: &str) -> Option<String> {
    // Only the first non-blank line is eligible to be the H1.
    let first_line = content.lines().map(str::trim).find(|line| !line.is_empty())?;

    // It must be an ATX level-1 heading with a non-empty title.
    let title = first_line.strip_prefix("# ")?.trim();
    if title.is_empty() {
        None
    } else {
        Some(title.to_string())
    }
}
128
129/// Build a sorted list of byte ranges that are inside code (fenced blocks or inline spans).
130///
131/// Uses pulldown-cmark to correctly identify all code regions, including
132/// indented code blocks, multi-backtick spans, and tilde fences.
133fn code_byte_ranges(content: &str) -> Vec<(usize, usize)> {
134    let parser = Parser::new_ext(content, Options::all()).into_offset_iter();
135    let mut ranges = Vec::new();
136
137    for (event, range) in parser {
138        match event {
139            Event::Start(Tag::CodeBlock(_)) | Event::Code(_) => {
140                ranges.push((range.start, range.end));
141            }
142            _ => {}
143        }
144    }
145
146    ranges
147}
148
/// Check whether a byte offset falls inside any code range (half-open: start
/// inclusive, end exclusive).
fn is_in_code(offset: usize, code_ranges: &[(usize, usize)]) -> bool {
    code_ranges
        .iter()
        .any(|&(start, end)| (start..end).contains(&offset))
}
153
/// Compute the 1-based line number for a byte offset in content.
///
/// `byte_offset` must lie on a char boundary (regex match offsets always do).
fn line_number_at(content: &str, byte_offset: usize) -> usize {
    // '\n' is a single byte in UTF-8, so counting newline bytes in the
    // prefix is exactly the number of completed lines before the offset.
    content[..byte_offset].bytes().filter(|&b| b == b'\n').count() + 1
}
158
159/// Extract all `\[\[wikilinks\]\]` from markdown content.
160///
161/// Uses pulldown-cmark to identify code blocks and inline code spans,
162/// then scans non-code regions with regex. This correctly handles all
163/// Markdown code constructs (fenced, indented, inline, multi-backtick).
164pub fn extract_links(content: &str) -> Vec<WikiLink> {
165    let code_ranges = code_byte_ranges(content);
166    let mut links = Vec::new();
167
168    for cap in WIKILINK_RE.captures_iter(content) {
169        let m = cap.get(0).expect("Regex match should exist");
170
171        if is_in_code(m.start(), &code_ranges) {
172            continue;
173        }
174
175        let body = cap.get(1).expect("Capture group 1 should exist").as_str();
176        let (target, alias) = parse_link_body(body);
177
178        // Strip .md extension if present (users may write [[Page.md]] instead of [[Page]])
179        let target_for_slug = if target.to_lowercase().ends_with(".md") {
180            &target[..target.len() - 3]
181        } else {
182            &target
183        };
184        let slug = slugify(target_for_slug);
185
186        if !slug.is_empty() {
187            links.push(WikiLink {
188                target,
189                alias,
190                slug,
191                line_number: line_number_at(content, m.start()),
192            });
193        }
194    }
195
196    links
197}
198
/// Split a link body into `(target, alias)` at the first `|`, trimming both
/// sides. No pipe means no alias.
fn parse_link_body(body: &str) -> (String, Option<String>) {
    match body.split_once('|') {
        Some((target, alias)) => (target.trim().to_string(), Some(alias.trim().to_string())),
        None => (body.trim().to_string(), None),
    }
}
208
209/// Replace all `\[\[wikilinks\]\]` that resolve to `old_slug` with `new_title`.
210/// Preserves aliases: `[[Old|alias]]` -> `[[New Title|alias]]`.
211/// Uses pulldown-cmark to skip links inside code blocks and inline code spans.
212pub fn replace_wikilinks(content: &str, old_slug: &str, new_title: &str) -> String {
213    let code_ranges = code_byte_ranges(content);
214    let mut result = String::with_capacity(content.len());
215    let mut last_end = 0;
216
217    for cap in WIKILINK_RE.captures_iter(content) {
218        let m = cap.get(0).expect("Regex match should exist");
219        let start = m.start();
220        let end = m.end();
221
222        if is_in_code(start, &code_ranges) {
223            continue;
224        }
225
226        let body = cap.get(1).expect("Capture group 1 should exist").as_str();
227        let (target, alias) = parse_link_body(body);
228        let target_slug = slugify(&target);
229
230        if target_slug == old_slug {
231            result.push_str(&content[last_end..start]);
232
233            match alias {
234                Some(a) => result.push_str(&format!("[[{new_title}|{a}]]")),
235                None => result.push_str(&format!("[[{new_title}]]")),
236            }
237
238            last_end = end;
239        }
240    }
241
242    result.push_str(&content[last_end..]);
243    result
244}
245
// Unit tests covering slugification, link extraction/replacement, title and
// H1 helpers. Grouped roughly by the function under test.
#[cfg(test)]
mod tests {
    use super::*;

    // --- slugify: basic normalization ---

    #[test]
    fn test_slugify_basic() {
        assert_eq!(slugify("My Page Name"), "my-page-name");
    }

    #[test]
    fn test_slugify_folder() {
        assert_eq!(slugify("folder/My Page"), "folder/my-page");
    }

    #[test]
    fn test_slugify_special_chars() {
        assert_eq!(slugify("Hello, World! (2024)"), "hello-world-2024");
    }

    #[test]
    fn test_slugify_underscores() {
        assert_eq!(slugify("my_page_name"), "my-page-name");
    }

    #[test]
    fn test_slugify_unicode() {
        assert_eq!(slugify("Tagesnotiz"), "tagesnotiz");
        assert_eq!(slugify("日本語ページ"), "日本語ページ");
    }

    #[test]
    fn test_slugify_whitespace() {
        assert_eq!(slugify("  spaces  everywhere  "), "spaces-everywhere");
    }

    // --- extract_links ---

    #[test]
    fn test_extract_simple() {
        let links = extract_links("see [[Page One]] here");
        assert_eq!(links.len(), 1);
        assert_eq!(links[0].target, "Page One");
        assert_eq!(links[0].slug, "page-one");
        assert_eq!(links[0].alias, None);
        assert_eq!(links[0].line_number, 1);
    }

    #[test]
    fn test_extract_alias() {
        let links = extract_links("see [[Page One|click here]]");
        assert_eq!(links.len(), 1);
        assert_eq!(links[0].target, "Page One");
        assert_eq!(links[0].alias, Some("click here".to_string()));
        assert_eq!(links[0].slug, "page-one");
    }

    #[test]
    fn test_extract_multiple() {
        let links = extract_links("[[A]] and [[B|bee]] plus [[folder/C]]");
        assert_eq!(links.len(), 3);
        assert_eq!(links[0].slug, "a");
        assert_eq!(links[1].slug, "b");
        assert_eq!(links[1].alias, Some("bee".to_string()));
        assert_eq!(links[2].slug, "folder/c");
    }

    #[test]
    fn test_extract_in_fenced_code_block() {
        let content = "before\n```\n[[not a link]]\n```\nafter [[real link]]";
        let links = extract_links(content);
        assert_eq!(links.len(), 1);
        assert_eq!(links[0].slug, "real-link");
    }

    #[test]
    fn test_extract_in_inline_code() {
        let links = extract_links("see `[[not a link]]` here");
        assert_eq!(links.len(), 0);
    }

    #[test]
    fn test_extract_line_numbers() {
        let content = "line one\n[[Link A]]\nline three\nline four\n[[Link B]]";
        let links = extract_links(content);
        assert_eq!(links.len(), 2);
        assert_eq!(links[0].line_number, 2);
        assert_eq!(links[1].line_number, 5);
    }

    #[test]
    fn test_nested_brackets_no_panic() {
        // The regex body excludes brackets, so only the inner link can match;
        // this guards against panics/surprises rather than pinning a count.
        let links = extract_links("[[outer [[inner]] end]]");
        assert!(links.len() <= 2);
    }

    #[test]
    fn test_empty_link_ignored() {
        let links = extract_links("[[]]");
        assert_eq!(links.len(), 0);
    }

    // --- replace_wikilinks ---

    #[test]
    fn test_replace_simple() {
        let result = replace_wikilinks("see [[Rust]] here", "rust", "Rust Language");
        assert_eq!(result, "see [[Rust Language]] here");
    }

    #[test]
    fn test_replace_preserves_alias() {
        let result = replace_wikilinks("see [[Rust|my fav]] here", "rust", "Rust Language");
        assert_eq!(result, "see [[Rust Language|my fav]] here");
    }

    #[test]
    fn test_replace_skips_code_block() {
        let content = "before [[Rust]]\n```\n[[Rust]]\n```\nafter [[Rust]]";
        let result = replace_wikilinks(content, "rust", "Rust Language");
        assert_eq!(
            result,
            "before [[Rust Language]]\n```\n[[Rust]]\n```\nafter [[Rust Language]]"
        );
    }

    #[test]
    fn test_replace_skips_inline_code() {
        let result = replace_wikilinks("see `[[Rust]]` and [[Rust]]", "rust", "Rust Language");
        assert_eq!(result, "see `[[Rust]]` and [[Rust Language]]");
    }

    // --- slugify: degenerate inputs ---

    #[test]
    fn test_slugify_spaces_only() {
        assert_eq!(slugify("   "), "");
    }

    #[test]
    fn test_slugify_punctuation_only() {
        assert_eq!(slugify("!@#$%"), "");
    }

    // --- title_from_slug ---

    #[test]
    fn test_title_from_slug_basic() {
        assert_eq!(title_from_slug("my-page"), "My Page");
    }

    #[test]
    fn test_title_from_slug_with_folder() {
        assert_eq!(title_from_slug("notes/deep/my-page"), "My Page");
    }

    #[test]
    fn test_title_from_slug_single_word() {
        assert_eq!(title_from_slug("cael"), "Cael");
    }

    // --- extract_h1 ---

    #[test]
    fn test_extract_h1_basic() {
        assert_eq!(extract_h1("# My Title\n\nContent here"), Some("My Title".to_string()));
    }

    #[test]
    fn test_extract_h1_with_leading_blanks() {
        assert_eq!(extract_h1("\n\n# Title\nContent"), Some("Title".to_string()));
    }

    #[test]
    fn test_extract_h1_none_without_h1() {
        assert_eq!(extract_h1("No heading here\nJust text"), None);
    }

    #[test]
    fn test_extract_h1_ignores_h2() {
        assert_eq!(extract_h1("## Not an H1"), None);
    }

    #[test]
    fn test_extract_h1_ignores_deep_h1() {
        // H1 after non-blank non-heading line should not match
        assert_eq!(extract_h1("Some text\n# Not the title"), None);
    }

    #[test]
    fn test_extract_h1_empty_content() {
        assert_eq!(extract_h1(""), None);
        assert_eq!(extract_h1("\n\n\n"), None);
    }

    // --- Slugify segment-awareness tests ---

    #[test]
    fn test_slugify_idempotent() {
        // slugify(slugify(x)) == slugify(x) — required so stored slugs can be
        // safely re-normalized.
        let cases = ["My Page", "folder/My Page", "a b c", "日本語ページ", "C++ Guide"];
        for input in &cases {
            let once = slugify(input);
            let twice = slugify(&once);
            assert_eq!(once, twice, "slugify not idempotent for: {input}");
        }
    }

    #[test]
    fn test_slugify_double_slash() {
        assert_eq!(slugify("folder//page"), "folder/page");
    }

    #[test]
    fn test_slugify_leading_trailing_slash() {
        assert_eq!(slugify("/page/"), "page");
        assert_eq!(slugify("/folder/page/"), "folder/page");
    }

    #[test]
    fn test_slugify_slash_only() {
        assert_eq!(slugify("/"), "");
        assert_eq!(slugify("///"), "");
    }

    // --- pulldown-cmark code detection tests ---

    #[test]
    fn test_extract_skips_indented_code_block() {
        let content = "normal text\n\n    [[not a link]]\n\nreal [[link]]";
        let links = extract_links(content);
        assert_eq!(links.len(), 1);
        assert_eq!(links[0].slug, "link");
    }

    #[test]
    fn test_extract_skips_tilde_fence() {
        let content = "before\n~~~\n[[not a link]]\n~~~\nafter [[real]]";
        let links = extract_links(content);
        assert_eq!(links.len(), 1);
        assert_eq!(links[0].slug, "real");
    }

    #[test]
    fn test_extract_skips_multi_backtick_inline() {
        let content = "see ``[[not a link]]`` and [[real]]";
        let links = extract_links(content);
        assert_eq!(links.len(), 1);
        assert_eq!(links[0].slug, "real");
    }

    #[test]
    fn test_replace_skips_indented_code_block() {
        let content = "[[Rust]]\n\n    [[Rust]]\n\n[[Rust]]";
        let result = replace_wikilinks(content, "rust", "Rust Language");
        assert!(result.contains("[[Rust Language]]"));
        // The indented block should preserve the original
        assert!(result.contains("    [[Rust]]"));
    }
}