lex_core/lex/ast/links.rs
1//! Document link extraction for LSP support
2//!
3//! This module provides APIs for extracting clickable links from Lex documents,
4//! enabling the LSP "document links" feature that makes URLs and file references
5//! clickable in editors.
6//!
7//! ## Problem
8//!
9//! The LSP document links feature needs to find all clickable links:
10//! - URLs in text (`[https://example.com]`)
11//! - File references (`[./file.txt]`)
12//! - Verbatim block `src` parameters (images, includes)
13//!
14//! While `ReferenceType::Url` and `ReferenceType::File` exist, there's no API to
15//! extract all links from a document.
16//!
17//! ## Solution
18//!
19//! This module provides:
20//! - `DocumentLink` struct representing a link with its location and type
21//! - `find_all_links()` methods on Document and Session
22//! - `src_parameter()` method on Verbatim to access src parameters
23//!
24//! ## Link Types
25//!
26//! 1. **URL links**: `[https://example.com]` - HTTP/HTTPS URLs
27//! 2. **File links**: `[./file.txt]`, `[../path/to/file.md]` - File references
28//! 3. **Verbatim src**: `:: image src=./image.png ::` - External resource references
29
30use super::anchoring::{ReferenceAnchor, ReferenceLine};
31use super::elements::Verbatim;
32use super::inline_positions::{walk_text_content_positions, InlinePositionVisitor};
33use super::range::{Position, Range};
34use super::text_content::TextContent;
35use super::{Document, Session};
36use crate::lex::inlines::{AnchorDirection, ReferenceInline, ReferenceType, WordAnchor};
37use std::fmt;
38
39/// Represents a document link with its location and type
40#[derive(Debug, Clone, PartialEq)]
41pub struct DocumentLink {
42 pub range: Range,
43 pub target: String,
44 pub link_type: LinkType,
45}
46
47impl DocumentLink {
48 pub fn new(range: Range, target: String, link_type: LinkType) -> Self {
49 Self {
50 range,
51 target,
52 link_type,
53 }
54 }
55}
56
57impl fmt::Display for DocumentLink {
58 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
59 write!(
60 f,
61 "{:?} link: {} at {}",
62 self.link_type, self.target, self.range.start
63 )
64 }
65}
66
/// Kind of construct a [`DocumentLink`] was extracted from.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LinkType {
    /// HTTP/HTTPS URL reference, e.g. `[https://example.com]`.
    Url,
    /// File reference (relative or absolute path), e.g. `[./file.txt]`.
    File,
    /// Value of a verbatim block's `src` parameter.
    VerbatimSrc,
}
77
78impl Verbatim {
79 /// Get the src parameter value if present
80 ///
81 /// The src parameter is commonly used for:
82 /// - Image sources: `:: image src=./diagram.png ::`
83 /// - File includes: `:: include src=./code.rs ::`
84 /// - External resources: `:: data src=./data.csv ::`
85 ///
86 /// # Returns
87 /// The value of the `src` parameter, or None if not present
88 ///
89 /// # Example
90 /// ```rust,ignore
91 /// if let Some(src) = verbatim.src_parameter() {
92 /// // Make src clickable in editor
93 /// println!("Link to: {}", src);
94 /// }
95 /// ```
96 pub fn src_parameter(&self) -> Option<&str> {
97 self.closing_data
98 .parameters
99 .iter()
100 .find(|p| p.key == "src")
101 .map(|p| p.value.as_str())
102 }
103}
104
105impl Session {
106 /// Find all links at any depth in this session
107 ///
108 /// This searches recursively through all content to find:
109 /// - URL references: `[https://example.com]`
110 /// - File references: `[./path/to/file.txt]`
111 /// - Verbatim src parameters: `src=./image.png`
112 ///
113 /// # Returns
114 /// Vector of all links found in this session and its descendants
115 ///
116 /// # Example
117 /// ```rust,ignore
118 /// let links = session.find_all_links();
119 /// for link in links {
120 /// println!("Found {} link: {}", link.link_type, link.target);
121 /// }
122 /// ```
123 pub fn find_all_links(&self) -> Vec<DocumentLink> {
124 use super::elements::content_item::ContentItem;
125 use super::traits::AstNode;
126
127 let mut links = Vec::new();
128
129 // Links in this session's title and every nested session's title.
130 //
131 // `Document::find_all_links` invokes us on the implicit root session
132 // (whose title is empty), so without the recursive sweep below we
133 // would silently drop every URL/File reference that appears in a
134 // section heading — `1. See [./handlers.lex] for details` and
135 // similar — even though paragraph-body refs were correctly found.
136 collect_text_content_links(&self.title, &mut links);
137 for nested in self.iter_sessions_recursive() {
138 collect_text_content_links(&nested.title, &mut links);
139 }
140
141 // Paragraphs (recursively into nested sessions).
142 for paragraph in self.iter_paragraphs_recursive() {
143 for line_item in ¶graph.lines {
144 if let ContentItem::TextLine(line) = line_item {
145 collect_text_content_links(&line.content, &mut links);
146 }
147 }
148 }
149
150 // Verbatim `src` parameters — these aren't bracketed inline references,
151 // so the verbatim's range stays as-is.
152 for (item, _depth) in self.iter_all_nodes_with_depth() {
153 if let ContentItem::VerbatimBlock(verbatim) = item {
154 if let Some(src) = verbatim.src_parameter() {
155 let link = DocumentLink::new(
156 verbatim.range().clone(),
157 src.to_string(),
158 LinkType::VerbatimSrc,
159 );
160 links.push(link);
161 }
162 }
163 }
164
165 links
166 }
167}
168
169impl Document {
170 /// Find all links in the entire document
171 ///
172 /// This searches the entire document tree to find all clickable links:
173 /// - URL references in text
174 /// - File references in text
175 /// - Verbatim block src parameters
176 ///
177 /// # Returns
178 /// Vector of all links found in the document
179 ///
180 /// # Example
181 /// ```rust,ignore
182 /// let doc = parse_document(source)?;
183 /// let links = doc.find_all_links();
184 /// for link in links {
185 /// // Make link clickable in LSP
186 /// send_document_link(link.range, link.target);
187 /// }
188 /// ```
189 pub fn find_all_links(&self) -> Vec<DocumentLink> {
190 let mut links = Vec::new();
191 if let Some(title) = &self.title {
192 collect_text_content_links(&title.content, &mut links);
193 }
194 links.extend(self.root.find_all_links());
195
196 // Reference lines (whole-element anchors and self-links) live outside
197 // the structural tree — they are removed from the line stream before
198 // parsing (see `crate::lex::anchoring`) and collected on the document.
199 // Each becomes a `DocumentLink` whose range is the *anchored span*
200 // (the head line for a whole-element anchor, the reference's own text
201 // for a self-link), so editors underline/navigate the anchored text
202 // rather than the bracketed reference.
203 for ref_line in self.reference_lines() {
204 collect_reference_line_link(ref_line, &mut links);
205 }
206
207 links
208 }
209}
210
211/// Emit a [`DocumentLink`] for a reference line (§2.3.2). Only link-like Url/File
212/// reference types become document links — Session/General reference types have
213/// no navigable Url/File target today, mirroring the inline collector which only
214/// surfaces Url/File. Marker-style types never reach here (they are not reference
215/// lines).
216///
217/// - [`ReferenceAnchor::WholeElement`]: range = the anchored head line
218/// (`anchor_range`); target = the reference's Url/File.
219/// - [`ReferenceAnchor::SelfLink`]: range = the reference's own bracketed text
220/// (`reference_range`); target = the reference's Url/File.
221fn collect_reference_line_link(ref_line: &ReferenceLine, out: &mut Vec<DocumentLink>) {
222 let (target, link_type) = match &ref_line.reference.reference_type {
223 ReferenceType::Url { target } => (target.clone(), LinkType::Url),
224 ReferenceType::File { target } => (target.clone(), LinkType::File),
225 _ => return,
226 };
227
228 // `anchor_range` / `reference_range` were produced via
229 // `SourceLocation::byte_range_to_ast_range`, whose columns are *byte*
230 // offsets within the line. The inline word-anchor path (and LSP's default
231 // `positionEncoding`) uses *UTF-16* columns. Normalize the reference-line
232 // ranges to UTF-16: keep the byte span and start column (the start sits at
233 // a known boundary — the anchor head or the `[`) and recompute the end
234 // column from the anchored text's UTF-16 width. For non-ASCII anchor or
235 // reference text this is the only correct mapping; for ASCII it is a no-op.
236 let (base, anchored_utf16) = match &ref_line.anchor {
237 ReferenceAnchor::WholeElement {
238 anchor_range,
239 anchor_text,
240 ..
241 } => (anchor_range, utf16_width(anchor_text)),
242 // Self-link covers the reference's own `[bracketed]` text:
243 // `[` + raw + `]`.
244 ReferenceAnchor::SelfLink => (
245 &ref_line.reference_range,
246 utf16_width("[") + utf16_width(&ref_line.reference.raw) + utf16_width("]"),
247 ),
248 };
249
250 let range = Range::new(
251 base.span.clone(),
252 base.start,
253 Position::new(base.start.line, base.start.column + anchored_utf16),
254 );
255
256 out.push(DocumentLink::new(range, target, link_type));
257}
258
259/// Walks `text`'s inline tree and pushes a [`DocumentLink`] for each URL and
260/// File reference, with a range covering exactly the `[bracketed]` reference.
261///
262/// LSP `textDocument/documentLink` ranges drive the clickable + visually
263/// underlined area in editors. Using the containing paragraph or title range
264/// would underline the whole element — which is exactly the bug this function
265/// is replacing.
266///
267/// The cursor work is delegated to the shared
268/// [`crate::lex::ast::inline_positions::walk_text_content_positions`] visitor;
269/// this function only contributes the link-shaping logic in `LinkCollector`.
270fn collect_text_content_links(text: &TextContent, out: &mut Vec<DocumentLink>) {
271 let mut collector = LinkCollector::new(out);
272 walk_text_content_positions(text, &mut collector);
273 // A `Following`-anchored reference defers its link until the next `Plain`
274 // node arrives. If the walk ends with one still pending (the reference was
275 // the last node, or only non-`Plain` nodes followed it), flush it now so
276 // the link is not lost — it falls back to the bracket range.
277 collector.flush();
278}
279
/// Visitor that emits one [`DocumentLink`] per URL/File reference. References
/// of any other type are ignored by `visit_reference` and never become
/// document links.
///
/// ## Range widening for inline word anchors (§2.3.1)
///
/// An *inline* URL/File reference (one that shares its line with other text)
/// anchors a single word — the word immediately *preceding* it (default) or,
/// when it is the first token on the line, the word immediately *following* it.
/// The anchor-resolution pass records that word (and direction) on
/// `ReferenceInline::word_anchor`. So that the link covers the *word* rather
/// than the `[bracketed]` text, the collector derives the word's source range
/// from the adjacent `Plain` node:
///
/// - `Preceding`: the word is looked up at the end of the most recently visited
///   `Plain` node (`last_plain`), which was visited before the reference. The
///   link is emitted immediately.
/// - `Following`: the word is looked up at the start of the *next* `Plain`
///   node, which is visited after the reference. The reference is parked in
///   `pending_following` and emitted when that `Plain` node arrives (or when
///   the collector is flushed).
///
/// The word stored on `WordAnchor` is *cleaned* (surrounding punctuation
/// trimmed) by the resolver; the cleaned substring is located inside the plain
/// text so the range covers exactly the word, not its surrounding punctuation.
/// If the word cannot be located in the adjacent plain text (e.g. it was
/// flattened across a formatting span, or the plain node contains escapes that
/// break the byte mapping), the link falls back to the `[bracketed]` range.
struct LinkCollector<'a> {
    /// Destination vector that every emitted link is appended to.
    out: &'a mut Vec<DocumentLink>,
    /// The most recently visited `Plain` node's range + text, used to resolve a
    /// `Preceding` word anchor (the word at the end of the text before a
    /// reference).
    last_plain: Option<PlainSpan>,
    /// A reference whose `Following` word anchor is waiting for the next `Plain`
    /// node to be visited so its start word can be located.
    pending_following: Option<PendingFollowing>,
}
317
/// A `Plain` inline node captured for word-anchor range computation: the node's
/// source range and the (unescaped) text the range covers.
struct PlainSpan {
    /// Source range of the plain node, as handed to `visit_plain`.
    range: Range,
    /// Owned copy of the text handed to `visit_plain` for that node.
    text: String,
}
324
/// A reference awaiting the next `Plain` node to resolve a `Following` anchor.
struct PendingFollowing {
    /// The cleaned anchor word to look for at the start of the next plain text.
    word: String,
    /// Target of the deferred link.
    target: String,
    /// Type of the deferred link.
    link_type: LinkType,
    /// The bracket-bounded fallback range, used if the word can't be located.
    bracket_range: Range,
}
333
334impl<'a> LinkCollector<'a> {
335 fn new(out: &'a mut Vec<DocumentLink>) -> Self {
336 Self {
337 out,
338 last_plain: None,
339 pending_following: None,
340 }
341 }
342
343 /// Bracket-bounded range of a reference: open marker start → close marker end.
344 fn bracket_range(open_marker: &Range, close_marker: &Range) -> Range {
345 Range::new(
346 open_marker.span.start..close_marker.span.end,
347 open_marker.start,
348 close_marker.end,
349 )
350 }
351
352 fn push(&mut self, range: Range, target: String, link_type: LinkType) {
353 self.out.push(DocumentLink::new(range, target, link_type));
354 }
355
356 /// Emit any reference still waiting on a following `Plain` node, falling back
357 /// to its bracket range (no following plain text was found to anchor in).
358 ///
359 /// Called at the end of the walk and whenever a *new* `Following` anchor
360 /// arrives while one is already pending — without this the earlier pending
361 /// link would be silently overwritten and lost.
362 fn flush(&mut self) {
363 if let Some(pending) = self.pending_following.take() {
364 self.push(pending.bracket_range, pending.target, pending.link_type);
365 }
366 }
367}
368
369impl<'a> InlinePositionVisitor for LinkCollector<'a> {
370 fn visit_plain(&mut self, range: &Range, text: &str) {
371 // Resolve any reference waiting on a following word first — this plain
372 // node is the text that follows it.
373 if let Some(pending) = self.pending_following.take() {
374 let plain = PlainSpan {
375 range: range.clone(),
376 text: text.to_string(),
377 };
378 let resolved = locate_word_range(&plain, &pending.word, WordEnd::Start)
379 .unwrap_or(pending.bracket_range);
380 self.push(resolved, pending.target, pending.link_type);
381 }
382 self.last_plain = Some(PlainSpan {
383 range: range.clone(),
384 text: text.to_string(),
385 });
386 }
387
388 fn visit_reference(
389 &mut self,
390 open_marker: &Range,
391 _content: &Range,
392 close_marker: &Range,
393 data: &ReferenceInline,
394 ) {
395 let (target, link_type) = match &data.reference_type {
396 ReferenceType::Url { target } => (target.clone(), LinkType::Url),
397 ReferenceType::File { target } => (target.clone(), LinkType::File),
398 _ => return,
399 };
400 let bracket_range = Self::bracket_range(open_marker, close_marker);
401
402 match &data.word_anchor {
403 // Inline reference anchoring the preceding word: resolve against the
404 // last plain node we visited.
405 Some(WordAnchor {
406 word,
407 direction: AnchorDirection::Preceding,
408 }) => {
409 let range = self
410 .last_plain
411 .as_ref()
412 .and_then(|plain| locate_word_range(plain, word, WordEnd::End))
413 .unwrap_or(bracket_range);
414 self.push(range, target, link_type);
415 }
416 // Inline reference anchoring the following word: defer until the
417 // next plain node is visited.
418 Some(WordAnchor {
419 word,
420 direction: AnchorDirection::Following,
421 }) => {
422 // Flush any earlier pending `Following` link first — two
423 // `Following` references back-to-back (before the next `Plain`
424 // node) would otherwise clobber the first.
425 self.flush();
426 self.pending_following = Some(PendingFollowing {
427 word: word.clone(),
428 target,
429 link_type,
430 bracket_range,
431 });
432 }
433 // No word anchor (reference lines are handled separately; a lone
434 // marker reference has no word). Fall back to the bracket range.
435 None => {
436 self.push(bracket_range, target, link_type);
437 }
438 }
439 }
440}
441
/// Which end of the plain text the anchored word sits at. Selects between
/// `last_token` and `first_token` inside `locate_word_range`.
#[derive(Clone, Copy)]
enum WordEnd {
    /// The word is the *last* whitespace-delimited token (preceding anchor).
    End,
    /// The word is the *first* whitespace-delimited token (following anchor).
    Start,
}
450
451/// Compute the source [`Range`] of `word` within `plain`, looking at the
452/// appropriate end of the plain text.
453///
454/// `word` is the *cleaned* anchor word (surrounding punctuation already trimmed
455/// by the resolver). We find the matching whitespace-delimited token at the
456/// requested end, then locate the cleaned word inside it so trailing/leading
457/// punctuation (`website,` → `website`) is excluded from the range.
458///
459/// Returns `None` (caller falls back to the bracket range) when the word can't
460/// be located — e.g. the anchor was flattened across a formatting span, so the
461/// adjacent plain node doesn't literally contain it.
462fn locate_word_range(plain: &PlainSpan, word: &str, end: WordEnd) -> Option<Range> {
463 let text = &plain.text;
464 // `text` is the *unescaped* plain text, but `plain.range.span` covers the
465 // *raw* source. When the run contains escapes (`\X`), the raw span is longer
466 // than the unescaped text, so byte offsets computed against `text` no longer
467 // map onto the raw span. Bail out (caller falls back to the bracket range)
468 // rather than emit a misplaced underline.
469 if plain.range.span.len() != text.len() {
470 return None;
471 }
472 // The token at the requested end, with its byte offset within `text`.
473 let token = match end {
474 WordEnd::End => last_token(text),
475 WordEnd::Start => first_token(text),
476 }?;
477 // Locate the cleaned word inside the token (punctuation trimmed). The token
478 // contains the word as a contiguous substring (cleaning only strips leading
479 // and trailing chars), so a single `find` recovers its offset.
480 let word_in_token = token.text.find(word)?;
481 let word_start = token.offset + word_in_token;
482 let word_end = word_start + word.len();
483
484 // Map byte offsets within the plain text to source coordinates. The plain
485 // node is single-line (inline parsing is per-line), so the source byte span
486 // is the plain node's span offset by these byte positions, and columns
487 // advance from the plain node's start column. This holds when the plain
488 // text's bytes line up 1:1 with the source (the common, escape-free case);
489 // if an escape shifted the bytes, `find` would still give a plausible
490 // offset but the column math could drift — acceptable since the worst case
491 // is a slightly-off underline, and callers can fall back to brackets.
492 let base = &plain.range;
493 let span = (base.span.start + word_start)..(base.span.start + word_end);
494 let start_col = base.start.column + utf16_width(&text[..word_start]);
495 let end_col = base.start.column + utf16_width(&text[..word_end]);
496 Some(Range::new(
497 span,
498 Position::new(base.start.line, start_col),
499 Position::new(base.start.line, end_col),
500 ))
501}
502
/// A whitespace-delimited token with its byte offset within the parent text.
struct Token<'a> {
    /// The token itself, borrowed from the parent text.
    text: &'a str,
    /// Byte offset of the token's first byte within the parent text.
    offset: usize,
}
508
509/// The last whitespace-delimited token of `text`, with its byte offset.
510fn last_token(text: &str) -> Option<Token<'_>> {
511 let tok = text.split_whitespace().next_back()?;
512 // `split_whitespace` doesn't give offsets; the last token ends at the last
513 // non-whitespace byte, so find it from the trimmed end.
514 let trimmed_end = text.trim_end().len();
515 let offset = trimmed_end - tok.len();
516 Some(Token { text: tok, offset })
517}
518
519/// The first whitespace-delimited token of `text`, with its byte offset.
520fn first_token(text: &str) -> Option<Token<'_>> {
521 let tok = text.split_whitespace().next()?;
522 let offset = text.len() - text.trim_start().len();
523 Some(Token { text: tok, offset })
524}
525
/// Number of UTF-16 code units needed to encode `s` — the column unit used by
/// the inline position walker (LSP default `positionEncoding`).
fn utf16_width(s: &str) -> usize {
    s.encode_utf16().count()
}
531
532#[cfg(test)]
533mod tests {
534 use super::*;
535 use crate::lex::parsing::parse_document;
536
537 #[test]
538 fn test_url_link_extraction() {
539 let source = "Check out [https://example.com] for more info.\n\n";
540 let doc = parse_document(source).unwrap();
541
542 let links = doc.find_all_links();
543
544 assert_eq!(links.len(), 1);
545 assert_eq!(links[0].link_type, LinkType::Url);
546 assert_eq!(links[0].target, "https://example.com");
547 }
548
549 #[test]
550 fn test_file_link_extraction() {
551 let source = "See [./README.md] for details.\n\n";
552 let doc = parse_document(source).unwrap();
553
554 let links = doc.find_all_links();
555
556 assert_eq!(links.len(), 1);
557 assert_eq!(links[0].link_type, LinkType::File);
558 assert_eq!(links[0].target, "./README.md");
559 }
560
561 #[test]
562 fn test_multiple_links() {
563 let source = "Visit [https://example.com] and check [./docs.md].\n\n";
564 let doc = parse_document(source).unwrap();
565
566 let links = doc.find_all_links();
567
568 assert_eq!(links.len(), 2);
569 assert!(links.iter().any(|l| l.link_type == LinkType::Url));
570 assert!(links.iter().any(|l| l.link_type == LinkType::File));
571 }
572
573 #[test]
574 fn test_verbatim_src_parameter() {
575 let source =
576 "Sunset Photo:\n As the sun sets over the ocean.\n:: image src=./diagram.png ::\n\n";
577 let doc = parse_document(source).unwrap();
578
579 let links = doc.find_all_links();
580
581 // Find verbatim src link
582 let src_links: Vec<_> = links
583 .iter()
584 .filter(|l| l.link_type == LinkType::VerbatimSrc)
585 .collect();
586 assert_eq!(
587 src_links.len(),
588 1,
589 "Expected 1 verbatim src link, found {}. All links: {:?}",
590 src_links.len(),
591 links
592 );
593 assert_eq!(src_links[0].target, "./diagram.png");
594 }
595
596 #[test]
597 fn test_verbatim_src_parameter_method() {
598 use super::super::elements::{Data, Label, Parameter};
599
600 let verbatim = Verbatim::with_subject(
601 "Test".to_string(),
602 Data::new(
603 Label::new("image".to_string()),
604 vec![Parameter::new("src".to_string(), "./test.png".to_string())],
605 ),
606 );
607
608 assert_eq!(verbatim.src_parameter(), Some("./test.png"));
609
610 // Test verbatim without src parameter
611 let verbatim_no_src = Verbatim::with_subject(
612 "Test".to_string(),
613 Data::new(Label::new("code".to_string()), vec![]),
614 );
615
616 assert_eq!(verbatim_no_src.src_parameter(), None);
617 }
618
619 #[test]
620 fn test_no_links() {
621 let source = "Just plain text with no links.\n\n";
622 let doc = parse_document(source).unwrap();
623
624 let links = doc.find_all_links();
625
626 assert_eq!(links.len(), 0);
627 }
628
629 #[test]
630 fn test_footnote_not_a_link() {
631 let source = "Text with footnote [42].\n\n";
632 let doc = parse_document(source).unwrap();
633
634 let links = doc.find_all_links();
635
636 // Footnote references are not clickable links
637 assert_eq!(links.len(), 0);
638 }
639
640 #[test]
641 fn test_nested_session_links() {
642 let source = "Outer Session\n\n Inner session with [https://example.com].\n\n";
643 let doc = parse_document(source).unwrap();
644
645 let links = doc.find_all_links();
646
647 // Should find link in nested session
648 assert_eq!(links.len(), 1);
649 assert_eq!(links[0].target, "https://example.com");
650 }
651
652 // -----------------------------------------------------------------------
653 // Range-precision tests (inline word anchors, §2.3.1)
654 //
655 // The LSP `textDocument/documentLink` response uses each link's `range`
656 // to decide what is clickable and what gets the link decoration in the
657 // editor. Editors (notably VSCode) render the entire range as an
658 // underlined link.
659 //
660 // An *inline* reference (one that shares its line with other text) anchors
661 // a single word — the word immediately *preceding* it by default. So the
662 // link range covers that anchored word, not the `[bracketed]` reference and
663 // not the surrounding paragraph. This is the PR-C widening: editors now
664 // underline/navigate the word, matching how the reference renders.
665 // -----------------------------------------------------------------------
666
667 use super::super::range::Position;
668
669 #[test]
670 fn test_url_link_range_covers_preceding_word_in_paragraph() {
671 // "Check out [https://example.com] for more info."
672 // 0123456789^
673 // The reference shares its line with text, so it anchors the preceding
674 // word "out" (bytes 6..9), not the brackets.
675 let source = "Check out [https://example.com] for more info.\n\n";
676 let doc = parse_document(source).unwrap();
677 let links = doc.find_all_links();
678
679 assert_eq!(links.len(), 1);
680 let link = &links[0];
681 assert_eq!(link.target, "https://example.com");
682
683 let captured = &source[link.range.span.clone()];
684 assert_eq!(
685 link.range.span,
686 6..9,
687 "inline link range must cover the anchored word 'out'. Captured: {captured:?}"
688 );
689 assert_eq!(captured, "out");
690 assert_eq!(link.range.start, Position::new(0, 6));
691 assert_eq!(link.range.end, Position::new(0, 9));
692 }
693
694 #[test]
695 fn test_file_link_range_covers_preceding_word_in_paragraph() {
696 // "See [./README.md] for details." → anchors the preceding word "See".
697 let source = "See [./README.md] for details.\n\n";
698 let doc = parse_document(source).unwrap();
699 let links = doc.find_all_links();
700
701 assert_eq!(links.len(), 1);
702 let link = &links[0];
703 assert_eq!(link.target, "./README.md");
704
705 let captured = &source[link.range.span.clone()];
706 assert_eq!(
707 link.range.span,
708 0..3,
709 "inline link range must cover the anchored word 'See'. Captured: {captured:?}"
710 );
711 assert_eq!(captured, "See");
712 assert_eq!(link.range.start, Position::new(0, 0));
713 assert_eq!(link.range.end, Position::new(0, 3));
714 }
715
716 #[test]
717 fn test_following_word_anchor_range() {
718 // First-on-line reference anchors the *following* word "is".
719 // "[https://lex.ing] is the home page."
720 let source = "[https://lex.ing] is the home page.\n\n";
721 let doc = parse_document(source).unwrap();
722 let links = doc.find_all_links();
723
724 assert_eq!(links.len(), 1);
725 let link = &links[0];
726 assert_eq!(link.target, "https://lex.ing");
727
728 let captured = &source[link.range.span.clone()];
729 assert_eq!(captured, "is", "following-anchor link must cover 'is'");
730 let is_start = source.find("is").unwrap();
731 assert_eq!(link.range.span, is_start..is_start + 2);
732 }
733
734 #[test]
735 fn test_word_anchor_excludes_trailing_punctuation() {
736 // The preceding token is "website," but the anchor word is "website"
737 // (punctuation trimmed), so the range must exclude the comma.
738 let source = "the project website, [https://x.example] is fast.\n\n";
739 let doc = parse_document(source).unwrap();
740 let links = doc.find_all_links();
741
742 assert_eq!(links.len(), 1);
743 let captured = &source[links[0].range.span.clone()];
744 assert_eq!(captured, "website", "range must exclude the trailing comma");
745 }
746
747 #[test]
748 fn test_abutting_word_anchor_range() {
749 // "Hello[./file.txt] World" → abutting preceding word "Hello".
750 let source = "Hello[./file.txt] World\n\n";
751 let doc = parse_document(source).unwrap();
752 let links = doc.find_all_links();
753
754 assert_eq!(links.len(), 1);
755 let captured = &source[links[0].range.span.clone()];
756 assert_eq!(captured, "Hello");
757 }
758
759 #[test]
760 fn test_multiple_links_anchor_distinct_words() {
761 // "Visit [https://example.com] and check [./docs.md]."
762 // URL anchors "Visit", file anchors "check".
763 let source = "Visit [https://example.com] and check [./docs.md].\n\n";
764 let doc = parse_document(source).unwrap();
765 let links = doc.find_all_links();
766
767 assert_eq!(links.len(), 2);
768
769 let url = links
770 .iter()
771 .find(|l| l.link_type == LinkType::Url)
772 .expect("url link");
773 let file = links
774 .iter()
775 .find(|l| l.link_type == LinkType::File)
776 .expect("file link");
777
778 assert_eq!(&source[url.range.span.clone()], "Visit");
779 assert_eq!(&source[file.range.span.clone()], "check");
780 }
781
782 #[test]
783 fn test_long_paragraph_with_single_file_ref_anchors_only_the_word() {
784 // Reproduces the dodot architecture.lex case: a long paragraph that
785 // contains a single file reference. The link's range covers only the
786 // anchored word "see", never the whole paragraph.
787 let source = "\
788This document describes how dodot is organized. It is the conceptual view. \
789For concrete types, crate layout, and trait signatures, see [./types.lex].\n\n";
790 let doc = parse_document(source).unwrap();
791 let links = doc.find_all_links();
792
793 assert_eq!(links.len(), 1);
794 let link = &links[0];
795 assert_eq!(link.target, "./types.lex");
796
797 let captured = &source[link.range.span.clone()];
798 assert_eq!(
799 captured, "see",
800 "inline link range must cover only the anchored word, not the paragraph"
801 );
802 }
803
804 // -----------------------------------------------------------------------
805 // Nested-session title coverage
806 //
807 // `Session::find_all_links` originally only inspected `self.title`, while
808 // `Document::find_all_links` calls it on the implicit root session whose
809 // title is empty. Paragraph traversal recurses into nested sessions, but
810 // nested-session *titles* never get scanned. So URL/File refs that appear
811 // in a section heading like
812 //
813 // 1. See [./handlers.lex] for the phase list
814 //
815 // (body)
816 //
817 // were silently dropped from the LSP `documentLink` response, and editors
818 // had no clickable surface on the heading.
819 // -----------------------------------------------------------------------
820
821 #[test]
822 fn test_file_ref_in_nested_session_title_produces_link() {
823 // "Doc title" + blank + indent → outer session whose title is
824 // "Doc title". Then the indented "See [./other.lex] for details"
825 // line, followed by a blank and a deeper indent, becomes a *nested*
826 // session whose title contains a file reference.
827 let source =
828 "Doc title\n\n See [./other.lex] for details\n\n nested content here.\n\n";
829 let doc = parse_document(source).unwrap();
830 let links = doc.find_all_links();
831
832 assert_eq!(
833 links.len(),
834 1,
835 "expected one link for the file ref in the nested-session title; got {links:?}"
836 );
837 let link = &links[0];
838 assert_eq!(link.target, "./other.lex");
839 assert_eq!(link.link_type, LinkType::File);
840
841 // Inline reference in the title → anchors the preceding word "See".
842 assert_eq!(
843 &source[link.range.span.clone()],
844 "See",
845 "nested-session title link anchors the preceding word"
846 );
847 }
848
849 #[test]
850 fn test_url_ref_in_nested_session_title_produces_link() {
851 let source = "Doc title\n\n Visit [https://example.com] today\n\n body line.\n\n";
852 let doc = parse_document(source).unwrap();
853 let links = doc.find_all_links();
854
855 assert_eq!(links.len(), 1);
856 let link = &links[0];
857 assert_eq!(link.target, "https://example.com");
858 assert_eq!(link.link_type, LinkType::Url);
859
860 // Inline reference in the title → anchors the preceding word "Visit".
861 assert_eq!(&source[link.range.span.clone()], "Visit");
862 }
863
864 #[test]
865 fn test_refs_in_both_outer_and_nested_session_titles_produce_links() {
866 // The outer title also contains a file reference, so both the outer
867 // and nested titles should each contribute one link, distinct from
868 // any links found in paragraphs.
869 let source = "\
870Top [./top.lex] section
871
872 Inner [./inner.lex] subsection
873
874 See also [./body.lex] in the body.
875";
876 let doc = parse_document(source).unwrap();
877 let links = doc.find_all_links();
878
879 assert_eq!(
880 links.len(),
881 3,
882 "expected three links (outer-title, inner-title, body); got {links:?}"
883 );
884 let targets: Vec<&str> = links.iter().map(|l| l.target.as_str()).collect();
885 assert!(targets.contains(&"./top.lex"));
886 assert!(targets.contains(&"./inner.lex"));
887 assert!(targets.contains(&"./body.lex"));
888 }
889
890 // -----------------------------------------------------------------------
891 // Reference lines: whole-element anchors and self-links (§2.3.2)
892 //
893 // A reference line (`[ref]` alone on its line) is removed from the
894 // structural stream and collected on the document. PR C surfaces it as a
895 // `DocumentLink` whose range is the *anchored span*: the head line of the
896 // element above (whole-element anchor), or the reference's own text when
897 // there is no content line above (self-link). These prove the LSP emits a
898 // standard range + target — the editor needs no special handling.
899 // -----------------------------------------------------------------------
900
901 #[test]
902 fn test_reference_line_whole_element_anchors_session_title() {
903 // The reference line anchors the entire session title "Getting Started".
904 let source = "Getting Started\n[./readme.txt]\n\n Welcome to the docs.\n\n";
905 let doc = parse_document(source).unwrap();
906 let links = doc.find_all_links();
907
908 assert_eq!(links.len(), 1, "one whole-element link; got {links:?}");
909 let link = &links[0];
910 assert_eq!(link.target, "./readme.txt");
911 assert_eq!(link.link_type, LinkType::File);
912
913 // Range covers the head line, not the `[./readme.txt]` reference line.
914 assert_eq!(
915 &source[link.range.span.clone()],
916 "Getting Started",
917 "whole-element link must cover the anchored head line"
918 );
919 // Positions point at the title line (line 0), full width.
920 assert_eq!(link.range.start, Position::new(0, 0));
921 assert_eq!(link.range.end, Position::new(0, "Getting Started".len()));
922 }
923
924 #[test]
925 fn test_reference_line_whole_element_anchors_list_item() {
926 // Anchors the whole "Water" list item; the `- ` marker is excluded.
927 let source = "- Food\n- Water\n[https://water.example]\n- Bread\n\n";
928 let doc = parse_document(source).unwrap();
929 let links = doc.find_all_links();
930
931 assert_eq!(links.len(), 1);
932 let link = &links[0];
933 assert_eq!(link.target, "https://water.example");
934 assert_eq!(link.link_type, LinkType::Url);
935 assert_eq!(
936 &source[link.range.span.clone()],
937 "Water",
938 "list-item anchor excludes the `- ` marker"
939 );
940 }
941
942 #[test]
943 fn test_reference_line_whole_element_anchors_definition_subject() {
944 // Anchors the definition term "API Endpoint"; the trailing `:` excluded.
945 let source =
946 "API Endpoint:\n[./endpoint.txt]\n A URL that provides access to a resource.\n\n";
947 let doc = parse_document(source).unwrap();
948 let links = doc.find_all_links();
949
950 assert_eq!(links.len(), 1);
951 let link = &links[0];
952 assert_eq!(link.target, "./endpoint.txt");
953 assert_eq!(
954 &source[link.range.span.clone()],
955 "API Endpoint",
956 "subject anchor excludes the trailing colon"
957 );
958 }
959
960 #[test]
961 fn test_reference_line_self_link_range_is_the_reference_text() {
962 // No content line directly above (blank line above) → self-link. The
963 // link covers the reference's own `[bracketed]` text.
964 let source = "See the upstream project:\n\n[https://github.com/lex-fmt/lex]\n\n";
965 let doc = parse_document(source).unwrap();
966 let links = doc.find_all_links();
967
968 assert_eq!(links.len(), 1, "one self-link; got {links:?}");
969 let link = &links[0];
970 assert_eq!(link.target, "https://github.com/lex-fmt/lex");
971 assert_eq!(
972 &source[link.range.span.clone()],
973 "[https://github.com/lex-fmt/lex]",
974 "self-link covers the reference's own bracketed text"
975 );
976 }
977
978 #[test]
979 fn test_reference_line_self_link_at_start_of_document() {
980 // First line of the document → no content above → self-link.
981 let source = "[https://lex.ing]\n\n";
982 let doc = parse_document(source).unwrap();
983 let links = doc.find_all_links();
984
985 assert_eq!(links.len(), 1);
986 assert_eq!(
987 &source[links[0].range.span.clone()],
988 "[https://lex.ing]",
989 "self-link covers the reference's own bracketed text"
990 );
991 }
992
993 // -----------------------------------------------------------------------
994 // Finding #1: pending `Following` link must never be silently dropped.
995 //
996 // A `Following`-anchored reference defers its link until the next `Plain`
997 // node. If that never arrives (the walk ends, or only non-`Plain` nodes
998 // follow) the collector must still flush the pending link, falling back to
999 // the bracket range. Likewise, two `Following` refs back-to-back must each
1000 // emit (the first is flushed before the second becomes pending).
1001 // -----------------------------------------------------------------------
1002
1003 #[test]
1004 fn test_following_anchor_followed_by_only_non_plain_nodes_still_emits() {
1005 // The reference is first on its line (→ `Following` anchor) and the only
1006 // node after it is inline code, which the walker reports via
1007 // `visit_code` — never `visit_plain`. So the pending link is never
1008 // resolved during the walk. Without the end-of-walk `flush()` this link
1009 // is silently dropped (0 links); with it, the link still emits, falling
1010 // back to the bracket range.
1011 let source = "[https://a.example]`code`\n\n";
1012 let doc = parse_document(source).unwrap();
1013 let links = doc.find_all_links();
1014
1015 assert_eq!(
1016 links.len(),
1017 1,
1018 "following-anchor link followed only by non-plain nodes must still \
1019 emit (bracket-range fallback); got {links:?}"
1020 );
1021 let link = &links[0];
1022 assert_eq!(link.target, "https://a.example");
1023 // Bracket-range fallback: covers the `[https://a.example]` text.
1024 assert_eq!(&source[link.range.span.clone()], "[https://a.example]");
1025 }
1026
1027 #[test]
1028 fn test_two_following_anchors_in_a_row_both_emit() {
1029 // Two first-on-line references with no plain text between them. The
1030 // first becomes pending, then the second arrives: the first must be
1031 // flushed (bracket fallback) before the second is stored, so both emit.
1032 let source = "[https://a.example][https://b.example] tail\n\n";
1033 let doc = parse_document(source).unwrap();
1034 let links = doc.find_all_links();
1035
1036 assert_eq!(
1037 links.len(),
1038 2,
1039 "both back-to-back following-anchor refs must emit; got {links:?}"
1040 );
1041 let targets: Vec<&str> = links.iter().map(|l| l.target.as_str()).collect();
1042 assert!(targets.contains(&"https://a.example"));
1043 assert!(targets.contains(&"https://b.example"));
1044 }
1045
1046 // -----------------------------------------------------------------------
1047 // Finding #2: reference-line ranges must use UTF-16 columns, matching the
1048 // inline word-anchor path and LSP's default `positionEncoding`.
1049 //
1050 // The reference-line ranges arrive as byte columns (built via
1051 // `SourceLocation::byte_range_to_ast_range`). For non-ASCII anchor /
1052 // reference text, a byte column overshoots the UTF-16 column an editor
1053 // expects, misplacing the link decoration. The collector normalizes them.
1054 // -----------------------------------------------------------------------
1055
1056 #[test]
1057 fn test_reference_line_whole_element_end_column_is_utf16() {
1058 // Title contains a multi-byte char "é" (2 UTF-8 bytes, 1 UTF-16 unit).
1059 // "Café Menu" is 9 chars / 9 UTF-16 units, but 10 UTF-8 bytes.
1060 let source = "Café Menu\n[./menu.txt]\n\n Today's specials.\n\n";
1061 let doc = parse_document(source).unwrap();
1062 let links = doc.find_all_links();
1063
1064 assert_eq!(links.len(), 1, "got {links:?}");
1065 let link = &links[0];
1066 assert_eq!(link.target, "./menu.txt");
1067 assert_eq!(
1068 &source[link.range.span.clone()],
1069 "Café Menu",
1070 "byte span still covers the whole anchored title"
1071 );
1072 // End column counts UTF-16 units (9), not UTF-8 bytes (10).
1073 assert_eq!(link.range.start, Position::new(0, 0));
1074 assert_eq!(
1075 link.range.end,
1076 Position::new(0, 9),
1077 "end column must be the UTF-16 width of the anchor, not its byte length"
1078 );
1079 }
1080
1081 #[test]
1082 fn test_reference_line_self_link_end_column_is_utf16() {
1083 // Self-link whose URL contains a multi-byte char. "[https://café.example]"
1084 // is 22 chars / 22 UTF-16 units but 23 UTF-8 bytes (é = 2 bytes).
1085 let source = "[https://café.example]\n\n";
1086 let doc = parse_document(source).unwrap();
1087 let links = doc.find_all_links();
1088
1089 assert_eq!(links.len(), 1, "got {links:?}");
1090 let link = &links[0];
1091 assert_eq!(link.target, "https://café.example");
1092 assert_eq!(
1093 &source[link.range.span.clone()],
1094 "[https://café.example]",
1095 "byte span still covers the bracketed reference"
1096 );
1097 assert_eq!(link.range.start, Position::new(0, 0));
1098 assert_eq!(
1099 link.range.end,
1100 Position::new(0, 22),
1101 "self-link end column must be the UTF-16 width of `[` + raw + `]`"
1102 );
1103 }
1104
1105 // -----------------------------------------------------------------------
1106 // Finding #3: `locate_word_range` must not mis-map when the anchored plain
1107 // text run contains an escape. The unescaped `PlainSpan.text` is shorter
1108 // than its raw `range.span`, so byte offsets into the text don't line up
1109 // with the raw source. The guard returns `None` → bracket-range fallback.
1110 // -----------------------------------------------------------------------
1111
1112 #[test]
1113 fn test_escaped_char_in_anchored_word_falls_back_to_bracket_range() {
1114 // The preceding plain run "a\\*b " contains an escape (`\*` → `*`), so
1115 // the unescaped text ("a*b ") is one byte shorter than the raw span.
1116 // The collector must fall back to the bracket range rather than emit a
1117 // misplaced underline.
1118 let source = "a\\*b [https://x.example] tail\n\n";
1119 let doc = parse_document(source).unwrap();
1120 let links = doc.find_all_links();
1121
1122 assert_eq!(links.len(), 1, "got {links:?}");
1123 let link = &links[0];
1124 assert_eq!(link.target, "https://x.example");
1125 assert_eq!(
1126 &source[link.range.span.clone()],
1127 "[https://x.example]",
1128 "escaped anchored run must fall back to the bracket range"
1129 );
1130 }
1131
1132 #[test]
1133 fn test_marker_reference_line_is_not_a_document_link() {
1134 // A footnote on its own line is a marker-style reference: not a
1135 // reference line and not a document link.
1136 let source = "Some claim.\n[42]\n\n:: 42 :: A footnote.\n\n";
1137 let doc = parse_document(source).unwrap();
1138 let links = doc.find_all_links();
1139 assert!(
1140 links.is_empty(),
1141 "marker-style references are not document links: {links:?}"
1142 );
1143 }
1144}