lex-core 0.16.0

Parser library for the lex format
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
//! Reference-line extraction and whole-element anchor resolution.
//!
//! Implements §2.3.2–§2.3.4 of the references spec
//! (`comms/specs/elements/inlines.docs/specs/references/references-general.lex`).
//!
//! # What a reference line is
//!
//! A *reference line* is a physical source line whose only content, after
//! stripping leading indentation, is a single bracketed reference — the line is
//! exactly `[<inner>]` with nothing else. When `<inner>` classifies to a
//! *link-like* reference type (`Url`, `File`, `Session`, `General`; see
//! [`ReferenceType::anchoring`]) the line takes a **whole-element anchor**: it
//! anchors the entire head line of the element directly above it. A
//! marker-style reference (`[1]`, `[@key]`, `[::label]`) on its own line is
//! *not* a reference line for anchoring purposes — it self-links / resolves as
//! usual, exactly like an inline marker.
//!
//! # Why this is a pre-pass
//!
//! A reference line is *transparent* to structural parsing (§2.3.3): it is
//! neither a content line nor a blank line. Its tokens are removed from the
//! token stream **before** structure is resolved, so the lines around it keep
//! their original adjacency. This matters because a blank line after a subject
//! is exactly what separates a *definition* from a *session*:
//!
//! ```text
//! API Endpoint:
//! [./endpoint.txt]
//!     A URL that provides access to a resource.
//! ```
//!
//! Removing (not blanking) the reference line keeps `API Endpoint:` immediately
//! adjacent to its indented body, so it stays a **definition** and the
//! reference line anchors the term "API Endpoint". Blanking it would wrongly
//! turn it into a session.
//!
//! To make removal a token-stream operation (rather than a source-string edit
//! that would shift every downstream byte offset), the pre-pass reports the
//! **byte range of each removed reference line** — the line plus its terminating
//! newline. The caller lexes the *original* source and drops every token whose
//! range falls inside a removed line before parsing. The newline is included so
//! the content line above and the line below become directly adjacent.
//!
//! # Coordinates
//!
//! Nothing here edits the source string, and the caller parses the original
//! source with token ranges intact, so **every AST range stays in
//! original-source coordinates** — including elements that appear *after* a
//! reference line. Every [`Range`] this module produces is likewise in
//! original-source coordinates. That is what the document the user edits still
//! contains, which is what editors (LSP `documentLink`) and serializers need.

use crate::lex::ast::anchoring::{AnchoredElement, ReferenceAnchor, ReferenceLine};
use crate::lex::ast::diagnostics::{Diagnostic, DiagnosticSeverity};
use crate::lex::ast::range::SourceLocation;
use crate::lex::inlines::{
    determine_reference_type, parse_inlines, AnchorDirection, AnchorKind, InlineNode,
    ReferenceInline, WordAnchor,
};
use std::ops::Range;

/// Result of the reference-line pre-pass.
///
/// Produced by [`extract_reference_lines`]. It bundles the three things the
/// pre-pass computes over a source string: which byte ranges to drop from the
/// token stream, the resolved reference lines themselves, and any warnings
/// raised while resolving them.
pub struct AnchoringPrepass {
    /// Byte ranges (in original-source coordinates) of every removed reference
    /// line — each covers the line *plus its terminating newline*. The caller
    /// drops every token whose range falls inside one of these ranges from the
    /// token stream before parsing, which keeps all surviving tokens (and thus
    /// all AST node ranges) in original-source coordinates.
    pub removed_line_ranges: Vec<Range<usize>>,
    /// Resolved reference lines, in source order. Each entry carries the
    /// reference, its range, and its anchor (whole-element or self-link).
    pub reference_lines: Vec<ReferenceLine>,
    /// Overlap / stacking warnings (§2.3.3).
    pub diagnostics: Vec<Diagnostic>,
}

impl AnchoringPrepass {
    /// Whether the pre-pass found no reference line at all.
    ///
    /// When this is `true` there is nothing to remove, so the caller can skip
    /// token filtering altogether.
    pub fn is_empty(&self) -> bool {
        self.removed_line_ranges.is_empty()
    }

    /// Remove the tokens that belong to a removed reference line.
    ///
    /// A token is considered part of a removed line when its *start* offset
    /// lies in that line's half-open `[start, end)` byte range. Because the
    /// range includes the terminating newline, the line's own
    /// `BlankLine`/newline token goes too — which is what leaves the content
    /// lines on either side directly adjacent instead of separated by a blank
    /// line. Surviving tokens are returned in their original order with their
    /// ranges untouched, so they remain in original-source coordinates.
    pub fn filter_tokens<T>(&self, tokens: Vec<(T, Range<usize>)>) -> Vec<(T, Range<usize>)> {
        if self.is_empty() {
            return tokens;
        }

        let starts_in_removed_line = |offset: usize| {
            self.removed_line_ranges
                .iter()
                .any(|line| line.start <= offset && offset < line.end)
        };

        let mut kept = tokens;
        kept.retain(|(_, span)| !starts_in_removed_line(span.start));
        kept
    }
}

/// One physical line of the source, together with where it sits in the
/// original byte stream.
struct PhysicalLine<'a> {
    /// Byte offset of the line's first character (the byte right after the
    /// previous `\n`, or 0 for the first line).
    start: usize,
    /// Byte offset just past the line's terminating `\n`. For a final line
    /// with no trailing newline this equals the end of the line's content.
    end: usize,
    /// The line's text without the trailing newline.
    text: &'a str,
}

impl PhysicalLine<'_> {
    /// The line's text with leading and trailing whitespace removed.
    fn trimmed(&self) -> &str {
        self.text.trim_start().trim_end()
    }

    /// Whether the line consists solely of whitespace (or is empty).
    fn is_blank(&self) -> bool {
        self.text.chars().all(char::is_whitespace)
    }
}

/// Break `source` into its physical lines, recording each line's byte bounds.
///
/// Lines are delimited by `\n`. Each returned line's `start..end` range
/// includes the terminating newline (when there is one) while its `text` does
/// not, and consecutive lines tile the source with no gaps. An empty source
/// yields no lines.
fn physical_lines(source: &str) -> Vec<PhysicalLine<'_>> {
    // Running byte offset of the next line's first character.
    let mut offset = 0;
    source
        .split_inclusive('\n')
        .map(|raw| {
            let start = offset;
            offset += raw.len();
            let text = match raw.strip_suffix('\n') {
                Some(without_newline) => without_newline,
                None => raw,
            };
            PhysicalLine {
                start,
                end: offset,
                text,
            }
        })
        .collect()
}

/// Extract the inner content of a line that is exactly one bracketed reference.
///
/// `trimmed` qualifies when it has the shape `[<inner>]`, where `<inner>` is
/// non-empty and contains no further `[` or `]`. In that case the inner text is
/// returned verbatim — it is deliberately *not* trimmed, so byte arithmetic on
/// the surrounding line stays exact. Anything else yields `None`.
fn bracketed_inner(trimmed: &str) -> Option<&str> {
    let after_open = trimmed.strip_prefix('[')?;
    let inner = after_open.strip_suffix(']')?;
    let has_extra_bracket = inner.contains(|c: char| c == '[' || c == ']');
    (!inner.is_empty() && !has_extra_bracket).then_some(inner)
}

/// Compute the anchor head-line for a content line, applying the
/// element-specific head-line rules (§2.3.2):
/// - list item: the leading list marker (`- `, `1. `, `a) `, …) and the
///   whitespace after it are excluded from the anchor.
/// - definition / session subject: the trailing `:` marker (and any whitespace
///   before it) is excluded from the anchor.
/// - anything else: the whole line, trimmed of surrounding whitespace.
///
/// Returns a [`HeadLine`] whose `text` is the anchor text, whose
/// `start..end` byte range is in original-source coordinates and covers
/// exactly that text, and whose `element` records which rule applied.
fn head_line_anchor(line: &PhysicalLine<'_>) -> HeadLine {
    // Strip indentation first so the anchor's start offset can be derived from
    // how many bytes were dropped on the left.
    let unindented = line.text.trim_start();
    let indent_len = line.text.len() - unindented.len();

    let mut start = line.start + indent_len;
    let mut body = unindented.trim_end();
    let mut element = AnchoredElement::WholeLine;

    if let Some(marker_len) = list_marker_len(body) {
        // List item: skip the marker and the whitespace that follows it.
        let rest = body[marker_len..].trim_start();
        start += body.len() - rest.len();
        body = rest;
        element = AnchoredElement::ListItem;
    } else if let Some(subject) = body.strip_suffix(':') {
        // Subject line: drop one trailing colon plus any whitespace before it.
        // This rule is deliberately skipped for list items — `- Note:` keeps
        // its literal text `Note:`.
        body = subject.trim_end();
        element = AnchoredElement::Subject;
    }

    HeadLine {
        text: body.to_string(),
        start,
        end: start + body.len(),
        element,
    }
}

/// The anchorable portion of an element's head line, as computed by
/// [`head_line_anchor`].
struct HeadLine {
    /// The anchor text: the head line with indentation, any list marker, and
    /// any trailing subject `:` removed.
    text: String,
    /// Byte offset (original-source coordinates) where `text` begins.
    start: usize,
    /// Byte offset (original-source coordinates) just past the end of `text`.
    end: usize,
    /// Which head-line rule applied: list item, subject, or plain whole line.
    element: AnchoredElement,
}

/// Byte length of the list marker that opens `body`, if there is one.
///
/// Two marker families are recognised:
/// - unordered: a single `-`, `*`, or `+` that is followed by whitespace;
/// - ordered: a run of ASCII letters/digits terminated by `.` or `)` (covers
///   `1.`, `a)`, `IV.`) that is either followed by whitespace or ends `body`.
///
/// The returned length covers the marker itself, including its terminating
/// punctuation, but not the whitespace after it. Inputs such as `-5` and
/// `note:` are not markers and yield `None`.
fn list_marker_len(body: &str) -> Option<usize> {
    let first = body.chars().next()?;
    let followed_by_whitespace = |at: usize| body[at..].starts_with(char::is_whitespace);

    // Unordered bullet: must be separated from the item text by whitespace.
    if matches!(first, '-' | '*' | '+') {
        let bullet_len = first.len_utf8();
        return followed_by_whitespace(bullet_len).then_some(bullet_len);
    }

    // Ordered marker: measure the leading run of ASCII alphanumerics.
    let seq_len = body
        .find(|c: char| !c.is_ascii_alphanumeric())
        .unwrap_or(body.len());
    if seq_len == 0 {
        return None;
    }

    let terminator = body[seq_len..].chars().next()?;
    if !matches!(terminator, '.' | ')') {
        return None;
    }

    let marker_len = seq_len + terminator.len_utf8();
    (marker_len == body.len() || followed_by_whitespace(marker_len)).then_some(marker_len)
}

/// Run the reference-line pre-pass over `source`.
///
/// Every physical line whose trimmed content is a single bracketed reference
/// of a whole-line-capable type is treated as a *reference line*: it is
/// recorded in the result's `reference_lines` and its byte range (line plus
/// terminating newline) is added to `removed_line_ranges`. Marker-style
/// references on their own line are left alone.
///
/// Each reference line anchors the head line of the nearest surviving line
/// directly above it (earlier reference lines are looked through); if that
/// line is blank, or there is none, the reference line self-links. Stacked
/// reference lines and head lines that already carry an inline reference
/// produce warnings in `diagnostics`.
///
/// All ranges in the returned [`AnchoringPrepass`] are in original-source
/// coordinates; `source` itself is never edited.
pub fn extract_reference_lines(source: &str) -> AnchoringPrepass {
    let lines = physical_lines(source);
    let loc = SourceLocation::new(source);

    let mut reference_lines: Vec<ReferenceLine> = Vec::new();
    let mut diagnostics: Vec<Diagnostic> = Vec::new();

    // Which line indices are reference lines. Used both to look through
    // earlier reference lines when searching upward for an anchor target and,
    // at the end, to report the byte ranges whose tokens the caller must drop.
    let mut removed: Vec<bool> = vec![false; lines.len()];

    // Track, per source line index, whether an element head line at that index
    // has already been claimed by a reference line — so a second (stacked)
    // reference line over the same element is flagged.
    let mut anchored_line: Vec<bool> = vec![false; lines.len()];

    for (idx, line) in lines.iter().enumerate() {
        let trimmed = line.trimmed();
        let Some(inner) = bracketed_inner(trimmed) else {
            continue;
        };
        let reference_type = determine_reference_type(inner);

        // Marker-style references on their own line are not reference lines for
        // anchoring: they remain in the stream and self-link / resolve as usual.
        // Decide this before building the reference and its range so that no
        // work is done for lines that are skipped.
        if reference_type.anchoring() != AnchorKind::WholeLineCapable {
            continue;
        }

        // Build the reference's range (brackets inclusive) in original coords.
        let bracket_start = line.start + (line.text.len() - line.text.trim_start().len());
        let bracket_end = bracket_start + trimmed.len();
        let reference_range = loc.byte_range_to_ast_range(&(bracket_start..bracket_end));

        let mut reference = ReferenceInline::new(inner.to_string());
        reference.reference_type = reference_type;

        // This is a reference line: it is removed from the structural stream.
        removed[idx] = true;

        // Find the content line directly above (upward-only). Skipping content
        // is *not* allowed: a blank line directly above means self-link. But
        // preceding reference lines have already been removed from the logical
        // stream, so we look past lines already marked `removed` to reach the
        // nearest surviving physical line. Passing over such a line means this
        // reference line is stacked on another one.
        let mut above: Option<usize> = None;
        let mut stacked = false;
        for j in (0..idx).rev() {
            if removed[j] {
                stacked = true;
                continue;
            }
            if !lines[j].is_blank() {
                above = Some(j);
            }
            break;
        }

        let anchor = match above {
            Some(above_idx) => {
                let head = head_line_anchor(&lines[above_idx]);
                let anchor_range = loc.byte_range_to_ast_range(&(head.start..head.end));

                // Overlap diagnostics (§2.3.3): at most one reference line may
                // anchor a given element. Two situations are illegal:
                //  (a) the head line is already claimed by an earlier
                //      reference line (stacked), or
                //  (b) the head line itself carries an inline reference (a
                //      nested link over the same text).
                if anchored_line[above_idx] || stacked {
                    diagnostics.push(
                        Diagnostic::new(
                            reference_range.clone(),
                            DiagnosticSeverity::Warning,
                            format!(
                                "Multiple reference lines anchor the same element \
                                 ('{}'); only the first is honored",
                                head.text
                            ),
                        )
                        .with_code("stacked-reference-line"),
                    );
                } else if head_line_has_inline_reference(&lines[above_idx]) {
                    diagnostics.push(
                        Diagnostic::new(
                            reference_range.clone(),
                            DiagnosticSeverity::Warning,
                            format!(
                                "Reference line anchors an element whose head line \
                                 ('{}') already carries an inline reference; the \
                                 whole-line anchor is honored",
                                head.text
                            ),
                        )
                        .with_code("overlapping-reference-line"),
                    );
                }

                anchored_line[above_idx] = true;
                ReferenceAnchor::WholeElement {
                    anchor_text: head.text,
                    anchor_range,
                    element: head.element,
                }
            }
            None => ReferenceAnchor::SelfLink,
        };

        reference_lines.push(ReferenceLine {
            reference,
            reference_range,
            anchor,
        });
    }

    // Report the byte range of every removed reference line — the line *plus*
    // its terminating newline (`line.end` already points just past the `\n`).
    // The caller drops the tokens inside these ranges from the token stream, so
    // a reference line that self-links is *also* removed from structure (it is
    // transparent either way); its standalone rendering is reconstructed by
    // consumers from the collected `reference_lines` entry. This keeps the
    // structural parser unaware of reference lines (§2.3.3) without editing the
    // source string, so all surviving tokens keep original-source coordinates.
    let removed_line_ranges: Vec<Range<usize>> = lines
        .iter()
        .zip(&removed)
        .filter(|(_, is_removed)| **is_removed)
        .map(|(line, _)| line.start..line.end)
        .collect();

    AnchoringPrepass {
        removed_line_ranges,
        reference_lines,
        diagnostics,
    }
}

/// Assign a word anchor (§2.3.1) to every top-level inline reference in one
/// line's inline node sequence, writing the result into each `Reference`
/// node's `word_anchor` in place.
///
/// How the anchor word is chosen:
/// - Normally it is the word immediately *preceding* the reference.
/// - When the reference opens the line (nothing but whitespace before it), it
///   is the word immediately *following* the reference, if any text follows.
/// - A reference glued to the word before it anchors that word
///   (`Hello[./f] World` → "Hello"); no special case is needed because the
///   preceding text then simply ends with that word.
///
/// A reference with no usable word on the chosen side gets `None` — e.g. a
/// reference that is alone on its line, or whose neighbouring token is pure
/// punctuation. Whitespace-only text on a side counts as empty. References
/// themselves contribute no text to either side.
pub(crate) fn resolve_word_anchors(nodes: &mut [crate::lex::inlines::InlineNode]) {
    use crate::lex::inlines::InlineNode;

    // Cheap bail-out for the common case of a line without any reference, so
    // the flattening below (which allocates) is skipped — this function runs
    // for every `TextContent`.
    let has_reference = nodes
        .iter()
        .any(|node| matches!(node, InlineNode::Reference { .. }));
    if !has_reference {
        return;
    }

    // Plain text of each top-level node, so that word boundaries can be found
    // across formatting spans.
    let flattened: Vec<String> = nodes.iter().map(flatten_inline_text).collect();

    for (position, node) in nodes.iter_mut().enumerate() {
        let InlineNode::Reference { data, .. } = node else {
            continue;
        };

        let preceding_text: String = flattened[..position].concat();

        let resolved = if preceding_text.trim().is_empty() {
            // First on the line: anchor the following word, when there is one.
            let following_text: String = flattened[position + 1..].concat();
            following_text
                .split_whitespace()
                .next()
                .and_then(clean_anchor_word)
                .map(|word| WordAnchor {
                    word,
                    direction: AnchorDirection::Following,
                })
        } else {
            // Otherwise anchor the last whitespace-delimited token before it.
            preceding_text
                .split_whitespace()
                .next_back()
                .and_then(clean_anchor_word)
                .map(|word| WordAnchor {
                    word,
                    direction: AnchorDirection::Preceding,
                })
        };

        data.word_anchor = resolved;
    }
}

/// Reduce a candidate anchor word to its punctuation-free core, upholding
/// [`WordAnchor::word`]'s contract that the stored word has no surrounding
/// punctuation (`website, [url]` anchors `"website"`, not `"website,"`).
///
/// Non-alphanumeric characters are peeled off both ends; punctuation in the
/// interior (e.g. `lex.ing`, `can't`) is left alone. If no alphanumeric
/// character remains the result is `None`, so a punctuation-only token yields
/// no anchor.
fn clean_anchor_word(word: &str) -> Option<String> {
    let is_edge_punctuation = |c: char| !c.is_alphanumeric();
    let core = word
        .trim_start_matches(is_edge_punctuation)
        .trim_end_matches(is_edge_punctuation);
    (!core.is_empty()).then(|| core.to_owned())
}

/// Reduce an inline node to its plain text.
///
/// Leaf nodes (plain text, code, math) yield their text as-is; formatting
/// spans (strong, emphasis) yield the concatenation of their children's
/// flattened text. A reference yields the empty string, because its bracketed
/// content is not part of the surrounding word stream.
fn flatten_inline_text(node: &crate::lex::inlines::InlineNode) -> String {
    use crate::lex::inlines::InlineNode;
    match node {
        InlineNode::Reference { .. } => String::new(),
        InlineNode::Strong { content, .. } | InlineNode::Emphasis { content, .. } => {
            let mut flattened = String::new();
            for child in content.iter() {
                flattened.push_str(&flatten_inline_text(child));
            }
            flattened
        }
        InlineNode::Plain { text, .. }
        | InlineNode::Code { text, .. }
        | InlineNode::Math { text, .. } => text.clone(),
    }
}

/// Whether a head line contains a genuine inline reference. Only the overlap
/// warning (§2.3.3) uses this.
///
/// The line's trimmed text is run through the inline parser and the result is
/// searched for an [`InlineNode::Reference`] among its top-level nodes. Going
/// through the parser, rather than scanning for brackets, avoids false
/// positives on stray `[` characters (e.g. a verbatim subject or prose that
/// merely contains a bracket but no real reference).
fn head_line_has_inline_reference(line: &PhysicalLine<'_>) -> bool {
    let nodes = parse_inlines(line.trimmed());
    for node in nodes.iter() {
        if let InlineNode::Reference { .. } = node {
            return true;
        }
    }
    false
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::lex::ast::traits::AstNode;
    use crate::lex::inlines::{AnchorDirection, ReferenceType};
    use crate::lex::parsing::parse_document;

    /// Helper: parse `src` and hand back the reference lines collected on the
    /// resulting document. Panics if parsing fails.
    fn ref_lines(src: &str) -> Vec<ReferenceLine> {
        let doc = parse_document(src).unwrap();
        doc.reference_lines
    }

    /// Helper: for a source expected to contain exactly one reference line,
    /// return that line's whole-element anchor text together with the kind of
    /// element it anchors. Panics when the line count is not one or when the
    /// anchor is not a whole-element anchor.
    fn sole_whole_anchor(src: &str) -> (String, AnchoredElement) {
        let lines = ref_lines(src);
        assert_eq!(
            lines.len(),
            1,
            "expected exactly one reference line: {lines:?}"
        );
        let only = &lines[0];
        let ReferenceAnchor::WholeElement {
            anchor_text,
            element,
            ..
        } = &only.anchor
        else {
            panic!("expected whole-element anchor, got {:?}", only.anchor);
        };
        (anchor_text.clone(), *element)
    }

    // -- Fixture §1: inline word anchors -----------------------------------

    /// Helper: the word anchor of the first reference in `src` that has one.
    /// Panics if parsing fails or no reference carries a word anchor.
    fn word_anchor(src: &str) -> WordAnchor {
        let doc = parse_document(src).unwrap();
        // Bound to a local so the iterator borrowing `doc` is dropped before
        // `doc` itself goes out of scope.
        let found = doc
            .iter_all_references()
            .find_map(|reference| reference.word_anchor.clone());
        found.expect("a reference with a word anchor")
    }

    #[test]
    fn inline_preceding_word_anchor() {
        // A mid-line reference anchors to the word immediately before it:
        // "the project website [https://lex.ing] is fast."
        let source = "the project website [https://lex.ing] is fast.\n\n";
        let WordAnchor {
            word, direction, ..
        } = word_anchor(source);
        assert_eq!(word, "website");
        assert_eq!(direction, AnchorDirection::Preceding);
    }

    #[test]
    fn inline_following_word_anchor() {
        // The reference opens the line, so there is no preceding word and the
        // anchor falls on the word that follows it.
        let source = "[https://lex.ing] is the home page.\n\n";
        let WordAnchor {
            word, direction, ..
        } = word_anchor(source);
        assert_eq!(word, "is");
        assert_eq!(direction, AnchorDirection::Following);
    }

    #[test]
    fn inline_abutting_word_anchor() {
        // No space between the word and the bracket: "Hello[./file.txt] World"
        // still anchors to the preceding word "Hello".
        let source = "Hello[./file.txt] World\n\n";
        let WordAnchor {
            word, direction, ..
        } = word_anchor(source);
        assert_eq!(word, "Hello");
        assert_eq!(direction, AnchorDirection::Preceding);
    }

    #[test]
    fn inline_preceding_word_anchor_trims_trailing_punctuation() {
        // In "website, [https://x]" the token before the reference is
        // "website,". The `WordAnchor::word` contract forbids surrounding
        // punctuation, so the stored word must be "website".
        let source = "the project website, [https://x] is fast.\n\n";
        let WordAnchor {
            word, direction, ..
        } = word_anchor(source);
        assert_eq!(word, "website");
        assert_eq!(direction, AnchorDirection::Preceding);
    }

    #[test]
    fn inline_following_word_anchor_trims_punctuation() {
        // First-on-line reference whose following token, "(home)", is wrapped
        // in punctuation on both sides; only "home" must be stored.
        let source = "[https://x] (home) page.\n\n";
        let WordAnchor {
            word, direction, ..
        } = word_anchor(source);
        assert_eq!(word, "home");
        assert_eq!(direction, AnchorDirection::Following);
    }

    #[test]
    fn inline_word_anchor_preserves_interior_punctuation() {
        // Punctuation inside the word (the dot in "lex.ing") belongs to the
        // word itself and must survive the trimming of surrounding punctuation.
        let source = "visit lex.ing [https://lex.ing] now.\n\n";
        let WordAnchor {
            word, direction, ..
        } = word_anchor(source);
        assert_eq!(word, "lex.ing");
        assert_eq!(direction, AnchorDirection::Preceding);
    }

    #[test]
    fn inline_punctuation_only_neighbor_yields_no_anchor() {
        // The token right before the reference is "--". Trimming leaves
        // nothing alphanumeric, so the reference must end up without a word
        // anchor.
        let doc = parse_document("word -- [https://x] end.\n\n").unwrap();
        let url_reference = doc
            .iter_all_references()
            .find(|candidate| matches!(candidate.reference_type, ReferenceType::Url { .. }))
            .expect("the url reference");
        let anchor = &url_reference.word_anchor;
        assert!(
            anchor.is_none(),
            "punctuation-only neighbor must not produce an anchor: {anchor:?}"
        );
    }

    // -- Fixture §2: reference line on a session title ---------------------

    #[test]
    fn reference_line_anchors_session_title() {
        let src = "Getting Started\n[./readme.txt]\n\n    Welcome to the docs.\n\n";

        // The reference line anchors the title text directly above it.
        let (anchor_text, _) = sole_whole_anchor(src);
        assert_eq!(anchor_text, "Getting Started");

        // With the reference line taken out, the title plus its indented body
        // still parse as a session.
        let doc = parse_document(src).unwrap();
        let first = &doc.root.children[0];
        assert_eq!(first.node_type(), "Session");
    }

    // -- Fixture §3: reference line on a list item ------------------------

    #[test]
    fn reference_line_anchors_list_item() {
        let src = "- Food\n- Water\n[https://water.example]\n- Bread\n\n";

        // The anchor is the item text with its list marker stripped.
        let (anchor_text, element) = sole_whole_anchor(src);
        assert_eq!(anchor_text, "Water");
        assert_eq!(element, AnchoredElement::ListItem);

        // The reference line does not break the list apart: the first root
        // child is still a list.
        // NOTE(review): only the node type is checked here; the expected item
        // count (3) is not asserted.
        let doc = parse_document(src).unwrap();
        let first = &doc.root.children[0];
        assert_eq!(first.node_type(), "List");
    }

    #[test]
    fn reference_line_on_list_item_keeps_trailing_colon() {
        // For a list item the trailing `:` is ordinary item text, not a
        // subject marker. The anchor must therefore keep it (`Note:`) instead
        // of stripping it as is done for definition/verbatim subjects.
        let src = "- Note:\n[./n.txt]\n- Other\n\n";
        let (anchor_text, element) = sole_whole_anchor(src);
        assert_eq!(anchor_text, "Note:");
        assert_eq!(element, AnchoredElement::ListItem);
    }

    // -- Fixture §4: reference line on a definition term (transparency) ----

    #[test]
    fn reference_line_keeps_definition_a_definition() {
        // The key transparency case. Because the reference line is *removed*
        // rather than replaced by a blank line, `API Endpoint:` remains
        // directly adjacent to its indented body and so keeps parsing as a
        // definition instead of a session.
        let src =
            "API Endpoint:\n[./endpoint.txt]\n    A URL that provides access to a resource.\n\n";

        let (anchor_text, element) = sole_whole_anchor(src);
        assert_eq!(anchor_text, "API Endpoint");
        assert_eq!(element, AnchoredElement::Subject);

        let doc = parse_document(src).unwrap();
        let first = &doc.root.children[0];
        assert_eq!(
            first.node_type(),
            "Definition",
            "reference line must be transparent: blanking it would wrongly \
             turn the definition into a session"
        );
    }

    #[test]
    fn reference_line_as_blank_would_make_a_session() {
        // Control case for the transparency rule: the definition source with a
        // real blank line where the reference line would be. That variant
        // parses as a session, which is exactly the outcome transparency
        // avoids.
        let src = "API Endpoint:\n\n    A URL that provides access to a resource.\n\n";
        let doc = parse_document(src).unwrap();
        let first = &doc.root.children[0];
        assert_eq!(first.node_type(), "Session");
    }

    // -- Fixture §5: reference line on a verbatim subject -----------------

    #[test]
    fn reference_line_anchors_verbatim_subject() {
        let src = "Example Source:\n[./example.rs]\n    fn main() {}\n:: rust ::\n\n";

        // The subject's trailing colon is not part of the anchor text.
        let (anchor_text, element) = sole_whole_anchor(src);
        assert_eq!(anchor_text, "Example Source");
        assert_eq!(element, AnchoredElement::Subject);

        // The block is still recognized as a verbatim block.
        let doc = parse_document(src).unwrap();
        let first = &doc.root.children[0];
        assert_eq!(first.node_type(), "VerbatimBlock");
    }

    // -- Fixture §6: reference line on a paragraph -----------------------

    #[test]
    fn reference_line_anchors_paragraph_line() {
        // The paragraph has two lines so that the line directly above the
        // reference is a real paragraph line and is not promoted to a document
        // title.
        let src =
            "First paragraph line.\nThe release notes cover every change.\n[./CHANGELOG.md]\n\n";
        let (anchor_text, element) = sole_whole_anchor(src);
        assert_eq!(anchor_text, "The release notes cover every change.");
        assert_eq!(element, AnchoredElement::WholeLine);
    }

    // -- Fixture §7: self-link fallback ----------------------------------

    #[test]
    fn reference_line_self_links_when_blank_above() {
        // A blank line separates the reference line from the text above, so
        // there is nothing to anchor to and the reference links itself.
        let src = "See the upstream project:\n\n[https://github.com/lex-fmt/lex]\n\n";
        let lines = ref_lines(src);
        assert_eq!(lines.len(), 1);
        let only = &lines[0];
        assert_eq!(only.anchor, ReferenceAnchor::SelfLink);
    }

    #[test]
    fn reference_line_self_links_at_start_of_container() {
        // The reference line is the very first line of the document: with no
        // content above it, it falls back to a self-link.
        let src = "[https://lex.ing]\n\n";
        let lines = ref_lines(src);
        assert_eq!(lines.len(), 1);
        let only = &lines[0];
        assert_eq!(only.anchor, ReferenceAnchor::SelfLink);
    }

    // -- Fixture §8: marker-style references on a reference line ----------

    #[test]
    fn marker_reference_on_its_own_line_is_not_a_reference_line() {
        // `[::summary-note]` is a marker-style reference. Even alone on a line
        // it must NOT be treated as a reference line with a whole-element
        // anchor; it stays in the stream and resolves the usual way.
        let src = "Closing remarks.\n[::summary-note]\n\n:: summary-note ::\n    Resolved.\n\n";

        let lines = ref_lines(src);
        assert!(
            lines.is_empty(),
            "marker-style reference must not become a whole-element anchor: {lines:?}"
        );

        // The reference is still present in the document as an inline
        // annotation reference.
        let doc = parse_document(src).unwrap();
        let has_annotation_reference = doc.iter_all_references().any(|reference| {
            matches!(
                reference.reference_type,
                ReferenceType::AnnotationReference { .. }
            )
        });
        assert!(has_annotation_reference);
    }

    #[test]
    fn footnote_on_its_own_line_is_not_a_reference_line() {
        // A footnote number alone on a line yields no reference line.
        let src = "Some claim.\n[42]\n\n:: 42 :: A footnote.\n\n";
        let lines = ref_lines(src);
        assert!(lines.is_empty());
    }

    // -- §2.3.3: overlap / stacking diagnostics --------------------------

    #[test]
    fn stacked_reference_lines_warn_and_keep_first() {
        const CODE: &str = "stacked-reference-line";

        // Two consecutive reference lines sit under the same paragraph line.
        let src = "First line.\nClaim line here.\n[./a.txt]\n[./b.txt]\n\n";
        let doc = parse_document(src).unwrap();
        assert_eq!(
            doc.reference_lines.len(),
            2,
            "both reference lines are collected"
        );

        // The stacking produces exactly one warning with the dedicated code.
        // NOTE(review): the "keep first" half of the test name is not asserted
        // here; only collection and the warning count are checked.
        let stacking_warnings = doc
            .diagnostics()
            .into_iter()
            .filter(|diagnostic| diagnostic.code.as_deref() == Some(CODE))
            .collect::<Vec<_>>();
        assert_eq!(
            stacking_warnings.len(),
            1,
            "one stacking warning: {stacking_warnings:?}"
        );
    }

    #[test]
    fn reference_line_over_head_line_with_inline_reference_warns() {
        const CODE: &str = "overlapping-reference-line";

        // The line above the reference line already contains an inline
        // reference, so a whole-element anchor over it would nest two links on
        // the same text.
        let src = "See more here.\nVisit [https://a.example] now.\n[./b.txt]\n\n";
        let doc = parse_document(src).unwrap();

        let overlap_warnings = doc
            .diagnostics()
            .into_iter()
            .filter(|diagnostic| diagnostic.code.as_deref() == Some(CODE))
            .collect::<Vec<_>>();
        assert_eq!(
            overlap_warnings.len(),
            1,
            "one overlap warning: {overlap_warnings:?}"
        );

        // Despite the warning, the whole-line anchor is still applied (§2.3.3).
        let first_line = &doc.reference_lines[0];
        assert!(first_line.anchor.is_whole_element());
    }

    #[test]
    fn head_line_with_stray_bracket_does_not_warn() {
        const CODE: &str = "overlapping-reference-line";

        // The head line has a `[` inside a code span but no real inline
        // reference. A textual bracket heuristic used to false-positive on
        // this input; the AST-based check must stay silent.
        let src = "Intro.\nThe array index `a[0]` matters.\n[./b.txt]\n\n";
        let doc = parse_document(src).unwrap();

        let overlap_warnings = doc
            .diagnostics()
            .into_iter()
            .filter(|diagnostic| diagnostic.code.as_deref() == Some(CODE))
            .collect::<Vec<_>>();
        assert!(
            overlap_warnings.is_empty(),
            "a stray bracket is not an inline reference: {overlap_warnings:?}"
        );

        // The reference line still resolves to a whole-element anchor.
        let first_line = &doc.reference_lines[0];
        assert!(first_line.anchor.is_whole_element());
    }

    // -- Type-level anchoring split (§2.3.4) -----------------------------

    #[test]
    fn anchor_kind_split_matches_spec() {
        use crate::lex::inlines::AnchorKind;

        // Reference types expected to report `WholeLineCapable`.
        let whole_line_capable = [
            ReferenceType::Url { target: "x".into() },
            ReferenceType::File { target: "x".into() },
            ReferenceType::Session { target: "1".into() },
            ReferenceType::General { target: "x".into() },
        ];
        for (case, reference) in (1..).zip(whole_line_capable) {
            assert_eq!(
                reference.anchoring(),
                AnchorKind::WholeLineCapable,
                "whole-line-capable case #{case}"
            );
        }

        // Reference types expected to report `MarkerOnly`.
        let marker_only = [
            ReferenceType::FootnoteNumber { number: 1 },
            ReferenceType::AnnotationReference { label: "n".into() },
            ReferenceType::NotSure,
        ];
        for (case, reference) in (1..).zip(marker_only) {
            assert_eq!(
                reference.anchoring(),
                AnchorKind::MarkerOnly,
                "marker-only case #{case}"
            );
        }
    }

    // -- Range fidelity ---------------------------------------------------

    #[test]
    fn anchor_range_covers_the_head_line_text() {
        let src = "Getting Started\n[./readme.txt]\n\n    Body.\n\n";
        let doc = parse_document(src).unwrap();

        let anchor_range = match &doc.reference_lines[0].anchor {
            ReferenceAnchor::WholeElement { anchor_range, .. } => anchor_range,
            _ => panic!("expected whole-element anchor"),
        };

        // Slicing the source with the anchor span yields the head line text.
        let covered = &src[anchor_range.span.clone()];
        assert_eq!(covered, "Getting Started");
    }

    #[test]
    fn reference_range_covers_brackets_inclusive() {
        let src = "Getting Started\n[./readme.txt]\n\n    Body.\n\n";
        let doc = parse_document(src).unwrap();

        // The reference span includes both the opening and closing bracket.
        let reference_range = &doc.reference_lines[0].reference_range;
        let covered = &src[reference_range.span.clone()];
        assert_eq!(covered, "[./readme.txt]");
    }

    // -- Original-coordinate invariant (regression for the cleaned-source
    //    coordinate bug) --------------------------------------------------

    /// Regression guard for original-source coordinates.
    ///
    /// When reference lines were removed by *editing the source string*, every
    /// byte offset after a removed line shifted, and AST nodes following a
    /// reference line ended up with "cleaned-source" coordinates. The
    /// token-filtering pre-pass leaves tokens at their original ranges, so a
    /// node after a reference line must report its position in the ORIGINAL
    /// source.
    ///
    /// The test checks that the parsed range start of a later element equals
    /// the byte offset of its text in the original source. Under the old
    /// cleaned-source approach the offset comes out short by the length of the
    /// removed line; with token filtering it matches.
    #[test]
    fn later_element_keeps_original_source_coordinates() {
        // One reference line near the top, followed by a clearly separate
        // paragraph. The removed `[./top.txt]\n` line is 12 bytes long, which
        // is how far the old approach shifted every later node to the left.
        let original =
            "Intro paragraph here.\n[./top.txt]\n\nLater Section paragraph text.\n\n";

        let doc = parse_document(original).unwrap();

        // Locate the root child whose text mentions "Later Section".
        let later = doc
            .root
            .children
            .iter()
            .find(|child| {
                child
                    .text()
                    .map(|text| text.contains("Later Section"))
                    .unwrap_or(false)
            })
            .expect("a 'Later Section' element after the reference line");

        let expected_start = original
            .find("Later Section")
            .expect("the literal text in the original source");
        let actual_start = later.range().span.start;

        assert_eq!(
            actual_start, expected_start,
            "node after a reference line must carry an ORIGINAL-source offset \
             (got {}, expected {}); a mismatch means a cleaned-source coordinate \
             leaked into the AST",
            actual_start, expected_start,
        );

        // The text found at that range in the original source is the element
        // itself.
        let covered = &original[later.range().span.clone()];
        assert!(covered.starts_with("Later Section"));
    }

    // -- Cleaned-source / no-reference-line passthrough ------------------

    #[test]
    fn documents_without_reference_lines_have_empty_collection() {
        // A source with no reference line yields neither reference lines nor
        // reference-line diagnostics.
        let src = "Just a paragraph.\n\n";
        let doc = parse_document(src).unwrap();
        assert!(doc.reference_lines.is_empty());
        assert!(doc.reference_line_diagnostics.is_empty());
    }

    #[test]
    fn list_marker_stripping_handles_ordered_markers() {
        // An ordered marker (`1.`) is stripped from the anchor text just like
        // a dash marker.
        let src = "1. First item\n[./x.txt]\n2. Second item\n\n";
        let (anchor_text, element) = sole_whole_anchor(src);
        assert_eq!(anchor_text, "First item");
        assert_eq!(element, AnchoredElement::ListItem);
    }
}