use crate::lex::ast::anchoring::{AnchoredElement, ReferenceAnchor, ReferenceLine};
use crate::lex::ast::diagnostics::{Diagnostic, DiagnosticSeverity};
use crate::lex::ast::range::SourceLocation;
use crate::lex::inlines::{
determine_reference_type, parse_inlines, AnchorDirection, AnchorKind, InlineNode,
ReferenceInline, WordAnchor,
};
use std::ops::Range;
/// Result of the reference-line prepass built by `extract_reference_lines`.
pub struct AnchoringPrepass {
/// Byte ranges (`start..end`, counting the line's `\n` when it has one) of every
/// physical line that was recognised as a reference line, in source order.
pub removed_line_ranges: Vec<Range<usize>>,
/// One entry per recognised reference line, in source order.
pub reference_lines: Vec<ReferenceLine>,
/// Warnings raised while anchoring (stacked or overlapping reference lines).
pub diagnostics: Vec<Diagnostic>,
}
impl AnchoringPrepass {
    /// Returns `true` when the prepass recorded no removed line ranges.
    pub fn is_empty(&self) -> bool {
        let removed = &self.removed_line_ranges;
        removed.is_empty()
    }

    /// Drops every token whose range *starts* inside one of the removed line
    /// ranges and returns the survivors in their original order.
    ///
    /// When nothing was removed the input vector is handed back untouched.
    pub fn filter_tokens<T>(&self, tokens: Vec<(T, Range<usize>)>) -> Vec<(T, Range<usize>)> {
        let removed = &self.removed_line_ranges;
        if removed.is_empty() {
            return tokens;
        }

        let mut kept = tokens;
        // A token survives only if no removed range contains its start offset.
        kept.retain(|(_, span)| removed.iter().all(|gone| !gone.contains(&span.start)));
        kept
    }
}
/// One physical line of the source together with its byte extent, as
/// populated by `physical_lines`.
struct PhysicalLine<'a> {
    /// Byte offset of the line's first byte in the source.
    start: usize,
    /// Byte offset just past the line, counting its `\n` terminator if present.
    end: usize,
    /// Line content without the trailing `\n`.
    text: &'a str,
}

impl PhysicalLine<'_> {
    /// Line content with leading and trailing whitespace stripped.
    fn trimmed(&self) -> &str {
        let content: &str = self.text;
        content.trim()
    }

    /// `true` when the line holds nothing but whitespace (or nothing at all).
    fn is_blank(&self) -> bool {
        self.text.chars().all(char::is_whitespace)
    }
}
/// Splits `source` into physical lines, recording each line's byte extent.
///
/// Lines are delimited by `\n`. `end` counts the newline while `text`
/// excludes it (only the `\n` is stripped, so a preceding `\r` stays in
/// `text`). An empty source yields no lines.
fn physical_lines(source: &str) -> Vec<PhysicalLine<'_>> {
    // Running byte offset of the next line's first byte.
    let mut cursor = 0usize;
    source
        .split_inclusive('\n')
        .map(|raw| {
            let start = cursor;
            cursor += raw.len();
            let text = match raw.strip_suffix('\n') {
                Some(without_newline) => without_newline,
                None => raw,
            };
            PhysicalLine {
                start,
                end: cursor,
                text,
            }
        })
        .collect()
}
/// Returns the text between the brackets when `trimmed` is exactly one
/// `[...]` group.
///
/// Yields `None` when the input does not start with `[` and end with `]`,
/// when the brackets are empty, or when the inner text itself contains a
/// `[` or `]`.
fn bracketed_inner(trimmed: &str) -> Option<&str> {
    let after_open = trimmed.strip_prefix('[')?;
    let inner = after_open.strip_suffix(']')?;
    let has_bracket = inner.chars().any(|c| matches!(c, '[' | ']'));
    match inner {
        "" => None,
        _ if has_bracket => None,
        content => Some(content),
    }
}
/// Extracts the anchor text, its source byte span and its element kind from
/// the line a reference line attaches to.
///
/// * A line opening with a list marker becomes a `ListItem`; the anchor is
///   the text after the marker and the whitespace following it (a trailing
///   `:` is kept).
/// * Otherwise a line ending in `:` becomes a `Subject`; the colon and any
///   whitespace before it are dropped from the anchor.
/// * Anything else is a `WholeLine` anchored on the trimmed line.
fn head_line_anchor(line: &PhysicalLine<'_>) -> HeadLine {
    let raw = line.text;
    let trimmed = raw.trim();
    // Source offset of the first non-whitespace byte of the line.
    let content_start = line.start + (raw.len() - raw.trim_start().len());

    let (start, body, element) = match list_marker_len(trimmed) {
        Some(marker_len) => {
            let item = trimmed[marker_len..].trim_start();
            // `item` is a suffix of `trimmed`, so the length difference is
            // exactly the marker plus the whitespace that follows it.
            let skipped = trimmed.len() - item.len();
            (content_start + skipped, item, AnchoredElement::ListItem)
        }
        None => match trimmed.strip_suffix(':') {
            Some(subject) => (content_start, subject.trim_end(), AnchoredElement::Subject),
            None => (content_start, trimmed, AnchoredElement::WholeLine),
        },
    };

    HeadLine {
        text: body.to_string(),
        start,
        end: start + body.len(),
        element,
    }
}
/// Anchor target extracted from a head line by `head_line_anchor`.
struct HeadLine {
/// Anchor text: the line with surrounding whitespace removed, minus either a
/// leading list marker or (for non-list lines) a trailing `:`.
text: String,
/// Byte offset in the source where `text` begins.
start: usize,
/// Byte offset one past the end of `text` (`start + text.len()`).
end: usize,
/// Kind of element the head line was classified as.
element: AnchoredElement,
}
/// Returns the byte length of a list marker at the start of `body`, if any.
///
/// Two marker shapes are recognised:
/// * a bullet (`-`, `*` or `+`) that is immediately followed by whitespace;
/// * a non-empty run of ASCII alphanumerics terminated by `.` or `)`, where
///   the terminator is followed by whitespace or ends the string.
///
/// The returned length covers the marker only, not the whitespace after it.
fn list_marker_len(body: &str) -> Option<usize> {
    let mut chars = body.chars();
    let first = chars.next()?;

    if matches!(first, '-' | '*' | '+') {
        // A bullet counts only when whitespace comes right after it.
        let spaced = matches!(chars.next(), Some(next) if next.is_whitespace());
        return spaced.then_some(first.len_utf8());
    }

    // Length of the leading ASCII-alphanumeric run (the "sequence" part).
    let seq_end = body
        .find(|c: char| !c.is_ascii_alphanumeric())
        .unwrap_or(body.len());
    if seq_end == 0 {
        return None;
    }

    let terminator = body[seq_end..].chars().next()?;
    if !matches!(terminator, '.' | ')') {
        return None;
    }

    let marker_len = seq_end + terminator.len_utf8();
    match body[marker_len..].chars().next() {
        None => Some(marker_len),
        Some(next) if next.is_whitespace() => Some(marker_len),
        Some(_) => None,
    }
}
/// Scans `source` for reference lines and works out what each one anchors.
///
/// A reference line is a physical line whose trimmed content is a single
/// `[...]` group and whose reference type reports
/// `AnchorKind::WholeLineCapable`. Each such line is marked for removal and
/// anchored as follows:
///
/// * to the nearest earlier line that is not itself a reference line, when
///   that line is non-blank (`ReferenceAnchor::WholeElement`);
/// * to itself (`ReferenceAnchor::SelfLink`) when that line is blank or no
///   such line exists.
///
/// Warnings are emitted when several reference lines target the same head
/// line (`stacked-reference-line`) or when the head line already contains an
/// inline reference (`overlapping-reference-line`).
pub fn extract_reference_lines(source: &str) -> AnchoringPrepass {
    let lines = physical_lines(source);
    let loc = SourceLocation::new(source);
    let line_count = lines.len();

    let mut reference_lines: Vec<ReferenceLine> = Vec::new();
    let mut diagnostics: Vec<Diagnostic> = Vec::new();
    // Per-line flags: "is a reference line" / "already has a reference line".
    let mut removed: Vec<bool> = vec![false; line_count];
    let mut anchored_line: Vec<bool> = vec![false; line_count];

    for (idx, line) in lines.iter().enumerate() {
        let trimmed = line.trimmed();
        let Some(inner) = bracketed_inner(trimmed) else {
            continue;
        };

        let reference_type = determine_reference_type(inner);
        let indent = line.text.len() - line.text.trim_start().len();
        let bracket_start = line.start + indent;
        let bracket_end = bracket_start + trimmed.len();
        let reference_range = loc.byte_range_to_ast_range(&(bracket_start..bracket_end));
        let mut reference = ReferenceInline::new(inner.to_string());
        reference.reference_type = reference_type.clone();

        // Marker-style references never act as whole-line anchors.
        if reference_type.anchoring() != AnchorKind::WholeLineCapable {
            continue;
        }
        removed[idx] = true;

        // Directly stacked under another reference line?
        let stacked = idx > 0 && removed[idx - 1];
        // Nearest earlier line that is not a reference line; a blank one
        // means there is nothing to anchor to.
        let above = (0..idx)
            .rev()
            .find(|&j| !removed[j])
            .filter(|&j| !lines[j].is_blank());

        let anchor = if let Some(above_idx) = above {
            let target = &lines[above_idx];
            let head = head_line_anchor(target);
            let anchor_range = loc.byte_range_to_ast_range(&(head.start..head.end));

            let warning = if anchored_line[above_idx] || stacked {
                Some((
                    "stacked-reference-line",
                    format!(
                        "Multiple reference lines anchor the same element \
                         ('{}'); only the first is honored",
                        head.text
                    ),
                ))
            } else if head_line_has_inline_reference(target) {
                Some((
                    "overlapping-reference-line",
                    format!(
                        "Reference line anchors an element whose head line \
                         ('{}') already carries an inline reference; the \
                         whole-line anchor is honored",
                        head.text
                    ),
                ))
            } else {
                None
            };
            if let Some((code, message)) = warning {
                diagnostics.push(
                    Diagnostic::new(
                        reference_range.clone(),
                        DiagnosticSeverity::Warning,
                        message,
                    )
                    .with_code(code),
                );
            }

            anchored_line[above_idx] = true;
            ReferenceAnchor::WholeElement {
                anchor_text: head.text,
                anchor_range,
                element: head.element,
            }
        } else {
            ReferenceAnchor::SelfLink
        };

        reference_lines.push(ReferenceLine {
            reference,
            reference_range,
            anchor,
        });
    }

    let removed_line_ranges: Vec<Range<usize>> = lines
        .iter()
        .zip(removed.iter())
        .filter_map(|(line, &is_removed)| is_removed.then(|| line.start..line.end))
        .collect();

    AnchoringPrepass {
        removed_line_ranges,
        reference_lines,
        diagnostics,
    }
}
pub(crate) fn resolve_word_anchors(nodes: &mut [crate::lex::inlines::InlineNode]) {
use crate::lex::inlines::InlineNode;
if !nodes
.iter()
.any(|n| matches!(n, InlineNode::Reference { .. }))
{
return;
}
let texts: Vec<String> = nodes.iter().map(flatten_inline_text).collect();
let n = nodes.len();
for i in 0..n {
if !matches!(nodes[i], InlineNode::Reference { .. }) {
continue;
}
let before: String = texts[..i].concat();
let after: String = texts[i + 1..].concat();
let first_on_line = before.trim().is_empty();
let anchor = if first_on_line {
after
.split_whitespace()
.next()
.and_then(clean_anchor_word)
.map(|word| WordAnchor {
word,
direction: AnchorDirection::Following,
})
} else {
before
.split_whitespace()
.next_back()
.and_then(clean_anchor_word)
.map(|word| WordAnchor {
word,
direction: AnchorDirection::Preceding,
})
};
if let InlineNode::Reference { data, .. } = &mut nodes[i] {
data.word_anchor = anchor;
}
}
}
/// Strips non-alphanumeric characters from both ends of `word`.
///
/// Interior punctuation is preserved. Returns `None` when nothing
/// alphanumeric is left.
fn clean_anchor_word(word: &str) -> Option<String> {
    let is_edge_noise = |c: char| !c.is_alphanumeric();
    let core = word
        .trim_start_matches(is_edge_noise)
        .trim_end_matches(is_edge_noise);
    Some(core)
        .filter(|kept| !kept.is_empty())
        .map(str::to_owned)
}
/// Renders an inline node as plain text.
///
/// Plain, code and math nodes yield their text; strong and emphasis nodes
/// yield the concatenation of their children's text; references yield an
/// empty string.
fn flatten_inline_text(node: &crate::lex::inlines::InlineNode) -> String {
    use crate::lex::inlines::InlineNode;
    match node {
        InlineNode::Reference { .. } => String::new(),
        InlineNode::Strong { content, .. } | InlineNode::Emphasis { content, .. } => {
            let mut joined = String::new();
            for child in content.iter() {
                joined.push_str(&flatten_inline_text(child));
            }
            joined
        }
        InlineNode::Plain { text, .. }
        | InlineNode::Code { text, .. }
        | InlineNode::Math { text, .. } => text.clone(),
    }
}
/// Reports whether the trimmed line, parsed as inlines, contains a
/// top-level reference node.
fn head_line_has_inline_reference(line: &PhysicalLine<'_>) -> bool {
    let inlines = parse_inlines(line.text.trim());
    for node in inlines.iter() {
        if let InlineNode::Reference { .. } = node {
            return true;
        }
    }
    false
}
// Tests for reference-line anchoring and inline word anchors. All of them go
// through `parse_document` except `anchor_kind_split_matches_spec`, which
// checks `ReferenceType::anchoring` directly.
#[cfg(test)]
mod tests {
use super::*;
use crate::lex::ast::traits::AstNode;
use crate::lex::inlines::{AnchorDirection, ReferenceType};
use crate::lex::parsing::parse_document;
// Parses `src` and returns the reference lines recorded on the document.
fn ref_lines(src: &str) -> Vec<ReferenceLine> {
parse_document(src).unwrap().reference_lines
}
// Asserts `src` has exactly one reference line with a whole-element anchor
// and returns that anchor's text and element kind.
fn sole_whole_anchor(src: &str) -> (String, AnchoredElement) {
let lines = ref_lines(src);
assert_eq!(
lines.len(),
1,
"expected exactly one reference line: {lines:?}"
);
match &lines[0].anchor {
ReferenceAnchor::WholeElement {
anchor_text,
element,
..
} => (anchor_text.clone(), *element),
other => panic!("expected whole-element anchor, got {other:?}"),
}
}
// Parses `src` and returns the word anchor of the first reference that has one.
fn word_anchor(src: &str) -> WordAnchor {
let doc = parse_document(src).unwrap();
let r = doc
.iter_all_references()
.find(|r| r.word_anchor.is_some())
.expect("a reference with a word anchor");
r.word_anchor.clone().unwrap()
}
// A mid-sentence reference anchors to the word right before it.
#[test]
fn inline_preceding_word_anchor() {
let wa = word_anchor("the project website [https://lex.ing] is fast.\n\n");
assert_eq!(wa.word, "website");
assert_eq!(wa.direction, AnchorDirection::Preceding);
}
// A reference that opens the line anchors to the word that follows it.
#[test]
fn inline_following_word_anchor() {
let wa = word_anchor("[https://lex.ing] is the home page.\n\n");
assert_eq!(wa.word, "is");
assert_eq!(wa.direction, AnchorDirection::Following);
}
// No space between the word and the bracket: the abutting word is still the anchor.
#[test]
fn inline_abutting_word_anchor() {
let wa = word_anchor("Hello[./file.txt] World\n\n");
assert_eq!(wa.word, "Hello");
assert_eq!(wa.direction, AnchorDirection::Preceding);
}
// Trailing punctuation on the preceding word is stripped from the anchor.
#[test]
fn inline_preceding_word_anchor_trims_trailing_punctuation() {
let wa = word_anchor("the project website, [https://x] is fast.\n\n");
assert_eq!(wa.word, "website");
assert_eq!(wa.direction, AnchorDirection::Preceding);
}
// Surrounding punctuation on the following word is stripped from the anchor.
#[test]
fn inline_following_word_anchor_trims_punctuation() {
let wa = word_anchor("[https://x] (home) page.\n\n");
assert_eq!(wa.word, "home");
assert_eq!(wa.direction, AnchorDirection::Following);
}
// Punctuation inside the word (the dot in "lex.ing") is kept.
#[test]
fn inline_word_anchor_preserves_interior_punctuation() {
let wa = word_anchor("visit lex.ing [https://lex.ing] now.\n\n");
assert_eq!(wa.word, "lex.ing");
assert_eq!(wa.direction, AnchorDirection::Preceding);
}
// When the neighbouring token is punctuation only ("--"), no anchor is set.
#[test]
fn inline_punctuation_only_neighbor_yields_no_anchor() {
let doc = parse_document("word -- [https://x] end.\n\n").unwrap();
let r = doc
.iter_all_references()
.find(|r| matches!(r.reference_type, ReferenceType::Url { .. }))
.expect("the url reference");
assert!(
r.word_anchor.is_none(),
"punctuation-only neighbor must not produce an anchor: {:?}",
r.word_anchor
);
}
// A reference line under a title anchors the title; the first child is still a Session.
#[test]
fn reference_line_anchors_session_title() {
let src = "Getting Started\n[./readme.txt]\n\n Welcome to the docs.\n\n";
let (anchor, _kind) = sole_whole_anchor(src);
assert_eq!(anchor, "Getting Started");
let doc = parse_document(src).unwrap();
assert_eq!(doc.root.children[0].node_type(), "Session");
}
// A reference line inside a list anchors the item above it, without the marker;
// the first child is still a List.
#[test]
fn reference_line_anchors_list_item() {
let src = "- Food\n- Water\n[https://water.example]\n- Bread\n\n";
let (anchor, kind) = sole_whole_anchor(src);
assert_eq!(anchor, "Water");
assert_eq!(kind, AnchoredElement::ListItem);
let doc = parse_document(src).unwrap();
assert_eq!(doc.root.children[0].node_type(), "List");
}
// For list items the trailing colon stays part of the anchor text.
#[test]
fn reference_line_on_list_item_keeps_trailing_colon() {
let src = "- Note:\n[./n.txt]\n- Other\n\n";
let (anchor, kind) = sole_whole_anchor(src);
assert_eq!(anchor, "Note:");
assert_eq!(kind, AnchoredElement::ListItem);
}
// A reference line between a subject and its body anchors the subject (colon
// dropped) and the first child is still a Definition.
#[test]
fn reference_line_keeps_definition_a_definition() {
let src =
"API Endpoint:\n[./endpoint.txt]\n A URL that provides access to a resource.\n\n";
let (anchor, kind) = sole_whole_anchor(src);
assert_eq!(anchor, "API Endpoint");
assert_eq!(kind, AnchoredElement::Subject);
let doc = parse_document(src).unwrap();
assert_eq!(
doc.root.children[0].node_type(),
"Definition",
"reference line must be transparent: blanking it would wrongly \
turn the definition into a session"
);
}
// Control case for the test above: with a blank line in place of the reference
// line, the same input parses as a Session.
#[test]
fn reference_line_as_blank_would_make_a_session() {
let src = "API Endpoint:\n\n A URL that provides access to a resource.\n\n";
let doc = parse_document(src).unwrap();
assert_eq!(doc.root.children[0].node_type(), "Session");
}
// A reference line under a verbatim block's subject anchors that subject; the
// first child is still a VerbatimBlock.
#[test]
fn reference_line_anchors_verbatim_subject() {
let src = "Example Source:\n[./example.rs]\n fn main() {}\n:: rust ::\n\n";
let (anchor, kind) = sole_whole_anchor(src);
assert_eq!(anchor, "Example Source");
assert_eq!(kind, AnchoredElement::Subject);
let doc = parse_document(src).unwrap();
assert_eq!(doc.root.children[0].node_type(), "VerbatimBlock");
}
// In a multi-line paragraph the reference line anchors only the line directly above it.
#[test]
fn reference_line_anchors_paragraph_line() {
let src =
"First paragraph line.\nThe release notes cover every change.\n[./CHANGELOG.md]\n\n";
let (anchor, kind) = sole_whole_anchor(src);
assert_eq!(anchor, "The release notes cover every change.");
assert_eq!(kind, AnchoredElement::WholeLine);
}
// A blank line above the reference line makes it a self-link.
#[test]
fn reference_line_self_links_when_blank_above() {
let src = "See the upstream project:\n\n[https://github.com/lex-fmt/lex]\n\n";
let lines = ref_lines(src);
assert_eq!(lines.len(), 1);
assert_eq!(lines[0].anchor, ReferenceAnchor::SelfLink);
}
// A reference line with nothing above it is a self-link.
#[test]
fn reference_line_self_links_at_start_of_container() {
let src = "[https://lex.ing]\n\n";
let lines = ref_lines(src);
assert_eq!(lines.len(), 1);
assert_eq!(lines[0].anchor, ReferenceAnchor::SelfLink);
}
// An annotation-style reference alone on a line is not collected as a reference
// line, but is still found among the document's references.
#[test]
fn marker_reference_on_its_own_line_is_not_a_reference_line() {
let src = "Closing remarks.\n[::summary-note]\n\n:: summary-note ::\n Resolved.\n\n";
let lines = ref_lines(src);
assert!(
lines.is_empty(),
"marker-style reference must not become a whole-element anchor: {lines:?}"
);
let doc = parse_document(src).unwrap();
assert!(doc
.iter_all_references()
.any(|r| matches!(r.reference_type, ReferenceType::AnnotationReference { .. })));
}
// A footnote number alone on a line is not collected as a reference line.
#[test]
fn footnote_on_its_own_line_is_not_a_reference_line() {
let src = "Some claim.\n[42]\n\n:: 42 :: A footnote.\n\n";
assert!(ref_lines(src).is_empty());
}
// Two stacked reference lines are both collected and produce exactly one
// `stacked-reference-line` warning.
#[test]
fn stacked_reference_lines_warn_and_keep_first() {
let src = "First line.\nClaim line here.\n[./a.txt]\n[./b.txt]\n\n";
let doc = parse_document(src).unwrap();
let lines = &doc.reference_lines;
assert_eq!(lines.len(), 2, "both reference lines are collected");
let warns: Vec<_> = doc
.diagnostics()
.into_iter()
.filter(|d| d.code.as_deref() == Some("stacked-reference-line"))
.collect();
assert_eq!(warns.len(), 1, "one stacking warning: {warns:?}");
}
// A head line that already has an inline reference produces one
// `overlapping-reference-line` warning; the whole-element anchor is kept.
#[test]
fn reference_line_over_head_line_with_inline_reference_warns() {
let src = "See more here.\nVisit [https://a.example] now.\n[./b.txt]\n\n";
let doc = parse_document(src).unwrap();
let warns: Vec<_> = doc
.diagnostics()
.into_iter()
.filter(|d| d.code.as_deref() == Some("overlapping-reference-line"))
.collect();
assert_eq!(warns.len(), 1, "one overlap warning: {warns:?}");
assert!(doc.reference_lines[0].anchor.is_whole_element());
}
// Brackets inside inline code on the head line do not trigger the overlap warning.
#[test]
fn head_line_with_stray_bracket_does_not_warn() {
let src = "Intro.\nThe array index `a[0]` matters.\n[./b.txt]\n\n";
let doc = parse_document(src).unwrap();
let warns: Vec<_> = doc
.diagnostics()
.into_iter()
.filter(|d| d.code.as_deref() == Some("overlapping-reference-line"))
.collect();
assert!(
warns.is_empty(),
"a stray bracket is not an inline reference: {warns:?}"
);
assert!(doc.reference_lines[0].anchor.is_whole_element());
}
// Pins which reference types are whole-line capable (Url, File, Session,
// General) and which are marker-only (FootnoteNumber, AnnotationReference, NotSure).
#[test]
fn anchor_kind_split_matches_spec() {
use crate::lex::inlines::AnchorKind;
assert_eq!(
ReferenceType::Url { target: "x".into() }.anchoring(),
AnchorKind::WholeLineCapable
);
assert_eq!(
ReferenceType::File { target: "x".into() }.anchoring(),
AnchorKind::WholeLineCapable
);
assert_eq!(
ReferenceType::Session { target: "1".into() }.anchoring(),
AnchorKind::WholeLineCapable
);
assert_eq!(
ReferenceType::General { target: "x".into() }.anchoring(),
AnchorKind::WholeLineCapable
);
assert_eq!(
ReferenceType::FootnoteNumber { number: 1 }.anchoring(),
AnchorKind::MarkerOnly
);
assert_eq!(
ReferenceType::AnnotationReference { label: "n".into() }.anchoring(),
AnchorKind::MarkerOnly
);
assert_eq!(ReferenceType::NotSure.anchoring(), AnchorKind::MarkerOnly);
}
// The anchor range slices the original source to exactly the head-line text.
#[test]
fn anchor_range_covers_the_head_line_text() {
let src = "Getting Started\n[./readme.txt]\n\n Body.\n\n";
let doc = parse_document(src).unwrap();
let ReferenceAnchor::WholeElement { anchor_range, .. } = &doc.reference_lines[0].anchor
else {
panic!("expected whole-element anchor");
};
assert_eq!(&src[anchor_range.span.clone()], "Getting Started");
}
// The reference range slices the original source to the bracketed text, brackets included.
#[test]
fn reference_range_covers_brackets_inclusive() {
let src = "Getting Started\n[./readme.txt]\n\n Body.\n\n";
let doc = parse_document(src).unwrap();
let range = &doc.reference_lines[0].reference_range;
assert_eq!(&src[range.span.clone()], "[./readme.txt]");
}
// A node that comes after a reference line must report offsets into the
// original source, not into a source with the reference line taken out.
#[test]
fn later_element_keeps_original_source_coordinates() {
let original =
"Intro paragraph here.\n[./top.txt]\n\nLater Section paragraph text.\n\n".to_string();
let doc = parse_document(&original).unwrap();
let later = doc
.root
.children
.iter()
.find(|c| {
c.text()
.map(|t| t.contains("Later Section"))
.unwrap_or(false)
})
.expect("a 'Later Section' element after the reference line");
let expected_start = original
.find("Later Section")
.expect("the literal text in the original source");
assert_eq!(
later.range().span.start,
expected_start,
"node after a reference line must carry an ORIGINAL-source offset \
(got {}, expected {}); a mismatch means a cleaned-source coordinate \
leaked into the AST",
later.range().span.start,
expected_start,
);
assert!(original[later.range().span.clone()].starts_with("Later Section"));
}
// A document with no reference lines has empty reference-line and diagnostic collections.
#[test]
fn documents_without_reference_lines_have_empty_collection() {
let doc = parse_document("Just a paragraph.\n\n").unwrap();
assert!(doc.reference_lines.is_empty());
assert!(doc.reference_line_diagnostics.is_empty());
}
// Ordered markers ("1.") are stripped from the anchor text just like bullets.
#[test]
fn list_marker_stripping_handles_ordered_markers() {
let src = "1. First item\n[./x.txt]\n2. Second item\n\n";
let (anchor, kind) = sole_whole_anchor(src);
assert_eq!(anchor, "First item");
assert_eq!(kind, AnchoredElement::ListItem);
}
}