use std::ops::Range;
/// A single `[^N]` citation marker found in the scanned text.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Citation {
    /// The number written inside the marker (the `N` of `[^N]`).
    pub marker: u32,
    /// Byte range of the whole marker in the source text, from `[` through `]`.
    pub span: Range<usize>,
    /// Zero-based index of the referenced source, i.e. `marker - 1`.
    pub source_index: u32,
}
/// A non-fatal problem noticed while scanning for citation markers.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct CitationWarning {
    /// Category of the problem.
    pub kind: CitationWarningKind,
    /// Byte range of the offending marker text in the source.
    pub span: Range<usize>,
    /// Human-readable explanation of what was wrong.
    pub detail: String,
}
/// Categories of warning produced by the citation parser.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum CitationWarningKind {
    /// The text looked like a `[^…]` marker but its body was not a valid number.
    Malformed,
    /// The marker was well-formed but its number exceeds the available source count.
    OutOfRange,
}
/// Scans `text` for `[^N]` citation markers and checks them against `sources_count`.
///
/// Scanning rules:
/// - A line whose first non-space/tab content is three backticks toggles a
///   "fenced" state; the rest of that line is skipped and nothing inside a
///   fence is examined.
/// - A `[` preceded by an odd number of backslashes is treated as escaped and
///   skipped.
/// - A well-formed marker always yields a [`Citation`]; if its zero-based
///   source index is not below `sources_count`, an `OutOfRange` warning is
///   recorded as well.
/// - A malformed marker yields only a `Malformed` warning.
///
/// All spans are byte ranges into `text`.
pub fn parse_citations(text: &str, sources_count: usize) -> CitationParseResult {
    let bytes = text.as_bytes();
    let mut found: Vec<Citation> = Vec::new();
    let mut problems: Vec<CitationWarning> = Vec::new();
    let mut fenced = false;
    let mut pos = 0usize;

    while pos < bytes.len() {
        // Fence delimiters are only recognised at the beginning of a line.
        if is_line_start(bytes, pos) {
            let content = first_non_ws_on_line(bytes, pos);
            if bytes[content..].starts_with(b"```") {
                fenced = !fenced;
                pos = advance_to_newline(bytes, content + 3);
                continue;
            }
        }

        // A marker candidate is an unescaped `[^` outside of any code fence.
        let candidate = !fenced
            && bytes[pos] == b'['
            && bytes.get(pos + 1) == Some(&b'^')
            && count_preceding_backslashes(bytes, pos) % 2 == 0;
        if !candidate {
            pos += 1;
            continue;
        }

        // Each arm records its findings and yields the next scan position.
        pos = match read_marker(bytes, pos) {
            MarkerScan::NotAMarker => pos + 1,
            MarkerScan::Malformed { end, reason } => {
                problems.push(CitationWarning {
                    kind: CitationWarningKind::Malformed,
                    span: pos..end,
                    detail: reason,
                });
                end
            }
            MarkerScan::Ok { marker, end } => {
                let span = pos..end;
                let source_index = marker.saturating_sub(1);
                if (source_index as usize) >= sources_count {
                    problems.push(CitationWarning {
                        kind: CitationWarningKind::OutOfRange,
                        span: span.clone(),
                        detail: format!(
                            "marker [^{marker}] references source #{} but only {} sources available",
                            source_index + 1,
                            sources_count
                        ),
                    });
                }
                found.push(Citation {
                    marker,
                    span,
                    source_index,
                });
                end
            }
        };
    }

    CitationParseResult {
        citations: found,
        warnings: problems,
    }
}
/// Everything `parse_citations` extracted from one piece of text.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct CitationParseResult {
    /// Well-formed markers, in the order they appear in the text.
    pub citations: Vec<Citation>,
    /// Problems noticed while scanning, in the order they were encountered.
    pub warnings: Vec<CitationWarning>,
}
/// Outcome of examining a candidate `[^…]` marker with [`read_marker`].
enum MarkerScan {
    /// A well-formed marker. `end` is the byte offset just past the closing `]`.
    Ok { marker: u32, end: usize },
    /// Bracketed text with an invalid body. `end` is the byte offset just past
    /// the closing `]`; `reason` explains what was wrong.
    Malformed { end: usize, reason: String },
    /// The text is not treated as a marker at all: it is unterminated, a line
    /// break occurs before the closing `]`, or a non-numeric body is too long.
    NotAMarker,
}

/// Examines the candidate marker whose `[` is at `bytes[start]`.
///
/// The caller is expected to have verified that `bytes[start..]` begins with
/// `[^`; the body is taken to start at `start + 2`.
///
/// Returns:
/// - `MarkerScan::Ok` for a body made only of ASCII digits, with no leading
///   zero, whose value fits in a `u32`.
/// - `MarkerScan::Malformed` for an empty body, a body with a leading zero,
///   a numeric body exceeding `u32::MAX`, or a short non-numeric body that is
///   closed by `]` on the same line.
/// - `MarkerScan::NotAMarker` otherwise.
fn read_marker(bytes: &[u8], start: usize) -> MarkerScan {
    // A non-numeric body is only reported as malformed when its closing `]`
    // is found within this many body bytes; longer runs are treated as prose.
    const MAX_MALFORMED_BODY_LEN: usize = 16;

    let body_start = start + 2;
    if body_start >= bytes.len() {
        return MarkerScan::NotAMarker;
    }
    let body = &bytes[body_start..];

    let digit_count = body.iter().take_while(|b| b.is_ascii_digit()).count();
    match body.get(digit_count) {
        // Ran off the end of the input without seeing `]`.
        None => return MarkerScan::NotAMarker,
        // Digits (possibly none) immediately followed by `]`: validated below.
        Some(&b']') => {}
        // A non-digit byte inside the body.
        Some(_) => {
            // Look for whichever comes first: the closing bracket or a line
            // break. Every inspected byte — including the very first body
            // byte — is checked for `\n`, so a body that begins with a line
            // break is rejected just like one that contains it later.
            let stop = body
                .iter()
                .take(MAX_MALFORMED_BODY_LEN + 1)
                .position(|&b| b == b']' || b == b'\n');
            return match stop {
                Some(offset) if body[offset] == b']' => MarkerScan::Malformed {
                    end: body_start + offset + 1,
                    reason: format!(
                        "expected digits inside [^…], got `{}`",
                        String::from_utf8_lossy(&body[..offset])
                    ),
                },
                _ => MarkerScan::NotAMarker,
            };
        }
    }

    // From here on the body is `digit_count` ASCII digits followed by `]`.
    let end = body_start + digit_count + 1;
    let digits = &body[..digit_count];

    if digits.is_empty() {
        return MarkerScan::Malformed {
            end,
            reason: "empty marker body".to_string(),
        };
    }
    if digits[0] == b'0' {
        return MarkerScan::Malformed {
            end,
            reason: format!(
                "marker must be a positive integer with no leading zero, got `{}`",
                String::from_utf8_lossy(digits)
            ),
        };
    }

    // Checked arithmetic yields `None` exactly when the value exceeds u32::MAX.
    // The first digit is 1..=9 here, so a successful parse is always >= 1 and
    // no separate zero check is needed.
    let parsed = digits.iter().try_fold(0u32, |acc, &d| {
        acc.checked_mul(10)?.checked_add(u32::from(d - b'0'))
    });
    match parsed {
        Some(marker) => MarkerScan::Ok { marker, end },
        None => MarkerScan::Malformed {
            end,
            reason: format!(
                "marker value `{}` exceeds u32::MAX",
                String::from_utf8_lossy(digits)
            ),
        },
    }
}
/// Reports whether byte offset `i` is the first position of a line, i.e. it is
/// offset 0 or the byte immediately before it is `\n`.
fn is_line_start(bytes: &[u8], i: usize) -> bool {
    match i.checked_sub(1) {
        None => true,
        Some(prev) => bytes[prev] == b'\n',
    }
}
/// Returns the offset of the first byte at or after `i` that is neither a
/// space nor a tab, or the end of the input if only such bytes remain.
fn first_non_ws_on_line(bytes: &[u8], i: usize) -> usize {
    let blanks = bytes
        .iter()
        .skip(i)
        .take_while(|&&b| matches!(b, b' ' | b'\t'))
        .count();
    i + blanks
}
/// Returns the offset just past the first `\n` found at or after `i`, or the
/// end of the input when no newline remains.
fn advance_to_newline(bytes: &[u8], i: usize) -> usize {
    match bytes.iter().skip(i).position(|&b| b == b'\n') {
        Some(offset) => i + offset + 1,
        // No newline left: the scan stops at the end of the input (or at `i`
        // itself if `i` already lies beyond it).
        None => i.max(bytes.len()),
    }
}
/// Counts the run of consecutive `\` bytes that ends immediately before
/// offset `i`.
fn count_preceding_backslashes(bytes: &[u8], i: usize) -> usize {
    bytes[..i]
        .iter()
        .rev()
        .take_while(|&&b| b == b'\\')
        .count()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for calling the parser under test.
    fn parse(text: &str, n_sources: usize) -> CitationParseResult {
        parse_citations(text, n_sources)
    }

    #[test]
    fn well_formed_single_marker() {
        let r = parse("Churn was driven by pricing[^1].", 1);
        assert_eq!(r.citations.len(), 1);
        assert!(r.warnings.is_empty());
        assert_eq!(r.citations[0].marker, 1);
        assert_eq!(r.citations[0].source_index, 0);
        let c = &r.citations[0];
        assert_eq!(&"Churn was driven by pricing[^1]."[c.span.clone()], "[^1]");
    }

    #[test]
    fn well_formed_multi_digit_marker() {
        let r = parse("see [^42] and [^1234]", 1300);
        assert_eq!(
            r.citations.iter().map(|c| c.marker).collect::<Vec<_>>(),
            vec![42, 1234]
        );
        assert!(r.warnings.is_empty());
    }

    #[test]
    fn repeated_markers_are_each_emitted() {
        let r = parse("a[^1] b[^1] c[^2]", 2);
        assert_eq!(r.citations.len(), 3);
        assert_eq!(r.citations[0].marker, 1);
        assert_eq!(r.citations[1].marker, 1);
        assert_eq!(r.citations[2].marker, 2);
        assert!(r.warnings.is_empty());
    }

    #[test]
    fn empty_marker_body_is_malformed() {
        let r = parse("a[^] b", 0);
        assert!(r.citations.is_empty());
        assert_eq!(r.warnings.len(), 1);
        assert!(matches!(r.warnings[0].kind, CitationWarningKind::Malformed));
    }

    #[test]
    fn non_digit_marker_is_malformed() {
        let r = parse("see [^abc] for context", 0);
        assert!(r.citations.is_empty());
        assert_eq!(r.warnings.len(), 1);
        assert!(matches!(r.warnings[0].kind, CitationWarningKind::Malformed));
    }

    #[test]
    fn negative_looking_marker_is_malformed() {
        let r = parse("nope[^-1]nope", 0);
        assert!(r.citations.is_empty());
        assert_eq!(r.warnings.len(), 1);
        assert!(matches!(r.warnings[0].kind, CitationWarningKind::Malformed));
    }

    #[test]
    fn leading_zero_marker_is_malformed() {
        let r = parse("nope[^01]nope", 5);
        assert!(r.citations.is_empty());
        assert_eq!(r.warnings.len(), 1);
        assert!(matches!(r.warnings[0].kind, CitationWarningKind::Malformed));
    }

    #[test]
    fn lone_zero_marker_is_malformed() {
        let r = parse("nope[^0]nope", 5);
        assert!(r.citations.is_empty());
        assert_eq!(r.warnings.len(), 1);
        // Pin the warning kind, as the sibling malformed-marker tests do.
        assert!(matches!(r.warnings[0].kind, CitationWarningKind::Malformed));
    }

    #[test]
    fn very_large_marker_within_u32() {
        let r = parse("see [^4294967295]", 1);
        assert_eq!(r.citations.len(), 1);
        assert_eq!(r.citations[0].marker, u32::MAX);
        assert_eq!(r.warnings.len(), 1);
        assert!(matches!(
            r.warnings[0].kind,
            CitationWarningKind::OutOfRange
        ));
    }

    #[test]
    fn marker_over_u32_is_malformed() {
        let r = parse("see [^9999999999999]", 0);
        assert!(r.citations.is_empty());
        assert_eq!(r.warnings.len(), 1);
        assert!(matches!(r.warnings[0].kind, CitationWarningKind::Malformed));
    }

    #[test]
    fn escaped_marker_is_not_parsed() {
        let r = parse(r"literal \[^1\] in text", 1);
        assert!(r.citations.is_empty());
        assert!(r.warnings.is_empty());
    }

    #[test]
    fn double_backslash_does_not_escape() {
        let r = parse(r"path\\[^1] continues", 1);
        assert_eq!(r.citations.len(), 1);
        assert_eq!(r.citations[0].marker, 1);
        assert!(r.warnings.is_empty());
    }

    #[test]
    fn marker_inside_code_fence_is_ignored() {
        let text = "before[^1]\n```\nthe code uses [^2] internally\n```\nafter[^3]";
        let r = parse(text, 3);
        let markers: Vec<u32> = r.citations.iter().map(|c| c.marker).collect();
        assert_eq!(markers, vec![1, 3]);
        assert!(r.warnings.is_empty());
    }

    #[test]
    fn fenced_with_info_string_still_ignored() {
        let text = "head[^1]\n```rust\nlet x = [^99];\n```\ntail[^2]";
        let r = parse(text, 2);
        let markers: Vec<u32> = r.citations.iter().map(|c| c.marker).collect();
        assert_eq!(markers, vec![1, 2]);
        // The fenced `[^99]` must not surface as an out-of-range warning either.
        assert!(r.warnings.is_empty());
    }

    #[test]
    fn unicode_neighbors_are_safe() {
        let text = "感谢[^1]谢谢";
        let r = parse(text, 1);
        assert_eq!(r.citations.len(), 1);
        let span = r.citations[0].span.clone();
        assert_eq!(&text[span], "[^1]");
    }

    #[test]
    fn out_of_range_emits_citation_and_warning() {
        let r = parse("see [^5] and [^1]", 2);
        assert_eq!(r.citations.len(), 2);
        assert_eq!(r.warnings.len(), 1);
        assert_eq!(r.warnings[0].kind, CitationWarningKind::OutOfRange);
        assert_eq!(r.citations[0].marker, 5);
        assert_eq!(r.citations[0].source_index, 4);
    }

    #[test]
    fn empty_text_yields_empty_result() {
        let r = parse("", 0);
        assert!(r.citations.is_empty());
        assert!(r.warnings.is_empty());
    }

    #[test]
    fn no_panics_on_truncated_markers() {
        // Only checks that scanning terminates without panicking; the results
        // themselves are deliberately not asserted here.
        for bad in ["[", "[^", "[^1", "[^123", "[^abc", "[^\n1]", "[^99"] {
            let _ = parse(bad, 0);
        }
    }

    #[test]
    fn malformed_with_newline_inside_body() {
        // A line break before the closing `]` means the text is not treated as
        // a marker at all, so neither a citation nor a warning is produced.
        let r = parse("see [^12\n] here", 0);
        assert!(r.citations.is_empty());
        assert!(r.warnings.is_empty());
    }

    #[test]
    fn back_to_back_markers() {
        let r = parse("[^1][^2][^3]", 3);
        assert_eq!(
            r.citations.iter().map(|c| c.marker).collect::<Vec<_>>(),
            vec![1, 2, 3]
        );
        assert!(r.warnings.is_empty());
    }
}